From 51ec2670116d95bb6bdcd7871fce685fc2eaaeb0 Mon Sep 17 00:00:00 2001
From: Tom Keiser <tkeiser@sinenomine.net>
Date: Fri, 17 Mar 2006 19:54:26 +0000
Subject: [PATCH] dafs-20060317

FIXES 26648

demand attach/fast restart fileserver
---
 Makefile.in                    |   24 +-
 acinclude.m4                   |   16 +
 configure.in                   |    1 +
 src/auth/Makefile.in           |    2 +-
 src/bozo/bos.c                 |  271 +-
 src/bozo/bosserver.c           |    3 +-
 src/bozo/fsbnodeops.c          |  441 ++-
 src/cf/osconf.m4               |   12 +
 src/config/param.rs_aix51.h    |    2 -
 src/config/param.rs_aix52.h    |    2 -
 src/config/param.rs_aix53.h    |    2 -
 src/config/stds.h              |   25 +-
 src/rx/rx_queue.h              |   32 +
 src/tsalvaged/Makefile.in      |  200 ++
 src/tsalvaged/salvsync-debug.c |  475 +++
 src/tviced/Makefile.in         |   43 +-
 src/tviced/NTMakefile          |    2 +-
 src/tviced/serialize_state.c   | 1120 +++++++
 src/tviced/serialize_state.h   |  311 ++
 src/tviced/state_analyzer.c    | 2004 ++++++++++++
 src/tvolser/Makefile.in        |   15 +-
 src/util/Makefile.in           |    5 +-
 src/util/afsutil_prototypes.h  |    7 +
 src/util/dirpath.c             |   10 +
 src/util/dirpath.hin           |   16 +
 src/util/dirpath_nt.h          |   16 +
 src/util/errors.h              |    1 +
 src/util/strnlen.c             |   35 +
 src/viced/Makefile.in          |    1 +
 src/viced/NTMakefile           |    2 +
 src/viced/afsfileprocs.c       |   25 +-
 src/viced/callback.c           | 1149 ++++++-
 src/viced/callback.h           |  158 +
 src/viced/host.c               |  634 +++-
 src/viced/host.h               |   24 +-
 src/viced/viced.c              |  234 +-
 src/viced/viced.h              |   60 +-
 src/viced/viced_prototypes.h   |   23 +
 src/vol/Makefile.in            |   82 +-
 src/vol/NTMakefile             |    2 +
 src/vol/daemon_com.c           |  473 +++
 src/vol/daemon_com.h           |  141 +
 src/vol/fssync-client.c        |  222 ++
 src/vol/fssync-debug.c         | 1148 +++++++
 src/vol/fssync-server.c        | 1179 ++++++++
 src/vol/fssync.c               |  751 -----
 src/vol/fssync.h               |  137 +-
 src/vol/nuke.c                 |    1 +
 src/vol/partition.c            |   90 +-
 src/vol/partition.h            |   31 +-
 src/vol/purge.c                |   19 +-
 src/vol/salvage.h              |    5 +
 src/vol/salvaged.c             |  738 +++++
 src/vol/salvager.c             |  499 +++
 src/vol/salvsync-client.c      |  172 ++
 src/vol/salvsync-server.c      | 1009 +++++++
 src/vol/salvsync.h             |  111 +
 src/vol/test/listVicepx.c      |    1 +
 src/vol/test/updateDirInode.c  |    1 +
 src/vol/vnode.c                |  320 +-
 src/vol/vnode.h                |    2 +
 src/vol/vol-salvage.c          |  614 +---
 src/vol/vol-salvage.h          |  282 ++
 src/vol/voldefs.h              |    3 +
 src/vol/volinodes.h            |    5 +
 src/vol/volume.c               | 5191 ++++++++++++++++++++++++++++----
 src/vol/volume.h               |  316 +-
 src/volser/NTMakefile          |    2 +
 src/volser/dumpstuff.c         |    1 +
 src/volser/volprocs.c          |   22 +-
 src/volser/volser.p.h          |    3 +-
 71 files changed, 18626 insertions(+), 2350 deletions(-)
 create mode 100644 src/tsalvaged/Makefile.in
 create mode 100644 src/tsalvaged/salvsync-debug.c
 create mode 100644 src/tviced/serialize_state.c
 create mode 100644 src/tviced/serialize_state.h
 create mode 100644 src/tviced/state_analyzer.c
 create mode 100644 src/util/strnlen.c
 create mode 100644 src/viced/callback.h
 create mode 100644 src/vol/daemon_com.c
 create mode 100644 src/vol/daemon_com.h
 create mode 100644 src/vol/fssync-client.c
 create mode 100644 src/vol/fssync-debug.c
 create mode 100644 src/vol/fssync-server.c
 delete mode 100644 src/vol/fssync.c
 create mode 100644 src/vol/salvaged.c
 create mode 100644 src/vol/salvager.c
 create mode 100644 src/vol/salvsync-client.c
 create mode 100644 src/vol/salvsync-server.c
 create mode 100644 src/vol/salvsync.h
 create mode 100644 src/vol/vol-salvage.h

diff --git a/Makefile.in b/Makefile.in
index 7e8033d60a..209d9b272b 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -213,6 +213,24 @@ sgiefs:
 vol: cmd comerr dir afs sgiefs
 	${COMPILE_PART1} vol ${COMPILE_PART2}
 
+tsalvaged: vol libafsrpc libafsauthent cmd util
+	set -x; \
+	if test "@DEMAND_ATTACH@" = "yes" ; then \
+		case ${SYS_NAME} in \
+		alpha_dux*|sgi_*|sun*_5*|rs_aix*|*linux*|hp_ux11*|ia64_hpux*|*fbsd*|*nbsd2*) \
+			${COMPILE_PART1} tsalvaged ${COMPILE_PART2} ;; \
+		*_darwin_[1-6][0-9]) \
+			echo Not building MT tsalvaged for ${SYS_NAME} ;; \
+		*_darwin_*) \
+			${COMPILE_PART1} tsalvaged  ${COMPILE_PART2} ;; \
+		*) \
+			echo Not building MT tsalvaged for ${SYS_NAME} ;; \
+		esac \
+	else \
+		echo skipping tsalvaged ; \
+	fi
+
+
 vlserver: cmd comerr vol audit vlserver_depinstall
 	${COMPILE_PART1} vlserver ${COMPILE_PART2}
 
@@ -569,13 +587,13 @@ jafs: libjafs
 jafsadm: libjafsadm
 
 finale: project cmd comerr afsd butc tbutc @ENABLE_KERNEL_MODULE@ libuafs audit kauth log package \
-	ptserver scout bu_utils ubik uss bozo vfsck volser tvolser \
+	ptserver scout bu_utils ubik uss bozo vfsck volser tvolser tsalvaged \
 	venus update xstat afsmonitor dauth rxdebug libafsrpc \
 	libafsauthent shlibafsrpc shlibafsauthent libadmin login man-pages
 	${COMPILE_PART1} finale ${COMPILE_PART2}
 
 finale_nolibafs: project cmd comerr afsd butc tbutc libuafs audit kauth log package \
-	ptserver scout bu_utils ubik uss bozo vfsck volser tvolser \
+	ptserver scout bu_utils ubik uss bozo vfsck volser tvolser tsalvaged \
 	venus update xstat afsmonitor dauth rxdebug libafsrpc \
 	libafsauthent shlibafsrpc shlibafsauthent libadmin login man-pages
 	${COMPILE_PART1} finale ${COMPILE_PART2}
@@ -633,6 +651,7 @@ clean2:
 	-${COMPILE_PART1} tviced ${COMPILE_CLEAN}
 	-${COMPILE_PART1} volser ${COMPILE_CLEAN}
 	-${COMPILE_PART1} tvolser ${COMPILE_CLEAN}
+	-${COMPILE_PART1} tsalvaged ${COMPILE_CLEAN}
 	-${COMPILE_PART1} venus ${COMPILE_CLEAN}
 	-${COMPILE_PART1} venus/test ${COMPILE_CLEAN}
 	-${COMPILE_PART1} afsd ${COMPILE_CLEAN}
@@ -791,6 +810,7 @@ distclean: clean
 	src/tests/Makefile \
 	src/tests/run-tests \
 	src/tests/OpenAFS/Dirpath.pm \
+	src/tsalvaged/Makefile \
 	src/tsm41/Makefile \
 	src/tviced/Makefile \
 	src/tvolser/Makefile \
diff --git a/acinclude.m4 b/acinclude.m4
index c9b8417dd7..d33fec3f10 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -33,6 +33,8 @@ AC_ARG_ENABLE( fast-restart,
 [  --enable-fast-restart 		enable fast startup of file server without salvaging],, enable_fast_restart="no")
 AC_ARG_ENABLE( bitmap-later,
 [  --enable-bitmap-later 		enable fast startup of file server by not reading bitmap till needed],, enable_bitmap_later="no")
+AC_ARG_ENABLE( demand-attach-fs,
+[  --enable-demand-attach-fs 		enable Demand Attach Fileserver (please see documentation)],, enable_demand_attach_fs="no")
 AC_ARG_ENABLE( full-vos-listvol-switch,
 [  --disable-full-vos-listvol-switch    disable vos full listvol switch for formatted output],, enable_full_vos_listvol_switch="yes")
 AC_ARG_WITH(dux-kernel-headers,
@@ -948,6 +950,20 @@ if test "$enable_bitmap_later" = "yes"; then
 	AC_DEFINE(BITMAP_LATER, 1, [define if you want to salvager to check bitmasks later])
 fi
 
+if test "$enable_demand_attach_fs" = "yes"; then
+	AC_DEFINE(DEMAND_ATTACH_ENABLE, 1, [define if you want the demand attach fileserver])
+	DEMAND_ATTACH="yes"
+else
+	DEMAND_ATTACH="no"
+fi
+AC_SUBST(DEMAND_ATTACH)
+
+if test "$enable_fast_restart" = "yes" &&
+   test "$enable_demand_attach_fs" = "yes" ; then
+	AC_MSG_ERROR([The Demand Attach and Fast Restart extensions are mutually exclusive.  Demand Attach fileservers automatically salvage volumes in the background, thereby making Fast Restart pointless.])
+	exit 1
+fi
+
 if test "$enable_full_vos_listvol_switch" = "yes"; then
 	AC_DEFINE(FULL_LISTVOL_SWITCH, 1, [define if you want to want listvol switch])
 fi
diff --git a/configure.in b/configure.in
index e96a93be9b..c20cce9f2c 100644
--- a/configure.in
+++ b/configure.in
@@ -106,6 +106,7 @@ src/tbutc/Makefile \
 src/tests/Makefile \
 src/tests/run-tests \
 src/tests/OpenAFS/Dirpath.pm \
+src/tsalvaged/Makefile \
 src/tsm41/Makefile \
 src/tviced/Makefile \
 src/tvolser/Makefile \
diff --git a/src/auth/Makefile.in b/src/auth/Makefile.in
index 33797066b1..975775badb 100644
--- a/src/auth/Makefile.in
+++ b/src/auth/Makefile.in
@@ -96,7 +96,7 @@ test:
 	cd test; $(MAKE)
 
 clean:
-	$(RM) -f *.o *.a copyauth setkey auth.h cellconfig.h acfg_errors.c ktc_errors.c core\
+	$(RM) -f *.o *.a copyauth setkey auth.h cellconfig.h acfg_errors.c ktc_errors.c core \
 	AFS_component_version_number.c
 
 include ../config/Makefile.version
diff --git a/src/bozo/bos.c b/src/bozo/bos.c
index ad5a00f4f8..cca66c03a6 100644
--- a/src/bozo/bos.c
+++ b/src/bozo/bos.c
@@ -52,10 +52,12 @@ static DoStat();
 
 #include "bosint.h"
 
-#define MRAFS_OFFSET  9
-#define ADDPARMOFFSET 26
+/* command offsets for bos salvage command */
+#define MRAFS_OFFSET  10
+#define ADDPARMOFFSET 27
 
-static struct SalvageParms {
+/* MR-AFS salvage parameters */
+struct MRAFSSalvageParms {
     afs_int32 Optdebug;
     afs_int32 Optnowrite;
     afs_int32 Optforce;
@@ -74,7 +76,7 @@ static struct SalvageParms {
     afs_int32 OptLogLevel;
     afs_int32 OptRxDebug;
     afs_uint32 OptResidencies;
-} mrafsParm;
+};
 
 /* dummy routine for the audit work.  It should do nothing since audits */
 /* occur at the server level and bos is not a server. */
@@ -1224,17 +1226,11 @@ StopServer(as)
 
 #define PARMBUFFERSSIZE 32
 
-static
-DoSalvage(aconn, aparm1, aparm2, aoutName, showlog, parallel, atmpDir,
-	  orphans)
-     struct rx_connection *aconn;
-     char *aoutName;
-     char *aparm1;
-     char *aparm2;
-     afs_int32 showlog;
-     char *parallel;
-     char *atmpDir;
-     char *orphans;
+static afs_int32
+DoSalvage(struct rx_connection * aconn, char * aparm1, char * aparm2, 
+	  char * aoutName, afs_int32 showlog, char * parallel, 
+	  char * atmpDir, char * orphans, int dafs, 
+	  struct MRAFSSalvageParms * mrafsParm)
 {
     register afs_int32 code;
     char *parms[6];
@@ -1285,19 +1281,43 @@ DoSalvage(aconn, aparm1, aparm2, aoutName, showlog, parallel, atmpDir,
 	parms[code] = "";
     if (!aparm2)
 	aparm2 = "";
+
     /* MUST pass canonical (wire-format) salvager path to bosserver */
-    strncpy(tbuffer, AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH, BOZO_BSSIZE);
     if (*aparm2 != 0) {
-	if ((strlen(tbuffer) + 1 + strlen(partName) + 1 + strlen(aparm2) +
-	     1) > BOZO_BSSIZE) {
-	    printf("bos: command line too big\n");
-	    return (E2BIG);
+	/* single volume salvage */
+	if (dafs) {
+	    /* For DAFS, we call the salvageserver binary with special options.
+	     * In this mode, it simply uses SALVSYNC to tell the currently
+	     * running salvageserver to offline and salvage the volume in question. */
+	    strncpy(tbuffer, AFSDIR_CANONICAL_SERVER_SALSRV_FILEPATH, BOZO_BSSIZE);
+
+	    if ((strlen(tbuffer) + 9 + strlen(partName) + 1 + strlen(aparm2) +
+		 1) > BOZO_BSSIZE) {
+		printf("bos: command line too big\n");
+		return (E2BIG);
+	    }
+
+	    strcat(tbuffer, " -client ");
+	    strcat(tbuffer, partName);
+	    strcat(tbuffer, " ");
+	    strcat(tbuffer, aparm2);
+	} else {
+	    strncpy(tbuffer, AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH, BOZO_BSSIZE);
+
+	    if ((strlen(tbuffer) + 1 + strlen(partName) + 1 + strlen(aparm2) +
+		 1) > BOZO_BSSIZE) {
+		printf("bos: command line too big\n");
+		return (E2BIG);
+	    }
+
+	    strcat(tbuffer, " ");
+	    strcat(tbuffer, partName);
+	    strcat(tbuffer, " ");
+	    strcat(tbuffer, aparm2);
 	}
-	strcat(tbuffer, " ");
-	strcat(tbuffer, partName);
-	strcat(tbuffer, " ");
-	strcat(tbuffer, aparm2);
     } else {
+	/* partition salvage */
+	strncpy(tbuffer, AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH, BOZO_BSSIZE);
 	if ((strlen(tbuffer) + 4 + strlen(partName) + 1) > BOZO_BSSIZE) {
 	    printf("bos: command line too big\n");
 	    return (E2BIG);
@@ -1306,75 +1326,82 @@ DoSalvage(aconn, aparm1, aparm2, aoutName, showlog, parallel, atmpDir,
 	strcat(tbuffer, partName);
     }
 
-    /* add the parallel option if given */
-    if (parallel != NULL) {
-	if ((strlen(tbuffer) + 11 + strlen(parallel) + 1) > BOZO_BSSIZE) {
-	    printf("bos: command line too big\n");
-	    return (E2BIG);
+    /* For DAFS, specifying a single volume does not result in a standard
+     * salvager call.  Instead, it simply results in a SALVSYNC call to the
+     * online salvager daemon.  This interface does not give us the same rich
+     * set of call flags.  Thus, we skip these steps for DAFS single-volume 
+     * calls */
+    if (!dafs || (*aparm2 == 0)) {
+	/* add the parallel option if given */
+	if (parallel != NULL) {
+	    if ((strlen(tbuffer) + 11 + strlen(parallel) + 1) > BOZO_BSSIZE) {
+		printf("bos: command line too big\n");
+		return (E2BIG);
+	    }
+	    strcat(tbuffer, " -parallel ");
+	    strcat(tbuffer, parallel);
 	}
-	strcat(tbuffer, " -parallel ");
-	strcat(tbuffer, parallel);
-    }
 
-    /* add the tmpdir option if given */
-    if (atmpDir != NULL) {
-	if ((strlen(tbuffer) + 9 + strlen(atmpDir) + 1) > BOZO_BSSIZE) {
-	    printf("bos: command line too big\n");
-	    return (E2BIG);
+	/* add the tmpdir option if given */
+	if (atmpDir != NULL) {
+	    if ((strlen(tbuffer) + 9 + strlen(atmpDir) + 1) > BOZO_BSSIZE) {
+		printf("bos: command line too big\n");
+		return (E2BIG);
+	    }
+	    strcat(tbuffer, " -tmpdir ");
+	    strcat(tbuffer, atmpDir);
 	}
-	strcat(tbuffer, " -tmpdir ");
-	strcat(tbuffer, atmpDir);
-    }
 
-    /* add the orphans option if given */
-    if (orphans != NULL) {
-	if ((strlen(tbuffer) + 10 + strlen(orphans) + 1) > BOZO_BSSIZE) {
-	    printf("bos: command line too big\n");
-	    return (E2BIG);
+	/* add the orphans option if given */
+	if (orphans != NULL) {
+	    if ((strlen(tbuffer) + 10 + strlen(orphans) + 1) > BOZO_BSSIZE) {
+		printf("bos: command line too big\n");
+		return (E2BIG);
+	    }
+	    strcat(tbuffer, " -orphans ");
+	    strcat(tbuffer, orphans);
 	}
-	strcat(tbuffer, " -orphans ");
-	strcat(tbuffer, orphans);
-    }
 
-    if (mrafsParm.Optdebug)
-	strcat(tbuffer, " -debug");
-    if (mrafsParm.Optnowrite)
-	strcat(tbuffer, " -nowrite");
-    if (mrafsParm.Optforce)
-	strcat(tbuffer, " -force");
-    if (mrafsParm.Optoktozap)
-	strcat(tbuffer, " -oktozap");
-    if (mrafsParm.Optrootfiles)
-	strcat(tbuffer, " -rootfiles");
-    if (mrafsParm.Optsalvagedirs)
-	strcat(tbuffer, " -salvagedirs");
-    if (mrafsParm.Optblockreads)
-	strcat(tbuffer, " -blockreads");
-    if (mrafsParm.OptListResidencies)
-	strcat(tbuffer, " -ListResidencies");
-    if (mrafsParm.OptSalvageRemote)
-	strcat(tbuffer, " -SalvageRemote");
-    if (mrafsParm.OptSalvageArchival)
-	strcat(tbuffer, " -SalvageArchival");
-    if (mrafsParm.OptIgnoreCheck)
-	strcat(tbuffer, " -IgnoreCheck");
-    if (mrafsParm.OptForceOnLine)
-	strcat(tbuffer, " -ForceOnLine");
-    if (mrafsParm.OptUseRootDirACL)
-	strcat(tbuffer, " -UseRootDirACL");
-    if (mrafsParm.OptTraceBadLinkCounts)
-	strcat(tbuffer, " -TraceBadLinkCounts");
-    if (mrafsParm.OptDontAskFS)
-	strcat(tbuffer, " -DontAskFS");
-    if (mrafsParm.OptLogLevel) {
-	sprintf(pbuffer, " -LogLevel %ld", mrafsParm.OptLogLevel);
-	strcat(tbuffer, pbuffer);
-    }
-    if (mrafsParm.OptRxDebug)
-	strcat(tbuffer, " -rxdebug");
-    if (mrafsParm.OptResidencies) {
-	sprintf(pbuffer, " -Residencies %lu", mrafsParm.OptResidencies);
-	strcat(tbuffer, pbuffer);
+	if (mrafsParm->Optdebug)
+	    strcat(tbuffer, " -debug");
+	if (mrafsParm->Optnowrite)
+	    strcat(tbuffer, " -nowrite");
+	if (mrafsParm->Optforce)
+	    strcat(tbuffer, " -force");
+	if (mrafsParm->Optoktozap)
+	    strcat(tbuffer, " -oktozap");
+	if (mrafsParm->Optrootfiles)
+	    strcat(tbuffer, " -rootfiles");
+	if (mrafsParm->Optsalvagedirs)
+	    strcat(tbuffer, " -salvagedirs");
+	if (mrafsParm->Optblockreads)
+	    strcat(tbuffer, " -blockreads");
+	if (mrafsParm->OptListResidencies)
+	    strcat(tbuffer, " -ListResidencies");
+	if (mrafsParm->OptSalvageRemote)
+	    strcat(tbuffer, " -SalvageRemote");
+	if (mrafsParm->OptSalvageArchival)
+	    strcat(tbuffer, " -SalvageArchival");
+	if (mrafsParm->OptIgnoreCheck)
+	    strcat(tbuffer, " -IgnoreCheck");
+	if (mrafsParm->OptForceOnLine)
+	    strcat(tbuffer, " -ForceOnLine");
+	if (mrafsParm->OptUseRootDirACL)
+	    strcat(tbuffer, " -UseRootDirACL");
+	if (mrafsParm->OptTraceBadLinkCounts)
+	    strcat(tbuffer, " -TraceBadLinkCounts");
+	if (mrafsParm->OptDontAskFS)
+	    strcat(tbuffer, " -DontAskFS");
+	if (mrafsParm->OptLogLevel) {
+	    sprintf(pbuffer, " -LogLevel %ld", mrafsParm->OptLogLevel);
+	    strcat(tbuffer, pbuffer);
+	}
+	if (mrafsParm->OptRxDebug)
+	    strcat(tbuffer, " -rxdebug");
+	if (mrafsParm->OptResidencies) {
+	    sprintf(pbuffer, " -Residencies %lu", mrafsParm->OptResidencies);
+	    strcat(tbuffer, pbuffer);
+	}
     }
 
     parms[0] = tbuffer;
@@ -1481,22 +1508,36 @@ SalvageCmd(as)
     char tname[BOZO_BSSIZE];
     afs_int32 newID;
     extern struct ubik_client *cstruct;
-    afs_int32 curGoal, showlog = 0, mrafs = 0;
+    afs_int32 curGoal, showlog = 0, dafs = 0, mrafs = 0;
     char *parallel;
     char *tmpDir;
     char *orphans;
     char *tp;
+    char * serviceName;
+    struct MRAFSSalvageParms mrafsParm;
 
     memset(&mrafsParm, 0, sizeof(mrafsParm));
 
     /* parm 0 is machine name, 1 is partition, 2 is volume, 3 is -all flag */
     tconn = GetConn(as, 0);
 
-    /* Find out whether fileserver is running MR-AFS (has a scanner instance) */
-    /* XXX this should really be done some other way, potentially by RPC */
     tp = &tname[0];
-    if (code = BOZO_GetInstanceParm(tconn, "fs", 3, &tp) == 0)
-	mrafs = 1;
+
+    /* find out whether fileserver is running demand attach fs */
+    if (code = BOZO_GetInstanceParm(tconn, "dafs", 0, &tp) == 0) {
+	dafs = 1;
+	serviceName = "dafs";
+	/* Find out whether fileserver is running MR-AFS (has a scanner instance) */
+	/* XXX this should really be done some other way, potentially by RPC */
+	if (code = BOZO_GetInstanceParm(tconn, serviceName, 4, &tp) == 0)
+	    mrafs = 1;
+    } else {
+	serviceName = "fs";
+	/* Find out whether fileserver is running MR-AFS (has a scanner instance) */
+	/* XXX this should really be done some other way, potentially by RPC */
+	if (code = BOZO_GetInstanceParm(tconn, serviceName, 3, &tp) == 0)
+	    mrafs = 1;
+    }
 
     /* we can do a volume, a partition or the whole thing, but not mixtures
      * thereof */
@@ -1542,6 +1583,14 @@ SalvageCmd(as)
 	orphans = as->parms[8].items->data;
     }
 
+    if (dafs) {
+	if (!as->parms[9].items) { /* -forceDAFS flag */
+	    printf("This is a demand attach fileserver.  Are you sure you want to proceed with a manual salvage?\n");
+	    printf("must specify -forceDAFS flag in order to proceed.\n");
+	    return EINVAL;
+	}
+    }
+
     if (mrafs) {
 	if (as->parms[MRAFS_OFFSET].items)
 	    mrafsParm.Optdebug = 1;
@@ -1597,7 +1646,7 @@ SalvageCmd(as)
     } else {
 	int stop = 0;
 
-	for (i = 9; i < ADDPARMOFFSET; i++) {
+	for (i = MRAFS_OFFSET; i < ADDPARMOFFSET; i++) {
 	    if (as->parms[i].items) {
 		printf(" %s only possible for MR-AFS fileserver.\n",
 		       as->parms[i].name);
@@ -1610,12 +1659,12 @@ SalvageCmd(as)
 
     if (as->parms[4].items) {
 	/* salvage whole enchilada */
-	curGoal = GetServerGoal(tconn, "fs");
+	curGoal = GetServerGoal(tconn, serviceName);
 	if (curGoal == BSTAT_NORMAL) {
-	    printf("bos: shutting down fs.\n");
-	    code = BOZO_SetTStatus(tconn, "fs", BSTAT_SHUTDOWN);
+	    printf("bos: shutting down '%s'.\n", serviceName);
+	    code = BOZO_SetTStatus(tconn, serviceName, BSTAT_SHUTDOWN);
 	    if (code) {
-		printf("bos: failed to stop 'fs' (%s)\n", em(code));
+		printf("bos: failed to stop '%s' (%s)\n", serviceName, em(code));
 		return code;
 	    }
 	    code = BOZO_WaitAll(tconn);	/* wait for shutdown to complete */
@@ -1626,12 +1675,12 @@ SalvageCmd(as)
 	/* now do the salvage operation */
 	printf("Starting salvage.\n");
 	rc = DoSalvage(tconn, NULL, NULL, outName, showlog, parallel, tmpDir,
-		       orphans);
+		       orphans, dafs, &mrafsParm);
 	if (curGoal == BSTAT_NORMAL) {
-	    printf("bos: restarting fs.\n");
-	    code = BOZO_SetTStatus(tconn, "fs", BSTAT_NORMAL);
+	    printf("bos: restarting %s.\n", serviceName);
+	    code = BOZO_SetTStatus(tconn, serviceName, BSTAT_NORMAL);
 	    if (code) {
-		printf("bos: failed to restart 'fs' (%s)\n", em(code));
+		printf("bos: failed to restart '%s' (%s)\n", serviceName, em(code));
 		return code;
 	    }
 	}
@@ -1651,13 +1700,13 @@ SalvageCmd(as)
 		   as->parms[1].items->data);
 	    return -1;
 	}
-	curGoal = GetServerGoal(tconn, "fs");
+	curGoal = GetServerGoal(tconn, serviceName);
 	/* salvage a whole partition (specified by parms[1]) */
 	if (curGoal == BSTAT_NORMAL) {
-	    printf("bos: shutting down fs.\n");
-	    code = BOZO_SetTStatus(tconn, "fs", BSTAT_SHUTDOWN);
+	    printf("bos: shutting down '%s'.\n", serviceName);
+	    code = BOZO_SetTStatus(tconn, serviceName, BSTAT_SHUTDOWN);
 	    if (code) {
-		printf("bos: can't stop 'fs' (%s)\n", em(code));
+		printf("bos: can't stop '%s' (%s)\n", serviceName, em(code));
 		return code;
 	    }
 	    code = BOZO_WaitAll(tconn);	/* wait for shutdown to complete */
@@ -1668,12 +1717,12 @@ SalvageCmd(as)
 	/* now do the salvage operation */
 	printf("Starting salvage.\n");
 	rc = DoSalvage(tconn, as->parms[1].items->data, NULL, outName,
-		       showlog, parallel, tmpDir, orphans);
+		       showlog, parallel, tmpDir, orphans, dafs, &mrafsParm);
 	if (curGoal == BSTAT_NORMAL) {
-	    printf("bos: restarting fs.\n");
-	    code = BOZO_SetTStatus(tconn, "fs", BSTAT_NORMAL);
+	    printf("bos: restarting '%s'.\n", serviceName);
+	    code = BOZO_SetTStatus(tconn, serviceName, BSTAT_NORMAL);
 	    if (code) {
-		printf("bos: failed to restart 'fs' (%s)\n", em(code));
+		printf("bos: failed to restart '%s' (%s)\n", serviceName, em(code));
 		return code;
 	    }
 	}
@@ -1723,7 +1772,7 @@ SalvageCmd(as)
 	}
 	printf("Starting salvage.\n");
 	rc = DoSalvage(tconn, as->parms[1].items->data, tname, outName,
-		       showlog, parallel, tmpDir, orphans);
+		       showlog, parallel, tmpDir, orphans, dafs, &mrafsParm);
 	if (rc)
 	    return rc;
     }
@@ -2153,6 +2202,8 @@ main(argc, argv)
 		"directory to place tmp files");
     cmd_AddParm(ts, "-orphans", CMD_SINGLE, CMD_OPTIONAL,
 		"ignore | remove | attach");
+    cmd_AddParm(ts, "-forceDAFS", CMD_FLAG, CMD_OPTIONAL,
+		"(DAFS) force salvage of demand attach fileserver");
     cmd_AddParm(ts, "-debug", CMD_FLAG, CMD_OPTIONAL,
 		"(MR-AFS) Run in Debugging mode");
     cmd_AddParm(ts, "-nowrite", CMD_FLAG, CMD_OPTIONAL,
diff --git a/src/bozo/bosserver.c b/src/bozo/bosserver.c
index 635a6810e4..2351eeb066 100644
--- a/src/bozo/bosserver.c
+++ b/src/bozo/bosserver.c
@@ -51,7 +51,7 @@ RCSID
 #define BOZO_LWP_STACKSIZE	16000
 extern int BOZO_ExecuteRequest();
 extern int RXSTATS_ExecuteRequest();
-extern struct bnode_ops fsbnode_ops, ezbnode_ops, cronbnode_ops;
+extern struct bnode_ops fsbnode_ops, dafsbnode_ops, ezbnode_ops, cronbnode_ops;
 
 void bozo_Log();
 
@@ -895,6 +895,7 @@ main(int argc, char **argv, char **envp)
     }
 
     bnode_Register("fs", &fsbnode_ops, 3);
+    bnode_Register("dafs", &dafsbnode_ops, 4);
     bnode_Register("simple", &ezbnode_ops, 1);
     bnode_Register("cron", &cronbnode_ops, 2);
 
diff --git a/src/bozo/fsbnodeops.c b/src/bozo/fsbnodeops.c
index 2ac65e4621..e38670e80e 100644
--- a/src/bozo/fsbnodeops.c
+++ b/src/bozo/fsbnodeops.c
@@ -41,13 +41,6 @@ RCSID
 #include <afs/afsutil.h>
 #include "bnode.h"
 
-static int fs_timeout(), fs_getstat(), fs_setstat(), fs_delete();
-static int fs_procexit(), fs_getstring(), fs_getparm(), fs_restartp();
-static int fs_hascore();
-struct bnode *fs_create();
-
-static SetNeedsClock();
-static NudgeProcs();
 
 static int emergency = 0;
 
@@ -76,6 +69,77 @@ static int emergency = 0;
     The needsSalvage flag is cleared when the salvager exits.
 */
 
+struct fsbnode {		/* state used by both the "fs" and "dafs" bnode ops */
+    struct bnode b;		/* embedded generic bnode (b.name is the instance name) */
+    afs_int32 timeSDStarted;	/* time shutdown operation started */
+    char *filecmd;		/* command to start primary file server */
+    char *volcmd;		/* command to start secondary vol server */
+    char *salsrvcmd;            /* command to start salvageserver (demand attach fs); NULL otherwise */
+    char *salcmd;		/* command to start salvager */
+    char *scancmd;		/* command to start scanner (MR-AFS); NULL otherwise */
+    struct bnode_proc *fileProc;	/* process for file server */
+    struct bnode_proc *volProc;	/* process for vol server */
+    struct bnode_proc *salsrvProc;	/* process for salvageserver (demand attach fs) */
+    struct bnode_proc *salProc;	/* process for salvager */
+    struct bnode_proc *scanProc;	/* process for scanner (MR-AFS) */
+    afs_int32 lastFileStart;	/* time of last file server start */
+    afs_int32 lastVolStart;	/* time of last vol server start */
+    afs_int32 lastSalsrvStart;	/* time of last salvageserver start (demand attach fs) */
+    afs_int32 lastScanStart;	/* time of last scanner start (MR-AFS) */
+    char fileRunning;		/* file process is running */
+    char volRunning;		/* volser is running */
+    char salsrvRunning;		/* salvageserver is running (demand attach fs) */
+    char salRunning;		/* salvager is running */
+    char scanRunning;		/* scanner is running (MR_AFS) */
+    char fileSDW;		/* file shutdown wait */
+    char volSDW;		/* vol shutdown wait */
+    char salsrvSDW;		/* salvageserver shutdown wait (demand attach fs) */
+    char salSDW;		/* waiting for the salvager to shutdown */
+    char scanSDW;		/* scanner shutdown wait (MR_AFS) */
+    char fileKillSent;		/* kill signal has been sent */
+    char volKillSent;		/* kill signal has been sent (vol server) */
+    char salsrvKillSent;        /* kill signal has been sent (demand attach fs) */
+    char salKillSent;		/* kill signal has been sent (salvager) */
+    char scanKillSent;		/* kill signal has been sent (MR_AFS) */
+    char needsSalvage;		/* salvage before running */
+    char needsClock;		/* do we need clock ticks */
+};
+
+
+
+struct bnode * fs_create(char *ainstance, char *afilecmd, char *avolcmd, 
+			 char *asalcmd, char *ascancmd);
+struct bnode * dafs_create(char *ainstance, char *afilecmd, char *avolcmd, 
+			   char * asalsrvcmd, char *asalcmd, char *ascancmd);
+
+static int fs_hascore(register struct ezbnode *abnode);
+static int fs_restartp(register struct fsbnode *abnode);
+static int SetSalFlag(register struct fsbnode *abnode, register int aflag);
+static int RestoreSalFlag(register struct fsbnode *abnode);
+static int fs_delete(struct fsbnode *abnode);
+static int fs_timeout(struct fsbnode *abnode);
+static int fs_getstat(struct fsbnode *abnode, afs_int32 * astatus);
+static int fs_setstat(register struct fsbnode *abnode, afs_int32 astatus);
+static int fs_procexit(struct fsbnode *abnode, struct bnode_proc *aproc);
+static int fs_getstring(struct fsbnode *abnode, char *abuffer, afs_int32 alen);
+
+
+static int fs_getparm(struct fsbnode *abnode, afs_int32 aindex, 
+		      char *abuffer, afs_int32 alen);
+static int dafs_getparm(struct fsbnode *abnode, afs_int32 aindex, 
+			char *abuffer, afs_int32 alen);
+
+#ifdef AFS_NT40_ENV		/* NT only: real helper, declared here and defined elsewhere */
+static void AppendExecutableExtension(char *cmd);
+#else				/* !AFS_NT40_ENV: calls expand to nothing */
+#define AppendExecutableExtension(x)
+#endif				/* AFS_NT40_ENV */
+
+static void SetNeedsClock(register struct fsbnode *ab);
+static int NudgeProcs(register struct fsbnode *abnode);
+
+
+
 struct bnode_ops fsbnode_ops = {
     fs_create,
     fs_timeout,
@@ -89,36 +153,21 @@ struct bnode_ops fsbnode_ops = {
     fs_hascore,
 };
 
-struct fsbnode {
-    struct bnode b;
-    afs_int32 timeSDStarted;	/* time shutdown operation started */
-    char *filecmd;		/* command to start primary file server */
-    char *volcmd;		/* command to start secondary vol server */
-    char *salcmd;		/* command to start salvager */
-    char *scancmd;		/* command to start scanner (MR-AFS) */
-    struct bnode_proc *fileProc;	/* process for file server */
-    struct bnode_proc *volProc;	/* process for vol server */
-    struct bnode_proc *salProc;	/* process for salvager */
-    struct bnode_proc *scanProc;	/* process for scanner (MR-AFS) */
-    afs_int32 lastFileStart;	/* last start for file */
-    afs_int32 lastVolStart;	/* last start for vol */
-    afs_int32 lastScanStart;	/* last start for scanner (MR-AFS) */
-    char fileRunning;		/* file process is running */
-    char volRunning;		/* volser is running */
-    char salRunning;		/* salvager is running */
-    char scanRunning;		/* scanner is running (MR_AFS) */
-    char fileSDW;		/* file shutdown wait */
-    char volSDW;		/* vol shutdown wait */
-    char salSDW;		/* waiting for the salvager to shutdown */
-    char scanSDW;		/* scanner shutdown wait (MR_AFS) */
-    char fileKillSent;		/* kill signal has been sent */
-    char volKillSent;
-    char salKillSent;
-    char scanKillSent;		/* kill signal has been sent (MR_AFS) */
-    char needsSalvage;		/* salvage before running */
-    char needsClock;		/* do we need clock ticks */
+/* demand attach fs bnode ops */
+struct bnode_ops dafsbnode_ops = {
+    dafs_create,
+    fs_timeout,
+    fs_getstat,
+    fs_setstat,
+    fs_delete,
+    fs_procexit,
+    fs_getstring,
+    dafs_getparm,
+    fs_restartp,
+    fs_hascore,
 };
 
+
 /* Function to tell whether this bnode has a core file or not.  You might
  * think that this could be in bnode.c, and decide what core files to check
  * for based on the bnode's coreName property, but that doesn't work because
@@ -140,6 +189,11 @@ fs_hascore(register struct ezbnode *abnode)
     if (access(tbuffer, 0) == 0)
 	return 1;
 
+    /* see if salvageserver left a core file */
+    bnode_CoreName(abnode, "salsrv", tbuffer);
+    if (access(tbuffer, 0) == 0)
+	return 1;
+
     /* see if salvager left a core file */
     bnode_CoreName(abnode, "salv", tbuffer);
     if (access(tbuffer, 0) == 0)
@@ -198,6 +252,25 @@ fs_restartp(register struct fsbnode *abnode)
     if (code)
 	return code;
 
+    if (abnode->salsrvcmd) {    /* only in demand attach fs */
+	/* same check for salsrvcmd: has the binary changed since the salvageserver last started? */
+	code = bnode_ParseLine(abnode->salsrvcmd, &tt);
+	if (code)
+	    return 0;
+	if (!tt)
+	    return 0;
+	code = stat(tt->key, &tstat);	/* tt->key is the first token, i.e. the binary path */
+	if (code) {
+	    bnode_FreeTokens(tt);
+	    return 0;
+	}
+	if (tstat.st_ctime > abnode->lastSalsrvStart)	/* compare with the salvageserver's own start time */
+	    code = 1;
+	else
+	    code = 0;
+	bnode_FreeTokens(tt);
+    }
+
     if (abnode->scancmd) {	/* Only in MR-AFS */
 	/* now do same for scancmd (MR-AFS) */
 	code = bnode_ParseLine(abnode->scancmd, &tt);
@@ -228,14 +301,17 @@ SetSalFlag(register struct fsbnode *abnode, register int aflag)
     char tbuffer[AFSDIR_PATH_MAX];
     int fd;
 
-    abnode->needsSalvage = aflag;
-    strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/",
-	       SALFILE, abnode->b.name, NULL);
-    if (aflag) {
-	fd = open(tbuffer, O_CREAT | O_TRUNC | O_RDWR, 0666);
-	close(fd);
-    } else {
-	unlink(tbuffer);
+    /* don't use the salvage flag for demand attach fs */
+    if (abnode->salsrvcmd == NULL) {
+	abnode->needsSalvage = aflag;
+	strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/",
+		   SALFILE, abnode->b.name, NULL);
+	if (aflag) {
+	    fd = open(tbuffer, O_CREAT | O_TRUNC | O_RDWR, 0666);
+	    close(fd);
+	} else {
+	    unlink(tbuffer);
+	}
     }
     return 0;
 }
@@ -246,13 +322,18 @@ RestoreSalFlag(register struct fsbnode *abnode)
 {
     char tbuffer[AFSDIR_PATH_MAX];
 
-    strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/",
-	       SALFILE, abnode->b.name, NULL);
-    if (access(tbuffer, 0) == 0) {
-	/* file exists, so need to salvage */
-	abnode->needsSalvage = 1;
-    } else {
+    /* never set needs salvage flag for demand attach fs */
+    if (abnode->salsrvcmd != NULL) {
 	abnode->needsSalvage = 0;
+    } else {
+	strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/",
+		   SALFILE, abnode->b.name, NULL);
+	if (access(tbuffer, 0) == 0) {
+	    /* file exists, so need to salvage */
+	    abnode->needsSalvage = 1;
+	} else {
+	    abnode->needsSalvage = 0;
+	}
     }
     return 0;
 }
@@ -272,6 +353,8 @@ fs_delete(struct fsbnode *abnode)
     free(abnode->filecmd);
     free(abnode->volcmd);
     free(abnode->salcmd);
+    if (abnode->salsrvcmd)
+	free(abnode->salsrvcmd);
     if (abnode->scancmd)
 	free(abnode->scancmd);
     free(abnode);
@@ -304,95 +387,235 @@ fs_create(char *ainstance, char *afilecmd, char *avolcmd, char *asalcmd,
     char *fileCmdpath, *volCmdpath, *salCmdpath, *scanCmdpath;
     int bailout = 0;
 
-    fileCmdpath = volCmdpath = salCmdpath = NULL;
+    te = NULL; fileCmdpath = volCmdpath = salCmdpath = scanCmdpath = NULL;
 
     /* construct local paths from canonical (wire-format) paths */
     if (ConstructLocalBinPath(afilecmd, &fileCmdpath)) {
 	bozo_Log("BNODE: command path invalid '%s'\n", afilecmd);
 	bailout = 1;
+	goto done;
     }
     if (ConstructLocalBinPath(avolcmd, &volCmdpath)) {
 	bozo_Log("BNODE: command path invalid '%s'\n", avolcmd);
 	bailout = 1;
+	goto done;
     }
     if (ConstructLocalBinPath(asalcmd, &salCmdpath)) {
 	bozo_Log("BNODE: command path invalid '%s'\n", asalcmd);
 	bailout = 1;
+	goto done;
     }
 
     if (ascancmd && strlen(ascancmd)) {
 	if (ConstructLocalBinPath(ascancmd, &scanCmdpath)) {
 	    bozo_Log("BNODE: command path invalid '%s'\n", ascancmd);
 	    bailout = 1;
+	    goto done;
 	}
     }
 
     if (!bailout) {
 	sscanf(fileCmdpath, "%s", cmdname);
-#ifdef AFS_NT40_ENV
 	AppendExecutableExtension(cmdname);
-#endif
 	if (stat(cmdname, &tstat)) {
 	    bozo_Log("BNODE: file server binary '%s' not found\n", cmdname);
 	    bailout = 1;
+	    goto done;
 	}
 
 	sscanf(volCmdpath, "%s", cmdname);
-#ifdef AFS_NT40_ENV
 	AppendExecutableExtension(cmdname);
-#endif
 	if (stat(cmdname, &tstat)) {
 	    bozo_Log("BNODE: volume server binary '%s' not found\n", cmdname);
 	    bailout = 1;
+	    goto done;
 	}
 
 	sscanf(salCmdpath, "%s", cmdname);
-#ifdef AFS_NT40_ENV
 	AppendExecutableExtension(cmdname);
-#endif
 	if (stat(cmdname, &tstat)) {
 	    bozo_Log("BNODE: salvager binary '%s' not found\n", cmdname);
 	    bailout = 1;
+	    goto done;
 	}
 
 	if (ascancmd && strlen(ascancmd)) {
 	    sscanf(scanCmdpath, "%s", cmdname);
-#ifdef AFS_NT40_ENV
 	    AppendExecutableExtension(cmdname);
-#endif
 	    if (stat(cmdname, &tstat)) {
 		bozo_Log("BNODE: scanner binary '%s' not found\n", cmdname);
 		bailout = 1;
+		goto done;
 	    }
 	}
     }
 
-    if (bailout) {
-	free(fileCmdpath);
-	free(volCmdpath);
-	free(salCmdpath);
-	return NULL;
-    }
-
     te = (struct fsbnode *)malloc(sizeof(struct fsbnode));
+    if (te == NULL) {
+	bailout = 1;
+	goto done;
+    }
     memset(te, 0, sizeof(struct fsbnode));
     te->filecmd = fileCmdpath;
     te->volcmd = volCmdpath;
+    te->salsrvcmd = NULL;
     te->salcmd = salCmdpath;
     if (ascancmd && strlen(ascancmd))
 	te->scancmd = scanCmdpath;
     else
 	te->scancmd = NULL;
     if (bnode_InitBnode(te, &fsbnode_ops, ainstance) != 0) {
-	free(te);
-	free(fileCmdpath);
-	free(volCmdpath);
-	free(salCmdpath);
-	return NULL;
+	bailout = 1;
+	goto done;
     }
     bnode_SetTimeout(te, POLLTIME);	/* ask for timeout activations every 10 seconds */
     RestoreSalFlag(te);		/* restore needsSalvage flag based on file's existence */
     SetNeedsClock(te);		/* compute needsClock field */
+
+ done:
+    if (bailout) {
+	if (te)
+	    free(te);
+	if (fileCmdpath)
+	    free(fileCmdpath);
+	if (volCmdpath)
+	    free(volCmdpath);
+	if (salCmdpath)
+	    free(salCmdpath);
+	if (scanCmdpath)
+	    free(scanCmdpath);
+	return NULL;
+    }
+
+    return (struct bnode *)te;
+}
+
+/* Create a demand attach fs bnode from canonical (wire-format) command lines.
+ * ascancmd may be NULL or empty (no scanner); every binary named must exist.
+ * Returns the new bnode, or NULL on failure after freeing all allocations. */
+struct bnode *
+dafs_create(char *ainstance, char *afilecmd, char *avolcmd,
+	    char * asalsrvcmd, char *asalcmd, char *ascancmd)
+{
+    struct stat tstat;
+    register struct fsbnode *te;
+    char cmdname[AFSDIR_PATH_MAX];
+    char *fileCmdpath, *volCmdpath, *salsrvCmdpath, *salCmdpath, *scanCmdpath;
+    int bailout = 0;
+
+    te = NULL;	/* a struct pointer: it must not share the char * chain below */
+    fileCmdpath = volCmdpath = salsrvCmdpath = salCmdpath = scanCmdpath = NULL;
+
+    /* construct local paths from canonical (wire-format) paths */
+    if (ConstructLocalBinPath(afilecmd, &fileCmdpath)) {
+	bozo_Log("BNODE: command path invalid '%s'\n", afilecmd);
+	bailout = 1;
+	goto done;
+    }
+    if (ConstructLocalBinPath(avolcmd, &volCmdpath)) {
+	bozo_Log("BNODE: command path invalid '%s'\n", avolcmd);
+	bailout = 1;
+	goto done;
+    }
+    if (ConstructLocalBinPath(asalsrvcmd, &salsrvCmdpath)) {
+	bozo_Log("BNODE: command path invalid '%s'\n", asalsrvcmd);
+	bailout = 1;
+	goto done;
+    }
+    if (ConstructLocalBinPath(asalcmd, &salCmdpath)) {
+	bozo_Log("BNODE: command path invalid '%s'\n", asalcmd);
+	bailout = 1;
+	goto done;
+    }
+
+    if (ascancmd && strlen(ascancmd)) {
+	if (ConstructLocalBinPath(ascancmd, &scanCmdpath)) {
+	    bozo_Log("BNODE: command path invalid '%s'\n", ascancmd);
+	    bailout = 1;
+	    goto done;
+	}
+    }
+
+    if (!bailout) {
+	sscanf(fileCmdpath, "%s", cmdname);
+	AppendExecutableExtension(cmdname);
+	if (stat(cmdname, &tstat)) {
+	    bozo_Log("BNODE: file server binary '%s' not found\n", cmdname);
+	    bailout = 1;
+	    goto done;
+	}
+
+	sscanf(volCmdpath, "%s", cmdname);
+	AppendExecutableExtension(cmdname);
+	if (stat(cmdname, &tstat)) {
+	    bozo_Log("BNODE: volume server binary '%s' not found\n", cmdname);
+	    bailout = 1;
+	    goto done;
+	}
+
+	sscanf(salsrvCmdpath, "%s", cmdname);
+	AppendExecutableExtension(cmdname);
+	if (stat(cmdname, &tstat)) {
+	    bozo_Log("BNODE: salvageserver binary '%s' not found\n", cmdname);
+	    bailout = 1;
+	    goto done;
+	}
+
+	sscanf(salCmdpath, "%s", cmdname);
+	AppendExecutableExtension(cmdname);
+	if (stat(cmdname, &tstat)) {
+	    bozo_Log("BNODE: salvager binary '%s' not found\n", cmdname);
+	    bailout = 1;
+	    goto done;
+	}
+
+	if (ascancmd && strlen(ascancmd)) {
+	    sscanf(scanCmdpath, "%s", cmdname);
+	    AppendExecutableExtension(cmdname);
+	    if (stat(cmdname, &tstat)) {
+		bozo_Log("BNODE: scanner binary '%s' not found\n", cmdname);
+		bailout = 1;
+		goto done;
+	    }
+	}
+    }
+
+    te = (struct fsbnode *)malloc(sizeof(struct fsbnode));
+    if (te == NULL) {
+	bailout = 1;
+	goto done;
+    }
+    memset(te, 0, sizeof(struct fsbnode));
+    te->filecmd = fileCmdpath;
+    te->volcmd = volCmdpath;
+    te->salsrvcmd = salsrvCmdpath;
+    te->salcmd = salCmdpath;
+    te->scancmd = scanCmdpath;	/* still NULL when no scanner was requested */
+    if (bnode_InitBnode(te, &dafsbnode_ops, ainstance) != 0) {
+	bailout = 1;
+	goto done;
+    }
+    bnode_SetTimeout(te, POLLTIME);	/* ask for timeout activations every 10 seconds */
+    RestoreSalFlag(te);		/* always clears needsSalvage here: salsrvcmd is set */
+    SetNeedsClock(te);		/* compute needsClock field */
+
+ done:
+    if (bailout) {
+	if (te)
+	    free(te);
+	if (fileCmdpath)
+	    free(fileCmdpath);
+	if (volCmdpath)
+	    free(volCmdpath);
+	if (salsrvCmdpath)
+	    free(salsrvCmdpath);
+	if (salCmdpath)
+	    free(salCmdpath);
+	if (scanCmdpath)
+	    free(scanCmdpath);
+	return NULL;
+    }
+
+    return (struct bnode *)te;
+}
 
@@ -431,6 +654,15 @@ fs_timeout(struct fsbnode *abnode)
 		 FSSDTIME);
 	}
     }
+    if (abnode->salsrvSDW) {
+	if (!abnode->salsrvKillSent && now - abnode->timeSDStarted > SDTIME) {
+	    bnode_StopProc(abnode->salsrvProc, SIGKILL);
+	    abnode->salsrvKillSent = 1;
+	    bozo_Log
+		("bos shutdown: salvageserver failed to shutdown within %d seconds\n",
+		 SDTIME);
+	}
+    }
     if (abnode->scanSDW) {
 	if (!abnode->scanKillSent && now - abnode->timeSDStarted > SDTIME) {
 	    bnode_StopProc(abnode->scanProc, SIGKILL);
@@ -449,15 +681,17 @@ fs_getstat(struct fsbnode *abnode, afs_int32 * astatus)
 {
     register afs_int32 temp;
     if (abnode->volSDW || abnode->fileSDW || abnode->salSDW
-	|| abnode->scanSDW)
+	|| abnode->scanSDW || abnode->salsrvSDW)
 	temp = BSTAT_SHUTTINGDOWN;
     else if (abnode->salRunning)
 	temp = BSTAT_NORMAL;
     else if (abnode->volRunning && abnode->fileRunning
-	     && (!abnode->scancmd || abnode->scanRunning))
+	     && (!abnode->scancmd || abnode->scanRunning)
+	     && (!abnode->salsrvcmd || abnode->salsrvRunning))
 	temp = BSTAT_NORMAL;
     else if (!abnode->salRunning && !abnode->volRunning
-	     && !abnode->fileRunning && !abnode->scanRunning)
+	     && !abnode->fileRunning && !abnode->scanRunning
+	     && !abnode->salsrvRunning)
 	temp = BSTAT_SHUTDOWN;
     else
 	temp = BSTAT_STARTINGUP;
@@ -508,6 +742,11 @@ fs_procexit(struct fsbnode *abnode, struct bnode_proc *aproc)
 	abnode->scanRunning = 0;
 	abnode->scanSDW = 0;
 	abnode->scanKillSent = 0;
+    } else if (aproc == abnode->salsrvProc) {
+	abnode->salsrvProc = 0;
+	abnode->salsrvRunning = 0;
+	abnode->salsrvSDW = 0;
+	abnode->salsrvKillSent = 0;
     }
 
     /* now restart anyone who needs to restart */
@@ -515,14 +754,15 @@ fs_procexit(struct fsbnode *abnode, struct bnode_proc *aproc)
 }
 
 /* make sure we're periodically checking the state if we need to */
-static int
+static void
 SetNeedsClock(register struct fsbnode *ab)
 {
     if (ab->b.goal == 1 && ab->fileRunning && ab->volRunning
-	&& (!ab->scancmd || ab->scanRunning))
+	&& (!ab->scancmd || ab->scanRunning)
+	&& (!ab->salsrvcmd || ab->salsrvRunning))
 	ab->needsClock = 0;	/* running normally */
     else if (ab->b.goal == 0 && !ab->fileRunning && !ab->volRunning
-	     && !ab->salRunning && !ab->scanRunning)
+	     && !ab->salRunning && !ab->scanRunning && !ab->salsrvRunning)
 	ab->needsClock = 0;	/* halted normally */
     else
 	ab->needsClock = 1;	/* other */
@@ -562,6 +802,18 @@ NudgeProcs(register struct fsbnode *abnode)
 		    abnode->volRunning = 1;
 		}
 	    }
+	    if (abnode->salsrvcmd) {
+		if (!abnode->salsrvRunning) {
+		    abnode->lastSalsrvStart = FT_ApproxTime();
+		    code =
+			bnode_NewProc(abnode, abnode->salsrvcmd, "salsrv",
+				      &tp);
+		    if (code == 0) {
+			abnode->salsrvProc = tp;
+			abnode->salsrvRunning = 1;
+		    }
+		}
+	    }
 	    if (abnode->scancmd) {
 		if (!abnode->scanRunning) {
 		    abnode->lastScanStart = FT_ApproxTime();
@@ -576,7 +828,8 @@ NudgeProcs(register struct fsbnode *abnode)
 	    }
 	} else {		/* file is not running */
 	    /* see how to start */
-	    if (!abnode->needsSalvage) {
+	    /* for demand attach fs, needsSalvage flag is ignored */
+	    if (!abnode->needsSalvage || abnode->salsrvcmd) {
 		/* no crash apparent, just start up normally */
 		if (!abnode->fileRunning) {
 		    abnode->lastFileStart = FT_ApproxTime();
@@ -596,6 +849,16 @@ NudgeProcs(register struct fsbnode *abnode)
 			abnode->volRunning = 1;
 		    }
 		}
+		if (abnode->salsrvcmd && !abnode->salsrvRunning) {
+		    abnode->lastSalsrvStart = FT_ApproxTime();
+		    code =
+			bnode_NewProc(abnode, abnode->salsrvcmd, "salsrv",
+				      &tp);
+		    if (code == 0) {
+			abnode->salsrvProc = tp;
+			abnode->salsrvRunning = 1;
+		    }
+		}
 		if (abnode->scancmd && !abnode->scanRunning) {
 		    abnode->lastScanStart = FT_ApproxTime();
 		    code =
@@ -656,6 +919,11 @@ NudgeProcs(register struct fsbnode *abnode)
 	    abnode->volSDW = 1;
 	    abnode->timeSDStarted = now;
 	}
+	if (abnode->salsrvRunning && !abnode->salsrvSDW) {
+	    bnode_StopProc(abnode->salsrvProc, SIGTERM);
+	    abnode->salsrvSDW = 1;
+	    abnode->timeSDStarted = now;
+	}
 	if (abnode->scanRunning && !abnode->scanSDW) {
 	    bnode_StopProc(abnode->scanProc, SIGTERM);
 	    abnode->scanSDW = 1;
@@ -724,3 +992,22 @@ fs_getparm(struct fsbnode *abnode, afs_int32 aindex, char *abuffer,
 	return BZDOM;
     return 0;
 }
+
+/* Copy command line aindex (0 fileserver, 1 volserver, 2 salvageserver,
+ * 3 salvager, 4 scanner if configured) into abuffer, which holds alen bytes.
+ * Returns 0, or BZDOM if there is no such entry or it would overflow abuffer. */
+static int
+dafs_getparm(struct fsbnode *abnode, afs_int32 aindex, char *abuffer,
+	     afs_int32 alen)
+{
+    char *parm = (aindex == 0) ? abnode->filecmd
+	: (aindex == 1) ? abnode->volcmd
+	: (aindex == 2) ? abnode->salsrvcmd
+	: (aindex == 3) ? abnode->salcmd
+	: (aindex == 4) ? abnode->scancmd : NULL;
+
+    if (parm == NULL || alen <= 0 || strlen(parm) >= (size_t)alen)
+	return BZDOM;
+    strcpy(abuffer, parm);	/* safe: length was checked against alen above */
+    return 0;
+}
diff --git a/src/cf/osconf.m4 b/src/cf/osconf.m4
index 9fe6161d8b..22daf81e3e 100644
--- a/src/cf/osconf.m4
+++ b/src/cf/osconf.m4
@@ -971,6 +971,18 @@ case $AFS_SYSNAME in
 	;;
 esac
 
+
+
+dnl pthreads fixes
+case $AFS_SYSNAME in
+dnl we'll go ahead and turn on XOPEN2K and ISO_C99
+dnl if this causes problems, we should scale back to _XOPEN_SOURCE=500
+	*linux*)
+		MT_CFLAGS="${MT_CFLAGS} -D_XOPEN_SOURCE=600 -D_BSD_SOURCE"
+	;;
+esac
+
+
 dnl Disable the default for debugging/optimization if not enabled
 if test "x$enable_debug_kernel" = "xno"; then
   KERN_DBG=
diff --git a/src/config/param.rs_aix51.h b/src/config/param.rs_aix51.h
index ecfe978c4e..cd49793bae 100644
--- a/src/config/param.rs_aix51.h
+++ b/src/config/param.rs_aix51.h
@@ -25,8 +25,6 @@
 #ifdef AFS_NAMEI_ENV
 #define AFS_64BIT_IOPS_ENV	1
 #endif
-#define BITMAP_LATER		1
-#define FAST_RESTART		1
 
 #define AFS_HAVE_FLOCK_SYSID    1
 
diff --git a/src/config/param.rs_aix52.h b/src/config/param.rs_aix52.h
index 0ee9986ec9..b20bb378dc 100644
--- a/src/config/param.rs_aix52.h
+++ b/src/config/param.rs_aix52.h
@@ -26,8 +26,6 @@
 #ifdef AFS_NAMEI_ENV
 #define AFS_64BIT_IOPS_ENV	1
 #endif
-#define BITMAP_LATER		1
-#define FAST_RESTART		1
 
 #define AFS_HAVE_FLOCK_SYSID    1
 
diff --git a/src/config/param.rs_aix53.h b/src/config/param.rs_aix53.h
index ba4f151f3c..ecfb3671a2 100644
--- a/src/config/param.rs_aix53.h
+++ b/src/config/param.rs_aix53.h
@@ -27,8 +27,6 @@
 #ifdef AFS_NAMEI_ENV
 #define AFS_64BIT_IOPS_ENV	1
 #endif
-#define BITMAP_LATER		1
-#define FAST_RESTART		1
 
 #define AFS_HAVE_FLOCK_SYSID    1
 
diff --git a/src/config/stds.h b/src/config/stds.h
index 7b256b6735..9266b0c7f6 100644
--- a/src/config/stds.h
+++ b/src/config/stds.h
@@ -56,8 +56,16 @@ typedef unsigned __int64 afs_uint64;
 typedef long long afs_int64;
 typedef unsigned long long afs_uint64;
 #endif
-#define ZeroInt64(a)       (a) = 0
+#define ZeroInt64(a)       ((a) = 0)
 #define AssignInt64(a, b) *(b) = (a) 
+#define IncInt64(a) (*(a))++
+#define IncUInt64(a) (*(a))++
+#define DecInt64(a) (*(a))--
+#define DecUInt64(a) (*(a))--
+#define GTInt64(a,b) ((a) > (b))
+#define GEInt64(a,b) ((a) >= (b))
+#define LEInt64(a,b) ((a) <= (b))
+#define LTInt64(a,b) ((a) < (b))
 #define AddInt64(a,b,c) *(c) = (afs_int64)(a) + (afs_int64)(b)
 #define AddUInt64(a,b,c) *(c) = (afs_uint64)(a) + (afs_uint64)(b)
 #define SubtractInt64(a,b,c) *(c) = (afs_int64)(a) - (afs_int64)(b)
@@ -83,8 +91,16 @@ struct u_Int64 {
     afs_uint32 low;
 };
 typedef struct u_Int64 afs_uint64;
-#define ZeroInt64(a) (a).high = (a).low = 0
+#define ZeroInt64(a) ((a).high = (a).low = 0)
 #define AssignInt64(a, b) (b)->high = (a).high; (b)->low = (a).low
+#define IncInt64(a) ((++((a)->low)) ? 0 : (a)->high++ )
+#define IncUInt64(a) ((++((a)->low)) ? 0 : (a)->high++ )
+#define DecInt64(a) (((a)->low)-- ? 0 : (a)->high-- )
+#define DecUInt64(a) (((a)->low)-- ? 0 : (a)->high-- )
+#define GTInt64(a,b) (((a).high > (b).high) || (((a).high == (b).high) && ((a).low > (b).low)))
+#define GEInt64(a,b) (((a).high > (b).high) || (((a).high == (b).high) && ((a).low >= (b).low)))
+#define LEInt64(a,b) (((a).high < (b).high) || (((a).high == (b).high) && ((a).low <= (b).low)))
+#define LTInt64(a,b) (((a).high < (b).high) || (((a).high == (b).high) && ((a).low < (b).low)))
 #define CompareInt64(a,b) (((afs_int32)(a).high - (afs_int32)(b).high) || (((a).high == (b).high) && ((a).low - (b).low))) 
 #define AddInt64(a, b, c) {  afs_int64 _a, _b; _a = a; _b = b; (c)->low = _a.low + _b.low; (c)->high = _a.high + _b.high + ((c)->low < _b.low); } 
 #define SubtractInt64(a, b, c) { afs_int64 _a, _b; _a = a; _b = b; (c)->low = _a.low - _b.low;  (c)->high = _a.high - _b.high - (_a.low < _b.low); } 
@@ -246,4 +262,9 @@ struct afsUUID {
 };
 typedef struct afsUUID afsUUID;
 
+/* for now, demand attach fileserver is only supported on unix pthreads builds */
+#if defined(DEMAND_ATTACH_ENABLE) && defined(AFS_PTHREAD_ENV) && !defined(AFS_NT40_ENV)
+#define AFS_DEMAND_ATTACH_FS 1
+#endif
+
 #endif /* OPENAFS_CONFIG_AFS_STDS_H */
diff --git a/src/rx/rx_queue.h b/src/rx/rx_queue.h
index fcd813c407..1e930a6765 100644
--- a/src/rx/rx_queue.h
+++ b/src/rx/rx_queue.h
@@ -78,6 +78,13 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {}
 #define _RXQSP(q1,q2,i,a,b,c,d,x,y) if (!queue_IsEnd(q1,i->c)) \
     (((y->b->a=q2->a)->b=y->b), ((x->a->b=q2)->a=x->a), ((i->c=q1)->d=i))
 
+/* This one moves a chain of elements from (s) to (e) from its
+ * current position to either before or after element (i)
+ * if (a,b,x,y) is (prev,next,s,e) then chain is moved before (i)
+ * if (a,b,x,y) is (next,prev,e,s) then chain is moved after (i) */
+#define _RXQMV(i, s, e, a, b, x, y) if (i->a != y) \
+    (((e->next->prev=s->prev)->next=e->next), ((i->a->b=x)->a=i->a), ((y->b=i)->a=y))
+
 /* Basic remove operation.  Doesn't update the queue item to indicate it's been removed */
 #define _RXQR(i) ((_RXQ(i)->prev->next=_RXQ(i)->next)->prev=_RXQ(i)->prev)
 
@@ -120,6 +127,12 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {}
 #define queue_Replace(q1,q2) if (queue_IsEmpty(q2)) queue_Init(q1); else \
     (*_RXQ(q1) = *_RXQ(q2), _RXQ(q1)->next->prev = _RXQ(q1)->prev->next = _RXQ(q1), queue_Init(q2))
 
+/* move a chain of elements beginning at (s) and ending at (e) before node (i) */
+#define queue_MoveChainBefore(i, s, e) _RXQMV(_RXQ(i),_RXQ(s),_RXQ(e),prev,next,_RXQ(s),_RXQ(e))
+
+/* move a chain of elements beginning at (s) and ending at (e) after node (i) */
+#define queue_MoveChainAfter(i, s, e) _RXQMV(_RXQ(i),_RXQ(s),_RXQ(e),next,prev,_RXQ(e),_RXQ(s))
+
 /* Remove a queue element (*i) from it's queue.  The next field is 0'd, so that any further use of this q entry will hopefully cause a core dump.  Multiple removes of the same queue item are not supported */
 #define queue_Remove(i) (_RXQR(i), _RXQ(i)->next = 0)
 
@@ -155,6 +168,10 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {}
 /* Returns false if the item was removed from a queue OR is uninitialized (zero) */
 #define queue_IsOnQueue(i) (_RXQ(i)->next != 0)
 
+/* Returns true if the item was removed from a queue OR is uninitialized (zero) */
+/* Return false if the queue item is currently in a queue */
+#define queue_IsNotOnQueue(i) (_RXQ(i)->next == 0)
+
 /* Returns true if the queue item (i) is the first element of the queue (q) */
 #define queue_IsFirst(q,i) (_RXQ(q)->first == _RXQ(i))
 
@@ -164,6 +181,9 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {}
 /* Returns true if the queue item (i) is the end of the queue (q), that is, i is the head of the queue */
 #define queue_IsEnd(q,i) (_RXQ(q) == _RXQ(i))
 
+/* Returns false if the queue item (i) is the end of the queue (q), that is, i is the head of the queue */
+#define queue_IsNotEnd(q,i) (_RXQ(q) != _RXQ(i))
+
 /* Prototypical loop to scan an entire queue forwards.  q is the queue
  * head, qe is the loop variable, next is a variable used to store the
  * queue entry for the next iteration of the loop, s is the user's
@@ -180,12 +200,24 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {}
 	!queue_IsEnd(q,	qe);				\
 	(qe) = (next), next = queue_Next(qe, s)
 
+/* similar to queue_Scan except start at element 'start' instead of the beginning */
+#define        queue_ScanFrom(q, start, qe, next, s)      \
+    (qe) = (struct s*)(start), next = queue_Next(qe, s);  \
+       !queue_IsEnd(q, qe);                               \
+       (qe) = (next), next = queue_Next(qe, s)
+
 /* This is similar to queue_Scan, but scans from the end of the queue to the beginning.  Next is the previous queue entry.  */
 #define	queue_ScanBackwards(q, qe, prev, s)		\
     (qe) = queue_Last(q, s), prev = queue_Prev(qe, s);	\
 	!queue_IsEnd(q,	qe);				\
 	(qe) = prev, prev = queue_Prev(qe, s)
 
+/* This is similar to queue_ScanBackwards, but start at element 'start' instead of the end.  Next is the previous queue entry.  */
+#define        queue_ScanBackwardsFrom(q, start, qe, prev, s)  \
+    (qe) = (struct s*)(start), prev = queue_Prev(qe, s);       \
+       !queue_IsEnd(q, qe);                                    \
+       (qe) = prev, prev = queue_Prev(qe, s)
+
 #define queue_Count(q, qe, nqe, s, n) 			\
     for (n=0, queue_Scan(q, qe, nqe, s), n++) {}
 #endif /* _RX_QUEUE_ */
diff --git a/src/tsalvaged/Makefile.in b/src/tsalvaged/Makefile.in
new file mode 100644
index 0000000000..1f4ccc6001
--- /dev/null
+++ b/src/tsalvaged/Makefile.in
@@ -0,0 +1,200 @@
+# Copyright 2000, International Business Machines Corporation and others.
+# All Rights Reserved.
+# 
+# This software has been released under the terms of the IBM Public
+# License.  For details, see the LICENSE file in the top-level source
+# directory or online at http://www.openafs.org/dl/license10.html
+#
+# Portions Copyright (c) 2003 Apple Computer, Inc.
+# Portions Copyright (c) 2006 Sine Nomine Associates
+
+srcdir=@srcdir@
+include @TOP_OBJDIR@/src/config/Makefile.config
+
+CC=${MT_CC}
+CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG -DFSSYNC_BUILD_CLIENT \
+	-DSALVSYNC_BUILD_SERVER -DSALVSYNC_BUILD_CLIENT
+
+CCRULE=${CC} ${CFLAGS} -c $?
+
+VICED=../viced
+VLSERVER=../vlserver
+LWP=../lwp
+LIBACL=../libacl
+UTIL=../util
+DIR=../dir
+VOL=../vol
+FSINT=../fsint
+
+SALVAGEDOBJS=salvaged.o vol-salvage.o physio.o
+
+DIROBJS=buffer.o dir.o salvage.o
+
+LWPOBJS=lock.o threadname.o
+
+UTILOBJS=assert.o uuid.o serverLog.o fileutil.o netutils.o dirpath.o volparse.o flipbase64.o softsig.o fstab.o
+
+VLIBOBJS=vnode.o volume.o vutil.o partition.o fssync-client.o \
+	 clone.o nuke.o devname.o listinodes.o ihandle.o \
+	 namei_ops.o salvsync-server.o salvsync-client.o \
+	 daemon_com.o
+
+OBJECTS= ${SALVAGEDOBJS} ${UTILOBJS} ${VLIBOBJS} ${DIROBJS} ${LWPOBJS}
+
+FSSDEBUG_OBJS = fssync-debug.o physio.o common.o ${UTILOBJS} ${VLIBOBJS} ${DIROBJS} ${LWPOBJS}
+
+SSSDEBUG_OBJS = salvsync-debug.o physio.o common.o ${UTILOBJS} ${VLIBOBJS} ${DIROBJS} ${LWPOBJS}
+
+LIBS=${TOP_LIBDIR}/libafsauthent.a ${TOP_LIBDIR}/libafsrpc.a ${TOP_LIBDIR}/util.a ${TOP_LIBDIR}/libcmd.a
+
+INSTALL_TARGS = ${DESTDIR}${afssrvlibexecdir}/salvageserver \
+		${DESTDIR}${afssrvsbindir}/fssync-debug \
+		${DESTDIR}${afssrvsbindir}/salvsync-debug
+
+DEST_TARGS =	${DEST}/root.server/usr/afs/bin/salvageserver \
+		${DEST}/root.server/usr/afs/bin/fssync-debug \
+		${DEST}/root.server/usr/afs/bin/salvsync-debug
+
+all: salvageserver fssync-debug salvsync-debug
+
+salvaged.o: ${VOL}/salvaged.c
+	${CCRULE}
+
+vol-salvage.o: ${VOL}/vol-salvage.c
+	${CCRULE}
+
+physio.o: ${VOL}/physio.c
+	${CCRULE}
+
+fssync-debug.o: ${VOL}/fssync-debug.c
+	${CCRULE}
+
+salvsync-debug.o: salvsync-debug.c
+	${CCRULE}
+
+assert.o: ${UTIL}/assert.c
+	${CCRULE}
+
+uuid.o: ${UTIL}/uuid.c
+	${CCRULE}
+
+serverLog.o: ${UTIL}/serverLog.c
+	${CCRULE}
+
+fileutil.o: ${UTIL}/fileutil.c
+	${CCRULE}
+
+volparse.o: ${UTIL}/volparse.c
+	${CCRULE}
+
+flipbase64.o: ${UTIL}/flipbase64.c
+	${CCRULE}
+
+netutils.o: ${UTIL}/netutils.c
+	${CCRULE}
+
+dirpath.o: ${UTIL}/dirpath.c
+	${CCRULE}
+
+softsig.o: ${UTIL}/softsig.c
+	${CCRULE}
+
+buffer.o: ${DIR}/buffer.c
+	${CCRULE}
+
+dir.o: ${DIR}/dir.c
+	${CCRULE}
+
+salvage.o: ${DIR}/salvage.c
+	${CCRULE}
+
+lock.o: ${LWP}/lock.c
+	${CCRULE}
+
+threadname.o: ${LWP}/threadname.c
+	${CCRULE}
+
+vnode.o: ${VOL}/vnode.c
+	${CCRULE}
+
+volume.o: ${VOL}/volume.c
+	${CCRULE}
+
+vutil.o: ${VOL}/vutil.c
+	${CCRULE}
+
+partition.o: ${VOL}/partition.c
+	${CCRULE}
+
+fssync-client.o: ${VOL}/fssync-client.c
+	${CCRULE}
+
+salvsync-server.o: ${VOL}/salvsync-server.c
+	${CCRULE}
+
+salvsync-client.o: ${VOL}/salvsync-client.c
+	${CCRULE}
+
+daemon_com.o: ${VOL}/daemon_com.c
+	${CCRULE}
+
+clone.o: ${VOL}/clone.c
+	${CCRULE}
+
+nuke.o: ${VOL}/nuke.c
+	${CCRULE}
+
+devname.o: ${VOL}/devname.c
+	${CCRULE}
+
+# only for darwin?
+fstab.o: ${UTIL}/fstab.c
+	${CCRULE}
+
+common.o: ${VOL}/common.c
+	${CCRULE}
+
+listinodes.o: ${VOL}/listinodes.c
+	${CCRULE}
+
+ihandle.o: ${VOL}/ihandle.c
+	${CCRULE}
+
+namei_ops.o: ${VOL}/namei_ops.c
+	${CCRULE}
+
+salvageserver: ${OBJECTS} ${LIBS}
+	${CC} ${LDFLAGS} -o salvageserver ${OBJECTS} ${LIBS} ${MT_LIBS} ${XLIBS}
+
+fssync-debug: ${FSSDEBUG_OBJS} ${LIBS}
+	${CC} ${LDFLAGS} -o fssync-debug ${FSSDEBUG_OBJS} ${LIBS} ${MT_LIBS} ${XLIBS}
+
+salvsync-debug: ${SSSDEBUG_OBJS} ${LIBS}
+	${CC} ${LDFLAGS} -o salvsync-debug ${SSSDEBUG_OBJS} ${LIBS} ${MT_LIBS} ${XLIBS}
+
+${DEST}/root.server/usr/afs/bin/salvageserver: salvageserver
+	${INSTALL} -ns $? $@
+
+${DEST}/root.server/usr/afs/bin/fssync-debug: fssync-debug
+	${INSTALL} -s $? $@
+
+${DEST}/root.server/usr/afs/bin/salvsync-debug: salvsync-debug
+	${INSTALL} -s $? $@
+
+install: ${INSTALL_TARGS}
+
+clean:
+	$(RM) -f *.o salvageserver fssync-debug salvsync-debug core AFS_component_version_number.c
+
+include ../config/Makefile.version
+
+${DESTDIR}${afssrvlibexecdir}/salvageserver: salvageserver
+	${INSTALL} -ns $? $@
+
+${DESTDIR}${afssrvsbindir}/fssync-debug: fssync-debug
+	${INSTALL} -s $? $@
+
+${DESTDIR}${afssrvsbindir}/salvsync-debug: salvsync-debug
+	${INSTALL} -s $? $@
+
+dest: ${DEST_TARGS}
diff --git a/src/tsalvaged/salvsync-debug.c b/src/tsalvaged/salvsync-debug.c
new file mode 100644
index 0000000000..4d4949aff2
--- /dev/null
+++ b/src/tsalvaged/salvsync-debug.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/* Main program file. Define globals. */
+#define MAIN 1
+
+/*
+ * salvsync debug tool
+ */
+
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <errno.h>
+#ifdef AFS_NT40_ENV
+#include <io.h>
+#include <WINNT/afsevent.h>
+#else
+#include <sys/param.h>
+#include <sys/file.h>
+#ifndef ITIMER_REAL
+#include <sys/time.h>
+#endif /* ITIMER_REAL */
+#endif
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include <afs/assert.h>
+
+
+#include <fcntl.h>
+
+#ifndef AFS_NT40_ENV
+#include <afs/osi_inode.h>
+#endif
+
+#include <afs/cmd.h>
+#include <afs/afsutil.h>
+#include <afs/fileutil.h>
+
+#include "nfs.h"
+#include "lwp.h"
+#include "lock.h"
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include "daemon_com.h"
+#include "salvsync.h"
+#ifdef AFS_NT40_ENV
+#include <pthread.h>
+#endif
+
+int VolumeChanged; /* hack to make dir package happy */
+
+
+#ifndef AFS_DEMAND_ATTACH_FS
+int
+main(int argc, char ** argv)
+{
+    fputs("*** salvsync-debug is only supported for OpenAFS builds with the demand-attach fileserver extension\n", stderr);
+    return -1;	/* stub for builds without AFS_DEMAND_ATTACH_FS: always fails */
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+
+struct salv_state {
+    afs_uint32 prio;		/* value of -priority; 0 when not given */
+    afs_uint32 volume;		/* value of -volumeid */
+    char partName[16];		/* value of -partition; empty when not given */
+};
+
+struct state {
+    afs_int32 reason;		/* value of -reason; sent as the sync reason code */
+    struct salv_state * sop;	/* per-volume parameters read by do_salvop() */
+};
+
+static int common_prolog(struct cmd_syndesc *, struct state *);
+static int common_salv_prolog(struct cmd_syndesc *, struct state *);
+
+static int do_salvop(struct state *, afs_int32 command, SYNC_response * res);
+
+static char * response_code_to_string(afs_int32);
+static char * command_code_to_string(afs_int32);
+static char * reason_code_to_string(afs_int32);
+static char * program_type_to_string(afs_int32);
+static char * state_code_to_string(afs_int32);
+
+
+static int OpStats(struct cmd_syndesc * as, char * rock);
+static int OpSalvage(struct cmd_syndesc * as, char * rock);
+static int OpCancel(struct cmd_syndesc * as, char * rock);
+static int OpCancelAll(struct cmd_syndesc * as, char * rock);
+static int OpRaisePrio(struct cmd_syndesc * as, char * rock);
+static int OpQuery(struct cmd_syndesc * as, char * rock);
+
+
+#ifndef AFS_NT40_ENV
+#include "AFS_component_version_number.c"
+#endif
+#define MAX_ARGS 128
+
+#define COMMON_PARMS_OFFSET    13
+#define COMMON_PARMS(ts) \
+    cmd_Seek(ts, COMMON_PARMS_OFFSET); \
+    cmd_AddParm(ts, "-reason", CMD_SINGLE, CMD_OPTIONAL, "sync protocol reason code"); \
+    cmd_AddParm(ts, "-programtype", CMD_SINGLE, CMD_OPTIONAL, "program type code")
+
+#define COMMON_SALV_PARMS_OFFSET    10
+#define COMMON_SALV_PARMS(ts) \
+    cmd_Seek(ts, COMMON_SALV_PARMS_OFFSET); \
+    cmd_AddParm(ts, "-volumeid", CMD_SINGLE, 0, "volume id"); \
+    cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL, "partition name"); \
+    cmd_AddParm(ts, "-priority", CMD_SINGLE, CMD_OPTIONAL, "priority")
+
+#define SALV_PARMS_DECL(ts) \
+    COMMON_SALV_PARMS(ts); \
+    COMMON_PARMS(ts)
+
+#define COMMON_PARMS_DECL(ts) \
+    COMMON_PARMS(ts)
+
+int
+main(int argc, char **argv)
+{
+    struct cmd_syndesc *ts;
+    int err;
+
+    /* Initialize directory paths; exit with status 2 if they cannot be obtained */
+    if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) {
+#ifdef AFS_NT40_ENV
+	ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0);
+#endif
+	fprintf(stderr, "%s: Unable to obtain AFS server directory.\n",
+		argv[0]);
+	exit(2);
+    }
+
+    /* One syntax entry per SALVSYNC opcode.  stats and kill take only the
+     * common options; the other subcommands also take the per-volume ones
+     * (-volumeid, -partition, -priority). */
+    ts = cmd_CreateSyntax("stats", OpStats, 0, "get salvageserver statistics (SALVSYNC_NOP opcode)");
+    COMMON_PARMS_DECL(ts);
+    cmd_CreateAlias(ts, "nop");
+
+    ts = cmd_CreateSyntax("salvage", OpSalvage, 0, "schedule a salvage (SALVSYNC_SALVAGE opcode)");
+    SALV_PARMS_DECL(ts);
+
+    ts = cmd_CreateSyntax("cancel", OpCancel, 0, "cancel a salvage (SALVSYNC_CANCEL opcode)");
+    SALV_PARMS_DECL(ts);
+
+    ts = cmd_CreateSyntax("raiseprio", OpRaisePrio, 0, "raise a salvage priority (SALVSYNC_RAISEPRIO opcode)");
+    SALV_PARMS_DECL(ts);
+    cmd_CreateAlias(ts, "rp");
+
+    ts = cmd_CreateSyntax("query", OpQuery, 0, "query salvage status (SALVSYNC_QUERY opcode)");
+    SALV_PARMS_DECL(ts);
+    cmd_CreateAlias(ts, "qry");
+
+    ts = cmd_CreateSyntax("kill", OpCancelAll, 0, "cancel all scheduled salvages (SALVSYNC_CANCELALL opcode)");
+    COMMON_PARMS_DECL(ts);
+
+    err = cmd_Dispatch(argc, argv);
+    exit(err);
+}
+
+static int
+common_prolog(struct cmd_syndesc * as, struct state * state)
+{
+    register struct cmd_item *ti;
+    /* setup shared by every subcommand; fills state->reason and programType */
+#ifdef AFS_NT40_ENV
+    if (afs_winsockInit() < 0) {
+	Exit(1);
+    }
+#endif
+
+    VInitVolumePackage(debugUtility, 1, 1, DONT_CONNECT_FS, 0);
+    DInit(1);
+
+    /* The callers pass an uninitialized struct state, so always store a
+     * reason: the -reason value if given, SALVSYNC_WHATEVER otherwise. */
+    ti = as->parms[COMMON_PARMS_OFFSET].items;	/* -reason */
+    state->reason = ti ? atoi(ti->data) : SALVSYNC_WHATEVER;
+    if ((ti = as->parms[COMMON_PARMS_OFFSET+1].items)) {	/* -programtype */
+	if (!strcmp(ti->data, "fileServer")) {
+	    programType = fileServer;
+	} else if (!strcmp(ti->data, "volumeUtility")) {
+	    programType = volumeUtility;
+	} else if (!strcmp(ti->data, "salvager")) {
+	    programType = salvager;
+	} else if (!strcmp(ti->data, "salvageServer")) {
+	    programType = salvageServer;
+	} else {
+	    programType = (ProgramType) atoi(ti->data);
+	}
+    }
+
+    VConnectSALV();
+
+    return 0;
+}
+
+static int
+common_salv_prolog(struct cmd_syndesc * as, struct state * state)
+{
+    register struct cmd_item *ti;
+
+    /* Allocate state->sop and fill it from -volumeid, -partition and -priority.
+     * calloc() zero-fills, so options that were not given stay 0 / "". */
+    state->sop = (struct salv_state *) calloc(1, sizeof(struct salv_state));
+    assert(state->sop != NULL);
+
+    /* volume ids and priorities are unsigned 32-bit values, so parse them with
+     * strtoul(); atoi() cannot represent anything above INT_MAX */
+    if ((ti = as->parms[COMMON_SALV_PARMS_OFFSET].items)) {	/* -volumeid */
+	state->sop->volume = (afs_uint32) strtoul(ti->data, NULL, 10);
+    } else {
+	/* reported only: this function still returns 0, with volume id 0 */
+	fprintf(stderr, "required argument -volumeid not given\n");
+    }
+
+    if ((ti = as->parms[COMMON_SALV_PARMS_OFFSET+1].items)) {	/* -partition */
+	strlcpy(state->sop->partName, ti->data, sizeof(state->sop->partName));
+    }
+
+    if ((ti = as->parms[COMMON_SALV_PARMS_OFFSET+2].items)) {	/* -priority */
+	state->sop->prio = (afs_uint32) strtoul(ti->data, NULL, 10);
+    }
+
+    return 0;
+}
+
+static int
+do_salvop(struct state * state, afs_int32 command, SYNC_response * res)
+{
+    afs_int32 code;
+    SALVSYNC_response_hdr hdr_l, *hdr;
+    SYNC_response res_l;
+    /* send one SALVSYNC request built from state->sop and print the response */
+    if (!res) {
+	/* No caller buffer: use local storage, zeroed so the report below never
+	 * prints indeterminate values if the call fails before filling it in. */
+	memset(&res_l, 0, sizeof(res_l));
+	memset(&hdr_l, 0, sizeof(hdr_l));
+	res = &res_l;
+	res->payload.len = sizeof(hdr_l);
+	res->payload.buf = hdr = &hdr_l;
+    } else {
+	hdr = (SALVSYNC_response_hdr *) res->payload.buf;
+    }
+
+    fprintf(stderr, "calling SALVSYNC_SalvageVolume with command code %d (%s)\n", 
+	    command, command_code_to_string(command));
+
+    code = SALVSYNC_SalvageVolume(state->sop->volume, state->sop->partName,
+				  command, state->reason, state->sop->prio,
+				  res);
+
+    if (code != SYNC_OK && code != SYNC_DENIED) {
+	fprintf(stderr, "possible sync protocol error. return code was %d\n", code);
+    }
+
+    fprintf(stderr, "SALVSYNC_SalvageVolume returned %d (%s)\n", code, response_code_to_string(code));
+    fprintf(stderr, "protocol response code was %d (%s)\n", 
+	    res->hdr.response, response_code_to_string(res->hdr.response));
+    fprintf(stderr, "protocol reason code was %d (%s)\n", 
+	    res->hdr.reason, reason_code_to_string(res->hdr.reason));
+
+    /* state and prio are printed only when the response has
+     * SALVSYNC_FLAG_VOL_STATS_VALID set; the queue lengths are always printed */
+    printf("state = {\n");
+    if (res->hdr.flags & SALVSYNC_FLAG_VOL_STATS_VALID) {
+	printf("\tstate = %d (%s)\n",
+	       hdr->state, state_code_to_string(hdr->state));
+	printf("\tprio = %d\n", hdr->prio);
+    }
+    printf("\tsq_len = %d\n", hdr->sq_len);
+    printf("\tpq_len = %d\n", hdr->pq_len);
+    printf("}\n");
+
+    VDisconnectSALV();
+    return code;	/* previously fell off the end of this int function */
+}
+
+/*
+ * Translate a SYNC protocol response code into its symbolic name for
+ * display; codes without an entry below come back as "**UNKNOWN**".
+ */
+static char *
+response_code_to_string(afs_int32 response)
+{
+    char * name = "**UNKNOWN**";
+
+    switch (response) {
+    case SYNC_OK:          name = "SYNC_OK";          break;
+    case SYNC_DENIED:      name = "SYNC_DENIED";      break;
+    case SYNC_COM_ERROR:   name = "SYNC_COM_ERROR";   break;
+    case SYNC_BAD_COMMAND: name = "SYNC_BAD_COMMAND"; break;
+    case SYNC_FAILED:      name = "SYNC_FAILED";      break;
+    }
+    return name;
+}
+
+/*
+ * Translate a sync command (opcode) into its symbolic name for display;
+ * opcodes without an entry below come back as "**UNKNOWN**".
+ */
+static char *
+command_code_to_string(afs_int32 command)
+{
+    char * name = "**UNKNOWN**";
+
+    switch (command) {
+    case SYNC_COM_CHANNEL_CLOSE: name = "SYNC_COM_CHANNEL_CLOSE"; break;
+    case SALVSYNC_NOP:           name = "SALVSYNC_NOP";           break;
+    case SALVSYNC_SALVAGE:       name = "SALVSYNC_SALVAGE";       break;
+    case SALVSYNC_CANCEL:        name = "SALVSYNC_CANCEL";        break;
+    case SALVSYNC_RAISEPRIO:     name = "SALVSYNC_RAISEPRIO";     break;
+    case SALVSYNC_QUERY:         name = "SALVSYNC_QUERY";         break;
+    /* the string now matches the constant; it used to read "SALVSYNC_CANCELLALL" */
+    case SALVSYNC_CANCELALL:     name = "SALVSYNC_CANCELALL";     break;
+    }
+
+    return name;
+}
+
+/*
+ * Translate a SALVSYNC reason code into its symbolic name for display;
+ * codes without an entry below come back as "**UNKNOWN**".
+ */
+static char *
+reason_code_to_string(afs_int32 reason)
+{
+    char * name = "**UNKNOWN**";
+
+    switch (reason) {
+    case SALVSYNC_WHATEVER: name = "SALVSYNC_WHATEVER"; break;
+    case SALVSYNC_ERROR:    name = "SALVSYNC_ERROR";    break;
+    case SALVSYNC_OPERATOR: name = "SALVSYNC_OPERATOR"; break;
+    case SALVSYNC_SHUTDOWN: name = "SALVSYNC_SHUTDOWN"; break;
+    case SALVSYNC_NEEDED:   name = "SALVSYNC_NEEDED";   break;
+    }
+    return name;
+}
+
+/* Translate a ProgramType value into its name for display; values without
+ * an entry below come back as "**UNKNOWN**".
+ */
+static char *
+program_type_to_string(afs_int32 type)
+{
+    char * name = "**UNKNOWN**";
+
+    switch ((ProgramType)type) {
+    case fileServer:    name = "fileServer";    break;
+    case volumeUtility: name = "volumeUtility"; break;
+    case salvager:      name = "salvager";      break;
+    case salvageServer: name = "salvageServer"; break;
+    }
+    return name;
+}
+
+/*
+ * Translate a SALVSYNC_STATE_* value into its symbolic name for display;
+ * values without an entry below come back as "**UNKNOWN**".
+ */
+static char *
+state_code_to_string(afs_int32 state)
+{
+    char * name = "**UNKNOWN**";
+
+    switch (state) {
+    case SALVSYNC_STATE_UNKNOWN:   name = "SALVSYNC_STATE_UNKNOWN";   break;
+    case SALVSYNC_STATE_QUEUED:    name = "SALVSYNC_STATE_QUEUED";    break;
+    case SALVSYNC_STATE_SALVAGING: name = "SALVSYNC_STATE_SALVAGING"; break;
+    case SALVSYNC_STATE_ERROR:     name = "SALVSYNC_STATE_ERROR";     break;
+    case SALVSYNC_STATE_DONE:      name = "SALVSYNC_STATE_DONE";      break;
+    }
+    return name;
+}
+
+static int
+OpStats(struct cmd_syndesc * as, char * rock)
+{
+    struct state state;
+    struct salv_state sop = { 0 };
+    /* "stats" registers only the common options, so skip common_salv_prolog()
+     * (it would complain that -volumeid is missing) and send an all-zero sop. */
+    common_prolog(as, &state);
+    state.sop = &sop;
+    do_salvop(&state, SALVSYNC_NOP, NULL);
+    return 0;
+}
+
+static int
+OpSalvage(struct cmd_syndesc * as, char * rock)
+{
+    /* "salvage" handler: parse the common and per-volume options, then send
+     * SALVSYNC_SALVAGE and print the response.  rock is unused. */
+    struct state st;
+
+    common_prolog(as, &st);
+    common_salv_prolog(as, &st);
+    do_salvop(&st, SALVSYNC_SALVAGE, NULL);
+    return 0;
+}
+
+static int
+OpCancel(struct cmd_syndesc * as, char * rock)
+{
+    /* "cancel" handler: parse the common and per-volume options, then send
+     * SALVSYNC_CANCEL and print the response.  rock is unused. */
+    struct state st;
+
+    common_prolog(as, &st);
+    common_salv_prolog(as, &st);
+    do_salvop(&st, SALVSYNC_CANCEL, NULL);
+    return 0;
+}
+
+static int
+OpCancelAll(struct cmd_syndesc * as, char * rock)
+{
+    struct state state;
+    struct salv_state sop = { 0 };
+    /* "kill" registers only the common options, so skip common_salv_prolog()
+     * (it would complain that -volumeid is missing) and send an all-zero sop. */
+    common_prolog(as, &state);
+    state.sop = &sop;
+    do_salvop(&state, SALVSYNC_CANCELALL, NULL);
+    return 0;
+}
+
+static int
+OpRaisePrio(struct cmd_syndesc * as, char * rock)
+{
+    /* "raiseprio" handler: parse the common and per-volume options, then send
+     * SALVSYNC_RAISEPRIO and print the response.  rock is unused. */
+    struct state st;
+
+    common_prolog(as, &st);
+    common_salv_prolog(as, &st);
+    do_salvop(&st, SALVSYNC_RAISEPRIO, NULL);
+    return 0;
+}
+
+static int
+OpQuery(struct cmd_syndesc * as, char * rock)
+{
+    /* "query" handler: parse the common and per-volume options, then send
+     * SALVSYNC_QUERY and print the response.  rock is unused. */
+    struct state st;
+
+    common_prolog(as, &st);
+    common_salv_prolog(as, &st);
+    do_salvop(&st, SALVSYNC_QUERY, NULL);
+    return 0;
+}
+
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/tviced/Makefile.in b/src/tviced/Makefile.in
index b10e1a4ca8..68363fc543 100644
--- a/src/tviced/Makefile.in
+++ b/src/tviced/Makefile.in
@@ -11,7 +11,7 @@ srcdir=@srcdir@
 include @TOP_OBJDIR@/src/config/Makefile.config
 
 CC=${MT_CC}
-CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG
+CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG -DFSSYNC_BUILD_SERVER -DSALVSYNC_BUILD_CLIENT
 
 CCRULE=${CC} ${CFLAGS} -c $?
 
@@ -24,7 +24,7 @@ DIR=../dir
 VOL=../vol
 FSINT=../fsint
 
-VICEDOBJS=viced.o afsfileprocs.o host.o physio.o callback.o	
+VICEDOBJS=viced.o afsfileprocs.o host.o physio.o callback.o serialize_state.o	
 
 VLSERVEROBJS=vldbint.cs.o vldbint.xdr.o
 
@@ -36,18 +36,20 @@ UTILOBJS=assert.o uuid.o serverLog.o fileutil.o netutils.o dirpath.o volparse.o
 
 DIROBJS=buffer.o dir.o salvage.o
 
-VOLOBJS= vnode.o volume.o vutil.o partition.o fssync.o purge.o \
+VOLOBJS= vnode.o volume.o vutil.o partition.o fssync-server.o \
 	 clone.o devname.o common.o ihandle.o listinodes.o namei_ops.o \
-	 fstab.o
+	 fstab.o salvsync-client.o daemon_com.o
 
 FSINTOBJS= afsaux.o afscbint.cs.o afsint.ss.o afsint.xdr.o
 
 objects= ${VICEDOBJS} ${VLSERVEROBJS} ${LWPOBJS} ${LIBACLOBJS} \
 	 ${UTILOBJS} ${DIROBJS} ${VOLOBJS} ${FSINTOBJS}
 
+SDBGOBJS = state_analyzer.o uuid.o dirpath.o fileutil.o ${TOP_LIBDIR}/util.a
+
 LIBS=${TOP_LIBDIR}/libafsauthent.a ${TOP_LIBDIR}/libafsrpc.a ${TOP_LIBDIR}/util.a
 
-all: fileserver
+all: fileserver state_analyzer
 
 viced.o: ${VICED}/viced.c
 	${CCRULE}
@@ -64,6 +66,9 @@ physio.o: ${VICED}/physio.c
 callback.o: ${VICED}/callback.c
 	${CCRULE}
 
+serialize_state.o: ./serialize_state.c
+	${CCRULE}
+
 assert.o: ${UTIL}/assert.c
 	${CCRULE}
 
@@ -130,10 +135,16 @@ vutil.o: ${VOL}/vutil.c
 partition.o: ${VOL}/partition.c
 	${CCRULE}
 
-fssync.o: ${VOL}/fssync.c
+fssync-server.o: ${VOL}/fssync-server.c
 	${CCRULE}
 
-purge.o: ${VOL}/purge.c
+fssync-client.o: ${VOL}/fssync-client.c
+	${CCRULE}
+
+salvsync-client.o: ${VOL}/salvsync-client.c
+	${CCRULE}
+
+daemon_com.o: ${VOL}/daemon_com.c
 	${CCRULE}
 
 clone.o: ${VOL}/clone.c
@@ -179,21 +190,33 @@ afsint.ss.o: ${FSINT}/afsint.ss.c
 afsint.xdr.o: ${FSINT}/afsint.xdr.c
 	${CCRULE}
 
+state_analyzer.o: state_analyzer.c
+	${CCRULE}
+
 fileserver: ${objects} ${LIBS}
 	${CC} ${LDFLAGS} -o fileserver ${objects} ${LIBS} ${MT_LIBS} ${XLIBS}
 
+state_analyzer: ${SDBGOBJS}
+	${CC} ${LDFLAGS} -o state_analyzer ${SDBGOBJS} ${MT_LIBS} ${XLIBS}
+
 ${DEST}/root.server/usr/afs/bin/fileserver: fileserver
 	${INSTALL} -ns $? $@
 
-install: ${DESTDIR}${afssrvlibexecdir}/fileserver
+${DEST}/root.server/usr/afs/bin/state_analyzer: state_analyzer
+	${INSTALL} $? $@
+
+install: ${DESTDIR}${afssrvlibexecdir}/fileserver ${DESTDIR}${afssrvsbindir}/state_analyzer
 
 clean:
-	$(RM) -f *.o fileserver core AFS_component_version_number.c
+	$(RM) -f *.o fileserver state_analyzer core AFS_component_version_number.c
 
 include ../config/Makefile.version
 
 ${DESTDIR}${afssrvlibexecdir}/fileserver: fileserver
 	${INSTALL} -ns $? $@
 
-dest: ${DEST}/root.server/usr/afs/bin/fileserver
+${DESTDIR}${afssrvsbindir}/state_analyzer: state_analyzer
+	${INSTALL} $? $@
+
+dest: ${DEST}/root.server/usr/afs/bin/fileserver ${DEST}/root.server/usr/afs/bin/state_analyzer
 
diff --git a/src/tviced/NTMakefile b/src/tviced/NTMakefile
index e9e2c270e9..e58c5cc226 100644
--- a/src/tviced/NTMakefile
+++ b/src/tviced/NTMakefile
@@ -5,7 +5,7 @@
 # License.  For details, see the LICENSE file in the top-level source
 # directory or online at http://www.openafs.org/dl/license10.html
 
-AFSDEV_AUXCDEFINES = -DAFS_PTHREAD_ENV -DRXDEBUG
+AFSDEV_AUXCDEFINES = -DAFS_PTHREAD_ENV -DRXDEBUG -DFSSYNC_BUILD_SERVER
 
 RELDIR=tviced
 !INCLUDE ..\config\NTMakefile.$(SYS_NAME)
diff --git a/src/tviced/serialize_state.c b/src/tviced/serialize_state.c
new file mode 100644
index 0000000000..c1b4583153
--- /dev/null
+++ b/src/tviced/serialize_state.c
@@ -0,0 +1,1120 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * demand attach fs
+ * fileserver state serialization
+ */
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <stdio.h>
+#include <stdlib.h>		/* for malloc() */
+#include <time.h>		/* ANSI standard location for time stuff */
+#ifdef AFS_NT40_ENV
+#include <fcntl.h>
+#include <io.h>
+#else
+#include <sys/time.h>
+#include <sys/file.h>
+#endif
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+#include <afs/assert.h>
+#include <sys/stat.h>
+
+#include <afs/stds.h>
+
+#include <rx/xdr.h>
+#include <lwp.h>
+#include <lock.h>
+#include <afs/afsint.h>
+#include <afs/rxgen_consts.h>
+#include <afs/nfs.h>
+#include <afs/errors.h>
+#include <afs/ihandle.h>
+#include <afs/vnode.h>
+#include <afs/volume.h>
+#include <afs/acl.h>
+#include <afs/ptclient.h>
+#include <afs/prs_fs.h>
+#include <afs/auth.h>
+#include <afs/afsutil.h>
+#include <rx/rx.h>
+#include <afs/cellconfig.h>
+#include <stdlib.h>
+
+#include "../viced/viced_prototypes.h"
+#include "../viced/viced.h"
+#include "../viced/host.h"
+#include "../viced/callback.h"
+#include "serialize_state.h"
+
+/*@+fcnmacros +macrofcndecl@*/
+#ifdef O_LARGEFILE
+#ifdef S_SPLINT_S
+extern off64_t afs_lseek(int FD, off64_t O, int F);
+#endif /*S_SPLINT_S */
+#define afs_lseek(FD, O, F)	lseek64(FD, (off64_t)(O), F)
+#define afs_stat		stat64
+#define afs_fstat		fstat64
+#define afs_open		open64
+#define afs_fopen		fopen64
+#define afs_ftruncate           ftruncate64
+#define afs_mmap                mmap64
+#ifdef AFS_AIX_ENV
+extern void * mmap64();  /* ugly hack since aix build env appears to be somewhat broken */
+#endif
+#else /* !O_LARGEFILE */
+#ifdef S_SPLINT_S
+extern off_t afs_lseek(int FD, off_t O, int F);
+#endif /*S_SPLINT_S */
+#define afs_lseek(FD, O, F)	lseek(FD, (off_t)(O), F)
+#define afs_stat		stat
+#define afs_fstat		fstat
+#define afs_open		open
+#define afs_fopen		fopen
+#define afs_ftruncate           ftruncate
+#define afs_mmap                mmap
+#endif /* !O_LARGEFILE */
+/*@=fcnmacros =macrofcndecl@*/
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+
+/*
+ * demand attach fs
+ * state dump routines
+ *
+ * in order to make state dump/restore as fast as possible,
+ * we use memory mapped files
+ *
+ * if this causes problems on certain platforms, the APIs
+ * have been written so that it will be very simple to go
+ * back to standard I/O for just those poorly written platforms
+ */
+#define FS_STATE_USE_MMAP
+
+
+#ifdef FS_STATE_USE_MMAP
+#define FS_STATE_INIT_FILESIZE (8 * 1024 * 1024)  /* truncate to 8MB initially */
+#include <sys/mman.h>
+#endif
+
+static int fs_stateCreateDump(struct fs_dump_state * state);
+static int fs_stateLoadDump(struct fs_dump_state * state);
+static int fs_stateInvalidateDump(struct fs_dump_state * state);
+static int fs_stateCommitDump(struct fs_dump_state * state);
+static int fs_stateCloseDump(struct fs_dump_state * state);
+
+#ifdef FS_STATE_USE_MMAP
+static int fs_stateSizeFile(struct fs_dump_state * state);
+static int fs_stateResizeFile(struct fs_dump_state * state, size_t min_add);
+static int fs_stateTruncateFile(struct fs_dump_state * state);
+
+static int fs_stateMapFile(struct fs_dump_state * state);
+static int fs_stateUnmapFile(struct fs_dump_state * state);
+
+static int fs_stateIncCursor(struct fs_dump_state * state, size_t len);
+static int fs_stateCheckIOSafety(struct fs_dump_state * state,
+				 size_t len);
+#endif
+
+static int fs_stateFillHeader(struct fs_state_header * hdr);
+static int fs_stateCheckHeader(struct fs_state_header * hdr);
+
+static int fs_stateAlloc(struct fs_dump_state * state);
+static int fs_stateFree(struct fs_dump_state * state);
+
+extern afsUUID FS_HostUUID;
+extern char cml_version_number[];
+
+/*
+ * demand attach fs: save all fileserver state (host and callback state).
+ * returns 0 on success, 1 if any step or consistency check failed.
+ */
+int
+fs_stateSave(void)
+{
+    int ret = 0, verified = 1;
+    struct fs_dump_state state;
+
+    /* save and restore need to be atomic wrt other host package operations */
+    H_LOCK; 
+
+    ViceLog(0, ("fs_stateSave: commencing fileserver state dump\n"));
+
+    if (fs_stateAlloc(&state)) {
+	ViceLog(0, ("fs_stateSave: memory allocation failed; dump aborted\n"));
+	ret = 1;
+	goto done;
+    }
+
+    /* XXX
+     * on busy servers, these checks will inevitably fail since stuff drops H_LOCK
+     * all over the place (with structs left in inconsistent states) while RPCs to
+     * clients happen (grumble, grumble, the host package needs to be rewritten...)
+     *
+     * the current hack is to force the background threads that deal with host and
+     * callback state offline early in the shutdown process, do VShutdown, come
+     * back and wait for those threads to die, THEN do the state dump
+     *
+     * BUT, this still has one flaw -- what do we do about rx worker threads that
+     * are blocked in the host package making an RPC call to a cm???
+     *
+     * perhaps we need a refcounter that keeps track of threads blocked in rpc calls
+     * with H_LOCK dropped (and the host struct likely left in an inconsistent state)
+     *
+     * or better yet, we need to associate a state machine with each host object
+     * (kind of like demand attach Volume structures).
+     *
+     * sigh. I suspect we'll need to revisit this issue
+     */
+
+    if (fs_state.options.fs_state_verify_before_save) {
+	ViceLog(0, ("fs_stateSave: performing internal consistency checks before proceeding with state dump\n"));
+
+	if (h_stateVerify(&state)) {
+	    ViceLog(0, ("fs_stateSave: error: host table consistency checks failed; state dump will not be marked clean\n"));
+	    verified = 0;
+	    ret = 1;
+	}
+
+	if (cb_stateVerify(&state)) {
+	    ViceLog(0, ("fs_stateSave: error: callback table consistency checks failed; state dump will not be marked clean\n"));
+	    verified = 0;
+	    ret = 1;
+	}
+
+	/* if a consistency check asserted the bail flag, reset it */
+	state.bail = 0;
+
+	ViceLog(0, ("fs_stateSave: proceeding with dump\n"));
+    }
+
+    if (fs_stateCreateDump(&state)) {
+	ViceLog(0, ("fs_stateSave: error: dump create failed\n"));
+	ret = 1;
+	goto done;
+    }
+
+    if (h_stateSave(&state)) {
+	ViceLog(0, ("fs_stateSave: error: host state dump failed\n"));
+	ret = 1;
+	goto done;
+    }
+
+    if (cb_stateSave(&state)) {
+	ViceLog(0, ("fs_stateSave: error: callback state dump failed\n"));
+	ret = 1;
+	goto done;
+    }
+    /* fs_stateCommitDump() writes the header with valid = 0 when bail is set */
+    if (!verified) {
+	state.bail = 1;
+    }
+
+    if (fs_stateCommitDump(&state)) {
+	ViceLog(0, ("fs_stateSave: error: dump commit failed\n"));
+	ret = 1; 
+	goto done;
+    }
+
+    if (verified) {
+	ViceLog(0, ("fs_stateSave: fileserver state dump completed successfully\n"));
+    } else {
+	ViceLog(0, ("fs_stateSave: fileserver state dump completed, but not marked clean.\n"));
+	ViceLog(0, ("fs_stateSave: please save a copy of '%s' for use by technical support\n",
+		    state.fn));
+    }
+
+ done:
+    if (state.fd >= 0)
+	fs_stateCloseDump(&state);
+    fs_stateFree(&state);
+    H_UNLOCK;
+    return ret;
+}
+
+/*
+ * demand attach fs
+ * restore all fileserver state
+ *
+ * this function must appear as one atomic operation to the host and callback
+ * packages, hence H_LOCK is held for the entirety of the process.
+ */
+int
+fs_stateRestore(void)
+{
+    int ret = 0;
+    struct fs_dump_state state;
+
+    /* save and restore need to be atomic wrt other host package operations */
+    H_LOCK;
+
+    ViceLog(0, ("fs_stateRestore: commencing fileserver state restore\n"));
+
+    if (fs_stateAlloc(&state)) {
+	ViceLog(0, ("fs_stateRestore: memory allocation failed\n"));
+	ret = 1;
+	goto done;
+    }
+
+    if (fs_stateLoadDump(&state)) {
+	ViceLog(0, ("fs_stateRestore: failed to load dump file '%s'\n", state.fn));
+	ret = 1;
+	goto done;
+    }
+
+    if (fs_stateInvalidateDump(&state)) {
+	ViceLog(0, ("fs_stateRestore: failed to invalidate dump file '%s'\n", state.fn));
+	ret = 1;
+	goto done;
+    }
+
+    /* do_host_restore is left clear when fs_stateLoadDump() found the dump too old */
+    if (state.flags.do_host_restore) {
+	if (h_stateRestore(&state)) {
+	    ViceLog(0, ("fs_stateRestore: error: host state restore failed. exiting to avoid further corruption\n"));
+	    exit(0);
+	}
+	ViceLog(0, ("fs_stateRestore: host table restored\n"));
+
+	if (cb_stateRestore(&state)) {
+	    ViceLog(0, ("fs_stateRestore: error: callback state restore failed. exiting to avoid further corruption\n"));
+	    exit(0);
+	}
+	ViceLog(0, ("fs_stateRestore: FileEntry and CallBack tables restored\n"));
+
+	if (h_stateRestoreIndices(&state)) {
+	    ViceLog(0, ("fs_stateRestore: error: host index remapping failed. exiting to avoid further corruption\n"));
+	    exit(0);
+	}
+	ViceLog(0, ("fs_stateRestore: host table indices remapped\n"));
+
+	if (cb_stateRestoreIndices(&state)) {
+	    ViceLog(0, ("fs_stateRestore: error: callback index remapping failed. exiting to avoid further corruption\n"));
+	    exit(0);
+	}
+	ViceLog(0, ("fs_stateRestore: FileEntry and CallBack indices remapped\n"));
+    }
+
+    ViceLog(0, ("fs_stateRestore: restore phase complete\n"));
+
+    if (fs_state.options.fs_state_verify_after_restore) {
+	ViceLog(0, ("fs_stateRestore: beginning state verification phase\n"));
+
+	if (state.flags.do_host_restore) {
+	    if (h_stateVerify(&state)) {
+		ViceLog(0, ("fs_stateRestore: error: host table consistency checks failed; exiting to avoid further corruption\n"));
+		exit(0);
+	    }
+
+	    if (cb_stateVerify(&state)) {
+		ViceLog(0, ("fs_stateRestore: error: callback table consistency checks failed; exiting to avoid further corruption\n"));
+		exit(0);
+	    }
+	}
+
+	ViceLog(0, ("fs_stateRestore: fileserver state verification complete\n"));
+    }
+
+    ViceLog(0, ("fs_stateRestore: restore was successful\n"));
+
+ done:
+    if (state.fd >= 0) {
+	fs_stateInvalidateDump(&state);
+	fs_stateCloseDump(&state);
+    }
+    fs_stateFree(&state);
+    H_UNLOCK;
+    return ret;	/* 0 on success, 1 if the dump could not be loaded or invalidated */
+}
+
+static int
+fs_stateCreateDump(struct fs_dump_state * state)
+{
+    int fd, ret = 0;
+    char savedump[MAXPATHLEN];
+    struct afs_stat status;
+
+    /* Create and map a fresh dump file at state->fn; returns 0 on success,
+     * 1 on failure.  Any existing dump is first renamed to "<fn>.old". */
+    afs_snprintf(savedump, sizeof(savedump), "%s.old", state->fn);
+
+    if (afs_stat(state->fn, &status) == 0) {
+	renamefile(state->fn, savedump);
+    }
+
+    fd = afs_open(state->fn, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+    if (fd == -1 || afs_fstat(fd, &status) == -1) {
+	ViceLog(0, ("fs_stateCreateDump: failed to create state dump file '%s'\n", state->fn));
+	if (fd != -1)
+	    close(fd);	/* state->fd is not set yet, so no caller would close it */
+	ret = 1;
+	goto done;
+    }
+
+    state->fd = fd;
+    state->mode = FS_STATE_DUMP_MODE;
+    memset(state->hdr, 0, sizeof(struct fs_state_header));
+    fs_stateIncEOF(state, sizeof(struct fs_state_header));
+
+#ifdef FS_STATE_USE_MMAP
+    if (fs_stateSizeFile(state)) {
+	ViceLog(0, ("fs_stateCreateDump: failed to resize state dump file '%s'\n", state->fn));
+	ret = 1;
+	goto done;
+    }
+
+    if (fs_stateMapFile(state)) {
+	ViceLog(0, ("fs_stateCreateDump: failed to memory map state dump file '%s'\n", state->fn));
+	ret = 1;
+	goto done;
+    }
+#endif
+
+    /* write a header with valid = 0 to flag the dump as in progress */
+    ret = fs_stateInvalidateDump(state);
+
+ done:
+    return ret;
+}
+
+static int
+fs_stateInvalidateDump(struct fs_dump_state * state)
+{
+    /*
+     * Overwrite the header at offset 0 of the dump with a copy of state->hdr
+     * whose valid field is cleared, then sync, so the file reads as a dump
+     * in progress.  state->hdr itself is not modified.
+     * Returns 0 on success, 1 on failure.
+     */
+    afs_uint64 start;
+    struct fs_state_header scratch;
+
+#ifdef FS_STATE_USE_MMAP
+    if (!state->mmap.map) {
+	return 1;
+    }
+#endif
+
+    ZeroInt64(start);
+    memcpy(&scratch, state->hdr, sizeof(scratch));
+    scratch.valid = 0;
+
+    if (fs_stateWriteHeader(state, &start, &scratch, sizeof(scratch))) {
+	ViceLog(0, ("fs_stateInvalidateDump: failed to invalidate old dump file header '%s'\n",
+		    state->fn));
+	return 1;
+    }
+    if (fs_stateSync(state)) {
+	ViceLog(0, ("fs_stateInvalidateDump: failed to sync changes to disk\n"));
+	return 1;
+    }
+    return 0;
+}
+
+static int
+fs_stateCommitDump(struct fs_dump_state * state)
+{
+    afs_uint64 z;
+    int ret = 0;
+    /* finish a dump: truncate the file, sync the data, then write and sync the final header */
+    ZeroInt64(z);
+
+#ifdef FS_STATE_USE_MMAP
+    if (fs_stateTruncateFile(state)) {
+	ViceLog(0, ("fs_stateCommitDump: failed to truncate dump file to proper size\n"));
+	ret = 1;
+	goto done;
+    }
+#endif
+
+    /* ensure that all pending data I/Os for the state file have been committed 
+     * _before_ we make the metadata I/Os */
+    if (fs_stateSync(state)) {
+	ViceLog(0, ("fs_stateCommitDump: failed to sync changes to disk\n"));
+	ret = 1;
+	goto done;
+    }
+
+#ifdef FS_STATE_USE_MMAP
+    /* XXX madvise may not exist on all platforms, so
+     * we may need to add some ifdefs at some point... */
+    {
+	madvise((((char *)state->mmap.map) + sizeof(struct fs_state_header)), 
+		state->mmap.size - sizeof(struct fs_state_header), 
+		MADV_DONTNEED);
+    }
+#endif
+
+    /* build the header, and write it to disk */
+    fs_stateFillHeader(state->hdr);
+    if (state->bail) {
+	state->hdr->valid = 0;	/* fs_stateSave() sets bail when its consistency checks failed */
+    }
+    if (fs_stateWriteHeader(state, &z, state->hdr, sizeof(struct fs_state_header))) {
+	ViceLog(0, ("fs_stateCommitDump: failed to write header to dump file '%s'\n",
+		    state->fn));
+	ret = 1;
+	goto done;
+    }
+    if (fs_stateSync(state)) {
+	ViceLog(0, ("fs_stateCommitDump: failed to sync new header to disk\n"));
+	ret = 1;
+	goto done;
+    }
+
+ done:
+    return ret;	/* 0 on success, 1 on failure */
+}
+
+static int
+fs_stateLoadDump(struct fs_dump_state * state)
+{
+    afs_uint64 z;
+    int fd, ret = 0;
+    struct afs_stat status;
+    afs_int32 now = FT_ApproxTime();
+
+    /* Open and map the dump at state->fn, then read and check its header.
+     * Returns 0 on success, 1 on failure. */
+    ZeroInt64(z);
+
+    fd = afs_open(state->fn, O_RDWR);
+    if (fd == -1 || afs_fstat(fd, &status) == -1) {
+	ViceLog(0, ("fs_stateLoadDump: failed to load state dump file '%s'\n", state->fn));
+	if (fd != -1)
+	    close(fd);	/* state->fd is not set yet, so no caller would close it */
+	ret = 1;
+	goto done;
+    }
+    state->fd = fd;
+    state->mode = FS_STATE_LOAD_MODE;
+    state->file_len = status.st_size;
+
+#ifdef FS_STATE_USE_MMAP
+    if (fs_stateMapFile(state)) {
+	ViceLog(0, ("fs_stateLoadDump: failed to memory map state dump file '%s'\n", state->fn));
+	ret = 1;
+	goto done;
+    }
+#endif
+
+    if (fs_stateReadHeader(state, &z, state->hdr, sizeof(struct fs_state_header))) {
+	ViceLog(0, ("fs_stateLoadDump: failed to read header from dump file '%s'\n", state->fn));
+	ret = 1;
+	goto done;
+    }
+
+    /* check the validity of the header */
+    if (fs_stateCheckHeader(state->hdr)) {
+	ViceLog(1, ("fs_stateLoadDump: header failed validity checks; not restoring '%s'\n", state->fn));
+	ret = 1;
+	goto done;
+    }
+    /* host restore is enabled only if the timestamp is within HOST_STATE_VALID_WINDOW of now */
+    if ((state->hdr->timestamp + HOST_STATE_VALID_WINDOW) >= now) {
+	state->flags.do_host_restore = 1;
+    } else {
+	ViceLog(0, ("fs_stateLoadDump: warning: dump is too old for host and callback restore; skipping those steps\n"));
+    }
+
+ done:
+    return ret;
+}
+
+static int
+fs_stateCloseDump(struct fs_dump_state * state)
+{
+#ifdef FS_STATE_USE_MMAP
+    fs_stateUnmapFile(state);	/* unmap first, then close the descriptor */
+#endif
+    close(state->fd);		/* state->fd itself is left unchanged */
+    return 0;			/* always 0: the unmap and close results are not checked */
+}
+
+/*
+ * Copy len bytes from buf into the dump at its current position.
+ * Returns 0 on success, 1 on failure.
+ */
+int
+fs_stateWrite(struct fs_dump_state * state,
+	      void * buf, size_t len)
+{
+#ifdef FS_STATE_USE_MMAP
+    /* when the safety check fails, try to grow the file by len; only a
+     * failed resize is an error */
+    if (fs_stateCheckIOSafety(state, len) &&
+	fs_stateResizeFile(state, len)) {
+	ViceLog(0, ("fs_stateWrite: could not resize dump file '%s'\n",
+		    state->fn));
+	return 1;
+    }
+
+    memcpy(state->mmap.cursor, buf, len);
+    fs_stateIncCursor(state, len);
+#else
+    if (write(state->fd, buf, len) != len) {
+	ViceLog(0, ("fs_stateWrite: write failed\n"));
+	return 1;
+    }
+#endif
+
+    return 0;
+}
+
+/*
+ * Copy the next len bytes of the dump into buf.
+ * Returns 0 on success, 1 on failure.
+ */
+int
+fs_stateRead(struct fs_dump_state * state,
+	     void * buf, size_t len)
+{
+#ifdef FS_STATE_USE_MMAP
+    /* a read that fails the safety check is reported as beyond EOF */
+    if (fs_stateCheckIOSafety(state, len)) {
+	ViceLog(0, ("fs_stateRead: read beyond EOF for dump file '%s'\n",
+		    state->fn));
+	return 1;
+    }
+
+    memcpy(buf, state->mmap.cursor, len);
+    fs_stateIncCursor(state, len);
+#else
+    if (read(state->fd, buf, len) != len) {
+	ViceLog(0, ("fs_stateRead: read failed\n"));
+	return 1;
+    }
+#endif
+
+    return 0;
+}
+
+int
+fs_stateWriteV(struct fs_dump_state * state,
+	       struct iovec * iov, int niov)
+{
+    int i, ret = 0;
+    size_t len = 0;
+    /* Write the niov buffers of iov in order; 0 on success, 1 on failure. */
+    for (i=0; i < niov; i++) {
+	len += iov[i].iov_len;	/* total payload size */
+    }
+
+#ifdef FS_STATE_USE_MMAP
+    if (fs_stateCheckIOSafety(state, len)) {
+	if (fs_stateResizeFile(state, len)) {
+	    ViceLog(0, ("fs_stateWriteV: could not resize dump file '%s'\n",
+			state->fn));
+	    ret = 1;
+	    goto done;
+	}
+    }
+    /* copy each buffer into the mapping and advance the cursor past it */
+    for (i=0; i < niov; i++) {
+	memcpy(state->mmap.cursor, iov[i].iov_base, iov[i].iov_len);
+	fs_stateIncCursor(state, iov[i].iov_len);
+    }
+#else
+    if (writev(state->fd, iov, niov) != len) {
+	ViceLog(0, ("fs_stateWriteV: write failed\n"));
+	ret = 1;
+	goto done;
+    }
+#endif
+
+ done:
+    return ret;
+}
+
+int
+fs_stateReadV(struct fs_dump_state * state,
+	      struct iovec * iov, int niov)
+{
+    int i, ret = 0;
+    size_t len = 0;
+    /* Fill the niov buffers of iov in order; 0 on success, 1 on failure. */
+    for (i=0; i < niov; i++) {
+	len += iov[i].iov_len;	/* total bytes requested */
+    }
+
+#ifdef FS_STATE_USE_MMAP
+    if (fs_stateCheckIOSafety(state, len)) {
+	ViceLog(0, ("fs_stateReadV: read beyond EOF for dump file '%s'\n",
+		    state->fn));
+	ret = 1;
+	goto done;
+    }
+    /* copy out of the mapping buffer by buffer, advancing the cursor */
+    for (i=0; i < niov; i++) {
+	memcpy(iov[i].iov_base, state->mmap.cursor, iov[i].iov_len);
+	fs_stateIncCursor(state, iov[i].iov_len);
+    }
+#else
+    if (readv(state->fd, iov, niov) != len) {
+	ViceLog(0, ("fs_stateReadV: read failed\n"));
+	ret = 1;
+	goto done;
+    }
+#endif
+
+ done:
+    return ret;
+}
+
+int
+fs_stateWriteHeader(struct fs_dump_state * state,
+		    afs_uint64 * offset,
+		    void * hdr, size_t len)
+{
+    /* Position the dump at *offset, then write the len-byte header
+     * found at hdr.  Returns 0 on success, 1 on failure; each
+     * failure is logged before returning. */
+    /* step 1: move to the requested offset */
+    if (fs_stateSeek(state, offset)) {
+	ViceLog(0, ("fs_stateWriteHeader: could not seek to correct position in dump file '%s'\n",
+		    state->fn));
+	return 1;
+    }
+
+    /* step 2: emit the header bytes */
+    if (fs_stateWrite(state, hdr, len)) {
+	ViceLog(0, ("fs_stateWriteHeader: write failed\n"));
+	return 1;
+    }
+
+    return 0;
+}
+
+int
+fs_stateReadHeader(struct fs_dump_state * state,
+		   afs_uint64 * offset,
+		   void * hdr, size_t len)
+{
+    /* Position the dump at *offset, then read len bytes into hdr.
+     * Returns 0 on success, 1 on failure; each failure is logged
+     * before returning. */
+    /* step 1: move to the requested offset */
+    if (fs_stateSeek(state, offset)) {
+	ViceLog(0, ("fs_stateReadHeader: could not seek to correct position in dump file '%s'\n",
+		    state->fn));
+	return 1;
+    }
+
+    /* step 2: pull the header bytes in */
+    if (fs_stateRead(state, hdr,len)) {
+	ViceLog(0, ("fs_stateReadHeader: read failed\n"));
+	return 1;
+    }
+
+    return 0;
+}
+
+#ifdef FS_STATE_USE_MMAP
+static int
+fs_stateSizeFile(struct fs_dump_state * state)
+{
+    /* set the dump file to its initial length; 0 on success, 1 on failure */
+    state->file_len = FS_STATE_INIT_FILESIZE;
+    if (afs_ftruncate(state->fd, state->file_len) == 0)
+	return 0;
+    return 1;
+}
+
+static int
+fs_stateResizeFile(struct fs_dump_state * state, size_t min_add)
+{
+    int ret = 0;
+    afs_foff_t inc, pos = state->mmap.offset;	/* pos: where we were */
+
+    /* Grow the file by the smallest multiple of FS_STATE_INIT_FILESIZE
+     * larger than min_add, then remap.  Returns 0, or 1 on failure. */
+    fs_stateUnmapFile(state);
+    inc = ((min_add / FS_STATE_INIT_FILESIZE)+1) * FS_STATE_INIT_FILESIZE;
+    state->file_len += inc;
+
+    if (afs_ftruncate(state->fd, state->file_len) != 0) {
+	ViceLog(0, ("fs_stateResizeFile: truncate failed\n"));
+	ret = 1;
+	goto done;
+    }
+    if (fs_stateMapFile(state)) {
+	ViceLog(0, ("fs_stateResizeFile: remapping memory mapped file failed\n"));
+	ret = 1;
+	goto done;
+    }
+
+    /* fs_stateMapFile rewinds cursor/offset to the start of the map;
+     * restore them so the pending write does not clobber offset 0 */
+    state->mmap.offset = pos;
+    state->mmap.cursor = (void *) ((char *) state->mmap.map + pos);
+ done:
+    return ret;
+}
+
+static int
+fs_stateTruncateFile(struct fs_dump_state * state)
+{
+    int ret = 0;
+    /* Cut the dump file to state->eof_offset bytes; returns 0 on success, 1 if afs_ftruncate fails. */
+#ifdef AFS_LARGEFILE_ENV
+    if (afs_ftruncate(state->fd, state->eof_offset) != 0) {
+	ret = 1;
+    }
+#else
+    afs_uint32 hi, lo;
+    SplitInt64(state->eof_offset, hi, lo);	/* NOTE(review): hi is never examined, so only lo reaches afs_ftruncate */
+    if (afs_ftruncate(state->fd, lo) != 0) {
+	ret = 1;
+    }
+#endif
+
+    return ret;
+}
+#endif
+
+#ifdef FS_STATE_USE_MMAP
+static int
+fs_stateMapFile(struct fs_dump_state * state)
+{
+    int ret = 0, flags;
+
+    /* Map the whole dump file (state->file_len bytes) and rewind the
+     * cursor/offset to the start of the mapping.  Protection depends on
+     * state->mode.  Returns 0 on success, 1 on failure (logged). */
+    switch(state->mode) {
+    case FS_STATE_LOAD_MODE:
+	flags = PROT_READ | PROT_WRITE;   /* loading involves a header invalidation */
+	break;
+    case FS_STATE_DUMP_MODE:
+	flags = PROT_WRITE;
+	break;
+    default:
+	ViceLog(0, ("fs_stateMapFile: invalid dump state mode\n"));
+	return 1;
+    }
+
+    state->mmap.map = afs_mmap(NULL, state->file_len, flags, MAP_SHARED,
+			       state->fd, 0);
+
+    if (state->mmap.map == MAP_FAILED) {
+	state->mmap.size = 0;
+	state->mmap.map = NULL;
+	ViceLog(0, ("fs_stateMapFile: failed to memory map file '%s'\n",
+		    state->fn));
+	ret = 1;
+	goto done;
+    }
+
+    state->mmap.size = state->file_len;
+    state->mmap.cursor = state->mmap.map;
+    state->mmap.offset = 0;
+
+    /* for state loading, accesses will be sequential, so let's give
+     * the VM subsystem a heads up */
+    if (state->mode == FS_STATE_LOAD_MODE) {
+	/* XXX madvise may not exist on all platforms, so
+	 * we may need to add some ifdefs at some point...
+	 * Advice values are not bit flags, so each hint gets its own call. */
+	madvise(state->mmap.map, state->mmap.size, MADV_SEQUENTIAL);
+	madvise(state->mmap.map, state->mmap.size, MADV_WILLNEED);
+#ifdef AFS_SUN510_ENV
+	madvise(state->mmap.map, state->mmap.size, MADV_ACCESS_LWP);   /* added in solaris 9 12/02 */
+#endif
+    }
+
+ done:
+    return ret;
+}
+
+static int
+fs_stateUnmapFile(struct fs_dump_state * state)
+{
+    /* Tear down the current mapping of the dump file.
+     * Returns 0 on success; logs and returns 1 if munmap fails. */
+    int failed = (munmap(state->mmap.map, state->mmap.size) == -1);
+
+    if (failed) {
+	ViceLog(0, ("fs_stateUnmapFile: failed to unmap dump file '%s'\n",
+		    state->fn));
+	return 1;
+    }
+
+    return 0;
+}
+#endif /* FS_STATE_USE_MMAP */
+
+#ifdef FS_STATE_USE_MMAP
+int
+fs_stateSync(struct fs_dump_state * state)
+{
+    /* Flush the mapping to disk synchronously.  Returns 0 on success,
+     * 1 if msync fails (same contract as the fsync-based variant). */
+    if (msync(state->mmap.map, state->mmap.size, MS_SYNC) == -1)
+	return 1;
+
+    return 0;
+}
+#else /* !FS_STATE_USE_MMAP */
+int
+fs_stateSync(struct fs_dump_state * state)
+{
+    /* Flush the dump file with fsync.
+     * Returns 0 on success, 1 if fsync reports an error. */
+    if (fsync(state->fd) == -1) {
+	return 1;
+    }
+
+    return 0;
+}
+#endif /* !FS_STATE_USE_MMAP */
+
+int
+fs_stateIncEOF(struct fs_dump_state * state, afs_int32 len)
+{   /* advance state->eof_offset by len; always returns 0 */
+    afs_uint64 delta;
+    FillInt64(delta, 0, len);	/* presumably: high word 0, low word len */
+    AddUInt64(state->eof_offset, delta, &state->eof_offset);
+    return 0;
+}
+
+#ifdef FS_STATE_USE_MMAP
+static int
+fs_stateIncCursor(struct fs_dump_state * state, size_t len)
+{
+    /* Step the mapping cursor and its byte offset forward by len.
+     * No bounds checking is done here.  Always returns 0. */
+    /* byte-wise pointer arithmetic needs a char pointer */
+    char * next = ((char *) state->mmap.cursor) + len;
+
+    state->mmap.cursor = (void *) next;
+    state->mmap.offset += len;
+
+    return 0;
+}
+
+static int
+fs_stateCheckIOSafety(struct fs_dump_state * state, size_t len)
+{
+    /* Returns 1 when an access of len bytes at the current offset
+     * would run past the end of the mapping, 0 when it fits. */
+    if ((state->mmap.offset + len) > state->mmap.size)
+	return 1;
+
+    return 0;
+}
+#endif /* FS_STATE_USE_MMAP */
+
+#ifdef FS_STATE_USE_MMAP
+int
+fs_stateSeek(struct fs_dump_state * state, afs_uint64 * offset)
+{
+    int ret = 0;
+    char * p;
+    afs_uint32 hi, lo;
+    /* Point the mapping cursor at byte *offset of the map.  The target is not checked against the mapping size here. */
+    SplitInt64(*offset, hi, lo);
+
+    /* update cursor */
+    p = (char *) state->mmap.map;
+#ifdef AFS_64BIT_ENV
+    p += *offset;
+#else
+    p += lo;
+#endif
+    state->mmap.cursor = (void *) p;
+
+    /* update offset */
+#ifdef AFS_LARGEFILE_ENV
+    state->mmap.offset = *offset;
+#else
+    if (hi)	/* offset does not fit in 32 bits: report failure */
+	ret = 1;
+    state->mmap.offset = lo;	/* NOTE(review): cursor and offset are still moved when ret is 1 */
+#endif
+
+    return ret;
+}
+#else /* !FS_STATE_USE_MMAP */
+int
+fs_stateSeek(struct fs_dump_state * state, afs_uint64 * offset)
+{
+    /* Seek the dump fd to absolute position *offset.
+     * Returns 0 on success, 1 on failure. */
+    int ret = 0;
+#ifndef AFS_LARGEFILE_ENV
+    afs_uint32 high, low;
+    SplitInt64(*offset, high, low);
+    if (high) {
+	/* offset does not fit in 32 bits; nothing to clean up, so bail */
+	return 1;
+    }
+    if (afs_lseek(state->fd, low, SEEK_SET) == -1)
+	ret = 1;
+#else
+    if (afs_lseek(state->fd, *offset, SEEK_SET) == -1)
+	ret = 1;
+#endif
+    return ret;
+}
+#endif /* !FS_STATE_USE_MMAP */
+
+static int
+fs_stateFillHeader(struct fs_state_header * hdr)
+{
+    /* Fill hdr with this build's identity stamps and mark it valid.
+     * Always returns 0; a truncated version string is only logged. */
+    size_t need;
+    hdr->stamp.magic = FS_STATE_MAGIC;
+    hdr->stamp.version = FS_STATE_VERSION;
+    hdr->sys_name = 0xFFFFFFFF;		/* fallback when no SYS_NAME_ID is defined */
+#ifdef SYS_NAME_ID
+    hdr->sys_name = SYS_NAME_ID;
+#endif
+    hdr->timestamp = FT_ApproxTime();
+    hdr->server_uuid = FS_HostUUID;
+    hdr->valid = 1;
+    hdr->endianness = 0;		/* little endian unless overridden below */
+#ifdef AFSBIG_ENDIAN
+    hdr->endianness = 1;
+#endif
+    hdr->stats_detailed = 0;
+#ifdef FS_STATS_DETAILED
+    hdr->stats_detailed = 1;
+#endif
+    need = strlcpy(hdr->server_version_string, cml_version_number, sizeof(hdr->server_version_string));
+    if (need >= sizeof(hdr->server_version_string)) {
+	ViceLog(0, ("fs_stateFillHeader: WARNING -- cml_version_number field truncated\n"));
+    }
+    return 0;
+}
+
+static int
+fs_stateCheckHeader(struct fs_state_header * hdr)
+{
+    int ret = 0;
+    /* Validate a dump header against this build and host: the first failing check is logged and yields 1; otherwise 0. */
+    if (!hdr->valid) {
+	ViceLog(0, ("fs_stateCheckHeader: dump was previously flagged invalid\n"));
+	ret = 1;
+    }
+#ifdef AFSBIG_ENDIAN
+    else if (!hdr->endianness) {
+	ViceLog(0, ("fs_stateCheckHeader: wrong endianness\n"));
+	ret = 1;
+    }
+#else /* AFSLITTLE_ENDIAN */
+    else if (hdr->endianness) {
+	ViceLog(0, ("fs_stateCheckHeader: wrong endianness\n"));
+	ret = 1;
+    }
+#endif /* AFSLITTLE_ENDIAN */
+    /* structural checks: magic number first, then format version */
+    else if (hdr->stamp.magic != FS_STATE_MAGIC) {
+	ViceLog(0, ("fs_stateCheckHeader: invalid dump header\n"));
+	ret = 1;
+    }
+    else if (hdr->stamp.version != FS_STATE_VERSION) {
+	ViceLog(0, ("fs_stateCheckHeader: unknown dump format version number\n"));
+	ret = 1;
+    }
+    /* the dump must come from a build with the same FS_STATS_DETAILED setting */
+#ifdef FS_STATS_DETAILED
+    else if (!hdr->stats_detailed) {
+	ViceLog(0, ("fs_stateCheckHeader: wrong config flags\n"));
+	ret = 1;
+    }
+#else /* FS_STATS_DETAILED */
+    else if (hdr->stats_detailed) {
+	ViceLog(0, ("fs_stateCheckHeader: wrong config flags\n"));
+	ret = 1;
+    }
+#endif /* FS_STATS_DETAILED */
+    /* the dump must belong to this server (UUID match) */
+    else if (!afs_uuid_equal(&hdr->server_uuid, &FS_HostUUID)) {
+	ViceLog(0, ("fs_stateCheckHeader: server UUID does not match this server's UUID\n"));
+	ret = 1;
+    }
+
+    /* the server_version_string is included for informational purposes only.  If someone ever
+     * wants to limit state dump reloading based upon the contents of this string, just
+     * uncomment the following code.  uncommenting this code is _strongly discouraged_ because
+     * we already make use of the version stamps in the various dump headers to deal with
+     * data structure version incompatibilities.
+    else if (strncmp(hdr->server_version_string, cml_version_number, 
+		     sizeof(hdr->server_version_string)) != 0) {
+	ViceLog(0, ("fs_stateCheckHeader: dump from different server version\n"));
+	ret = 1;
+    }
+    */
+    /* a version-string mismatch is only logged; ret is left untouched */
+    else if (strncmp(hdr->server_version_string, cml_version_number, 
+		     sizeof(hdr->server_version_string)) != 0) {
+	ViceLog(0, ("fs_stateCheckHeader: dump from different server version ; attempting state reload anyway\n"));
+    }
+
+
+    return ret;
+}
+
+static int
+fs_stateAlloc(struct fs_dump_state * state)
+{
+    /* Reset *state and allocate its five header buffers.
+     * Returns 0 if every allocation succeeded, 1 otherwise; on failure
+     * the buffers that were obtained remain attached to state. */
+    memset(state, 0, sizeof(*state));
+    state->fd = -1;
+    state->fn = AFSDIR_SERVER_FSSTATE_FILEPATH;
+    state->hdr = malloc(sizeof(*state->hdr));
+    state->h_hdr = malloc(sizeof(*state->h_hdr));
+    state->cb_hdr = malloc(sizeof(*state->cb_hdr));
+    state->cb_timeout_hdr = malloc(sizeof(*state->cb_timeout_hdr));
+    state->cb_fehash_hdr = malloc(sizeof(*state->cb_fehash_hdr));
+    if (!state->hdr || !state->h_hdr || !state->cb_hdr ||
+	!state->cb_timeout_hdr || !state->cb_fehash_hdr)
+	return 1;
+    return 0;
+}
+
+static int
+fs_stateFree(struct fs_dump_state * state)
+{
+    /* Release every heap buffer hanging off state: the five header
+     * buffers followed by the three index-map arrays.  free(NULL) is a
+     * no-op, so unset pointers need no per-pointer guard.  The
+     * pointers themselves are left untouched and state is not freed.
+     * Always returns 0. */
+    /* header buffers */
+    free(state->hdr);
+    free(state->h_hdr);
+    free(state->cb_hdr);
+    free(state->cb_timeout_hdr);
+    free(state->cb_fehash_hdr);
+
+    /* index-map arrays */
+    free(state->h_map.entries);
+    free(state->fe_map.entries);
+    free(state->cb_map.entries);
+    return 0;
+}
+
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/tviced/serialize_state.h b/src/tviced/serialize_state.h
new file mode 100644
index 0000000000..c1a08c08ca
--- /dev/null
+++ b/src/tviced/serialize_state.h
@@ -0,0 +1,311 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * demand attach fs
+ * fileserver state serialization
+ */
+
+#ifndef _AFS_TVICED_SERIALIZE_STATE_H
+#define _AFS_TVICED_SERIALIZE_STATE_H
+
+#ifdef AFS_DEMAND_ATTACH_FS
+
+#define FS_STATE_MAGIC 0x62FA841C	/* stamp.magic of struct fs_state_header */
+#define FS_STATE_VERSION 2		/* stamp.version of struct fs_state_header */
+
+#define HOST_STATE_MAGIC 0x7B8C9DAE	/* presumably the stamp of struct host_state_header -- confirm in host.c */
+#define HOST_STATE_VERSION 2
+
+#define HOST_STATE_ENTRY_MAGIC 0xA8B9CADB
+
+#define CALLBACK_STATE_MAGIC 0x89DE67BC	/* presumably the stamp of struct callback_state_header -- confirm in callback.c */
+#define CALLBACK_STATE_VERSION 1
+
+#define CALLBACK_STATE_TIMEOUT_MAGIC 0x99DD5511
+#define CALLBACK_STATE_FEHASH_MAGIC 0x77BB33FF
+#define CALLBACK_STATE_ENTRY_MAGIC 0x54637281
+
+#define ACTIVE_VOLUME_STATE_MAGIC 0xAC7557CA
+#define ACTIVE_VOLUME_STATE_VERSION 1
+
+#define ACTIVE_VOLUME_STATE_AVEHASH_MAGIC 0xBADDF00D
+
+#define HOST_STATE_VALID_WINDOW 1800 /* 30 minutes, in seconds; older dumps skip host and callback restore */
+
+/*
+ * on-disk structures
+ */
+struct disk_version_stamp {
+    afs_uint32 magic;	/* record-type magic, e.g. FS_STATE_MAGIC */
+    afs_uint32 version;	/* format revision, e.g. FS_STATE_VERSION */
+};
+
+/* 1024 byte header structure; filled in by fs_stateFillHeader and vetted by fs_stateCheckHeader */
+struct fs_state_header {
+    struct disk_version_stamp stamp;  /* version stamp (FS_STATE_MAGIC / FS_STATE_VERSION) */
+    afs_uint32 timestamp;             /* timestamp of save */
+    afs_uint32 sys_name;              /* sys name id for this machine (0xFFFFFFFF if the build defines none) */
+    afsUUID server_uuid;              /* server's UUID */
+    byte valid;                       /* whether header contents are valid (0 makes the load reject the dump) */
+    byte endianness;                  /* endianness sanity check (0 for LE, 1 for BE) */
+    byte stats_detailed;              /* fs stats detailed sanity check (1 if built with FS_STATS_DETAILED) */
+    byte padding1[1];                 /* padding */
+    afs_uint32 reserved1[23];         /* for expansion */
+    afs_uint64 avol_offset;           /* offset of active volumes structure */
+    afs_uint64 h_offset;              /* offset of host_state_header structure */
+    afs_uint64 cb_offset;             /* offset of callback_state_header structure */
+    afs_uint64 vlru_offset;           /* offset of vlru state structure */
+    afs_uint32 reserved2[56];         /* for expansion */
+    char server_version_string[128];  /* version string from AFS_component_version_number.c */
+    afs_uint32 reserved3[128];        /* for expansion */
+};
+
+/*
+ * host package serialization
+ */
+
+/* 256 byte header for the host state data (64 x afs_uint32) */
+struct host_state_header {
+    struct disk_version_stamp stamp;  /* host state version stamp */
+    afs_uint32 records;               /* number of stored host records */
+    afs_uint32 index_max;             /* max index value encountered */
+    afs_uint32 reserved[60];          /* for expansion */
+};
+
+/* 32 byte host entry header (8 x afs_uint32) */
+struct host_state_entry_header {
+    afs_uint32 magic;         /* stamp */
+    afs_uint32 len;           /* number of bytes in this record -- NOTE(review): confirm whether this header is counted */
+    afs_uint32 interfaces;    /* number of interfaces included in record */
+    afs_uint32 hcps;          /* number of hcps entries in record */
+    afs_uint32 reserved[4];   /* for expansion */
+};
+
+/* 36 byte host entry structure */
+struct hostDiskEntry {
+    afs_uint32 host;		/* IP address of host interface that is
+				 * currently being used, in network
+				 * byte order */
+    afs_uint16 port;	        /* port address of host */
+    afs_uint16 hostFlags;       /*  bit map */
+    byte Console;		/* XXXX This host is a console */
+    byte hcpsfailed;	        /* Retry the cps call next time */
+    byte hcps_valid;            /* prlist_val not null */
+#if FS_STATS_DETAILED		/* NOTE(review): serialize_state.c tests this macro with #ifdef, not #if */
+    byte InSameNetwork;	        /*Is host's addr in the same network as
+				 * the File Server's? */
+#else
+    byte padding1[1];	        /* for padding; keeps the layout the same size in both configurations */
+#endif				/* FS_STATS_DETAILED */
+    afs_uint32 hcps_len;        /* length of hcps */
+    afs_uint32 LastCall;	/* time of last call from host */
+    afs_uint32 ActiveCall;	/* time of any call but gettime */
+    afs_uint32 cpsCall;		/* time of last cps call from this host */
+    afs_uint32 cblist;		/* Call back list for this host */
+    afs_uint32 index;           /* index for correlating w/ callback dumps */
+};
+
+/*
+ * callback package serialization
+ */
+
+/* 512 byte header for the callback state data */
+struct callback_state_header {
+    struct disk_version_stamp stamp;    /* callback state version stamp */
+    afs_uint32 nFEs;                    /* number of FileEntry records */
+    afs_uint32 nCBs;                    /* number of CallBack records */
+    afs_uint32 fe_max;                  /* max FileEntry index */
+    afs_uint32 cb_max;                  /* max CallBack index */
+    afs_int32 tfirst;                   /* first valid timeout (signed, unlike the other fields) */
+    afs_uint32 reserved[115];           /* for expansion */
+    afs_uint64 timeout_offset;          /* offset of timeout queue heads */
+    afs_uint64 fehash_offset;           /* offset of file entry hash buckets */
+    afs_uint64 fe_offset;               /* offset of first file entry */
+};
+
+/* 32 byte header (8 x afs_uint32) preceding the timeout records */
+struct callback_state_timeout_header {
+    afs_uint32 magic;         /* magic number for timeout header */
+    afs_uint32 len;           /* total length of header and timeout records */
+    afs_uint32 records;       /* number of timeout records */
+    afs_uint32 reserved[5];   /* for expansion */
+};
+
+/* 32 byte header (8 x afs_uint32) preceding the fehash bucket heads */
+struct callback_state_fehash_header {
+    afs_uint32 magic;         /* magic number for fehash header */
+    afs_uint32 len;           /* total length of header and fehash bucket heads */
+    afs_uint32 records;       /* number of hash buckets */
+    afs_uint32 reserved[5];   /* for expansion */
+};
+
+/* 32 byte header (8 x afs_uint32) for one file entry record */
+struct callback_state_entry_header {
+    afs_uint32 magic;         /* magic number for FE entry */
+    afs_uint32 len;           /* number of bytes in this record */
+    afs_uint32 nCBs;          /* number of callbacks for this FE */
+    afs_uint32 reserved[5];   /* for expansion */
+};
+
+struct FEDiskEntry {
+    struct FileEntry fe;	/* FileEntry as declared outside this header */
+    afs_uint32 index;	/* NOTE(review): presumably the entry's index at dump time, remapped via fe_OldToNew -- confirm */
+};
+
+struct CBDiskEntry {
+    struct CallBack cb;	/* CallBack as declared outside this header */
+    afs_uint32 index;	/* NOTE(review): presumably the entry's index at dump time, remapped via cb_OldToNew -- confirm */
+};
+
+/*
+ * active volumes state serialization
+ *
+ * these structures are meant to support
+ * automated salvaging of active volumes
+ * in the event of a fileserver crash
+ */
+
+/* 512 byte header for the active volume state data */
+struct active_volume_state_header {
+    struct disk_version_stamp stamp;    /* active volume state version stamp */
+    afs_uint32 nAVEs;                   /* number of ActiveVolumeEntry records */
+    afs_uint32 init_timestamp;          /* timestamp of AVE initialization */
+    afs_uint32 update_timetamp;         /* timestamp of last AVE update; NOTE(review): field name misspells "timestamp" */
+    afs_uint32 reserved[119];           /* for expansion */
+    afs_uint64 avehash_offset;          /* offset of active volume entry hash buckets */
+    afs_uint64 ave_offset;              /* offset of first active volume entry */
+};
+
+/* 32 byte header (8 x afs_uint32) preceding the avehash bucket heads */
+struct active_volume_state_avehash_header {
+    afs_uint32 magic;         /* magic number for avehash header */
+    afs_uint32 len;           /* total length of header and avehash bucket heads */
+    afs_uint32 records;       /* number of hash buckets */
+    afs_uint32 reserved[5];   /* for expansion */
+};
+
+typedef afs_uint32 active_volume_state_avehash_entry;	/* presumably one avehash bucket head -- confirm */
+
+/* active volume entry (3 x afs_uint32) */
+struct AVDiskEntry {
+    afs_uint32 volume;		/* NOTE(review): presumably the volume id -- confirm */
+    afs_uint32 partition;	/* NOTE(review): presumably identifies the partition holding it -- confirm */
+    afs_uint32 hash_next;	/* NOTE(review): presumably the next entry in the same hash chain -- confirm */
+};
+
+
+/*
+ * dump runtime state
+ */
+struct idx_map_entry_t {	/* element type of h_map, fe_map and cb_map in struct fs_dump_state */
+    afs_uint32 old_idx;                    /* index from last runtime (the original note said "host hash id") */
+    afs_uint32 new_idx;                    /* index for this runtime */
+};
+
+
+/* verification process sanity check constants
+ * (upper bounds on how many elements a single chain or list may hold)
+ * make them fairly large so we don't get
+ * false positives
+ */
+#define FS_STATE_H_MAX_UUID_HASH_CHAIN_LEN    100000     /* max elements in a host uuid-hash chain */
+#define FS_STATE_H_MAX_ADDR_HASH_CHAIN_LEN    2000000    /* max elements in a host ipv4-hash chain */
+#define FS_STATE_FE_MAX_HASH_CHAIN_LEN        100000     /* max elements in a FE fid-hash chain */
+#define FS_STATE_FCB_MAX_LIST_LEN             100000     /* max elements in a per-FE CB list */
+#define FS_STATE_HCB_MAX_LIST_LEN             100000     /* max elements in a per-host CB list */
+#define FS_STATE_TCB_MAX_LIST_LEN             100000     /* max elements in a per-timeout CB list */
+
+
+/*
+ * main state serialization state structure
+ */
+
+struct fs_dump_state {
+    enum {
+	FS_STATE_DUMP_MODE,
+	FS_STATE_LOAD_MODE
+    } mode;                                /* selects the mmap protection in fs_stateMapFile */
+    struct {
+	byte do_host_restore;              /* whether host restore should be done */
+	byte some_steps_skipped;           /* whether some steps were skipped */
+	byte warnings_generated;           /* whether any warnings were generated during restore */
+    } flags;
+    afs_fsize_t file_len;                  /* length the dump file was last truncated/extended to */
+    int fd;                                /* fd of the current dump file */
+    int bail;                              /* non-zero if something went wrong */
+    char * fn;                             /* name of the current dump file */
+    struct {                               /* memory map of dump file */
+	void * map;                        /* base address of the mapping, NULL if mapping failed */
+	void * cursor;                     /* current read/write position inside the mapping */
+	afs_foff_t offset;                 /* byte offset of cursor from map */
+	afs_fsize_t size;                  /* number of bytes mapped */
+    } mmap;
+    struct fs_state_header * hdr;          /* main header */
+    struct host_state_header * h_hdr;      /* header for host state data */
+    struct callback_state_header * cb_hdr; /* header for callback state data */
+    struct callback_state_timeout_header * cb_timeout_hdr;
+    struct callback_state_fehash_header * cb_fehash_hdr;
+    afs_uint64 eof_offset;                 /* current end of file offset */
+    struct {
+	int len;                           /* number of host entries in map */
+	struct idx_map_entry_t * entries;  /* heap array, released by fs_stateFree */
+    } h_map;
+    struct {
+	int len;                           /* presumably number of file entries in map -- confirm */
+	struct idx_map_entry_t * entries;  /* heap array, released by fs_stateFree */
+    } fe_map;
+    struct {
+	int len;                           /* presumably number of callback entries in map -- confirm */
+	struct idx_map_entry_t * entries;  /* heap array, released by fs_stateFree */
+    } cb_map;
+};
+
+
+/* prototypes */
+
+/* serialize_state.c */
+extern int fs_stateWrite(struct fs_dump_state * state,
+			 void * buf, size_t len);
+extern int fs_stateRead(struct fs_dump_state * state,
+			void * buf, size_t len);
+extern int fs_stateWriteV(struct fs_dump_state * state,
+			  struct iovec * iov, int niov);
+extern int fs_stateReadV(struct fs_dump_state * state,
+			 struct iovec * iov, int niov);
+extern int fs_stateSync(struct fs_dump_state * state);
+extern int fs_stateWriteHeader(struct fs_dump_state * state,
+			       afs_uint64 * offset,
+			       void * hdr, size_t len);
+extern int fs_stateReadHeader(struct fs_dump_state * state,
+			      afs_uint64 * offset,
+			      void * hdr, size_t len);
+extern int fs_stateIncEOF(struct fs_dump_state * state,
+			  afs_int32 len);
+extern int fs_stateSeek(struct fs_dump_state * state,
+			afs_uint64 * offset);
+
+/* host.c */
+extern int h_stateSave(struct fs_dump_state * state);
+extern int h_stateRestore(struct fs_dump_state * state);
+extern int h_stateRestoreIndices(struct fs_dump_state * state);
+extern int h_stateVerify(struct fs_dump_state * state);
+extern int h_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new);
+
+/* callback.c */
+extern int cb_stateSave(struct fs_dump_state * state);
+extern int cb_stateRestore(struct fs_dump_state * state);
+extern int cb_stateRestoreIndices(struct fs_dump_state * state);
+extern int cb_stateVerify(struct fs_dump_state * state);
+extern int cb_stateVerifyHCBList(struct fs_dump_state * state, struct host * host);
+extern int fe_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new);
+extern int cb_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new);
+
+#endif /* AFS_DEMAND_ATTACH_FS */
+#endif /* _AFS_TVICED_SERIALIZE_STATE_H */
diff --git a/src/tviced/state_analyzer.c b/src/tviced/state_analyzer.c
new file mode 100644
index 0000000000..ae8c3ff7ad
--- /dev/null
+++ b/src/tviced/state_analyzer.c
@@ -0,0 +1,2004 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * demand attach fs
+ * fileserver state serialization
+ *
+ * state analyzer
+ */
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <stdio.h>
+#include <errno.h>
+#include <sys/file.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <time.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+
+#include <afs/stds.h>
+#include <rx/xdr.h>
+#include <afs/assert.h>
+#include <lwp.h>
+#include <lock.h>
+#include <afs/afsint.h>
+#include <afs/rxgen_consts.h>
+#include <afs/nfs.h>
+#include <afs/errors.h>
+#include <afs/ihandle.h>
+#include <afs/vnode.h>
+#include <afs/volume.h>
+#ifdef AFS_ATHENA_STDENV
+#include <krb.h>
+#endif
+#include <afs/acl.h>
+#include <afs/ptclient.h>
+#include <afs/prs_fs.h>
+#include <afs/auth.h>
+#include <afs/afsutil.h>
+#include <rx/rx.h>
+#include <afs/cellconfig.h>
+#include <stdlib.h>
+#include "../util/afsutil_prototypes.h"
+#include "../viced/viced.h"
+#include "../viced/host.h"
+#include "../viced/callback.h"
+#include "serialize_state.h"
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+/*@+fcnmacros +macrofcndecl@*/
+#ifdef O_LARGEFILE	/* 64-bit file API available: the afs_* names map to the *64 calls */
+#ifdef S_SPLINT_S	/* presumably set only when the splint checker parses this file */
+extern off64_t afs_lseek(int FD, off64_t O, int F);
+#endif /*S_SPLINT_S */
+#define afs_lseek(FD, O, F)	lseek64(FD, (off64_t)(O), F)
+#define afs_stat		stat64
+#define afs_fstat		fstat64
+#define afs_open		open64
+#define afs_fopen		fopen64
+#define afs_mmap                mmap64
+#ifdef AFS_AIX_ENV
+extern void * mmap64();  /* ugly hack since aix build env appears to be somewhat broken */
+#endif
+#else /* !O_LARGEFILE */
+#ifdef S_SPLINT_S	/* presumably set only when the splint checker parses this file */
+extern off_t afs_lseek(int FD, off_t O, int F);
+#endif /*S_SPLINT_S */
+#define afs_lseek(FD, O, F)	lseek(FD, (off_t)(O), F)
+#define afs_stat		stat
+#define afs_fstat		fstat
+#define afs_open		open
+#define afs_fopen		fopen
+#define afs_mmap                mmap
+#endif /* !O_LARGEFILE */
+/*@=fcnmacros =macrofcndecl@*/
+
+
+#ifndef AFS_DEMAND_ATTACH_FS
+int
+main (int argc, char ** argv)
+{   /* stub for non demand-attach builds: explain and exit with status 1 */
+    const char * self = argv[0] ? argv[0] : "state analyzer";
+    fprintf(stderr, "%s is only supported for demand attach fileservers\n", self);
+    return 1;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+
+static void usage(char * prog);
+static int openFile(char * path);
+static void initState(void);
+
+static void banner(void);
+static void prompt(void);
+
+static void print_help(void);
+static void print_global_help(void);
+static void print_h_help(void);
+static void print_fe_help(void);
+static void print_cb_help(void);
+
+static void dump_hdr(void);
+static void dump_h_hdr(void);
+static void dump_cb_hdr(void);
+
+static void dump_cb_timeout(void);
+static void dump_cb_fehash(void);
+
+static void dump_all_hes(void);
+static void dump_all_fes(void);
+static void dump_all_cbs(void);
+
+static void dump_he(afs_uint32 idx);
+static void dump_fe(afs_uint32 idx);
+static void dump_cb(afs_uint32 idx);
+static void dump_this_he(void);
+static void dump_this_fe(void);
+static void dump_this_cb(void);
+static void dump_next_he(void);
+static void dump_next_fe(void);
+static void dump_next_cb(void);
+static void dump_prev_he(void);
+static void dump_prev_fe(void);
+static void dump_prev_cb(void);
+static void dump_first_he(void);
+static void dump_first_fe(void);
+static void dump_first_cb(void);
+static void dump_last_he(void);
+static void dump_last_fe(void);
+static void dump_last_cb(void);
+static void dump_he_hdr(void);
+static void dump_he_entry(void);
+static void dump_he_interfaces(void);
+static void dump_he_hcps(void);
+static void dump_fe_hdr(void);
+static void dump_fe_entry(void);
+static void dump_cb_entry(void);
+
+static void hexdump_map(afs_uint32 offset, afs_uint32 len);
+
+static int get_hdr(void);
+static int get_h_hdr(void);
+static int get_cb_hdr(void);
+static int get_cb_timeout_hdr(void);
+static int get_cb_timeout(void);
+static int get_cb_fehash_hdr(void);
+static int get_cb_fehash(void);
+static int get_he(afs_uint32 idx);
+static int get_he_hdr(void);
+static int get_he_entry(void);
+static int get_fe(afs_uint32 idx);
+static int get_fe_hdr(void);
+static int get_fe_entry(void);
+static int get_cb(afs_uint32 idx);
+static int get_cb_entry(void);
+
+static int find_fe_by_index(afs_uint32 idx);
+static int find_cb_by_index(afs_uint32 idx);
+static int find_fe_by_fid(afs_uint32 vol, afs_uint32 vn, afs_uint32 uniq);
+
+
+static int dump_fd = -1;	/* fd of the open dump file, -1 when none (set in openFile) */
+static void * map = NULL;	/* base of the read-only mapping of the dump file */
+static size_t map_len;		/* length of that mapping, taken from the file's st_size */
+
+static struct {
+    struct fs_state_header hdr;	/* NOTE(review): presumably local copies of the headers -- confirm in the get_*_hdr helpers */
+    struct host_state_header h_hdr;
+    struct callback_state_header cb_hdr;
+    struct callback_state_timeout_header timeout_hdr;
+    struct callback_state_fehash_header fehash_hdr;
+    afs_uint32 * timeout;
+    afs_uint32 * fehash;
+
+    /* pointers into the memory map */
+    void * hdr_p;
+    void * h_hdr_p;
+    void * cb_hdr_p;
+    void * timeout_hdr_p;
+    void * timeout_p;
+    void * fehash_hdr_p;
+    void * fehash_p;
+
+    byte hdr_valid;	/* initState clears hdr_valid, h_hdr_valid and cb_hdr_valid */
+    byte h_hdr_valid;
+    byte cb_hdr_valid;
+    byte timeout_hdr_valid;	/* not touched by initState; zero only through static initialisation */
+    byte fehash_hdr_valid;	/* not touched by initState; zero only through static initialisation */
+} hdrs;
+
+static struct {
+    void * fh;		/* NOTE(review): presumably the first host entry in the map -- confirm */
+    void * cursor;	/* reset to NULL by initState */
+    void * ifp;		/* NOTE(review): presumably the interface list of the current entry -- confirm */
+    void * hcps;
+    struct host_state_entry_header hdr;
+    struct hostDiskEntry he;
+    afs_uint32 idx;	/* shown in the host-mode prompt */
+    byte hdr_valid;
+    byte he_valid;
+} he_cursor;
+
+static struct {
+    void ** cursor;	/* reset to NULL by initState; NOTE(review): presumably per-index host entry pointers -- confirm */
+} he_cache;
+
+static struct {
+    void * ffe;		/* NOTE(review): presumably the first file entry in the map -- confirm */
+    void * cursor;	/* reset to NULL by initState */
+    void * fcb;		/* NOTE(review): presumably the first callback of the current file entry -- confirm */
+    struct callback_state_entry_header hdr;
+    struct FEDiskEntry fe;
+    afs_uint32 idx;	/* shown in the fe-mode and cb-mode prompts */
+    byte hdr_valid;
+    byte fe_valid;
+} fe_cursor;
+
+static struct {
+    void ** cursor;	/* reset to NULL by initState; NOTE(review): presumably per-index file entry pointers -- confirm */
+} fe_cache;
+
+static struct {
+    void * cursor;	/* reset to NULL by initState */
+    struct CBDiskEntry cb;
+    afs_uint32 idx;	/* shown in the cb-mode prompt */
+    byte cb_valid;
+} cb_cursor;
+
+static struct {
+    void ** cursor;	/* not reset by initState; NULL only through static initialisation */
+} cb_cache;
+
+static void
+usage(char * prog)
+{   /* print the command-line synopsis for prog on stderr */
+    fprintf(stderr, "usage: %s [<state dump file>]\n", prog);
+}
+
+int
+main(int argc, char ** argv)
+{
+    /* Entry point: print the banner, validate arguments, map the dump
+     * file (argv[1], or the default state file path when omitted) and
+     * run the interactive prompt.  Exit status is 1 on a usage error
+     * or when the file cannot be opened, 0 once the prompt returns. */
+    banner();
+
+    if (argc > 2 || (argc == 2 && !strcmp(argv[1], "-h"))) {
+	usage(argv[0]);
+	return 1;
+    }
+
+    initState();
+
+    /* an explicit path wins; otherwise fall back to the default file */
+    if (openFile((argc > 1) ? argv[1] : AFSDIR_SERVER_FSSTATE_FILEPATH))
+	return 1;
+
+    prompt();
+    return 0;
+}
+
+
+static int
+openFile(char * path)
+{
+    int ret = 0;
+    struct afs_stat status;
+
+    /* Open path and map the whole file read-only into 'map' / 'map_len'.
+     * Returns 0 on success; on failure reports, cleans up, returns 1. */
+    dump_fd = afs_open(path, O_RDWR);
+    if (dump_fd == -1) {
+	fprintf(stderr, "dump file '%s' failed to open\n", path);
+	ret = 1;
+	goto done;
+    }
+    printf("opened dump file '%s'\n", path);
+
+    if (afs_fstat(dump_fd, &status) == -1) {
+	fprintf(stderr, "failed to stat file\n");
+	ret = 1;
+	goto done;
+    }
+    map_len = status.st_size;
+
+    map = afs_mmap(NULL, map_len, PROT_READ, MAP_SHARED, dump_fd, 0);
+    if (map == MAP_FAILED) {
+	fprintf(stderr, "failed to mmap file\n");
+	map = NULL;	/* MAP_FAILED is non-NULL; never hand it to munmap */
+	ret = 1;
+	goto done;
+    }
+    printf("mapped %lu bytes at %p\n", (unsigned long) map_len, map);
+
+ done:
+    if (ret) {
+	if (map) {
+	    munmap(map, map_len);
+	    map = NULL;
+	}
+	if (dump_fd != -1) {
+	    close(dump_fd);
+	    dump_fd = -1;
+	}
+    }
+    return ret;
+}
+
+static void
+initState(void)
+{   /* clear three header-valid flags and the cursor/cache pointers below */
+    hdrs.cb_hdr_valid = hdrs.h_hdr_valid = hdrs.hdr_valid = 0;
+    he_cursor.fh = he_cursor.cursor = NULL;			/* host entries */
+    fe_cursor.fcb = fe_cursor.ffe = fe_cursor.cursor = NULL;	/* file entries */
+    cb_cursor.cursor = NULL;  he_cache.cursor = fe_cache.cursor = NULL;
+}
+
+/*
+ * Write the three-line program banner to stderr.
+ */
+static void
+banner(void)
+{
+    fputs("demand attach fs\n", stderr);
+    fputs("fileserver state analyzer\n", stderr);
+    fputs("version 0.1\n", stderr);
+}
+
+#define PROGNAME "fs state analyzer"
+
+/*
+ * Interactive command loop of the analyzer.
+ *
+ * Reads command lines from stdin, splits them into whitespace-separated
+ * tokens and dispatches each token against the current menu (global,
+ * host, FileEntry or CallBack).  "h", "fe", "cb" and "fs" select a
+ * menu: given alone they switch menus, followed by a command they apply
+ * to that one command only.  An empty line repeats the previous line.
+ *
+ * Returns on "exit", on "quit" from the global menu, or when stdin
+ * reaches end of file.
+ */
+static void
+prompt(void)
+{
+    char input[256];
+    char prev_input[256];
+    char * tok = NULL;
+    char * eol;
+    int c;
+    afs_uint32 x, y, z;
+    enum {
+	PR_GLOBAL_MODE,
+	PR_H_MODE,
+	PR_FE_MODE,
+	PR_CB_MODE
+    } mode = PR_GLOBAL_MODE, next_mode;
+
+    next_mode = mode;
+    input[0] = prev_input[0] = '\0';
+
+    while (1) {
+	if (!tok) {
+	    /*
+	     * A menu prefix followed by a command that bailed out early
+	     * leaves its temporary mode behind; a fresh line always
+	     * starts in the persistent mode.
+	     */
+	    mode = next_mode;
+
+	    switch(mode) {
+	    case PR_GLOBAL_MODE:
+		printf(PROGNAME "> ");
+		break;
+	    case PR_H_MODE:
+		printf(PROGNAME ": h(%d)> ", he_cursor.idx);
+		break;
+	    case PR_FE_MODE:
+		printf(PROGNAME ": fe(%d)> ", fe_cursor.idx);
+		break;
+	    case PR_CB_MODE:
+		printf(PROGNAME ": fe(%d):cb(%d)> ", fe_cursor.idx, cb_cursor.idx);
+		break;
+	    default:
+		fprintf(stderr, "prompt state broken; aborting\n");
+		return;
+	    }
+	    fflush(stdout);
+
+	    /* bounded read: gets() could write past the end of input[] */
+	    if (fgets(input, sizeof(input), stdin) == NULL) {
+		/* end of input or read error: leave instead of spinning */
+		return;
+	    }
+	    eol = strpbrk(input, "\r\n");
+	    if (eol) {
+		*eol = '\0';
+	    } else {
+		/* over-long line: drop the rest so it is not read as a new command */
+		while ((c = getchar()) != EOF && c != '\n')
+		    ;
+	    }
+
+	    if (!strcmp(input, "")) {
+		/* repeat last command */
+		if (!strcmp(prev_input, "")) {
+		    continue;
+		}
+		strlcpy(input, prev_input, sizeof(input));
+	    } else {
+		/* save command for repetition */
+		strlcpy(prev_input, input, sizeof(prev_input));
+	    }
+
+	    tok = strtok(input, " \t");
+	}
+	/* skip command separators */
+	while (tok && !strcmp(tok, ";")) {
+	    tok = strtok(NULL, "; \t");
+	}
+
+	if (!tok) {
+	    continue;
+	}
+
+	if (!strcasecmp(tok, "exit")) {
+	    return;
+	} else if (!strcasecmp(tok, "quit")) {
+	    /* step one menu level up; from the global menu, leave */
+	    switch(mode) {
+	    case PR_CB_MODE:
+		next_mode = PR_FE_MODE;
+		break;
+	    case PR_FE_MODE:
+	    case PR_H_MODE:
+		next_mode = PR_GLOBAL_MODE;
+		break;
+	    default:
+		return;
+	    }
+	} else if (!strcasecmp(tok, "h")) {
+	    /* menu prefix: persistent only when nothing follows it */
+	    tok = strtok(NULL, " \t");
+	    mode = PR_H_MODE;
+	    if (!tok) {
+		next_mode = mode;
+	    }
+	    continue;
+	} else if (!strcasecmp(tok, "fe")) {
+	    tok = strtok(NULL, " \t");
+	    mode = PR_FE_MODE;
+	    if (!tok) {
+		next_mode = mode;
+	    }
+	    continue;
+	} else if (!strcasecmp(tok, "fs")) {
+	    tok = strtok(NULL, " \t");
+	    mode = PR_GLOBAL_MODE;
+	    if (!tok) {
+		next_mode = mode;
+	    }
+	    continue;
+	} else if (!strcasecmp(tok, "cb")) {
+	    tok = strtok(NULL, " \t");
+	    mode = PR_CB_MODE;
+	    if (!tok) {
+		next_mode = mode;
+	    }
+	    continue;
+	} else if (!strcasecmp(tok, "help")) {
+	    switch(mode) {
+	    case PR_H_MODE:
+		print_h_help();
+		break;
+	    case PR_FE_MODE:
+		print_fe_help();
+		break;
+	    case PR_CB_MODE:
+		print_cb_help();
+		break;
+	    default:
+		print_global_help();
+	    }
+	    print_help();
+	} else if (!strcasecmp(tok, "hexdump")) {
+	    tok = strtok(NULL, " \t");
+	    if (!tok) {
+		hexdump_map(0, map_len);
+		continue;
+	    }
+	    if (sscanf(tok, "%u", &x) != 1) {
+		fprintf(stderr, "hexdump parse error 1\n");
+		tok = NULL;
+		continue;
+	    }
+	    tok = strtok(NULL, " \t");
+	    if (!tok) {
+		hexdump_map(x, map_len - x);
+		continue;
+	    }
+	    if (sscanf(tok, "%u", &y) != 1) {
+		fprintf(stderr, "hexdump parse error 2\n");
+		/* abandon the line; otherwise the bad token is re-read as a command */
+		tok = NULL;
+		continue;
+	    }
+	    hexdump_map(x,y);
+	} else if (!strcasecmp(tok, "hdr")) {
+	    switch(mode) {
+	    case PR_H_MODE:
+		dump_h_hdr();
+		break;
+	    case PR_FE_MODE:
+		dump_cb_hdr();
+		break;
+	    case PR_CB_MODE:
+		dump_this_fe();
+		break;
+	    default:
+		dump_hdr();
+	    }
+	} else if (!strcasecmp(tok, "this")) {
+	    switch(mode) {
+	    case PR_H_MODE:
+		dump_this_he();
+		break;
+	    case PR_FE_MODE:
+		dump_this_fe();
+		break;
+	    case PR_CB_MODE:
+		dump_this_cb();
+		break;
+	    default:
+		fprintf(stderr, "command not valid for this mode\n");
+	    }
+	} else if (!strcasecmp(tok, "next")) {
+	    switch(mode) {
+	    case PR_H_MODE:
+		dump_next_he();
+		break;
+	    case PR_FE_MODE:
+		dump_next_fe();
+		break;
+	    case PR_CB_MODE:
+		dump_next_cb();
+		break;
+	    default:
+		fprintf(stderr, "command not valid for this mode\n");
+	    }
+	} else if (!strcasecmp(tok, "prev")) {
+	    switch(mode) {
+	    case PR_H_MODE:
+		dump_prev_he();
+		break;
+	    case PR_FE_MODE:
+		dump_prev_fe();
+		break;
+	    case PR_CB_MODE:
+		dump_prev_cb();
+		break;
+	    default:
+		fprintf(stderr, "command not valid for this mode\n");
+	    }
+	} else if (!strcasecmp(tok, "first")) {
+	    switch(mode) {
+	    case PR_H_MODE:
+		dump_first_he();
+		break;
+	    case PR_FE_MODE:
+		dump_first_fe();
+		break;
+	    case PR_CB_MODE:
+		dump_first_cb();
+		break;
+	    default:
+		fprintf(stderr, "command not valid for this mode\n");
+	    }
+	} else if (!strcasecmp(tok, "last")) {
+	    switch(mode) {
+	    case PR_H_MODE:
+		dump_last_he();
+		break;
+	    case PR_FE_MODE:
+		dump_last_fe();
+		break;
+	    case PR_CB_MODE:
+		dump_last_cb();
+		break;
+	    default:
+		fprintf(stderr, "command not valid for this mode\n");
+	    }
+	} else if (!strcasecmp(tok, "dump")) {
+	    switch(mode) {
+	    case PR_H_MODE:
+		dump_all_hes();
+		break;
+	    case PR_FE_MODE:
+		dump_all_fes();
+		break;
+	    case PR_CB_MODE:
+		dump_all_cbs();
+		break;
+	    default:
+		fprintf(stderr, "command not valid for this mode\n");
+	    }
+	} else if (!strcasecmp(tok, "find")) {
+	    tok = strtok(NULL, " \t");
+	    if (!tok || strcasecmp(tok, "by")) {
+		/* report the offending token before discarding it */
+		fprintf(stderr, "find syntax error 1 (%s)\n", 
+			(tok) ? tok : "nil");
+		tok = NULL;
+		continue;
+	    }
+	    tok = strtok(NULL, " \t");
+	    if (!tok) {
+		fprintf(stderr, "find syntax error 2\n");
+		continue;
+	    }
+	    switch(mode) {
+	    case PR_H_MODE:
+		fprintf(stderr, "not implemented yet\n");
+		break;
+	    case PR_FE_MODE:
+		if (!strcasecmp(tok, "index")) {
+		    tok = strtok(NULL, " \t");
+		    if (!tok || sscanf(tok, "%u", &x) != 1) {
+			tok = NULL;
+			fprintf(stderr, "find syntax error 3\n");
+			continue;
+		    }
+		    if (find_fe_by_index(x)) {
+			fprintf(stderr, "find returned no results\n");
+		    }
+		} else if (!strcasecmp(tok, "fid")) {
+		    /* accepts "(vol,vnode,unique)" as well as bare numbers */
+		    tok = strtok(NULL, "(), \t");
+		    if (!tok || sscanf(tok, "%u", &x) != 1) {
+			tok = NULL;
+			fprintf(stderr, "find syntax error 4\n");
+			continue;
+		    }
+		    tok = strtok(NULL, "(), \t");
+		    if (!tok || sscanf(tok, "%u", &y) != 1) {
+			tok = NULL;
+			fprintf(stderr, "find syntax error 5\n");
+			continue;
+		    }
+		    tok = strtok(NULL, "(), \t");
+		    if (!tok || sscanf(tok, "%u", &z) != 1) {
+			tok = NULL;
+			fprintf(stderr, "find syntax error 6\n");
+			continue;
+		    }
+		    if (find_fe_by_fid(x,y,z)) {
+			fprintf(stderr, "find returned no results\n");
+		    }
+		} else {
+		    fprintf(stderr, "unsupported filter type\n");
+		}
+		break;
+	    case PR_CB_MODE:
+		if (!strcasecmp(tok, "index")) {
+		    tok = strtok(NULL, " \t");
+		    if (!tok || sscanf(tok, "%u", &x) != 1) {
+			tok = NULL;
+			fprintf(stderr, "find syntax error 3\n");
+			continue;
+		    }
+		    if (find_cb_by_index(x)) {
+			fprintf(stderr, "find returned no results\n");
+		    }
+		} else {
+		    fprintf(stderr, "unsupported filter type\n");
+		}
+		break;
+	    default:
+		fprintf(stderr, "find not supported for this menu\n");
+	    }
+	} else if (!strcspn(tok, "0123456789")) {
+	    /* a token starting with a digit dumps the entry with that index */
+	    if (sscanf(tok, "%u", &x) == 1) {
+		switch(mode) {
+		case PR_H_MODE:
+		    dump_he(x);
+		    break;
+		case PR_FE_MODE:
+		    dump_fe(x);
+		    break;
+		case PR_CB_MODE:
+		    dump_cb(x);
+		    break;
+		default:
+		    fprintf(stderr, "command not available from this menu\n");
+		}
+	    } else {
+		fprintf(stderr, "input parse error ('%s')\n", tok);
+	    }
+	} else if (mode == PR_FE_MODE) {
+	    if (!strcmp(tok, "timeout")) {
+		dump_cb_timeout();
+	    } else if (!strcmp(tok, "hash")) {
+		dump_cb_fehash();
+	    } else {
+		/* previously anything else typed in the FE menu was silently ignored */
+		fprintf(stderr, "unknown command\n");
+	    }
+	} else {
+	    fprintf(stderr, "unknown command\n");
+	}
+	tok = strtok(NULL, " \t");
+	mode = next_mode;
+    }
+}
+
+/*
+ * Print the commands that are available from every menu.  The prompt
+ * prints this after the menu-specific help text.
+ */
+static void
+print_help(void)
+{
+    printf("\th <...>  -- host menu commands\n");
+    printf("\tfe <...> -- FileEntry menu commands\n");
+    printf("\tcb <...> -- CallBack menu commands\n");
+    /* the prompt also accepts "fs" to reach the global menu */
+    printf("\tfs <...> -- global menu commands\n");
+    printf("\thexdump [<offset> [<len>]]\n\t\t -- hex dump the raw data\n");
+    printf("\tquit     -- quit this menu\n");
+    printf("\texit     -- exit the debugger\n");
+    printf("\thelp     -- this help message\n");
+}
+
+/*
+ * Print the help text for the commands specific to the global menu.
+ */
+static void
+print_global_help(void)
+{
+    fputs("\thdr      -- display the fs_state_header struct\n", stdout);
+}
+
+/*
+ * Print the help text for the commands specific to the host menu.
+ */
+static void
+print_h_help(void)
+{
+    static const char * const lines[] = {
+	"\thdr      -- display the host_state_header struct\n",
+	"\tfirst    -- display the first host\n",
+	"\tprev     -- display the previous host\n",
+	"\tthis     -- display this host\n",
+	"\tnext     -- display the next host\n",
+	"\tlast     -- display the last host\n",
+	"\tdump     -- display all hosts\n"
+    };
+    size_t n;
+
+    for (n = 0; n < sizeof(lines) / sizeof(lines[0]); n++) {
+	fputs(lines[n], stdout);
+    }
+}
+
+/*
+ * Print the help text for the commands specific to the FileEntry menu.
+ */
+static void
+print_fe_help(void)
+{
+    static const char * const lines[] = {
+	"\thdr      -- display the callback_state_header struct\n",
+	"\tfirst    -- display the first FE\n",
+	"\tprev     -- display the previous FE\n",
+	"\tthis     -- display this FE\n",
+	"\tnext     -- display the next FE\n",
+	"\tlast     -- display the last FE\n",
+	"\tdump     -- display all FEs\n",
+	"\ttimeout  -- display the timeout queue heads\n",
+	"\thash   -- display the file entry hash buckets\n",
+	"\tfind by index <id>\n\t\t -- find an fe by its array index\n",
+	"\tfind by fid <(vol,vnode,unique)>\n\t\t -- find an fe by its AFSFid\n"
+    };
+    size_t n;
+
+    for (n = 0; n < sizeof(lines) / sizeof(lines[0]); n++) {
+	fputs(lines[n], stdout);
+    }
+}
+
+/*
+ * Print the help text for the commands specific to the CallBack menu.
+ */
+static void
+print_cb_help(void)
+{
+    printf("\thdr      -- display the callback_state_entry_header struct\n");
+    printf("\tfirst    -- display the first CB\n");
+    printf("\tprev     -- display the previous CB\n");
+    printf("\tthis     -- display this CB\n");
+    printf("\tnext     -- display the next CB\n");
+    printf("\tlast     -- display the last CB\n");
+    printf("\tdump     -- display all CBs\n");
+    /* the prompt implements this lookup for the CB menu, so list it */
+    printf("\tfind by index <id>\n\t\t -- find a cb by its array index\n");
+}
+
+/*
+ * Pretty-printing helpers for the dump_* routines.  The DPFTBn strings
+ * are indentation prefixes; the numeric suffix of the other macros
+ * selects the indentation level.
+ */
+#define DPFTB0 "\t"
+#define DPFTB1 "\t\t"
+#define DPFTB2 "\t\t\t"
+
+/*
+ * Announce which address (and offset into the mapped dump) a structure
+ * is being loaded from.  The address is printed with %p and the offset
+ * as unsigned long so the output is well defined on LP64 platforms.
+ */
+#define DPFOFF(addr) \
+    do { \
+        char * _p = (char *)(addr); \
+        char * _m = (char *)map; \
+        printf("loading structure from address %p (offset %lu)\n", \
+               (void *)_p, (unsigned long)(_p-_m)); \
+    } while (0)
+
+/* structs */
+#define DPFSO(T, name) printf(T "%s = {\n", name)
+#define DPFSO0(name) DPFSO(DPFTB0, name)
+#define DPFSO1(name) DPFSO(DPFTB1, name)
+#define DPFSC(T) printf(T "}\n")
+#define DPFSC0 DPFSC(DPFTB0)
+#define DPFSC1 DPFSC(DPFTB1)
+
+/* arrays */
+#define DPFAO(T1, T2, name) printf(T1 "%s =\n" T2 "{ ", name)
+#define DPFAO0(name) DPFAO(DPFTB0, DPFTB1, name)
+#define DPFAO1(name) DPFAO(DPFTB1, DPFTB2, name)
+#define DPFAC0 printf(" }\n")
+#define DPFAC1 DPFAC0
+#define DPFA1 printf(DPFTB1 "  ")
+#define DPFA2 printf(DPFTB2 "  ")
+#define DPFAN printf("\n")
+#define DPFALE(type, var) printf("%" type, var)
+#define DPFAE(type, var) printf("%" type ",\t", var)
+
+/* normal vars */
+#define DPFV(T, name, type, var) printf(T "%s = %" type "\n", name, var)
+#define DPFV1(name, type, var) DPFV(DPFTB1, name, type, var)
+#define DPFV2(name, type, var) DPFV(DPFTB2, name, type, var)
+
+/* hex */
+#define DPFX(T, name, var) printf(T "%s = 0x%x\n", name, var)
+#define DPFX1(name, var) DPFX(DPFTB1, name, var)
+#define DPFX2(name, var) DPFX(DPFTB2, name, var)
+
+/* strings */
+#define DPFS(T, name, var) printf(T "%s = \"%s\"\n", name, var)
+#define DPFS1(name, var) DPFS(DPFTB1, name, var)
+#define DPFS2(name, var) DPFS(DPFTB2, name, var)
+
+/*
+ * time
+ *
+ * The value is copied into a real time_t first: ctime() reads a full
+ * time_t through its pointer, and the field passed in may be narrower
+ * (NOTE(review): the on-disk timestamps look like 32-bit fields --
+ * confirm).  ctime() may also return NULL, which is reported instead
+ * of being dereferenced.
+ */
+#define DPFT(T, name, var) \
+    do { \
+        time_t _dpft_when = (time_t)(var); \
+        char * _dpft_str = ctime(&_dpft_when); \
+        if (_dpft_str != NULL) \
+            _dpft_str[strcspn(_dpft_str, "\r\n")] = '\0'; \
+        printf(T "%s = \"%s\"\n", name, \
+               (_dpft_str != NULL) ? _dpft_str : "(invalid time)"); \
+    } while(0)
+#define DPFT1(name, var) DPFT(DPFTB1, name, var)
+#define DPFT2(name, var) DPFT(DPFTB2, name, var)
+
+/*
+ * Print the top-level fs_state_header of the dump, then report whether
+ * its magic number and version match what this build expects.
+ * Prints nothing if the header cannot be loaded.
+ */
+static void
+dump_hdr(void)
+{
+    afs_uint32 word_hi, word_lo;
+    char uuid_text[40];
+
+    if (get_hdr() != 0) {
+	return;
+    }
+
+    /* format the UUID ahead of time; it is printed further down */
+    afsUUID_to_string(&hdrs.hdr.server_uuid, uuid_text, sizeof(uuid_text));
+
+    DPFOFF(map);
+    DPFSO0("fs_state_header");
+    DPFSO1("stamp");
+    DPFX2("magic", hdrs.hdr.stamp.magic);
+    DPFV2("version", "u", hdrs.hdr.stamp.version);
+    DPFSC1;
+    DPFT1("timestamp", hdrs.hdr.timestamp);
+    DPFV1("sys_name", "u", hdrs.hdr.sys_name);
+    DPFS1("server_uuid", uuid_text);
+    DPFV1("valid", "d", hdrs.hdr.valid);
+    DPFV1("endianness", "d", hdrs.hdr.endianness);
+    DPFV1("stats_detailed", "d", hdrs.hdr.stats_detailed);
+
+    /* 64-bit offsets are shown as their two 32-bit halves */
+    SplitInt64(hdrs.hdr.h_offset, word_hi, word_lo);
+    DPFSO1("h_offset");
+    DPFV2("hi", "u", word_hi);
+    DPFV2("lo", "u", word_lo);
+    DPFSC1;
+
+    SplitInt64(hdrs.hdr.cb_offset, word_hi, word_lo);
+    DPFSO1("cb_offset");
+    DPFV2("hi", "u", word_hi);
+    DPFV2("lo", "u", word_lo);
+    DPFSC1;
+
+    DPFS1("server_version_string", hdrs.hdr.server_version_string);
+    DPFSC0;
+
+    /* sanity checks come last so the raw values are always visible */
+    if (hdrs.hdr.stamp.magic != FS_STATE_MAGIC)
+	fprintf(stderr, "* magic check failed\n");
+    if (hdrs.hdr.stamp.version != FS_STATE_VERSION)
+	fprintf(stderr, "* version check failed\n");
+}
+
+/*
+ * Print the host_state_header and report whether its magic number and
+ * version match what this build expects.  Prints nothing if the header
+ * cannot be loaded.
+ */
+static void
+dump_h_hdr(void)
+{
+    if (get_h_hdr() != 0) {
+	return;
+    }
+
+    DPFOFF(hdrs.h_hdr_p);
+    DPFSO0("host_state_header");
+    DPFSO1("stamp");
+    DPFX2("magic", hdrs.h_hdr.stamp.magic);
+    DPFV2("version", "u", hdrs.h_hdr.stamp.version);
+    DPFSC1;
+    DPFV1("records", "u", hdrs.h_hdr.records);
+    DPFV1("index_max", "u", hdrs.h_hdr.index_max);
+    DPFSC0;
+
+    /* checked after printing so the raw values are always visible */
+    if (hdrs.h_hdr.stamp.magic != HOST_STATE_MAGIC)
+	fprintf(stderr, "* magic check failed\n");
+    if (hdrs.h_hdr.stamp.version != HOST_STATE_VERSION)
+	fprintf(stderr, "* version check failed\n");
+}
+
+/*
+ * Print the callback_state_header and report whether its magic number
+ * and version match what this build expects.  Prints nothing if the
+ * header cannot be loaded.
+ */
+static void
+dump_cb_hdr(void)
+{
+    afs_uint32 word_hi, word_lo;
+
+    if (get_cb_hdr() != 0) {
+	return;
+    }
+
+    DPFOFF(hdrs.cb_hdr_p);
+    DPFSO0("callback_state_header");
+    DPFSO1("stamp");
+    DPFX2("magic", hdrs.cb_hdr.stamp.magic);
+    DPFV2("version", "u", hdrs.cb_hdr.stamp.version);
+    DPFSC1;
+    DPFV1("nFEs", "u", hdrs.cb_hdr.nFEs);
+    DPFV1("nCBs", "u", hdrs.cb_hdr.nCBs);
+    DPFV1("fe_max", "u", hdrs.cb_hdr.fe_max);
+    DPFV1("cb_max", "u", hdrs.cb_hdr.cb_max);
+    DPFV1("tfirst", "d", hdrs.cb_hdr.tfirst);
+
+    /* each 64-bit offset is shown as its two 32-bit halves */
+    SplitInt64(hdrs.cb_hdr.timeout_offset, word_hi, word_lo);
+    DPFSO1("timeout_offset");
+    DPFV2("hi", "u", word_hi);
+    DPFV2("lo", "u", word_lo);
+    DPFSC1;
+
+    SplitInt64(hdrs.cb_hdr.fehash_offset, word_hi, word_lo);
+    DPFSO1("fehash_offset");
+    DPFV2("hi", "u", word_hi);
+    DPFV2("lo", "u", word_lo);
+    DPFSC1;
+
+    SplitInt64(hdrs.cb_hdr.fe_offset, word_hi, word_lo);
+    DPFSO1("fe_offset");
+    DPFV2("hi", "u", word_hi);
+    DPFV2("lo", "u", word_lo);
+    DPFSC1;
+
+    DPFSC0;
+
+    /* checked after printing so the raw values are always visible */
+    if (hdrs.cb_hdr.stamp.magic != CALLBACK_STATE_MAGIC)
+	fprintf(stderr, "* magic check failed\n");
+    if (hdrs.cb_hdr.stamp.version != CALLBACK_STATE_VERSION)
+	fprintf(stderr, "* version check failed\n");
+}
+
+/*
+ * Print the callback_state_timeout_header followed by the timeout
+ * array, eight values per output row.
+ *
+ * The number of values printed is the record count stored in the
+ * header, which is also the number of entries get_cb_timeout()
+ * allocates.  (The loop used to assume exactly 128 entries and read
+ * past the allocation whenever the header said otherwise.)
+ */
+static void
+dump_cb_timeout(void)
+{
+    afs_uint32 i, n;
+
+    if (get_cb_hdr())
+	return;
+
+    if (get_cb_timeout_hdr())
+	return;
+
+    if (get_cb_timeout())
+	return;
+
+    DPFOFF(hdrs.timeout_hdr_p);
+    DPFSO0("callback_state_timeout_header");
+    DPFX1("magic", hdrs.timeout_hdr.magic);
+    DPFV1("len", "u", hdrs.timeout_hdr.len);
+    DPFV1("records", "u", hdrs.timeout_hdr.records);
+    DPFSC0;
+
+    if (hdrs.timeout_hdr.magic != CALLBACK_STATE_TIMEOUT_MAGIC) {
+	fprintf(stderr, "* magic check failed\n");
+    }
+
+    n = hdrs.timeout_hdr.records;
+
+    DPFOFF(hdrs.timeout_p);
+    DPFAO0("timeout");
+    for (i = 0; i < n; i++) {
+	if ((i + 1) == n) {
+	    /* last element: no trailing separator */
+	    DPFALE("u", hdrs.timeout[i]);
+	} else {
+	    DPFAE("u", hdrs.timeout[i]);
+	    if ((i % 8) == 7) {
+		DPFAN;
+		DPFA1;
+	    }
+	}
+    }
+    DPFAC0;
+}
+
+/*
+ * Print the callback_state_fehash_header followed by the FE hash
+ * bucket array, eight values per output row.
+ *
+ * An empty table (records == 0) prints an empty array; the old loop
+ * bound "records - 1" wrapped around in that case and indexed far past
+ * the allocation.
+ */
+static void
+dump_cb_fehash(void)
+{
+    afs_uint32 i, n;
+
+    if (get_cb_hdr())
+	return;
+
+    if (get_cb_fehash_hdr())
+	return;
+
+    if (get_cb_fehash())
+	return;
+
+    DPFOFF(hdrs.fehash_hdr_p);
+    DPFSO0("callback_state_fehash_header");
+    DPFX1("magic", hdrs.fehash_hdr.magic);
+    DPFV1("len", "u", hdrs.fehash_hdr.len);
+    DPFV1("records", "u", hdrs.fehash_hdr.records);
+    DPFSC0;
+
+    if (hdrs.fehash_hdr.magic != CALLBACK_STATE_FEHASH_MAGIC) {
+	fprintf(stderr, "* magic check failed\n");
+    }
+
+    n = hdrs.fehash_hdr.records;
+
+    DPFOFF(hdrs.fehash_p);
+    DPFAO0("fehash");
+    for (i = 0; i < n; i++) {
+	if ((i + 1) == n) {
+	    /* last element: no trailing separator */
+	    DPFALE("u", hdrs.fehash[i]);
+	} else {
+	    DPFAE("u", hdrs.fehash[i]);
+	    if ((i % 8) == 7) {
+		DPFAN;
+		DPFA1;
+	    }
+	}
+    }
+    DPFAC0;
+}
+
+/*
+ * Dump every host entry, in index order, as counted by the
+ * host_state_header.
+ */
+static void
+dump_all_hes(void)
+{
+    int n = 0;
+
+    if (get_h_hdr() != 0) {
+	fprintf(stderr, "error getting host_state_header\n");
+	return;
+    }
+
+    while (n < hdrs.h_hdr.records) {
+	dump_he(n);
+	n++;
+    }
+}
+
+/*
+ * Dump every FileEntry, in index order, as counted by the
+ * callback_state_header.
+ */
+static void
+dump_all_fes(void)
+{
+    int n = 0;
+
+    if (get_cb_hdr() != 0) {
+	fprintf(stderr, "error getting callback_state_header\n");
+	return;
+    }
+
+    while (n < hdrs.cb_hdr.nFEs) {
+	dump_fe(n);
+	n++;
+    }
+}
+
+/*
+ * Dump every CallBack of the current FileEntry, in index order, as
+ * counted by that entry's header.
+ */
+static void
+dump_all_cbs(void)
+{
+    int n = 0;
+
+    if (get_fe_hdr() != 0) {
+	fprintf(stderr, "error getting callback_state_entry_header\n");
+	return;
+    }
+
+    while (n < fe_cursor.hdr.nCBs) {
+	dump_cb(n);
+	n++;
+    }
+}
+
+/*
+ * Load host entry 'idx' into the host cursor and print its header,
+ * body, interface list and hcps list.  Reports an error on stderr if
+ * the entry cannot be loaded.
+ */
+static void
+dump_he(afs_uint32 idx)
+{
+    if (get_he(idx) == 0) {
+	DPFOFF(he_cursor.cursor);
+	dump_he_hdr();
+	dump_he_entry();
+	dump_he_interfaces();
+	dump_he_hcps();
+    } else {
+	fprintf(stderr, "error getting he %d\n", idx);
+    }
+}
+
+/*
+ * Load FileEntry 'idx' into the FE cursor and print its header and
+ * body.  Reports an error on stderr if the entry cannot be loaded.
+ */
+static void
+dump_fe(afs_uint32 idx)
+{
+    if (get_fe(idx) == 0) {
+	DPFOFF(fe_cursor.cursor);
+	dump_fe_hdr();
+	dump_fe_entry();
+    } else {
+	fprintf(stderr, "error getting fe %d\n", idx);
+    }
+}
+
+/*
+ * Load CallBack 'idx' into the CB cursor and print it.  Reports an
+ * error on stderr if the entry cannot be loaded.
+ */
+static void
+dump_cb(afs_uint32 idx)
+{
+    if (get_cb(idx) == 0) {
+	DPFOFF(cb_cursor.cursor);
+	dump_cb_entry();
+    } else {
+	fprintf(stderr, "error getting cb %d\n", idx);
+    }
+}
+
+/*
+ * Dump the host entry at the host cursor's current index.
+ */
+static void
+dump_this_he(void)
+{
+    afs_uint32 current = he_cursor.idx;
+
+    dump_he(current);
+}
+
+/*
+ * Dump the FileEntry at the FE cursor's current index.
+ */
+static void
+dump_this_fe(void)
+{
+    afs_uint32 current = fe_cursor.idx;
+
+    dump_fe(current);
+}
+
+/*
+ * Dump the CallBack at the CB cursor's current index.
+ */
+static void
+dump_this_cb(void)
+{
+    afs_uint32 current = cb_cursor.idx;
+
+    dump_cb(current);
+}
+
+/*
+ * Advance to and dump the host entry after the current one, or report
+ * that the cursor is already on the last entry.
+ */
+static void
+dump_next_he(void)
+{
+    if (get_h_hdr() != 0) {
+	fprintf(stderr, "error getting host_state_header\n");
+	return;
+    }
+
+    if ((he_cursor.idx + 1) < hdrs.h_hdr.records) {
+	dump_he(he_cursor.idx+1);
+    } else {
+	fprintf(stderr, "no more HEs\n");
+    }
+}
+
+/*
+ * Advance to and dump the FileEntry after the current one, or report
+ * that the cursor is already on the last entry.
+ */
+static void
+dump_next_fe(void)
+{
+    if (get_cb_hdr() != 0) {
+	fprintf(stderr, "error getting callback_state_header\n");
+	return;
+    }
+
+    if ((fe_cursor.idx + 1) < hdrs.cb_hdr.nFEs) {
+	dump_fe(fe_cursor.idx+1);
+    } else {
+	fprintf(stderr, "no more FEs\n");
+    }
+}
+
+/*
+ * Advance to and dump the CallBack after the current one within the
+ * current FileEntry, or report that the cursor is already on the last.
+ */
+static void
+dump_next_cb(void)
+{
+    if (get_fe_hdr() != 0) {
+	fprintf(stderr, "error getting callback_state_entry_header\n");
+	return;
+    }
+
+    if ((cb_cursor.idx + 1) < fe_cursor.hdr.nCBs) {
+	dump_cb(cb_cursor.idx+1);
+    } else {
+	fprintf(stderr, "no more CBs\n");
+    }
+}
+
+/*
+ * Step back to and dump the host entry before the current one, or
+ * report that the cursor is already on entry 0.
+ */
+static void
+dump_prev_he(void)
+{
+    if (he_cursor.idx) {
+	dump_he(he_cursor.idx-1);
+    } else {
+	fprintf(stderr, "no more HEs\n");
+    }
+}
+
+/*
+ * Step back to and dump the FileEntry before the current one, or
+ * report that the cursor is already on entry 0.
+ */
+static void
+dump_prev_fe(void)
+{
+    if (fe_cursor.idx) {
+	dump_fe(fe_cursor.idx-1);
+    } else {
+	fprintf(stderr, "no more FEs\n");
+    }
+}
+
+/*
+ * Step back to and dump the CallBack before the current one, or report
+ * that the cursor is already on entry 0.
+ */
+static void
+dump_prev_cb(void)
+{
+    if (cb_cursor.idx) {
+	dump_cb(cb_cursor.idx-1);
+    } else {
+	fprintf(stderr, "no more CBs\n");
+    }
+}
+
+/*
+ * Dump FileEntry 0, or report that the dump holds no FileEntries.
+ */
+static void
+dump_first_fe(void)
+{
+    if (get_cb_hdr() != 0) {
+	fprintf(stderr, "error getting callback_state_header\n");
+	return;
+    }
+
+    if (hdrs.cb_hdr.nFEs) {
+	dump_fe(0);
+    } else {
+	fprintf(stderr, "no FEs present\n");
+    }
+}
+
+/*
+ * Dump host entry 0, or report that the dump holds no host entries.
+ */
+static void
+dump_first_he(void)
+{
+    if (get_h_hdr() != 0) {
+	fprintf(stderr, "error getting host_state_header\n");
+	return;
+    }
+
+    if (hdrs.h_hdr.records) {
+	dump_he(0);
+    } else {
+	fprintf(stderr, "no HEs present\n");
+    }
+}
+
+/*
+ * Dump CallBack 0 of the current FileEntry, or report that the entry
+ * has no CallBacks.
+ */
+static void
+dump_first_cb(void)
+{
+    if (get_fe_hdr() != 0) {
+	fprintf(stderr, "error getting callback_state_entry_header\n");
+	return;
+    }
+
+    if (fe_cursor.hdr.nCBs) {
+	dump_cb(0);
+    } else {
+	fprintf(stderr, "no CBs present\n");
+    }
+}
+
+/*
+ * Dump the host entry with the highest index, or report that the dump
+ * holds no host entries.
+ */
+static void
+dump_last_he(void)
+{
+    if (get_h_hdr() != 0) {
+	fprintf(stderr, "error getting host_state_header\n");
+	return;
+    }
+
+    if (hdrs.h_hdr.records) {
+	dump_he(hdrs.h_hdr.records-1);
+    } else {
+	fprintf(stderr, "no HEs present\n");
+    }
+}
+
+/*
+ * Dump the FileEntry with the highest index, or report that the dump
+ * holds no FileEntries.
+ */
+static void
+dump_last_fe(void)
+{
+    if (get_cb_hdr() != 0) {
+	fprintf(stderr, "error getting callback_state_header\n");
+	return;
+    }
+
+    if (hdrs.cb_hdr.nFEs) {
+	dump_fe(hdrs.cb_hdr.nFEs-1);
+    } else {
+	fprintf(stderr, "no FEs present\n");
+    }
+}
+
+/*
+ * Dump the highest-indexed CallBack of the current FileEntry, or
+ * report that the entry has no CallBacks.
+ */
+static void
+dump_last_cb(void)
+{
+    if (get_fe_hdr() != 0) {
+	fprintf(stderr, "error getting callback_state_entry_header\n");
+	return;
+    }
+
+    if (fe_cursor.hdr.nCBs) {
+	dump_cb(fe_cursor.hdr.nCBs-1);
+    } else {
+	fprintf(stderr, "no CBs present\n");
+    }
+}
+
+/*
+ * Print the host_state_entry_header held in the host cursor and flag
+ * a bad magic number.
+ */
+static void
+dump_he_hdr(void)
+{
+    DPFSO0("host_state_entry_header");
+    DPFX1("magic", he_cursor.hdr.magic);
+    DPFV1("len", "u", he_cursor.hdr.len);
+    DPFV1("interfaces", "u", he_cursor.hdr.interfaces);
+    DPFV1("hcps", "u", he_cursor.hdr.hcps);
+    DPFSC0;
+
+    /* checked after printing so the raw value is always visible */
+    if (he_cursor.hdr.magic != HOST_STATE_ENTRY_MAGIC)
+	fprintf(stderr, "* magic check failed\n");
+}
+
+/*
+ * Print the hostDiskEntry held in the host cursor.  InSameNetwork is
+ * only shown when the dump's top-level header has stats_detailed set;
+ * builds without FS_STATS_DETAILED read it from the padding byte that
+ * occupies its slot.
+ */
+static void
+dump_he_entry(void)
+{
+    DPFSO0("hostDiskEntry");
+    DPFS1("host", afs_inet_ntoa(he_cursor.he.host));
+    DPFV1("port", "u", he_cursor.he.port);
+    DPFX1("hostFlags", he_cursor.he.hostFlags);
+    DPFV1("Console", "u", he_cursor.he.Console);
+    DPFV1("hcpsfailed", "u", he_cursor.he.hcpsfailed);
+    DPFV1("hcps_valid", "u", he_cursor.he.hcps_valid);
+
+    if (hdrs.hdr.stats_detailed) {
+#ifdef FS_STATS_DETAILED
+	DPFV1("InSameNetwork", "u", he_cursor.he.InSameNetwork);
+#else
+	DPFV1("InSameNetwork", "u", he_cursor.he.padding1[0]);
+#endif
+    }
+
+    DPFV1("hcps_len", "u", he_cursor.he.hcps_len);
+    DPFT1("LastCall", he_cursor.he.LastCall);
+    DPFT1("ActiveCall", he_cursor.he.ActiveCall);
+    DPFT1("cpsCall", he_cursor.he.cpsCall);
+    DPFV1("cblist", "u", he_cursor.he.cblist);
+    DPFV1("index", "u", he_cursor.he.index);
+    DPFSC0;
+}
+
+/*
+ * Print the Interface structure that follows the current host entry:
+ * its interface count, UUID and every address/port pair.  The data is
+ * copied out of the map into a temporary buffer first.  Does nothing
+ * when the entry header records zero interfaces; warns when the header
+ * count and the structure's own count disagree.
+ */
+static void
+dump_he_interfaces(void)
+{
+    struct Interface * copy;
+    char label[40];
+    int nbytes, n;
+
+    if (he_cursor.hdr.interfaces == 0)
+	return;
+
+    /* struct Interface already contains room for one AddrPort */
+    nbytes = sizeof(struct Interface) + ((he_cursor.hdr.interfaces-1)*sizeof(struct AddrPort));
+    copy = (struct Interface *) malloc(nbytes);
+    assert(copy != NULL);
+
+    memcpy(copy, he_cursor.ifp, nbytes);
+
+    DPFSO0("Interface");
+    DPFV1("numberOfInterfaces", "u", copy->numberOfInterfaces);
+
+    afsUUID_to_string(&copy->uuid, label, sizeof(label));
+    DPFS1("uuid", label);
+
+    n = 0;
+    while (n < he_cursor.hdr.interfaces) {
+	snprintf(label, sizeof(label), "interface[%d]", n);
+	DPFSO1(label);
+	DPFS2("addr", afs_inet_ntoa(copy->interface[n].addr));
+	DPFV2("port", "u", copy->interface[n].port);
+	DPFSC1;
+	n++;
+    }
+
+    DPFSC0;
+
+    if (copy->numberOfInterfaces != he_cursor.hdr.interfaces) {
+	fprintf(stderr, "* interface count mismatch between header and Interface struct\n");
+    }
+    free(copy);
+}
+
+/*
+ * Print the hcps list that follows the current host entry, eight
+ * values per output row.  The values are copied out of the map into a
+ * temporary buffer first.  Does nothing when the entry header records
+ * zero hcps values.
+ */
+static void
+dump_he_hcps(void)
+{
+    char temp_str[40];
+    afs_int32 * ids;
+    int nbytes, n;
+
+    if (he_cursor.hdr.hcps == 0)
+	return;
+
+    nbytes = (he_cursor.hdr.hcps)*sizeof(afs_uint32);
+    ids = (afs_int32 *) malloc(nbytes);
+    assert(ids != NULL);
+    memcpy(ids, he_cursor.hcps, nbytes);
+
+    DPFSO0("hcps");
+    DPFAO1("prlist_val");
+
+    /* all but the last value are followed by a separator */
+    n = 0;
+    while (n < he_cursor.hdr.hcps - 1) {
+	DPFAE("d", ids[n]);
+	if ((n % 8) == 7) {
+	    DPFAN;
+	    DPFA2;
+	}
+	n++;
+    }
+    DPFALE("d", ids[he_cursor.hdr.hcps-1]);
+
+    DPFAC1;
+    DPFSC0;
+    free(ids);
+}
+
+/*
+ * Print the callback_state_entry_header held in the FE cursor and flag
+ * a bad magic number.
+ */
+static void
+dump_fe_hdr(void)
+{
+    DPFSO0("callback_state_entry_header");
+    DPFX1("magic", fe_cursor.hdr.magic);
+    DPFV1("len", "u", fe_cursor.hdr.len);
+    DPFV1("nCBs", "u", fe_cursor.hdr.nCBs);
+    DPFSC0;
+
+    /* checked after printing so the raw value is always visible */
+    if (fe_cursor.hdr.magic != CALLBACK_STATE_ENTRY_MAGIC)
+	fprintf(stderr, "* magic check failed\n");
+}
+
+/*
+ * Print the FEDiskEntry held in the FE cursor: the embedded fe
+ * structure field by field, then the entry's index.
+ */
+static void
+dump_fe_entry(void)
+{
+    DPFSO0("FEDiskEntry");
+
+    /* nested FileEntry structure */
+    DPFSO1("fe");
+    DPFV2("vnode", "u", fe_cursor.fe.fe.vnode);
+    DPFV2("unique", "u", fe_cursor.fe.fe.unique);
+    DPFV2("volid", "u", fe_cursor.fe.fe.volid);
+    DPFV2("fnext", "u", fe_cursor.fe.fe.fnext);
+    DPFV2("ncbs", "u", fe_cursor.fe.fe.ncbs);
+    DPFV2("firstcb", "u", fe_cursor.fe.fe.firstcb);
+    DPFV2("status", "u", fe_cursor.fe.fe.status);
+    DPFSC1;
+
+    DPFV1("index", "u", fe_cursor.fe.index);
+
+    DPFSC0;
+}
+
+/*
+ * Print the CBDiskEntry held in the CB cursor: the embedded cb
+ * structure field by field, then the entry's index.
+ */
+static void
+dump_cb_entry(void)
+{
+    DPFSO0("CBDiskEntry");
+
+    /* nested CallBack structure; thead and status are widened for %u */
+    DPFSO1("cb");
+    DPFV2("cnext", "u", cb_cursor.cb.cb.cnext);
+    DPFV2("fhead", "u", cb_cursor.cb.cb.fhead);
+    DPFV2("thead", "u", (afs_uint32)cb_cursor.cb.cb.thead);
+    DPFV2("status", "u", (afs_uint32)cb_cursor.cb.cb.status);
+    DPFV2("hhead", "u", cb_cursor.cb.cb.hhead);
+    DPFV2("tprev", "u", cb_cursor.cb.cb.tprev);
+    DPFV2("tnext", "u", cb_cursor.cb.cb.tnext);
+    DPFV2("hprev", "u", cb_cursor.cb.cb.hprev);
+    DPFV2("hnext", "u", cb_cursor.cb.cb.hnext);
+    DPFSC1;
+
+    DPFV1("index", "u", cb_cursor.cb.index);
+
+    DPFSC0;
+}
+
+/* hex dump output helpers used by hexdump_map() */
+#define DPFHMS printf("  ")			/* extra gap in the middle of a row */
+#define DPFHS printf("    ")			/* blank cell, as wide as one byte */
+#define DPFHN(offset) printf("\n%u\t", (offset))	/* start a row labelled with its offset */
+#define DPFHD(x) printf("%02X  ", (x))		/* one byte as two hex digits */
+#define DPFHE printf("\n")			/* end of dump */
+
+/*
+ * Hex dump 'len' bytes of the mapped file starting at byte 'offset',
+ * sixteen bytes per row with an extra gap after the eighth byte.
+ *
+ * A request that does not lie entirely inside the map is rejected with
+ * a message on stderr.  A zero-length request prints nothing.
+ */
+static void
+hexdump_map(afs_uint32 offset, afs_uint32 len)
+{
+    afs_uint32 i;
+    unsigned char * p = (unsigned char *)map;
+    size_t total = (size_t) map_len;
+
+    if (!len)
+	return;
+
+    /*
+     * Compare against the space left after offset rather than testing
+     * offset + len, which can wrap around and slip past the check.
+     */
+    if ((offset > total) || (len > (total - offset))) {
+	fprintf(stderr, "offset + length exceeds memory map size "
+		"(offset %u, length %u, map size %lu)\n",
+		offset, len, (unsigned long) total);
+	return;
+    }
+
+    p += offset;
+    DPFOFF(p);
+
+    /*
+     * A dump that starts mid-row gets its own row label and is padded
+     * so the bytes line up under their columns.  A dump that starts on
+     * a row boundary is labelled by the loop below, which avoids the
+     * duplicate label the unconditional call used to print.
+     */
+    if (offset % 16) {
+	DPFHN(offset);
+	for (i = offset % 16; i > 0; i--) {
+	    DPFHS;
+	}
+	if ((offset % 16) > 8) {
+	    /* the skipped cells include the mid-row gap */
+	    DPFHMS;
+	}
+    }
+
+    for (i=0; i < len; i++, p++, offset++) {
+	if (!(offset % 16)) {
+	    DPFHN(offset);
+	} else if (!(offset % 8)) {
+	    DPFHMS;
+	}
+	DPFHD(*p);
+    }
+    DPFHE;
+}
+
+/*
+ * Make sure hdrs.hdr holds a copy of the fs_state_header found at the
+ * very start of the map.  The copy is made once and reused afterwards.
+ *
+ * Returns 0 on success, 1 if the map is too small to hold the header.
+ */
+static int
+get_hdr(void)
+{
+    if (hdrs.hdr_valid) {
+	return 0;
+    }
+
+    if (map_len < sizeof(struct fs_state_header)) {
+	fprintf(stderr, "corrupt state dump: fs_state_header larger than memory map\n");
+	return 1;
+    }
+
+    hdrs.hdr_p = map;
+    memcpy(&hdrs.hdr, map, sizeof(hdrs.hdr));
+    hdrs.hdr_valid = 1;
+    return 0;
+}
+
+/*
+ * Make sure hdrs.h_hdr holds a copy of the host_state_header, located
+ * through h_offset in the top-level header, and point he_cursor.fh at
+ * the data that follows it.
+ *
+ * Returns 0 on success, 1 if the top-level header cannot be read or
+ * h_offset does not leave room for the header inside the map.
+ */
+static int
+get_h_hdr(void)
+{
+    char * buf;
+    afs_uint32 hi, lo;
+
+    if (hdrs.h_hdr_valid)
+	return 0;
+
+    if (get_hdr())
+	return 1;
+
+    SplitInt64(hdrs.hdr.h_offset, hi, lo);
+
+    if (hi) {
+	fprintf(stderr, "hi offset bits set in h_offset; can't get host_state_header\n");
+	return 1;
+    }
+    if ((lo >= map_len) || 
+	((lo + sizeof(struct host_state_header)) > map_len) ||
+	(lo + sizeof(struct host_state_header) < lo)) {
+	fprintf(stderr, "h_offset puts host_state_header beyond end of memory map\n");
+	return 1;
+    }
+
+    buf = (char *) map;
+    buf += lo;
+    memcpy(&hdrs.h_hdr, buf, sizeof(struct host_state_header));
+    hdrs.h_hdr_p = buf;
+    buf += sizeof(struct host_state_header);
+    he_cursor.fh = (void *)buf;
+
+    /*
+     * Mark the copy as loaded, as the other header loaders do; without
+     * this the early return above never fired and the header was
+     * re-validated and re-copied on every call.
+     */
+    hdrs.h_hdr_valid = 1;
+    return 0;
+}
+
+/*
+ * Make sure hdrs.cb_hdr holds a copy of the callback_state_header,
+ * located through cb_offset in the top-level header, and point
+ * fe_cursor.ffe at the start of the FE records named by fe_offset.
+ *
+ * The header is only marked valid once fe_offset has been checked as
+ * well, so a failed call can never be followed by a "successful" one
+ * that leaves fe_cursor.ffe unset.
+ *
+ * Returns 0 on success, 1 if any offset or record count does not fit
+ * inside the map.
+ */
+static int
+get_cb_hdr(void)
+{
+    char * buf;
+    afs_uint32 hi, lo;
+    size_t avail, fe_rec_len;
+    int fits;
+
+    if (hdrs.cb_hdr_valid)
+	return 0;
+
+    if (get_hdr())
+	return 1;
+
+    SplitInt64(hdrs.hdr.cb_offset, hi, lo);
+
+    if (hi) {
+	fprintf(stderr, "hi offset bits set in cb_offset; can't get callback_state_header\n");
+	return 1;
+    }
+    if ((lo >= map_len) || 
+	((lo + sizeof(struct callback_state_header)) > map_len) ||
+	(lo + sizeof(struct callback_state_header) < lo)) {
+	fprintf(stderr, "cb_offset puts callback_state_header beyond end of memory map\n");
+	return 1;
+    }
+
+    buf = (char *) map;
+    buf += lo;
+    memcpy(&hdrs.cb_hdr, buf, sizeof(struct callback_state_header));
+    hdrs.cb_hdr_p = buf;
+
+    SplitInt64(hdrs.cb_hdr.fe_offset, hi, lo);
+
+    if (hi) {
+	fprintf(stderr, "hi offset bits set in fe_offset; can't get callback_state_entry_header\n");
+	return 1;
+    }
+
+    /*
+     * All FE records (header + body each) and all CB records must fit
+     * between fe_offset and the end of the map.  The counts are
+     * compared by division so that no product or sum can overflow or
+     * be truncated.
+     */
+    fe_rec_len = sizeof(struct callback_state_entry_header) +
+	sizeof(struct FEDiskEntry);
+    avail = 0;
+    fits = (lo <= map_len);
+    if (fits) {
+	avail = (size_t) map_len - lo;
+	fits = (hdrs.cb_hdr.nFEs <= (avail / fe_rec_len));
+    }
+    if (fits) {
+	avail -= hdrs.cb_hdr.nFEs * fe_rec_len;
+	fits = (hdrs.cb_hdr.nCBs <= (avail / sizeof(struct CBDiskEntry)));
+    }
+    if (!fits) {
+	fprintf(stderr, "fe_offset puts callback_state_entry_header beyond end of memory map\n");
+	return 1;
+    }
+
+    buf = (char *) map;
+    buf += lo;
+    fe_cursor.ffe = (void *)buf;
+
+    /* everything checked out; later calls can take the fast path */
+    hdrs.cb_hdr_valid = 1;
+
+    return 0;
+}
+
+/*
+ * Make sure hdrs.timeout_hdr holds a copy of the
+ * callback_state_timeout_header, located through timeout_offset in the
+ * callback header, and point hdrs.timeout_p at the data following it.
+ *
+ * Returns 0 on success, 1 if the callback header cannot be read or the
+ * offset does not leave room for the header inside the map.
+ */
+static int
+get_cb_timeout_hdr(void)
+{
+    afs_uint32 off_hi, off_lo;
+    char * where;
+
+    if (hdrs.timeout_hdr_valid) {
+	return 0;
+    }
+    if (get_cb_hdr() != 0) {
+	return 1;
+    }
+
+    SplitInt64(hdrs.cb_hdr.timeout_offset, off_hi, off_lo);
+
+    if (off_hi != 0) {
+	fprintf(stderr, "hi offset bits set in timeout_offset; can't get callback_state_timeout_header\n");
+	return 1;
+    }
+    if ((off_lo >= map_len) || 
+	((off_lo + sizeof(struct callback_state_timeout_header)) > map_len) ||
+	(off_lo + sizeof(struct callback_state_timeout_header) < off_lo)) {
+	fprintf(stderr, "timeout_offset puts callback_state_timeout_header beyond end of memory map\n");
+	return 1;
+    }
+
+    where = ((char *) map) + off_lo;
+    hdrs.timeout_hdr_p = where;
+    memcpy(&hdrs.timeout_hdr, where, sizeof(struct callback_state_timeout_header));
+    hdrs.timeout_hdr_valid = 1;
+
+    /* the array itself starts right behind its header */
+    hdrs.timeout_p = where + sizeof(struct callback_state_timeout_header);
+
+    return 0;
+}
+
+/*
+ * Make sure hdrs.timeout holds a heap copy of the timeout array that
+ * follows the callback_state_timeout_header.  The copy is made once
+ * and reused afterwards.
+ *
+ * The record count comes from the dump, so it is checked against the
+ * space left in the map before anything is copied.
+ *
+ * Returns 0 on success, 1 if the header cannot be read or the array
+ * would extend past the end of the map.
+ */
+static int
+get_cb_timeout(void)
+{
+    size_t start, avail, nrec;
+
+    if (hdrs.timeout)
+	return 0;
+
+    if (get_cb_timeout_hdr())
+	return 1;
+
+    nrec = hdrs.timeout_hdr.records;
+    start = (size_t) ((char *) hdrs.timeout_p - (char *) map);
+    avail = ((size_t) map_len > start) ? ((size_t) map_len - start) : 0;
+    if (nrec > (avail / sizeof(afs_uint32))) {
+	fprintf(stderr, "corrupt state dump: timeout array extends beyond end of memory map\n");
+	return 1;
+    }
+
+    /* allocate at least one element: calloc(0, ...) may return NULL */
+    hdrs.timeout = (afs_uint32 *) calloc(nrec ? nrec : 1, sizeof(afs_uint32));
+    assert(hdrs.timeout != NULL);
+    memcpy(hdrs.timeout, hdrs.timeout_p, nrec * sizeof(afs_uint32));
+    return 0;
+}
+
+/*
+ * Make sure hdrs.fehash_hdr holds a copy of the
+ * callback_state_fehash_header, located through fehash_offset in the
+ * callback header, and point hdrs.fehash_p at the data following it.
+ *
+ * Returns 0 on success, 1 if the callback header cannot be read or the
+ * offset does not leave room for the header inside the map.
+ */
+static int
+get_cb_fehash_hdr(void)
+{
+    char * buf;
+    afs_uint32 hi, lo;
+
+    if (hdrs.fehash_hdr_valid)
+	return 0;
+
+    if (get_cb_hdr())
+	return 1;
+
+    SplitInt64(hdrs.cb_hdr.fehash_offset, hi, lo);
+
+    if (hi) {
+	fprintf(stderr, "hi offset bits set in fehash_offset; can't get callback_state_fehash_header\n");
+	return 1;
+    }
+    if ((lo >= map_len) || 
+	((lo + sizeof(struct callback_state_fehash_header)) > map_len) ||
+	(lo + sizeof(struct callback_state_fehash_header) < lo)) {
+	/* the offset being checked here is fehash_offset, so name it in the message */
+	fprintf(stderr, "fehash_offset puts callback_state_fehash_header beyond end of memory map\n");
+	return 1;
+    }
+
+    buf = (char *) map;
+    buf += lo;
+    memcpy(&hdrs.fehash_hdr, buf, sizeof(struct callback_state_fehash_header));
+    hdrs.fehash_hdr_p = buf;
+    hdrs.fehash_hdr_valid = 1;
+    /* the bucket array starts right behind its header */
+    buf += sizeof(struct callback_state_fehash_header);
+    hdrs.fehash_p = buf;
+
+    return 0;
+}
+
+/*
+ * Make sure hdrs.fehash holds a heap copy of the FE hash bucket array
+ * that follows the callback_state_fehash_header.  The copy is made
+ * once and reused afterwards.
+ *
+ * The record count comes from the dump, so it is checked against the
+ * space left in the map before anything is copied.
+ *
+ * Returns 0 on success, 1 if the header cannot be read or the array
+ * would extend past the end of the map.
+ */
+static int
+get_cb_fehash(void)
+{
+    size_t start, avail, nrec;
+
+    if (hdrs.fehash)
+	return 0;
+
+    if (get_cb_fehash_hdr())
+	return 1;
+
+    nrec = hdrs.fehash_hdr.records;
+    start = (size_t) ((char *) hdrs.fehash_p - (char *) map);
+    avail = ((size_t) map_len > start) ? ((size_t) map_len - start) : 0;
+    if (nrec > (avail / sizeof(afs_uint32))) {
+	fprintf(stderr, "corrupt state dump: fehash array extends beyond end of memory map\n");
+	return 1;
+    }
+
+    /* allocate at least one element: calloc(0, ...) may return NULL */
+    hdrs.fehash = (afs_uint32 *) calloc(nrec ? nrec : 1, sizeof(afs_uint32));
+    assert(hdrs.fehash != NULL);
+    memcpy(hdrs.fehash, hdrs.fehash_p, nrec * sizeof(afs_uint32));
+    return 0;
+}
+
+/*
+ * Position the host cursor on host entry 'idx' and load its header and
+ * body into he_cursor.
+ *
+ * Each record is located by adding the previous record's hdr.len to
+ * the previous record's address, so reaching entry idx requires the
+ * address of entry idx-1.  Addresses are remembered in he_cache, and
+ * any missing predecessors are resolved first by calling get_he() on
+ * them.
+ *
+ * Returns 0 on success, 1 if the host_state_header cannot be read, idx
+ * is not below its record count, or loading the entry fails.
+ */
+static int
+get_he(afs_uint32 idx)
+{
+    int i;
+    char * p;
+
+    if (get_h_hdr())
+	return 1;
+
+    if (idx >= hdrs.h_hdr.records)
+	return 1;
+
+    /* already positioned on this entry with everything loaded */
+    if (he_cursor.idx == idx && he_cursor.hdr_valid && he_cursor.he_valid)
+	return 0;
+
+    he_cursor.hdr_valid = he_cursor.he_valid = 0;
+
+    /* the address cache is created on first use, one slot per record */
+    if (he_cache.cursor == NULL) {
+	he_cache.cursor = (void **) calloc(hdrs.h_hdr.records, sizeof(void *));
+	assert(he_cache.cursor != NULL);
+    }
+
+    /* fill in every predecessor whose address is not known yet */
+    if (idx && he_cache.cursor[idx-1] == NULL) {
+	for (i = 0; i < idx; i++) {
+	    if (he_cache.cursor[i] == NULL) {
+		get_he(i);
+	    }
+	}
+    }
+
+    if (!idx) {
+	/* entry 0 starts right after the host_state_header */
+	he_cursor.cursor = he_cursor.fh;
+    } else if (he_cursor.cursor == he_cache.cursor[idx-1]) {
+	/* cursor sits on the predecessor: step over it using the header still held in he_cursor.hdr */
+	p = (char *)he_cursor.cursor;
+	p += he_cursor.hdr.len;
+	he_cursor.cursor = (void *)p;
+    } else {
+	/* jump to the cached predecessor, reload its header, then step over it */
+	he_cursor.cursor = he_cache.cursor[idx-1];
+	if (get_he_hdr())
+	    return 1;
+	p = (char *)he_cursor.cursor;
+	p += he_cursor.hdr.len;
+	he_cursor.cursor = (void *)p;
+    }
+
+    he_cursor.idx = idx;
+    he_cache.cursor[idx] = he_cursor.cursor;
+
+    if (get_he_hdr())
+	return 1;
+    if (get_he_entry())
+	return 1;
+
+    return 0;
+}
+
+static int
+get_he_hdr(void)
+{
+    memcpy(&he_cursor.hdr, he_cursor.cursor, sizeof(struct host_state_entry_header));
+    he_cursor.hdr_valid = 1;	/* he_cursor.hdr now mirrors the header at he_cursor.cursor */
+    return 0;	/* NOTE(review): he_cursor.cursor is not NULL-checked before the copy */
+}
+
+/* load the hostDiskEntry at the cursor and locate the data that follows it;
+ * returns 0 on success, 1 if the entry header could not be loaded */
+static int
+get_he_entry(void)
+{
+    char * body;
+    char * tail;
+
+    if (!he_cursor.hdr_valid && get_he_hdr())
+	return 1;
+
+    /* the host record sits immediately after the entry header */
+    body = ((char *) he_cursor.cursor) + sizeof(struct host_state_entry_header);
+    memcpy(&he_cursor.he, body, sizeof(struct hostDiskEntry));
+    he_cursor.he_valid = 1;
+
+    /* ifp marks the end of the host record; hcps lies past any interface data */
+    tail = body + sizeof(struct hostDiskEntry);
+    he_cursor.ifp = (void *)tail;
+    if (he_cursor.hdr.interfaces) {
+	/* presumably struct Interface already embeds one AddrPort -- confirm */
+	tail += sizeof(struct Interface) + ((he_cursor.hdr.interfaces-1)*sizeof(struct AddrPort));
+    }
+    he_cursor.hcps = (void *)tail;
+
+    return 0;
+}
+
+/* position fe_cursor on file entry idx and load its header and body;
+ * entry start addresses are memoized in fe_cache.cursor.
+ * returns 0 on success, 1 on error */
+static int
+get_fe(afs_uint32 idx)
+{
+    afs_uint32 i;
+    char * p;
+
+    cb_cursor.cb_valid = 0;	/* unconditionally drop the cached callback */
+
+    if (get_cb_hdr())
+	return 1;
+    if (idx >= hdrs.cb_hdr.nFEs)
+	return 1;
+
+    if (fe_cursor.idx == idx && fe_cursor.hdr_valid && fe_cursor.fe_valid)
+	return 0;
+    fe_cursor.hdr_valid = fe_cursor.fe_valid = 0;
+
+    if (fe_cache.cursor == NULL) {
+	fe_cache.cursor = (void **) calloc(hdrs.cb_hdr.nFEs, sizeof(void *));
+	assert(fe_cache.cursor != NULL);
+    }
+
+    /* entry idx starts hdr.len bytes past entry idx-1: visit any unvisited predecessors */
+    if (idx && fe_cache.cursor[idx-1] == NULL) {
+	for (i = 0; i < idx; i++) {
+	    if (fe_cache.cursor[i] == NULL && get_fe(i)) {
+		return 1;
+	    }
+	}
+    }
+
+    if (!idx) {
+	fe_cursor.cursor = fe_cursor.ffe;
+    } else if (fe_cursor.cursor == fe_cache.cursor[idx-1]) {
+	p = (char *)fe_cursor.cursor;
+	p += fe_cursor.hdr.len;
+	fe_cursor.cursor = (void *)p;
+    } else {
+	fe_cursor.cursor = fe_cache.cursor[idx-1];
+	if (get_fe_hdr())
+	    return 1;
+	p = (char *)fe_cursor.cursor;
+	p += fe_cursor.hdr.len;
+	fe_cursor.cursor = (void *)p;
+    }
+
+    fe_cursor.idx = idx;
+    fe_cache.cursor[idx] = fe_cursor.cursor;
+    if (get_fe_hdr())
+	return 1;
+    if (get_fe_entry())
+	return 1;
+    return 0;
+}
+
+static int
+get_fe_hdr(void)
+{
+    memcpy(&fe_cursor.hdr, fe_cursor.cursor, sizeof(struct callback_state_entry_header));
+    fe_cursor.hdr_valid = 1;	/* fe_cursor.hdr now mirrors the header at fe_cursor.cursor */
+    return 0;	/* NOTE(review): fe_cursor.cursor is not NULL-checked before the copy */
+}
+
+/* load the FEDiskEntry at the cursor and record where the data after it starts;
+ * returns 0 on success, 1 if the entry header could not be loaded */
+static int
+get_fe_entry(void)
+{
+    char * body;
+
+    if (!fe_cursor.hdr_valid && get_fe_hdr())
+	return 1;
+
+    /* the file entry record sits immediately after the entry header */
+    body = ((char *) fe_cursor.cursor) + sizeof(struct callback_state_entry_header);
+    memcpy(&fe_cursor.fe, body, sizeof(struct FEDiskEntry));
+    fe_cursor.fe_valid = 1;
+
+    /* get_cb() indexes CBDiskEntry records starting from fe_cursor.fcb */
+    body += sizeof(struct FEDiskEntry);
+    fe_cursor.fcb = (void *)body;
+
+    return 0;
+}
+
+/* load callback idx of the current file entry into cb_cursor;
+ * returns 0 on success, 1 on error */
+static int
+get_cb(afs_uint32 idx)
+{
+    char * p;
+
+    if (get_fe(fe_cursor.idx))
+	return 1;
+    if (idx >= fe_cursor.hdr.nCBs)
+	return 1;
+
+    /* NOTE(review): get_fe() clears cb_cursor.cb_valid on every call, so
+     * this shortcut looks unreachable as written -- confirm intent */
+    if (idx == cb_cursor.idx && cb_cursor.cb_valid)
+	return 0;
+    cb_cursor.cb_valid = 0;
+
+    /* callbacks are addressed as fixed-size CBDiskEntry slots from fe_cursor.fcb */
+    p = (char *)fe_cursor.fcb;
+    p += idx * sizeof(struct CBDiskEntry);
+    cb_cursor.cursor = (void *)p;
+    cb_cursor.idx = idx;
+
+    if (get_cb_entry())
+	return 1;
+    return 0;
+}
+
+static int
+get_cb_entry(void)
+{
+    memcpy(&cb_cursor.cb, cb_cursor.cursor, sizeof(struct CBDiskEntry));
+    cb_cursor.cb_valid = 1;	/* cb_cursor.cb now mirrors the record at cb_cursor.cursor */
+    return 0;	/* no failure path here; get_cb() range-checks idx before positioning the cursor */
+}
+
+/* scan the host entries for one whose stored index equals idx and hand it to
+ * dump_this_he(); returns 0 when found, 1 when absent or on a load error */
+static int
+find_he_by_index(afs_uint32 idx)
+{
+    int slot;
+
+    if (get_h_hdr())
+	return 1;
+
+    for (slot = 0; slot < hdrs.h_hdr.records; slot++) {
+	if (get_he(slot)) {
+	    fprintf(stderr, "error getting he %d\n", slot);
+	    return 1;
+	}
+	if (he_cursor.he.index == idx) {
+	    dump_this_he();
+	    return 0;
+	}
+    }
+
+    /* walked every record without a match */
+    return 1;
+}
+
+/* scan the file entries for one whose stored index equals idx and hand it to
+ * dump_this_fe(); returns 0 when found, 1 when absent or on a load error */
+static int
+find_fe_by_index(afs_uint32 idx)
+{
+    int slot;
+
+    if (get_cb_hdr())
+	return 1;
+
+    for (slot = 0; slot < hdrs.cb_hdr.nFEs; slot++) {
+	if (get_fe(slot)) {
+	    fprintf(stderr, "error getting fe %d\n", slot);
+	    return 1;
+	}
+	if (fe_cursor.fe.index == idx) {
+	    dump_this_fe();
+	    return 0;
+	}
+    }
+
+    /* walked every record without a match */
+    return 1;
+}
+
+/* scan the file entries for one matching (volid, vnode, unique) and hand it to
+ * dump_this_fe(); returns 0 when found, 1 when absent or on a load error */
+static int
+find_fe_by_fid(afs_uint32 volid, afs_uint32 vnode, afs_uint32 unique)
+{
+    int slot;
+
+    if (get_cb_hdr())
+	return 1;
+
+    for (slot = 0; slot < hdrs.cb_hdr.nFEs; slot++) {
+	if (get_fe(slot)) {
+	    fprintf(stderr, "error getting fe %d\n", slot);
+	    return 1;
+	}
+	if ((fe_cursor.fe.fe.volid == volid) &&
+	    (fe_cursor.fe.fe.vnode == vnode) &&
+	    (fe_cursor.fe.fe.unique == unique)) {
+	    dump_this_fe();
+	    return 0;
+	}
+    }
+
+    /* walked every record without a match */
+    return 1;
+}
+
+/* find the current file entry's callback whose stored index is idx and dump it */
+static int
+find_cb_by_index(afs_uint32 idx)
+{
+    int i;
+
+    /* get_fe_hdr() copies from fe_cursor.cursor unchecked; load the entry via get_fe() */
+    if (get_fe(fe_cursor.idx))
+	return 1;
+
+    for (i = 0; i < fe_cursor.hdr.nCBs; i++) {
+	if (get_cb(i)) {
+	    fprintf(stderr, "error getting cb %d\n", i);
+	    return 1;
+	}
+	if (cb_cursor.cb.index == idx)
+	    break;
+    }
+    if (i < fe_cursor.hdr.nCBs) {
+	dump_this_cb();
+	return 0;
+    }
+    return 1;
+}
+
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/tvolser/Makefile.in b/src/tvolser/Makefile.in
index 8b8b1a7578..bfeb3a24af 100644
--- a/src/tvolser/Makefile.in
+++ b/src/tvolser/Makefile.in
@@ -10,7 +10,7 @@ include @TOP_OBJDIR@/src/config/Makefile.config
 HELPER_SPLINT=@HELPER_SPLINT@
 
 CC=${MT_CC}
-CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG
+CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG -DFSSYNC_BUILD_CLIENT
 
 CCRULE=${CC} ${CFLAGS} -c $?
 
@@ -36,8 +36,9 @@ UTILOBJS=assert.o uuid.o serverLog.o fileutil.o netutils.o dirpath.o volparse.o
 
 DIROBJS=buffer.o dir.o salvage.o
 
-VOLOBJS= vnode.o volume.o vutil.o partition.o fssync.o purge.o \
-	 clone.o devname.o common.o ihandle.o listinodes.o namei_ops.o nuke.o
+VOLOBJS= vnode.o volume.o vutil.o partition.o fssync-client.o purge.o \
+	 clone.o devname.o common.o ihandle.o listinodes.o \
+	 namei_ops.o nuke.o salvsync-client.o daemon_com.o
 
 FSINTOBJS=# afsaux.o afscbint.cs.o afsint.ss.o afsint.xdr.o
 
@@ -138,7 +139,13 @@ partition.o: ${VOL}/partition.c
 nuke.o: ${VOL}/nuke.c
 	${COMPILE}
 
-fssync.o: ${VOL}/fssync.c
+fssync-client.o: ${VOL}/fssync-client.c
+	${COMPILE}
+
+salvsync-client.o: ${VOL}/salvsync-client.c
+	${COMPILE}
+
+daemon_com.o: ${VOL}/daemon_com.c
 	${COMPILE}
 
 purge.o: ${VOL}/purge.c
diff --git a/src/util/Makefile.in b/src/util/Makefile.in
index 7b8c36e3ea..ccf3446695 100644
--- a/src/util/Makefile.in
+++ b/src/util/Makefile.in
@@ -13,7 +13,7 @@ HELPER_SPLINT=@HELPER_SPLINT@
 objects = assert.o base64.o casestrcpy.o ktime.o volparse.o hostparse.o \
 	 hputil.o kreltime.o isathing.o get_krbrlm.o uuid.o serverLog.o \
 	 dirpath.o fileutil.o netutils.o flipbase64.o fstab.o \
-	 afs_atomlist.o afs_lhash.o snprintf.o strlcat.o strlcpy.o \
+	 afs_atomlist.o afs_lhash.o snprintf.o strlcat.o strlcpy.o strnlen.o \
 	 daemon.o rxkstats.o ${REGEX_OBJ}
 
 includes = \
@@ -134,6 +134,9 @@ strlcat.o: ${srcdir}/strlcat.c ${includes}
 strlcpy.o: ${srcdir}/strlcpy.c ${includes}
 	${CCOBJ} ${CFLAGS} -c ${srcdir}/strlcpy.c
 
+strnlen.o: ${srcdir}/strnlen.c ${includes}
+	${CCOBJ} ${CFLAGS} -c ${srcdir}/strnlen.c
+
 daemon.o: ${srcdir}/daemon.c ${includes}
 	${CCOBJ} ${CFLAGS} -c ${srcdir}/daemon.c
 
diff --git a/src/util/afsutil_prototypes.h b/src/util/afsutil_prototypes.h
index 89f05365e6..2848da3641 100644
--- a/src/util/afsutil_prototypes.h
+++ b/src/util/afsutil_prototypes.h
@@ -173,6 +173,9 @@ extern size_t strlcpy(char *dst, const char *src, size_t siz);
 extern size_t strlcat(char *dst, const char *src, size_t siz);
 #endif
 
+/* strnlen.c */
+extern size_t afs_strnlen(char * buf, size_t len);
+
 
 /* sys.c */
 
@@ -184,6 +187,10 @@ extern void afs_htonuuid(afsUUID * uuidp);
 extern void afs_ntohuuid(afsUUID * uuidp);
 extern afs_int32 afs_uuid_create(afsUUID * uuid);
 extern u_short afs_uuid_hash(afsUUID * uuid);
+#if !defined(KERNEL) && !defined(UKERNEL)
+extern int afsUUID_from_string(const char *str, afsUUID * uuid);
+extern int afsUUID_to_string(const afsUUID * uuid, char *str, size_t strsz);
+#endif
 
 /* volparse.c */
 extern afs_int32 volutil_GetPartitionID(char *aname);
diff --git a/src/util/dirpath.c b/src/util/dirpath.c
index ff856f9523..1e9d78da76 100644
--- a/src/util/dirpath.c
+++ b/src/util/dirpath.c
@@ -292,10 +292,17 @@ initDirPathArray(void)
     pathp = dirPathArray[AFSDIR_SERVER_SLVGLOG_FILEPATH_ID];
     AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_LOGS_DIR, AFSDIR_SLVGLOG_FILE);
 
+    pathp = dirPathArray[AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID];
+    AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_LOGS_DIR, AFSDIR_SALSRVLOG_FILE);
+
     pathp = dirPathArray[AFSDIR_SERVER_SALVAGER_FILEPATH_ID];
     AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_SERVER_BIN_DIR,
 			   AFSDIR_SALVAGER_FILE);
 
+    pathp = dirPathArray[AFSDIR_SERVER_SALSRV_FILEPATH_ID];
+    AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_SERVER_BIN_DIR,
+			   AFSDIR_SALSRV_FILE);
+
     pathp = dirPathArray[AFSDIR_SERVER_SLVGLOCK_FILEPATH_ID];
     AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_LOCAL_DIR, AFSDIR_SLVGLOCK_FILE);
 
@@ -368,6 +375,9 @@ initDirPathArray(void)
     pathp = dirPathArray[AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID];
     AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_SERVER_ETC_DIR, AFSDIR_KRB_EXCL_FILE);
 
+    pathp = dirPathArray[AFSDIR_SERVER_FSSTATE_FILEPATH_ID];
+    AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_LOCAL_DIR, AFSDIR_FSSTATE_FILE);
+
     /* client file paths */
 #ifdef AFS_NT40_ENV
     strcpy(dirPathArray[AFSDIR_CLIENT_THISCELL_FILEPATH_ID],
diff --git a/src/util/dirpath.hin b/src/util/dirpath.hin
index 23590ad4a9..ae1c46a78e 100644
--- a/src/util/dirpath.hin
+++ b/src/util/dirpath.hin
@@ -135,7 +135,9 @@ ConstructLocalLogPath(const char *cpath,
 #define AFSDIR_VLOG_FILE        "VLLog"
 #define AFSDIR_CORE_FILE        "core"
 #define AFSDIR_SLVGLOG_FILE     "SalvageLog"
+#define AFSDIR_SALSRVLOG_FILE   "SalsrvLog"
 #define AFSDIR_SALVAGER_FILE    "salvager"
+#define AFSDIR_SALSRV_FILE      "salvageserver"
 #define AFSDIR_SLVGLOCK_FILE    "salvage.lock"
 #define AFSDIR_BOZCONF_FILE     "BosConfig"
 #define AFSDIR_BOZCONFNEW_FILE  "BosConfig.new"
@@ -155,6 +157,8 @@ ConstructLocalLogPath(const char *cpath,
 #define AFSDIR_FILELOG_FILE     "FileLog"
 #define AFSDIR_MIGRATE_LOGNAME  "wtlog."
 
+#define AFSDIR_FSSTATE_FILE     "fsstate.dat"
+
 #define AFSDIR_CELLSERVDB_FILE_NTCLIENT  "afsdcell.ini"
 
 #define AFSDIR_NETINFO_FILE     "NetInfo"
@@ -194,9 +198,15 @@ AFSDIR_CANONICAL_SERVER_AFS_DIRPATH "/local"
 #define AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH \
 AFSDIR_CANONICAL_SERVER_BIN_DIRPATH "/" AFSDIR_SALVAGER_FILE
 
+#define AFSDIR_CANONICAL_SERVER_SALSRV_FILEPATH \
+AFSDIR_CANONICAL_SERVER_BIN_DIRPATH "/" AFSDIR_SALSRV_FILE
+
 #define AFSDIR_CANONICAL_SERVER_SLVGLOG_FILEPATH \
 AFSDIR_CANONICAL_SERVER_LOGS_DIRPATH "/" AFSDIR_SLVGLOG_FILE
 
+#define AFSDIR_CANONICAL_SERVER_SALSRVLOG_FILEPATH \
+AFSDIR_CANONICAL_SERVER_LOGS_DIRPATH "/" AFSDIR_SALSRVLOG_FILE
+
 
 /* ---------------------  Local path macros ---------------------- */
 
@@ -264,6 +274,9 @@ typedef enum afsdir_id {
       AFSDIR_SERVER_BIN_FILE_DIRPATH_ID,
       AFSDIR_CLIENT_CELLALIAS_FILEPATH_ID,
       AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID,
+      AFSDIR_SERVER_SALSRV_FILEPATH_ID,
+      AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID,
+      AFSDIR_SERVER_FSSTATE_FILEPATH_ID,
       AFSDIR_PATHSTRING_MAX } afsdir_id_t;
 
 /* getDirPath() returns a pointer to a string from an internal array of path strings 
@@ -310,7 +323,9 @@ const char *getDirPath(afsdir_id_t string_id);
 #define AFSDIR_SERVER_VLOG_FILEPATH getDirPath(AFSDIR_SERVER_VLOG_FILEPATH_ID)
 #define AFSDIR_SERVER_CORELOG_FILEPATH getDirPath(AFSDIR_SERVER_CORELOG_FILEPATH_ID)
 #define AFSDIR_SERVER_SLVGLOG_FILEPATH getDirPath(AFSDIR_SERVER_SLVGLOG_FILEPATH_ID)
+#define AFSDIR_SERVER_SALSRVLOG_FILEPATH getDirPath(AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID)
 #define AFSDIR_SERVER_SALVAGER_FILEPATH getDirPath(AFSDIR_SERVER_SALVAGER_FILEPATH_ID)
+#define AFSDIR_SERVER_SALSRV_FILEPATH getDirPath(AFSDIR_SERVER_SALSRV_FILEPATH_ID)
 #define AFSDIR_SERVER_BOZCONF_FILEPATH getDirPath(AFSDIR_SERVER_BOZCONF_FILEPATH_ID)
 #define AFSDIR_SERVER_BOZCONFNEW_FILEPATH getDirPath(AFSDIR_SERVER_BOZCONFNEW_FILEPATH_ID)
 #define AFSDIR_SERVER_BOZINIT_FILEPATH getDirPath(AFSDIR_SERVER_BOZINIT_FILEPATH_ID)
@@ -332,6 +347,7 @@ const char *getDirPath(afsdir_id_t string_id);
 #define AFSDIR_SERVER_THRESHOLD_CONSTANTS_FILEPATH getDirPath(AFSDIR_SERVER_THRESHOLD_CONSTANTS_FILEPATH_ID)
 #define AFSDIR_SERVER_MIGRATELOG_FILEPATH getDirPath(AFSDIR_SERVER_MIGRATELOG_FILEPATH_ID)
 #define AFSDIR_SERVER_KRB_EXCL_FILEPATH getDirPath(AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID)
+#define AFSDIR_SERVER_FSSTATE_FILEPATH getDirPath(AFSDIR_SERVER_FSSTATE_FILEPATH_ID)
 
 /* client file paths */
 #define AFSDIR_CLIENT_THISCELL_FILEPATH getDirPath(AFSDIR_CLIENT_THISCELL_FILEPATH_ID)
diff --git a/src/util/dirpath_nt.h b/src/util/dirpath_nt.h
index b0c62bc392..1d49d8155a 100644
--- a/src/util/dirpath_nt.h
+++ b/src/util/dirpath_nt.h
@@ -126,7 +126,9 @@ extern int
 #define AFSDIR_VLOG_FILE        "VLLog"
 #define AFSDIR_CORE_FILE        "core"
 #define AFSDIR_SLVGLOG_FILE     "SalvageLog"
+#define AFSDIR_SALSRVLOG_FILE   "SalsrvLog"
 #define AFSDIR_SALVAGER_FILE    "salvager"
+#define AFSDIR_SALSRV_FILE      "salvageserver"
 #define AFSDIR_SLVGLOCK_FILE    "salvage.lock"
 #define AFSDIR_BOZCONF_FILE     "BosConfig"
 #define AFSDIR_BOZCONFNEW_FILE  "BosConfig.new"
@@ -146,6 +148,8 @@ extern int
 #define AFSDIR_FILELOG_FILE     "FileLog"
 #define AFSDIR_MIGRATE_LOGNAME  "wtlog."
 
+#define AFSDIR_FSSTATE_FILE     "fsstate.dat"
+
 #ifdef COMMENT
 #define AFSDIR_CELLSERVDB_FILE_NTCLIENT  "afsdcell.ini"
 #else
@@ -189,9 +193,15 @@ AFSDIR_LOCAL_DIR
 #define AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH \
 AFSDIR_CANONICAL_SERVER_BIN_DIRPATH "/" AFSDIR_SALVAGER_FILE
 
+#define AFSDIR_CANONICAL_SERVER_SALSRV_FILEPATH \
+AFSDIR_CANONICAL_SERVER_BIN_DIRPATH "/" AFSDIR_SALSRV_FILE
+
 #define AFSDIR_CANONICAL_SERVER_SLVGLOG_FILEPATH \
 AFSDIR_CANONICAL_SERVER_LOGS_DIRPATH "/" AFSDIR_SLVGLOG_FILE
 
+#define AFSDIR_CANONICAL_SERVER_SALSRVLOG_FILEPATH \
+AFSDIR_CANONICAL_SERVER_LOGS_DIRPATH "/" AFSDIR_SALSRVLOG_FILE
+
 
 /* ---------------------  Local path macros ---------------------- */
 
@@ -259,6 +269,9 @@ typedef enum afsdir_id {
     AFSDIR_SERVER_BIN_FILE_DIRPATH_ID,
     AFSDIR_CLIENT_CELLALIAS_FILEPATH_ID,
     AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID,
+    AFSDIR_SERVER_SALSRV_FILEPATH_ID,
+    AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID,
+    AFSDIR_SERVER_FSSTATE_FILEPATH_ID,
     AFSDIR_PATHSTRING_MAX
 } afsdir_id_t;
 
@@ -306,7 +319,9 @@ const char *getDirPath(afsdir_id_t string_id);
 #define AFSDIR_SERVER_VLOG_FILEPATH getDirPath(AFSDIR_SERVER_VLOG_FILEPATH_ID)
 #define AFSDIR_SERVER_CORELOG_FILEPATH getDirPath(AFSDIR_SERVER_CORELOG_FILEPATH_ID)
 #define AFSDIR_SERVER_SLVGLOG_FILEPATH getDirPath(AFSDIR_SERVER_SLVGLOG_FILEPATH_ID)
+#define AFSDIR_SERVER_SALSRVLOG_FILEPATH getDirPath(AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID)
 #define AFSDIR_SERVER_SALVAGER_FILEPATH getDirPath(AFSDIR_SERVER_SALVAGER_FILEPATH_ID)
+#define AFSDIR_SERVER_SALSRV_FILEPATH getDirPath(AFSDIR_SERVER_SALSRV_FILEPATH_ID)
 #define AFSDIR_SERVER_BOZCONF_FILEPATH getDirPath(AFSDIR_SERVER_BOZCONF_FILEPATH_ID)
 #define AFSDIR_SERVER_BOZCONFNEW_FILEPATH getDirPath(AFSDIR_SERVER_BOZCONFNEW_FILEPATH_ID)
 #define AFSDIR_SERVER_BOZINIT_FILEPATH getDirPath(AFSDIR_SERVER_BOZINIT_FILEPATH_ID)
@@ -328,6 +343,7 @@ const char *getDirPath(afsdir_id_t string_id);
 #define AFSDIR_SERVER_THRESHOLD_CONSTANTS_FILEPATH getDirPath(AFSDIR_SERVER_THRESHOLD_CONSTANTS_FILEPATH_ID)
 #define AFSDIR_SERVER_MIGRATELOG_FILEPATH getDirPath(AFSDIR_SERVER_MIGRATELOG_FILEPATH_ID)
 #define AFSDIR_SERVER_KRB_EXCL_FILEPATH getDirPath(AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID)
+#define AFSDIR_SERVER_FSSTATE_FILEPATH getDirPath(AFSDIR_SERVER_FSSTATE_FILEPATH_ID)
 
 /* client file paths */
 #define AFSDIR_CLIENT_THISCELL_FILEPATH getDirPath(AFSDIR_CLIENT_THISCELL_FILEPATH_ID)
diff --git a/src/util/errors.h b/src/util/errors.h
index aa805d27ca..bc16dd6eb4 100644
--- a/src/util/errors.h
+++ b/src/util/errors.h
@@ -50,6 +50,7 @@
 				 * to THIS server to find out where */
 #define VIO		112	/* Vnode temporarily unaccessible, but not known 
 				 * to be permanently bad. */
+#define VSALVAGING      113     /* Volume is being salvaged (demand attach fs) */
 #define VRESTRICTED     120	/* Volume is restricted from using one or more
 				 * of the given residencies; do a
 				 * vos examine to find out the current
diff --git a/src/util/strnlen.c b/src/util/strnlen.c
new file mode 100644
index 0000000000..6c350df90d
--- /dev/null
+++ b/src/util/strnlen.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/* strnlen.c - fixed length string length */
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <sys/types.h>
+#include <stdarg.h>
+#include <ctype.h>
+
+
+size_t
+afs_strnlen(char * buf, size_t len)
+{
+    size_t n = 0;
+
+    /* count bytes until the first NUL, but never look past buf[len-1] */
+    while (n < len && buf[n] != '\0') {
+	n++;
+    }
+
+    return n;
+}
+
diff --git a/src/viced/Makefile.in b/src/viced/Makefile.in
index 1b7d23f597..6de76052eb 100644
--- a/src/viced/Makefile.in
+++ b/src/viced/Makefile.in
@@ -50,6 +50,7 @@ headers=${TOP_INCDIR}/lwp.h		\
 	${TOP_INCDIR}/afs/afsint.h	\
 	viced.h				\
 	host.h				\
+	callback.h			\
 	fs_stats.h
 
 objects=viced.o		\
diff --git a/src/viced/NTMakefile b/src/viced/NTMakefile
index 125d1ca6aa..0ffb6b7e95 100644
--- a/src/viced/NTMakefile
+++ b/src/viced/NTMakefile
@@ -5,6 +5,8 @@
 # License.  For details, see the LICENSE file in the top-level source
 # directory or online at http://www.openafs.org/dl/license10.html
 
+AFSDEV_AUXCDEFINES = -DFSSYNC_BUILD_SERVER
+
 RELDIR=viced
 !INCLUDE ..\config\NTMakefile.$(SYS_NAME)
 !INCLUDE ..\config\NTMakefile.version
diff --git a/src/viced/afsfileprocs.c b/src/viced/afsfileprocs.c
index 4743a2cb99..429a7de3a2 100644
--- a/src/viced/afsfileprocs.c
+++ b/src/viced/afsfileprocs.c
@@ -112,6 +112,7 @@ RCSID
 #include "viced_prototypes.h"
 #include "viced.h"
 #include "host.h"
+#include "callback.h"
 #include <afs/unified_afs.h>
 #include <afs/audit.h>
 #include <afs/afsutil.h>
@@ -209,7 +210,7 @@ extern afs_int32 readonlyServer;
 /*
  * Externals used by the xstat code.
  */
-extern int VolumeCacheSize, VolumeGets, VolumeReplacements;
+extern VolPkgStats VStats;
 extern int CEs, CEBlocks;
 
 extern int HTs, HTBlocks;
@@ -438,7 +439,7 @@ static afs_int32
 CheckVnode(AFSFid * fid, Volume ** volptr, Vnode ** vptr, int lock)
 {
     int fileCode = 0;
-    int errorCode = -1;
+    afs_int32 local_errorCode, errorCode = -1;
     static struct timeval restartedat = { 0, 0 };
 
     if (fid->Volume == 0 || fid->Vnode == 0)	/* not: || fid->Unique == 0) */
@@ -448,7 +449,7 @@ CheckVnode(AFSFid * fid, Volume ** volptr, Vnode ** vptr, int lock)
 
 	while (1) {
 	    errorCode = 0;
-	    *volptr = VGetVolume(&errorCode, (afs_int32) fid->Volume);
+	    *volptr = VGetVolume(&local_errorCode, &errorCode, (afs_int32) fid->Volume);
 	    if (!errorCode) {
 		assert(*volptr);
 		break;
@@ -525,8 +526,10 @@ CheckVnode(AFSFid * fid, Volume ** volptr, Vnode ** vptr, int lock)
 		    }
 		}
 	    }
-	    /* allow read operations on busy volume */
-	    else if (errorCode == VBUSY && lock == READ_LOCK) {
+	    /* allow read operations on busy volume. 
+	     * must check local_errorCode because demand attach fs
+	     * can have local_errorCode == VSALVAGING, errorCode == VBUSY */
+	    else if (local_errorCode == VBUSY && lock == READ_LOCK) {
 		errorCode = 0;
 		break;
 	    } else if (errorCode)
@@ -1151,6 +1154,8 @@ CopyOnWrite(Vnode * targetptr, Volume * volptr)
 			 wrlen, errno));
 #ifdef FAST_RESTART		/* if running in no-salvage, don't core the server */
 		ViceLog(0, ("CopyOnWrite failed: taking volume offline\n"));
+#elif defined(AFS_DEMAND_ATTACH_FS)
+		ViceLog(0, ("CopyOnWrite failed: requesting salvage\n"));
 #else /* Avoid further corruption and try to get a core. */
 		assert(0);
 #endif
@@ -5564,7 +5569,7 @@ SRXAFS_XStatsVersion(struct rx_call * a_call, afs_int32 * a_versionP)
 static void
 FillPerfValues(struct afs_PerfStats *a_perfP)
 {				/*FillPerfValues */
-
+    afs_uint32 hi, lo;
     int dir_Buffers;		/*# buffers in use by dir package */
     int dir_Calls;		/*# read calls in dir package */
     int dir_IOs;		/*# I/O ops in dir package */
@@ -5582,9 +5587,11 @@ FillPerfValues(struct afs_PerfStats *a_perfP)
     a_perfP->vcache_S_Gets = VnodeClassInfo[vSmall].gets;
     a_perfP->vcache_S_Reads = VnodeClassInfo[vSmall].reads;
     a_perfP->vcache_S_Writes = VnodeClassInfo[vSmall].writes;
-    a_perfP->vcache_H_Entries = VolumeCacheSize;
-    a_perfP->vcache_H_Gets = VolumeGets;
-    a_perfP->vcache_H_Replacements = VolumeReplacements;
+    a_perfP->vcache_H_Entries = VStats.hdr_cache_size;
+    SplitInt64(VStats.hdr_gets, hi, lo);
+    a_perfP->vcache_H_Gets = lo;
+    SplitInt64(VStats.hdr_loads, hi, lo);
+    a_perfP->vcache_H_Replacements = lo;
 
     /*
      * Directory section.
diff --git a/src/viced/callback.c b/src/viced/callback.c
index 8c3040dc96..44b4523576 100644
--- a/src/viced/callback.c
+++ b/src/viced/callback.c
@@ -5,6 +5,8 @@
  * This software has been released under the terms of the IBM Public
  * License.  For details, see the LICENSE file in the top-level source
  * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
  */
 
 /*
@@ -120,94 +122,24 @@ RCSID
 
 #include <afs/ptclient.h>	/* need definition of prlist for host.h */
 #include "host.h"
+#include "callback.h"
+#ifdef AFS_DEMAND_ATTACH_FS
+#include "../tviced/serialize_state.h"
+#endif /* AFS_DEMAND_ATTACH_FS */
+
 
 extern afsUUID FS_HostUUID;
 extern int hostCount;
-int ShowProblems = 1;
-
-/* Maximum number of call backs to break at once, single fid */
-/* There is some debate as to just how large this value should be */
-/* Ideally, it would be very very large, but I am afraid that the */
-/* cache managers will all send in their responses simultaneously, */
-/* thereby swamping the file server.  As a result, something like */
-/* 10 or 15 might be a better bet. */
-#define MAX_CB_HOSTS	10
-
-/* max time to break a callback, otherwise client is dead or net is hosed */
-#define MAXCBT 25
-
-#define u_byte	unsigned char
+static int ShowProblems = 1;
 
 struct cbcounters cbstuff;
 
-struct cbstruct {
-    struct host *hp;
-    afs_uint32 thead;
-};
+static struct FileEntry * FE = NULL;    /* don't use FE[0] */
+static struct CallBack * CB = NULL;     /* don't use CB[0] */
 
-struct FileEntry {
-    afs_uint32 vnode;
-    afs_uint32 unique;
-    afs_uint32 volid;
-    afs_uint32 fnext;
-    afs_uint32 ncbs;
-    afs_uint32 firstcb;
-    afs_uint32 status;
-    afs_uint32 spare;
-} *FE;				/* Don't use FE[0] */
-#define FE_LATER 0x1
+static struct CallBack * CBfree = NULL;
+static struct FileEntry * FEfree = NULL;
 
-struct CallBack {
-    afs_uint32 cnext;		/* Next call back entry */
-    afs_uint32 fhead;		/* Head of this call back chain */
-    u_byte thead;		/* Head of timeout chain */
-    u_byte status;		/* Call back status; see definitions, below */
-    afs_uint32 hhead;		/* Head of host table chain */
-    afs_uint32 tprev, tnext;	/* Timeout chain */
-    afs_uint32 hprev, hnext;	/* Chain from host table */
-    unsigned short spare;	/* make it a multiple of 32 bits. */
-} *CB;				/* Don't use CB[0] */
-
-/* status values for status field of CallBack structure */
-#define CB_NORMAL   1		/* Normal call back */
-#define CB_DELAYED  2		/* Delayed call back due to rpc problems.
-				 * The call back entry will be added back to the
-				 * host list at the END of the list, so that
-				 * searching backwards in the list will find all
-				 * the (consecutive)host. delayed call back entries */
-#define CB_VOLUME   3		/* Callback for a volume */
-#define CB_BULK     4		/* Normal callbacks, handed out from FetchBulkStatus */
-
-/* call back indices to pointers, and vice-versa */
-#define itocb(i)    ((i)?CB+(i):0)
-#define cbtoi(cbp)  (!(cbp)?0:(cbp)-CB)
-
-/* file entry indices to pointers, and vice-versa */
-#define itofe(i)    ((i)?FE+(i):0)
-#define fetoi(fep)  (!(fep)?0:(fep)-FE)
-
-/* Timeouts:  there are 128 possible timeout values in effect at any
- * given time.  Each timeout represents timeouts in an interval of 128
- * seconds.  So the maximum timeout for a call back is 128*128=16384
- * seconds, or 4 1/2 hours.  The timeout cleanup stuff is called only
- * if space runs out or by the file server every 5 minutes.  This 5
- * minute slack should be allowed for--so a maximum time of 4 hours
- * is safer.
- *
- * Timeouts must be chosen to correspond to an exact multiple
- * of 128, because all times are truncated to a 128 multiple, and
- * timed out if the current truncated time is <= to the truncated time
- * corresponding to the timeout queue.
- */
-
-/* Unix time to Call Back time, and vice-versa.  Call back time is
-   in units of 128 seconds, corresponding to time queues. */
-#define CBtime(uxtime)	((uxtime)>>7)
-#define UXtime(cbtime)	((cbtime)<<7)
-
-/* Given a Unix time, compute the closest Unix time that corresponds to
-   a time queue, rounding up */
-#define TimeCeiling(uxtime)	(((uxtime)+127)&~127)
 
 /* Time to live for call backs depends upon number of users of the file.
  * TimeOuts is indexed by this number/8 (using TimeOut macro).  Times
@@ -229,52 +161,17 @@ static int TimeOuts[] = {
 /* minimum time given for a call back */
 static int MinTimeOut = (7 * 60);
 
-#define TimeOutCutoff   ((sizeof(TimeOuts)/sizeof(TimeOuts[0]))*8)
-#define TimeOut(nusers)  ((nusers)>=TimeOutCutoff? MinTimeOut: TimeOuts[(nusers)>>3])
-
-/* time out at server is 3 minutes more than ws */
-#define ServerBias	  (3*60)
-
 /* Heads of CB queues; a timeout index is 1+index into this array */
-static afs_uint32 timeout[128];
-
-/* Convert cbtime to timeout queue index */
-#define TIndex(cbtime)  (((cbtime)&127)+1)
-
-/* Convert cbtime to pointer to timeout queue head */
-#define THead(cbtime)	(&timeout[TIndex(cbtime)-1])
+static afs_uint32 timeout[CB_NUM_TIMEOUT_QUEUES];
 
 static afs_int32 tfirst;	/* cbtime of oldest unexpired call back time queue */
 
-/* Normalize index into timeout array so that two such indices will be
-   ordered correctly, so that they can be compared to see which times
-   sooner, or so that the difference in time out times between them
-   can be computed. */
-#define TNorm(index)   ((index)<TIndex(tfirst)?(index)+128:(index))
-
-/* This converts a timeout index into the actual time it will expire */
-#define TIndexToTime(index) (UXtime(TNorm(index) - TIndex(tfirst) + tfirst))
-
-
-/* Convert pointer to timeout queue head to index, and vice versa */
-#define ttoi(t)		((t-timeout)+1)
-#define itot(i)		((timeout)+(i-1))
 
 /* 16 byte object get/free routines */
 struct object {
     struct object *next;
 };
 
-struct VCBParams {
-    struct cbstruct cba[MAX_CB_HOSTS];	/* re-entrant storage */
-    unsigned int ncbas;
-    afs_uint32 thead;		/* head of timeout queue for youngest callback */
-    struct AFSFid *fid;
-};
-
-struct CallBack *CBfree = 0;
-struct FileEntry *FEfree = 0;
-
 /* Prototypes for static routines */
 static struct FileEntry *FindFE(register AFSFid * fid);
 static struct CallBack *iGetCB(register int *nused);
@@ -308,12 +205,11 @@ static int ClearHostCallbacks_r(struct host *hp, int locked);
 #define FreeCB(cb) iFreeCB((struct CallBack *)cb, &cbstuff.nCBs)
 #define FreeFE(fe) iFreeFE((struct FileEntry *)fe, &cbstuff.nFEs)
 
+
 /* Other protos - move out sometime */
 void PrintCB(register struct CallBack *cb, afs_uint32 now);
 
-#define VHASH 512		/* Power of 2 */
-static afs_uint32 HashTable[VHASH];	/* File entry hash table */
-#define VHash(volume, unique) (((volume)+(unique))&(VHASH-1))
+static afs_uint32 HashTable[FEHASH_SIZE];	/* File entry hash table */
 
 static struct FileEntry *
 FindFE(register AFSFid * fid)
@@ -322,7 +218,7 @@ FindFE(register AFSFid * fid)
     register int fei;
     register struct FileEntry *fe;
 
-    hash = VHash(fid->Volume, fid->Unique);
+    hash = FEHash(fid->Volume, fid->Unique);
     for (fei = HashTable[hash]; fei; fei = fe->fnext) {
 	fe = itofe(fei);
 	if (fe->volid == fid->Volume && fe->unique == fid->Unique
@@ -421,11 +317,11 @@ HAdd(register struct CallBack *cb, register struct host *host)
     if (!host->cblist) {
 	host->cblist = cb->hnext = cb->hprev = cbtoi(cb);
     } else {
-	register struct CallBack *hhp = itocb(host->cblist);
+	register struct CallBack *fcb = itocb(host->cblist);
 
-	cb->hprev = hhp->hprev;
-	cb->hnext = host->cblist;
-	hhp->hprev = (itocb(hhp->hprev)->hnext = cbtoi(cb));
+	cb->hprev = fcb->hprev;
+	cb->hnext = cbtoi(fcb);
+	fcb->hprev = (itocb(fcb->hprev)->hnext = cbtoi(cb));
     }
     return 0;
 }
@@ -475,7 +371,7 @@ CDel(struct CallBack *cb, int deletefe)
 /* N.B.  This one also deletes the CB, and also possibly parent FE, so
  * make sure that it is not on any other list before calling this
  * routine */
-int Ccdelpt = 0, CcdelB = 0;
+static int Ccdelpt = 0, CcdelB = 0;
 
 static int
 CDelPtr(register struct FileEntry *fe, register afs_uint32 * cbp,
@@ -522,7 +418,7 @@ static int
 FDel(register struct FileEntry *fe)
 {
     register int fei = fetoi(fe);
-    register afs_uint32 *p = &HashTable[VHash(fe->volid, fe->unique)];
+    register afs_uint32 *p = &HashTable[FEHash(fe->volid, fe->unique)];
 
     while (*p && *p != fei)
 	p = &itofe(*p)->fnext;
@@ -532,6 +428,7 @@ FDel(register struct FileEntry *fe)
     return 0;
 }
 
+/* initialize the callback package */
 int
 InitCallBack(int nblks)
 {
@@ -539,19 +436,21 @@ InitCallBack(int nblks)
     tfirst = CBtime(FT_ApproxTime());
     /* N.B. The "-1", below, is because
      * FE[0] and CB[0] are not used--and not allocated */
-    FE = ((struct FileEntry *)(calloc(nblks, sizeof(struct FileEntry)))) - 1;
+    FE = ((struct FileEntry *)(calloc(nblks, sizeof(struct FileEntry))));
     if (!FE) {
 	ViceLog(0, ("Failed malloc in InitCallBack\n"));
 	assert(0);
     }
+    FE--;  /* FE[0] is supposed to point to junk */
     cbstuff.nFEs = nblks;
     while (cbstuff.nFEs)
 	FreeFE(&FE[cbstuff.nFEs]);	/* This is correct */
-    CB = ((struct CallBack *)(calloc(nblks, sizeof(struct CallBack)))) - 1;
+    CB = ((struct CallBack *)(calloc(nblks, sizeof(struct CallBack))));
     if (!CB) {
 	ViceLog(0, ("Failed malloc in InitCallBack\n"));
 	assert(0);
     }
+    CB--;  /* CB[0] is supposed to point to junk */
     cbstuff.nCBs = nblks;
     while (cbstuff.nCBs)
 	FreeCB(&CB[cbstuff.nCBs]);	/* This is correct */
@@ -696,7 +595,7 @@ AddCallBack1_r(struct host *host, AFSFid * fid, afs_uint32 * thead, int type,
 	fe->unique = fid->Unique;
 	fe->ncbs = 0;
 	fe->status = 0;
-	hash = VHash(fid->Volume, fid->Unique);
+	hash = FEHash(fid->Volume, fid->Unique);
 	fe->fnext = HashTable[hash];
 	HashTable[hash] = fetoi(fe);
     }
@@ -1302,7 +1201,7 @@ BreakVolumeCallBacks(afs_uint32 volume)
 
     H_LOCK;
     fid.Volume = volume, fid.Vnode = fid.Unique = 0;
-    for (hash = 0; hash < VHASH; hash++) {
+    for (hash = 0; hash < FEHASH_SIZE; hash++) {
 	for (feip = &HashTable[hash]; (fe = itofe(*feip));) {
 	    if (fe->volid == volume) {
 		register struct CallBack *cbnext;
@@ -1360,7 +1259,7 @@ int
 BreakVolumeCallBacksLater(afs_uint32 volume)
 {
     int hash;
-    afs_int32 *feip;
+    afs_uint32 *feip;
     struct FileEntry *fe;
     struct CallBack *cb;
     struct host *host;
@@ -1368,7 +1267,7 @@ BreakVolumeCallBacksLater(afs_uint32 volume)
 
     ViceLog(25, ("Setting later on volume %u\n", volume));
     H_LOCK;
-    for (hash = 0; hash < VHASH; hash++) {
+    for (hash = 0; hash < FEHASH_SIZE; hash++) {
 	for (feip = &HashTable[hash]; (fe = itofe(*feip)) != NULL; ) {
 	    if (fe->volid == volume) {
 		register struct CallBack *cbnext;
@@ -1381,7 +1280,7 @@ BreakVolumeCallBacksLater(afs_uint32 volume)
 		FSYNC_LOCK;
 		fe->status |= FE_LATER;
 		FSYNC_UNLOCK;
-		found++;
+		found = 1;
 	    }
 	    feip = &fe->fnext;
 	}
@@ -1408,7 +1307,7 @@ BreakLaterCallBacks(void)
 {
     struct AFSFid fid;
     int hash;
-    afs_int32 *feip;
+    afs_uint32 *feip;
     struct CallBack *cb;
     struct FileEntry *fe = NULL;
     struct FileEntry *myfe = NULL;
@@ -1424,7 +1323,7 @@ BreakLaterCallBacks(void)
     /* Pick the first volume we see to clean up */
     fid.Volume = fid.Vnode = fid.Unique = 0;
 
-    for (hash = 0; hash < VHASH; hash++) {
+    for (hash = 0; hash < FEHASH_SIZE; hash++) {
 	for (feip = &HashTable[hash]; (fe = itofe(*feip)) != NULL; ) {
 	    if (fe && (fe->status & FE_LATER)
 		&& (fid.Volume == 0 || fid.Volume == fe->volid)) {
@@ -1775,6 +1674,973 @@ PrintCallBackStats(void)
 
 #ifndef INTERPRET_DUMP
 
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * callback state serialization
+ */
+static int cb_stateSaveTimeouts(struct fs_dump_state * state);
+static int cb_stateSaveFEHash(struct fs_dump_state * state);
+static int cb_stateSaveFEs(struct fs_dump_state * state);
+static int cb_stateSaveFE(struct fs_dump_state * state, struct FileEntry * fe);
+static int cb_stateRestoreTimeouts(struct fs_dump_state * state);
+static int cb_stateRestoreFEHash(struct fs_dump_state * state);
+static int cb_stateRestoreFEs(struct fs_dump_state * state);
+static int cb_stateRestoreFE(struct fs_dump_state * state);
+static int cb_stateRestoreCBs(struct fs_dump_state * state, struct FileEntry * fe, 
+			      struct iovec * iov, int niovecs);
+
+static int cb_stateVerifyFEHash(struct fs_dump_state * state);
+static int cb_stateVerifyFE(struct fs_dump_state * state, struct FileEntry * fe);
+static int cb_stateVerifyFCBList(struct fs_dump_state * state, struct FileEntry * fe);
+static int cb_stateVerifyTimeoutQueues(struct fs_dump_state * state);
+
+static int cb_stateFEToDiskEntry(struct FileEntry *, struct FEDiskEntry *);
+static int cb_stateDiskEntryToFE(struct fs_dump_state * state,
+				 struct FEDiskEntry *, struct FileEntry *);
+
+static int cb_stateCBToDiskEntry(struct CallBack *, struct CBDiskEntry *);
+static int cb_stateDiskEntryToCB(struct fs_dump_state * state,
+				 struct CBDiskEntry *, struct CallBack *);
+
+static int cb_stateFillHeader(struct callback_state_header * hdr);
+static int cb_stateCheckHeader(struct callback_state_header * hdr);
+
+static int cb_stateAllocMap(struct fs_dump_state * state);
+
+int
+cb_stateSave(struct fs_dump_state * state)
+{
+    int ret = 0;
+    /* dump all callback state starting at the current EOF offset; returns 0 on success, 1 on any failure */
+    AssignInt64(state->eof_offset, &state->hdr->cb_offset);
+
+    /* invalidate callback state header: a zeroed placeholder is written now, the filled-in header last (below) */
+    memset(state->cb_hdr, 0, sizeof(struct callback_state_header));
+    if (fs_stateWriteHeader(state, &state->hdr->cb_offset, state->cb_hdr,
+			    sizeof(struct callback_state_header))) {
+	ret = 1;
+	goto done;
+    }
+    /* presumably advances the recorded EOF past the header slot so the sections below follow it -- TODO confirm */
+    fs_stateIncEOF(state, sizeof(struct callback_state_header));
+
+    /* dump timeout state */
+    if (cb_stateSaveTimeouts(state)) {
+	ret = 1;
+	goto done;
+    }
+
+    /* dump fe hashtable state */
+    if (cb_stateSaveFEHash(state)) {
+	ret = 1;
+	goto done;
+    }
+
+    /* dump callback state */
+    if (cb_stateSaveFEs(state)) {
+	ret = 1;
+	goto done;
+    }
+
+    /* write the callback state header to disk (magic, version and tfirst are filled in by cb_stateFillHeader) */
+    cb_stateFillHeader(state->cb_hdr);
+    if (fs_stateWriteHeader(state, &state->hdr->cb_offset, state->cb_hdr,
+			    sizeof(struct callback_state_header))) {
+	ret = 1;
+	goto done;
+    }
+    
+ done:
+    return ret;
+}
+
+int
+cb_stateRestore(struct fs_dump_state * state)
+{
+    int ret = 0;
+    /* read the callback section back from the dump; returns 0 on success, 1 on the first failure */
+    if (fs_stateReadHeader(state, &state->hdr->cb_offset, state->cb_hdr,
+			   sizeof(struct callback_state_header))) {
+	ret = 1;
+	goto done;
+    }
+    /* reject a header with bad magic/version, or with more FEs/CBs than cbstuff.nblks */
+    if (cb_stateCheckHeader(state->cb_hdr)) {
+	ret = 1;
+	goto done;
+    }
+    /* allocate the old-index -> new-index maps, sized from the header's fe_max/cb_max */
+    if (cb_stateAllocMap(state)) {
+	ret = 1;
+	goto done;
+    }
+
+    if (cb_stateRestoreTimeouts(state)) {
+	ret = 1;
+	goto done;
+    }
+
+    if (cb_stateRestoreFEHash(state)) {
+	ret = 1;
+	goto done;
+    }
+
+    /* restore FEs and CBs from disk; their link fields still hold saved indices until cb_stateRestoreIndices() runs */
+    if (cb_stateRestoreFEs(state)) {
+	ret = 1;
+	goto done;
+    }
+
+    /* restore tfirst from the value cb_stateFillHeader() stored in the header */
+    tfirst = state->cb_hdr->tfirst;
+
+ done:
+    return ret;
+}
+
+int
+cb_stateRestoreIndices(struct fs_dump_state * state)
+{
+    int i, ret = 0;
+    struct FileEntry * fe;
+    struct CallBack * cb;
+
+    /* restore indices in the FileEntry structures */
+    for (i = 1; i < state->fe_map.len; i++) {
+	if (state->fe_map.entries[i].new_idx) {
+	    fe = itofe(state->fe_map.entries[i].new_idx);
+
+	    /* restore the fe->fnext entry */
+	    if (fe_OldToNew(state, fe->fnext, &fe->fnext)) {
+		ret = 1;
+		goto done;
+	    }
+
+	    /* restore the fe->firstcb entry */
+	    if (cb_OldToNew(state, fe->firstcb, &fe->firstcb)) {
+		ret = 1;
+		goto done;
+	    }
+	}
+    }
+    
+    /* restore indices in the CallBack structures */
+    for (i = 1; i < state->cb_map.len; i++) {
+	if (state->cb_map.entries[i].new_idx) {
+	    cb = itocb(state->cb_map.entries[i].new_idx);
+
+	    /* restore the cb->cnext entry */
+	    if (cb_OldToNew(state, cb->cnext, &cb->cnext)) {
+		ret = 1;
+		goto done;
+	    }
+	    
+	    /* restore the cb->fhead entry */
+	    if (fe_OldToNew(state, cb->fhead, &cb->fhead)) {
+		ret = 1;
+		goto done;
+	    }
+
+	    /* restore the cb->hhead entry */
+	    if (h_OldToNew(state, cb->hhead, &cb->hhead)) {
+		ret = 1;
+		goto done;
+	    }
+
+	    /* restore the cb->tprev entry */
+	    if (cb_OldToNew(state, cb->tprev, &cb->tprev)) {
+		ret = 1;
+		goto done;
+	    }
+
+	    /* restore the cb->tnext entry */
+	    if (cb_OldToNew(state, cb->tnext, &cb->tnext)) {
+		ret = 1;
+		goto done;
+	    }
+
+	    /* restore the cb->hprev entry */
+	    if (cb_OldToNew(state, cb->hprev, &cb->hprev)) {
+		ret = 1;
+		goto done;
+	    }
+
+	    /* restore the cb->hnext entry */
+	    if (cb_OldToNew(state, cb->hnext, &cb->hnext)) {
+		ret = 1;
+		goto done;
+	    }
+	}
+    }
+
+    /* restore the timeout queue head indices */
+    for (i = 0; i < state->cb_timeout_hdr->records; i++) {
+	if (cb_OldToNew(state, timeout[i], &timeout[i])) {
+	    ret = 1;
+	    goto done;
+	}
+    }
+
+    /* restore the FE hash table queue heads */
+    for (i = 0; i < state->cb_fehash_hdr->records; i++) {
+	if (fe_OldToNew(state, HashTable[i], &HashTable[i])) {
+	    ret = 1;
+	    goto done;
+	}
+    }
+
+ done:
+    return ret;
+}
+
+int
+cb_stateVerify(struct fs_dump_state * state)
+{
+    int ret = 0;
+    /* both checks always run (no early exit); returns 1 if either reports a problem, else 0 */
+    if (cb_stateVerifyFEHash(state)) {
+	ret = 1;
+    }
+
+    if (cb_stateVerifyTimeoutQueues(state)) {
+	ret = 1;
+    }
+
+ done:	/* no goto in this function targets this label */
+    return ret;
+}
+
+static int
+cb_stateVerifyFEHash(struct fs_dump_state * state)
+{
+    int ret = 0, i;
+    struct FileEntry * fe;
+    afs_uint32 fei, chain_len;
+
+    for (i = 0; i < FEHASH_SIZE; i++) {
+	chain_len = 0;
+	for (fei = HashTable[i], fe = itofe(fei);
+	     fe;
+	     fei = fe->fnext, fe = itofe(fei)) {
+	    if (fei > cbstuff.nblks) {
+		ViceLog(0, ("cb_stateVerifyFEHash: error: index out of range (fei=%d)\n", fei));
+		ret = 1;
+		break;
+	    }
+	    if (cb_stateVerifyFE(state, fe)) {
+		ret = 1;
+	    }
+	    if (chain_len > FS_STATE_FE_MAX_HASH_CHAIN_LEN) {
+		ViceLog(0, ("cb_stateVerifyFEHash: error: hash chain %d length exceeds %d; assuming there's a loop\n",
+			    i, FS_STATE_FE_MAX_HASH_CHAIN_LEN));
+		ret = 1;
+		break;
+	    }
+	    chain_len++;
+	}
+    }
+
+ done:
+    return ret;
+}
+
+static int
+cb_stateVerifyFE(struct fs_dump_state * state, struct FileEntry * fe)
+{
+    int ret = 0;
+
+    if ((fe->firstcb && !fe->ncbs) ||
+	(!fe->firstcb && fe->ncbs)) {
+	ViceLog(0, ("cb_stateVerifyFE: error: fe->firstcb does not agree with fe->ncbs (fei=%d, fe->firstcb=%d, fe->ncbs=%d)\n",
+		    fetoi(fe), fe->firstcb, fe->ncbs));
+	ret = 1;
+    }
+    if (cb_stateVerifyFCBList(state, fe)) {
+	ViceLog(0, ("cb_stateVerifyFE: error: FCBList failed verification (fei=%d)\n", fetoi(fe)));
+	ret = 1;
+    }
+
+ done:
+    return ret;
+}
+
+static int
+cb_stateVerifyFCBList(struct fs_dump_state * state, struct FileEntry * fe)
+{
+    int ret = 0;
+    afs_uint32 cbi, fei, chain_len = 0;
+    struct CallBack * cb;
+
+    fei = fetoi(fe);
+
+    for (cbi = fe->firstcb, cb = itocb(cbi);
+	 cb;
+	 cbi = cb->cnext, cb = itocb(cbi)) {
+	if (cbi > cbstuff.nblks) {
+	    ViceLog(0, ("cb_stateVerifyFCBList: error: list index out of range (cbi=%d, ncbs=%d)\n",
+			cbi, cbstuff.nblks));
+	    ret = 1;
+	    goto done;
+	}
+	if (cb->fhead != fei) {
+	    ViceLog(0, ("cb_stateVerifyFCBList: error: cb->fhead != fei (fei=%d, cb->fhead=%d)\n",
+			fei, cb->fhead));
+	    ret = 1;
+	}
+	if (chain_len > FS_STATE_FCB_MAX_LIST_LEN) {
+	    ViceLog(0, ("cb_stateVerifyFCBList: error: list length exceeds %d (fei=%d); assuming there's a loop\n",
+			FS_STATE_FCB_MAX_LIST_LEN, fei));
+	    ret = 1;
+	    goto done;
+	}
+	chain_len++;
+    }
+
+    if (fe->ncbs != chain_len) {
+	ViceLog(0, ("cb_stateVerifyFCBList: error: list length mismatch (len=%d, fe->ncbs=%d)\n",
+		    chain_len, fe->ncbs));
+	ret = 1;
+    }
+
+ done:
+    return ret;
+}
+
+int
+cb_stateVerifyHCBList(struct fs_dump_state * state, struct host * host)
+{
+    int ret = 0;
+    afs_uint32 hi, chain_len, cbi;
+    struct CallBack *cb, *ncb;
+
+    hi = h_htoi(host);
+    chain_len = 0;
+    /* walk host's circular CB list: check owner index, non-null in-range links, hnext/hprev symmetry, bounded length; 0 = ok */
+    for (cbi = host->cblist, cb = itocb(cbi);
+	 cb;
+	 cbi = cb->hnext, cb = ncb) {
+	if (chain_len && (host->cblist == cbi)) {
+	    /* we've wrapped around the circular list, and everything looks ok */
+	    break;
+	}
+	if (cb->hhead != hi) {
+	    ViceLog(0, ("cb_stateVerifyHCBList: error: incorrect cb->hhead (cbi=%d, h->index=%d, cb->hhead=%d)\n",
+			cbi, hi, cb->hhead));
+	    ret = 1;
+	}
+	if (!cb->hprev || !cb->hnext) {
+	    ViceLog(0, ("cb_stateVerifyHCBList: error: null index in circular list (cbi=%d, h->index=%d)\n",
+			cbi, hi));
+	    ret = 1;
+	    goto done;
+	}
+	if ((cb->hprev > cbstuff.nblks) ||
+	    (cb->hnext > cbstuff.nblks)) {
+	    ViceLog(0, ("cb_stateVerifyHCBList: error: list index out of range (cbi=%d, h->index=%d, cb->hprev=%d, cb->hnext=%d, nCBs=%d)\n",
+			cbi, hi, cb->hprev, cb->hnext, cbstuff.nblks));
+	    ret = 1;
+	    goto done;
+	}
+	ncb = itocb(cb->hnext);	/* safe to form: hnext was range-checked just above */
+	if (cbi != ncb->hprev) {
+	    ViceLog(0, ("cb_stateVerifyHCBList: error: corrupt linked list (cbi=%d, h->index=%d)\n",
+			cbi, hi));
+	    ret = 1;
+	    goto done;
+	}
+	if (chain_len > FS_STATE_HCB_MAX_LIST_LEN) {
+	    ViceLog(0, ("cb_stateVerifyHCBList: error: list length exceeds %d (h->index=%d); assuming there's a loop\n",
+			FS_STATE_HCB_MAX_LIST_LEN, hi));
+	    ret = 1;
+	    goto done;
+	}
+	chain_len++;
+    }
+
+ done:
+    return ret;
+}
+
+static int
+cb_stateVerifyTimeoutQueues(struct fs_dump_state * state)
+{
+    int ret = 0, i;
+    afs_uint32 cbi, chain_len;
+    struct CallBack *cb, *ncb;
+
+    for (i = 0; i < CB_NUM_TIMEOUT_QUEUES; i++) {
+	chain_len = 0;
+	for (cbi = timeout[i], cb = itocb(cbi);
+	     cb;
+	     cbi = cb->tnext, cb = ncb) {
+	    if (chain_len && (cbi == timeout[i])) {
+		/* we've wrapped around the circular list, and everything looks ok */
+		break;
+	    }
+	    if (cbi > cbstuff.nblks) {
+		ViceLog(0, ("cb_stateVerifyTimeoutQueues: error: list index out of range (cbi=%d, tindex=%d)\n",
+			    cbi, i));
+		ret = 1;
+		break;
+	    }
+	    if (itot(cb->thead) != &timeout[i]) {
+		ViceLog(0, ("cb_stateVerifyTimeoutQueues: error: cb->thead points to wrong timeout queue (tindex=%d, cbi=%d, cb->thead=%d)\n",
+			    i, cbi, cb->thead));
+		ret = 1;
+	    }
+	    if (!cb->tprev || !cb->tnext) {
+		ViceLog(0, ("cb_stateVerifyTimeoutQueues: null index in circular list (cbi=%d, tindex=%d)\n",
+			    cbi, i));
+		ret = 1;
+		break;
+	    }
+	    if ((cb->tprev > cbstuff.nblks) ||
+		(cb->tnext > cbstuff.nblks)) {
+		ViceLog(0, ("cb_stateVerifyTimeoutQueues: list index out of range (cbi=%d, tindex=%d, cb->tprev=%d, cb->tnext=%d, nCBs=%d)\n",
+			    cbi, i, cb->tprev, cb->tnext, cbstuff.nblks));
+		ret = 1;
+		break;
+	    }
+	    ncb = itocb(cb->tnext);
+	    if (cbi != ncb->tprev) {
+		ViceLog(0, ("cb_stateVerifyTimeoutQueues: corrupt linked list (cbi=%d, tindex=%d)\n",
+			    cbi, i));
+		ret = 1;
+		break;
+	    }
+	    if (chain_len > FS_STATE_TCB_MAX_LIST_LEN) {
+		ViceLog(0, ("cb_stateVerifyTimeoutQueues: list length exceeds %d (tindex=%d); assuming there's a loop\n",
+			    FS_STATE_TCB_MAX_LIST_LEN, i));
+		ret = 1;
+		break;
+	    }
+	    chain_len++;
+	}
+    }
+
+ done:
+    return ret;
+}
+
+static int
+cb_stateSaveTimeouts(struct fs_dump_state * state)
+{
+    int ret = 0;
+    struct iovec iov[2];
+    /* write the timeout section (section header + the whole timeout[] array) at the current EOF; 0 = ok, 1 = error */
+    AssignInt64(state->eof_offset, &state->cb_hdr->timeout_offset);
+
+    memset(state->cb_timeout_hdr, 0, sizeof(struct callback_state_timeout_header));
+    state->cb_timeout_hdr->magic = CALLBACK_STATE_TIMEOUT_MAGIC;
+    state->cb_timeout_hdr->records = CB_NUM_TIMEOUT_QUEUES;
+    state->cb_timeout_hdr->len = sizeof(struct callback_state_timeout_header) +
+	(state->cb_timeout_hdr->records * sizeof(afs_uint32));
+
+    iov[0].iov_base = (char *)state->cb_timeout_hdr;
+    iov[0].iov_len = sizeof(struct callback_state_timeout_header);
+    iov[1].iov_base = (char *)timeout;
+    iov[1].iov_len = sizeof(timeout);
+
+    if (fs_stateSeek(state, &state->cb_hdr->timeout_offset)) {
+	ret = 1;
+	goto done;
+    }
+
+    if (fs_stateWriteV(state, iov, 2)) {
+	ret = 1;
+	goto done;
+    }
+
+    fs_stateIncEOF(state, state->cb_timeout_hdr->len);
+
+ done:
+    return ret;
+}
+
+static int
+cb_stateRestoreTimeouts(struct fs_dump_state * state)
+{
+    int ret = 0, len;
+
+    if (fs_stateReadHeader(state, &state->cb_hdr->timeout_offset,
+			   state->cb_timeout_hdr, 
+			   sizeof(struct callback_state_timeout_header))) {
+	ret = 1;
+	goto done;
+    }
+
+    if (state->cb_timeout_hdr->magic != CALLBACK_STATE_TIMEOUT_MAGIC) {
+	ret = 1;
+	goto done;
+    }
+    if (state->cb_timeout_hdr->records != CB_NUM_TIMEOUT_QUEUES) {
+	ret = 1;
+	goto done;
+    }
+
+    len = state->cb_timeout_hdr->records * sizeof(afs_uint32);
+
+    if (state->cb_timeout_hdr->len !=
+	(sizeof(struct callback_state_timeout_header) + len)) {
+	ret = 1;
+	goto done;
+    }
+
+    if (fs_stateRead(state, timeout, len)) {
+	ret = 1;
+	goto done;
+    }
+
+ done:
+    return ret;
+}
+
+static int
+cb_stateSaveFEHash(struct fs_dump_state * state)
+{
+    int ret = 0;
+    struct iovec iov[2];
+
+    AssignInt64(state->eof_offset, &state->cb_hdr->fehash_offset);
+
+    memset(state->cb_fehash_hdr, 0, sizeof(struct callback_state_fehash_header));
+    state->cb_fehash_hdr->magic = CALLBACK_STATE_FEHASH_MAGIC;
+    state->cb_fehash_hdr->records = FEHASH_SIZE;
+    state->cb_fehash_hdr->len = sizeof(struct callback_state_fehash_header) +
+	(state->cb_fehash_hdr->records * sizeof(afs_uint32));
+
+    iov[0].iov_base = (char *)state->cb_fehash_hdr;
+    iov[0].iov_len = sizeof(struct callback_state_fehash_header);
+    iov[1].iov_base = (char *)HashTable;
+    iov[1].iov_len = sizeof(HashTable);
+
+    if (fs_stateSeek(state, &state->cb_hdr->fehash_offset)) {
+	ret = 1;
+	goto done;
+    }
+
+    if (fs_stateWriteV(state, iov, 2)) {
+	ret = 1;
+	goto done;
+    }
+
+    fs_stateIncEOF(state, state->cb_fehash_hdr->len);
+
+ done:
+    return ret;
+}
+
+static int
+cb_stateRestoreFEHash(struct fs_dump_state * state)
+{
+    int ret = 0, len;
+
+    if (fs_stateReadHeader(state, &state->cb_hdr->fehash_offset,
+			   state->cb_fehash_hdr, 
+			   sizeof(struct callback_state_fehash_header))) {
+	ret = 1;
+	goto done;
+    }
+
+    if (state->cb_fehash_hdr->magic != CALLBACK_STATE_FEHASH_MAGIC) {
+	ret = 1;
+	goto done;
+    }
+    if (state->cb_fehash_hdr->records != FEHASH_SIZE) {
+	ret = 1;
+	goto done;
+    }
+
+    len = state->cb_fehash_hdr->records * sizeof(afs_uint32);
+
+    if (state->cb_fehash_hdr->len !=
+	(sizeof(struct callback_state_fehash_header) + len)) {
+	ret = 1;
+	goto done;
+    }
+
+    if (fs_stateRead(state, HashTable, len)) {
+	ret = 1;
+	goto done;
+    }
+
+ done:
+    return ret;
+}
+
+static int
+cb_stateSaveFEs(struct fs_dump_state * state)
+{
+    int ret = 0;
+    register int fei, hash;
+    register struct FileEntry *fe;
+
+    AssignInt64(state->eof_offset, &state->cb_hdr->fe_offset);
+
+    for (hash = 0; hash < FEHASH_SIZE ; hash++) {
+	for (fei = HashTable[hash]; fei; fei = fe->fnext) {
+	    fe = itofe(fei);
+	    if (cb_stateSaveFE(state, fe)) {
+		ret = 1;
+		goto done;
+	    }
+	}
+    }
+
+ done:
+    return ret;
+}
+
+static int
+cb_stateRestoreFEs(struct fs_dump_state * state)
+{
+    int count, nFEs, ret = 0;
+
+    nFEs = state->cb_hdr->nFEs;
+
+    for (count = 0; count < nFEs; count++) {
+	if (cb_stateRestoreFE(state)) {
+	    ret = 1;
+	    goto done;
+	}
+    }
+
+ done:
+    return ret;
+}
+
+static int
+cb_stateSaveFE(struct fs_dump_state * state, struct FileEntry * fe)
+{
+    int ret = 0, iovcnt, cbi, len, written = 0;
+    afs_uint32 fei;
+    struct callback_state_entry_header hdr;
+    struct FEDiskEntry fedsk;
+    struct CBDiskEntry cbdsk[16];
+    struct iovec iov[16];
+    struct CallBack *cb;
+    /* write one FE record: entry header, FEDiskEntry, then one CBDiskEntry per callback on fe's list; 0 = ok, 1 = error */
+    fei = fetoi(fe);
+    if (fei > state->cb_hdr->fe_max) {
+	state->cb_hdr->fe_max = fei;
+    }
+
+    memset(&hdr, 0, sizeof(struct callback_state_entry_header));
+
+    if (cb_stateFEToDiskEntry(fe, &fedsk)) {
+	ret = 1;
+	goto done;
+    }
+
+    iov[0].iov_base = (char *)&hdr;
+    len = iov[0].iov_len = sizeof(hdr);
+    iov[1].iov_base = (char *)&fedsk;
+    len += iov[1].iov_len = sizeof(struct FEDiskEntry);
+    iovcnt = 2;
+    /* fill slot iovcnt (not a running counter) so the 16 iov[]/cbdsk[] slots are reused after every flush */
+    for (cbi = fe->firstcb, cb = itocb(cbi);
+	 cb != NULL;
+	 cbi = cb->cnext, cb = itocb(cbi), hdr.nCBs++) {
+	if (cbi > state->cb_hdr->cb_max) {
+	    state->cb_hdr->cb_max = cbi;
+	}
+	if (cb_stateCBToDiskEntry(cb, &cbdsk[iovcnt])) {
+	    ret = 1;
+	    goto done;
+	}
+	cbdsk[iovcnt].index = cbi;
+	iov[iovcnt].iov_base = (char *)&cbdsk[iovcnt];
+	len += iov[iovcnt].iov_len = sizeof(struct CBDiskEntry);
+	iovcnt++;
+	if ((iovcnt == 16) || (!cb->cnext)) {
+	    if (fs_stateWriteV(state, iov, iovcnt)) {
+		ret = 1;
+		goto done;
+	    }
+	    written = 1;
+	    iovcnt = 0;
+	    len = 0;
+	}
+    }
+
+    hdr.magic = CALLBACK_STATE_ENTRY_MAGIC;
+    hdr.len = sizeof(hdr) + sizeof(struct FEDiskEntry) + 
+	(hdr.nCBs * sizeof(struct CBDiskEntry));
+    /* no flush yet: hdr+FE are still queued in iov[]; otherwise rewrite the completed hdr at eof_offset (not yet advanced) */
+    if (!written) {
+	if (fs_stateWriteV(state, iov, iovcnt)) {
+	    ret = 1;
+	    goto done;
+	}
+    } else {
+	if (fs_stateWriteHeader(state, &state->eof_offset, &hdr, sizeof(hdr))) {
+	    ret = 1;
+	    goto done;
+	}
+    }
+
+    fs_stateIncEOF(state, hdr.len);
+
+    if (written) {
+	if (fs_stateSeek(state, &state->eof_offset)) {
+	    ret = 1;
+	    goto done;
+	}
+    }
+
+    state->cb_hdr->nFEs++;
+    state->cb_hdr->nCBs += hdr.nCBs;
+
+ done:
+    return ret;
+}
+
+static int
+cb_stateRestoreFE(struct fs_dump_state * state)
+{
+    int ret = 0, iovcnt, len, nCBs;
+    struct callback_state_entry_header hdr;
+    struct FEDiskEntry fedsk;
+    struct CBDiskEntry cbdsk[16];
+    struct iovec iov[16];
+    struct FileEntry * fe;
+    struct CallBack * cb;
+    /* read one FE record (entry header + FEDiskEntry + hdr.nCBs CBDiskEntry records); 0 = ok, 1 = error */
+    iov[0].iov_base = (char *)&hdr;
+    len = iov[0].iov_len = sizeof(hdr);
+    iov[1].iov_base = (char *)&fedsk;
+    len += iov[1].iov_len = sizeof(fedsk);
+    iovcnt = 2;
+
+    if (fs_stateReadV(state, iov, iovcnt)) {
+	ret = 1;
+	goto done;
+    }
+
+    if (hdr.magic != CALLBACK_STATE_ENTRY_MAGIC) {
+	ret = 1;
+	goto done;
+    }
+
+    fe = GetFE();
+    if (fe == NULL) {
+	ViceLog(0, ("cb_stateRestoreFE: ran out of free FileEntry structures\n"));
+	ret = 1;
+	goto done;
+    }
+
+    if (cb_stateDiskEntryToFE(state, &fedsk, fe)) {
+	ret = 1;
+	goto done;
+    }
+    /* read CB records in batches of up to 16; slot index is iovcnt so the buffers are reused after each batch */
+    if (hdr.nCBs) {
+	for (iovcnt = 0, len = 0, nCBs = 0;
+	     nCBs < hdr.nCBs;
+	     nCBs++) {
+	    iov[iovcnt].iov_base = (char *)&cbdsk[iovcnt];
+	    len += iov[iovcnt].iov_len = sizeof(struct CBDiskEntry);
+	    iovcnt++;
+	    if ((iovcnt == 16) || (nCBs == hdr.nCBs - 1)) {
+		if (fs_stateReadV(state, iov, iovcnt)) {
+		    ret = 1;
+		    goto done;
+		}
+		if (cb_stateRestoreCBs(state, fe, iov, iovcnt)) {
+		    ret = 1;
+		    goto done;
+		}
+		len = 0;
+		iovcnt = 0;
+	    }
+	}
+    }
+    
+ done:
+    return ret;
+}
+
+static int
+cb_stateRestoreCBs(struct fs_dump_state * state, struct FileEntry * fe, 
+		   struct iovec * iov, int niovecs)
+{
+    int ret = 0, idx;
+    register struct CallBack * cb;
+    struct CBDiskEntry * cbdsk;
+    afs_uint32 fei;
+    /* NOTE(review): fei is computed here but not used anywhere below */
+    fei = fetoi(fe);
+    /* each iovec points at one CBDiskEntry; copy it into a CallBack from GetCB() and record its old->new index */
+    for (idx = 0; idx < niovecs; idx++) {
+	cbdsk = (struct CBDiskEntry *) iov[idx].iov_base;
+	if ((cb = GetCB()) == NULL) {
+	    ViceLog(0, ("cb_stateRestoreCBs: ran out of free CallBack structures\n"));
+	    ret = 1;
+	    goto done;
+	}
+	if (cb_stateDiskEntryToCB(state, cbdsk, cb)) {
+	    ViceLog(0, ("cb_stateRestoreCBs: corrupt CallBack disk entry\n"));
+	    ret = 1;
+	    goto done;
+	}
+    }
+
+ done:
+    return ret;
+}
+
+
+static int
+cb_stateFillHeader(struct callback_state_header * hdr)
+{
+    hdr->stamp.magic = CALLBACK_STATE_MAGIC;
+    hdr->stamp.version = CALLBACK_STATE_VERSION;
+    hdr->tfirst = tfirst;
+    return 0;
+}
+
+static int
+cb_stateCheckHeader(struct callback_state_header * hdr)
+{
+    int ret = 0;
+    /* returns 0 if hdr is usable by this server, 1 otherwise */
+    if (hdr->stamp.magic != CALLBACK_STATE_MAGIC) {
+	ret = 1;	/* wrong magic: rejected without a log message */
+    } else if (hdr->stamp.version != CALLBACK_STATE_VERSION) {
+	ret = 1;	/* wrong version: rejected without a log message */
+    } else if ((hdr->nFEs > cbstuff.nblks) || (hdr->nCBs > cbstuff.nblks)) {
+	ViceLog(0, ("cb_stateCheckHeader: saved callback state larger than callback memory allocation\n"));
+	ret = 1;
+    }
+    return ret;
+}
+
+/* disk entry conversion routines */
+static int
+cb_stateFEToDiskEntry(struct FileEntry * in, struct FEDiskEntry * out)
+{
+    memcpy(&out->fe, in, sizeof(struct FileEntry));
+    out->index = fetoi(in);
+    return 0;
+}
+
+static int
+cb_stateDiskEntryToFE(struct fs_dump_state * state, 
+		      struct FEDiskEntry * in, struct FileEntry * out)
+{
+    int ret = 0;
+    /* copy the saved FileEntry into *out and record in->index -> fetoi(out) in the FE map; 0 = ok, 1 = bad index */
+    memcpy(out, &in->fe, sizeof(struct FileEntry));
+
+    /* setup FE map entry; index 0 and indices >= fe_map.len are rejected */
+    if (!in->index || (in->index >= state->fe_map.len)) {
+	ViceLog(0, ("cb_stateDiskEntryToFE: index (%d) out of range\n",
+		    in->index));
+	ret = 1;
+	goto done;
+    }
+    state->fe_map.entries[in->index].old_idx = in->index;
+    state->fe_map.entries[in->index].new_idx = fetoi(out);
+
+ done:
+    return ret;
+}
+
+static int
+cb_stateCBToDiskEntry(struct CallBack * in, struct CBDiskEntry * out)
+{
+    memcpy(&out->cb, in, sizeof(struct CallBack));
+    out->index = cbtoi(in);
+    return 0;
+}
+
+static int
+cb_stateDiskEntryToCB(struct fs_dump_state * state,
+		      struct CBDiskEntry * in, struct CallBack * out)
+{
+    int ret = 0;
+
+    memcpy(out, &in->cb, sizeof(struct CallBack));
+
+    /* setup CB map entry */
+    if (!in->index || (in->index >= state->cb_map.len)) {
+	ViceLog(0, ("cb_stateDiskEntryToCB: index (%d) out of range\n",
+		    in->index));
+	ret = 1;
+	goto done;
+    }
+    state->cb_map.entries[in->index].old_idx = in->index;
+    state->cb_map.entries[in->index].new_idx = cbtoi(out);
+
+ done:
+    return ret;
+}
+
+/* index map routines */
+static int
+cb_stateAllocMap(struct fs_dump_state * state)
+{
+    state->fe_map.len = state->cb_hdr->fe_max + 1;	/* +1: lookups index the map by old index directly, and index 0 is never a valid entry */
+    state->cb_map.len = state->cb_hdr->cb_max + 1;
+    state->fe_map.entries = (struct idx_map_entry_t *)
+	calloc(state->fe_map.len, sizeof(struct idx_map_entry_t));
+    state->cb_map.entries = (struct idx_map_entry_t *)
+	calloc(state->cb_map.len, sizeof(struct idx_map_entry_t));
+    return ((state->fe_map.entries != NULL) && (state->cb_map.entries != NULL)) ? 0 : 1;	/* 0 only if both allocations succeeded; NOTE(review): nothing is freed here on partial failure -- confirm the caller releases the maps */
+}
+
+int
+fe_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new)
+{
+    int ret = 0;
+
+    /* FEs use a one-based indexing system, so old==0 implies no mapping */
+    if (!old) {
+	*new = 0;
+	goto done;
+    }
+
+    if (old >= state->fe_map.len) {
+	ViceLog(0, ("fe_OldToNew: index %d is out of range\n", old));
+	ret = 1;
+    } else if (state->fe_map.entries[old].old_idx != old) { /* sanity check */
+	ViceLog(0, ("fe_OldToNew: index %d points to an invalid FileEntry record\n", old));
+	ret = 1;
+    } else {
+	*new = state->fe_map.entries[old].new_idx;
+    }
+
+ done:
+    return ret;
+}
+
+int
+cb_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new)
+{
+    int ret = 0;
+
+    /* CBs use a one-based indexing system, so old==0 implies no mapping */
+    if (!old) {
+	*new = 0;
+	goto done;
+    }
+
+    if (old >= state->cb_map.len) {
+	ViceLog(0, ("cb_OldToNew: index %d is out of range\n", old));
+	ret = 1;
+    } else if (state->cb_map.entries[old].old_idx != old) { /* sanity check */
+	ViceLog(0, ("cb_OldToNew: index %d points to an invalid CallBack record\n", old));
+	ret = 1;
+    } else {
+	*new = state->cb_map.entries[old].new_idx;
+    }
+
+ done:
+    return ret;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
 int
 DumpCallBackState(void)
 {
@@ -1807,7 +2673,7 @@ DumpCallBackState(void)
     return 0;
 }
 
-#endif
+#endif /* !INTERPRET_DUMP */
 
 #ifdef INTERPRET_DUMP
 
@@ -1931,7 +2797,7 @@ main(int argc, char **argv)
 	struct CallBack *cb;
 	struct FileEntry *fe;
 
-	for (hash = 0; hash < VHASH; hash++) {
+	for (hash = 0; hash < FEHASH_SIZE; hash++) {
 	    for (feip = &HashTable[hash]; fe = itofe(*feip);) {
 		if (!vol || (fe->volid == vol)) {
 		    register struct CallBack *cbnext;
@@ -2201,6 +3067,15 @@ MultiProbeAlternateAddress_r(struct host *host)
                 H_UNLOCK;
             }
         }
+#ifdef AFS_DEMAND_ATTACH_FS
+	/* try to bail ASAP if the fileserver is shutting down */
+	FS_STATE_RDLOCK;
+	if (fs_state.mode == FS_MODE_SHUTDOWN) {
+	    FS_STATE_UNLOCK;
+	    multi_Abort;
+	}
+	FS_STATE_UNLOCK;
+#endif
     }
     multi_End_Ignore;
     H_LOCK;
diff --git a/src/viced/callback.h b/src/viced/callback.h
new file mode 100644
index 0000000000..2f4cca8036
--- /dev/null
+++ b/src/viced/callback.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
+ */
+
+#ifndef _AFS_VICED_CALLBACK_H
+#define _AFS_VICED_CALLBACK_H
+
+/* Maximum number of call backs to break at once, single fid
+ * There is some debate as to just how large this value should be
+ * Ideally, it would be very very large, but I am afraid that the
+ * cache managers will all send in their responses simultaneously,
+ * thereby swamping the file server.  As a result, something like
+ * 10 or 15 might be a better bet.
+ */
+#define MAX_CB_HOSTS	10
+
+/* max time to break a callback, otherwise client is dead or net is hosed */
+#define MAXCBT 25
+
+#define u_byte	unsigned char
+
+struct cbcounters {
+    afs_int32 DeleteFiles;
+    afs_int32 DeleteCallBacks;
+    afs_int32 BreakCallBacks;
+    afs_int32 AddCallBacks;
+    afs_int32 GotSomeSpaces;
+    afs_int32 DeleteAllCallBacks;
+    afs_int32 nFEs, nCBs, nblks;
+    afs_int32 CBsTimedOut;
+    afs_int32 nbreakers;
+    afs_int32 GSS1, GSS2, GSS3, GSS4, GSS5;
+};
+extern struct cbcounters cbstuff;
+
+struct cbstruct {
+    struct host *hp;
+    afs_uint32 thead;
+};
+
+/* structure MUST be multiple of 8 bytes, otherwise the casts to
+ * struct object will have alignment issues on *P64 userspaces */
+struct FileEntry {
+    afs_uint32 vnode;
+    afs_uint32 unique;
+    afs_uint32 volid;
+    afs_uint32 fnext;           /* index of next FE in hash chain */
+    afs_uint32 ncbs;            /* number of callbacks for this FE */
+    afs_uint32 firstcb;         /* index of first cb in per-FE list */
+    afs_uint32 status;          /* status bits for this FE */
+    afs_uint32 spare;
+};
+#define FE_LATER 0x1
+
+/* structure MUST be multiple of 8 bytes, otherwise the casts to
+ * struct object will have alignment issues on *P64 userspaces */
+struct CallBack {
+    afs_uint32 cnext;		/* index of next cb in per-FE list */
+    afs_uint32 fhead;		/* index of associated FE */
+    u_byte thead;		/* Head of timeout chain */
+    u_byte status;		/* Call back status; see definitions, below */
+    unsigned short spare;	/* ensure proper alignment */
+    afs_uint32 hhead;		/* Head of host table chain */
+    afs_uint32 tprev, tnext;	/* per-timeout circular list of callbacks */
+    afs_uint32 hprev, hnext;	/* per-host circular list of callbacks */
+};
+
+struct VCBParams {
+    struct cbstruct cba[MAX_CB_HOSTS];	/* re-entrant storage */
+    unsigned int ncbas;
+    afs_uint32 thead;		/* head of timeout queue for youngest callback */
+    struct AFSFid *fid;
+};
+
+
+/* callback hash macros */
+#define FEHASH_SIZE 512		/* Power of 2 */
+#define FEHASH_MASK (FEHASH_SIZE-1)
+#define FEHash(volume, unique) (((volume)+(unique))&(FEHASH_MASK))
+
+#define CB_NUM_TIMEOUT_QUEUES 128
+
+
+/* status values for status field of CallBack structure */
+#define CB_NORMAL   1		/* Normal call back */
+#define CB_DELAYED  2		/* Delayed call back due to rpc problems.
+				 * The call back entry will be added back to the
+				 * host list at the END of the list, so that
+				 * searching backwards in the list will find all
+				 * the (consecutive)host. delayed call back entries */
+#define CB_VOLUME   3		/* Callback for a volume */
+#define CB_BULK     4		/* Normal callbacks, handed out from FetchBulkStatus */
+
+/* call back indices to pointers, and vice-versa */
+#define itocb(i)    ((i)?CB+(i):0)
+#define cbtoi(cbp)  (!(cbp)?0:(cbp)-CB)
+
+/* file entry indices to pointers, and vice-versa */
+#define itofe(i)    ((i)?FE+(i):0)
+#define fetoi(fep)  (!(fep)?0:(fep)-FE)
+
+/* Timeouts:  there are 128 possible timeout values in effect at any
+ * given time.  Each timeout represents timeouts in an interval of 128
+ * seconds.  So the maximum timeout for a call back is 128*128=16384
+ * seconds, or 4 1/2 hours.  The timeout cleanup stuff is called only
+ * if space runs out or by the file server every 5 minutes.  This 5
+ * minute slack should be allowed for--so a maximum time of 4 hours
+ * is safer.
+ *
+ * Timeouts must be chosen to correspond to an exact multiple
+ * of 128, because all times are truncated to a 128 multiple, and
+ * timed out if the current truncated time is <= to the truncated time
+ * corresponding to the timeout queue.
+ */
+
+/* Unix time to Call Back time, and vice-versa.  Call back time is
+   in units of 128 seconds, corresponding to time queues. */
+#define CBtime(uxtime)	((uxtime)>>7)
+#define UXtime(cbtime)	((cbtime)<<7)
+
+/* Given a Unix time, compute the closest Unix time that corresponds to
+   a time queue, rounding up */
+#define TimeCeiling(uxtime)	(((uxtime)+127)&~127)
+
+#define TimeOutCutoff   ((sizeof(TimeOuts)/sizeof(TimeOuts[0]))*8)
+#define TimeOut(nusers)  ((nusers)>=TimeOutCutoff? MinTimeOut: TimeOuts[(nusers)>>3])
+
+/* time out at server is 3 minutes more than ws */
+#define ServerBias	  (3*60)
+
+/* Convert cbtime to timeout queue index */
+#define TIndex(cbtime)  (((cbtime)&127)+1)
+
+/* Convert cbtime to pointer to timeout queue head */
+#define THead(cbtime)	(&timeout[TIndex(cbtime)-1])
+
+/* Normalize index into timeout array so that two such indices will be
+   ordered correctly, so that they can be compared to see which times
+   sooner, or so that the difference in time out times between them
+   can be computed. */
+#define TNorm(index)   ((index)<TIndex(tfirst)?(index)+128:(index))
+
+/* This converts a timeout index into the actual time it will expire */
+#define TIndexToTime(index) (UXtime(TNorm(index) - TIndex(tfirst) + tfirst))
+
+
+/* Convert pointer to timeout queue head to index, and vice versa (indices are one-based; arguments are parenthesized so expressions expand safely) */
+#define ttoi(t)		(((t)-timeout)+1)
+#define itot(i)		((timeout)+((i)-1))
+
+#endif /* _AFS_VICED_CALLBACK_H */
diff --git a/src/viced/host.c b/src/viced/host.c
index 092a18da1b..5f2f940dec 100644
--- a/src/viced/host.c
+++ b/src/viced/host.c
@@ -5,6 +5,8 @@
  * This software has been released under the terms of the IBM Public
  * License.  For details, see the LICENSE file in the top-level source
  * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
  */
 
 #include <afsconfig.h>
@@ -59,7 +61,11 @@ RCSID
 #include "viced_prototypes.h"
 #include "viced.h"
 #include "host.h"
-
+#include "callback.h"
+#ifdef AFS_DEMAND_ATTACH_FS
+#include "../util/afsutil_prototypes.h"
+#include "../tviced/serialize_state.h"
+#endif /* AFS_DEMAND_ATTACH_FS */
 
 #ifdef AFS_PTHREAD_ENV
 pthread_mutex_t host_glock_mutex;
@@ -83,6 +89,13 @@ int hostCount = 0;		/* number of hosts in hostList */
 int rxcon_ident_key;
 int rxcon_client_key;
 
+static struct rx_securityClass *sc = NULL;
+
+static void h_SetupCallbackConn_r(struct host * host);
+static void h_AddHostToHashTable_r(afs_uint32 addr, afs_uint16 port, struct host * host);
+static void h_AddHostToUuidHashTable_r(afsUUID * uuid, struct host * host);
+static int h_DeleteHostFromHashTableByAddr_r(afs_uint32 addr, afs_uint16 port, struct host *host);
+
 #define CESPERBLOCK 73
 struct CEBlock {		/* block of CESPERBLOCK file entries */
     struct client entry[CESPERBLOCK];
@@ -232,9 +245,9 @@ GetHT()
 {
     register struct host *entry;
 
-    if (HTFree == 0)
+    if (HTFree == NULL)
 	GetHTBlock();
-    assert(HTFree != 0);
+    assert(HTFree != NULL);
     entry = HTFree;
     HTFree = entry->next;
     HTs++;
@@ -448,7 +461,7 @@ h_gethostcps_r(register struct host *host, register afs_int32 now)
 	free(host->hcps.prlist_val);	/* this is for hostaclRefresh */
     host->hcps.prlist_val = NULL;
     host->hcps.prlist_len = 0;
-    slept ? (host->cpsCall = FT_ApproxTime()) : (host->cpsCall = now);
+    host->cpsCall = slept ? (FT_ApproxTime()) : (now);
 
     H_UNLOCK;
     code = pr_GetHostCPS(ntohl(host->host), &host->hcps);
@@ -533,7 +546,6 @@ h_Alloc_r(register struct rx_connection *r_con)
 {
     struct servent *serverentry;
     struct host *host;
-    static struct rx_securityClass *sc = 0;
     afs_int32 now;
 #if FS_STATS_DETAILED
     afs_uint32 newHostAddr_HBO;	/*New host IP addr, in host byte order */
@@ -544,7 +556,7 @@ h_Alloc_r(register struct rx_connection *r_con)
     host->host = rxr_HostOf(r_con);
     host->port = rxr_PortOf(r_con);
 
-    hashInsert_r(host->host, host->port, host);
+    h_AddHostToHashTable_r(host->host, host->port, host);
 
     if (consolePort == 0) {	/* find the portal number for console */
 #if	defined(AFS_OSF_ENV)
@@ -561,24 +573,17 @@ h_Alloc_r(register struct rx_connection *r_con)
 	host->Console = 1;
     /* Make a callback channel even for the console, on the off chance that it
      * makes a request that causes a break call back.  It shouldn't. */
-    {
-	if (!sc)
-	    sc = rxnull_NewClientSecurityObject();
-	host->callback_rxcon =
-	    rx_NewConnection(host->host, host->port, 1, sc, 0);
-	rx_SetConnDeadTime(host->callback_rxcon, 50);
-	rx_SetConnHardDeadTime(host->callback_rxcon, AFS_HARDDEADTIME);
-    }
+    h_SetupCallbackConn_r(host);
     now = host->LastCall = host->cpsCall = host->ActiveCall = FT_ApproxTime();
     host->hostFlags = 0;
     host->hcps.prlist_val = NULL;
     host->hcps.prlist_len = 0;
-    host->interface = 0;
+    host->interface = NULL;
 #ifdef undef
     host->hcpsfailed = 0;	/* save cycles */
     h_gethostcps(host);		/* do this under host hold/lock */
 #endif
-    host->FirstClient = 0;
+    host->FirstClient = NULL;
     h_Hold_r(host);
     h_Lock_r(host);
     h_InsertList_r(host);	/* update global host List */
@@ -596,6 +601,20 @@ h_Alloc_r(register struct rx_connection *r_con)
 }				/*h_Alloc_r */
 
 
+
+/* Create host->callback_rxcon, the rx connection back to host->host/host->port.
+ * One is made even for the console, which should never need a callback break. */
+static void
+h_SetupCallbackConn_r(struct host * host)
+{
+    if (!sc)			/* file-static security object, created on first use */
+	sc = rxnull_NewClientSecurityObject();
+    host->callback_rxcon =
+	rx_NewConnection(host->host, host->port, 1, sc, 0);
+    rx_SetConnDeadTime(host->callback_rxcon, 50);
+    rx_SetConnHardDeadTime(host->callback_rxcon, AFS_HARDDEADTIME);
+}
+
 /* Lookup a host given an IP address and UDP port number. */
 /* hostaddr and hport are in network order */
 /* Note: host should be released by caller if 0 == *heldp and non-null */
@@ -833,7 +852,7 @@ h_FreeConnection(struct rx_connection *tcon)
     if (client) {
 	H_LOCK;
 	if (client->tcon == tcon)
-	    client->tcon = (struct rx_connection *)0;
+	    client->tcon = NULL;
 	H_UNLOCK;
     }
     return 0;
@@ -878,8 +897,11 @@ h_Enumerate(int (*proc) (), char *param)
     H_UNLOCK;
     for (i = 0; i < count; i++) {
 	held[i] = (*proc) (list[i], held[i], param);
-	if (!held[i])
+	if (!H_ENUMERATE_ISSET_HELD(held[i]))
 	    h_Release(list[i]);	/* this might free up the host */
+	/* bail out of the enumeration early */
+	if (H_ENUMERATE_ISSET_BAIL(held[i]))
+	    break;
     }
     free((void *)list);
     free((void *)held);
@@ -908,17 +930,19 @@ h_Enumerate_r(int (*proc) (), struct host *enumstart, char *param)
 	h_Hold_r(enumstart); 
     for (host = enumstart; host; host = next, held = nheld) {
 	next = host->next;
-	if (next && !(nheld = h_Held_r(next)))
+	if (next && !(nheld = h_Held_r(next)) && !H_ENUMERATE_ISSET_BAIL(held))
 	    h_Hold_r(next);
 	held = (*proc) (host, held, param);
-	if (!held)
+	if (!H_ENUMERATE_ISSET_HELD(held))
 	    h_Release_r(host); /* this might free up the host */
+	if (H_ENUMERATE_ISSET_BAIL(held))
+	    break;
     }
 }				/*h_Enumerate_r */
 
 /* inserts a new HashChain structure corresponding to this UUID */
-void
-hashInsertUuid_r(struct afsUUID *uuid, struct host *host)
+static void
+h_AddHostToUuidHashTable_r(struct afsUUID *uuid, struct host *host)
 {
     int index;
     struct h_hashChain *chain;
@@ -929,7 +953,7 @@ hashInsertUuid_r(struct afsUUID *uuid, struct host *host)
     /* insert into beginning of list for this bucket */
     chain = (struct h_hashChain *)malloc(sizeof(struct h_hashChain));
     if (!chain) {
-	ViceLog(0, ("Failed malloc in hashInsertUuid_r\n"));
+	ViceLog(0, ("Failed malloc in h_AddHostToUuidHashTable_r\n"));
 	assert(0);
     }
     assert(chain);
@@ -940,8 +964,8 @@ hashInsertUuid_r(struct afsUUID *uuid, struct host *host)
 
 
 /* inserts a new HashChain structure corresponding to this address */
-void
-hashInsert_r(afs_uint32 addr, afs_uint16 port, struct host *host)
+static void
+h_AddHostToHashTable_r(afs_uint32 addr, afs_uint16 port, struct host *host)
 {
     int index;
     struct h_hashChain *chain;
@@ -952,7 +976,7 @@ hashInsert_r(afs_uint32 addr, afs_uint16 port, struct host *host)
     /* insert into beginning of list for this bucket */
     chain = (struct h_hashChain *)malloc(sizeof(struct h_hashChain));
     if (!chain) {
-	ViceLog(0, ("Failed malloc in hashInsert_r\n"));
+	ViceLog(0, ("Failed malloc in h_AddHostToHashTable_r\n"));
 	assert(0);
     }
     chain->hostPtr = host;
@@ -1017,7 +1041,7 @@ addInterfaceAddr_r(struct host *host, afs_uint32 addr, afs_uint16 port)
     /*
      * Create a hash table entry for this address
      */
-    hashInsert_r(addr, port, host);
+    h_AddHostToHashTable_r(addr, port, host);
 
     return 0;
 }
@@ -1072,7 +1096,7 @@ removeInterfaceAddr_r(struct host *host, afs_uint32 addr, afs_uint16 port)
     /*
      * Remove the hash table entry for this address
      */
-    hashDelete_r(addr, port, host);
+    h_DeleteHostFromHashTableByAddr_r(addr, port, host);
 
     return 0;
 }
@@ -1394,7 +1418,7 @@ h_GetHost_r(struct rx_connection *tcon)
 		    /* the new host is held and locked */
 		} else {
 		    /* This really is a new host */
-		    hashInsertUuid_r(&identP->uuid, host);
+		    h_AddHostToUuidHashTable_r(&identP->uuid, host);
 		    cb_conn = host->callback_rxcon;
 		    rx_GetConnection(cb_conn);		
 		    H_UNLOCK;
@@ -1735,7 +1759,7 @@ h_FindClient_r(struct rx_connection *tcon)
 	    client->authClass = authClass;	/* rx only */
 	    client->sid = rxr_CidOf(tcon);
 	    client->VenusEpoch = rxr_GetEpoch(tcon);
-	    client->CPS.prlist_val = 0;
+	    client->CPS.prlist_val = NULL;
 	    client->CPS.prlist_len = 0;
 	    h_Unlock_r(host);
 	}
@@ -2134,6 +2158,540 @@ h_DumpHosts()
 
 }				/*h_DumpHosts */
 
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * host state serialization
+ */
+static int h_stateFillHeader(struct host_state_header * hdr);
+static int h_stateCheckHeader(struct host_state_header * hdr);
+static int h_stateAllocMap(struct fs_dump_state * state);
+static int h_stateSaveHost(register struct host * host, int held, struct fs_dump_state * state);
+static int h_stateRestoreHost(struct fs_dump_state * state);
+static int h_stateRestoreIndex(struct host * h, int held, struct fs_dump_state * state);
+static int h_stateVerifyHost(struct host * h, int held, struct fs_dump_state * state);
+static int h_stateVerifyAddrHash(struct fs_dump_state * state, struct host * h, afs_uint32 addr, afs_uint16 port);
+static int h_stateVerifyUuidHash(struct fs_dump_state * state, struct host * h);
+static void h_hostToDiskEntry_r(struct host * in, struct hostDiskEntry * out);
+static void h_diskEntryToHost_r(struct hostDiskEntry * in, struct host * out);
+
+
+/* this procedure saves all host state to disk for fast startup; returns state->bail */
+int
+h_stateSave(struct fs_dump_state * state)
+{
+    AssignInt64(state->eof_offset, &state->hdr->h_offset);	/* presumably: host section starts at current EOF */
+
+    /* XXX debug */
+    ViceLog(0, ("h_stateSave:  hostCount=%d\n", hostCount));
+
+    /* invalidate host state header: the zeroed placeholder carries no magic/version stamp yet */
+    memset(state->h_hdr, 0, sizeof(struct host_state_header));
+
+    if (fs_stateWriteHeader(state, &state->hdr->h_offset, state->h_hdr,
+			    sizeof(struct host_state_header))) {
+	state->bail = 1;
+	goto done;
+    }
+
+    fs_stateIncEOF(state, sizeof(struct host_state_header));
+
+    h_Enumerate_r(h_stateSaveHost, hostList, (char *)state);	/* one record per host; sets state->bail on error */
+    if (state->bail) {
+	goto done;
+    }
+
+    h_stateFillHeader(state->h_hdr);	/* stamp magic and version now that the records are written */
+
+    /* write the real header to disk */
+    state->bail = fs_stateWriteHeader(state, &state->hdr->h_offset, state->h_hdr,
+				      sizeof(struct host_state_header));
+
+ done:
+    return state->bail;
+}
+
+/* demand attach fs
+ * host state serialization
+ *
+ * restores all host state from disk for fast startup; returns state->bail
+ */
+int
+h_stateRestore(struct fs_dump_state * state)
+{
+    int i, records;
+
+    /* seek to the right position and read in the host state header */
+    if (fs_stateReadHeader(state, &state->hdr->h_offset, state->h_hdr,
+			   sizeof(struct host_state_header))) {
+	state->bail = 1;
+	goto done;
+    }
+
+    /* check the validity of the header */
+    if (h_stateCheckHeader(state->h_hdr)) {
+	state->bail = 1;
+	goto done;
+    }
+
+    records = state->h_hdr->records;	/* count of host entries, as tallied by h_stateSaveHost */
+
+    if (h_stateAllocMap(state)) {	/* old->new index table, filled in by h_stateRestoreHost */
+	state->bail = 1;
+	goto done;
+    }
+
+    /* iterate over records restoring host state */
+    for (i=0; i < records; i++) {
+	if (h_stateRestoreHost(state) != 0) {
+	    state->bail = 1;
+	    break;
+	}
+    }
+
+ done:
+    return state->bail;
+}
+
+int
+h_stateRestoreIndices(struct fs_dump_state * state)
+{
+    h_Enumerate_r(h_stateRestoreIndex, hostList, (char *)state);
+    return state->bail;	/* set by h_stateRestoreIndex when a translation fails */
+}
+
+static int	/* h_Enumerate_r callback: translate h->cblist through cb_OldToNew */
+h_stateRestoreIndex(struct host * h, int held, struct fs_dump_state * state)
+{
+    if (!cb_OldToNew(state, h->cblist, &h->cblist))
+	return held;
+    state->bail = 1;	/* without this h_stateRestoreIndices could report success */
+    return H_ENUMERATE_BAIL(held);
+}
+
+int
+h_stateVerify(struct fs_dump_state * state)
+{
+    h_Enumerate_r(h_stateVerifyHost, hostList, (char *)state);	/* runs h_stateVerifyHost on each host in hostList */
+    return state->bail;	/* nonzero if any check recorded a failure */
+}
+
+static int
+h_stateVerifyHost(struct host * h, int held, struct fs_dump_state * state)
+{
+    int i;
+
+    if (h == NULL) {
+	ViceLog(0, ("h_stateVerifyHost: error: NULL host pointer in linked list\n"));
+	state->bail = 1;	/* h_stateVerify reports state->bail to its caller */
+	return H_ENUMERATE_BAIL(held);
+    }
+
+    if (h->interface) {
+	for (i = h->interface->numberOfInterfaces-1; i >= 0; i--) {
+	    if (h_stateVerifyAddrHash(state, h, h->interface->interface[i].addr,
+				      h->interface->interface[i].port)) {
+		state->bail = 1;
+	    }
+	}
+	if (h_stateVerifyUuidHash(state, h)) {
+	    state->bail = 1;
+	}
+    } else if (h_stateVerifyAddrHash(state, h, h->host, h->port)) {
+	state->bail = 1;
+    }
+
+    if (cb_stateVerifyHCBList(state, h)) {
+	state->bail = 1;
+    }
+
+    return held;	/* no bail flag: the enumeration goes on to the remaining hosts */
+}
+
+static int
+h_stateVerifyAddrHash(struct fs_dump_state * state, struct host * h, afs_uint32 addr, afs_uint16 port)
+{
+    int ret = 0, found = 0;
+    struct host *host = NULL;
+    struct h_hashChain *chain;
+    int index = h_HashIndex(addr);
+    char tmp[16];
+    int chain_len = 0;
+    /* search addr's hash bucket for an (addr, port) entry; returns 1 on a hard error, else 0 */
+    for (chain = hostHashTable[index]; chain; chain = chain->next) {
+	host = chain->hostPtr;
+	if (host == NULL) {
+	    afs_inet_ntoa_r(addr, tmp);
+	    ViceLog(0, ("h_stateVerifyAddrHash: error: addr hash chain has NULL host ptr (lookup addr %s)\n", tmp));
+	    ret = 1;
+	    goto done;
+	}
+	if ((chain->addr == addr) && (chain->port == port)) {
+	    if (host != h) {	/* entry exists but names another host struct: warning only */
+		ViceLog(0, ("h_stateVerifyAddrHash: warning: addr hash entry points to different host struct (%d, %d)\n", 
+			    h->index, host->index));
+		state->flags.warnings_generated = 1;
+	    }
+	    found = 1;
+	    break;
+	}
+	if (chain_len > FS_STATE_H_MAX_ADDR_HASH_CHAIN_LEN) {	/* loop guard */
+	    ViceLog(0, ("h_stateVerifyAddrHash: error: hash chain length exceeds %d; assuming there's a loop\n",
+			FS_STATE_H_MAX_ADDR_HASH_CHAIN_LEN));
+	    ret = 1;
+	    goto done;
+	}
+	chain_len++;
+    }
+
+    if (!found) {
+	afs_inet_ntoa_r(addr, tmp);
+	if (state->mode == FS_STATE_LOAD_MODE) {	/* a missing entry is an error only in load mode */
+	    ViceLog(0, ("h_stateVerifyAddrHash: error: addr %s not found in hash\n", tmp));
+	    ret = 1;
+	    goto done;
+	} else {
+	    ViceLog(0, ("h_stateVerifyAddrHash: warning: addr %s not found in hash\n", tmp));
+	    state->flags.warnings_generated = 1;
+	}
+    }
+
+ done:
+    return ret;
+}
+
+static int
+h_stateVerifyUuidHash(struct fs_dump_state * state, struct host * h)
+{
+    int ret = 0, found = 0;
+    struct host *host = NULL;
+    struct h_hashChain *chain;
+    afsUUID * uuidp = &h->interface->uuid;	/* caller must ensure h->interface is non-NULL */
+    int index = h_UuidHashIndex(uuidp);
+    char tmp[40];
+    int chain_len = 0;
+    /* search the uuid's hash bucket for a host with an equal uuid; returns 1 on a hard error, else 0 */
+    for (chain = hostUuidHashTable[index]; chain; chain = chain->next) {
+	host = chain->hostPtr;
+	if (host == NULL) {
+	    afsUUID_to_string(uuidp, tmp, sizeof(tmp));
+	    ViceLog(0, ("h_stateVerifyUuidHash: error: uuid hash chain has NULL host ptr (lookup uuid %s)\n", tmp));
+	    ret = 1;
+	    goto done;
+	}
+	if (host->interface &&
+	    afs_uuid_equal(&host->interface->uuid, uuidp)) {
+	    if (host != h) {	/* same uuid on another host struct: warning only */
+		ViceLog(0, ("h_stateVerifyUuidHash: warning: uuid hash entry points to different host struct (%d, %d)\n", 
+			    h->index, host->index));
+		state->flags.warnings_generated = 1;
+	    }
+	    found = 1;
+	    goto done;
+	}
+	if (chain_len > FS_STATE_H_MAX_UUID_HASH_CHAIN_LEN) {	/* loop guard */
+	    ViceLog(0, ("h_stateVerifyUuidHash: error: hash chain length exceeds %d; assuming there's a loop\n",
+			FS_STATE_H_MAX_UUID_HASH_CHAIN_LEN));
+	    ret = 1;
+	    goto done;
+	}
+	chain_len++;
+    }
+
+    if (!found) {
+	afsUUID_to_string(uuidp, tmp, sizeof(tmp));
+	if (state->mode == FS_STATE_LOAD_MODE) {	/* a missing entry is an error only in load mode */
+	    ViceLog(0, ("h_stateVerifyUuidHash: error: uuid %s not found in hash\n", tmp));
+	    ret = 1;
+	    goto done;
+	} else {
+	    ViceLog(0, ("h_stateVerifyUuidHash: warning: uuid %s not found in hash\n", tmp));
+	    state->flags.warnings_generated = 1;
+	}
+    }
+
+ done:
+    return ret;
+}
+
+/* stamp the host state header with its magic and version; always returns 0 */
+static int
+h_stateFillHeader(struct host_state_header * hdr)
+{
+    hdr->stamp.magic = HOST_STATE_MAGIC;
+    hdr->stamp.version = HOST_STATE_VERSION;
+    return 0;
+}
+/* check the magic and version stamp of the host state header; returns 1 if either is wrong */
+static int
+h_stateCheckHeader(struct host_state_header * hdr)
+{
+    int ret=0;
+
+    if (hdr->stamp.magic != HOST_STATE_MAGIC) {
+	ViceLog(0, ("h_stateCheckHeader: invalid state header\n"));
+	ret = 1;
+    }
+    else if (hdr->stamp.version != HOST_STATE_VERSION) {
+	ViceLog(0, ("h_stateCheckHeader: unknown version number\n"));
+	ret = 1;
+    }
+    return ret;
+}
+
+/* allocate the zero-filled host id mapping table; returns 0 on success, 1 if calloc fails */
+static int
+h_stateAllocMap(struct fs_dump_state * state)
+{
+    state->h_map.len = state->h_hdr->index_max + 1;	/* one slot for each index in 0..index_max */
+    state->h_map.entries = (struct idx_map_entry_t *)
+	calloc(state->h_map.len, sizeof(struct idx_map_entry_t));
+    return (state->h_map.entries != NULL) ? 0 : 1;
+}
+
+/* h_Enumerate_r callback: append one host record to the state file; bails on write failure */
+static int
+h_stateSaveHost(register struct host * host, int held, struct fs_dump_state * state)
+{
+    int if_len=0, hcps_len=0;
+    struct hostDiskEntry hdsk;
+    struct host_state_entry_header hdr;
+    struct Interface * ifp = NULL;
+    afs_int32 * hcps = NULL;
+    struct iovec iov[4];
+    int iovcnt = 2;		/* iov[0] and iov[1] are filled in below */
+
+    /* zero both on-disk structs so that no uninitialized stack bytes reach the file */
+    memset(&hdr, 0, sizeof(hdr));
+    memset(&hdsk, 0, sizeof(hdsk));
+
+    if (state->h_hdr->index_max < host->index) {
+	state->h_hdr->index_max = host->index;
+    }
+
+    h_hostToDiskEntry_r(host, &hdsk);
+    if (host->interface) {
+	if_len = sizeof(struct Interface) +
+	    ((host->interface->numberOfInterfaces-1) * sizeof(struct AddrPort));
+	ifp = (struct Interface *) malloc(if_len);
+	assert(ifp != NULL);
+	memcpy(ifp, host->interface, if_len);
+	hdr.interfaces = host->interface->numberOfInterfaces;
+	iov[iovcnt].iov_base = (char *) ifp;
+	iov[iovcnt].iov_len = if_len;
+	iovcnt++;
+    }
+    /* an empty CPS gets no payload (and no malloc(0)); hdsk.hcps_valid still records it */
+    if (host->hcps.prlist_val && host->hcps.prlist_len) {
+	hdr.hcps = host->hcps.prlist_len;
+	hcps_len = hdr.hcps * sizeof(afs_int32);
+	hcps = (afs_int32 *) malloc(hcps_len);
+	assert(hcps != NULL);
+	memcpy(hcps, host->hcps.prlist_val, hcps_len);
+	iov[iovcnt].iov_base = (char *) hcps;
+	iov[iovcnt].iov_len = hcps_len;
+	iovcnt++;
+    }
+
+    /* hdr.len covers the entry header, the disk entry, and both optional payloads */
+    hdr.len = sizeof(struct host_state_entry_header) +
+	sizeof(struct hostDiskEntry) + if_len + hcps_len;
+    hdr.magic = HOST_STATE_ENTRY_MAGIC;
+
+    iov[0].iov_base = (char *) &hdr;
+    iov[0].iov_len = sizeof(hdr);
+    iov[1].iov_base = (char *) &hdsk;
+    iov[1].iov_len = sizeof(struct hostDiskEntry);
+
+    if (fs_stateWriteV(state, iov, iovcnt)) {
+	ViceLog(0, ("h_stateSaveHost: failed to save host %d\n", host->index));
+	state->bail = 1;
+	goto done;		/* do not account for a record that was not written */
+    }
+
+    fs_stateIncEOF(state, hdr.len);
+
+    state->h_hdr->records++;
+
+ done:
+    free(ifp);			/* free(NULL) is a no-op */
+    free(hcps);
+    if (state->bail) {
+	return H_ENUMERATE_BAIL(held);
+    }
+    return held;
+}
+
+/* restores a host from disk; returns nonzero if the record cannot be read or fails validation */
+static int
+h_stateRestoreHost(struct fs_dump_state * state)
+{
+    int i, ifp_len=0, hcps_len=0, bail=0;
+    struct host_state_entry_header hdr;
+    struct hostDiskEntry hdsk;
+    struct host *host = NULL;
+    struct Interface *ifp = NULL;
+    afs_int32 * hcps = NULL;
+    struct iovec iov[3];
+    int iovcnt = 1;
+
+    if (fs_stateRead(state, &hdr, sizeof(hdr))) {
+	ViceLog(0, ("h_stateRestoreHost: failed to read host entry header from dump file '%s'\n",
+		    state->fn));
+	bail = 1;
+	goto done;
+    }
+
+    if (hdr.magic != HOST_STATE_ENTRY_MAGIC) {
+	ViceLog(0, ("h_stateRestoreHost: fileserver state dump file '%s' is corrupt.\n",
+		    state->fn));
+	bail = 1;
+	goto done;
+    }
+
+    iov[0].iov_base = (char *) &hdsk;
+    iov[0].iov_len = sizeof(struct hostDiskEntry);
+    if (hdr.interfaces) {
+	ifp_len = sizeof(struct Interface) +
+	    ((hdr.interfaces-1) * sizeof(struct AddrPort));
+	ifp = (struct Interface *) malloc(ifp_len);
+	assert(ifp != NULL);
+	iov[iovcnt].iov_base = (char *) ifp;
+	iov[iovcnt].iov_len = ifp_len;
+	iovcnt++;
+    }
+    if (hdr.hcps) {
+	hcps_len = hdr.hcps * sizeof(afs_int32);
+	hcps = (afs_int32 *) malloc(hcps_len);
+	assert(hcps != NULL);
+	iov[iovcnt].iov_base = (char *) hcps;
+	iov[iovcnt].iov_len = hcps_len;
+	iovcnt++;
+    }
+
+    if ((ifp_len + hcps_len + sizeof(hdsk) + sizeof(hdr)) != hdr.len) {
+	ViceLog(0, ("h_stateRestoreHost: host entry header length fields are inconsistent\n"));
+	bail = 1;
+	goto done;
+    }
+
+    if (fs_stateReadV(state, iov, iovcnt)) {
+	ViceLog(0, ("h_stateRestoreHost: failed to read host entry\n"));
+	bail = 1;
+	goto done;
+    }
+
+    if ((hdsk.index >= state->h_map.len) ||	/* would overrun h_map.entries[] below */
+	(ifp && (ifp->numberOfInterfaces != hdr.interfaces))) {	/* would overrun ifp->interface[] */
+	ViceLog(0, ("h_stateRestoreHost: host entry index or interface count is inconsistent with the state file header\n"));
+	bail = 1;
+	goto done;
+    }
+
+    if (!hdr.hcps && hdsk.hcps_valid) {
+	/* valid, zero-length host cps ; does this ever happen? */
+	hcps = (afs_int32 *) malloc(sizeof(afs_int32));
+	assert(hcps != NULL);
+    }
+
+    host = GetHT();
+    assert(host != NULL);
+    if (hcps) {
+	host->hcps.prlist_val = hcps;
+	host->hcps.prlist_len = hdr.hcps;
+    }
+
+    h_diskEntryToHost_r(&hdsk, host);
+    h_SetupCallbackConn_r(host);
+
+    if (ifp) {
+	host->interface = ifp;	/* the host now owns ifp */
+	for (i = ifp->numberOfInterfaces-1; i >= 0; i--) {
+	    h_AddHostToHashTable_r(ifp->interface[i].addr,
+				   ifp->interface[i].port, host);
+	}
+	h_AddHostToUuidHashTable_r(&ifp->uuid, host);
+    } else {
+	h_AddHostToHashTable_r(host->host, host->port, host);
+    }
+    h_InsertList_r(host);
+
+    /* setup host id map entry */
+    state->h_map.entries[hdsk.index].old_idx = hdsk.index;
+    state->h_map.entries[hdsk.index].new_idx = host->index;
+
+ done:
+    if (bail) {
+	free(ifp);		/* free(NULL) is a no-op */
+	free(hcps);
+    }
+    return bail;
+}
+
+/* copy the saved fields of a host structure into its disk entry; no I/O happens here */
+static void
+h_hostToDiskEntry_r(struct host * in, struct hostDiskEntry * out)
+{
+    out->host = in->host;
+    out->port = in->port;
+    out->hostFlags = in->hostFlags;
+    out->Console = in->Console;
+    out->hcpsfailed = in->hcpsfailed;
+    out->LastCall = in->LastCall;
+    out->ActiveCall = in->ActiveCall;
+    out->cpsCall = in->cpsCall;
+    out->cblist = in->cblist;	/* saved as-is; h_stateRestoreIndex translates it via cb_OldToNew */
+#ifdef FS_STATS_DETAILED
+    out->InSameNetwork = in->InSameNetwork;
+#endif
+
+    /* special fields we save, but that h_diskEntryToHost_r does not copy back on restore */
+    out->index = in->index;	/* h_stateRestoreHost keys the old->new index map on this */
+    out->hcps_len = in->hcps.prlist_len;
+    out->hcps_valid = (in->hcps.prlist_val == NULL) ? 0 : 1;
+}
+
+/* copy the saved fields of a disk entry into a host structure (index and hcps are not touched) */
+static void
+h_diskEntryToHost_r(struct hostDiskEntry * in, struct host * out)
+{
+    out->host = in->host;
+    out->port = in->port;
+    out->hostFlags = in->hostFlags;
+    out->Console = in->Console;
+    out->hcpsfailed = in->hcpsfailed;
+    out->LastCall = in->LastCall;
+    out->ActiveCall = in->ActiveCall;
+    out->cpsCall = in->cpsCall;
+    out->cblist = in->cblist;	/* still the saved index; h_stateRestoreIndex translates it via cb_OldToNew */
+#ifdef FS_STATS_DETAILED
+    out->InSameNetwork = in->InSameNetwork;
+#endif
+}
+
+/* index translation routines: map a host index saved in the state file (old)
+ * to the index of the host restored from it (*new); returns nonzero on error */
+int
+h_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new)
+{
+    int ret = 0;
+
+    /* hosts use a zero-based index, so old==0 is valid */
+
+    if (old >= state->h_map.len) {
+	ViceLog(0, ("h_OldToNew: index %u is out of range\n", old));
+	ret = 1;
+    } else if (state->h_map.entries[old].old_idx != old) { /* sanity check */
+	ViceLog(0, ("h_OldToNew: index %u points to an invalid host record\n", old));
+	ret = 1;
+    } else {
+	*new = state->h_map.entries[old].new_idx;
+    }
+
+    return ret;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
 
 /*
  * This counts the number of workstations, the number of active workstations,
@@ -2348,13 +2906,23 @@ static struct AFSFid zerofid;
  * Since it can serialize them, and pile up, it should be a separate LWP
  * from other events.
  */
-int
+static int
 CheckHost(register struct host *host, int held)
 {
     register struct client *client;
     struct rx_connection *cb_conn = NULL;
     int code;
 
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* kill the checkhost lwp ASAP during shutdown */
+    FS_STATE_RDLOCK;
+    if (fs_state.mode == FS_MODE_SHUTDOWN) {
+	FS_STATE_UNLOCK;
+	return H_ENUMERATE_BAIL(held);
+    }
+    FS_STATE_UNLOCK;
+#endif
+
     /* Host is held by h_Enumerate */
     H_LOCK;
     for (client = host->FirstClient; client; client = client->next) {
@@ -2455,7 +3023,7 @@ CheckHost(register struct host *host, int held)
  * This routine is called roughly every 5 minutes.
  */
 void
-h_CheckHosts()
+h_CheckHosts(void)
 {
     afs_uint32 now = FT_ApproxTime();
 
@@ -2570,7 +3138,7 @@ initInterfaceAddr_r(struct host *host, struct interfaceAddr *interf)
 /* deleted a HashChain structure for this address and host */
 /* returns 1 on success */
 static int
-hashDelete_r(afs_uint32 addr, afs_uint16 port, struct host *host)
+h_DeleteHostFromHashTableByAddr_r(afs_uint32 addr, afs_uint16 port, struct host *host)
 {
     int flag;
     register struct h_hashChain **hp, *th;
diff --git a/src/viced/host.h b/src/viced/host.h
index bd17cfd156..60df3bcea7 100644
--- a/src/viced/host.h
+++ b/src/viced/host.h
@@ -5,8 +5,13 @@
  * This software has been released under the terms of the IBM Public
  * License.  For details, see the LICENSE file in the top-level source
  * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
  */
 
+#ifndef _AFS_VICED_HOST_H
+#define _AFS_VICED_HOST_H
+
 #include "fs_stats.h"		/*File Server stats package */
 
 #ifdef AFS_PTHREAD_ENV
@@ -59,6 +64,7 @@ struct Interface {
     struct AddrPort interface[1];/* there are actually more than one here */
     /* in network byte order */
 };
+
 struct host {
     struct host *next, *prev;	/* linked list of all hosts */
     struct rx_connection *callback_rxcon;	/* rx callback connection */
@@ -85,7 +91,7 @@ struct host {
     struct client *FirstClient;	/* first connection from host */
     afs_uint32 cpsCall;		/* time of last cps call from this host */
     struct Interface *interface;	/* all alternate addr for client */
-    afs_uint32 cblist;		/* Call back list for this host */
+    afs_uint32 cblist;		/* index of a cb in the per-host circular CB list */
     /*
      * These don't get zeroed, keep them at the end. If index doesn't
      * follow an unsigned short then we need to pad to ensure that
@@ -142,6 +148,7 @@ struct client {
 /* Don't zero the lock */
 #define CLIENT_TO_ZERO(C)	((int)(((char *)(&((C)->lock))-(char *)(C))))
 
+
 /*
  * key for the client structure stored in connection specific data
  */
@@ -245,6 +252,19 @@ extern void h_CheckHosts();
 struct Interface *MultiVerifyInterface_r();
 extern int initInterfaceAddr_r(struct host *host, struct interfaceAddr *interf);
 
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * state serialization
+ */
+extern int h_SaveState(void);
+extern int h_RestoreState(void);
+#endif
+/* h_Enumerate proc return value: bit 31 asks to stop the walk, the low 31 bits mean "host still held" */
+#define H_ENUMERATE_BAIL(held)        ((held)|0x80000000)
+#define H_ENUMERATE_ISSET_BAIL(held)  ((held)&0x80000000)
+#define H_ENUMERATE_ISSET_HELD(held)  ((held)&0x7FFFFFFF)
+
 struct host *(hosttableptrs[h_MAXHOSTTABLES]);	/* Used by h_itoh */
 #define h_htoi(host) ((host)->index)	/* index isn't zeroed, no need to lock */
 #define h_itoh(hostindex) (hosttableptrs[(hostindex)>>h_HTSHIFT]+((hostindex)&(h_HTSPERBLOCK-1)))
@@ -269,4 +289,4 @@ struct host *(hosttableptrs[h_MAXHOSTTABLES]);	/* Used by h_itoh */
 #define HFE_LATER                       0x80	/* host has FE_LATER callbacks */
 #define HERRORTRANS                    0x100	/* do error translation */
 
-
+#endif /* _AFS_VICED_HOST_H */
diff --git a/src/viced/viced.c b/src/viced/viced.c
index 1202d933a4..1c7296bf22 100644
--- a/src/viced/viced.c
+++ b/src/viced/viced.c
@@ -5,6 +5,8 @@
  * This software has been released under the terms of the IBM Public
  * License.  For details, see the LICENSE file in the top-level source
  * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
  */
 
 /*  viced.c	- File Server main loop					 */
@@ -215,6 +217,27 @@ afsUUID FS_HostUUID;
 
 static void FlagMsg();
 
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * fileserver mode support
+ *
+ * during fileserver shutdown, we have to track the graceful shutdown of
+ * certain background threads before we are allowed to dump state to
+ * disk
+ */
+struct fs_state fs_state = 	/* positional initializer: values must follow struct fs_state's member order */
+    { FS_MODE_NORMAL, 		/* presumably .mode; ShutDownAndCore later sets FS_MODE_SHUTDOWN -- TODO confirm order */
+      0, 
+      0, 
+      0, 
+      0,
+      { 1,1,1,1 },
+      PTHREAD_COND_INITIALIZER,
+      PTHREAD_RWLOCK_INITIALIZER
+    };
+#endif /* AFS_DEMAND_ATTACH_FS */
+
 /*
  * Home for the performance statistics.
  */
@@ -420,13 +443,31 @@ FiveMinuteCheckLWP()
 
     ViceLog(1, ("Starting five minute check process\n"));
     setThreadId("FiveMinuteCheckLWP");
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    FS_STATE_WRLOCK;
+    while (fs_state.mode == FS_MODE_NORMAL) {
+	fs_state.FiveMinuteLWP_tranquil = 1;
+	FS_STATE_UNLOCK;
+#else
     while (1) {
+#endif
+
 #ifdef AFS_PTHREAD_ENV
 	sleep(fiveminutes);
 #else /* AFS_PTHREAD_ENV */
 	IOMGR_Sleep(fiveminutes);
 #endif /* AFS_PTHREAD_ENV */
 
+#ifdef AFS_DEMAND_ATTACH_FS
+	FS_STATE_WRLOCK;
+	if (fs_state.mode != FS_MODE_NORMAL) {
+	    break;
+	}
+	fs_state.FiveMinuteLWP_tranquil = 0;
+	FS_STATE_UNLOCK;
+#endif
+
 	/* close the log so it can be removed */
 	ReOpenLog(AFSDIR_SERVER_FILELOG_FILEPATH);	/* don't trunc, just append */
 	ViceLog(2, ("Cleaning up timed out callbacks\n"));
@@ -452,7 +493,17 @@ FiveMinuteCheckLWP()
 			 afs_ctime(&now, tbuffer, sizeof(tbuffer))));
 	    }
 	}
+#ifdef AFS_DEMAND_ATTACH_FS
+	FS_STATE_WRLOCK;
+#endif
     }
+#ifdef AFS_DEMAND_ATTACH_FS
+    fs_state.FiveMinuteLWP_tranquil = 1;
+    FS_LOCK;
+    assert(pthread_cond_broadcast(&fs_state.worker_done_cv)==0);
+    FS_UNLOCK;
+    FS_STATE_UNLOCK;
+#endif
 }				/*FiveMinuteCheckLWP */
 
 
@@ -460,20 +511,50 @@ FiveMinuteCheckLWP()
  * other 5 minute activities because it may be delayed by timeouts when
  * it probes the workstations
  */
+
 static void
 HostCheckLWP()
 {
     ViceLog(1, ("Starting Host check process\n"));
     setThreadId("HostCheckLWP");
-    while (1) {
+#ifdef AFS_DEMAND_ATTACH_FS
+    FS_STATE_WRLOCK;
+    while (fs_state.mode == FS_MODE_NORMAL) {
+	fs_state.HostCheckLWP_tranquil = 1;
+	FS_STATE_UNLOCK;
+#else
+    while(1) {
+#endif
+
 #ifdef AFS_PTHREAD_ENV
 	sleep(fiveminutes);
 #else /* AFS_PTHREAD_ENV */
 	IOMGR_Sleep(fiveminutes);
 #endif /* AFS_PTHREAD_ENV */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+	FS_STATE_WRLOCK;
+	if (fs_state.mode != FS_MODE_NORMAL) {
+	    break;
+	}
+	fs_state.HostCheckLWP_tranquil = 0;
+	FS_STATE_UNLOCK;
+#endif
+
 	ViceLog(2, ("Checking for dead venii & clients\n"));
 	h_CheckHosts();
+
+#ifdef AFS_DEMAND_ATTACH_FS
+	FS_STATE_WRLOCK;
+#endif
     }
+#ifdef AFS_DEMAND_ATTACH_FS
+    fs_state.HostCheckLWP_tranquil = 1;
+    FS_LOCK;
+    assert(pthread_cond_broadcast(&fs_state.worker_done_cv)==0);
+    FS_UNLOCK;
+    FS_STATE_UNLOCK;
+#endif
 }				/*HostCheckLWP */
 
 /* This LWP does fsync checks every 5 minutes:  it should not be used for
@@ -496,7 +577,14 @@ FsyncCheckLWP()
     assert(pthread_mutex_init(&fsync_glock_mutex, NULL) == 0);
 #endif
 
-    while (1) {
+#ifdef AFS_DEMAND_ATTACH_FS
+    FS_STATE_WRLOCK;
+    while (fs_state.mode == FS_MODE_NORMAL) {
+	fs_state.FsyncCheckLWP_tranquil = 1;
+	FS_STATE_UNLOCK;
+#else
+    while(1) {
+#endif
 	FSYNC_LOCK;
 #ifdef AFS_PTHREAD_ENV
 	/* rounding is fine */
@@ -513,11 +601,31 @@ FsyncCheckLWP()
 	    ViceLog(0, ("LWP_WaitProcess returned %d\n", code));
 #endif /* AFS_PTHREAD_ENV */
 	FSYNC_UNLOCK;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+	FS_STATE_WRLOCK;
+	if (fs_state.mode != FS_MODE_NORMAL) {
+	    break;
+	}
+	fs_state.FsyncCheckLWP_tranquil = 0;
+	FS_STATE_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
 	ViceLog(2, ("Checking for fsync events\n"));
 	do {
 	    code = BreakLaterCallBacks();
 	} while (code != 0);
+#ifdef AFS_DEMAND_ATTACH_FS
+	FS_STATE_WRLOCK;
+#endif
     }
+#ifdef AFS_DEMAND_ATTACH_FS
+    fs_state.FsyncCheckLWP_tranquil = 1;
+    FS_LOCK;
+    assert(pthread_cond_broadcast(&fs_state.worker_done_cv)==0);
+    FS_UNLOCK;
+    FS_STATE_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
 }
 
 /*------------------------------------------------------------------------
@@ -604,6 +712,11 @@ PrintCounters()
 	    ("Vice was last started at %s\n",
 	     afs_ctime(&StartTime, tbuffer, sizeof(tbuffer))));
 
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* XXX perhaps set extended stats verbosity flags
+     * based upon LogLevel ?? */
+    VPrintExtendedCacheStats(VOL_STATS_PER_CHAIN2);
+#endif
     VPrintCacheStats();
     VPrintDiskStats();
     DStat(&dirbuff, &dircall, &dirio);
@@ -656,6 +769,16 @@ ShutDownAndCore(int dopanic)
     time_t now = time(0);
     char tbuffer[32];
 
+    /* do not allow new requests to be served from now on; all new requests
+     * are returned with an error code of RX_RESTARTING ( transient failure ) */
+    rx_SetRxTranquil();		/* dhruba */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    FS_STATE_WRLOCK;
+    fs_state.mode = FS_MODE_SHUTDOWN;
+    FS_STATE_UNLOCK;
+#endif
+
     ViceLog(0,
 	    ("Shutting down file server at %s",
 	     afs_ctime(&now, tbuffer, sizeof(tbuffer))));
@@ -671,11 +794,34 @@ ShutDownAndCore(int dopanic)
     if (!dopanic)
 	PrintCounters();
 
-    /* do not allows new reqests to be served from now on, all new requests
-     * are returned with an error code of RX_RESTARTING ( transient failure ) */
-    rx_SetRxTranquil();		/* dhruba */
+    /* shut down volume package */
     VShutdown();
 
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (fs_state.options.fs_state_save) {
+	/* 
+	 * demand attach fs
+	 * save fileserver state to disk */
+
+	/* make sure background threads have finished all of their asynchronous 
+	 * work on host and callback structures */
+	FS_STATE_RDLOCK;
+	while (!fs_state.FiveMinuteLWP_tranquil ||
+	       !fs_state.HostCheckLWP_tranquil ||
+	       !fs_state.FsyncCheckLWP_tranquil) {
+	    FS_LOCK;
+	    FS_STATE_UNLOCK;
+	    ViceLog(0, ("waiting for background host/callback threads to quiesce before saving fileserver state...\n"));
+	    assert(pthread_cond_wait(&fs_state.worker_done_cv, &fileproc_glock_mutex) == 0);
+	    FS_UNLOCK;
+	    FS_STATE_RDLOCK;
+	}
+
+	/* ok. it should now be fairly safe. let's do the state dump */
+	fs_stateSave();
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
     if (debugFile) {
 	rx_PrintStats(debugFile);
 	fflush(debugFile);
@@ -715,7 +861,7 @@ ShutDown(void)
 static void
 FlagMsg()
 {
-    char buffer[1024];
+    char buffer[2048];
 
     /* default supports help flag */
 
@@ -743,8 +889,18 @@ FlagMsg()
     strcat(buffer, "[-rxdbg (enable rx debugging)] ");
     strcat(buffer, "[-rxdbge (enable rxevent debugging)] ");
     strcat(buffer, "[-rxmaxmtu <bytes>] ");
-#if AFS_PTHREAD_ENV
-    strcat(buffer, "[-vattachpar <number of volume attach threads>] ");
+#ifdef AFS_DEMAND_ATTACH_FS
+    strcat(buffer, "[-fs-state-dont-save (disable state save during shutdown)] ");
+    strcat(buffer, "[-fs-state-dont-restore (disable state restore during startup)] ");
+    strcat(buffer, "[-fs-state-verify <none|save|restore|both> (default is both)] ");
+    strcat(buffer, "[-vattachpar <max number of volume attach/shutdown threads> (default is 1)] ");
+    strcat(buffer, "[-vhashsize <log(2) of number of volume hash buckets> (default is 8)] ");
+    strcat(buffer, "[-vlrudisable (disable VLRU functionality)] ");
+    strcat(buffer, "[-vlruthresh <minutes before unused volumes become eligible for soft detach> (default is 2 hours)] ");
+    strcat(buffer, "[-vlruinterval <seconds between VLRU scans> (default is 2 minutes)] ");
+    strcat(buffer, "[-vlrumax <max volumes to soft detach in one VLRU scan> (default is 8)] ");
+#elif AFS_PTHREAD_ENV
+    strcat(buffer, "[-vattachpar <number of volume attach threads> (default is 1)] ");
 #endif
 #ifdef	AFS_AIX32_ENV
     strcat(buffer, "[-m <min percentage spare in partition>] ");
@@ -945,11 +1101,62 @@ ParseArgs(int argc, char *argv[])
 #ifdef AFS_PTHREAD_ENV
 	} else if (!strcmp(argv[i], "-vattachpar")) {
             if ((i + 1) >= argc) {
-		fprintf(stderr, "missing argument for -vattachpar\n"); 
+		fprintf(stderr, "missing argument for %s\n", argv[i]); 
 		return -1; 
 	    }
 	    vol_attach_threads = atoi(argv[++i]);
 #endif /* AFS_PTHREAD_ENV */
+#ifdef AFS_DEMAND_ATTACH_FS
+	} else if (!strcmp(argv[i], "-fs-state-dont-save")) {
+	    fs_state.options.fs_state_save = 0;
+	} else if (!strcmp(argv[i], "-fs-state-dont-restore")) {
+	    fs_state.options.fs_state_restore = 0;
+	} else if (!strcmp(argv[i], "-fs-state-verify")) {
+            if ((i + 1) >= argc) {
+		fprintf(stderr, "missing argument for %s\n", argv[i]); 
+		return -1; 
+	    }
+	    i++;
+	    if (!strcmp(argv[i], "none")) {
+		fs_state.options.fs_state_verify_before_save = 0;
+		fs_state.options.fs_state_verify_after_restore = 0;
+	    } else if (!strcmp(argv[i], "save")) {
+		fs_state.options.fs_state_verify_after_restore = 0;
+	    } else if (!strcmp(argv[i], "restore")) {
+		fs_state.options.fs_state_verify_before_save = 0;
+	    } else if (!strcmp(argv[i], "both")) {
+		/* default */
+	    } else {
+		fprintf(stderr, "invalid argument for %s\n", argv[i-1]);
+		return -1;
+	    }
+	} else if (!strcmp(argv[i], "-vhashsize")) {
+            if ((i + 1) >= argc) {
+		fprintf(stderr, "missing argument for %s\n", argv[i]); 
+		return -1; 
+	    }
+	    VSetVolHashSize(atoi(argv[++i]));
+	} else if (!strcmp(argv[i], "-vlrudisable")) {
+	    VLRU_SetOptions(VLRU_SET_ENABLED, 0);
+	} else if (!strcmp(argv[i], "-vlruthresh")) {
+            if ((i + 1) >= argc) {
+		fprintf(stderr, "missing argument for %s\n", argv[i]); 
+		return -1; 
+	    }
+	    VLRU_SetOptions(VLRU_SET_THRESH, 60*atoi(argv[++i]));
+	} else if (!strcmp(argv[i], "-vlruinterval")) {
+            if ((i + 1) >= argc) {
+		fprintf(stderr, "missing argument for %s\n", argv[i]); 
+		return -1; 
+	    }
+	    VLRU_SetOptions(VLRU_SET_INTERVAL, atoi(argv[++i]));
+	} else if (!strcmp(argv[i], "-vlrumax")) {
+            if ((i + 1) >= argc) {
+		fprintf(stderr, "missing argument for %s\n", argv[i]); 
+		return -1; 
+	    }
+	    VLRU_SetOptions(VLRU_SET_MAX, atoi(argv[++i]));
+#endif /* AFS_DEMAND_ATTACH_FS */
 	} else if (!strcmp(argv[i], "-s")) {
 	    Sawsmall = 1;
             if ((i + 1) >= argc) {
@@ -1923,6 +2130,15 @@ main(int argc, char *argv[])
 	exit(1);
     }
 
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (fs_state.options.fs_state_restore) {
+	/*
+	 * demand attach fs
+	 * restore fileserver state */
+	fs_stateRestore();
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
     /*
      * We are done calling fopen/fdopen. It is safe to use a large
      * of the file descriptor cache.
diff --git a/src/viced/viced.h b/src/viced/viced.h
index 3b230e5311..d8c837cad8 100644
--- a/src/viced/viced.h
+++ b/src/viced/viced.h
@@ -5,6 +5,8 @@
  * This software has been released under the terms of the IBM Public
  * License.  For details, see the LICENSE file in the top-level source
  * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
  */
 
 /*  file.h	- include file for the File Server			*/
@@ -20,6 +22,9 @@
  * Start with clean version to sync test and dev trees.
  * */
 
+#ifndef _AFS_VICED_VICED_H
+#define _AFS_VICED_VICED_H
+
 #include <afs/afssyscalls.h>
 #include <afs/afsutil.h>
 #include "fs_stats.h"		/*Defs for xstat-based statistics */
@@ -46,18 +51,6 @@ typedef struct DirHandle {
 } DirHandle;
 
 
-struct cbcounters {
-    int DeleteFiles;
-    int DeleteCallBacks;
-    int BreakCallBacks;
-    int AddCallBacks;
-    int GotSomeSpaces;
-    int DeleteAllCallBacks;
-    int nFEs, nCBs, nblks;
-    int CBsTimedOut;
-    int nbreakers;
-    int GSS1, GSS2, GSS3, GSS4, GSS5;
-};
 
 #define MAXCNTRS (AFS_HIGHEST_OPCODE+1)
 
@@ -219,3 +212,46 @@ extern pthread_mutex_t fsync_glock_mutex;
 #define FSYNC_LOCK
 #define FSYNC_UNLOCK
 #endif /* AFS_PTHREAD_ENV */
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * fileserver mode support: run state shared with the background threads
+ */
+struct fs_state {
+    volatile int mode;                         /* FS_MODE_NORMAL or FS_MODE_SHUTDOWN */
+    volatile byte FiveMinuteLWP_tranquil;      /* five minute check thread is shutdown or sleeping */
+    volatile byte HostCheckLWP_tranquil;       /* host check thread is shutdown or sleeping */
+    volatile byte FsyncCheckLWP_tranquil;      /* fsync check thread is shutdown or sleeping */
+    volatile byte salvsync_fatal_error;        /* fatal error with salvsync comm */
+
+    /* some command-line options we use in
+     * various places
+     *
+     * these fields are immutable once we
+     * go multithreaded */
+    struct {
+	byte fs_state_save;                    /* cleared by -fs-state-dont-save */
+	byte fs_state_restore;                 /* cleared by -fs-state-dont-restore */
+	byte fs_state_verify_before_save;      /* cleared by -fs-state-verify none|restore */
+	byte fs_state_verify_after_restore;    /* cleared by -fs-state-verify none|save */
+    } options;
+
+    pthread_cond_t worker_done_cv;             /* broadcast when a background thread leaves its main loop */
+    pthread_rwlock_t state_lock;               /* taken via FS_STATE_RDLOCK/WRLOCK/UNLOCK */
+};
+
+extern struct fs_state fs_state;
+
+/* this lock is defined to be directly above FS_LOCK in the locking hierarchy */
+#define FS_STATE_RDLOCK  assert(pthread_rwlock_rdlock(&fs_state.state_lock) == 0)
+#define FS_STATE_WRLOCK  assert(pthread_rwlock_wrlock(&fs_state.state_lock) == 0)
+#define FS_STATE_UNLOCK  assert(pthread_rwlock_unlock(&fs_state.state_lock) == 0)
+
+#define FS_MODE_NORMAL    0
+#define FS_MODE_SHUTDOWN  1
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+#endif /* _AFS_VICED_VICED_H */
diff --git a/src/viced/viced_prototypes.h b/src/viced/viced_prototypes.h
index df11f8aa5b..556d3500c5 100644
--- a/src/viced/viced_prototypes.h
+++ b/src/viced/viced_prototypes.h
@@ -1,4 +1,27 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+#ifndef _AFS_VICED_VICED_PROTOTYPES_H
+#define _AFS_VICED_VICED_PROTOTYPES_H
+
 extern int sendBufSize;
 afs_int32 sys_error_to_et(afs_int32 in);
 void init_sys_error_to_et(void);
+  
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * fileserver state serialization
+ */
+extern int fs_stateSave(void);
+extern int fs_stateRestore(void);
+#endif /* AFS_DEMAND_ATTACH_FS */
 
+
+#endif /* _AFS_VICED_VICED_PROTOTYPES_H */
diff --git a/src/vol/Makefile.in b/src/vol/Makefile.in
index 114a304997..33131a0600 100644
--- a/src/vol/Makefile.in
+++ b/src/vol/Makefile.in
@@ -16,22 +16,23 @@ LIBS=${TOP_LIBDIR}/libcmd.a vlib.a ${TOP_LIBDIR}/util.a \
 	${TOP_LIBDIR}/libsys.a ${TOP_LIBDIR}/libdir.a \
 	${TOP_LIBDIR}/liblwp.a  ${TOP_LIBDIR}/libacl.a
 
-CFLAGS = ${COMMON_CFLAGS} -D${SYS_NAME} ${FSINCLUDES} ${XCFLAGS} ${ARCHFLAGS}
+CFLAGS = ${COMMON_CFLAGS} -D${SYS_NAME} ${FSINCLUDES} ${XCFLAGS} ${ARCHFLAGS} -DFSSYNC_BUILD_SERVER -DFSSYNC_BUILD_CLIENT
 
-PUBLICHEADERS=nfs.h vnode.h viceinode.h volume.h voldefs.h partition.h\
-	fssync.h ihandle.h namei_ops.h
+PUBLICHEADERS=nfs.h vnode.h viceinode.h volume.h voldefs.h partition.h \
+	fssync.h ihandle.h namei_ops.h salvsync.h daemon_com.h
 
-VLIBOBJS=vnode.o volume.o vutil.o partition.o fssync.o purge.o \
-	 clone.o nuke.o devname.o listinodes.o common.o ihandle.o \
-	 namei_ops.o
+VLIBOBJS=vnode.o volume.o vutil.o partition.o fssync-server.o fssync-client.o \
+	 clone.o nuke.o devname.o listinodes.o common.o ihandle.o purge.o \
+	 namei_ops.o salvsync-server.o salvsync-client.o daemon_com.o
 
-OBJECTS=${VLIBOBJS} physio.o vol-salvage.o vol-info.o vol-dump.o vol-bless.o
+OBJECTS=${VLIBOBJS} physio.o vol-salvage.o vol-info.o vol-dump.o vol-bless.o fssync-debug.o
 
 all: gi \
 	${TOP_LIBDIR}/vlib.a \
 	${TOP_LIBDIR}/libvlib.a \
 	salvager \
 	volinfo \
+	fssync-debug \
 	$(FS_CONV_OSF40D) \
 	$(XFS_SIZE_CHECK) \
 	$(FS_CONV_SOL26) \
@@ -42,6 +43,8 @@ all: gi \
 	${TOP_INCDIR}/afs/voldefs.h \
 	${TOP_INCDIR}/afs/partition.h \
 	${TOP_INCDIR}/afs/fssync.h \
+	${TOP_INCDIR}/afs/salvsync.h \
+	${TOP_INCDIR}/afs/daemon_com.h \
 	${TOP_INCDIR}/afs/ihandle.h \
 	${TOP_INCDIR}/afs/namei_ops.h
 
@@ -53,6 +56,7 @@ install: \
 	${DESTDIR}${libdir}/afs/libvlib.a \
 	${DESTDIR}${afssrvlibexecdir}/salvager \
 	${DESTDIR}${afssrvsbindir}/volinfo \
+	${DESTDIR}${afssrvsbindir}/fssync-debug \
 	$(install_FS_CONV_OSF40D) \
 	$(install_XFS_SIZE_CHECK) \
 	$(install_FS_CONV_SOL26) \
@@ -63,6 +67,8 @@ install: \
 	${DESTDIR}${includedir}/afs/voldefs.h \
 	${DESTDIR}${includedir}/afs/partition.h \
 	${DESTDIR}${includedir}/afs/fssync.h \
+	${DESTDIR}${includedir}/afs/salvsync.h \
+	${DESTDIR}${includedir}/afs/daemon_com.h \
 	${DESTDIR}${includedir}/afs/ihandle.h \
 	${DESTDIR}${includedir}/afs/namei_ops.h
 
@@ -72,6 +78,11 @@ ${DEST}/root.server/usr/afs/bin/salvager: salvager
 ${DEST}/root.server/usr/afs/bin/volinfo: volinfo
 	${INSTALL} -s $? $@
 
+${DEST}/root.server/usr/afs/bin/fssync-debug: fssync-debug
+	if test "@DEMAND_ATTACH@" = "no"; then \
+		${INSTALL} -s $? $@ ; \
+	fi
+
 ${DEST}/lib/afs/vlib.a: vlib.a
 	${INSTALL} $? $@
 
@@ -117,6 +128,12 @@ ${DEST}/include/afs/partition.h: partition.h
 ${DEST}/include/afs/fssync.h: fssync.h
 	${INSTALL} $? $@
 
+${DEST}/include/afs/salvsync.h: salvsync.h
+	${INSTALL} $? $@
+
+${DEST}/include/afs/daemon_com.h: daemon_com.h
+	${INSTALL} $? $@
+
 ${DEST}/include/afs/ihandle.h: ihandle.h
 	${INSTALL} $? $@
 
@@ -129,6 +146,8 @@ ${DEST}/include/afs/namei_ops.h: namei_ops.h
 ${OBJECTS}: ${PUBLICHEADERS} ${TOP_INCDIR}/lwp.h ${TOP_INCDIR}/lock.h ${TOP_INCDIR}/afs/afsint.h vutils.h salvage.h AFS_component_version_number.c
 
 vol-salvage.o vutil.o: volinodes.h
+vol-salvage.o salvager.o: vol-salvage.h
+vol-salvage.o: salvsync.h daemon_com.h
 
 vlib.a:	${VLIBOBJS} AFS_component_version_number.o
 	$(RM) -f $@
@@ -136,8 +155,8 @@ vlib.a:	${VLIBOBJS} AFS_component_version_number.o
 	$(RANLIB) $@
 
 # new salvager:  remove references to /vice by linking with novice.o
-salvager: vol-salvage.o physio.o vlib.a
-	${CC} ${LDFLAGS} -o salvager vol-salvage.o physio.o ${LIBS} ${XLIBS}
+salvager: vol-salvage.o physio.o vlib.a salvager.o ${LIBS}
+	${CC} ${LDFLAGS} -o salvager vol-salvage.o physio.o salvager.o ${LIBS} ${XLIBS}
 
 vol-salvage: vol-salvage.o
 vol-info: vol-info.o physio.o ihandle.o
@@ -167,13 +186,16 @@ volinfo: vol-info.o physio.o ihandle.o ${LIBS}
 	${CC} ${CFLAGS} -o volinfo vol-info.o physio.o \
 		ihandle.o ${LIBS} ${XLIBS}
 
+fssync-debug: fssync-debug.o physio.o AFS_component_version_number.c ${LIBS}
+	${CC} ${LDFLAGS} -o fssync-debug fssync-debug.o physio.o ${LIBS} ${XLIBS}
+
 vol-bless: vol-bless.o physio.o ihandle.o ${LIBS}
 	${CC} ${CFLAGS} -o vol-bless vol-bless.o physio.o ${LIBS} ${XLIBS}
 
-fs_conv_dux40D: fs_conv_411.o
+fs_conv_dux40D: fs_conv_411.o ${LIBS}
 	${CC} ${CFLAGS} ${TOP_LIBDIR}/libcmd.a -o fs_conv_dux40D fs_conv_411.o  ${LIBS} ${XLIBS}
 
-fs_conv_sol26: fs_conv_411.o vlib.a 
+fs_conv_sol26: fs_conv_411.o ${LIBS}
 	${CC} ${CFLAGS} ${TOP_LIBDIR}/libcmd.a -o fs_conv_sol26 fs_conv_411.o  ${LIBS} ${XLIBS}
 
 fs_conv_411.o: fs_conv_411.c AFS_component_version_number.c
@@ -211,6 +233,11 @@ ${DESTDIR}${afssrvlibexecdir}/salvager: salvager
 ${DESTDIR}${afssrvsbindir}/volinfo: volinfo
 	${INSTALL} -s $? $@
 
+${DESTDIR}${afssrvsbindir}/fssync-debug: fssync-debug
+	if test "@DEMAND_ATTACH@" = "no" ; then \
+		${INSTALL} -s $? $@ ; \
+	fi
+
 ${DESTDIR}${includedir}/afs/nfs.h: nfs.h
 	${INSTALL} $? $@
 
@@ -253,6 +280,18 @@ ${DESTDIR}${includedir}/afs/fssync.h: fssync.h
 ${TOP_INCDIR}/afs/fssync.h: fssync.h
 	${INSTALL} $? $@
 
+${DESTDIR}${includedir}/afs/salvsync.h: salvsync.h
+	${INSTALL} $? $@
+
+${TOP_INCDIR}/afs/salvsync.h: salvsync.h
+	${INSTALL} $? $@
+
+${DESTDIR}${includedir}/afs/daemon_com.h: daemon_com.h
+	${INSTALL} $? $@
+
+${TOP_INCDIR}/afs/daemon_com.h: daemon_com.h
+	${INSTALL} $? $@
+
 ${DESTDIR}${includedir}/afs/ihandle.h: ihandle.h
 	${INSTALL} $? $@
 
@@ -265,11 +304,24 @@ ${DESTDIR}${includedir}/afs/namei_ops.h: namei_ops.h
 ${TOP_INCDIR}/afs/namei_ops.h: namei_ops.h
 	${INSTALL} $? $@
 
+${DESTDIR}${includedir}/afs/salvage.h: salvage.h
+	${INSTALL} $? $@
+
+${TOP_INCDIR}/afs/salvage.h: salvage.h
+	${INSTALL} $? $@
+
+${DESTDIR}${includedir}/afs/vol-salvage.h: vol-salvage.h
+	${INSTALL} $? $@
+
+${TOP_INCDIR}/afs/vol-salvage.h: vol-salvage.h
+	${INSTALL} $? $@
+
 dest: \
 	${DEST}/lib/afs/vlib.a \
 	${DEST}/lib/afs/libvlib.a \
 	${DEST}/root.server/usr/afs/bin/salvager \
 	${DEST}/root.server/usr/afs/bin/volinfo \
+	${DEST}/root.server/usr/afs/bin/fssync-debug \
 	$(dest_FS_CONV_OSF40D) \
 	$(dest_XFS_SIZE_CHECK) \
 	$(dest_FS_CONV_SOL26) \
@@ -280,12 +332,14 @@ dest: \
 	${DEST}/include/afs/voldefs.h \
 	${DEST}/include/afs/partition.h \
 	${DEST}/include/afs/fssync.h \
+	${DEST}/include/afs/salvsync.h \
+	${DEST}/include/afs/daemon_com.h \
 	${DEST}/include/afs/ihandle.h \
 	${DEST}/include/afs/namei_ops.h
 
 check-splint::
 	sh $(HELPER_SPLINT) $(CFLAGS) \
-	    vnode.c volume.c vutil.c partition.c fssync.c purge.c \
+	    vnode.c volume.c vutil.c partition.c fssync-server.c fssync-client.c \
 	    clone.c nuke.c devname.c listinodes.c common.c ihandle.c \
-	    namei_ops.c \
-	    physio.c vol-salvage.c vol-info.c vol-bless.c
+	    namei_ops.c salvsync-server.c salvsync-client.c daemon_com.c purge.c \
+	    physio.c vol-salvage.c vol-info.c vol-bless.c fssync-debug.c
diff --git a/src/vol/NTMakefile b/src/vol/NTMakefile
index e09db2b734..096026fe7a 100644
--- a/src/vol/NTMakefile
+++ b/src/vol/NTMakefile
@@ -5,6 +5,8 @@
 # License.  For details, see the LICENSE file in the top-level source
 # directory or online at http://www.openafs.org/dl/license10.html
 
+AFSDEV_AUXCDEFINES = -DFSSYNC_BUILD_SERVER -DFSSYNC_BUILD_CLIENT
+
 RELDIR=vol
 !INCLUDE ..\config\NTMakefile.$(SYS_NAME)
 !INCLUDE ..\config\NTMakefile.version
diff --git a/src/vol/daemon_com.c b/src/vol/daemon_com.c
new file mode 100644
index 0000000000..26bddbf6c9
--- /dev/null
+++ b/src/vol/daemon_com.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * localhost interprocess communication for servers
+ *
+ * currently handled by a localhost socket
+ * (yes, this needs to be replaced someday)
+ */
+
+#ifndef _WIN32
+#define FD_SETSIZE 65536
+#endif
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <sys/types.h>
+#include <stdio.h>
+#ifdef AFS_NT40_ENV
+#include <winsock2.h>
+#include <time.h>
+#else
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <sys/time.h>
+#endif
+#include <errno.h>
+#include <assert.h>
+#include <signal.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+
+
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include "nfs.h"
+#include <afs/errors.h>
+#include "daemon_com.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include <rx/rx_queue.h>
+
+/*@printflike@*/ extern void Log(const char *format, ...);
+
+#ifdef osi_Assert
+#undef osi_Assert
+#endif
+#define osi_Assert(e) (void)(e)
+
+int (*V_BreakVolumeCallbacks) ();
+
+#define MAXHANDLERS	4	/* Up to 4 clients; must be at least 2, so that
+				 * move = dump+restore can run on single server */
+
+#define MAX_BIND_TRIES	5	/* Number of times to retry socket bind */
+
+static int getport(SYNC_client_state * state, struct sockaddr_in *addr);
+static afs_int32 SYNC_ask_internal(SYNC_client_state * state, SYNC_command * com, SYNC_response * res);
+
+/* daemon com SYNC client interface */
+
+/* Connect to the SYNC server at localhost:state->port, retrying with backoff.
+ * Returns 1 if connected, 0 on failure (socket closed, state->fd set to -1). */
+int
+SYNC_connect(SYNC_client_state * state)
+{
+    struct sockaddr_in addr;
+    /* I can't believe the following is needed for localhost connections!! */
+    static time_t backoff[] = { 3, 3, 3, 5, 5, 5, 7, 15, 16, 24, 32, 40, 48, 0 };
+    time_t *timeout = &backoff[0];
+
+    if (state->fd >= 0)
+	return 1;		/* already connected */
+    for (;;) {
+	state->fd = getport(state, &addr);
+	if (connect(state->fd, (struct sockaddr *)&addr, sizeof(addr)) >= 0)
+	    return 1;
+	if (!*timeout)
+	    break;		/* the 0 entry terminates the backoff table */
+	if (!(*timeout & 1))
+	    Log("SYNC_connect temporary failure (will retry)\n");
+	SYNC_disconnect(state);
+	sleep(*timeout++);
+    }
+    perror("SYNC_connect failed (giving up!)");	/* report errno before close() can change it */
+    SYNC_disconnect(state);	/* don't leak the socket or leave a stale fd behind */
+    return 0;
+}
+
+int
+SYNC_disconnect(SYNC_client_state * state)	/* just close the socket; always returns 0 */
+{
+#ifdef AFS_NT40_ENV
+    closesocket(state->fd);
+#else
+    close(state->fd);
+#endif
+    state->fd = -1;		/* SYNC_ask and SYNC_closeChannel test for -1 to detect a closed channel */
+    return 0;
+}
+
+/* Ask the server to shut the channel down gracefully, then close the socket
+ * regardless of the answer.  Returns SYNC_OK if no channel is open, otherwise
+ * the result of the SYNC_ask() call. */
+afs_int32
+SYNC_closeChannel(SYNC_client_state * state)
+{
+    afs_int32 code;
+    SYNC_command com;
+    SYNC_response res;
+    SYNC_PROTO_BUF_DECL(ores);
+
+    if (state->fd == -1)
+	return SYNC_OK;
+
+    memset(&com, 0, sizeof(com));
+    memset(&res, 0, sizeof(res));
+    res.payload.len = SYNC_PROTO_MAX_LEN;
+    res.payload.buf = ores;
+    com.hdr.command = SYNC_COM_CHANNEL_CLOSE;
+    com.hdr.command_len = sizeof(SYNC_command_hdr);
+
+    /* in case the other end dropped, don't do any retries */
+    state->retry_limit = 0;
+    state->hard_timeout = 0;
+
+    code = SYNC_ask(state, &com, &res);
+    if (code == SYNC_OK) {
+	if (res.hdr.response != SYNC_OK) {
+	    Log("SYNC_closeChannel:  channel shutdown request denied; closing socket anyway\n");
+	} else if (!(res.hdr.flags & SYNC_FLAG_CHANNEL_SHUTDOWN)) {
+	    Log("SYNC_closeChannel:  channel shutdown request mishandled by server\n");
+	}
+    } else {
+	Log("SYNC_closeChannel: channel communications problem\n");
+    }
+
+    SYNC_disconnect(state);
+
+    return code;
+}
+
+int
+SYNC_reconnect(SYNC_client_state * state)
+{
+    SYNC_disconnect(state);	/* drop the current socket (sets state->fd to -1) */
+    return SYNC_connect(state);	/* passes SYNC_connect's result straight through */
+}
+
+/* private function: create the TCP socket and fill in the sockaddr struct for us */
+static int
+getport(SYNC_client_state * state, struct sockaddr_in *addr)
+{
+    int sd;
+
+    sd = socket(AF_INET, SOCK_STREAM, 0);	/* not inside assert(): NDEBUG would drop the call */
+    assert(sd >= 0);
+    memset(addr, 0, sizeof(*addr));
+#ifdef STRUCT_SOCKADDR_HAS_SA_LEN
+    addr->sin_len = sizeof(struct sockaddr_in);
+#endif
+    addr->sin_addr.s_addr = htonl(0x7f000001);	/* 127.0.0.1 */
+    addr->sin_family = AF_INET;	/* was localhost->h_addrtype */
+    addr->sin_port = htons(state->port);	/* XXXX htons not _really_ necessary */
+    return sd;
+}
+
+afs_int32
+SYNC_ask(SYNC_client_state * state, SYNC_command * com, SYNC_response * res)
+{
+    int tries;
+    afs_uint32 now, timeout, code=SYNC_OK;
+    /* Send com and wait for res, reconnecting and retrying on SYNC_COM_ERROR. */
+    if (state->fatal_error) {
+	return SYNC_COM_ERROR;	/* channel already disabled by an earlier fatal error */
+    }
+
+    if (state->fd == -1) {
+	SYNC_connect(state);	/* open the channel on first use */
+    }
+    /* still no socket after the connect attempt: disable this channel */
+    if (state->fd == -1) {
+	state->fatal_error = 1;
+	return SYNC_COM_ERROR;
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    com->hdr.flags |= SYNC_FLAG_DAFS_EXTENSIONS;
+#endif
+
+    now = FT_ApproxTime();
+    timeout = now + state->hard_timeout;
+    for (tries = 0; 	/* bounded by both retry_limit and hard_timeout */
+	 (tries <= state->retry_limit) && (now <= timeout);
+	 tries++, now = FT_ApproxTime()) {
+	code = SYNC_ask_internal(state, com, res);
+	if (code == SYNC_OK) {
+	    break;
+	} else if (code == SYNC_BAD_COMMAND) {
+	    Log("SYNC_ask: protocol mismatch; make sure fileserver, volserver, salvageserver and salvager are same version\n");
+	    break;
+	} else if (code == SYNC_COM_ERROR) {
+	    Log("SYNC_ask: protocol communications failure; attempting reconnect to server\n");
+	    SYNC_reconnect(state);
+	    /* try again */
+	} else {
+	    /* unknown (probably protocol-specific) response code, pass it up to the caller, and let them deal with it */
+	    break;
+	}
+    }
+    /* a comm error that survived all retries disables the channel for good */
+    if (code == SYNC_COM_ERROR) {
+	Log("SYNC_ask: fatal protocol error; disabling sync protocol to server running on port %d until next server restart\n", 
+	    state->port);
+	state->fatal_error = 1;
+    }
+
+    return code;
+}
+
+/* Send one command on state->fd and read back one response into res.
+ * Returns res->hdr.response; every local or transport failure is reported
+ * by storing SYNC_COM_ERROR there first. */
+static afs_int32
+SYNC_ask_internal(SYNC_client_state * state, SYNC_command * com, SYNC_response * res)
+{
+    int n;
+    SYNC_PROTO_BUF_DECL(buf);
+#ifndef AFS_NT40_ENV
+    int iovcnt;
+    struct iovec iov[2];
+#endif
+
+    if (state->fd == -1) {
+	Log("SYNC_ask:  invalid sync file descriptor\n");
+	res->hdr.response = SYNC_COM_ERROR;
+	goto done;
+    }
+
+    if (com->hdr.command_len < sizeof(com->hdr) ||	/* would wrap the memcpy length below */
+	com->hdr.command_len > SYNC_PROTO_MAX_LEN) {
+	Log("SYNC_ask:  invalid command_len or internal SYNC buffer too small; please file a bug\n");
+	res->hdr.response = SYNC_COM_ERROR;
+	goto done;
+    }
+
+    com->hdr.proto_version = state->proto_version;
+    memcpy(buf, &com->hdr, sizeof(com->hdr));
+    if (com->payload.len) {
+	memcpy(buf + sizeof(com->hdr), com->payload.buf,
+	       com->hdr.command_len - sizeof(com->hdr));
+    }
+
+#ifdef AFS_NT40_ENV
+    n = send(state->fd, buf, com->hdr.command_len, 0);
+    if (n != com->hdr.command_len) {
+	Log("SYNC_ask:  write failed\n");
+	res->hdr.response = SYNC_COM_ERROR;
+	goto done;
+    }
+    n = recv(state->fd, buf, SYNC_PROTO_MAX_LEN, 0);
+    if (n == 0 || (n < 0 && WSAEINTR != WSAGetLastError())) {
+	Log("SYNC_ask:  No response\n");
+	res->hdr.response = SYNC_COM_ERROR;
+	goto done;
+    }
+#else /* !AFS_NT40_ENV */
+    n = write(state->fd, buf, com->hdr.command_len);
+    if (com->hdr.command_len != n) {
+	Log("SYNC_ask: write failed\n");
+	res->hdr.response = SYNC_COM_ERROR;
+	goto done;
+    }
+    /* receive the response */
+    iov[0].iov_base = (char *)&res->hdr;
+    iov[0].iov_len = sizeof(res->hdr);
+    if (res->payload.len) {
+	iov[1].iov_base = (char *)res->payload.buf;
+	iov[1].iov_len = res->payload.len;
+	iovcnt = 2;
+    } else {
+	iovcnt = 1;
+    }
+    n = readv(state->fd, iov, iovcnt);
+    if (n == 0 || (n < 0 && errno != EINTR)) {
+	Log("SYNC_ask: No response\n");
+	res->hdr.response = SYNC_COM_ERROR;
+	goto done;
+    }
+#endif /* !AFS_NT40_ENV */
+
+    res->recv_len = n;
+    if (n < (int)sizeof(res->hdr)) {	/* signed compare: also catches n < 0 (EINTR) */
+	Log("SYNC_ask:  response too short\n");
+	res->hdr.response = SYNC_COM_ERROR;
+	goto done;
+    }
+#ifdef AFS_NT40_ENV
+    memcpy(&res->hdr, buf, sizeof(res->hdr));
+#endif
+
+    if ((n - sizeof(res->hdr)) > res->payload.len) {
+	Log("SYNC_ask:  response too long\n");
+	res->hdr.response = SYNC_COM_ERROR;
+	goto done;
+    }
+#ifdef AFS_NT40_ENV
+    memcpy(res->payload.buf, buf + sizeof(res->hdr), n - sizeof(res->hdr));
+#endif
+
+    if (res->hdr.response_len != n) {
+	Log("SYNC_ask:  length field in response inconsistent\n");
+	res->hdr.response = SYNC_COM_ERROR;
+	goto done;
+    }
+    if (res->hdr.response == SYNC_DENIED) {
+	Log("SYNC_ask: negative response\n");
+    }
+
+  done:
+    return res->hdr.response;
+}
+
+
+/* 
+ * daemon com SYNC server-side interfaces 
+ */
+
+/* get a command: read one message from fd into com->hdr and com->payload.buf
+ * (at most com->payload.len payload bytes).  Returns SYNC_OK, or
+ * SYNC_COM_ERROR on a receive error or a too-short/too-long message. */
+afs_int32
+SYNC_getCom(int fd, SYNC_command * com)
+{
+    int n;
+    afs_int32 code = SYNC_OK;
+#ifdef AFS_NT40_ENV
+    SYNC_PROTO_BUF_DECL(buf);
+#else
+    struct iovec iov[2];
+    int iovcnt;
+#endif
+
+#ifdef AFS_NT40_ENV
+    n = recv(fd, buf, SYNC_PROTO_MAX_LEN, 0);
+    if (n == 0 || (n < 0 && WSAEINTR != WSAGetLastError())) {
+	Log("SYNC_getCom:  error receiving command\n");
+	code = SYNC_COM_ERROR;
+	goto done;
+    }
+#else /* !AFS_NT40_ENV */
+    iov[0].iov_base = (char *)&com->hdr;
+    iov[0].iov_len = sizeof(com->hdr);
+    if (com->payload.len) {
+	iov[1].iov_base = (char *)com->payload.buf;
+	iov[1].iov_len = com->payload.len;
+	iovcnt = 2;
+    } else {
+	iovcnt = 1;
+    }
+    n = readv(fd, iov, iovcnt);
+    if (n == 0 || (n < 0 && errno != EINTR)) {
+	Log("SYNC_getCom:  error receiving command\n");
+	code = SYNC_COM_ERROR;
+	goto done;
+    }
+#endif /* !AFS_NT40_ENV */
+
+    com->recv_len = n;
+
+    if (n < (int)sizeof(com->hdr)) {	/* signed compare: also catches n < 0 (EINTR) */
+	Log("SYNC_getCom:  command too short\n");
+	code = SYNC_COM_ERROR;
+	goto done;
+    }
+#ifdef AFS_NT40_ENV
+    memcpy(&com->hdr, buf, sizeof(com->hdr));
+#endif
+
+    if ((n - sizeof(com->hdr)) > com->payload.len) {
+	Log("SYNC_getCom:  command too long\n");
+	code = SYNC_COM_ERROR;
+	goto done;
+    }
+#ifdef AFS_NT40_ENV
+    memcpy(com->payload.buf, buf + sizeof(com->hdr), n - sizeof(com->hdr));
+#endif
+
+ done:
+    return code;
+}
+
+/* put a response: send res->hdr followed by the payload bytes that
+ * res->hdr.response_len accounts for.  Returns SYNC_OK on success and
+ * SYNC_COM_ERROR on an inconsistent length or a failed/short write. */
+afs_int32
+SYNC_putRes(int fd, SYNC_response * res)
+{
+    int n;
+    afs_int32 code = SYNC_OK;
+    SYNC_PROTO_BUF_DECL(buf);
+
+    if (res->hdr.response_len < sizeof(res->hdr) ||	/* would wrap the memcpy length below */
+	res->hdr.response_len > (sizeof(res->hdr) + res->payload.len)) {
+	Log("SYNC_putRes:  response_len field in response header inconsistent\n");
+	code = SYNC_COM_ERROR;
+	goto done;
+    }
+
+    if (res->hdr.response_len > SYNC_PROTO_MAX_LEN) {
+	Log("SYNC_putRes:  internal SYNC buffer too small; please file a bug\n");
+	code = SYNC_COM_ERROR;
+	goto done;
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    res->hdr.flags |= SYNC_FLAG_DAFS_EXTENSIONS;
+#endif
+    memcpy(buf, &res->hdr, sizeof(res->hdr));
+    if (res->payload.len) {
+	memcpy(buf + sizeof(res->hdr), res->payload.buf,
+	       res->hdr.response_len - sizeof(res->hdr));
+    }
+#ifdef AFS_NT40_ENV
+    n = send(fd, buf, res->hdr.response_len, 0);
+#else /* !AFS_NT40_ENV */
+    n = write(fd, buf, res->hdr.response_len);
+#endif /* !AFS_NT40_ENV */
+    if (res->hdr.response_len != n) {
+	Log("SYNC_putRes: write failed\n");
+	res->hdr.response = SYNC_COM_ERROR;
+	code = SYNC_COM_ERROR;	/* previously the failure was lost and SYNC_OK returned */
+    }
+
+ done:
+    return code;
+}
+
+/* return 0 for legal (null-terminated) string,
+ * 1 for illegal (unterminated) string */
+int
+SYNC_verifyProtocolString(char * buf, size_t len)
+{
+    size_t s_len;
+
+    /* presumably afs_strnlen() returns len when no NUL byte is found in
+     * the first len bytes -- TODO confirm against util/strnlen.c */
+    s_len = afs_strnlen(buf, len);
+    return (s_len == len) ? 1 : 0;
+}
diff --git a/src/vol/daemon_com.h b/src/vol/daemon_com.h
new file mode 100644
index 0000000000..846436783f
--- /dev/null
+++ b/src/vol/daemon_com.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+#ifndef _AFS_VOL_DAEMON_COM_H
+#define _AFS_VOL_DAEMON_COM_H
+
+/* 
+ * SYNC protocol constants
+ */
+
+/* SYNC protocol command codes
+ *
+ * command codes 0-65535 are reserved for
+ * global SYNC package command codes
+ */
+#define SYNC_COM_CODE_USER_BASE 65536
+#define SYNC_COM_CODE_DECL(code) (SYNC_COM_CODE_USER_BASE+(code))
+
+/* general command codes */
+#define SYNC_COM_CHANNEL_CLOSE 0
+
+
+/* SYNC protocol response codes
+ *
+ * response codes 0-65535 are reserved for 
+ * global SYNC package response codes
+ */
+#define SYNC_RES_CODE_USER_BASE 65536
+#define SYNC_RES_CODE_DECL(code) (SYNC_RES_CODE_USER_BASE+(code))
+
+/* general response codes */
+#define SYNC_OK                0   /* sync call returned ok */
+#define SYNC_DENIED            1   /* sync request denied by server */
+#define SYNC_COM_ERROR         2   /* sync protocol communications error */
+#define SYNC_BAD_COMMAND       3   /* sync command code not implemented by server */
+#define SYNC_FAILED            4   /* sync server-side procedure failed */
+
+
+/* SYNC protocol reason codes
+ *
+ * reason codes 0-65535 are reserved for
+ * global SYNC package reason codes
+ */
+#define SYNC_REASON_CODE_USER_BASE 65536
+#define SYNC_REASON_CODE_DECL(code) (SYNC_REASON_CODE_USER_BASE+(code))
+
+/* general reason codes */
+#define SYNC_REASON_NONE                 0
+#define SYNC_REASON_MALFORMED_PACKET     1
+
+
+/* SYNC protocol flags
+ *
+ * flag bits 0-7 are reserved for
+ * global SYNC package flags
+ */
+#define SYNC_FLAG_CODE_USER_BASE 8
+#define SYNC_FLAG_CODE_DECL(code) (1 << (SYNC_FLAG_CODE_USER_BASE+(code)))
+
+/* general flag codes */
+#define SYNC_FLAG_CHANNEL_SHUTDOWN   0x1
+#define SYNC_FLAG_DAFS_EXTENSIONS    0x2   /* signal that other end of socket is compiled
+					    * with demand attach extensions */
+
+/* SYNC protocol response buffers */
+#define SYNC_PROTO_MAX_LEN     768  /* maximum size of sync protocol message */
+
+/* use a large type to get proper buffer alignment so we can safely cast the pointer */
+#define SYNC_PROTO_BUF_DECL(buf) \
+    afs_int64 _##buf##_l[SYNC_PROTO_MAX_LEN/sizeof(afs_int64)]; \
+    char * buf = (char *)(_##buf##_l)
+
+
+/* client-side state object */
+typedef struct SYNC_client_state {
+    int fd;                     /* socket to the server; -1 when disconnected */
+    afs_uint16 port;            /* server's localhost TCP port (host byte order) */
+    afs_uint32 proto_version;   /* stamped into every outgoing command header */
+    int retry_limit;            /* max number of times for SYNC_ask to retry */
+    afs_int32 hard_timeout;     /* upper limit on time to keep trying */
+    byte fatal_error;           /* fatal error on this client conn */
+} SYNC_client_state;
+
+/* wire types */
+typedef struct SYNC_command_hdr {
+    afs_uint32 proto_version;   /* sync protocol version */
+    afs_int32 programType;      /* type of program issuing the request */
+    afs_int32 command;          /* request type */
+    afs_int32 reason;           /* reason for request */
+    afs_uint32 command_len;     /* entire length of command */
+    afs_uint32 flags;           /* SYNC_FLAG_* bits */
+} SYNC_command_hdr;
+
+typedef struct SYNC_response_hdr {
+    afs_uint32 proto_version;   /* sync protocol version */
+    afs_uint32 response_len;    /* entire length of response */
+    afs_int32 response;         /* response code */
+    afs_int32 reason;           /* reason for response */
+    afs_uint32 flags;           /* SYNC_FLAG_* bits */
+} SYNC_response_hdr;
+
+
+/* user-visible types */
+typedef struct SYNC_command {
+    SYNC_command_hdr hdr;
+    struct {
+	afs_uint32 len;         /* SYNC_getCom: capacity of buf; SYNC_ask: nonzero if a payload is sent */
+	void * buf;             /* payload bytes (follow the header on the wire) */
+    } payload;
+    afs_int32 recv_len;         /* total bytes received; set by SYNC_getCom */
+} SYNC_command;
+
+typedef struct SYNC_response {
+    SYNC_response_hdr hdr;
+    struct {
+	afs_uint32 len;         /* SYNC_ask: capacity of buf; SYNC_putRes: nonzero if a payload is sent */
+	void * buf;             /* payload bytes (follow the header on the wire) */
+    } payload;
+    afs_int32 recv_len;         /* total bytes received; set by SYNC_ask */
+} SYNC_response;
+
+
+/* client-side prototypes */
+extern afs_int32 SYNC_ask(SYNC_client_state *, SYNC_command * com, SYNC_response * res);
+extern int SYNC_connect(SYNC_client_state *);             /* setup the channel */
+extern int SYNC_disconnect(SYNC_client_state *);          /* just close the socket */
+extern afs_int32 SYNC_closeChannel(SYNC_client_state *);  /* do a graceful channel close */
+extern int SYNC_reconnect(SYNC_client_state *);           /* do a reconnect after a protocol error, or from a forked child */
+
+/* server-side prototypes (return types match the definitions in daemon_com.c) */
+extern afs_int32 SYNC_getCom(int fd, SYNC_command * com);   /* receive one command */
+extern afs_int32 SYNC_putRes(int fd, SYNC_response * res);  /* send one response */
+extern int SYNC_verifyProtocolString(char * buf, size_t len);  /* 0 if NUL-terminated within len, else 1 */
+
+#endif /* _AFS_VOL_DAEMON_COM_H */
diff --git a/src/vol/fssync-client.c b/src/vol/fssync-client.c
new file mode 100644
index 0000000000..205a08953d
--- /dev/null
+++ b/src/vol/fssync-client.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
+ */
+
+/*
+	System:		VICE-TWO
+	Module:		fssync-client.c
+	Institution:	The Information Technology Center, Carnegie-Mellon University
+
+ */
+/* Historical declaration kept for reference; compiled only if "notdef" is defined. */
+#ifdef notdef
+
+/* All this is going away in early 1989 */
+int newVLDB;			/* Compatibility flag */
+
+#endif
+/* NOTE(review): not referenced anywhere in the client code that follows -- confirm it is still needed */
+static int newVLDB = 1;
+
+
+/* LWP-only tuning knobs; not defined for pthread builds. */
+#ifndef AFS_PTHREAD_ENV
+#define USUAL_PRIORITY (LWP_MAX_PRIORITY - 2)
+
+/*
+ * stack size increased from 8K because the HP machine seemed to have trouble
+ * with the smaller stack
+ */
+#define USUAL_STACK_SIZE	(24 * 1024)
+#endif /* !AFS_PTHREAD_ENV */
+
+/*
+   fssync-client.c
+   File server synchronization with external volume utilities.
+   client-side implementation
+ */
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <sys/types.h>
+#include <stdio.h>
+#ifdef AFS_NT40_ENV
+#include <winsock2.h>
+#include <time.h>
+#else
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <sys/time.h>
+#endif
+#include <errno.h>
+#ifdef AFS_PTHREAD_ENV
+#include <assert.h>
+#else /* AFS_PTHREAD_ENV */
+#include <afs/assert.h>
+#endif /* AFS_PTHREAD_ENV */
+#include <signal.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+
+
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include "nfs.h"
+#include <afs/errors.h>
+#include "daemon_com.h"
+#include "fssync.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+
+#ifdef FSSYNC_BUILD_CLIENT
+
+/* Logging routine supplied by the program this file is linked into. */
+/*@printflike@*/ extern void Log(const char *format, ...);
+
+/*
+ * Replace any existing osi_Assert with a version that evaluates its
+ * argument and discards the result instead of aborting.
+ */
+#ifdef osi_Assert
+#undef osi_Assert
+#endif
+#define osi_Assert(e) (void)(e)
+
+extern int LogLevel;
+
+/*
+ * Connection state shared by every FSYNC_* client call in this file.
+ * NOTE(review): positional initializer -- presumably fd = -1 (not connected),
+ * port 2040, protocol version, then retry/timeout values; confirm against
+ * the SYNC_client_state definition in daemon_com.h.
+ */
+static SYNC_client_state fssync_state = { -1, 2040, FSYNC_PROTO_VERSION, 5, 120 };
+
+#ifdef AFS_PTHREAD_ENV
+/* Serializes the SYNC_ask() call made by FSYNC_askfs(). */
+static pthread_mutex_t vol_fsync_mutex;
+/*
+ * Nonzero once vol_fsync_mutex has been initialized by FSYNC_clientInit().
+ * The type is spelled out: implicit int is not valid C since C99.
+ */
+static volatile int vol_fsync_mutex_init = 0;
+/*
+ * NOTE(review): the lock/unlock calls sit inside assert(); with the standard
+ * <assert.h> and NDEBUG defined they would be compiled out entirely.
+ * Confirm this file is never built that way.
+ */
+#define VFSYNC_LOCK \
+    assert(pthread_mutex_lock(&vol_fsync_mutex) == 0)
+#define VFSYNC_UNLOCK \
+    assert(pthread_mutex_unlock(&vol_fsync_mutex) == 0)
+#else
+/* No locking is done in non-pthread builds. */
+#define VFSYNC_LOCK
+#define VFSYNC_UNLOCK
+#endif
+
+/*
+ * Prepare the FSSYNC client: create the request mutex on first use
+ * (pthread builds only) and open the channel.
+ *
+ * Returns whatever SYNC_connect() returns.
+ */
+int
+FSYNC_clientInit(void)
+{
+    int connected;
+
+#ifdef AFS_PTHREAD_ENV
+    /*
+     * No locking around the one-time init: callers either hold VOL_LOCK
+     * or run before the process becomes multithreaded.
+     */
+    if (vol_fsync_mutex_init == 0) {
+	assert(pthread_mutex_init(&vol_fsync_mutex, NULL) == 0);
+	vol_fsync_mutex_init = 1;
+    }
+#endif
+    connected = SYNC_connect(&fssync_state);
+    return connected;
+}
+
+/*
+ * Shut the FSSYNC channel down via SYNC_closeChannel(); its result
+ * is deliberately ignored.
+ */
+void
+FSYNC_clientFinis(void)
+{
+    (void) SYNC_closeChannel(&fssync_state);
+}
+
+/*
+ * Re-establish the FSSYNC channel (intended for use from a forked child).
+ *
+ * Returns whatever SYNC_reconnect() returns.
+ */
+int
+FSYNC_clientChildProcReconnect(void)
+{
+    int rc = SYNC_reconnect(&fssync_state);
+
+    return rc;
+}
+
+/* fsync client interface */
+/*
+ * Send one command over the FSSYNC channel and wait for the response.
+ *
+ * The exchange itself is done under VFSYNC_LOCK. Every outcome other
+ * than SYNC_OK / SYNC_FAILED is logged.
+ *
+ * com  command to send
+ * res  filled in with the response
+ *
+ * Returns the code produced by SYNC_ask(), unmodified.
+ */
+afs_int32
+FSYNC_askfs(SYNC_command * com, SYNC_response * res)
+{
+    afs_int32 result;
+
+    VFSYNC_LOCK;
+    result = SYNC_ask(&fssync_state, com, res);
+    VFSYNC_UNLOCK;
+
+    if (result == SYNC_OK || result == SYNC_FAILED) {
+	/* ordinary outcomes; nothing to log */
+    } else if (result == SYNC_COM_ERROR || result == SYNC_BAD_COMMAND) {
+	Log("FSYNC_askfs: fatal FSSYNC protocol error; volume management functionality disabled until next fileserver restart\n");
+    } else if (result == SYNC_DENIED) {
+	Log("FSYNC_askfs: FSSYNC request denied for reason=%d\n", res->hdr.reason);
+    } else {
+	Log("FSYNC_askfs: unknown protocol response %d\n", result);
+    }
+
+    return result;
+}
+
+/*
+ * Build a command around an opaque payload and send it with FSYNC_askfs().
+ *
+ * ext_hdr  payload bytes that follow the command header
+ * ext_len  size of ext_hdr in bytes
+ * command  request type
+ * reason   reason code for the request
+ * res_in   where to store the response; may be NULL, in which case a
+ *          local response with an empty payload buffer is used
+ *
+ * Returns the code from FSYNC_askfs().
+ */
+afs_int32
+FSYNC_GenericOp(void * ext_hdr, size_t ext_len,
+	      int command, int reason,
+	      SYNC_response * res_in)
+{
+    SYNC_response scratch;
+    SYNC_response *reply = res_in;
+    SYNC_command request;
+
+    if (reply == NULL) {
+	/* caller does not want the response: give the transport nowhere to put a payload */
+	scratch.payload.len = 0;
+	scratch.payload.buf = NULL;
+	reply = &scratch;
+    }
+
+    memset(&request, 0, sizeof(request));
+
+    request.payload.len = ext_len;
+    request.payload.buf = ext_hdr;
+    request.hdr.command_len = sizeof(request.hdr) + ext_len;
+    request.hdr.reason = reason;
+    request.hdr.command = command;
+    request.hdr.programType = programType;
+
+    return FSYNC_askfs(&request, reply);
+}
+
+/*
+ * Issue a volume-level FSSYNC request.
+ *
+ * volume     id of the volume the request applies to
+ * partition  partition name, or NULL to send an all-zero name
+ * command    request type
+ * reason     reason code for the request
+ * res        response storage handed on to FSYNC_GenericOp() (may be NULL)
+ *
+ * Returns the code from FSYNC_GenericOp().
+ */
+afs_int32
+FSYNC_VolOp(VolumeId volume, char * partition,
+	    int command, int reason,
+	    SYNC_response * res)
+{
+    FSSYNC_VolOp_hdr op;
+
+    /* zero first so an absent or short partition name leaves no stray bytes */
+    memset(&op, 0, sizeof(op));
+    if (partition != NULL) {
+	strlcpy(op.partName, partition, sizeof(op.partName));
+    }
+    op.volume = volume;
+
+    return FSYNC_GenericOp(&op, sizeof(op), command, reason, res);
+}
+
+/*
+ * Issue a statistics FSSYNC request, sending *scom as the payload.
+ *
+ * Returns the code from FSYNC_GenericOp().
+ */
+afs_int32
+FSYNC_StatsOp(FSSYNC_StatsOp_hdr * scom, int command, int reason,
+	      SYNC_response * res)
+{
+    afs_int32 result;
+
+    result = FSYNC_GenericOp(scom, sizeof(*scom), command, reason, res);
+    return result;
+}
+
+
+#endif /* FSSYNC_BUILD_CLIENT */
diff --git a/src/vol/fssync-debug.c b/src/vol/fssync-debug.c
new file mode 100644
index 0000000000..194204e8ba
--- /dev/null
+++ b/src/vol/fssync-debug.c
@@ -0,0 +1,1148 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/* Main program file. Define globals. */
+#define MAIN 1
+
+/*
+ * fssync administration tool
+ */
+
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <errno.h>
+#ifdef AFS_NT40_ENV
+#include <io.h>
+#include <WINNT/afsevent.h>
+#else
+#include <sys/param.h>
+#include <sys/file.h>
+#ifndef ITIMER_REAL
+#include <sys/time.h>
+#endif /* ITIMER_REAL */
+#endif
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include <afs/assert.h>
+
+
+#include <fcntl.h>
+
+#ifndef AFS_NT40_ENV
+#include <afs/osi_inode.h>
+#endif
+
+#include <afs/cmd.h>
+#include <afs/afsutil.h>
+#include <afs/fileutil.h>
+
+#include "nfs.h"
+#include "lwp.h"
+#include "lock.h"
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include "daemon_com.h"
+#include "fssync.h"
+#ifdef AFS_NT40_ENV
+#include <pthread.h>
+#endif
+
+int VolumeChanged; /* hack to make dir package happy */
+
+
+/* Target of a volume operation, as parsed from the command line. */
+struct volop_state {
+    afs_uint32 volume;		/* from -volumeid */
+    char partName[16];		/* from -partition; all zeroes when not given */
+};
+
+/* Per-invocation state shared by the subcommand handlers. */
+struct state {
+    afs_int32 reason;		/* from -reason; passed as the request's reason code */
+    struct volop_state * vop;	/* allocated by common_volop_prolog(); not set by common_prolog() */
+};
+
+static int common_prolog(struct cmd_syndesc *, struct state *);
+static int common_volop_prolog(struct cmd_syndesc *, struct state *);
+
+static int do_volop(struct state *, afs_int32 command, SYNC_response * res);
+
+static char * response_code_to_string(afs_int32);
+static char * command_code_to_string(afs_int32);
+static char * reason_code_to_string(afs_int32);
+static char * program_type_to_string(afs_int32);
+
+static int VolOnline(struct cmd_syndesc * as, char * rock);
+static int VolOffline(struct cmd_syndesc * as, char * rock);
+static int VolMode(struct cmd_syndesc * as, char * rock);
+static int VolDetach(struct cmd_syndesc * as, char * rock);
+static int VolBreakCBKs(struct cmd_syndesc * as, char * rock);
+static int VolMove(struct cmd_syndesc * as, char * rock);
+static int VolList(struct cmd_syndesc * as, char * rock);
+static int VolQuery(struct cmd_syndesc * as, char * rock);
+static int VolHdrQuery(struct cmd_syndesc * as, char * rock);
+static int VolOpQuery(struct cmd_syndesc * as, char * rock);
+static int StatsQuery(struct cmd_syndesc * as, char * rock);
+
+
+static void print_vol_stats_general(VolPkgStats * stats);
+static void print_vol_stats_viceP(struct DiskPartitionStats * stats);
+static void print_vol_stats_hash(struct VolumeHashChainStats * stats);
+#ifdef AFS_DEMAND_ATTACH_FS
+static void print_vol_stats_hdr(struct volume_hdr_LRU_stats * stats);
+#endif
+
+#ifndef AFS_NT40_ENV
+#include "AFS_component_version_number.c"
+#endif
+#define MAX_ARGS 128
+
+/*
+ * Fixed parameter slots. The offsets below are the indexes into
+ * as->parms[] that common_prolog() / common_volop_prolog() read.
+ *
+ * The *_PARMS macros expand to several statements with no enclosing
+ * braces, so they must only be used as ordinary statements, never as
+ * the unbraced body of an if/for/while.
+ */
+#define COMMON_PARMS_OFFSET    12
+#define COMMON_PARMS(ts) \
+    cmd_Seek(ts, COMMON_PARMS_OFFSET); \
+    cmd_AddParm(ts, "-reason", CMD_SINGLE, CMD_OPTIONAL, "sync protocol reason code"); \
+    cmd_AddParm(ts, "-programtype", CMD_SINGLE, CMD_OPTIONAL, "program type code")
+
+#define COMMON_VOLOP_PARMS_OFFSET    10
+#define COMMON_VOLOP_PARMS(ts) \
+    cmd_Seek(ts, COMMON_VOLOP_PARMS_OFFSET); \
+    cmd_AddParm(ts, "-volumeid", CMD_SINGLE, 0, "volume id"); \
+    cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL, "partition name")
+
+/* first slot used for subcommand-specific parameters (see "stats") */
+#define CUSTOM_PARMS_OFFSET 1
+
+
+/* parameters for volume subcommands: volume slots, then the common slots */
+#define VOLOP_PARMS_DECL(ts) \
+    COMMON_VOLOP_PARMS(ts); \
+    COMMON_PARMS(ts)
+/* parameters for subcommands that only take the common options */
+#define COMMON_PARMS_DECL(ts) \
+    COMMON_PARMS(ts)
+
+/*
+ * Entry point: register every fssync-debug subcommand with the cmd
+ * package, then let cmd_Dispatch() parse argv and run the handler.
+ *
+ * Exits with status 2 if the AFS server directories cannot be located,
+ * otherwise with the value returned by cmd_Dispatch().
+ */
+int
+main(int argc, char **argv)
+{
+    struct cmd_syndesc *ts;
+    int err = 0;
+    extern char cml_version_number[];
+
+    /* Initialize directory paths */
+    if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) {
+#ifdef AFS_NT40_ENV
+	ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0);
+#endif
+	fprintf(stderr, "%s: Unable to obtain AFS server directory.\n",
+		argv[0]);
+	exit(2);
+    }
+
+    /* volume operations: each takes -volumeid/-partition plus the common options */
+    ts = cmd_CreateSyntax("online", VolOnline, 0, "bring a volume online (FSYNC_VOL_ON opcode)");
+    VOLOP_PARMS_DECL(ts);
+
+    ts = cmd_CreateSyntax("offline", VolOffline, 0, "take a volume offline (FSYNC_VOL_OFF opcode)");
+    VOLOP_PARMS_DECL(ts);
+
+    ts = cmd_CreateSyntax("mode", VolMode, 0, "change volume attach mode (FSYNC_VOL_NEEDVOLUME opcode)");
+    VOLOP_PARMS_DECL(ts);
+    cmd_CreateAlias(ts, "needvolume");
+
+    ts = cmd_CreateSyntax("detach", VolDetach, 0, "detach a volume (FSYNC_VOL_DONE opcode)");
+    VOLOP_PARMS_DECL(ts);
+
+    ts = cmd_CreateSyntax("callback", VolBreakCBKs, 0, "break callbacks for volume (FSYNC_VOL_BREAKCBKS opcode)");
+    VOLOP_PARMS_DECL(ts);
+    cmd_CreateAlias(ts, "cbk");
+
+    ts = cmd_CreateSyntax("move", VolMove, 0, "set volume moved flag (FSYNC_VOL_MOVE opcode)");
+    VOLOP_PARMS_DECL(ts);
+
+    ts = cmd_CreateSyntax("list", VolList, 0, "sync local volume list (FSYNC_VOL_LISTVOLUMES opcode)");
+    VOLOP_PARMS_DECL(ts);
+    cmd_CreateAlias(ts, "ls");
+
+    /* queries: same parameters, but the handlers decode and print the payload */
+    ts = cmd_CreateSyntax("query", VolQuery, 0, "get volume structure (FSYNC_VOL_QUERY opcode)");
+    VOLOP_PARMS_DECL(ts);
+    cmd_CreateAlias(ts, "qry");
+
+    ts = cmd_CreateSyntax("header", VolHdrQuery, 0, "get volume disk data structure (FSYNC_VOL_QUERY_HDR opcode)");
+    VOLOP_PARMS_DECL(ts);
+    cmd_CreateAlias(ts, "hdr");
+
+    ts = cmd_CreateSyntax("volop", VolOpQuery, 0, "get pending volume operation info (FSYNC_VOL_QUERY_VOP opcode)");
+    VOLOP_PARMS_DECL(ts);
+    cmd_CreateAlias(ts, "vop");
+
+    /* stats has its own positional parameters ahead of the common options */
+    ts = cmd_CreateSyntax("stats", StatsQuery, 0, "see 'stats help' for more information");
+    cmd_Seek(ts, CUSTOM_PARMS_OFFSET);
+    cmd_AddParm(ts, "-cmd", CMD_SINGLE, 0, "subcommand");
+    cmd_AddParm(ts, "-arg1", CMD_SINGLE, CMD_OPTIONAL, "arg1");
+    cmd_AddParm(ts, "-arg2", CMD_SINGLE, CMD_OPTIONAL, "arg2");
+    COMMON_PARMS_DECL(ts);
+
+    err = cmd_Dispatch(argc, argv);
+    exit(err);
+}
+
+/*
+ * Setup shared by every subcommand: initialize the volume package as a
+ * debugUtility without connecting, apply the -reason and -programtype
+ * options, then connect to the fileserver with VConnectFS().
+ *
+ * as     parsed command line
+ * state  receives the reason code; state->vop is not touched here
+ *
+ * Always returns 0.
+ */
+static int
+common_prolog(struct cmd_syndesc * as, struct state * state)
+{
+    register struct cmd_item *ti;
+
+#ifdef AFS_NT40_ENV
+    if (afs_winsockInit() < 0) {
+	Exit(1);
+    }
+#endif
+
+    VInitVolumePackage(debugUtility, 1, 1,
+		       DONT_CONNECT_FS, 0);
+    DInit(1);
+
+    if ((ti = as->parms[COMMON_PARMS_OFFSET].items)) {	/* -reason */
+	state->reason = atoi(ti->data);
+    } else {
+	/*
+	 * Callers hand in an uninitialized stack struct; without a default
+	 * an arbitrary reason code would be sent to the fileserver.
+	 */
+	state->reason = FSYNC_WHATEVER;
+    }
+    if ((ti = as->parms[COMMON_PARMS_OFFSET+1].items)) {	/* -programtype */
+	/* accept a symbolic name, otherwise treat the value as a number */
+	if (!strcmp(ti->data, "fileServer")) {
+	    programType = fileServer;
+	} else if (!strcmp(ti->data, "volumeUtility")) {
+	    programType = volumeUtility;
+	} else if (!strcmp(ti->data, "salvager")) {
+	    programType = salvager;
+	} else if (!strcmp(ti->data, "salvageServer")) {
+	    programType = salvageServer;
+	} else {
+	    programType = (ProgramType) atoi(ti->data);
+	}
+    }
+
+    /* connect only after programType has its final value */
+    VConnectFS();
+
+    return 0;
+}
+
+/*
+ * Setup shared by the volume subcommands: allocate state->vop and fill
+ * it from -volumeid and -partition.
+ *
+ * as     parsed command line
+ * state  receives a freshly calloc'ed vop (never freed here)
+ *
+ * Always returns 0; a missing -volumeid only produces a message on
+ * stderr and leaves the volume id at 0.
+ */
+static int
+common_volop_prolog(struct cmd_syndesc * as, struct state * state)
+{
+    register struct cmd_item *ti;
+
+    state->vop = (struct volop_state *) calloc(1, sizeof(struct volop_state));
+    assert(state->vop != NULL);
+
+    if ((ti = as->parms[COMMON_VOLOP_PARMS_OFFSET].items)) {	/* -volumeid */
+	/*
+	 * Volume ids are unsigned 32-bit; parse them as unsigned so ids
+	 * above INT_MAX are not mangled the way atoi() can mangle them.
+	 */
+	state->vop->volume = (afs_uint32) strtoul(ti->data, NULL, 10);
+    } else {
+	fprintf(stderr, "required argument -volumeid not given\n");
+    }
+
+    if ((ti = as->parms[COMMON_VOLOP_PARMS_OFFSET+1].items)) {	/* -partition */
+	strlcpy(state->vop->partName, ti->data, sizeof(state->vop->partName));
+    } else {
+	memset(state->vop->partName, 0, sizeof(state->vop->partName));
+    }
+
+    return 0;
+}
+
+/*
+ * Send one volume operation to the fileserver, report the outcome on
+ * stderr, and disconnect.
+ *
+ * state    reason code and target volume/partition
+ * command  FSYNC_VOL_* opcode to send
+ * res      response storage; when NULL a local response backed by a
+ *          SYNC_PROTO_MAX_LEN buffer is used and discarded on return
+ *
+ * Returns the code from FSYNC_VolOp(). (The function previously fell
+ * off the end without returning a value.)
+ */
+static int
+do_volop(struct state * state, afs_int32 command, SYNC_response * res)
+{
+    afs_int32 code;
+    SYNC_PROTO_BUF_DECL(res_buf);
+    SYNC_response res_l;
+
+    if (!res) {
+	res = &res_l;
+	res->payload.len = SYNC_PROTO_MAX_LEN;
+	res->payload.buf = res_buf;
+    }
+
+    fprintf(stderr, "calling FSYNC_VolOp with command code %d (%s)\n", 
+	    command, command_code_to_string(command));
+
+    code = FSYNC_VolOp(state->vop->volume,
+		       state->vop->partName,
+		       command,
+		       state->reason,
+		       res);
+
+    switch (code) {
+    case SYNC_OK:
+    case SYNC_DENIED:
+	break;
+    default:
+	fprintf(stderr, "possible sync protocol error. return code was %d\n", code);
+    }
+
+    /*
+     * NOTE(review): the header fields printed below are only meaningful if
+     * a response was actually received -- confirm SYNC_ask() fills them in
+     * on every path.
+     */
+    fprintf(stderr, "FSYNC_VolOp returned %d (%s)\n", code, response_code_to_string(code));
+    fprintf(stderr, "protocol response code was %d (%s)\n", 
+	    res->hdr.response, response_code_to_string(res->hdr.response));
+    fprintf(stderr, "protocol reason code was %d (%s)\n", 
+	    res->hdr.reason, reason_code_to_string(res->hdr.reason));
+
+    VDisconnectFS();
+
+    return code;
+}
+
+/*
+ * Map a sync response code to its symbolic name.
+ * Returns "**UNKNOWN**" for codes that are not listed.
+ */
+static char *
+response_code_to_string(afs_int32 response)
+{
+    static const struct {
+	afs_int32 code;
+	char *name;
+    } names[] = {
+	{ SYNC_OK, "SYNC_OK" },
+	{ SYNC_DENIED, "SYNC_DENIED" },
+	{ SYNC_COM_ERROR, "SYNC_COM_ERROR" },
+	{ SYNC_BAD_COMMAND, "SYNC_BAD_COMMAND" },
+	{ SYNC_FAILED, "SYNC_FAILED" },
+    };
+    unsigned int i;
+
+    for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+	if (names[i].code == response) {
+	    return names[i].name;
+	}
+    }
+    return "**UNKNOWN**";
+}
+
+/*
+ * Map a sync command opcode to its symbolic name.
+ * Returns "**UNKNOWN**" for opcodes that are not listed.
+ */
+static char *
+command_code_to_string(afs_int32 command)
+{
+    static const struct {
+	afs_int32 code;
+	char *name;
+    } names[] = {
+	{ SYNC_COM_CHANNEL_CLOSE, "SYNC_COM_CHANNEL_CLOSE" },
+	{ FSYNC_VOL_ON, "FSYNC_VOL_ON" },
+	{ FSYNC_VOL_OFF, "FSYNC_VOL_OFF" },
+	{ FSYNC_VOL_LISTVOLUMES, "FSYNC_VOL_LISTVOLUMES" },
+	{ FSYNC_VOL_NEEDVOLUME, "FSYNC_VOL_NEEDVOLUME" },
+	{ FSYNC_VOL_MOVE, "FSYNC_VOL_MOVE" },
+	{ FSYNC_VOL_BREAKCBKS, "FSYNC_VOL_BREAKCBKS" },
+	{ FSYNC_VOL_DONE, "FSYNC_VOL_DONE" },
+	{ FSYNC_VOL_QUERY, "FSYNC_VOL_QUERY" },
+	{ FSYNC_VOL_QUERY_HDR, "FSYNC_VOL_QUERY_HDR" },
+	{ FSYNC_VOL_QUERY_VOP, "FSYNC_VOL_QUERY_VOP" },
+	{ FSYNC_VOL_STATS_GENERAL, "FSYNC_VOL_STATS_GENERAL" },
+	{ FSYNC_VOL_STATS_VICEP, "FSYNC_VOL_STATS_VICEP" },
+	{ FSYNC_VOL_STATS_HASH, "FSYNC_VOL_STATS_HASH" },
+	{ FSYNC_VOL_STATS_HDR, "FSYNC_VOL_STATS_HDR" },
+	{ FSYNC_VOL_STATS_VLRU, "FSYNC_VOL_STATS_VLRU" },
+    };
+    unsigned int i;
+
+    for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+	if (names[i].code == command) {
+	    return names[i].name;
+	}
+    }
+    return "**UNKNOWN**";
+}
+
+/*
+ * Map a sync reason code to its symbolic name.
+ * Returns "**UNKNOWN**" for codes that are not listed.
+ */
+static char *
+reason_code_to_string(afs_int32 reason)
+{
+    static const struct {
+	afs_int32 code;
+	char *name;
+    } names[] = {
+	{ SYNC_REASON_NONE, "SYNC_REASON_NONE" },
+	{ SYNC_REASON_MALFORMED_PACKET, "SYNC_REASON_MALFORMED_PACKET" },
+	{ FSYNC_WHATEVER, "FSYNC_WHATEVER" },
+	{ FSYNC_SALVAGE, "FSYNC_SALVAGE" },
+	{ FSYNC_MOVE, "FSYNC_MOVE" },
+	{ FSYNC_OPERATOR, "FSYNC_OPERATOR" },
+	{ FSYNC_EXCLUSIVE, "FSYNC_EXCLUSIVE" },
+	{ FSYNC_UNKNOWN_VOLID, "FSYNC_UNKNOWN_VOLID" },
+	{ FSYNC_HDR_NOT_ATTACHED, "FSYNC_HDR_NOT_ATTACHED" },
+	{ FSYNC_NO_PENDING_VOL_OP, "FSYNC_NO_PENDING_VOL_OP" },
+	{ FSYNC_VOL_PKG_ERROR, "FSYNC_VOL_PKG_ERROR" },
+    };
+    unsigned int i;
+
+    for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+	if (names[i].code == reason) {
+	    return names[i].name;
+	}
+    }
+    return "**UNKNOWN**";
+}
+
+/*
+ * Map a program type (given as an integer) to its symbolic name.
+ * Returns "**UNKNOWN**" for values that are not listed.
+ */
+static char *
+program_type_to_string(afs_int32 type)
+{
+    static const struct {
+	ProgramType which;
+	char *name;
+    } names[] = {
+	{ fileServer, "fileServer" },
+	{ volumeUtility, "volumeUtility" },
+	{ salvager, "salvager" },
+	{ salvageServer, "salvageServer" },
+	{ debugUtility, "debugUtility" },
+    };
+    ProgramType wanted = (ProgramType)type;
+    unsigned int i;
+
+    for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+	if (names[i].which == wanted) {
+	    return names[i].name;
+	}
+    }
+    return "**UNKNOWN**";
+}
+
+/* "online" subcommand: send FSYNC_VOL_ON for the volume named on the command line. */
+static int
+VolOnline(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    /* common options first, then the volume id / partition */
+    common_prolog(as, &st);
+    common_volop_prolog(as, &st);
+
+    /* NULL response: do_volop() uses its own scratch buffer */
+    do_volop(&st, FSYNC_VOL_ON, NULL);
+    return 0;
+}
+
+/* "offline" subcommand: send FSYNC_VOL_OFF for the volume named on the command line. */
+static int
+VolOffline(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    /* common options first, then the volume id / partition */
+    common_prolog(as, &st);
+    common_volop_prolog(as, &st);
+
+    /* NULL response: do_volop() uses its own scratch buffer */
+    do_volop(&st, FSYNC_VOL_OFF, NULL);
+    return 0;
+}
+
+/* "mode" subcommand: send FSYNC_VOL_NEEDVOLUME for the volume named on the command line. */
+static int
+VolMode(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    /* common options first, then the volume id / partition */
+    common_prolog(as, &st);
+    common_volop_prolog(as, &st);
+
+    /* NULL response: do_volop() uses its own scratch buffer */
+    do_volop(&st, FSYNC_VOL_NEEDVOLUME, NULL);
+    return 0;
+}
+
+/* "detach" subcommand: send FSYNC_VOL_DONE for the volume named on the command line. */
+static int
+VolDetach(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    /* common options first, then the volume id / partition */
+    common_prolog(as, &st);
+    common_volop_prolog(as, &st);
+
+    /* NULL response: do_volop() uses its own scratch buffer */
+    do_volop(&st, FSYNC_VOL_DONE, NULL);
+    return 0;
+}
+
+/* "callback" subcommand: send FSYNC_VOL_BREAKCBKS for the volume named on the command line. */
+static int
+VolBreakCBKs(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    /* common options first, then the volume id / partition */
+    common_prolog(as, &st);
+    common_volop_prolog(as, &st);
+
+    /* NULL response: do_volop() uses its own scratch buffer */
+    do_volop(&st, FSYNC_VOL_BREAKCBKS, NULL);
+    return 0;
+}
+
+/* "move" subcommand: send FSYNC_VOL_MOVE for the volume named on the command line. */
+static int
+VolMove(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    /* common options first, then the volume id / partition */
+    common_prolog(as, &st);
+    common_volop_prolog(as, &st);
+
+    /* NULL response: do_volop() uses its own scratch buffer */
+    do_volop(&st, FSYNC_VOL_MOVE, NULL);
+    return 0;
+}
+
+/* "list" subcommand: send FSYNC_VOL_LISTVOLUMES for the volume named on the command line. */
+static int
+VolList(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    /* common options first, then the volume id / partition */
+    common_prolog(as, &st);
+    common_volop_prolog(as, &st);
+
+    /* NULL response: do_volop() uses its own scratch buffer */
+    do_volop(&st, FSYNC_VOL_LISTVOLUMES, NULL);
+    return 0;
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * Map a volume attachment state to its symbolic name.
+ * Returns "**UNKNOWN**" for states that are not listed.
+ */
+static char *
+vol_state_to_string(VolState state)
+{
+    static const struct {
+	int code;
+	char *name;
+    } names[] = {
+	{ VOL_STATE_UNATTACHED, "VOL_STATE_UNATTACHED" },
+	{ VOL_STATE_PREATTACHED, "VOL_STATE_PREATTACHED" },
+	{ VOL_STATE_ATTACHING, "VOL_STATE_ATTACHING" },
+	{ VOL_STATE_ATTACHED, "VOL_STATE_ATTACHED" },
+	{ VOL_STATE_UPDATING, "VOL_STATE_UPDATING" },
+	{ VOL_STATE_GET_BITMAP, "VOL_STATE_GET_BITMAP" },
+	{ VOL_STATE_HDR_LOADING, "VOL_STATE_HDR_LOADING" },
+	{ VOL_STATE_HDR_ATTACHING, "VOL_STATE_HDR_ATTACHING" },
+	{ VOL_STATE_SHUTTING_DOWN, "VOL_STATE_SHUTTING_DOWN" },
+	{ VOL_STATE_GOING_OFFLINE, "VOL_STATE_GOING_OFFLINE" },
+	{ VOL_STATE_OFFLINING, "VOL_STATE_OFFLINING" },
+	{ VOL_STATE_DETACHING, "VOL_STATE_DETACHING" },
+	{ VOL_STATE_SALVSYNC_REQ, "VOL_STATE_SALVSYNC_REQ" },
+	{ VOL_STATE_SALVAGING, "VOL_STATE_SALVAGING" },
+	{ VOL_STATE_ERROR, "VOL_STATE_ERROR" },
+	{ VOL_STATE_FREED, "VOL_STATE_FREED" },
+    };
+    unsigned int i;
+
+    for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+	if (names[i].code == state) {
+	    return names[i].name;
+	}
+    }
+    return "**UNKNOWN**";
+}
+
+/*
+ * Render a volume attach_flags bit mask as "NAME | NAME | ...".
+ *
+ * flags  bit mask of VOL_* attach flags
+ *
+ * Returns a pointer to a static buffer that is overwritten by the next
+ * call; the string is empty when none of the known flags is set.
+ */
+static char *
+vol_flags_to_string(afs_uint16 flags)
+{
+    /* known flags, in the order they are printed */
+    static const struct {
+	afs_uint32 bit;
+	char *name;
+    } flag_names[] = {
+	{ VOL_HDR_ATTACHED, "VOL_HDR_ATTACHED" },
+	{ VOL_HDR_LOADED, "VOL_HDR_LOADED" },
+	{ VOL_HDR_IN_LRU, "VOL_HDR_IN_LRU" },
+	{ VOL_IN_HASH, "VOL_IN_HASH" },
+	{ VOL_ON_VBYP_LIST, "VOL_ON_VBYP_LIST" },
+	{ VOL_IS_BUSY, "VOL_IS_BUSY" },
+	{ VOL_ON_VLRU, "VOL_ON_VLRU" },
+	{ VOL_HDR_DONTSALV, "VOL_HDR_DONTSALV" },
+    };
+    /* all eight names plus separators need 130 characters; 128 was too small */
+    static char str[160];
+    unsigned int i;
+    int count = 0;
+
+    str[0] = '\0';
+
+    for (i = 0; i < sizeof(flag_names) / sizeof(flag_names[0]); i++) {
+	if (!(flags & flag_names[i].bit)) {
+	    continue;
+	}
+	if (count) {
+	    strlcat(str, " | ", sizeof(str));
+	}
+	strlcat(str, flag_names[i].name, sizeof(str));
+	/*
+	 * Count every flag appended, so a separator always precedes the
+	 * next one (VOL_ON_VLRU used to be left out of the count).
+	 */
+	count++;
+    }
+
+    return str;
+}
+
+/*
+ * Map a VLRU queue index to its symbolic name.
+ * Returns "**UNKNOWN**" for indexes that are not listed.
+ */
+static char *
+vlru_idx_to_string(int idx)
+{
+    static const struct {
+	int code;
+	char *name;
+    } names[] = {
+	{ VLRU_QUEUE_NEW, "VLRU_QUEUE_NEW" },
+	{ VLRU_QUEUE_MID, "VLRU_QUEUE_MID" },
+	{ VLRU_QUEUE_OLD, "VLRU_QUEUE_OLD" },
+	{ VLRU_QUEUE_CANDIDATE, "VLRU_QUEUE_CANDIDATE" },
+	{ VLRU_QUEUE_HELD, "VLRU_QUEUE_HELD" },
+	{ VLRU_QUEUE_INVALID, "VLRU_QUEUE_INVALID" },
+    };
+    unsigned int i;
+
+    for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+	if (names[i].code == idx) {
+	    return names[i].name;
+	}
+    }
+    return "**UNKNOWN**";
+}
+#endif
+
+/*
+ * "query" subcommand: send FSYNC_VOL_QUERY and, if the response code is
+ * SYNC_OK, copy the payload into a local Volume and dump its fields to
+ * stdout. Always returns 0.
+ *
+ * NOTE(review): the payload is copied with sizeof(Volume) without checking
+ * how many bytes the server actually returned -- confirm this is safe.
+ */
+static int
+VolQuery(struct cmd_syndesc * as, char * rock)
+{
+    struct state state;
+    SYNC_PROTO_BUF_DECL(res_buf);
+    SYNC_response res;
+    Volume v;
+    int hi, lo;
+
+    /* hand do_volop() a response backed by the local protocol buffer */
+    res.hdr.response_len = sizeof(res.hdr);
+    res.payload.buf = res_buf;
+    res.payload.len = SYNC_PROTO_MAX_LEN;
+
+    common_prolog(as, &state);
+    common_volop_prolog(as, &state);
+
+    do_volop(&state, FSYNC_VOL_QUERY, &res);
+
+    if (res.hdr.response == SYNC_OK) {
+	memcpy(&v, res.payload.buf, sizeof(Volume));
+
+	/*
+	 * NOTE(review): several fields below are printed with 0x%x and look
+	 * like pointers (header, partition, handles, bitmap); if so, %x is
+	 * the wrong conversion on 64-bit builds -- confirm the field types.
+	 */
+	printf("volume = {\n");
+	printf("\thashid          = %u\n", v.hashid);
+	printf("\theader          = 0x%x\n", v.header);
+	printf("\tdevice          = %d\n", v.device);
+	printf("\tpartition       = 0x%x\n", v.partition);
+	printf("\tlinkHandle      = 0x%x\n", v.linkHandle);
+	printf("\tnextVnodeUnique = %u\n", v.nextVnodeUnique);
+	printf("\tdiskDataHandle  = 0x%x\n", v.diskDataHandle);
+	printf("\tvnodeHashOffset = %u\n", v.vnodeHashOffset);
+	printf("\tshuttingDown    = %d\n", v.shuttingDown);
+	printf("\tgoingOffline    = %d\n", v.goingOffline);
+	printf("\tcacheCheck      = %u\n", v.cacheCheck);
+	printf("\tnUsers          = %d\n", v.nUsers);
+	printf("\tneedsPutBack    = %d\n", v.needsPutBack);
+	printf("\tspecialStatus   = %d\n", v.specialStatus);
+	printf("\tupdateTime      = %u\n", v.updateTime);
+	
+	printf("\tvnodeIndex[vSmall] = {\n");
+        printf("\t\thandle       = 0x%x\n", v.vnodeIndex[vSmall].handle);
+        printf("\t\tbitmap       = 0x%x\n", v.vnodeIndex[vSmall].bitmap);
+	printf("\t\tbitmapSize   = %u\n", v.vnodeIndex[vSmall].bitmapSize);
+	printf("\t\tbitmapOffset = %u\n", v.vnodeIndex[vSmall].bitmapOffset);
+	printf("\t}\n");
+	printf("\tvnodeIndex[vLarge] = {\n");
+        printf("\t\thandle       = 0x%x\n", v.vnodeIndex[vLarge].handle);
+        printf("\t\tbitmap       = 0x%x\n", v.vnodeIndex[vLarge].bitmap);
+	printf("\t\tbitmapSize   = %u\n", v.vnodeIndex[vLarge].bitmapSize);
+	printf("\t\tbitmapOffset = %u\n", v.vnodeIndex[vLarge].bitmapOffset);
+	printf("\t}\n");
+#ifdef AFS_DEMAND_ATTACH_FS
+	/* extended fields are only dumped when the server flags them as present */
+	if (res.hdr.flags & SYNC_FLAG_DAFS_EXTENSIONS) {
+	    /* NOTE(review): updateTime was already printed above; this repeats it */
+	    printf("\tupdateTime      = %u\n", v.updateTime);
+	    printf("\tattach_state    = %s\n", vol_state_to_string(v.attach_state));
+	    printf("\tattach_flags    = %s\n", vol_flags_to_string(v.attach_flags));
+	    printf("\tnWaiters        = %d\n", v.nWaiters);
+	    printf("\tchainCacheCheck = %d\n", v.chainCacheCheck);
+	    
+	    /* online salvage structure */
+	    printf("\tsalvage = {\n");
+	    printf("\t\tprio      = %u\n", v.salvage.prio);
+	    printf("\t\treason    = %d\n", v.salvage.reason);
+	    printf("\t\trequested = %d\n", v.salvage.requested);
+	    printf("\t\tscheduled = %d\n", v.salvage.scheduled);
+	    printf("\t}\n");
+	    
+	    /* statistics structure */
+	    printf("\tstats = {\n");
+
+	    /* 64-bit counters are split into two halves and printed separately */
+	    printf("\t\thash_lookups = {\n");
+	    SplitInt64(v.stats.hash_lookups,hi,lo);
+	    printf("\t\t\thi = %u\n", hi);
+	    printf("\t\t\tlo = %u\n", lo);
+	    printf("\t\t}\n");
+
+	    printf("\t\thash_short_circuits = {\n");
+	    SplitInt64(v.stats.hash_short_circuits,hi,lo);
+	    printf("\t\t\thi = %u\n", hi);
+	    printf("\t\t\tlo = %u\n", lo);
+	    printf("\t\t}\n");
+
+	    printf("\t\thdr_loads = {\n");
+	    SplitInt64(v.stats.hdr_loads,hi,lo);
+	    printf("\t\t\thi = %u\n", hi);
+	    printf("\t\t\tlo = %u\n", lo);
+	    printf("\t\t}\n");
+
+	    printf("\t\thdr_gets = {\n");
+	    SplitInt64(v.stats.hdr_gets,hi,lo);
+	    printf("\t\t\thi = %u\n", hi);
+	    printf("\t\t\tlo = %u\n", lo);
+	    printf("\t\t}\n");
+	    
+	    printf("\t\tattaches         = %u\n", v.stats.attaches);
+	    printf("\t\tsoft_detaches    = %u\n", v.stats.soft_detaches);
+	    printf("\t\tsalvages         = %u\n", v.stats.salvages);
+	    printf("\t\tvol_ops          = %u\n", v.stats.vol_ops);
+	    
+	    printf("\t\tlast_attach      = %u\n", v.stats.last_attach);
+	    printf("\t\tlast_get         = %u\n", v.stats.last_get);
+	    printf("\t\tlast_promote     = %u\n", v.stats.last_promote);
+	    printf("\t\tlast_hdr_get     = %u\n", v.stats.last_hdr_get);
+	    printf("\t\tlast_salvage     = %u\n", v.stats.last_salvage);
+	    printf("\t\tlast_salvage_req = %u\n", v.stats.last_salvage_req);
+	    printf("\t\tlast_vol_op      = %u\n", v.stats.last_vol_op);
+	    printf("\t}\n");
+	    
+	    /* VLRU state */
+	    printf("\tvlru = {\n");
+	    printf("\t\tidx = %d (%s)\n", 
+		   v.vlru.idx, vlru_idx_to_string(v.vlru.idx));
+	    printf("\t}\n");
+
+	    /* volume op state */
+	    printf("\tpending_vol_op  = 0x%x\n", v.pending_vol_op);
+	}
+#else /* !AFS_DEMAND_ATTACH_FS */
+	/* server sent extended state that this build cannot decode */
+	if (res.hdr.flags & SYNC_FLAG_DAFS_EXTENSIONS) {
+	    printf("*** server asserted demand attach extensions. fssync-debug not built to\n");
+	    printf("*** recognize those extensions. please recompile fssync-debug if you need\n");
+	    printf("*** to dump dafs extended state\n");
+	}
+#endif /* !AFS_DEMAND_ATTACH_FS */
+	printf("}\n");
+    }
+
+    return 0;
+}
+
+/*
+ * "header" subcommand: send FSYNC_VOL_QUERY_HDR and, if the response code
+ * is SYNC_OK, copy the payload into a local VolumeDiskData and dump its
+ * fields to stdout. Always returns 0.
+ *
+ * NOTE(review): the payload is copied with sizeof(VolumeDiskData) without
+ * checking how many bytes the server actually returned -- confirm.
+ */
+static int
+VolHdrQuery(struct cmd_syndesc * as, char * rock)
+{
+    struct state state;
+    SYNC_PROTO_BUF_DECL(res_buf);
+    SYNC_response res;
+    VolumeDiskData v;
+    int i;
+
+    /* hand do_volop() a response backed by the local protocol buffer */
+    res.hdr.response_len = sizeof(res.hdr);
+    res.payload.buf = res_buf;
+    res.payload.len = SYNC_PROTO_MAX_LEN;
+
+    common_prolog(as, &state);
+    common_volop_prolog(as, &state);
+
+    do_volop(&state, FSYNC_VOL_QUERY_HDR, &res);
+
+    if (res.hdr.response == SYNC_OK) {
+	memcpy(&v, res.payload.buf, sizeof(VolumeDiskData));
+
+	printf("VolumeDiskData = {\n");
+	printf("\tstamp = {\n");
+	printf("\t\tmagic   = 0x%x\n", v.stamp.magic);
+	printf("\t\tversion = %u\n", v.stamp.version);
+	printf("\t}\n");
+	
+	printf("\tid               = %u\n", v.id);
+	printf("\tname             = '%s'\n", v.name);
+	printf("\tinUse            = %d\n", v.inUse);
+	printf("\tinService        = %d\n", v.inService);
+	printf("\tblessed          = %d\n", v.blessed);
+	printf("\tneedsSalvaged    = %d\n", v.needsSalvaged);
+	printf("\tuniquifier       = %u\n", v.uniquifier);
+	printf("\ttype             = %d\n", v.type);
+	printf("\tparentId         = %u\n", v.parentId);
+	printf("\tcloneId          = %u\n", v.cloneId);
+	printf("\tbackupId         = %u\n", v.backupId);
+	printf("\trestoredFromId   = %u\n", v.restoredFromId);
+	printf("\tneedsCallback    = %d\n", v.needsCallback);
+	printf("\tdestroyMe        = %d\n", v.destroyMe);
+	printf("\tdontSalvage      = %d\n", v.dontSalvage);
+	printf("\tmaxquota         = %d\n", v.maxquota);
+	printf("\tminquota         = %d\n", v.minquota);
+	printf("\tmaxfiles         = %d\n", v.maxfiles);
+	printf("\taccountNumber    = %u\n", v.accountNumber);
+	printf("\towner            = %u\n", v.owner);
+	printf("\tfilecount        = %d\n", v.filecount);
+	printf("\tdiskused         = %d\n", v.diskused);
+	printf("\tdayUse           = %d\n", v.dayUse);
+	/* one line per entry of the seven-element weekUse array */
+	for (i = 0; i < 7; i++) {
+	    printf("\tweekUse[%d]       = %d\n", i, v.weekUse[i]);
+	}
+	printf("\tdayUseDate       = %u\n", v.dayUseDate);
+	printf("\tcreationDate     = %u\n", v.creationDate);
+	printf("\taccessDate       = %u\n", v.accessDate);
+	printf("\tupdateDate       = %u\n", v.updateDate);
+	printf("\texpirationDate   = %u\n", v.expirationDate);
+	printf("\tbackupDate       = %u\n", v.backupDate);
+	printf("\tcopyDate         = %u\n", v.copyDate);
+	/*
+	 * NOTE(review): motd is only printed when OPENAFS_VOL_STATS is not
+	 * defined, and its label reads "mtd" -- confirm both are intended.
+	 */
+#ifdef OPENAFS_VOL_STATS
+	printf("\tstat_initialized = %d\n", v.stat_initialized);
+#else
+        printf("\tmtd              = '%s'\n", v.motd);
+#endif
+	printf("}\n");
+    }
+
+    return 0;
+}
+
+/*
+ * "volop" subcommand: send FSYNC_VOL_QUERY_VOP and, if the response code
+ * is SYNC_OK, copy the payload into a local FSSYNC_VolOp_info and dump
+ * the pending volume operation to stdout. Always returns 0.
+ *
+ * NOTE(review): the payload is copied with sizeof(FSSYNC_VolOp_info)
+ * without checking how many bytes the server actually returned -- confirm.
+ */
+static int
+VolOpQuery(struct cmd_syndesc * as, char * rock)
+{
+    struct state state;
+    SYNC_PROTO_BUF_DECL(res_buf);
+    SYNC_response res;
+    FSSYNC_VolOp_info vop;
+    int i;	/* not used in this function */
+
+    /* hand do_volop() a response backed by the local protocol buffer */
+    res.hdr.response_len = sizeof(res.hdr);
+    res.payload.buf = res_buf;
+    res.payload.len = SYNC_PROTO_MAX_LEN;
+
+    common_prolog(as, &state);
+    common_volop_prolog(as, &state);
+
+    do_volop(&state, FSYNC_VOL_QUERY_VOP, &res);
+
+    /* this warning is printed regardless of the response code */
+    if (!(res.hdr.flags & SYNC_FLAG_DAFS_EXTENSIONS)) {
+	printf("*** file server not compiled with demand attach extensions.\n");
+	printf("*** pending volume operation metadata not available.\n");
+    }
+
+    if (res.hdr.response == SYNC_OK) {
+	memcpy(&vop, res.payload.buf, sizeof(FSSYNC_VolOp_info));
+
+	printf("pending_vol_op = {\n");
+
+	/* header of the command that started the pending operation */
+	printf("\tcom = {\n");
+	printf("\t\tproto_version  = %u\n", vop.com.proto_version);
+	printf("\t\tprogramType    = %d (%s)\n", 
+	       vop.com.programType, program_type_to_string(vop.com.programType));
+	printf("\t\tcommand        = %d (%s)\n", 
+	       vop.com.command, command_code_to_string(vop.com.command));
+	printf("\t\treason         = %d (%s)\n", 
+	       vop.com.reason, reason_code_to_string(vop.com.reason));
+	printf("\t\tcommand_len    = %u\n", vop.com.command_len);
+	printf("\t\tflags          = 0x%x\n", vop.com.flags);
+	printf("\t}\n");
+
+	printf("\tvop = {\n");
+	printf("\t\tvolume         = %u\n", vop.vop.volume);
+	/* only print partName if a terminator is found inside the array */
+	if (afs_strnlen(vop.vop.partName, sizeof(vop.vop.partName)) <
+	    sizeof(vop.vop.partName)) {
+	    printf("\t\tpartName       = '%s'\n", vop.vop.partName);
+	} else {
+	    printf("\t\tpartName       = (illegal string)\n");
+	}
+	printf("\t}\n");
+
+	printf("}\n");
+    }
+
+    return 0;
+}
+
+/* "stats" subcommand: pick an FSYNC_VOL_STATS_* request from the command
+ * line, issue it via FSYNC_StatsOp(), and print any payload that comes back
+ * with a SYNC_OK response.  Returns 0; usage errors and "help" call exit(). */
+static int
+StatsQuery(struct cmd_syndesc * as, char * rock)
+{
+    afs_int32 code;
+    int command;
+    struct cmd_item *ti;
+    struct state state;
+    SYNC_PROTO_BUF_DECL(res_buf);
+    SYNC_response res;
+    FSSYNC_StatsOp_hdr scom;
+    union {
+	void * ptr;
+	struct VolPkgStats * vol_stats;
+	struct VolumeHashChainStats * hash_stats;
+#ifdef AFS_DEMAND_ATTACH_FS
+	struct volume_hdr_LRU_stats * hdr_stats;
+#endif
+	struct DiskPartitionStats * vicep_stats;
+    } sres;
+
+    memset(&scom, 0, sizeof(scom));	/* never send uninitialized stack bytes */
+    sres.ptr = res_buf;
+    res.hdr.response_len = sizeof(res.hdr);
+    res.payload.buf = res_buf;
+    res.payload.len = SYNC_PROTO_MAX_LEN;
+
+    if ((ti = as->parms[CUSTOM_PARMS_OFFSET].items)) {	/* -subcommand */
+	if (!strcasecmp(ti->data, "vicep")) {
+	    command = FSYNC_VOL_STATS_VICEP;
+	} else if (!strcasecmp(ti->data, "hash")) {
+	    command = FSYNC_VOL_STATS_HASH;
+#ifdef AFS_DEMAND_ATTACH_FS
+	} else if (!strcasecmp(ti->data, "hdr")) {
+	    command = FSYNC_VOL_STATS_HDR;
+	} else if (!strcasecmp(ti->data, "vlru")) {
+	    command = FSYNC_VOL_STATS_VLRU;
+#endif
+	} else if (!strcasecmp(ti->data, "pkg")) {
+	    command = FSYNC_VOL_STATS_GENERAL;
+	} else if (!strcasecmp(ti->data, "help")) {
+	    fprintf(stderr, "fssync-debug stats subcommands:\n");
+	    fprintf(stderr, "\tpkg\tgeneral volume package stats\n");
+	    fprintf(stderr, "\tvicep\tvice partition stats\n");
+	    fprintf(stderr, "\thash\tvolume hash chain stats\n");
+#ifdef AFS_DEMAND_ATTACH_FS
+	    fprintf(stderr, "\thdr\tvolume header cache stats\n");
+	    fprintf(stderr, "\tvlru\tvlru generation stats\n");
+#endif
+	    exit(0);
+	} else {
+	    fprintf(stderr, "invalid stats subcommand\n");
+	    exit(1);
+	}
+    } else {
+	command = FSYNC_VOL_STATS_GENERAL;
+    }
+
+    if ((ti = as->parms[CUSTOM_PARMS_OFFSET+1].items)) {	/* -arg1 */
+	switch (command) {
+	case FSYNC_VOL_STATS_VICEP:
+	    /* bound the copy by the destination buffer itself */
+	    strlcpy(scom.args.partName, ti->data, sizeof(scom.args.partName));
+	    break;
+	case FSYNC_VOL_STATS_HASH:
+	    scom.args.hash_bucket = atoi(ti->data);
+	    break;
+	case FSYNC_VOL_STATS_VLRU:
+	    scom.args.vlru_generation = atoi(ti->data);
+	    break;
+	default:
+	    fprintf(stderr, "unrecognized arguments\n");
+	    exit(1);
+	}
+    } else {
+	switch (command) {
+	case FSYNC_VOL_STATS_VICEP:
+	case FSYNC_VOL_STATS_HASH:
+	case FSYNC_VOL_STATS_VLRU:
+	    fprintf(stderr, "this subcommand requires more parameters\n");
+	    exit(1);
+	}
+    }
+
+    common_prolog(as, &state);
+
+    fprintf(stderr, "calling FSYNC_askfs with command code %d (%s)\n",
+	    command, command_code_to_string(command));
+
+    code = FSYNC_StatsOp(&scom, command, FSYNC_WHATEVER, &res);
+
+    if (code != SYNC_OK && code != SYNC_DENIED) {
+	fprintf(stderr, "possible sync protocol error. return code was %d\n", code);
+    }
+
+    fprintf(stderr, "FSYNC_StatsOp returned %d (%s)\n", code, response_code_to_string(code));
+    fprintf(stderr, "protocol response code was %d (%s)\n",
+	    res.hdr.response, response_code_to_string(res.hdr.response));
+    fprintf(stderr, "protocol reason code was %d (%s)\n",
+	    res.hdr.reason, reason_code_to_string(res.hdr.reason));
+
+    VDisconnectFS();
+
+    if (res.hdr.response == SYNC_OK) {
+	switch (command) {	/* FSYNC_VOL_STATS_VLRU has no printer here */
+	case FSYNC_VOL_STATS_GENERAL:
+	    print_vol_stats_general(sres.vol_stats);
+	    break;
+	case FSYNC_VOL_STATS_VICEP:
+	    print_vol_stats_viceP(sres.vicep_stats);
+	    break;
+	case FSYNC_VOL_STATS_HASH:
+	    print_vol_stats_hash(sres.hash_stats);
+	    break;
+#ifdef AFS_DEMAND_ATTACH_FS
+	case FSYNC_VOL_STATS_HDR:
+	    print_vol_stats_hdr(sres.hdr_stats);
+	    break;
+#endif /* AFS_DEMAND_ATTACH_FS */
+	}
+    }
+
+    return 0;
+}
+
+/* Print a VolPkgStats reply on stdout.  Each 64-bit counter is split with
+ * SplitInt64() and shown as separate 32-bit "hi" and "lo" values. */
+static void
+print_vol_stats_general(VolPkgStats * stats)
+{
+    int i;
+    afs_uint32 hi, lo;
+
+    printf("VolPkgStats = {\n");
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* one line per volume state, then the demand-attach-only counters */
+    for (i = 0; i < VOL_STATE_COUNT; i++)
+	printf("\tvol_state_count[%s] = %d\n",
+	       vol_state_to_string(i), stats->state_levels[i]);
+
+    SplitInt64(stats->hash_looks, hi, lo);
+    printf("\thash_looks = {\n"
+	   "\t\thi = %u\n"
+	   "\t\tlo = %u\n"
+	   "\t}\n", hi, lo);
+
+    SplitInt64(stats->hash_reorders, hi, lo);
+    printf("\thash_reorders = {\n"
+	   "\t\thi = %u\n"
+	   "\t\tlo = %u\n"
+	   "\t}\n", hi, lo);
+
+    SplitInt64(stats->salvages, hi, lo);
+    printf("\tsalvages = {\n"
+	   "\t\thi = %u\n"
+	   "\t\tlo = %u\n"
+	   "\t}\n", hi, lo);
+
+    SplitInt64(stats->vol_ops, hi, lo);
+    printf("\tvol_ops = {\n"
+	   "\t\thi = %u\n"
+	   "\t\tlo = %u\n"
+	   "\t}\n", hi, lo);
+#endif
+    SplitInt64(stats->hdr_loads, hi, lo);
+    printf("\thdr_loads = {\n"
+	   "\t\thi = %u\n"
+	   "\t\tlo = %u\n"
+	   "\t}\n", hi, lo);
+
+    SplitInt64(stats->hdr_gets, hi, lo);
+    printf("\thdr_gets = {\n"
+	   "\t\thi = %u\n"
+	   "\t\tlo = %u\n"
+	   "\t}\n", hi, lo);
+
+    SplitInt64(stats->attaches, hi, lo);
+    printf("\tattaches = {\n"
+	   "\t\thi = %u\n"
+	   "\t\tlo = %u\n"
+	   "\t}\n", hi, lo);
+
+    SplitInt64(stats->soft_detaches, hi, lo);
+    printf("\tsoft_detaches = {\n"
+	   "\t\thi = %u\n"
+	   "\t\tlo = %u\n"
+	   "\t}\n", hi, lo);
+
+    printf("\thdr_cache_size = %d\n"
+	   "}\n", stats->hdr_cache_size);
+}
+
+/* Print the fields of a DiskPartitionStats reply, one per line, on stdout. */
+static void
+print_vol_stats_viceP(struct DiskPartitionStats * stats)
+{
+    printf("DiskPartitionStats = {\n"
+	   "\tfree = %d\n\tminFree = %d\n"
+	   "\ttotalUsable = %d\n\tf_files = %d\n",
+	   stats->free, stats->minFree, stats->totalUsable, stats->f_files);
+#ifdef AFS_DEMAND_ATTACH_FS
+    printf("\tvol_list_len = %d\n", stats->vol_list_len);
+#endif
+    printf("}\n");
+}
+
+/* Print a VolumeHashChainStats reply (statistics for one volume hash chain)
+ * on stdout; 64-bit counters appear as separate 32-bit hi/lo values. */
+static void
+print_vol_stats_hash(struct VolumeHashChainStats * stats)
+{
+    afs_uint32 hi, lo;
+
+    printf("VolumeHashChainStats = {\n");
+    printf("\ttable_size = %d\n", stats->table_size);
+    printf("\tchain_len = %d\n", stats->chain_len);
+#ifdef AFS_DEMAND_ATTACH_FS
+    printf("\tchain_cacheCheck = %d\n", stats->chain_cacheCheck);
+    printf("\tchain_busy = %d\n", stats->chain_busy);
+
+    SplitInt64(stats->chain_looks, hi, lo);
+    printf("\tchain_looks = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+
+    SplitInt64(stats->chain_gets, hi, lo);
+    printf("\tchain_gets = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+
+    SplitInt64(stats->chain_reorders, hi, lo);
+    printf("\tchain_reorders = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+#endif /* AFS_DEMAND_ATTACH_FS */
+    printf("}\n");
+}
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* Print the free/used/attached counts of a volume_hdr_LRU_stats reply. */
+static void
+print_vol_stats_hdr(struct volume_hdr_LRU_stats * stats)
+{
+    printf("volume_hdr_LRU_stats = {\n"
+	   "\tfree = %d\n\tused = %d\n\tattached = %d\n"
+	   "}\n",
+	   stats->free, stats->used, stats->attached);
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
diff --git a/src/vol/fssync-server.c b/src/vol/fssync-server.c
new file mode 100644
index 0000000000..44494ca739
--- /dev/null
+++ b/src/vol/fssync-server.c
@@ -0,0 +1,1179 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
+ */
+
+/*
+	System:		VICE-TWO
+	Module:		fssync.c
+	Institution:	The Information Technology Center, Carnegie-Mellon University
+
+ */
+#ifdef notdef
+
+/* All this is going away in early 1989 */
+int newVLDB;			/* Compatibility flag */
+
+#endif
+static int newVLDB = 1;
+
+
+#ifndef AFS_PTHREAD_ENV
+#define USUAL_PRIORITY (LWP_MAX_PRIORITY - 2)
+
+/*
+ * stack size increased from 8K because the HP machine seemed to have trouble
+ * with the smaller stack
+ */
+#define USUAL_STACK_SIZE	(24 * 1024)
+#endif /* !AFS_PTHREAD_ENV */
+
+/*
+   fssync-server.c
+   File server synchronization with external volume utilities.
+   server-side implementation
+ */
+
+/* This controls the size of an fd_set; it must be defined early before
+ * the system headers define that type and the macros that operate on it.
+ * Its value should be as large as the maximum file descriptor limit we
+ * are likely to run into on any platform.  Right now, that is 65536
+ * which is the default hard fd limit on Solaris 9 */
+#ifndef _WIN32
+#define FD_SETSIZE 65536
+#endif
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <sys/types.h>
+#include <stdio.h>
+#ifdef AFS_NT40_ENV
+#include <winsock2.h>
+#include <time.h>
+#else
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <sys/time.h>
+#endif
+#include <errno.h>
+#ifdef AFS_PTHREAD_ENV
+#include <assert.h>
+#else /* AFS_PTHREAD_ENV */
+#include <afs/assert.h>
+#endif /* AFS_PTHREAD_ENV */
+#include <signal.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+
+
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include "nfs.h"
+#include <afs/errors.h>
+#include "daemon_com.h"
+#include "fssync.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+
+
+#ifdef FSSYNC_BUILD_SERVER
+
+/*@printflike@*/ extern void Log(const char *format, ...);
+
+#ifdef osi_Assert
+#undef osi_Assert
+#endif
+#define osi_Assert(e) (void)(e)
+
+int (*V_BreakVolumeCallbacks) ();
+
+#define MAXHANDLERS	4	/* Up to 4 clients; must be at least 2, so that
+				 * move = dump+restore can run on single server */
+#define MAXOFFLINEVOLUMES 128	/* This needs to be as big as the maximum
+				 * number that would be offline for 1 operation.
+				 * Current winner is salvage, which needs all
+				 * cloned read-only copies offline when salvaging
+				 * a single read-write volume */
+
+#define MAX_BIND_TRIES	5	/* Number of times to retry socket bind */
+
+
+
+static struct offlineInfo OfflineVolumes[MAXHANDLERS][MAXOFFLINEVOLUMES];
+
+static int AcceptSd = -1;	/* Socket used by server for accepting connections */
+
+static int getport();
+
+/* Forward declarations */
+static void FSYNC_sync();
+static void FSYNC_newconnection();
+static void FSYNC_com();
+static void FSYNC_Drop();
+static void AcceptOn();
+static void AcceptOff();
+static void InitHandler();
+static void CallHandler(fd_set * fdsetp);
+static int AddHandler();
+static int FindHandler();
+static int FindHandler_r();
+static int RemoveHandler();
+static void GetHandler(fd_set * fdsetp, int *maxfdp);
+
+extern int LogLevel;
+
+static afs_int32 FSYNC_com_VolOp(int fd, SYNC_command * com, SYNC_response * res);
+
+static afs_int32 FSYNC_com_VolOn(FSSYNC_VolOp_command * com, SYNC_response * res);
+static afs_int32 FSYNC_com_VolOff(FSSYNC_VolOp_command * com, SYNC_response * res);
+static afs_int32 FSYNC_com_VolMove(FSSYNC_VolOp_command * com, SYNC_response * res);
+static afs_int32 FSYNC_com_VolBreakCBKs(FSSYNC_VolOp_command * com, SYNC_response * res);
+static afs_int32 FSYNC_com_VolDone(FSSYNC_VolOp_command * com, SYNC_response * res);
+static afs_int32 FSYNC_com_VolQuery(FSSYNC_VolOp_command * com, SYNC_response * res);
+static afs_int32 FSYNC_com_VolHdrQuery(FSSYNC_VolOp_command * com, SYNC_response * res);
+#ifdef AFS_DEMAND_ATTACH_FS
+static afs_int32 FSYNC_com_VolOpQuery(FSSYNC_VolOp_command * com, SYNC_response * res);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+static afs_int32 FSYNC_com_StatsOp(int fd, SYNC_command * com, SYNC_response * res);
+
+static afs_int32 FSYNC_com_StatsOpGeneral(FSSYNC_StatsOp_command * scom, SYNC_response * res);
+static afs_int32 FSYNC_com_StatsOpViceP(FSSYNC_StatsOp_command * scom, SYNC_response * res);
+static afs_int32 FSYNC_com_StatsOpHash(FSSYNC_StatsOp_command * scom, SYNC_response * res);
+static afs_int32 FSYNC_com_StatsOpHdr(FSSYNC_StatsOp_command * scom, SYNC_response * res);
+static afs_int32 FSYNC_com_StatsOpVLRU(FSSYNC_StatsOp_command * scom, SYNC_response * res);
+
+
+static void FSYNC_com_to_info(FSSYNC_VolOp_command * vcom, FSSYNC_VolOp_info * info);
+
+
+/*
+ * This lock controls access to the handler array. The overhead
+ * is minimal in non-preemptive environments.
+ */
+struct Lock FSYNC_handler_lock;
+
+void
+FSYNC_fsInit(void)
+{
+#ifdef AFS_PTHREAD_ENV
+    pthread_t tid;
+    pthread_attr_t tattr;
+#else /* AFS_PTHREAD_ENV */
+    PROCESS pid;
+#endif /* AFS_PTHREAD_ENV */
+    /* Set up the handler-array lock, then start FSYNC_sync in its own thread. */
+    Lock_Init(&FSYNC_handler_lock);
+    /* NOTE(review): the calls sit inside assert(); confirm asserts are never compiled out. */
+#ifdef AFS_PTHREAD_ENV
+    assert(pthread_attr_init(&tattr) == 0);
+    assert(pthread_attr_setdetachstate(&tattr, PTHREAD_CREATE_DETACHED) == 0);
+    assert(pthread_create(&tid, &tattr, FSYNC_sync, NULL) == 0);	/* detached pthread */
+#else /* AFS_PTHREAD_ENV */
+    assert(LWP_CreateProcess
+	   (FSYNC_sync, USUAL_STACK_SIZE, USUAL_PRIORITY, (void *)0,
+	    "FSYNC_sync", &pid) == LWP_SUCCESS);
+#endif /* AFS_PTHREAD_ENV */
+}
+
+static fd_set FSYNC_readfds;
+
+static int
+getport(struct sockaddr_in *addr)
+{
+    int sd;
+    /* Create a TCP socket and fill *addr with loopback port 2040; the caller binds it. */
+    memset(addr, 0, sizeof(*addr));
+    assert((sd = socket(AF_INET, SOCK_STREAM, 0)) >= 0);
+#ifdef STRUCT_SOCKADDR_HAS_SA_LEN
+    addr->sin_len = sizeof(struct sockaddr_in);
+#endif
+    addr->sin_addr.s_addr = htonl(0x7f000001);	/* 127.0.0.1 */
+    addr->sin_family = AF_INET;	/* was localhost->h_addrtype */
+    addr->sin_port = htons(2040);	/* XXXX htons not _really_ necessary */
+
+    return sd;
+}
+
+
+static void
+FSYNC_sync()
+{
+    struct sockaddr_in addr;
+    int on = 1;
+    extern int VInit;
+    int code;
+    int numTries;
+#ifdef AFS_PTHREAD_ENV
+    int tid;
+#endif
+    /* Thread body: wait for VInit, bind and listen on the socket, then select() forever. */
+#ifndef AFS_NT40_ENV
+    (void)signal(SIGPIPE, SIG_IGN);
+#endif
+
+#ifdef AFS_PTHREAD_ENV
+    /* set our 'thread-id' so that the host hold table works */
+    MUTEX_ENTER(&rx_stats_mutex);	/* protects rxi_pthread_hinum */
+    tid = ++rxi_pthread_hinum;
+    MUTEX_EXIT(&rx_stats_mutex);
+    pthread_setspecific(rx_thread_id_key, (void *)tid);
+    Log("Set thread id %d for FSYNC_sync\n", tid);
+#endif /* AFS_PTHREAD_ENV */
+
+    while (!VInit) {
+	/* Let somebody else run until level > 0.  That doesn't mean that 
+	 * all volumes have been attached. */
+#ifdef AFS_PTHREAD_ENV
+	pthread_yield();
+#else /* AFS_PTHREAD_ENV */
+	LWP_DispatchProcess();
+#endif /* AFS_PTHREAD_ENV */
+    }
+    AcceptSd = getport(&addr);
+    /* Reuseaddr needed because system inexplicably leaves crud lying around */
+    code =
+	setsockopt(AcceptSd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
+		   sizeof(on));
+    if (code)
+	Log("FSYNC_sync: setsockopt failed with (%d)\n", errno);
+
+    for (numTries = 0; numTries < MAX_BIND_TRIES; numTries++) {
+	if ((code =
+	     bind(AcceptSd, (struct sockaddr *)&addr, sizeof(addr))) == 0)
+	    break;
+	Log("FSYNC_sync: bind failed with (%d), will sleep and retry\n",
+	    errno);
+	sleep(5);
+    }
+    assert(!code);	/* fires when all MAX_BIND_TRIES bind attempts failed */
+    listen(AcceptSd, 100);	/* NOTE(review): return value is not checked */
+    InitHandler();
+    AcceptOn();
+    for (;;) {
+	int maxfd;
+	GetHandler(&FSYNC_readfds, &maxfd);
+	/* Note: check for >= 1 below is essential since IOMGR_select
+	 * doesn't have exactly same semantics as select.
+	 */
+#ifdef AFS_PTHREAD_ENV
+	if (select(maxfd + 1, &FSYNC_readfds, NULL, NULL, NULL) >= 1)
+#else /* AFS_PTHREAD_ENV */
+	if (IOMGR_Select(maxfd + 1, &FSYNC_readfds, NULL, NULL, NULL) >= 1)
+#endif /* AFS_PTHREAD_ENV */
+	    CallHandler(&FSYNC_readfds);
+    }
+}
+
+static void
+FSYNC_newconnection(int afd)
+{
+    /* accept one pending client on afd; its requests are handled by FSYNC_com */
+    struct sockaddr_in peer;
+    int peerlen = sizeof(peer);
+    int fd = accept(afd, (struct sockaddr *)&peer, &peerlen);
+    if (fd == -1) {
+	Log("FSYNC_newconnection:  accept failed, errno==%d\n", errno);
+	assert(1 == 2);
+    } else if (!AddHandler(fd, FSYNC_com)) {
+	AcceptOff();
+	assert(AddHandler(fd, FSYNC_com));
+    }
+}
+
+/* this function processes commands from an fssync file descriptor (fd) */
+afs_int32 FS_cnt = 0;	/* incremented on every FSYNC_com call; logged on read failure */
+static void
+FSYNC_com(int fd)
+{
+    SYNC_command com;
+    SYNC_response res;
+    SYNC_PROTO_BUF_DECL(com_buf);
+    SYNC_PROTO_BUF_DECL(res_buf);
+    /* Read one request from fd, dispatch it by command code, and send the reply. */
+    memset(&res.hdr, 0, sizeof(res.hdr));
+    /* point both messages at their local buffers; the reply starts as header-only */
+    com.payload.buf = (void *)com_buf;
+    com.payload.len = SYNC_PROTO_MAX_LEN;
+    res.hdr.response_len = sizeof(res.hdr);
+    res.hdr.proto_version = FSYNC_PROTO_VERSION;
+    res.payload.len = SYNC_PROTO_MAX_LEN;
+    res.payload.buf = (void *)res_buf;
+
+    FS_cnt++;
+    if (SYNC_getCom(fd, &com)) {
+	Log("FSYNC_com:  read failed; dropping connection (cnt=%d)\n", FS_cnt);
+	FSYNC_Drop(fd);
+	return;
+    }
+    /* a protocol version mismatch gets an error reply and the connection is dropped */
+    if (com.hdr.proto_version != FSYNC_PROTO_VERSION) {
+	Log("FSYNC_com:  invalid protocol version (%u)\n", com.hdr.proto_version);
+	res.hdr.response = SYNC_COM_ERROR;
+	res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+	goto respond;
+    }
+    /* the command handlers are called between VOL_LOCK and VOL_UNLOCK */
+    VOL_LOCK;
+    switch (com.hdr.command) {
+    case FSYNC_VOL_ON:
+    case FSYNC_VOL_OFF:
+    case FSYNC_VOL_LISTVOLUMES:
+    case FSYNC_VOL_NEEDVOLUME:
+    case FSYNC_VOL_MOVE:
+    case FSYNC_VOL_BREAKCBKS:
+    case FSYNC_VOL_DONE:
+    case FSYNC_VOL_QUERY:
+    case FSYNC_VOL_QUERY_HDR:
+    case FSYNC_VOL_QUERY_VOP:
+	res.hdr.response = FSYNC_com_VolOp(fd, &com, &res);
+	break;
+    case FSYNC_VOL_STATS_GENERAL:
+    case FSYNC_VOL_STATS_VICEP:
+    case FSYNC_VOL_STATS_HASH:
+    case FSYNC_VOL_STATS_HDR:
+    case FSYNC_VOL_STATS_VLRU:
+	res.hdr.response = FSYNC_com_StatsOp(fd, &com, &res);
+	break;
+    case SYNC_COM_CHANNEL_CLOSE:
+	res.hdr.response = SYNC_OK;
+	res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+	break;
+    default:
+	res.hdr.response = SYNC_BAD_COMMAND;
+	break;
+    }
+    VOL_UNLOCK;
+
+ respond:
+    SYNC_putRes(fd, &res);
+    if (res.hdr.flags & SYNC_FLAG_CHANNEL_SHUTDOWN) {
+	FSYNC_Drop(fd);
+    }
+}
+
+static afs_int32
+FSYNC_com_VolOp(int fd, SYNC_command * com, SYNC_response * res)
+{
+    int i;
+    afs_int32 code = SYNC_OK;
+    FSSYNC_VolOp_command vcom;
+    /* Check the packet size, find any tracked offline-volume slot, then dispatch. */
+    if (com->recv_len != (sizeof(com->hdr) + sizeof(FSSYNC_VolOp_hdr))) {
+	res->hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+	res->hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+	return SYNC_COM_ERROR;
+    }
+    /* the payload buffer is interpreted in place as a FSSYNC_VolOp_hdr */
+    vcom.hdr = &com->hdr;
+    vcom.vop = (FSSYNC_VolOp_hdr *) com->payload.buf;
+    vcom.com = com;
+    /* vcom.v: entry matching (volume, partName) in the row picked by FindHandler(fd), else NULL */
+    vcom.volumes = OfflineVolumes[FindHandler(fd)];
+    for (vcom.v = NULL, i = 0; i < MAXOFFLINEVOLUMES; i++) {
+	if ((vcom.volumes[i].volumeID == vcom.vop->volume) &&
+	    (strncmp(vcom.volumes[i].partName, vcom.vop->partName,
+		     sizeof(vcom.volumes[i].partName)) == 0)) {
+	    vcom.v = &vcom.volumes[i];
+	    break;
+	}
+    }
+
+    switch (com->hdr.command) {
+    case FSYNC_VOL_ON:
+	code = FSYNC_com_VolOn(&vcom, res);
+	break;
+    case FSYNC_VOL_OFF:
+    case FSYNC_VOL_NEEDVOLUME:
+	code = FSYNC_com_VolOff(&vcom, res);
+	break;
+    case FSYNC_VOL_LISTVOLUMES:
+	code = SYNC_OK;	/* accepted, but no work is done for it here */
+	break;
+    case FSYNC_VOL_MOVE:
+	code = FSYNC_com_VolMove(&vcom, res);
+	break;
+    case FSYNC_VOL_BREAKCBKS:
+	code = FSYNC_com_VolBreakCBKs(&vcom, res);
+	break;
+    case FSYNC_VOL_DONE:
+	code = FSYNC_com_VolDone(&vcom, res);
+	break;
+    case FSYNC_VOL_QUERY:
+	code = FSYNC_com_VolQuery(&vcom, res);
+	break;
+    case FSYNC_VOL_QUERY_HDR:
+	code = FSYNC_com_VolHdrQuery(&vcom, res);
+	break;
+#ifdef AFS_DEMAND_ATTACH_FS
+    case FSYNC_VOL_QUERY_VOP:
+	code = FSYNC_com_VolOpQuery(&vcom, res);
+	break;
+#endif /* AFS_DEMAND_ATTACH_FS */
+    default:
+	code = SYNC_BAD_COMMAND;
+    }
+
+    return code;
+}
+
+static afs_int32
+FSYNC_com_VolOn(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    char tvolName[VMAXPATHLEN];
+    Volume * vp;
+    Error error;
+    /* FSYNC_VOL_ON: clear any tracking slot for the volume and attach it again. */
+    if (SYNC_verifyProtocolString(vcom->vop->partName, sizeof(vcom->vop->partName))) {
+	res->hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+	code = SYNC_FAILED;
+	goto done;
+    }
+
+    /*
+      This is where a detached volume gets reattached. However in the
+      special case where the volume is merely busy, it is already
+      attached and it is only necessary to clear the busy flag. See
+      defect #2080 for details.
+    */
+
+    /* is the volume already attached? */
+#ifdef	notdef
+    /*
+     * XXX With the following enabled we had bizarre problems where the backup id would
+     * be reset to 0; that was due to the interaction between fileserver/volserver in that they
+     * both keep volumes in memory and the changes wouldn't be made to the fileserver. Some of
+     * the problems were due to refcnt changes as result of VGetVolume/VPutVolume which would call
+     * VOffline, etc. when we don't want to; someday the whole #2080 issue should be revisited to
+     * be done right XXX
+     */
+    vp = VGetVolume_r(&error, vcom->vop->volume);
+    if (vp) {
+	/* yep, is the BUSY flag set? */
+	if (vp->specialStatus == VBUSY) {
+
+	    /* yep, clear BUSY flag */
+
+	    vp->specialStatus = 0;
+	    /* make sure vol is online */
+	    if (vcom->v) {
+		vcom->v->volumeID = 0;
+		V_inUse(vp) = 1;	/* online */
+	    }
+	    VPutVolume_r(vp);
+	    break;
+	}
+	VPutVolume_r(vp);
+    }
+#endif /* notdef */
+
+    /* so, we need to attach the volume */
+
+    if (vcom->v)
+	vcom->v->volumeID = 0;	/* releases the tracking slot (0 marks a free entry) */
+    tvolName[0] = '/';
+    snprintf(&tvolName[1], sizeof(tvolName)-1, VFORMAT, vcom->vop->volume);
+    tvolName[sizeof(tvolName)-1] = '\0';
+    /* tvolName now holds "/" followed by the VFORMAT rendering of the volume id */
+#ifdef AFS_DEMAND_ATTACH_FS
+    vp = VPreAttachVolumeByName_r(&error, vcom->vop->partName, tvolName,
+				  V_VOLUPD);
+    if (vp && vp->pending_vol_op) {
+	VDeregisterVolOp_r(vp, vp->pending_vol_op);
+    }
+#else /* AFS_DEMAND_ATTACH_FS */
+    vp = VAttachVolumeByName_r(&error, vcom->vop->partName, tvolName,
+			       V_VOLUPD);
+    if (vp)
+	VPutVolume_r(vp);
+#endif /* AFS_DEMAND_ATTACH_FS */
+    /* an attach error is reported as SYNC_DENIED with the error value as the reason */
+    if (error) {
+	code = SYNC_DENIED;
+	res->hdr.reason = error;
+    }
+
+ done:
+    return code;
+}
+
+static afs_int32
+FSYNC_com_VolOff(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    FSSYNC_VolOp_info info;
+    afs_int32 code = SYNC_OK;
+    int i;
+    Volume * vp, * nvp;
+    Error error;
+    /* Handles FSYNC_VOL_OFF and FSYNC_VOL_NEEDVOLUME: hand a volume over to a utility. */
+    if (SYNC_verifyProtocolString(vcom->vop->partName, sizeof(vcom->vop->partName))) {
+	res->hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+	code = SYNC_FAILED;
+	goto done;
+    }
+
+    /* not already offline, we need to find a slot for newly offline volume */
+    if (vcom->hdr->programType == debugUtility) {
+	/* debug utilities do not have their operations tracked */
+	vcom->v = NULL;
+    } else {
+	if (!vcom->v) {
+	    for (i = 0; i < MAXOFFLINEVOLUMES; i++) {
+		if (vcom->volumes[i].volumeID == 0) {
+		    vcom->v = &vcom->volumes[i];
+		    break;
+		}
+	    }
+	}
+	if (!vcom->v) {
+	    goto deny;	/* every tracking slot for this handler is in use */
+	}
+    }
+    /* info is handed to the VRegisterVolOp_r / VVolOp*_r calls below */
+    FSYNC_com_to_info(vcom, &info);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    vp = VLookupVolume_r(&error, vcom->vop->volume, NULL);
+#else
+    vp = VGetVolume_r(&error, vcom->vop->volume);
+#endif
+
+    if (vp) {
+	if ((vcom->vop->partName[0] != 0) &&
+	    (strncmp(vcom->vop->partName, vp->partition->name, 
+		    sizeof(vcom->vop->partName)) != 0)) {
+	    /* volume on desired partition is not online, so we
+	     * should treat this as an offline volume.
+	     */
+#ifndef AFS_DEMAND_ATTACH_FS
+	    VPutVolume_r(vp);
+#endif
+	    vp = NULL;
+	    goto done;	/* returns SYNC_OK; the tracking slot is left unwritten */
+	}
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (vp) {
+	ProgramType type = (ProgramType) vcom->hdr->programType;
+
+	/* do initial filtering of requests */
+
+	/* enforce mutual exclusion for volume ops */
+	if (vp->pending_vol_op) {
+	    if (vp->pending_vol_op->com.programType != type) {
+		Log("volume %u already checked out\n", vp->hashid);
+		/* XXX debug */
+		Log("vp->vop = { com = { ver=%u, prog=%d, com=%d, reason=%d, len=%u, flags=0x%x }, vop = { vol=%u, part='%s' } }\n",
+		    vp->pending_vol_op->com.proto_version, 
+		    vp->pending_vol_op->com.programType,
+		    vp->pending_vol_op->com.command,
+		    vp->pending_vol_op->com.reason,
+		    vp->pending_vol_op->com.command_len,
+		    vp->pending_vol_op->com.flags,
+		    vp->pending_vol_op->vop.volume,
+		    vp->pending_vol_op->vop.partName );
+		Log("vcom = { com = { ver=%u, prog=%d, com=%d, reason=%d, len=%u, flags=0x%x } , vop = { vol=%u, part='%s' } }\n",
+		    vcom->hdr->proto_version,
+		    vcom->hdr->programType,
+		    vcom->hdr->command,
+		    vcom->hdr->reason,
+		    vcom->hdr->command_len,
+		    vcom->hdr->flags,
+		    vcom->vop->volume,
+		    vcom->vop->partName);
+		res->hdr.reason = FSYNC_EXCLUSIVE;
+		goto deny;
+	    } else {
+		Log("warning: volume %u recursively checked out by programType id %d\n",
+		    vp->hashid, vcom->hdr->programType);
+	    }
+	}
+
+	/* filter based upon requestor
+	 *
+	 * volume utilities are not allowed to check out volumes
+	 * which are in an error state
+	 *
+	 * unknown utility programs will be denied on principle
+	 */
+	switch (type) {
+	case salvageServer:
+	case debugUtility:
+	    /* give the salvageserver lots of liberty */
+	    break;
+	case volumeUtility:
+	    if ((V_attachState(vp) == VOL_STATE_ERROR) ||
+		(V_attachState(vp) == VOL_STATE_SALVAGING)) {
+		goto deny;
+	    }
+	    break;
+	default:
+	    Log("bad program type passed to FSSYNC\n");
+	    goto deny;
+	}
+
+	/* short circuit for offline volume states
+	 * so we can avoid I/O penalty of attachment */
+	switch (V_attachState(vp)) {
+	case VOL_STATE_UNATTACHED:
+	case VOL_STATE_PREATTACHED:
+	case VOL_STATE_SALVAGING:
+	case VOL_STATE_ERROR:
+	    /* register the volume operation metadata with the volume
+	     *
+	     * if the volume is currently pre-attached, attach2()
+	     * will evaluate the vol op metadata to determine whether
+	     * attaching the volume would be safe */
+	    VRegisterVolOp_r(vp, &info);
+	    goto done;
+	default:
+	    break;
+	}
+
+	/* convert to heavyweight ref */
+	nvp = VGetVolumeByVp_r(&error, vp);
+
+	/* register the volume operation metadata with the volume */
+	VRegisterVolOp_r(vp, &info);
+	/* NOTE(review): the vol op is registered even if nvp is NULL and the request is denied — confirm */
+	if (!nvp) {
+	    Log("FSYNC_com_VolOff: failed to get heavyweight reference to volume %u\n",
+		vcom->vop->volume);
+	    res->hdr.reason = FSYNC_VOL_PKG_ERROR;
+	    goto deny;
+	}
+	vp = nvp;
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    if (vp) {
+	if (VVolOpLeaveOnline_r(vp, &info)) {
+	    VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT);	/* At least get volume stats right */
+	    if (LogLevel) {
+		Log("FSYNC: Volume %u (%s) was left on line for an external %s request\n", 
+		    V_id(vp), V_name(vp), 
+		    vcom->hdr->reason == V_CLONE ? "clone" : 
+		    vcom->hdr->reason == V_READONLY ? "readonly" : 
+		    vcom->hdr->reason == V_DUMP ? "dump" : 
+		    "UNKNOWN");
+	    }
+	    VPutVolume_r(vp);
+	} else {
+	    if (VVolOpSetVBusy_r(vp, &info)) {
+		vp->specialStatus = VBUSY;
+	    }
+
+	    /* remember what volume we got, so we can keep track of how
+	     * many volumes the volserver or whatever is using.  Note that
+	     * vp is valid since leaveonline is only set when vp is valid.
+	     */
+	    if (vcom->v) {
+		vcom->v->volumeID = vcom->vop->volume;
+		strlcpy(vcom->v->partName, vp->partition->name, sizeof(vcom->v->partName));
+	    }
+
+	    VOffline_r(vp, "A volume utility is running.");
+	    vp = NULL;
+	}
+    }
+
+ done:
+    return code;
+
+ deny:
+    return SYNC_DENIED;	/* res->hdr.reason is set only on some of the deny paths */
+}
+
+static afs_int32
+FSYNC_com_VolMove(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    Error error;
+    Volume * vp;
+    /* FSYNC_VOL_MOVE: mark the volume VMOVED if it is found, then break its callbacks. */
+    /* Yuch:  the "reason" for the move is the site it got moved to... */
+    /* still set specialStatus so we stop sending back VBUSY.
+     * also should still break callbacks.  Note that I don't know
+     * how to tell if we should break all or not, so we just do it
+     * since it doesn't matter much if we do an extra break
+     * volume callbacks on a volume move within the same server */
+#ifdef AFS_DEMAND_ATTACH_FS
+    vp = VLookupVolume_r(&error, vcom->vop->volume, NULL);
+#else
+    vp = VGetVolume_r(&error, vcom->vop->volume);
+#endif
+    if (vp) {
+	vp->specialStatus = VMOVED;
+#ifndef AFS_DEMAND_ATTACH_FS
+	VPutVolume_r(vp);
+#endif
+    }
+
+    if (V_BreakVolumeCallbacks) {
+	Log("fssync: volume %u moved to %x; breaking all call backs\n",
+	    vcom->vop->volume, vcom->hdr->reason);
+	VOL_UNLOCK;	/* the hook is invoked between VOL_UNLOCK and VOL_LOCK */
+	(*V_BreakVolumeCallbacks) (vcom->vop->volume);
+	VOL_LOCK;
+    }
+
+    return SYNC_OK;	/* always succeeds, even when the volume was not found */
+}
+
+static afs_int32
+FSYNC_com_VolDone(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+#ifdef AFS_DEMAND_ATTACH_FS
+    Error error;
+    Volume * vp;
+#endif
+    /* FSYNC_VOL_DONE: release the tracking slot for the volume; always returns SYNC_OK. */
+    /* don't try to put online, this call is made only after deleting
+     * a volume, in which case we want to remove the vol # from the
+     * OfflineVolumes array only */
+    if (vcom->v)
+	vcom->v->volumeID = 0;
+    /* demand-attach builds also deregister any volume operation still pending */
+#ifdef AFS_DEMAND_ATTACH_FS
+    vp = VLookupVolume_r(&error, vcom->vop->volume, NULL);
+    if (vp && vp->pending_vol_op) {
+	VDeregisterVolOp_r(vp, vp->pending_vol_op);
+    }
+#endif
+
+    return SYNC_OK;
+}
+
+/* FSYNC_VOL_BREAKCBKS (sent when a volume is being restored): break all callbacks on it. */
+static afs_int32
+FSYNC_com_VolBreakCBKs(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    if (!V_BreakVolumeCallbacks)
+	return SYNC_OK;
+    Log("fssync: breaking all call backs for volume %u\n",
+	vcom->vop->volume);
+    VOL_UNLOCK;
+    (*V_BreakVolumeCallbacks) (vcom->vop->volume);
+    VOL_LOCK;
+    return SYNC_OK;
+}
+
+/* FSYNC_VOL_QUERY: copy the in-memory Volume structure for the requested
+ * volume id into the response payload; SYNC_FAILED if the id is not found. */
+static afs_int32
+FSYNC_com_VolQuery(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    Error error;
+    Volume * vp;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    vp = VLookupVolume_r(&error, vcom->vop->volume, NULL);
+#else /* !AFS_DEMAND_ATTACH_FS */
+    vp = VGetVolume_r(&error, vcom->vop->volume);
+#endif /* !AFS_DEMAND_ATTACH_FS */
+    if (!vp) {
+	res->hdr.reason = FSYNC_UNKNOWN_VOLID;
+	return SYNC_FAILED;
+    }
+
+    assert(sizeof(*vp) <= res->payload.len);
+    memcpy(res->payload.buf, vp, sizeof(*vp));
+    res->hdr.response_len += sizeof(*vp);
+#ifndef AFS_DEMAND_ATTACH_FS
+    VPutVolume_r(vp);
+#endif
+    return SYNC_OK;
+}
+
+/* FSYNC_VOL_QUERY_HDR: copy the volume's VolumeDiskData header into the
+ * response payload.  Fails with FSYNC_HDR_NOT_ATTACHED when the volume is
+ * found but has no usable header, FSYNC_UNKNOWN_VOLID when it is not found. */
+static afs_int32
+FSYNC_com_VolHdrQuery(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    Error error;
+    Volume * vp;
+    int hdr_ok = 0;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    vp = VLookupVolume_r(&error, vcom->vop->volume, NULL);
+    if (vp &&
+	(vp->header != NULL) &&
+	(V_attachFlags(vp) & VOL_HDR_ATTACHED) &&
+	(V_attachFlags(vp) & VOL_HDR_LOADED)) {
+	hdr_ok = 1;
+    }
+#else /* !AFS_DEMAND_ATTACH_FS */
+    vp = VGetVolume_r(&error, vcom->vop->volume);
+    if (vp && vp->header) {
+	hdr_ok = 1;
+    }
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
+    if (hdr_ok) {
+	assert(sizeof(VolumeDiskData) <= res->payload.len);
+	memcpy(res->payload.buf, &V_disk(vp), sizeof(VolumeDiskData));
+	res->hdr.response_len += sizeof(VolumeDiskData);
+    } else {
+	res->hdr.reason = vp ? FSYNC_HDR_NOT_ATTACHED : FSYNC_UNKNOWN_VOLID;
+	code = SYNC_FAILED;
+    }
+#ifndef AFS_DEMAND_ATTACH_FS
+    /* drop the VGetVolume_r reference on every path, not only on success */
+    if (vp)
+	VPutVolume_r(vp);
+#endif
+    return code;
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* FSYNC_VOL_QUERY_VOP: return a copy of the volume's pending_vol_op record
+ * in the response payload, or SYNC_FAILED with a reason code when the volume
+ * is not found or has no volume operation pending. */
+static afs_int32
+FSYNC_com_VolOpQuery(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    Error error;
+    Volume * vp = VLookupVolume_r(&error, vcom->vop->volume, NULL);
+
+    if (!vp) {
+	res->hdr.reason = FSYNC_UNKNOWN_VOLID;
+	return SYNC_FAILED;
+    }
+    if (!vp->pending_vol_op) {
+	res->hdr.reason = FSYNC_NO_PENDING_VOL_OP;
+	return SYNC_FAILED;
+    }
+
+    assert(sizeof(FSSYNC_VolOp_info) <= res->payload.len);
+    memcpy(res->payload.buf, vp->pending_vol_op, sizeof(FSSYNC_VolOp_info));
+    res->hdr.response_len += sizeof(FSSYNC_VolOp_info);
+    return SYNC_OK;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/* Dispatch a FSYNC_VOL_STATS_* request to its handler after checking that the
+ * packet is exactly a sync header plus FSSYNC_StatsOp_hdr; returns a SYNC_* code. */
+static afs_int32
+FSYNC_com_StatsOp(int fd, SYNC_command * com, SYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    FSSYNC_StatsOp_command scom;
+
+    if (com->recv_len != (sizeof(com->hdr) + sizeof(FSSYNC_StatsOp_hdr))) {
+	res->hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+	res->hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+	return SYNC_COM_ERROR;
+    }
+
+    scom.hdr = &com->hdr;
+    scom.sop = (FSSYNC_StatsOp_hdr *) com->payload.buf;
+    scom.com = com;
+
+    switch (com->hdr.command) {
+    case FSYNC_VOL_STATS_GENERAL:
+	code = FSYNC_com_StatsOpGeneral(&scom, res);
+	break;
+#ifdef AFS_DEMAND_ATTACH_FS
+	/* statistics for the following subsystems are only tracked
+	 * for demand attach fileservers */
+    case FSYNC_VOL_STATS_VICEP:
+	code = FSYNC_com_StatsOpViceP(&scom, res);
+	break;
+    case FSYNC_VOL_STATS_HASH:
+	code = FSYNC_com_StatsOpHash(&scom, res);
+	break;
+    case FSYNC_VOL_STATS_HDR:
+	code = FSYNC_com_StatsOpHdr(&scom, res);
+	break;
+    case FSYNC_VOL_STATS_VLRU:
+	code = FSYNC_com_StatsOpVLRU(&scom, res);
+	break;
+#endif /* AFS_DEMAND_ATTACH_FS */
+    default:
+	code = SYNC_BAD_COMMAND;
+    }
+    return code;
+}
+
+/* FSYNC_VOL_STATS_GENERAL: copy the global VStats into the response payload. */
+static afs_int32
+FSYNC_com_StatsOpGeneral(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    /* same payload-capacity guard the volume query handlers use before memcpy */
+    assert(sizeof(VStats) <= res->payload.len);
+    memcpy(res->payload.buf, &VStats, sizeof(VStats));
+    res->hdr.response_len += sizeof(VStats);
+    return SYNC_OK;
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+static afs_int32
+FSYNC_com_StatsOpViceP(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    struct DiskPartition * dp;
+    struct DiskPartitionStats * stats;
+    /* Per-partition statistics: a non-zero result from SYNC_verifyProtocolString() on partName rejects the request. */
+    if (SYNC_verifyProtocolString(scom->sop->args.partName, sizeof(scom->sop->args.partName))) {
+	res->hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+	code = SYNC_FAILED;
+	goto done;
+    }
+    /* fetch the partition by name; a NULL result fails the request without setting res->hdr.reason */
+    dp = VGetPartition_r(scom->sop->args.partName, 0);
+    if (!dp) {
+	code = SYNC_FAILED;
+    } else {
+	stats = (struct DiskPartitionStats *) res->payload.buf;	/* the reply struct is built in place in the payload */
+	stats->free = dp->free;
+	stats->totalUsable = dp->totalUsable;
+	stats->minFree = dp->minFree;
+	stats->f_files = dp->f_files;
+	stats->vol_list_len = dp->vol_list.len;
+	/* account for the struct just written */
+	res->hdr.response_len += sizeof(struct DiskPartitionStats);
+    }
+
+ done:
+    return code;
+}
+
+static afs_int32
+FSYNC_com_StatsOpHash(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    struct VolumeHashChainStats * stats;
+    struct VolumeHashChainHead * head;
+    /* Statistics for one VolumeHashTable chain; the bucket index comes from the request and is range-checked first. */
+    if (scom->sop->args.hash_bucket >= VolumeHashTable.Size) {
+	return SYNC_FAILED;
+    }
+    /* copy the chain head's counters into a VolumeHashChainStats built in place in the response payload */
+    head = &VolumeHashTable.Table[scom->sop->args.hash_bucket];
+    stats = (struct VolumeHashChainStats *) res->payload.buf;
+    stats->table_size = VolumeHashTable.Size;
+    stats->chain_len = head->len;
+    stats->chain_cacheCheck = head->cacheCheck;
+    stats->chain_busy = head->busy;
+    AssignInt64(head->looks, &stats->chain_looks);
+    AssignInt64(head->gets, &stats->chain_gets);
+    AssignInt64(head->reorders, &stats->chain_reorders);
+    /* account for the struct just written */
+    res->hdr.response_len += sizeof(struct VolumeHashChainStats);
+    
+    return code;
+}
+
+static afs_int32
+FSYNC_com_StatsOpHdr(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    /* The reply payload is a raw copy of volume_hdr_LRU.stats; scom is not consulted.  Always returns SYNC_OK. */
+    memcpy(res->payload.buf, &volume_hdr_LRU.stats, sizeof(volume_hdr_LRU.stats));
+    res->hdr.response_len += sizeof(volume_hdr_LRU.stats);
+    /* NOTE(review): the copy size is not checked against res->payload.len -- confirm the buffer always fits */
+    return code;
+}
+
+static afs_int32
+FSYNC_com_StatsOpVLRU(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    /* Stub: neither scom nor res is touched; every call is answered with SYNC_BAD_COMMAND. */
+    code = SYNC_BAD_COMMAND;
+
+    return code;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+static void
+FSYNC_com_to_info(FSSYNC_VolOp_command * vcom, FSSYNC_VolOp_info * info)
+{
+    memcpy(&info->com, vcom->hdr, sizeof(SYNC_command_hdr));	/* snapshot the command header vcom points at */
+    memcpy(&info->vop, vcom->vop, sizeof(FSSYNC_VolOp_hdr));	/* snapshot the volume-op header vcom points at */
+}
+
+static void
+FSYNC_Drop(int fd)
+{
+    struct offlineInfo *p;
+    int i;
+    Error error;
+    char tvolName[VMAXPATHLEN];
+    /* Tear down the connection on fd: attach every volume still listed in its OfflineVolumes row, then free slot and socket. */
+    VOL_LOCK;
+    p = OfflineVolumes[FindHandler(fd)];	/* the row is selected by fd's handler slot index */
+    for (i = 0; i < MAXOFFLINEVOLUMES; i++) {
+	if (p[i].volumeID) {
+	    /* a non-zero volumeID marks an entry still in use */
+	    Volume *vp;
+	    /* name is "/" followed by VFORMAT of the id; error is not examined after the attach call */
+	    tvolName[0] = '/';
+	    sprintf(&tvolName[1], VFORMAT, p[i].volumeID);
+	    vp = VAttachVolumeByName_r(&error, p[i].partName, tvolName,
+				       V_VOLUPD);
+	    if (vp)
+		VPutVolume_r(vp);
+	    p[i].volumeID = 0;	/* entry is cleared whether or not the attach returned a volume */
+	}
+    }
+    VOL_UNLOCK;
+    RemoveHandler(fd);
+#ifdef AFS_NT40_ENV
+    closesocket(fd);
+#else
+    close(fd);
+#endif
+    AcceptOn();	/* re-registers the accept handler if it is currently off */
+}
+
+static int AcceptHandler = -1;	/* handler id for accept, if turned on */
+
+static void
+AcceptOn()
+{
+    if (AcceptHandler == -1) {	/* -1 means the accept handler is not registered */
+	assert(AddHandler(AcceptSd, FSYNC_newconnection));	/* NOTE(review): the call sits inside assert(); confirm assert is never compiled out */
+	AcceptHandler = FindHandler(AcceptSd);	/* remember the table slot holding AcceptSd */
+    }
+}
+
+static void
+AcceptOff()
+{
+    if (AcceptHandler != -1) {	/* only when the accept handler is currently registered */
+	assert(RemoveHandler(AcceptSd));	/* NOTE(review): the call sits inside assert(); confirm assert is never compiled out */
+	AcceptHandler = -1;	/* mark the accept handler as off */
+    }
+}
+
+/* The multiple FD handling code. */
+
+static int HandlerFD[MAXHANDLERS];
+static int (*HandlerProc[MAXHANDLERS]) ();
+
+static void
+InitHandler()
+{
+    register int i;
+    ObtainWriteLock(&FSYNC_handler_lock);	/* the whole table is reset under the write lock */
+    for (i = 0; i < MAXHANDLERS; i++) {
+	HandlerFD[i] = -1;	/* -1 is the free-slot marker AddHandler() searches for */
+	HandlerProc[i] = 0;	/* no callback in this slot */
+    }
+    ReleaseWriteLock(&FSYNC_handler_lock);
+}
+
+static void
+CallHandler(fd_set * fdsetp)
+{
+    register int i;
+    ObtainReadLock(&FSYNC_handler_lock);
+    for (i = 0; i < MAXHANDLERS; i++) {	/* invoke the callback of every registered fd that is set in *fdsetp */
+	if (HandlerFD[i] >= 0 && FD_ISSET(HandlerFD[i], fdsetp)) {
+	    ReleaseReadLock(&FSYNC_handler_lock);	/* the lock is not held while the callback runs */
+	    (*HandlerProc[i]) (HandlerFD[i]);	/* NOTE(review): both table entries are read after the lock was released */
+	    ObtainReadLock(&FSYNC_handler_lock);	/* retake the lock before looking at the next slot */
+	}
+    }
+    ReleaseReadLock(&FSYNC_handler_lock);
+}
+
+static int
+AddHandler(int afd, int (*aproc) ())
+{
+    register int i;
+    ObtainWriteLock(&FSYNC_handler_lock);
+    for (i = 0; i < MAXHANDLERS; i++)	/* find the first free slot (fd == -1) */
+	if (HandlerFD[i] == -1)
+	    break;
+    if (i >= MAXHANDLERS) {	/* table full: nothing is registered */
+	ReleaseWriteLock(&FSYNC_handler_lock);
+	return 0;	/* 0 = failure */
+    }
+    HandlerFD[i] = afd;
+    HandlerProc[i] = aproc;
+    ReleaseWriteLock(&FSYNC_handler_lock);
+    return 1;	/* 1 = afd/aproc registered */
+}
+
+static int
+FindHandler(register int afd)
+{
+    register int i;
+    ObtainReadLock(&FSYNC_handler_lock);	/* locking variant; FindHandler_r() is the same search without the lock */
+    for (i = 0; i < MAXHANDLERS; i++)
+	if (HandlerFD[i] == afd) {
+	    ReleaseReadLock(&FSYNC_handler_lock);
+	    return i;	/* index of the slot holding afd */
+	}
+    ReleaseReadLock(&FSYNC_handler_lock);	/* just in case */
+    assert(1 == 2);	/* afd is not in the table: deliberately failing assertion */
+    return -1;			/* satisfy compiler */
+}
+
+static int
+FindHandler_r(register int afd)
+{
+    register int i;	/* takes no lock itself; RemoveHandler() calls this with the write lock held */
+    for (i = 0; i < MAXHANDLERS; i++)
+	if (HandlerFD[i] == afd) {
+	    return i;	/* index of the slot holding afd */
+	}
+    assert(1 == 2);	/* afd is not in the table: deliberately failing assertion */
+    return -1;			/* satisfy compiler */
+}
+
+static int
+RemoveHandler(register int afd)
+{
+    ObtainWriteLock(&FSYNC_handler_lock);
+    HandlerFD[FindHandler_r(afd)] = -1;	/* free the slot; its HandlerProc entry is left as is */
+    ReleaseWriteLock(&FSYNC_handler_lock);
+    return 1;	/* always 1 */
+}
+
+static void
+GetHandler(fd_set * fdsetp, int *maxfdp)
+{
+    register int i;
+    register int maxfd = -1;	/* stays -1 when no fd is registered */
+    FD_ZERO(fdsetp);
+    ObtainReadLock(&FSYNC_handler_lock);	/* just in case */
+    for (i = 0; i < MAXHANDLERS; i++)	/* add every registered fd to *fdsetp and track the largest */
+	if (HandlerFD[i] != -1) {
+	    FD_SET(HandlerFD[i], fdsetp);
+	    if (maxfd < HandlerFD[i])
+		maxfd = HandlerFD[i];
+	}
+    *maxfdp = maxfd;	/* largest registered fd, or -1 */
+    ReleaseReadLock(&FSYNC_handler_lock);	/* just in case */
+}
+
+#endif /* FSSYNC_BUILD_SERVER */
diff --git a/src/vol/fssync.c b/src/vol/fssync.c
deleted file mode 100644
index 714aaf5fea..0000000000
--- a/src/vol/fssync.c
+++ /dev/null
@@ -1,751 +0,0 @@
-/*
- * Copyright 2000, International Business Machines Corporation and others.
- * All Rights Reserved.
- * 
- * This software has been released under the terms of the IBM Public
- * License.  For details, see the LICENSE file in the top-level source
- * directory or online at http://www.openafs.org/dl/license10.html
- */
-
-/*
-	System:		VICE-TWO
-	Module:		fssync.c
-	Institution:	The Information Technology Center, Carnegie-Mellon University
-
- */
-#ifdef notdef
-
-/* All this is going away in early 1989 */
-int newVLDB;			/* Compatibility flag */
-
-#endif
-static int newVLDB = 1;
-
-
-#ifndef AFS_PTHREAD_ENV
-#define USUAL_PRIORITY (LWP_MAX_PRIORITY - 2)
-
-/*
- * stack size increased from 8K because the HP machine seemed to have trouble
- * with the smaller stack
- */
-#define USUAL_STACK_SIZE	(24 * 1024)
-#endif /* !AFS_PTHREAD_ENV */
-
-/*
-   fsync.c
-   File server synchronization with external volume utilities.
- */
-
-/* This controls the size of an fd_set; it must be defined early before
- * the system headers define that type and the macros that operate on it.
- * Its value should be as large as the maximum file descriptor limit we
- * are likely to run into on any platform.  Right now, that is 65536
- * which is the default hard fd limit on Solaris 9 */
-#ifndef _WIN32
-#define FD_SETSIZE 65536
-#endif
-
-#include <afsconfig.h>
-#include <afs/param.h>
-
-RCSID
-    ("$Header$");
-
-#include <sys/types.h>
-#include <stdio.h>
-#ifdef AFS_NT40_ENV
-#include <winsock2.h>
-#include <time.h>
-#else
-#include <sys/param.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <netdb.h>
-#include <sys/time.h>
-#endif
-#include <errno.h>
-#ifdef AFS_PTHREAD_ENV
-#include <assert.h>
-#else /* AFS_PTHREAD_ENV */
-#include <afs/assert.h>
-#endif /* AFS_PTHREAD_ENV */
-#include <signal.h>
-
-#ifdef HAVE_STRING_H
-#include <string.h>
-#else
-#ifdef HAVE_STRINGS_H
-#include <strings.h>
-#endif
-#endif
-
-
-#include <rx/xdr.h>
-#include <afs/afsint.h>
-#include "nfs.h"
-#include <afs/errors.h>
-#include "fssync.h"
-#include "lwp.h"
-#include "lock.h"
-#include <afs/afssyscalls.h>
-#include "ihandle.h"
-#include "vnode.h"
-#include "volume.h"
-#include "partition.h"
-
-/*@printflike@*/ extern void Log(const char *format, ...);
-
-#ifdef osi_Assert
-#undef osi_Assert
-#endif
-#define osi_Assert(e) (void)(e)
-
-int (*V_BreakVolumeCallbacks) ();
-
-#define MAXHANDLERS	4	/* Up to 4 clients; must be at least 2, so that
-				 * move = dump+restore can run on single server */
-#define MAXOFFLINEVOLUMES 128	/* This needs to be as big as the maximum
-				 * number that would be offline for 1 operation.
-				 * Current winner is salvage, which needs all
-				 * cloned read-only copies offline when salvaging
-				 * a single read-write volume */
-
-#define MAX_BIND_TRIES	5	/* Number of times to retry socket bind */
-
-
-struct offlineInfo {
-    VolumeId volumeID;
-    char partName[16];
-};
-
-static struct offlineInfo OfflineVolumes[MAXHANDLERS][MAXOFFLINEVOLUMES];
-
-static FS_sd = -1;		/* Client socket for talking to file server */
-static AcceptSd = -1;		/* Socket used by server for accepting connections */
-
-static int getport();
-
-struct command {
-    bit32 command;
-    bit32 reason;
-    VolumeId volume;
-    char partName[16];		/* partition name, e.g. /vicepa */
-};
-
-/* Forward declarations */
-static void FSYNC_sync();
-static void FSYNC_newconnection();
-static void FSYNC_com();
-static void FSYNC_Drop();
-static void AcceptOn();
-static void AcceptOff();
-static void InitHandler();
-static void CallHandler(fd_set * fdsetp);
-static int AddHandler();
-static int FindHandler();
-static int FindHandler_r();
-static int RemoveHandler();
-static void GetHandler(fd_set * fdsetp, int *maxfdp);
-
-extern int LogLevel;
-
-/*
- * This lock controls access to the handler array. The overhead
- * is minimal in non-preemptive environments.
- */
-struct Lock FSYNC_handler_lock;
-
-int
-FSYNC_clientInit(void)
-{
-    struct sockaddr_in addr;
-    /* I can't believe the following is needed for localhost connections!! */
-    static time_t backoff[] =
-	{ 3, 3, 3, 5, 5, 5, 7, 15, 16, 24, 32, 40, 48, 0 };
-    time_t *timeout = &backoff[0];
-
-    for (;;) {
-	FS_sd = getport(&addr);
-	if (connect(FS_sd, (struct sockaddr *)&addr, sizeof(addr)) >= 0)
-	    return 1;
-	if (!*timeout)
-	    break;
-	if (!(*timeout & 1))
-	    Log("FSYNC_clientInit temporary failure (will retry)");
-	FSYNC_clientFinis();
-	sleep(*timeout++);
-    }
-    perror("FSYNC_clientInit failed (giving up!)");
-    return 0;
-}
-
-void
-FSYNC_clientFinis(void)
-{
-#ifdef AFS_NT40_ENV
-    closesocket(FS_sd);
-#else
-    close(FS_sd);
-#endif
-    FS_sd = -1;
-}
-
-int
-FSYNC_askfs(VolumeId volume, char *partName, int com, int reason)
-{
-    byte response;
-    struct command command;
-    int n;
-    command.volume = volume;
-    command.command = com;
-    command.reason = reason;
-    if (partName)
-	strcpy(command.partName, partName);
-    else
-	command.partName[0] = 0;
-    assert(FS_sd != -1);
-    VFSYNC_LOCK;
-#ifdef AFS_NT40_ENV
-    if (send(FS_sd, (char *)&command, sizeof(command), 0) != sizeof(command)) {
-	printf("FSYNC_askfs: write to file server failed\n");
-	response = FSYNC_DENIED;
-	goto done;
-    }
-    while ((n = recv(FS_sd, &response, 1, 0)) != 1) {
-	if (n == 0 || WSAEINTR != WSAGetLastError()) {
-	    printf("FSYNC_askfs: No response from file server\n");
-	    response = FSYNC_DENIED;
-	    goto done;
-	}
-    }
-#else
-    if (write(FS_sd, &command, sizeof(command)) != sizeof(command)) {
-	printf("FSYNC_askfs: write to file server failed\n");
-	response = FSYNC_DENIED;
-	goto done;
-    }
-    while ((n = read(FS_sd, &response, 1)) != 1) {
-	if (n == 0 || errno != EINTR) {
-	    printf("FSYNC_askfs: No response from file server\n");
-	    response = FSYNC_DENIED;
-	    goto done;
-	}
-    }
-#endif
-    if (response == 0) {
-	printf
-	    ("FSYNC_askfs: negative response from file server; volume %u, command %d\n",
-	     command.volume, (int)command.command);
-    }
-  done:
-    VFSYNC_UNLOCK;
-    return response;
-}
-
-void
-FSYNC_fsInit(void)
-{
-#ifdef AFS_PTHREAD_ENV
-    pthread_t tid;
-    pthread_attr_t tattr;
-    assert(pthread_attr_init(&tattr) == 0);
-    assert(pthread_attr_setdetachstate(&tattr, PTHREAD_CREATE_DETACHED) == 0);
-    assert(pthread_create(&tid, &tattr, FSYNC_sync, NULL) == 0);
-#else /* AFS_PTHREAD_ENV */
-    PROCESS pid;
-    assert(LWP_CreateProcess
-	   (FSYNC_sync, USUAL_STACK_SIZE, USUAL_PRIORITY, (void *)0,
-	    "FSYNC_sync", &pid) == LWP_SUCCESS);
-#endif /* AFS_PTHREAD_ENV */
-}
-
-static int
-getport(struct sockaddr_in *addr)
-{
-    int sd;
-
-    memset(addr, 0, sizeof(*addr));
-    assert((sd = socket(AF_INET, SOCK_STREAM, 0)) >= 0);
-#ifdef STRUCT_SOCKADDR_HAS_SA_LEN
-    addr->sin_len = sizeof(struct sockaddr_in);
-#endif
-    addr->sin_addr.s_addr = htonl(0x7f000001);
-    addr->sin_family = AF_INET;	/* was localhost->h_addrtype */
-    addr->sin_port = htons(2040);	/* XXXX htons not _really_ neccessary */
-
-    return sd;
-}
-
-static fd_set FSYNC_readfds;
-
-static void
-FSYNC_sync()
-{
-    struct sockaddr_in addr;
-    int on = 1;
-    extern VInit;
-    int code;
-    int numTries;
-#ifdef AFS_PTHREAD_ENV
-    int tid;
-#endif
-
-#ifndef AFS_NT40_ENV
-    (void)signal(SIGPIPE, SIG_IGN);
-#endif
-
-#ifdef AFS_PTHREAD_ENV
-    /* set our 'thread-id' so that the host hold table works */
-    MUTEX_ENTER(&rx_stats_mutex);	/* protects rxi_pthread_hinum */
-    tid = ++rxi_pthread_hinum;
-    MUTEX_EXIT(&rx_stats_mutex);
-    pthread_setspecific(rx_thread_id_key, (void *)tid);
-    Log("Set thread id %d for FSYNC_sync\n", tid);
-#endif /* AFS_PTHREAD_ENV */
-
-    while (!VInit) {
-	/* Let somebody else run until level > 0.  That doesn't mean that 
-	 * all volumes have been attached. */
-#ifdef AFS_PTHREAD_ENV
-	pthread_yield();
-#else /* AFS_PTHREAD_ENV */
-	LWP_DispatchProcess();
-#endif /* AFS_PTHREAD_ENV */
-    }
-    AcceptSd = getport(&addr);
-    /* Reuseaddr needed because system inexplicably leaves crud lying around */
-    code =
-	setsockopt(AcceptSd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
-		   sizeof(on));
-    if (code)
-	Log("FSYNC_sync: setsockopt failed with (%d)\n", errno);
-
-    for (numTries = 0; numTries < MAX_BIND_TRIES; numTries++) {
-	if ((code =
-	     bind(AcceptSd, (struct sockaddr *)&addr, sizeof(addr))) == 0)
-	    break;
-	Log("FSYNC_sync: bind failed with (%d), will sleep and retry\n",
-	    errno);
-	sleep(5);
-    }
-    assert(!code);
-    listen(AcceptSd, 100);
-    InitHandler();
-    AcceptOn();
-    for (;;) {
-	int maxfd;
-	GetHandler(&FSYNC_readfds, &maxfd);
-	/* Note: check for >= 1 below is essential since IOMGR_select
-	 * doesn't have exactly same semantics as select.
-	 */
-#ifdef AFS_PTHREAD_ENV
-	if (select(maxfd + 1, &FSYNC_readfds, NULL, NULL, NULL) >= 1)
-#else /* AFS_PTHREAD_ENV */
-	if (IOMGR_Select(maxfd + 1, &FSYNC_readfds, NULL, NULL, NULL) >= 1)
-#endif /* AFS_PTHREAD_ENV */
-	    CallHandler(&FSYNC_readfds);
-    }
-}
-
-static void
-FSYNC_newconnection(int afd)
-{
-    struct sockaddr_in other;
-    int junk, fd;
-    junk = sizeof(other);
-    fd = accept(afd, (struct sockaddr *)&other, &junk);
-    if (fd == -1) {
-	Log("FSYNC_newconnection:  accept failed, errno==%d\n", errno);
-	assert(1 == 2);
-    } else if (!AddHandler(fd, FSYNC_com)) {
-	AcceptOff();
-	assert(AddHandler(fd, FSYNC_com));
-    }
-}
-
-/*
-#define TEST2081
-*/
-
-afs_int32 FS_cnt = 0;
-static void
-FSYNC_com(int fd)
-{
-    byte rc = FSYNC_OK;
-    int n, i;
-    Error error;
-    struct command command;
-    int leaveonline;
-    register struct offlineInfo *volumes, *v;
-    Volume *vp;
-    char tvolName[VMAXPATHLEN];
-
-    FS_cnt++;
-#ifdef AFS_NT40_ENV
-    n = recv(fd, &command, sizeof(command), 0);
-#else
-    n = read(fd, &command, sizeof(command));
-#endif
-    if (n <= 0) {
-	FSYNC_Drop(fd);
-	return;
-    }
-    if (n < sizeof(command)) {
-	Log("FSYNC_com:  partial read (%d instead of %d); dropping connection (cnt=%d)\n", n, sizeof(command), FS_cnt);
-	FSYNC_Drop(fd);
-	return;
-    }
-    VATTACH_LOCK;
-    VOL_LOCK;
-    volumes = OfflineVolumes[FindHandler(fd)];
-    for (v = 0, i = 0; i < MAXOFFLINEVOLUMES; i++) {
-	if (volumes[i].volumeID == command.volume
-	    && strcmp(volumes[i].partName, command.partName) == 0) {
-	    v = &volumes[i];
-	    break;
-	}
-    }
-    switch (command.command) {
-    case FSYNC_DONE:
-	/* don't try to put online, this call is made only after deleting
-	 * a volume, in which case we want to remove the vol # from the
-	 * OfflineVolumes array only */
-	if (v)
-	    v->volumeID = 0;
-	break;
-    case FSYNC_ON:
-
-/*
-This is where a detatched volume gets reattached. However in the
-special case where the volume is merely busy, it is already
-attatched and it is only necessary to clear the busy flag. See
-defect #2080 for details.
-*/
-
-	/* is the volume already attatched? */
-#ifdef	notdef
-/*
- * XXX With the following enabled we had bizarre problems where the backup id would
- * be reset to 0; that was due to the interaction between fileserver/volserver in that they
- * both keep volumes in memory and the changes wouldn't be made to the fileserver. Some of
- * the problems were due to refcnt changes as result of VGetVolume/VPutVolume which would call
- * VOffline, etc. when we don't want to; someday the whole #2080 issue should be revisited to
- * be done right XXX
- */
-	vp = VGetVolume_r(&error, command.volume);
-	if (vp) {
-	    /* yep, is the BUSY flag set? */
-	    if (vp->specialStatus == VBUSY) {
-/* test harness for defect #2081 */
-
-#ifdef TEST2081
-		/*
-		 * test #2081 by releasing TEST.2081,
-		 * so leave it alone here, zap it after
-		 */
-
-		if (strcmp(vp->header->diskstuff.name, "TEST.2081") == 0)
-		    break;
-#endif
-		/* yep, clear BUSY flag */
-
-		vp->specialStatus = 0;
-		/* make sure vol is online */
-		if (v) {
-		    v->volumeID = 0;
-		    V_inUse(vp) = 1;	/* online */
-		}
-		VPutVolume_r(vp);
-		break;
-	    }
-	    VPutVolume_r(vp);
-	}
-#endif
-
-	/* so, we need to attach the volume */
-
-	if (v)
-	    v->volumeID = 0;
-	tvolName[0] = '/';
-	sprintf(&tvolName[1], VFORMAT, command.volume);
-
-	vp = VAttachVolumeByName_r(&error, command.partName, tvolName,
-				   V_VOLUPD);
-	if (vp)
-	    VPutVolume_r(vp);
-	break;
-    case FSYNC_OFF:
-    case FSYNC_NEEDVOLUME:{
-	    leaveonline = 0;
-	    /* not already offline, we need to find a slot for newly offline volume */
-	    if (!v) {
-		for (i = 0; i < MAXOFFLINEVOLUMES; i++) {
-		    if (volumes[i].volumeID == 0) {
-			v = &volumes[i];
-			break;
-		    }
-		}
-	    }
-	    if (!v) {
-		rc = FSYNC_DENIED;
-		break;
-	    }
-	    vp = VGetVolume_r(&error, command.volume);
-	    if (vp) {
-		if (command.partName[0] != 0
-		    && strcmp(command.partName, vp->partition->name) != 0) {
-		    /* volume on desired partition is not online, so we
-		     * should treat this as an offline volume.
-		     */
-		    VPutVolume_r(vp);
-		    vp = (Volume *) 0;
-		}
-	    }
-	    if (vp) {
-		leaveonline = (command.command == FSYNC_NEEDVOLUME
-			       && (command.reason == V_READONLY
-				   || (!VolumeWriteable(vp)
-				       && (command.reason == V_CLONE
-					   || command.reason == V_DUMP))
-			       )
-		    );
-		if (!leaveonline) {
-		    if (command.command == FSYNC_NEEDVOLUME
-			&& (command.reason == V_CLONE
-			    || command.reason == V_DUMP)) {
-			vp->specialStatus = VBUSY;
-		    }
-		    /* remember what volume we got, so we can keep track of how
-		     * many volumes the volserver or whatever is using.  Note that
-		     * vp is valid since leaveonline is only set when vp is valid.
-		     */
-		    v->volumeID = command.volume;
-		    strcpy(v->partName, vp->partition->name);
-		    if (!V_inUse(vp)) {
-			/* in this case, VOffline just returns sans decrementing
-			 * ref count.  We could try to fix it, but it has lots of
-			 * weird callers.
-			 */
-			VPutVolume_r(vp);
-		    } else {
-			VOffline_r(vp, "A volume utility is running.");
-		    }
-		    vp = 0;
-		} else {
-		    VUpdateVolume_r(&error, vp);	/* At least get volume stats right */
-		    if (LogLevel) {
-			Log("FSYNC: Volume %u (%s) was left on line for an external %s request\n", V_id(vp), V_name(vp), command.reason == V_CLONE ? "clone" : command.reason == V_READONLY ? "readonly" : command.reason == V_DUMP ? "dump" : "UNKNOWN");
-		    }
-		}
-		if (vp)
-		    VPutVolume_r(vp);
-	    }
-	    rc = FSYNC_OK;
-	    break;
-	}
-    case FSYNC_MOVEVOLUME:
-	/* Yuch:  the "reason" for the move is the site it got moved to... */
-	/* still set specialStatus so we stop sending back VBUSY.
-	 * also should still break callbacks.  Note that I don't know
-	 * how to tell if we should break all or not, so we just do it
-	 * since it doesn't matter much if we do an extra break
-	 * volume callbacks on a volume move within the same server */
-	vp = VGetVolume_r(&error, command.volume);
-	if (vp) {
-	    vp->specialStatus = VMOVED;
-	    VPutVolume_r(vp);
-	}
-
-	if (V_BreakVolumeCallbacks) {
-	    Log("fssync: volume %u moved to %x; breaking all call backs\n",
-		command.volume, command.reason);
-	    VOL_UNLOCK;
-	    VATTACH_UNLOCK;
-	    (*V_BreakVolumeCallbacks) (command.volume);
-	    VATTACH_LOCK;
-	    VOL_LOCK;
-	}
-	break;
-    case FSYNC_RESTOREVOLUME:
-	/* if the volume is being restored, break all callbacks on it */
-	if (V_BreakVolumeCallbacks) {
-	    Log("fssync: volume %u restored; breaking all call backs\n",
-		command.volume);
-	    VOL_UNLOCK;
-	    VATTACH_UNLOCK;
-	    (*V_BreakVolumeCallbacks) (command.volume);
-	    VATTACH_LOCK;
-	    VOL_LOCK;
-	}
-	break;
-    default:
-	rc = FSYNC_DENIED;
-	break;
-    }
-    VOL_UNLOCK;
-    VATTACH_UNLOCK;
-#ifdef AFS_NT40_ENV
-    (void)send(fd, &rc, 1, 0);
-#else
-    (void)write(fd, &rc, 1);
-#endif
-}
-
-static void
-FSYNC_Drop(int fd)
-{
-    struct offlineInfo *p;
-    register i;
-    Error error;
-    char tvolName[VMAXPATHLEN];
-
-    VATTACH_LOCK;
-    VOL_LOCK;
-    p = OfflineVolumes[FindHandler(fd)];
-    for (i = 0; i < MAXOFFLINEVOLUMES; i++) {
-	if (p[i].volumeID) {
-	    Volume *vp;
-
-	    tvolName[0] = '/';
-	    sprintf(&tvolName[1], VFORMAT, p[i].volumeID);
-	    vp = VAttachVolumeByName_r(&error, p[i].partName, tvolName,
-				       V_VOLUPD);
-	    if (vp)
-		VPutVolume_r(vp);
-	    p[i].volumeID = 0;
-	}
-    }
-    VOL_UNLOCK;
-    VATTACH_UNLOCK;
-    RemoveHandler(fd);
-#ifdef AFS_NT40_ENV
-    closesocket(fd);
-#else
-    close(fd);
-#endif
-    AcceptOn();
-}
-
-static int AcceptHandler = -1;	/* handler id for accept, if turned on */
-
-static void
-AcceptOn()
-{
-    if (AcceptHandler == -1) {
-	assert(AddHandler(AcceptSd, FSYNC_newconnection));
-	AcceptHandler = FindHandler(AcceptSd);
-    }
-}
-
-static void
-AcceptOff()
-{
-    if (AcceptHandler != -1) {
-	assert(RemoveHandler(AcceptSd));
-	AcceptHandler = -1;
-    }
-}
-
-/* The multiple FD handling code. */
-
-static int HandlerFD[MAXHANDLERS];
-static int (*HandlerProc[MAXHANDLERS]) ();
-
-static void
-InitHandler()
-{
-    register int i;
-    ObtainWriteLock(&FSYNC_handler_lock);
-    for (i = 0; i < MAXHANDLERS; i++) {
-	HandlerFD[i] = -1;
-	HandlerProc[i] = 0;
-    }
-    ReleaseWriteLock(&FSYNC_handler_lock);
-}
-
-static void
-CallHandler(fd_set * fdsetp)
-{
-    register int i;
-    ObtainReadLock(&FSYNC_handler_lock);
-    for (i = 0; i < MAXHANDLERS; i++) {
-	if (HandlerFD[i] >= 0 && FD_ISSET(HandlerFD[i], fdsetp)) {
-	    ReleaseReadLock(&FSYNC_handler_lock);
-	    (*HandlerProc[i]) (HandlerFD[i]);
-	    ObtainReadLock(&FSYNC_handler_lock);
-	}
-    }
-    ReleaseReadLock(&FSYNC_handler_lock);
-}
-
-static int
-AddHandler(int afd, int (*aproc) ())
-{
-    register int i;
-    ObtainWriteLock(&FSYNC_handler_lock);
-    for (i = 0; i < MAXHANDLERS; i++)
-	if (HandlerFD[i] == -1)
-	    break;
-    if (i >= MAXHANDLERS) {
-	ReleaseWriteLock(&FSYNC_handler_lock);
-	return 0;
-    }
-    HandlerFD[i] = afd;
-    HandlerProc[i] = aproc;
-    ReleaseWriteLock(&FSYNC_handler_lock);
-    return 1;
-}
-
-static int
-FindHandler(register int afd)
-{
-    register int i;
-    ObtainReadLock(&FSYNC_handler_lock);
-    for (i = 0; i < MAXHANDLERS; i++)
-	if (HandlerFD[i] == afd) {
-	    ReleaseReadLock(&FSYNC_handler_lock);
-	    return i;
-	}
-    ReleaseReadLock(&FSYNC_handler_lock);	/* just in case */
-    assert(1 == 2);
-    return -1;			/* satisfy compiler */
-}
-
-static int
-FindHandler_r(register int afd)
-{
-    register int i;
-    for (i = 0; i < MAXHANDLERS; i++)
-	if (HandlerFD[i] == afd) {
-	    return i;
-	}
-    assert(1 == 2);
-    return -1;			/* satisfy compiler */
-}
-
-static int
-RemoveHandler(register int afd)
-{
-    ObtainWriteLock(&FSYNC_handler_lock);
-    HandlerFD[FindHandler_r(afd)] = -1;
-    ReleaseWriteLock(&FSYNC_handler_lock);
-    return 1;
-}
-
-static void
-GetHandler(fd_set * fdsetp, int *maxfdp)
-{
-    register int i;
-    register int maxfd = -1;
-    FD_ZERO(fdsetp);
-    ObtainReadLock(&FSYNC_handler_lock);	/* just in case */
-    for (i = 0; i < MAXHANDLERS; i++)
-	if (HandlerFD[i] != -1) {
-	    FD_SET(HandlerFD[i], fdsetp);
-	    if (maxfd < HandlerFD[i])
-		maxfd = HandlerFD[i];
-	}
-    *maxfdp = maxfd;
-    ReleaseReadLock(&FSYNC_handler_lock);	/* just in case */
-}
diff --git a/src/vol/fssync.h b/src/vol/fssync.h
index af5ab02c71..873b274970 100644
--- a/src/vol/fssync.h
+++ b/src/vol/fssync.h
@@ -5,6 +5,8 @@
  * This software has been released under the terms of the IBM Public
  * License.  For details, see the LICENSE file in the top-level source
  * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
  */
 
 /*
@@ -14,38 +16,117 @@
 
  */
 
-
-/* FSYNC commands */
-
-#define FSYNC_ON		1	/* Volume online */
-#define FSYNC_OFF		2	/* Volume offline */
-#define FSYNC_LISTVOLUMES	3	/* Update local volume list */
-#define FSYNC_NEEDVOLUME	4	/* Put volume in whatever mode (offline, or whatever)
-					 * best fits the attachment mode provided in reason */
-#define FSYNC_MOVEVOLUME	5	/* Generate temporary relocation information
-					 * for this volume to another site, to be used
-					 * if this volume disappears */
-#define	FSYNC_RESTOREVOLUME	6	/* Break all the callbacks on this volume since                                   it is being restored */
-#define FSYNC_DONE		7	/* Done with this volume (used after a delete).
-					 * Don't put online, but remove from list */
+#ifndef __fssync_h_
+#define __fssync_h_
 
 
-/* Reasons (these could be communicated to venus or converted to messages) */
-
-#define FSYNC_WHATEVER		0	/* XXXX */
-#define FSYNC_SALVAGE		1	/* volume is being salvaged */
-#define FSYNC_MOVE		2	/* volume is being moved */
-#define FSYNC_OPERATOR		3	/* operator forced volume offline */
+#define FSYNC_PROTO_VERSION     2
 
 
-/* Replies (1 byte) */
+/* FSYNC command codes */
+#define FSYNC_VOL_ON		SYNC_COM_CODE_DECL(0)	/* Volume online */
+#define FSYNC_VOL_OFF		SYNC_COM_CODE_DECL(1)	/* Volume offline */
+#define FSYNC_VOL_LISTVOLUMES	SYNC_COM_CODE_DECL(2)	/* Update local volume list */
+#define FSYNC_VOL_NEEDVOLUME	SYNC_COM_CODE_DECL(3)	/* Put volume in whatever mode (offline, or whatever)
+							 * best fits the attachment mode provided in reason */
+#define FSYNC_VOL_MOVE	        SYNC_COM_CODE_DECL(4)	/* Generate temporary relocation information
+							 * for this volume to another site, to be used
+							 * if this volume disappears */
+#define	FSYNC_VOL_BREAKCBKS	SYNC_COM_CODE_DECL(5)	/* Break all the callbacks on this volume */
+#define FSYNC_VOL_DONE		SYNC_COM_CODE_DECL(6)	/* Done with this volume (used after a delete).
+							 * Don't put online, but remove from list */
+#define FSYNC_VOL_QUERY         SYNC_COM_CODE_DECL(7)   /* query the volume state */
+#define FSYNC_VOL_QUERY_HDR     SYNC_COM_CODE_DECL(8)   /* query the volume disk data structure */
+#define FSYNC_VOL_QUERY_VOP     SYNC_COM_CODE_DECL(9)   /* query the volume for pending vol op info */
+#define FSYNC_VOL_STATS_GENERAL SYNC_COM_CODE_DECL(10)  /* query the general volume package statistics */
+#define FSYNC_VOL_STATS_VICEP   SYNC_COM_CODE_DECL(11)  /* query the per-partition volume package stats */
+#define FSYNC_VOL_STATS_HASH    SYNC_COM_CODE_DECL(12)  /* query the per hash-chain volume package stats */
+#define FSYNC_VOL_STATS_HDR     SYNC_COM_CODE_DECL(13)  /* query the volume header cache statistics */
+#define FSYNC_VOL_STATS_VLRU    SYNC_COM_CODE_DECL(14)  /* query the VLRU statistics */
 
-#define FSYNC_DENIED		0
-#define FSYNC_OK		1
+/* FSYNC reason codes */
+#define FSYNC_WHATEVER		SYNC_REASON_CODE_DECL(0)  /* XXXX */
+#define FSYNC_SALVAGE		SYNC_REASON_CODE_DECL(1)  /* volume is being salvaged */
+#define FSYNC_MOVE		SYNC_REASON_CODE_DECL(2)  /* volume is being moved */
+#define FSYNC_OPERATOR		SYNC_REASON_CODE_DECL(3)  /* operator forced volume offline */
+#define FSYNC_EXCLUSIVE         SYNC_REASON_CODE_DECL(4)  /* somebody else has the volume offline */
+#define FSYNC_UNKNOWN_VOLID     SYNC_REASON_CODE_DECL(5)  /* volume id not known by fileserver */
+#define FSYNC_HDR_NOT_ATTACHED  SYNC_REASON_CODE_DECL(6)  /* volume header not currently attached */
+#define FSYNC_NO_PENDING_VOL_OP SYNC_REASON_CODE_DECL(7)  /* no volume operation pending */
+#define FSYNC_VOL_PKG_ERROR     SYNC_REASON_CODE_DECL(8)  /* error in the volume package */
+
+/* FSYNC response codes */
+
+/* FSYNC flag codes */
 
 
-/* Prototypes from fssync.c */
-void FSYNC_clientFinis(void);
-int FSYNC_clientInit(void);
-void FSYNC_fsInit(void);
-int FSYNC_askfs(VolumeId volume, char *partName, int com, int reason);
+
+struct offlineInfo {
+    afs_uint32 volumeID;	/* volume id; FSYNC_Drop() treats 0 as an unused entry */
+    char partName[16];		/* partition argument FSYNC_Drop() passes to VAttachVolumeByName_r() */
+};
+
+typedef struct FSSYNC_VolOp_hdr {
+    afs_uint32 volume;          /* volume id associated with request */
+    char partName[16];		/* partition name, e.g. /vicepa */
+} FSSYNC_VolOp_hdr;
+
+typedef struct FSSYNC_VolOp_command {
+    SYNC_command_hdr * hdr;	/* command header of the request */
+    FSSYNC_VolOp_hdr * vop;	/* volume-op header (volume id, partition name) of the request */
+    SYNC_command * com;		/* presumably the enclosing command, as in FSSYNC_StatsOp_command -- TODO confirm */
+    struct offlineInfo * v;	/* presumably this volume's entry in the caller's offline list, if any -- TODO confirm */
+    struct offlineInfo * volumes;	/* presumably the caller's whole offline list -- TODO confirm */
+} FSSYNC_VolOp_command;
+
+typedef struct FSSYNC_VolOp_info {
+    SYNC_command_hdr com;	/* by-value copy of a request's command header */
+    FSSYNC_VolOp_hdr vop;	/* by-value copy of a request's volume-op header */
+} FSSYNC_VolOp_info;
+
+
+typedef struct FSSYNC_StatsOp_hdr {
+    union {	/* which member is meaningful depends on the stats command sent */
+	afs_uint32 vlru_generation;	/* presumably for FSYNC_VOL_STATS_VLRU -- TODO confirm */
+	afs_uint32 hash_bucket;	/* hash chain index for FSYNC_VOL_STATS_HASH */
+	char partName[16];	/* partition name for FSYNC_VOL_STATS_VICEP */
+    } args;
+} FSSYNC_StatsOp_hdr;
+
+typedef struct FSSYNC_StatsOp_command {
+    SYNC_command_hdr * hdr;	/* points at the command's header (&com->hdr) */
+    FSSYNC_StatsOp_hdr * sop;	/* points at the command's payload, viewed as a stats-op header */
+    SYNC_command * com;		/* the enclosing command */
+} FSSYNC_StatsOp_command;
+
+
+
+/*
+ * common interfaces
+ */
+extern void FSYNC_Init(void);
+
+/* 
+ * fsync client interfaces 
+ */
+extern void FSYNC_clientFinis(void);
+extern int FSYNC_clientInit(void);
+extern int FSYNC_clientChildProcReconnect(void);
+
+/* generic low-level interface */
+extern afs_int32 FSYNC_askfs(SYNC_command * com, SYNC_response * res);
+
+/* generic higher-level interface */
+extern afs_int32 FSYNC_GenericOp(void * ext_hdr, size_t ext_len,
+				 int command, int reason,
+				 SYNC_response * res);
+
+/* volume operations interface */
+extern afs_int32 FSYNC_VolOp(VolumeId volume, char *partName, int com, int reason, 
+			     SYNC_response * res);
+
+/* statistics query interface */
+extern afs_int32 FSYNC_StatsOp(FSSYNC_StatsOp_hdr * scom, int command, int reason,
+			       SYNC_response * res_in);
+
+#endif /* __fssync_h_ */
diff --git a/src/vol/nuke.c b/src/vol/nuke.c
index f787b5ae39..5b52e46a06 100644
--- a/src/vol/nuke.c
+++ b/src/vol/nuke.c
@@ -41,6 +41,7 @@ RCSID
 #include "partition.h"
 #include "viceinode.h"
 #include "salvage.h"
+#include "daemon_com.h"
 #include "fssync.h"
 
 #ifdef O_LARGEFILE
diff --git a/src/vol/partition.c b/src/vol/partition.c
index f8aa3a81dd..9eea9f577d 100644
--- a/src/vol/partition.c
+++ b/src/vol/partition.c
@@ -7,6 +7,7 @@
  * directory or online at http://www.openafs.org/dl/license10.html
  *
  * Portions Copyright (c) 2003 Apple Computer, Inc.
+ * Portions Copyright (c) 2006 Sine Nomine Associates
  */
 
 /*
@@ -189,6 +190,14 @@ RCSID
 int aixlow_water = 8;		/* default 8% */
 struct DiskPartition *DiskPartitionList;
 
+#ifdef AFS_DEMAND_ATTACH_FS
+static struct DiskPartition *DiskPartitionTable[VOLMAXPARTS+1];
+
+static struct DiskPartition * VLookupPartition_r(char * path);
+static void AddPartitionToTable_r(struct DiskPartition *);
+static void DeletePartitionFromTable_r(struct DiskPartition *);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
 #ifdef AFS_SGI_XFS_IOPS_ENV
 /* Verify that the on disk XFS inodes on the partition are large enough to
  * hold the AFS attribute. Returns -1 if the attribute can't be set or is
@@ -225,8 +234,16 @@ VerifyXFSInodeSize(char *part, char *fstype)
     }
     return code;
 }
-#endif
+#endif /* AFS_SGI_XFS_IOPS_ENV */
 
+int
+VInitPartitionPackage(void)	/* zero the by-index partition table (demand-attach builds only); always returns 0 */
+{
+#ifdef AFS_DEMAND_ATTACH_FS	/* same macro that guards the DiskPartitionTable definition */
+    memset(&DiskPartitionTable, 0, sizeof(DiskPartitionTable));
+#endif /* AFS_DEMAND_ATTACH_FS */
+    return 0;
+}
 
 static void
 VInitPartition_r(char *path, char *devname, Device dev)
@@ -245,6 +262,7 @@ VInitPartition_r(char *path, char *devname, Device dev)
     dp->next = 0;
     dp->name = (char *)malloc(strlen(path) + 1);
     strncpy(dp->name, path, strlen(path) + 1);
+    dp->index = volutil_GetPartitionID(path);
 #if defined(AFS_NAMEI_ENV) && !defined(AFS_NT40_ENV)
     /* Create a lockfile for the partition, of the form /vicepa/Lock/vicepa */
     dp->devName = (char *)malloc(2 * strlen(path) + 6);
@@ -254,7 +272,7 @@ VInitPartition_r(char *path, char *devname, Device dev)
     mkdir(dp->devName, 0700);
     strcat(dp->devName, path);
     close(afs_open(dp->devName, O_RDWR | O_CREAT, 0600));
-    dp->device = volutil_GetPartitionID(path);
+    dp->device = dp->index;
 #else
     dp->devName = (char *)malloc(strlen(devname) + 1);
     strncpy(dp->devName, devname, strlen(devname) + 1);
@@ -268,6 +286,11 @@ VInitPartition_r(char *path, char *devname, Device dev)
 	(void)namei_ViceREADME(VPartitionPath(dp));
 #endif
     VSetPartitionDiskUsage_r(dp);
+#ifdef AFS_DEMAND_ATTACH_FS
+    AddPartitionToTable_r(dp);
+    queue_Init(&dp->vol_list);
+    assert(pthread_cond_init(&dp->vol_list.cv, NULL) == 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
 }
 
 static void
@@ -352,7 +375,7 @@ VCheckPartition(char *part, char *devname)
 	return -1;
 #endif
 #endif /* AFS_NAMEI_ENV */
-#endif
+#endif /* !AFS_LINUX20_ENV && !AFS_NT40_ENV */
 
 #if defined(AFS_DUX40_ENV) && !defined(AFS_NAMEI_ENV)
     if (status.st_ino != ROOTINO) {
@@ -825,10 +848,14 @@ struct DiskPartition *
 VGetPartition_r(char *name, int abortp)
 {
     register struct DiskPartition *dp;
+#ifdef AFS_DEMAND_ATTACH_FS
+    dp = VLookupPartition_r(name);
+#else /* AFS_DEMAND_ATTACH_FS */
     for (dp = DiskPartitionList; dp; dp = dp->next) {
 	if (strcmp(dp->name, name) == 0)
 	    break;
     }
+#endif /* AFS_DEMAND_ATTACH_FS */
     if (abortp)
 	assert(dp != NULL);
     return dp;
@@ -1234,3 +1261,60 @@ VUnlockPartition(char *name)
     VUnlockPartition_r(name);
     VOL_UNLOCK;
 }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* XXX not sure this will work on AFS_NT40_ENV
+ * needs to be tested!
+ */
+/* by-index lookup; yields NULL for an out-of-range id or an empty slot */
+struct DiskPartition * 
+VGetPartitionById_r(afs_int32 id, int abortp)
+{
+    struct DiskPartition * dp;
+
+    dp = (id < 0 || id > VOLMAXPARTS) ? NULL : DiskPartitionTable[id];
+
+    /* with abortp set, a miss is treated as fatal */
+    if (abortp) {
+	assert(dp != NULL);
+    }
+    return dp;
+}
+
+struct DiskPartition *
+VGetPartitionById(afs_int32 id, int abortp)
+{
+    struct DiskPartition * dp;
+    /* wrap the _r variant in VOL_LOCK/VOL_UNLOCK; abortp != 0 makes it assert on a miss */
+    VOL_LOCK;
+    dp = VGetPartitionById_r(id, abortp);
+    VOL_UNLOCK;
+
+    return dp;
+}
+
+/* map a partition path to its table slot via volutil_GetPartitionID(); NULL if the id is out of range */
+static struct DiskPartition * 
+VLookupPartition_r(char * path)
+{
+    afs_int32 idx;
+
+    idx = volutil_GetPartitionID(path);
+
+    return (idx >= 0 && idx <= VOLMAXPARTS) ? DiskPartitionTable[idx] : NULL;
+}
+
+static void 
+AddPartitionToTable_r(struct DiskPartition * dp)	/* publish dp in DiskPartitionTable at slot dp->index */
+{
+    assert(dp->index >= 0 && dp->index <= VOLMAXPARTS);	/* the table has VOLMAXPARTS+1 slots */
+    DiskPartitionTable[dp->index] = dp;
+}
+
+static void 
+DeletePartitionFromTable_r(struct DiskPartition * dp)	/* clear dp's slot in DiskPartitionTable; dp itself is not freed here */
+{
+    assert(dp->index >= 0 && dp->index <= VOLMAXPARTS);	/* the table has VOLMAXPARTS+1 slots */
+    DiskPartitionTable[dp->index] = NULL;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/vol/partition.h b/src/vol/partition.h
index 547ec94c18..7d869dfae9 100644
--- a/src/vol/partition.h
+++ b/src/vol/partition.h
@@ -5,6 +5,8 @@
  * This software has been released under the terms of the IBM Public
  * License.  For details, see the LICENSE file in the top-level source
  * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
  */
 
 /*
@@ -27,6 +29,7 @@
 #define	AFS_RDSKDEV	"/dev/r"
 #endif
 
+
 /* All Vice partitions on a server will have the following name prefix */
 #define VICE_PARTITION_PREFIX	"/vicep"
 #define VICE_PREFIX_SIZE	(sizeof(VICE_PARTITION_PREFIX)-1)
@@ -53,6 +56,7 @@ struct DiskPartition {
     char *name;			/* Mounted partition name */
     char *devName;		/* Device mounted on */
     Device device;		/* device number */
+    afs_int32 index;            /* partition index (0<=x<=VOLMAXPARTS) */
     int lock_fd;		/* File descriptor of this partition if locked; otherwise -1;
 				 * Not used by the file server */
     int free;			/* Total number of blocks (1K) presumed
@@ -77,7 +81,26 @@ struct DiskPartition {
 				 * from the superblock */
     int flags;
     int f_files;		/* total number of files in this partition */
+#ifdef AFS_DEMAND_ATTACH_FS
+    struct {
+	struct rx_queue head;   /* list of volumes on this partition (VByPList) */
+	afs_uint32 len;         /* length of volume list */
+	int busy;               /* asynch vol list op in progress */
+	pthread_cond_t cv;      /* vol_list.busy change cond var */
+    } vol_list;
+#endif /* AFS_DEMAND_ATTACH_FS */
 };
+
+struct DiskPartitionStats {
+    afs_int32 free;
+    afs_int32 totalUsable;
+    afs_int32 minFree;
+    afs_int32 f_files;
+#ifdef AFS_DEMAND_ATTACH_FS
+    afs_int32 vol_list_len;
+#endif
+};
+
 #define	PART_DONTUPDATE	1
 #define PART_DUPLICATE  2	/* NT - used if we find more than one partition 
 				 * using the same drive. Will be dumped before
@@ -93,7 +116,12 @@ extern int VValidVPTEntry(struct vptab *vptp);
 struct Volume;			/* Potentially forward definition */
 
 extern struct DiskPartition *DiskPartitionList;
-extern struct DiskPartition *VGetPartition();
+extern struct DiskPartition *VGetPartition(char * name, int abortp);
+extern struct DiskPartition *VGetPartition_r(char * name, int abortp);
+#ifdef AFS_DEMAND_ATTACH_FS
+extern struct DiskPartition *VGetPartitionById(afs_int32 index, int abortp);
+extern struct DiskPartition *VGetPartitionById_r(afs_int32 index, int abortp);
+#endif
 extern int VAttachPartitions(void);
 extern void VLockPartition(char *name);
 extern void VLockPartition_r(char *name);
@@ -108,3 +136,4 @@ extern void VAdjustDiskUsage(Error * ec, struct Volume *vp,
 			     afs_sfsize_t blocks, afs_sfsize_t checkBlocks);
 extern int VDiskUsage(struct Volume *vp, afs_sfsize_t blocks);
 extern void VPrintDiskStats(void);
+extern int VInitPartitionPackage(void);
diff --git a/src/vol/purge.c b/src/vol/purge.c
index 01bb22efa3..4b13fcf2bc 100644
--- a/src/vol/purge.c
+++ b/src/vol/purge.c
@@ -52,11 +52,16 @@ RCSID
 #include "volume.h"
 #include "viceinode.h"
 #include "partition.h"
+#include "daemon_com.h"
 #include "fssync.h"
 
 /* forward declarations */
-void PurgeIndex_r(Volume * vp, VnodeClass class);
-void PurgeHeader_r(Volume * vp);
+static int ObliterateRegion(Volume * avp, VnodeClass aclass, StreamHandle_t * afile,
+			    afs_int32 * aoffset);
+static void PurgeIndex(Volume * vp, VnodeClass class);
+static void PurgeIndex_r(Volume * vp, VnodeClass class);
+static void PurgeHeader_r(Volume * vp);
+static void PurgeHeader(Volume * vp);
 
 void
 VPurgeVolume_r(Error * ec, Volume * vp)
@@ -78,7 +83,7 @@ VPurgeVolume_r(Error * ec, Volume * vp)
     /*
      * Call the fileserver to break all call backs for that volume
      */
-    FSYNC_askfs(V_id(vp), tpartp->name, FSYNC_RESTOREVOLUME, 0);
+    FSYNC_VolOp(V_id(vp), tpartp->name, FSYNC_VOL_BREAKCBKS, 0, NULL);
 }
 
 void
@@ -161,7 +166,7 @@ ObliterateRegion(Volume * avp, VnodeClass aclass, StreamHandle_t * afile,
     return -1;
 }
 
-void
+static void
 PurgeIndex(Volume * vp, VnodeClass class)
 {
     VOL_LOCK;
@@ -169,7 +174,7 @@ PurgeIndex(Volume * vp, VnodeClass class)
     VOL_UNLOCK;
 }
 
-void
+static void
 PurgeIndex_r(Volume * vp, VnodeClass class)
 {
     StreamHandle_t *ifile;
@@ -199,7 +204,7 @@ PurgeIndex_r(Volume * vp, VnodeClass class)
     FDH_CLOSE(fdP);
 }
 
-void
+static void
 PurgeHeader(Volume * vp)
 {
     VOL_LOCK;
@@ -207,7 +212,7 @@ PurgeHeader(Volume * vp)
     VOL_UNLOCK;
 }
 
-void
+static void
 PurgeHeader_r(Volume * vp)
 {
     IH_REALLYCLOSE(V_diskDataHandle(vp));
diff --git a/src/vol/salvage.h b/src/vol/salvage.h
index a18a24574c..ce53539070 100644
--- a/src/vol/salvage.h
+++ b/src/vol/salvage.h
@@ -14,6 +14,9 @@
 
  */
 
+#ifndef __salvage_h_
+#define __salvage_h_
+
 #include <afs/afssyscalls.h>
 /* Definition of DirHandle for salvager.  Not the same as for the file server */
 
@@ -24,3 +27,5 @@ typedef struct DirHandle {
     IHandle_t *dirh_handle;
     afs_int32 dirh_cacheCheck;
 } DirHandle;
+
+#endif /* __salvage_h_ */
diff --git a/src/vol/salvaged.c b/src/vol/salvaged.c
new file mode 100644
index 0000000000..d5b318b39e
--- /dev/null
+++ b/src/vol/salvaged.c
@@ -0,0 +1,738 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/* 
+ * demand attach fs
+ * online salvager daemon
+ */
+
+/* Main program file. Define globals. */
+#define MAIN 1
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <errno.h>
+#ifdef AFS_NT40_ENV
+#include <io.h>
+#include <WINNT/afsevent.h>
+#else
+#include <sys/param.h>
+#include <sys/file.h>
+#ifndef ITIMER_REAL
+#include <sys/time.h>
+#endif /* ITIMER_REAL */
+#endif
+#if	defined(AFS_AIX_ENV) || defined(AFS_SUN4_ENV)
+#define WCOREDUMP(x)	(x & 0200)
+#endif
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include <afs/assert.h>
+#if !defined(AFS_SGI_ENV) && !defined(AFS_NT40_ENV)
+#if defined(AFS_VFSINCL_ENV)
+#include <sys/vnode.h>
+#ifdef	AFS_SUN5_ENV
+#include <sys/fs/ufs_inode.h>
+#else
+#if defined(AFS_DARWIN_ENV) || defined(AFS_XBSD_ENV)
+#include <ufs/ufs/dinode.h>
+#include <ufs/ffs/fs.h>
+#else
+#include <ufs/inode.h>
+#endif
+#endif
+#else /* AFS_VFSINCL_ENV */
+#ifdef	AFS_OSF_ENV
+#include <ufs/inode.h>
+#else /* AFS_OSF_ENV */
+#if !defined(AFS_LINUX20_ENV) && !defined(AFS_XBSD_ENV)
+#include <sys/inode.h>
+#endif
+#endif
+#endif /* AFS_VFSINCL_ENV */
+#endif /* AFS_SGI_ENV */
+#ifdef	AFS_AIX_ENV
+#include <sys/vfs.h>
+#include <sys/lockf.h>
+#else
+#ifdef	AFS_HPUX_ENV
+#include <unistd.h>
+#include <checklist.h>
+#else
+#if defined(AFS_SGI_ENV)
+#include <unistd.h>
+#include <fcntl.h>
+#include <mntent.h>
+#else
+#if	defined(AFS_SUN_ENV) || defined(AFS_SUN5_ENV)
+#ifdef	  AFS_SUN5_ENV
+#include <unistd.h>
+#include <sys/mnttab.h>
+#include <sys/mntent.h>
+#else
+#include <mntent.h>
+#endif
+#else
+#endif /* AFS_SGI_ENV */
+#endif /* AFS_HPUX_ENV */
+#endif
+#endif
+#include <fcntl.h>
+#ifndef AFS_NT40_ENV
+#include <afs/osi_inode.h>
+#endif
+#include <afs/cmd.h>
+#include <afs/afsutil.h>
+#include <afs/fileutil.h>
+#include <afs/procmgmt.h>	/* signal(), kill(), wait(), etc. */
+#ifndef AFS_NT40_ENV
+#include <syslog.h>
+#endif
+
+#include "nfs.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include "daemon_com.h"
+#include "fssync.h"
+#include "salvsync.h"
+#include "viceinode.h"
+#include "salvage.h"
+#include "volinodes.h"		/* header magic number, etc. stuff */
+#include "vol-salvage.h"
+#ifdef AFS_NT40_ENV
+#include <pthread.h>
+#endif
+
+
+#if !defined(AFS_DEMAND_ATTACH_FS)
+#error "online salvager only supported for demand attach fileserver"
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+#if defined(AFS_NT40_ENV)
+#error "online salvager not supported on NT"
+#endif /* AFS_NT40_ENV */
+
+
+/* Forward declarations */
+/*@printflike@*/ void Log(const char *format, ...);
+/*@printflike@*/ void Abort(const char *format, ...);
+
+
+/*@+fcnmacros +macrofcndecl@*/
+#ifdef O_LARGEFILE
+#define afs_fopen	fopen64
+#else /* !O_LARGEFILE */
+#define afs_fopen	fopen
+#endif /* !O_LARGEFILE */
+/*@=fcnmacros =macrofcndecl@*/
+
+
+
+static volatile int current_workers = 0;
+static volatile struct rx_queue pending_q;
+static pthread_mutex_t worker_lock;
+static pthread_cond_t worker_cv;
+
+static void * SalvageChildReaperThread(void *);
+static int DoSalvageVolume(struct SalvageQueueNode * node, int slot);
+
+static void SalvageServer(void);
+static void SalvageClient(VolumeId vid, char * pname);
+
+static int Reap_Child(char * prog, int * pid, int * status);
+
+static void * SalvageLogCleanupThread(void *);
+static int SalvageLogCleanup(int pid);
+
+struct log_cleanup_node {
+    struct rx_queue q;
+    int pid;
+};
+
+struct {
+    struct rx_queue queue_head;
+    pthread_cond_t queue_change_cv;
+} log_cleanup_queue;
+
+
+#define DEFAULT_PARALLELISM 4 /* allow 4 parallel salvage workers by default */
+
+/* cmd handler: copy the options into the salvager globals, then run as SALVSYNC client (-client) or as the salvageserver; returns 0 (only reachable after a -client run, SalvageServer() loops forever) */
+static int
+handleit(struct cmd_syndesc *as)
+{
+    register struct cmd_item *ti;
+    char pname[100], *temp;
+    afs_int32 seenpart = 0, seenvol = 0;
+    VolumeId vid = 0;		/* same type SalvageClient() takes */
+
+#ifdef AFS_SGI_VNODE_GLUE
+    if (afs_init_kernel_config(-1) < 0) {
+	printf
+	    ("Can't determine NUMA configuration, not starting salvager.\n");
+	exit(1);
+    }
+#endif
+
+    if (as->parms[2].items)	/* -debug */
+	debug = 1;
+    if (as->parms[3].items)	/* -nowrite */
+	Testing = 1;
+    if (as->parms[4].items)	/* -inodes */
+	ListInodeOption = 1;
+    if (as->parms[5].items)	/* -oktozap */
+	OKToZap = 1;
+    if (as->parms[6].items)	/* -rootinodes */
+	ShowRootFiles = 1;
+    if (as->parms[8].items)	/* -ForceReads */
+	forceR = 1;
+    if ((ti = as->parms[9].items)) {	/* -Parallel # */
+	temp = ti->data;
+	if (strncmp(temp, "all", 3) == 0) {
+	    PartsPerDisk = 1;
+	    temp += 3;
+	}
+	if (strlen(temp) != 0) {
+	    Parallel = atoi(temp);
+	    if (Parallel < 1)
+		Parallel = 1;
+	    if (Parallel > MAXPARALLEL) {
+		printf("Setting parallel salvages to maximum of %d \n",
+		       MAXPARALLEL);
+		Parallel = MAXPARALLEL;
+	    }
+	}
+    } else {
+	Parallel = MIN(DEFAULT_PARALLELISM, MAXPARALLEL);
+    }
+    if ((ti = as->parms[10].items)) {	/* -tmpdir */
+	DIR *dirp;
+
+	tmpdir = ti->data;
+	dirp = opendir(tmpdir);
+	if (!dirp) {
+	    printf
+		("Can't open temporary placeholder dir %s; using current partition \n",
+		 tmpdir);
+	    tmpdir = NULL;
+	} else
+	    closedir(dirp);
+    }
+    if ((ti = as->parms[11].items))	/* -showlog */
+	ShowLog = 1;
+    if ((ti = as->parms[12].items)) {	/* -orphans */
+	if (Testing)
+	    orphans = ORPH_IGNORE;
+	else if (strcmp(ti->data, "remove") == 0
+		 || strcmp(ti->data, "r") == 0)
+	    orphans = ORPH_REMOVE;
+	else if (strcmp(ti->data, "attach") == 0
+		 || strcmp(ti->data, "a") == 0)
+	    orphans = ORPH_ATTACH;
+    }
+#ifndef AFS_NT40_ENV		/* ignore options on NT */
+    if ((ti = as->parms[13].items)) {	/* -syslog */
+	useSyslog = 1;
+	ShowLog = 0;
+    }
+    if ((ti = as->parms[14].items)) {	/* -syslogfacility */
+	useSyslogFacility = atoi(ti->data);
+    }
+
+    if ((ti = as->parms[15].items)) {	/* -datelogs */
+	TimeStampLogFile(AFSDIR_SERVER_SALSRVLOG_FILEPATH);
+    }
+#endif
+
+    if ((ti = as->parms[16].items)) {   /* -client */
+	if ((ti = as->parms[0].items)) {	/* -partition */
+	    seenpart = 1;
+	    strlcpy(pname, ti->data, sizeof(pname));
+	}
+	if ((ti = as->parms[1].items)) {	/* -volumeid */
+	    seenvol = 1;
+	    vid = (VolumeId) strtoul(ti->data, NULL, 10);	/* atoi() is undefined for ids above INT_MAX */
+	}
+
+	if (!seenpart || !seenvol) {
+	    printf("You must specify '-partition' and '-volumeid' with the '-client' option\n");
+	    exit(-1);
+	}
+
+	SalvageClient(vid, pname);
+
+    } else {  /* salvageserver mode */
+	SalvageServer();
+    }
+    return (0);
+}
+
+
+#ifndef AFS_NT40_ENV
+#include "AFS_component_version_number.c"
+#endif
+#define MAX_ARGS 128
+#ifdef AFS_NT40_ENV
+char *save_args[MAX_ARGS];
+int n_save_args = 0;
+pthread_t main_thread;
+#endif
+
+static char commandLine[150];
+
+int
+main(int argc, char **argv)	/* entry point: set up paths, record the command line, register the option table and dispatch to handleit() */
+{
+    struct cmd_syndesc *ts;
+    int err = 0;
+
+    int i;
+    extern char cml_version_number[];
+
+#ifdef	AFS_AIX32_ENV
+    /*
+     * The following signal action for AIX is necessary so that in case of a
+     * crash (i.e. core is generated) we can include the user's data section
+     * in the core dump. Unfortunately, by default, only a partial core is
+     * generated which, in many cases, isn't too useful.
+     */
+    struct sigaction nsa;
+
+    sigemptyset(&nsa.sa_mask);
+    nsa.sa_handler = SIG_DFL;
+    nsa.sa_flags = SA_FULLDUMP;
+    sigaction(SIGABRT, &nsa, NULL);
+    sigaction(SIGSEGV, &nsa, NULL);
+#endif
+
+    /* Initialize directory paths */
+    if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) {
+#ifdef AFS_NT40_ENV
+	ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0);
+#endif
+	fprintf(stderr, "%s: Unable to obtain AFS server directory.\n",
+		argv[0]);
+	exit(2);
+    }
+#ifdef AFS_NT40_ENV
+    main_thread = pthread_self();
+    if (spawnDatap && spawnDataLen) {
+	/* This is a child per partition salvager. Don't setup log or
+	 * try to lock the salvager lock.
+	 */
+	if (nt_SetupPartitionSalvage(spawnDatap, spawnDataLen) < 0)
+	    exit(3);
+    } else {
+#endif
+	for (commandLine[0] = '\0', i = 0; i < argc; i++) {	/* rebuild the command line for the startup log message; strlcat truncates at sizeof(commandLine) */
+	    if (i > 0)
+		strlcat(commandLine, " ", sizeof(commandLine));
+	    strlcat(commandLine, argv[i], sizeof(commandLine));
+	}
+
+#ifndef AFS_NT40_ENV
+	if (geteuid() != 0) {
+	    printf("Salvager must be run as root.\n");
+	    fflush(stdout);
+	    Exit(0);	/* NOTE(review): exit status 0 although the run was refused */
+	}
+#endif
+
+	/* bad for normal help flag processing, but can do nada */
+
+#ifdef AFS_NT40_ENV
+    }
+#endif
+
+    ts = cmd_CreateSyntax("initcmd", handleit, 0, "initialize the program");	/* the order of the cmd_AddParm calls below fixes the as->parms[] indices used in handleit() */
+    cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL,
+		"Name of partition to salvage");
+    cmd_AddParm(ts, "-volumeid", CMD_SINGLE, CMD_OPTIONAL,
+		"Volume Id to salvage");
+    cmd_AddParm(ts, "-debug", CMD_FLAG, CMD_OPTIONAL,
+		"Run in Debugging mode");
+    cmd_AddParm(ts, "-nowrite", CMD_FLAG, CMD_OPTIONAL,
+		"Run readonly/test mode");
+    cmd_AddParm(ts, "-inodes", CMD_FLAG, CMD_OPTIONAL,
+		"Just list affected afs inodes - debugging flag");
+    cmd_AddParm(ts, "-oktozap", CMD_FLAG, CMD_OPTIONAL,
+		"Give permission to destroy bogus inodes/volumes - debugging flag");
+    cmd_AddParm(ts, "-rootinodes", CMD_FLAG, CMD_OPTIONAL,
+		"Show inodes owned by root - debugging flag");
+    cmd_AddParm(ts, "-salvagedirs", CMD_FLAG, CMD_OPTIONAL,
+		"Force rebuild/salvage of all directories");
+    cmd_AddParm(ts, "-blockreads", CMD_FLAG, CMD_OPTIONAL,
+		"Read smaller blocks to handle IO/bad blocks");
+    cmd_AddParm(ts, "-parallel", CMD_SINGLE, CMD_OPTIONAL,
+		"# of max parallel partition salvaging");
+    cmd_AddParm(ts, "-tmpdir", CMD_SINGLE, CMD_OPTIONAL,
+		"Name of dir to place tmp files ");
+    cmd_AddParm(ts, "-showlog", CMD_FLAG, CMD_OPTIONAL,
+		"Show log file upon completion");
+    cmd_AddParm(ts, "-orphans", CMD_SINGLE, CMD_OPTIONAL,
+		"ignore | remove | attach");
+
+    /* note - syslog isn't avail on NT, but if we make it conditional, have
+     * to deal with screwy offsets for cmd params */
+    cmd_AddParm(ts, "-syslog", CMD_FLAG, CMD_OPTIONAL,
+		"Write salvage log to syslogs");
+    cmd_AddParm(ts, "-syslogfacility", CMD_SINGLE, CMD_OPTIONAL,
+		"Syslog facility number to use");
+    cmd_AddParm(ts, "-datelogs", CMD_FLAG, CMD_OPTIONAL,
+		"Include timestamp in logfile filename");
+
+    cmd_AddParm(ts, "-client", CMD_FLAG, CMD_OPTIONAL,
+		"Use SALVSYNC to ask salvageserver to salvage a volume");
+
+    err = cmd_Dispatch(argc, argv);
+    Exit(err);
+}
+
+static void
+SalvageClient(VolumeId vid, char * pname)	/* -client mode: request a salvage of vid on pname, then poll until it ends; exits -1 on any SYNC failure */
+{
+    int done = 0;
+    afs_int32 code;
+    SYNC_response res;
+    SALVSYNC_response_hdr sres;
+
+    VInitVolumePackage(volumeUtility, 5, 5, DONT_CONNECT_FS, 0);
+    SALVSYNC_clientInit();
+    
+    code = SALVSYNC_SalvageVolume(vid, pname, SALVSYNC_SALVAGE, SALVSYNC_OPERATOR, 0, NULL);	/* submit the request; no response buffer is passed */
+    if (code != SYNC_OK) {
+	goto sync_error;
+    }
+
+    res.payload.buf = (void *) &sres;	/* query replies are received into sres */
+    res.payload.len = sizeof(sres);
+
+    while(!done) {
+	sleep(2);	/* poll interval between status queries */
+	code = SALVSYNC_SalvageVolume(vid, pname, SALVSYNC_QUERY, SALVSYNC_WHATEVER, 0, &res);
+	if (code != SYNC_OK) {
+	    goto sync_error;
+	}
+	switch (sres.state) {	/* any state not listed below keeps polling */
+	case SALVSYNC_STATE_ERROR:
+	    printf("salvageserver reports salvage ended in an error; check log files for more details\n");
+	case SALVSYNC_STATE_DONE:	/* ERROR falls through: all three states end the poll loop */
+	case SALVSYNC_STATE_UNKNOWN:
+	    done = 1;
+	}
+    }
+    SALVSYNC_clientFinis();
+    return;
+
+ sync_error:
+    if (code == SYNC_DENIED) {	/* codes other than the three tested here exit without a message */
+	printf("salvageserver refused to salvage volume %u on partition %s\n",
+	       vid, pname);
+    } else if (code == SYNC_BAD_COMMAND) {
+	printf("SALVSYNC protocol mismatch; please make sure fileserver, volserver, salvageserver and salvager are same version\n");
+    } else if (code == SYNC_COM_ERROR) {
+	printf("SALVSYNC communications error\n");
+    }
+    SALVSYNC_clientFinis();
+    exit(-1);
+}
+
+static int * child_slot;
+
+static void
+SalvageServer(void)	/* salvageserver main loop: fork one child per SALVSYNC work item, throttled by Parallel; never returns */
+{
+    int pid, ret;
+    struct SalvageQueueNode * node;
+    pthread_t tid;
+    pthread_attr_t attrs;
+    int slot;
+
+    /* All entries to the log will be appended.  Useful if there are
+     * multiple salvagers appending to the log.
+     */
+
+    CheckLogFile(AFSDIR_SERVER_SALSRVLOG_FILEPATH);
+#ifndef AFS_NT40_ENV
+#ifdef AFS_LINUX20_ENV
+    fcntl(fileno(logFile), F_SETFL, O_APPEND);	/* Isn't this redundant? */
+#else
+    fcntl(fileno(logFile), F_SETFL, FAPPEND);	/* Isn't this redundant? */
+#endif
+#endif
+    setlinebuf(logFile);
+
+    fprintf(logFile, "%s\n", cml_version_number);
+    Log("Starting OpenAFS Online Salvage Server %s (%s)\n", SalvageVersion, commandLine);
+    
+    /* Get and hold a lock for the duration of the salvage to make sure
+     * that no other salvage runs at the same time.  The routine
+     * VInitVolumePackage (called below) makes sure that a file server or
+     * other volume utilities don't interfere with the salvage.
+     */
+    
+    /* even demand attach online salvager
+     * still needs this because we don't want
+     * a stand-alone salvager to conflict with
+     * the salvager daemon */
+    ObtainSalvageLock();
+
+    child_slot = (int *) malloc(Parallel * sizeof(int));	/* child_slot[i] holds the pid running in slot i, 0 when free */
+    assert(child_slot != NULL);
+    memset(child_slot, 0, Parallel * sizeof(int));
+	    
+    /* initialize things */
+    VInitVolumePackage(salvageServer, 5, 5,
+		       1, 0);
+    DInit(10);
+    queue_Init(&pending_q);
+    queue_Init(&log_cleanup_queue);
+    assert(pthread_mutex_init(&worker_lock, NULL) == 0);
+    assert(pthread_cond_init(&worker_cv, NULL) == 0);
+    assert(pthread_cond_init(&log_cleanup_queue.queue_change_cv, NULL) == 0);
+    assert(pthread_attr_init(&attrs) == 0);
+
+    /* start up the reaper and log cleaner threads */
+    assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);	/* both helper threads are detached; tid is not kept */
+    assert(pthread_create(&tid, 
+			  &attrs, 
+			  &SalvageChildReaperThread,
+			  NULL) == 0);
+    assert(pthread_create(&tid, 
+			  &attrs, 
+			  &SalvageLogCleanupThread,
+			  NULL) == 0);
+
+    /* loop forever serving requests */
+    while (1) {
+	node = SALVSYNC_getWork();
+	assert(node != NULL);
+
+	VOL_LOCK;	/* child_slot[] is read and written under VOL_LOCK, here and in the reaper thread */
+	/* find a slot */
+	for (slot = 0; slot < Parallel; slot++) {
+	  if (!child_slot[slot])
+	    break;
+	}
+	assert (slot < Parallel);	/* a free slot must exist: the wait at the bottom of the loop keeps current_workers below Parallel */
+
+	pid = Fork();
+	if (pid == 0) {	/* child: do the salvage and exit with its result */
+	    VOL_UNLOCK;
+	    ret = DoSalvageVolume(node, slot);
+	    Exit(ret);
+	} else if (pid < 0) {	/* fork failed */
+	    VOL_UNLOCK;
+	    SALVSYNC_doneWork(node, 1);	/* NOTE(review): second argument presumably reports failure - confirm against salvsync */
+	} else {	/* parent: record the child before letting the reaper look for it */
+	    child_slot[slot] = pid;
+	    node->pid = pid;
+	    VOL_UNLOCK;
+	    
+	    assert(pthread_mutex_lock(&worker_lock) == 0);
+	    current_workers++;
+	    
+	    /* let the reaper thread know another worker was spawned */
+	    assert(pthread_cond_broadcast(&worker_cv) == 0);
+	    
+	    /* if we're overquota, wait for the reaper */
+	    while (current_workers >= Parallel) {
+		assert(pthread_cond_wait(&worker_cv, &worker_lock) == 0);
+	    }
+	    assert(pthread_mutex_unlock(&worker_lock) == 0);
+	}
+    }
+}
+
+static int
+DoSalvageVolume(struct SalvageQueueNode * node, int slot)	/* runs in the forked child: salvage one volume into a per-pid log; returns 1 if the request is rejected up front, else 0; slot is unused */
+{
+    char childLog[AFSDIR_PATH_MAX];
+    int ret;	/* unused */
+    struct DiskPartition * partP;
+
+    VChildProcReconnectFS();
+
+    /* do not attempt to close parent's logFile handle as
+     * another thread may have held the lock on the FILE
+     * structure when fork was called! */
+
+    afs_snprintf(childLog, sizeof(childLog), "%s.%d", 
+		 AFSDIR_SERVER_SLVGLOG_FILEPATH, getpid());	/* same name pattern SalvageLogCleanup() opens in the parent */
+
+    logFile = afs_fopen(childLog, "a");
+    if (!logFile) {		/* still nothing, use stdout */
+	logFile = stdout;
+	ShowLog = 0;
+    }
+
+    if (node->command.sop.volume <= 0) {
+	Log("salvageServer: invalid volume id specified; salvage aborted\n");
+	return 1;	/* early returns skip VDisconnectFS()/fclose(); the caller exits the process right after */
+    }
+    
+    partP = VGetPartition(node->command.sop.partName, 0);	/* abortp == 0: a missing partition returns NULL instead of asserting */
+    if (!partP) {
+	Log("salvageServer: Unknown or unmounted partition %s; salvage aborted\n", 
+	    node->command.sop.partName);
+	return 1;
+    }
+
+    /* Salvage individual volume; don't notify fs */
+    SalvageFileSys1(partP, node->command.sop.volume);
+
+    VDisconnectFS();
+
+    fclose(logFile);
+    return 0;
+}
+
+
+static void *
+SalvageChildReaperThread(void * args)	/* thread body: wait() for salvage children, free their slot, queue their log for merging and report completion; args is unused */
+{
+    int slot, pid, status, code, found;	/* code and found are unused */
+    struct SalvageQueueNode *qp, *nqp;	/* unused */
+    struct log_cleanup_node * cleanup;
+
+    assert(pthread_mutex_lock(&worker_lock) == 0);
+
+    /* loop reaping our children */
+    while (1) {
+	/* wait() won't block unless we have children, so
+	 * block on the cond var if we're childless */
+	while (current_workers == 0) {
+	    assert(pthread_cond_wait(&worker_cv, &worker_lock) == 0);
+	}
+
+	assert(pthread_mutex_unlock(&worker_lock) == 0);	/* do not hold worker_lock across the blocking wait() */
+
+	cleanup = (struct log_cleanup_node *) malloc(sizeof(struct log_cleanup_node));	/* may be NULL; then the log merge for this child is skipped below */
+
+	while (Reap_Child("salvageserver", &pid, &status) < 0) {
+	    /* try to prevent livelock if something goes wrong */
+	    sleep(1);
+	}
+
+	VOL_LOCK;	/* child_slot[] is shared with SalvageServer() under VOL_LOCK */
+	for (slot = 0; slot < Parallel; slot++) {
+	    if (child_slot[slot] == pid)
+		break;
+	}
+	assert(slot < Parallel);	/* the reaped pid must be one recorded by SalvageServer() */
+	child_slot[slot] = 0;
+	VOL_UNLOCK;
+
+	assert(pthread_mutex_lock(&worker_lock) == 0);
+
+	if (cleanup) {
+	    cleanup->pid = pid;
+	    queue_Append(&log_cleanup_queue, cleanup);
+	    assert(pthread_cond_signal(&log_cleanup_queue.queue_change_cv) == 0);	/* wakes SalvageLogCleanupThread */
+	}
+
+	/* ok, we've reaped a child */
+	current_workers--;
+	SALVSYNC_doneWorkByPid(pid, 0);
+	assert(pthread_cond_broadcast(&worker_cv) == 0);	/* wakes SalvageServer() if it is over quota; worker_lock stays held into the next iteration */
+    }
+
+    return NULL;
+}
+
+/* wait() for any child; on success store its pid in *pid, log abnormal exits and return the pid, else log and return wait()'s result */
+static int
+Reap_Child(char *prog, int * pid, int * status)
+{
+    int child = wait(status);
+
+    if (child < 0) {
+	Log("wait returned -1\n");
+	return child;
+    }
+    *pid = child;
+    if (WCOREDUMP(*status))
+	Log("\"%s\" core dumped!\n", prog);
+    if (WIFSIGNALED(*status) || WEXITSTATUS(*status))
+	Log("\"%s\" (pid=%d) terminated abnormally!\n", prog, child);
+    return child;
+}
+
+/*
+ * thread to combine salvager child logs
+ * back into the main salvageserver log
+ */
+static void *
+SalvageLogCleanupThread(void * arg)	/* thread body: drain log_cleanup_queue, merging each queued child's log via SalvageLogCleanup(); arg is unused */
+{
+    struct log_cleanup_node * cleanup;
+
+    assert(pthread_mutex_lock(&worker_lock) == 0);	/* the queue is accessed under worker_lock, as in the reaper thread */
+
+    while (1) {
+	while (queue_IsEmpty(&log_cleanup_queue)) {
+	    assert(pthread_cond_wait(&log_cleanup_queue.queue_change_cv, &worker_lock) == 0);
+	}
+
+	while (queue_IsNotEmpty(&log_cleanup_queue)) {
+	    cleanup = queue_First(&log_cleanup_queue, log_cleanup_node);
+	    queue_Remove(cleanup);
+	    assert(pthread_mutex_unlock(&worker_lock) == 0);	/* drop the lock while copying the file */
+	    SalvageLogCleanup(cleanup->pid);
+	    free(cleanup);
+	    assert(pthread_mutex_lock(&worker_lock) == 0);
+	}	    
+    }
+
+    assert(pthread_mutex_unlock(&worker_lock) == 0);	/* not reached: the loop above has no exit */
+    return NULL;
+}
+
+#define LOG_XFER_BUF_SIZE 65536
+/* append the child log "<SLVGLOG path>.<pid>" to logFile and remove it; returns 1 if it cannot be opened, else 0 */
+static int
+SalvageLogCleanup(int pid)
+{
+    int pidlog, len;
+    char fn[AFSDIR_PATH_MAX];
+    static char buf[LOG_XFER_BUF_SIZE];
+
+    afs_snprintf(fn, sizeof(fn), "%s.%d", 
+		 AFSDIR_SERVER_SLVGLOG_FILEPATH, pid);
+
+    /* unlink right after open: the open descriptor keeps the data readable */
+    pidlog = open(fn, O_RDONLY);
+    unlink(fn);
+    if (pidlog < 0)
+	return 1;
+
+    /* read() returns -1 on error; stop then instead of handing a negative length to fwrite() */
+    while ((len = read(pidlog, buf, LOG_XFER_BUF_SIZE)) > 0) {
+	fwrite(buf, len, 1, logFile);
+    }
+
+    close(pidlog);
+
+    return 0;
+}
diff --git a/src/vol/salvager.c b/src/vol/salvager.c
new file mode 100644
index 0000000000..4af0daa21c
--- /dev/null
+++ b/src/vol/salvager.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ *      System:		VICE-TWO
+ *      Module:		salvager.c
+ *      Institution:	The Information Technology Center, Carnegie-Mellon University
+ */
+
+
+/* Main program file. Define globals. */
+#define MAIN 1
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <errno.h>
+#ifdef AFS_NT40_ENV
+#include <io.h>
+#include <WINNT/afsevent.h>
+#else
+#include <sys/param.h>
+#include <sys/file.h>
+#ifndef ITIMER_REAL
+#include <sys/time.h>
+#endif /* ITIMER_REAL */
+#endif
+#if	defined(AFS_AIX_ENV) || defined(AFS_SUN4_ENV)
+#define WCOREDUMP(x)	(x & 0200)
+#endif
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include <afs/assert.h>
+#if !defined(AFS_SGI_ENV) && !defined(AFS_NT40_ENV)
+#if defined(AFS_VFSINCL_ENV)
+#include <sys/vnode.h>
+#ifdef	AFS_SUN5_ENV
+#include <sys/fs/ufs_inode.h>
+#else
+#if defined(AFS_DARWIN_ENV) || defined(AFS_XBSD_ENV)
+#include <ufs/ufs/dinode.h>
+#include <ufs/ffs/fs.h>
+#else
+#include <ufs/inode.h>
+#endif
+#endif
+#else /* AFS_VFSINCL_ENV */
+#ifdef	AFS_OSF_ENV
+#include <ufs/inode.h>
+#else /* AFS_OSF_ENV */
+#if !defined(AFS_LINUX20_ENV) && !defined(AFS_XBSD_ENV)
+#include <sys/inode.h>
+#endif
+#endif
+#endif /* AFS_VFSINCL_ENV */
+#endif /* AFS_SGI_ENV */
+#ifdef	AFS_AIX_ENV
+#include <sys/vfs.h>
+#include <sys/lockf.h>
+#else
+#ifdef	AFS_HPUX_ENV
+#include <unistd.h>
+#include <checklist.h>
+#else
+#if defined(AFS_SGI_ENV)
+#include <unistd.h>
+#include <fcntl.h>
+#include <mntent.h>
+#else
+#if	defined(AFS_SUN_ENV) || defined(AFS_SUN5_ENV)
+#ifdef	  AFS_SUN5_ENV
+#include <unistd.h>
+#include <sys/mnttab.h>
+#include <sys/mntent.h>
+#else
+#include <mntent.h>
+#endif
+#else
+#endif /* AFS_SGI_ENV */
+#endif /* AFS_HPUX_ENV */
+#endif
+#endif
+#include <fcntl.h>
+#ifndef AFS_NT40_ENV
+#include <afs/osi_inode.h>
+#endif
+#include <afs/cmd.h>
+#include <afs/afsutil.h>
+#include <afs/fileutil.h>
+#include <afs/procmgmt.h>	/* signal(), kill(), wait(), etc. */
+#ifndef AFS_NT40_ENV
+#include <syslog.h>
+#endif
+
+#include "nfs.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include "daemon_com.h"
+#include "fssync.h"
+#include "salvsync.h"
+#include "viceinode.h"
+#include "salvage.h"
+#include "volinodes.h"		/* header magic number, etc. stuff */
+#include "vol-salvage.h"
+#ifdef AFS_NT40_ENV
+#include <pthread.h>
+#endif
+
+
+static int get_salvage_lock = 0;
+
+
+/* Forward declarations */
+/*@printflike@*/ void Log(const char *format, ...);
+/*@printflike@*/ void Abort(const char *format, ...);
+
+
+/* cmd handler for the standalone salvager: map the parsed switches onto the
+ * salvager's option globals, initialise the volume package, then salvage all
+ * partitions, one partition or one volume.  Returns 0; errors exit the process. */
+static int
+handleit(struct cmd_syndesc *as)
+{
+    register struct cmd_item *ti;
+    char pname[100] = "", *temp;	/* zero-filled so the bounded copy below stays terminated */
+    afs_int32 seenpart = 0, seenvol = 0, vid = 0, seenany = 0;
+    struct DiskPartition *partP;
+
+#ifdef AFS_SGI_VNODE_GLUE
+    if (afs_init_kernel_config(-1) < 0) {
+	printf("Can't determine NUMA configuration, not starting salvager.\n");
+	exit(1);
+    }
+#endif
+
+#ifdef FAST_RESTART
+    {
+	afs_int32 i;
+	for (i = 0; i < CMD_MAXPARMS; i++) {
+	    if (as->parms[i].items) {
+		seenany = 1;
+		break;
+	    }
+	}
+    }
+    if (!seenany) {
+	char *msg =
+	    "Exiting immediately without salvage. Look into the FileLog to find volumes which really need to be salvaged!";
+
+	if (useSyslog)
+	    Log("%s", msg);
+	else
+	    printf("%s\n", msg);
+
+	Exit(0);
+    }
+#endif /* FAST_RESTART */
+    if ((ti = as->parms[0].items)) {	/* -partition */
+	seenpart = 1;
+	strncpy(pname, ti->data, sizeof(pname) - 1);	/* over-long names are truncated, never unterminated */
+    }
+    if ((ti = as->parms[1].items)) {	/* -volumeid */
+	if (!seenpart) {
+	    printf("You must also specify '-partition' option with the '-volumeid' option\n");
+	    exit(-1);
+	}
+	seenvol = 1;
+	vid = atoi(ti->data);
+    }
+    if (as->parms[2].items)	/* -debug */
+	debug = 1;
+    if (as->parms[3].items)	/* -nowrite */
+	Testing = 1;
+    if (as->parms[4].items)	/* -inodes */
+	ListInodeOption = 1;
+    if (as->parms[5].items)	/* -force */
+	ForceSalvage = 1;
+    if (as->parms[6].items)	/* -oktozap */
+	OKToZap = 1;
+    if (as->parms[7].items)	/* -rootinodes */
+	ShowRootFiles = 1;
+    if (as->parms[8].items)	/* -RebuildDirs */
+	RebuildDirs = 1;
+    if (as->parms[9].items)	/* -ForceReads */
+	forceR = 1;
+    if ((ti = as->parms[10].items)) {	/* -Parallel # */
+	temp = ti->data;
+	if (strncmp(temp, "all", 3) == 0) {
+	    PartsPerDisk = 1;
+	    temp += 3;
+	}
+	if (strlen(temp) != 0) {
+	    Parallel = atoi(temp);
+	    if (Parallel < 1)
+		Parallel = 1;
+	    if (Parallel > MAXPARALLEL) {
+		printf("Setting parallel salvages to maximum of %d \n",
+		       MAXPARALLEL);
+		Parallel = MAXPARALLEL;
+	    }
+	}
+    }
+    if ((ti = as->parms[11].items)) {	/* -tmpdir */
+	DIR *dirp;
+
+	tmpdir = ti->data;
+	dirp = opendir(tmpdir);
+	if (!dirp) {
+	    printf("Can't open temporary placeholder dir %s; using current partition \n",
+		   tmpdir);
+	    tmpdir = NULL;
+	} else
+	    closedir(dirp);
+    }
+    if ((ti = as->parms[12].items))	/* -showlog */
+	ShowLog = 1;
+    if ((ti = as->parms[13].items)) {	/* -log */
+	Testing = 1;
+	ShowSuid = 1;
+	Showmode = 1;
+    }
+    if ((ti = as->parms[14].items)) {	/* -showmounts */
+	Testing = 1;
+	Showmode = 1;
+	ShowMounts = 1;
+    }
+    if ((ti = as->parms[15].items)) {	/* -orphans */
+	if (Testing)
+	    orphans = ORPH_IGNORE;
+	else if (strcmp(ti->data, "remove") == 0
+		 || strcmp(ti->data, "r") == 0)
+	    orphans = ORPH_REMOVE;
+	else if (strcmp(ti->data, "attach") == 0
+		 || strcmp(ti->data, "a") == 0)
+	    orphans = ORPH_ATTACH;
+    }
+#ifndef AFS_NT40_ENV		/* ignore options on NT */
+    if ((ti = as->parms[16].items)) {	/* -syslog */
+	useSyslog = 1;
+	ShowLog = 0;
+    }
+    if ((ti = as->parms[17].items)) {	/* -syslogfacility */
+	useSyslogFacility = atoi(ti->data);
+    }
+
+    if ((ti = as->parms[18].items)) {	/* -datelogs */
+	TimeStampLogFile(AFSDIR_SERVER_SLVGLOG_FILEPATH);
+    }
+#endif
+
+#ifdef FAST_RESTART
+    if ((ti = as->parms[19].items)) {	/* -DontSalvage */
+	char *msg =
+	    "Exiting immediately without salvage. Look into the FileLog to find volumes which really need to be salvaged!";
+
+	if (useSyslog)
+	    Log("%s", msg);
+	else
+	    printf("%s\n", msg);
+	Exit(0);
+    }
+#elif defined(DEMAND_ATTACH_ENABLE)
+    if (seenvol && !as->parms[19].items) {
+	char * msg =
+	    "The standalone salvager cannot be run concurrently with a Demand Attach Fileserver.  Please use 'salvageserver -client <partition> <volume id>' to manually schedule volume salvages with the salvageserver (new versions of 'bos salvage' automatically do this for you).  Or, if you insist on using the standalone salvager, add the -forceDAFS flag to your salvager command line.";
+
+	if (useSyslog)
+	    Log("%s", msg);
+	else
+	    printf("%s\n", msg);
+	Exit(1);
+    }
+#endif
+
+    if (get_salvage_lock) {
+	ObtainSalvageLock();
+    }
+
+    /* Note:  if seenvol we initialize this as a standard volume utility:  this has the
+     * implication that the file server may be running; negotiations have to be made with
+     * the file server in this case to take the read write volume and associated read-only
+     * volumes off line before salvaging */
+#ifdef AFS_NT40_ENV
+    if (seenvol) {
+	if (afs_winsockInit() < 0) {
+	    ReportErrorEventAlt(AFSEVT_SVR_WINSOCK_INIT_FAILED, 0,
+				AFSDIR_SALVAGER_FILE, 0);
+	    Log("Failed to initailize winsock, exiting.\n");
+	    Exit(1);
+	}
+    }
+#endif
+    VInitVolumePackage(seenvol ? volumeUtility : salvager, 5, 5,
+		       DONT_CONNECT_FS, 0);
+    DInit(10);
+#ifdef AFS_NT40_ENV
+    if (myjob.cj_number != NOT_CHILD) {
+	if (!seenpart) {
+	    seenpart = 1;
+	    (void)strcpy(pname, myjob.cj_part);
+	}
+    }
+#endif
+    if (seenpart == 0) {
+	for (partP = DiskPartitionList; partP; partP = partP->next) {
+	    SalvageFileSysParallel(partP);
+	}
+	SalvageFileSysParallel(0);
+    } else {
+	partP = VGetPartition(pname, 0);
+	if (!partP) {
+	    Log("salvage: Unknown or unmounted partition %s; salvage aborted\n", pname);
+	    Exit(1);
+	}
+	if (!seenvol)
+	    SalvageFileSys(partP, 0);
+	else {
+	    /* Salvage individual volume */
+	    if (vid <= 0) {
+		Log("salvage: invalid volume id specified; salvage aborted\n");
+		Exit(1);
+	    }
+	    SalvageFileSys(partP, vid);
+	}
+    }
+    return (0);
+}
+
+
+#ifndef AFS_NT40_ENV
+#include "AFS_component_version_number.c"
+#endif
+#define MAX_ARGS 128
+#ifdef AFS_NT40_ENV
+char *save_args[MAX_ARGS];
+int n_save_args = 0;
+pthread_t main_thread;
+#endif
+
+/* Standalone salvager entry point: locate the AFS server directories, open the
+ * salvage log and record the command line, then describe the switches to the
+ * cmd package and pass argv to cmd_Dispatch() (handleit() is the handler). */
+int
+main(int argc, char **argv)
+{
+    struct cmd_syndesc *ts;
+    int err = 0;
+    char commandLine[150];
+
+    int i;
+    extern char cml_version_number[];
+
+#ifdef	AFS_AIX32_ENV
+    /*
+     * The following signal action for AIX is necessary so that in case of a
+     * crash (i.e. core is generated) we can include the user's data section
+     * in the core dump. Unfortunately, by default, only a partial core is
+     * generated which, in many cases, isn't too useful.
+     */
+    struct sigaction nsa;
+
+    sigemptyset(&nsa.sa_mask);
+    nsa.sa_handler = SIG_DFL;
+    nsa.sa_flags = SA_FULLDUMP;
+    sigaction(SIGABRT, &nsa, NULL);
+    sigaction(SIGSEGV, &nsa, NULL);
+#endif
+
+    /* Initialize directory paths */
+    if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) {
+#ifdef AFS_NT40_ENV
+	ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0);
+#endif
+	fprintf(stderr, "%s: Unable to obtain AFS server directory.\n", argv[0]);
+	exit(2);
+    }
+#ifdef AFS_NT40_ENV
+    main_thread = pthread_self();
+    if (spawnDatap && spawnDataLen) {
+	/* This is a child per partition salvager. Don't setup log or
+	 * try to lock the salvager lock.
+	 */
+	if (nt_SetupPartitionSalvage(spawnDatap, spawnDataLen) < 0)
+	    exit(3);
+    } else {
+#endif
+	for (commandLine[0] = '\0', i = 0; i < argc; i++) {
+	    if (i > 0)		/* every append is bounded: a long argv is truncated, not overflowed */
+		strncat(commandLine, " ", sizeof(commandLine) - strlen(commandLine) - 1);
+	    strncat(commandLine, argv[i], sizeof(commandLine) - strlen(commandLine) - 1);
+	}
+
+	/* All entries to the log will be appended.  Useful if there are
+	 * multiple salvagers appending to the log.
+	 */
+
+	CheckLogFile(AFSDIR_SERVER_SLVGLOG_FILEPATH);
+#ifndef AFS_NT40_ENV
+#ifdef AFS_LINUX20_ENV
+	fcntl(fileno(logFile), F_SETFL, O_APPEND);	/* Isn't this redundant? */
+#else
+	fcntl(fileno(logFile), F_SETFL, FAPPEND);	/* Isn't this redundant? */
+#endif
+#endif
+	setlinebuf(logFile);
+
+#ifndef AFS_NT40_ENV
+	if (geteuid() != 0) {
+	    printf("Salvager must be run as root.\n");
+	    fflush(stdout);
+	    Exit(0);
+	}
+#endif
+
+	/* bad for normal help flag processing, but can do nada */
+
+	fprintf(logFile, "%s\n", cml_version_number);
+	Log("STARTING AFS SALVAGER %s (%s)\n", SalvageVersion, commandLine);
+
+	/* Get and hold a lock for the duration of the salvage to make sure
+	 * that no other salvage runs at the same time.  The routine
+	 * VInitVolumePackage (called below) makes sure that a file server or
+	 * other volume utilities don't interfere with the salvage.
+	 */
+	get_salvage_lock = 1;
+#ifdef AFS_NT40_ENV
+    }
+#endif
+
+    ts = cmd_CreateSyntax("initcmd", handleit, 0, "initialize the program");
+    cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL,
+		"Name of partition to salvage");
+    cmd_AddParm(ts, "-volumeid", CMD_SINGLE, CMD_OPTIONAL,
+		"Volume Id to salvage");
+    cmd_AddParm(ts, "-debug", CMD_FLAG, CMD_OPTIONAL, "Run in Debugging mode");
+    cmd_AddParm(ts, "-nowrite", CMD_FLAG, CMD_OPTIONAL, "Run readonly/test mode");
+    cmd_AddParm(ts, "-inodes", CMD_FLAG, CMD_OPTIONAL,
+		"Just list affected afs inodes - debugging flag");
+    cmd_AddParm(ts, "-force", CMD_FLAG, CMD_OPTIONAL, "Force full salvaging");
+    cmd_AddParm(ts, "-oktozap", CMD_FLAG, CMD_OPTIONAL,
+		"Give permission to destroy bogus inodes/volumes - debugging flag");
+    cmd_AddParm(ts, "-rootinodes", CMD_FLAG, CMD_OPTIONAL,
+		"Show inodes owned by root - debugging flag");
+    cmd_AddParm(ts, "-salvagedirs", CMD_FLAG, CMD_OPTIONAL,
+		"Force rebuild/salvage of all directories");
+    cmd_AddParm(ts, "-blockreads", CMD_FLAG, CMD_OPTIONAL,
+		"Read smaller blocks to handle IO/bad blocks");
+    cmd_AddParm(ts, "-parallel", CMD_SINGLE, CMD_OPTIONAL,
+		"# of max parallel partition salvaging");
+    cmd_AddParm(ts, "-tmpdir", CMD_SINGLE, CMD_OPTIONAL,
+		"Name of dir to place tmp files ");
+    cmd_AddParm(ts, "-showlog", CMD_FLAG, CMD_OPTIONAL,
+		"Show log file upon completion");
+    cmd_AddParm(ts, "-showsuid", CMD_FLAG, CMD_OPTIONAL,
+		"Report on suid/sgid files");
+    cmd_AddParm(ts, "-showmounts", CMD_FLAG, CMD_OPTIONAL,
+		"Report on mountpoints");
+    cmd_AddParm(ts, "-orphans", CMD_SINGLE, CMD_OPTIONAL,
+		"ignore | remove | attach");
+
+    /* note - syslog isn't avail on NT, but if we make it conditional, have
+     * to deal with screwy offsets for cmd params */
+    cmd_AddParm(ts, "-syslog", CMD_FLAG, CMD_OPTIONAL,
+		"Write salvage log to syslogs");
+    cmd_AddParm(ts, "-syslogfacility", CMD_SINGLE, CMD_OPTIONAL,
+		"Syslog facility number to use");
+    cmd_AddParm(ts, "-datelogs", CMD_FLAG, CMD_OPTIONAL,
+		"Include timestamp in logfile filename");
+#ifdef FAST_RESTART
+    cmd_AddParm(ts, "-DontSalvage", CMD_FLAG, CMD_OPTIONAL,
+		"Don't salvage. This my be set in BosConfig to let the fileserver restart immediately after a crash. Bad volumes will be taken offline");
+#elif defined(DEMAND_ATTACH_ENABLE)
+    cmd_AddParm(ts, "-forceDAFS", CMD_FLAG, CMD_OPTIONAL,
+		"For Demand Attach Fileserver, permit a manual volume salvage outside of the salvageserver");
+#endif /* FAST_RESTART */
+    err = cmd_Dispatch(argc, argv);
+    Exit(err);
+}
+
diff --git a/src/vol/salvsync-client.c b/src/vol/salvsync-client.c
new file mode 100644
index 0000000000..7ed96d6ee0
--- /dev/null
+++ b/src/vol/salvsync-client.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * salvsync-client.c
+ *
+ * OpenAFS demand attach fileserver
+ * Salvage server synchronization with fileserver.
+ */
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <sys/types.h>
+#include <stdio.h>
+#ifdef AFS_NT40_ENV
+#include <winsock2.h>
+#include <time.h>
+#else
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <sys/time.h>
+#endif
+#include <errno.h>
+#include <assert.h>
+#include <signal.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+
+
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include "nfs.h"
+#include <afs/errors.h>
+#include "salvsync.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include <rx/rx_queue.h>
+
+/*@printflike@*/ extern void Log(const char *format, ...);
+
+#ifdef osi_Assert
+#undef osi_Assert
+#endif
+#define osi_Assert(e) (void)(e)
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * SALVSYNC is a feature specific to the demand attach fileserver
+ */
+
+extern int LogLevel;
+extern int VInit;
+extern pthread_mutex_t vol_salvsync_mutex;
+
+static SYNC_client_state salvsync_client_state = { -1, 2041, SALVSYNC_PROTO_VERSION, 5, 120 };
+
+/*
+ * client-side routines
+ */
+
+int
+SALVSYNC_clientInit(void)
+{
+    return SYNC_connect(&salvsync_client_state);	/* connect the file-static client state; SYNC_connect's result is returned as-is */
+}
+
+int
+SALVSYNC_clientFinis(void)
+{
+    SYNC_closeChannel(&salvsync_client_state);	/* close the file-static client channel */
+    return 1;	/* unconditional: no outcome of the close is reported */
+}
+
+int
+SALVSYNC_clientReconnect(void)
+{
+    return SYNC_reconnect(&salvsync_client_state);	/* same client state as clientInit; SYNC_reconnect's result is returned as-is */
+}
+
+/*
+ * Issue one SALVSYNC request and wait for the reply.  SYNC_ask() is called
+ * between VSALVSYNC_LOCK and VSALVSYNC_UNLOCK; its result code is handed back
+ * unchanged, and anything other than SYNC_OK / SYNC_FAILED is logged first.
+ * res->hdr.reason is consulted only for SYNC_DENIED.
+ */
+afs_int32
+SALVSYNC_askSalv(SYNC_command * com, SYNC_response * res)
+{
+    afs_int32 rc;
+
+    VSALVSYNC_LOCK;
+    rc = SYNC_ask(&salvsync_client_state, com, res);
+    VSALVSYNC_UNLOCK;
+
+    if (rc == SYNC_OK || rc == SYNC_FAILED) {
+	/* routine outcomes: nothing to report */
+    } else if (rc == SYNC_COM_ERROR || rc == SYNC_BAD_COMMAND) {
+	Log("SALVSYNC_askSalv: fatal SALVSYNC protocol error; online salvager functionality disabled until next fileserver restart\n");
+    } else if (rc == SYNC_DENIED) {
+	Log("SALVSYNC_askSalv: SALVSYNC request denied for reason=%d\n", res->hdr.reason);
+    } else {
+	Log("SALVSYNC_askSalv: unknown protocol response %d\n", rc);
+    }
+
+    return rc;
+}
+
+/* Build a SALVSYNC request for <volume> on <partName> (NULL: empty name) and
+ * send it via SALVSYNC_askSalv(), returning that call's result code.  The reply
+ * lands in res_in when supplied, otherwise in local scratch storage. */
+afs_int32
+SALVSYNC_SalvageVolume(VolumeId volume, char *partName, int command, int reason,
+		       afs_uint32 prio, SYNC_response * res_in)
+{
+    SYNC_command com;
+    SYNC_response res_l, *res;
+    SALVSYNC_command_hdr scom;
+    SALVSYNC_response_hdr sres;
+
+    memset(&com, 0, sizeof(com));
+    memset(&scom, 0, sizeof(scom));
+
+    if (res_in) {
+	res = res_in;
+    } else {
+	/* caller passed no response: receive the reply into throwaway storage */
+	memset(&res_l, 0, sizeof(res_l));
+	memset(&sres, 0, sizeof(sres));
+	res_l.payload.buf = (void *) &sres;
+	res_l.payload.len = sizeof(sres);
+	res = &res_l;
+    }
+
+    com.payload.buf = (void *) &scom;
+    com.payload.len = sizeof(scom);
+    com.hdr.command = command;
+    com.hdr.reason = reason;
+    com.hdr.command_len = sizeof(com.hdr) + sizeof(scom);
+    scom.volume = volume;
+    scom.prio = prio;
+    if (partName)
+	strlcpy(scom.partName, partName, sizeof(scom.partName));
+    /* else: scom was zeroed above, so scom.partName is already "" */
+
+    return SALVSYNC_askSalv(&com, res);
+}
+
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/vol/salvsync-server.c b/src/vol/salvsync-server.c
new file mode 100644
index 0000000000..d9e083b23f
--- /dev/null
+++ b/src/vol/salvsync-server.c
@@ -0,0 +1,1009 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * salvsync-server.c
+ *
+ * OpenAFS demand attach fileserver
+ * Salvage server synchronization with fileserver.
+ */
+
+/* This controls the size of an fd_set; it must be defined early before
+ * the system headers define that type and the macros that operate on it.
+ * Its value should be as large as the maximum file descriptor limit we
+ * are likely to run into on any platform.  Right now, that is 65536
+ * which is the default hard fd limit on Solaris 9 */
+#ifndef _WIN32
+#define FD_SETSIZE 65536
+#endif
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <sys/types.h>
+#include <stdio.h>
+#ifdef AFS_NT40_ENV
+#include <winsock2.h>
+#include <time.h>
+#else
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <sys/time.h>
+#endif
+#include <errno.h>
+#include <assert.h>
+#include <signal.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+
+
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include "nfs.h"
+#include <afs/errors.h>
+#include "salvsync.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include <rx/rx_queue.h>
+
+#if !defined(offsetof)
+#include <stddef.h>
+#endif
+
+/*@printflike@*/ extern void Log(const char *format, ...);
+
+#ifdef osi_Assert
+#undef osi_Assert
+#endif
+#define osi_Assert(e) (void)(e)
+
+#define MAXHANDLERS	4	/* Up to 4 clients; must be at least 2, so that
+				 * move = dump+restore can run on single server */
+
+#define MAX_BIND_TRIES	5	/* Number of times to retry socket bind */
+
+
+
+/* Forward declarations */
+static void * SALVSYNC_syncThread(void *);
+static void SALVSYNC_newconnection(int fd);
+static void SALVSYNC_com(int fd);
+static void SALVSYNC_Drop(int fd);
+static void AcceptOn(void);
+static void AcceptOff(void);
+static void InitHandler(void);
+static void CallHandler(fd_set * fdsetp);
+static int AddHandler(int afd, void (*aproc) (int));
+static int FindHandler(register int afd);
+static int FindHandler_r(register int afd);
+static int RemoveHandler(register int afd);
+static void GetHandler(fd_set * fdsetp, int *maxfdp);
+
+
+/*
+ * This lock controls access to the handler array.
+ */
+struct Lock SALVSYNC_handler_lock;
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * SALVSYNC is a feature specific to the demand attach fileserver
+ */
+
+static int AddToSalvageQueue(struct SalvageQueueNode * node);
+static void DeleteFromSalvageQueue(struct SalvageQueueNode * node);
+static void AddToPendingQueue(struct SalvageQueueNode * node);
+static void DeleteFromPendingQueue(struct SalvageQueueNode * node);
+static struct SalvageQueueNode * LookupPendingCommand(SALVSYNC_command_hdr * qry);
+static struct SalvageQueueNode * LookupPendingCommandByPid(int pid);
+static void RaiseCommandPrio(struct SalvageQueueNode * node, SALVSYNC_command_hdr * com);
+
+static struct SalvageQueueNode * LookupNode(VolumeId vid, char * partName);
+static struct SalvageQueueNode * LookupNodeByCommand(SALVSYNC_command_hdr * qry);
+static void AddNodeToHash(struct SalvageQueueNode * node);
+static void DeleteNodeFromHash(struct SalvageQueueNode * node);
+
+static afs_int32 SALVSYNC_com_Salvage(SALVSYNC_command * com, SALVSYNC_response * res);
+static afs_int32 SALVSYNC_com_Cancel(SALVSYNC_command * com, SALVSYNC_response * res);
+static afs_int32 SALVSYNC_com_RaisePrio(SALVSYNC_command * com, SALVSYNC_response * res);
+static afs_int32 SALVSYNC_com_Query(SALVSYNC_command * com, SALVSYNC_response * res);
+static afs_int32 SALVSYNC_com_CancelAll(SALVSYNC_command * com, SALVSYNC_response * res);
+
+
+extern int LogLevel;
+extern int VInit;
+extern pthread_mutex_t vol_salvsync_mutex;
+
+static int AcceptSd = -1;		/* Socket used by server for accepting connections */
+
+
+/* be careful about rearranging elements in this structure.
+ * element placement has been optimized for locality of reference
+ * in SALVSYNC_getWork() */
+struct SalvageQueue {
+    volatile int total_len;		/* reported to clients as sq_len by SALVSYNC_com */
+    volatile afs_int32 last_insert;    /* id of last partition to have a salvage node insert */
+    volatile int len[VOLMAXPARTS+1];	/* one counter per part[] slot; zeroed with it in SALVSYNC_salvInit */
+    volatile struct rx_queue part[VOLMAXPARTS+1];	/* queue heads; SALVSYNC_com_CancelAll indexes them by DiskPartition index */
+    pthread_cond_t cv;
+};
+static struct SalvageQueue salvageQueue;  /* volumes waiting to be salvaged */
+
+struct QueueHead {
+    volatile struct rx_queue q;	/* list head; first member, and QueueHead pointers are handed directly to the queue_* macros */
+    volatile int len;	/* element count kept by hand (see AddNodeToHash / DeleteNodeFromHash) */
+    pthread_cond_t queue_change_cv;
+};
+static struct QueueHead pendingQueue;  /* volumes being salvaged */
+
+/* XXX
+ * whether a partition has a salvage in progress
+ *
+ * the salvager code only permits one salvage per partition at a time
+ *
+ * the following hack tries to keep salvaged parallelism high by
+ * only permitting one salvage dispatch per partition at a time
+ *
+ * unfortunately, the parallel salvager currently
+ * has a rather braindead routine that won't permit
+ * multiple salvages on the same "device".  this
+ * function happens to break pretty badly on lvm, raid luns, etc.
+ *
+ * this hack isn't good enough to stop the device limiting code from
+ * crippling performance.  someday that code needs to be rewritten
+ */
+static int partition_salvaging[VOLMAXPARTS+1];
+
+#define VSHASH_SIZE 64
+#define VSHASH_MASK (VSHASH_SIZE-1)
+#define VSHASH(vid) ((vid)&VSHASH_MASK)
+
+static struct QueueHead  SalvageHashTable[VSHASH_SIZE];
+
+/* Find the node for (vid, partName) in the salvage hash table; NULL if there is none. */
+static struct SalvageQueueNode *
+LookupNode(afs_uint32 vid, char * partName)
+{
+    struct QueueHead *bucket = &SalvageHashTable[VSHASH(vid)];
+    struct rx_queue *link, *next;
+    struct SalvageQueueNode *cand;
+
+    /* each link on the chain is the hash_chain member embedded in a node */
+    for (queue_Scan(bucket, link, next, rx_queue)) {
+	cand = (struct SalvageQueueNode *)((char *)link - offsetof(struct SalvageQueueNode, hash_chain));
+	if (cand->command.sop.volume != vid)
+	    continue;
+	if (strncmp(cand->command.sop.partName, partName, sizeof(cand->command.sop.partName)) == 0)
+	    return cand;
+    }
+
+    /* scanned the whole bucket without a match */
+    return NULL;
+}
+
+static struct SalvageQueueNode *
+LookupNodeByCommand(SALVSYNC_command_hdr * qry)
+{
+    return LookupNode(qry->volume, qry->partName);	/* keyed by the command's volume + partition; NULL when no node matches */
+}
+
+static void
+AddNodeToHash(struct SalvageQueueNode * node)
+{
+    struct QueueHead *bucket;
+
+    /* a node whose hash_chain is already on a queue is left untouched */
+    if (queue_IsNotOnQueue(&node->hash_chain)) {
+	bucket = &SalvageHashTable[VSHASH(node->command.sop.volume)];
+	queue_Append(bucket, &node->hash_chain);
+	bucket->len++;
+    }
+}
+
+static void
+DeleteNodeFromHash(struct SalvageQueueNode * node)
+{
+    struct QueueHead *bucket;
+
+    /* a node whose hash_chain is on no queue has nothing to undo */
+    if (queue_IsOnQueue(&node->hash_chain)) {
+	bucket = &SalvageHashTable[VSHASH(node->command.sop.volume)];
+	queue_Remove(&node->hash_chain);
+	bucket->len--;
+    }
+}
+
+void
+SALVSYNC_salvInit(void)
+{
+    int i;
+    pthread_t tid;
+    pthread_attr_t tattr;
+    /* Setup: empty the salvage and pending queues and the node hash table, then launch SALVSYNC_syncThread. */
+    /* initialize the queues */
+    assert(pthread_cond_init(&salvageQueue.cv, NULL) == 0);
+    for (i = 0; i <= VOLMAXPARTS; i++) {	/* inclusive bound: both arrays hold VOLMAXPARTS+1 slots */
+	queue_Init(&salvageQueue.part[i]);
+	salvageQueue.len[i] = 0;
+    }
+    assert(pthread_cond_init(&pendingQueue.queue_change_cv, NULL) == 0);
+    queue_Init(&pendingQueue);
+    salvageQueue.total_len = pendingQueue.len = 0;
+    salvageQueue.last_insert = -1;
+    memset(partition_salvaging, 0, sizeof(partition_salvaging));
+
+    for (i = 0; i < VSHASH_SIZE; i++) {	/* every hash bucket starts out empty */
+	assert(pthread_cond_init(&SalvageHashTable[i].queue_change_cv, NULL) == 0);
+	SalvageHashTable[i].len = 0;
+	queue_Init(&SalvageHashTable[i]);
+    }
+
+    /* start the salvsync thread */
+    assert(pthread_attr_init(&tattr) == 0);
+    assert(pthread_attr_setdetachstate(&tattr, PTHREAD_CREATE_DETACHED) == 0);
+    assert(pthread_create(&tid, &tattr, SALVSYNC_syncThread, NULL) == 0);	/* created detached; tid is not used afterwards */
+}
+
+static int
+getport(struct sockaddr_in *addr)
+{
+    int sd;
+    /* Create a TCP socket and fill *addr with 127.0.0.1 port 2041 (no bind here); returns the socket. */
+    memset(addr, 0, sizeof(*addr));
+    assert((sd = socket(AF_INET, SOCK_STREAM, 0)) >= 0);	/* NB: the socket() call itself sits inside assert() */
+#ifdef STRUCT_SOCKADDR_HAS_SA_LEN
+    addr->sin_len = sizeof(struct sockaddr_in);
+#endif
+    addr->sin_addr.s_addr = htonl(0x7f000001);	/* 127.0.0.1 */
+    addr->sin_family = AF_INET;	/* was localhost->h_addrtype */
+    addr->sin_port = htons(2041);	/* XXXX htons not _really_ necessary */
+
+    return sd;
+}
+
+static fd_set SALVSYNC_readfds;
+
+static void *
+SALVSYNC_syncThread(void * args)
+{
+    struct sockaddr_in addr;
+    int on = 1;
+    int code;
+    int numTries;
+    int tid;
+    /* Thread body: bind and listen on the socket from getport(), then select() forever, handing ready fds to CallHandler(). */
+#ifndef AFS_NT40_ENV
+    (void)signal(SIGPIPE, SIG_IGN);	/* SIGPIPE is ignored instead of taking its default action */
+#endif
+
+    /* set our 'thread-id' so that the host hold table works */
+    MUTEX_ENTER(&rx_stats_mutex);	/* protects rxi_pthread_hinum */
+    tid = ++rxi_pthread_hinum;
+    MUTEX_EXIT(&rx_stats_mutex);
+    pthread_setspecific(rx_thread_id_key, (void *)tid);
+    Log("Set thread id %d for SALVSYNC_syncThread\n", tid);
+
+    AcceptSd = getport(&addr);
+    /* Reuseaddr needed because system inexplicably leaves crud lying around */
+    code =
+	setsockopt(AcceptSd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
+		   sizeof(on));
+    if (code)
+	Log("SALVSYNC_sync: setsockopt failed with (%d)\n", errno);
+
+    for (numTries = 0; numTries < MAX_BIND_TRIES; numTries++) {	/* retry bind, sleeping 5s after each failure */
+	if ((code =
+	     bind(AcceptSd, (struct sockaddr *)&addr, sizeof(addr))) == 0)
+	    break;
+	Log("SALVSYNC_sync: bind failed with (%d), will sleep and retry\n",
+	    errno);
+	sleep(5);
+    }
+    assert(!code);	/* still unbound after MAX_BIND_TRIES attempts: abort */
+    listen(AcceptSd, 100);	/* backlog of 100; the return value is not checked */
+    InitHandler();
+    AcceptOn();
+
+    for (;;) {	/* service loop; never exits */
+	int maxfd;
+	GetHandler(&SALVSYNC_readfds, &maxfd);
+	/* Note: check for >= 1 below is essential since IOMGR_select
+	 * doesn't have exactly same semantics as select.
+	 */
+	if (select(maxfd + 1, &SALVSYNC_readfds, NULL, NULL, NULL) >= 1)
+	    CallHandler(&SALVSYNC_readfds);
+    }
+
+    return NULL;	/* not reached */
+}
+
+static void
+SALVSYNC_newconnection(int afd)	/* accept one connection on afd and register SALVSYNC_com as the new fd's handler */
+{
+    struct sockaddr_in other;
+    int junk, fd;
+    junk = sizeof(other);	/* address-length argument for accept(); not looked at afterwards */
+    fd = accept(afd, (struct sockaddr *)&other, &junk);
+    if (fd == -1) {
+	Log("SALVSYNC_newconnection:  accept failed, errno==%d\n", errno);
+	assert(1 == 2);	/* always-false assertion: an accept failure is treated as fatal */
+    } else if (!AddHandler(fd, SALVSYNC_com)) {
+	AcceptOff();	/* first AddHandler failed; presumably this frees a handler slot -- TODO confirm */
+	assert(AddHandler(fd, SALVSYNC_com));	/* second attempt; note the call itself sits inside assert() */
+    }
+}
+
+/* Service one SALVSYNC client fd: read a command, dispatch it, send the response. */
+static afs_int32 SALV_cnt = 0;
+static void
+SALVSYNC_com(int fd)
+{
+    SYNC_command com;
+    SYNC_response res;
+    SALVSYNC_response_hdr sres_hdr;
+    SALVSYNC_command scom;
+    SALVSYNC_response sres;
+    SYNC_PROTO_BUF_DECL(buf);
+    /* zero what gets sent back: flags are OR-ed into and tested below, and not every path fills every field */
+    memset(&res, 0, sizeof(res));
+    memset(&sres_hdr, 0, sizeof(sres_hdr));
+    com.payload.buf = (void *)buf;
+    com.payload.len = SYNC_PROTO_MAX_LEN;
+    res.payload.buf = (void *) &sres_hdr;
+    res.payload.len = sizeof(sres_hdr);
+    res.hdr.response_len = sizeof(res.hdr) + sizeof(sres_hdr);
+    res.hdr.proto_version = SALVSYNC_PROTO_VERSION;
+    scom.hdr = &com.hdr;
+    scom.sop = (SALVSYNC_command_hdr *) buf;
+    scom.com = &com;
+    sres.hdr = &res.hdr;
+    sres.sop = &sres_hdr;
+    sres.res = &res;
+    SALV_cnt++;
+    if (SYNC_getCom(fd, &com)) {
+	Log("SALVSYNC_com:  read failed; dropping connection (cnt=%d)\n", SALV_cnt);
+	SALVSYNC_Drop(fd);
+	return;
+    }
+
+    if (com.hdr.proto_version != SALVSYNC_PROTO_VERSION) {
+	Log("SALVSYNC_com:  invalid protocol version (%u)\n", com.hdr.proto_version);
+	res.hdr.response = SYNC_COM_ERROR;
+	res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+	goto respond;
+    }
+
+    if (com.recv_len != (sizeof(com.hdr) + sizeof(SALVSYNC_command_hdr))) {
+	Log("SALVSYNC_com:  invalid protocol message length (%u)\n", com.recv_len);
+	res.hdr.response = SYNC_COM_ERROR;
+	res.hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+	res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+	goto respond;
+    }
+
+    VOL_LOCK;
+    switch (com.hdr.command) {
+    case SALVSYNC_NOP:
+	res.hdr.response = SYNC_OK;	/* was previously left unset for this command */
+	break;
+    case SALVSYNC_SALVAGE:
+	res.hdr.response = SALVSYNC_com_Salvage(&scom, &sres);
+	break;
+    case SALVSYNC_CANCEL:
+	/* cancel a salvage */
+	res.hdr.response = SALVSYNC_com_Cancel(&scom, &sres);
+	break;
+    case SALVSYNC_CANCELALL:
+	/* cancel all queued salvages */
+	res.hdr.response = SALVSYNC_com_CancelAll(&scom, &sres);
+	break;
+    case SALVSYNC_RAISEPRIO:
+	/* raise the priority of a salvage */
+	res.hdr.response = SALVSYNC_com_RaisePrio(&scom, &sres);
+	break;
+    case SALVSYNC_QUERY:
+	/* query whether a volume is done salvaging */
+	res.hdr.response = SALVSYNC_com_Query(&scom, &sres);
+	break;
+    case SYNC_COM_CHANNEL_CLOSE:
+	res.hdr.response = SYNC_OK;
+	res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+	break;
+    default:
+	res.hdr.response = SYNC_BAD_COMMAND;
+	break;
+    }
+    sres_hdr.sq_len = salvageQueue.total_len;
+    sres_hdr.pq_len = pendingQueue.len;
+    VOL_UNLOCK;
+
+ respond:
+    SYNC_putRes(fd, &res);
+    if (res.hdr.flags & SYNC_FLAG_CHANNEL_SHUTDOWN) {
+	SALVSYNC_Drop(fd);
+    }
+}
+
+static afs_int32
+SALVSYNC_com_Salvage(SALVSYNC_command * com, SALVSYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    struct SalvageQueueNode * node;
+    /* SALVSYNC_SALVAGE handler: queue a salvage for the volume/partition in com->sop and report the node's state and prio. */
+    if (SYNC_verifyProtocolString(com->sop->partName, sizeof(com->sop->partName))) {
+	code = SYNC_FAILED;
+	res->hdr->reason = SYNC_REASON_MALFORMED_PACKET;
+	goto done;
+    }
+
+    node = LookupNodeByCommand(com->sop);
+
+    /* schedule a salvage for this volume */
+    if (node != NULL) {
+	switch (node->state) {
+	case SALVSYNC_STATE_ERROR:
+	case SALVSYNC_STATE_DONE:	/* presumably an earlier salvage that has ended: reuse its node for this request */
+	    memcpy(&node->command.com, com->hdr, sizeof(SYNC_command_hdr));
+	    memcpy(&node->command.sop, com->sop, sizeof(SALVSYNC_command_hdr));
+	    node->command.sop.prio = 0;	/* replaces the prio just copied in from the request */
+	    if (AddToSalvageQueue(node)) {
+		code = SYNC_DENIED;
+	    }
+	    break;
+	default:	/* any other state: node is left as it is and only reported below */
+	    break;
+	}
+    } else {
+	node = (struct SalvageQueueNode *) malloc(sizeof(struct SalvageQueueNode));
+	if (node == NULL) {
+	    code = SYNC_DENIED;
+	    goto done;
+	}
+	memset(node, 0, sizeof(struct SalvageQueueNode));
+	memcpy(&node->command.com, com->hdr, sizeof(SYNC_command_hdr));
+	memcpy(&node->command.sop, com->sop, sizeof(SALVSYNC_command_hdr));
+	AddNodeToHash(node);
+	if (AddToSalvageQueue(node)) {
+	    /* roll back */
+	    DeleteNodeFromHash(node);
+	    free(node);
+	    node = NULL;
+	    code = SYNC_DENIED;
+	    goto done;
+	}
+    }
+
+    res->hdr->flags |= SALVSYNC_FLAG_VOL_STATS_VALID;	/* also reached when re-queueing an existing node was denied */
+    res->sop->state = node->state;
+    res->sop->prio = node->command.sop.prio;
+
+ done:
+    return code;
+}
+
+/*
+ * Handle a SALVSYNC_CANCEL request.
+ *
+ * Reports the state and priority the node had before cancellation; a node
+ * that is still queued is then taken off the salvage queue.  An unknown
+ * volume yields state SALVSYNC_STATE_UNKNOWN and priority 0.
+ *
+ * Returns SYNC_FAILED for a malformed partition name, SYNC_OK otherwise.
+ */
+static afs_int32
+SALVSYNC_com_Cancel(SALVSYNC_command * com, SALVSYNC_response * res)
+{
+    struct SalvageQueueNode * entry;
+
+    if (SYNC_verifyProtocolString(com->sop->partName, sizeof(com->sop->partName))) {
+	res->hdr->reason = SYNC_REASON_MALFORMED_PACKET;
+	return SYNC_FAILED;
+    }
+
+    entry = LookupNodeByCommand(com->sop);
+    if (entry == NULL) {
+	/* nothing recorded for this volume */
+	res->sop->state = SALVSYNC_STATE_UNKNOWN;
+	res->sop->prio = 0;
+	return SYNC_OK;
+    }
+
+    /* snapshot the pre-cancel status for the caller */
+    res->hdr->flags |= SALVSYNC_FLAG_VOL_STATS_VALID;
+    res->sop->prio = entry->command.sop.prio;
+    res->sop->state = entry->state;
+
+    /* only requests that have not started yet can be withdrawn */
+    if (entry->state == SALVSYNC_STATE_QUEUED) {
+	DeleteFromSalvageQueue(entry);
+    }
+
+    return SYNC_OK;
+}
+
+/*
+ * Handle a SALVSYNC_CANCELALL request: dequeue every node waiting on the
+ * per-partition salvage queues.  Neither argument is examined.
+ * Always returns SYNC_OK.
+ */
+static afs_int32
+SALVSYNC_com_CancelAll(SALVSYNC_command * com, SALVSYNC_response * res)
+{
+    struct DiskPartition * part;
+    struct SalvageQueueNode * cur, * next;
+
+    part = DiskPartitionList;
+    while (part != NULL) {
+	/* queue_Scan keeps a lookahead pointer, so removing cur is safe */
+	for (queue_Scan(&salvageQueue.part[part->index], cur, next, SalvageQueueNode)) {
+	    DeleteFromSalvageQueue(cur);
+	}
+	part = part->next;
+    }
+
+    return SYNC_OK;
+}
+
+/*
+ * Handle a SALVSYNC_RAISEPRIO request.
+ *
+ *  - no node for the volume: schedule a salvage via SALVSYNC_com_Salvage()
+ *  - node queued:            reposition it with RaiseCommandPrio()
+ *  - node in error/done:     schedule a new salvage
+ *  - any other state:        leave it alone
+ *
+ * The response carries the node's resulting state and priority, or
+ * SALVSYNC_STATE_UNKNOWN / priority 0 if there is still no node.
+ *
+ * Returns SYNC_FAILED for a malformed partition name; otherwise SYNC_OK
+ * or whatever SALVSYNC_com_Salvage() returned.
+ */
+static afs_int32
+SALVSYNC_com_RaisePrio(SALVSYNC_command * com, SALVSYNC_response * res)
+{
+    afs_int32 rc = SYNC_OK;
+    struct SalvageQueueNode * entry;
+
+    if (SYNC_verifyProtocolString(com->sop->partName, sizeof(com->sop->partName))) {
+	res->hdr->reason = SYNC_REASON_MALFORMED_PACKET;
+	return SYNC_FAILED;
+    }
+
+    entry = LookupNodeByCommand(com->sop);
+
+    if (entry == NULL) {
+	/* not known yet: schedule it, then pick up the new node (if any) */
+	rc = SALVSYNC_com_Salvage(com, res);
+	entry = LookupNodeByCommand(com->sop);
+    } else if (entry->state == SALVSYNC_STATE_QUEUED) {
+	RaiseCommandPrio(entry, com->sop);
+    } else if (entry->state == SALVSYNC_STATE_ERROR ||
+	       entry->state == SALVSYNC_STATE_DONE) {
+	rc = SALVSYNC_com_Salvage(com, res);
+    }
+
+    if (entry != NULL) {
+	res->hdr->flags |= SALVSYNC_FLAG_VOL_STATS_VALID;
+	res->sop->prio = entry->command.sop.prio;
+	res->sop->state = entry->state;
+    } else {
+	res->sop->prio = 0;
+	res->sop->state = SALVSYNC_STATE_UNKNOWN;
+    }
+
+    return rc;
+}
+
+/*
+ * Handle a SALVSYNC_QUERY request: report the state and priority of the
+ * salvage node for the named volume, without changing anything.
+ *
+ * An unknown volume yields SALVSYNC_STATE_UNKNOWN / priority 0 and leaves
+ * SALVSYNC_FLAG_VOL_STATS_VALID clear.
+ *
+ * Returns SYNC_FAILED for a malformed partition name, SYNC_OK otherwise.
+ */
+static afs_int32
+SALVSYNC_com_Query(SALVSYNC_command * com, SALVSYNC_response * res)
+{
+    struct SalvageQueueNode * entry;
+
+    if (SYNC_verifyProtocolString(com->sop->partName, sizeof(com->sop->partName))) {
+	res->hdr->reason = SYNC_REASON_MALFORMED_PACKET;
+	return SYNC_FAILED;
+    }
+
+    entry = LookupNodeByCommand(com->sop);
+    if (entry != NULL) {
+	res->hdr->flags |= SALVSYNC_FLAG_VOL_STATS_VALID;
+	res->sop->state = entry->state;
+	res->sop->prio = entry->command.sop.prio;
+    } else {
+	res->sop->state = SALVSYNC_STATE_UNKNOWN;
+	res->sop->prio = 0;
+    }
+
+    return SYNC_OK;
+}
+
+/*
+ * Tear down a client connection: unregister fd from the handler table,
+ * close the socket, and make sure the accept handler is installed.
+ */
+static void
+SALVSYNC_Drop(int fd)
+{
+    /* unregister first so the closed descriptor is never select()ed on */
+    RemoveHandler(fd);
+#ifdef AFS_NT40_ENV
+    closesocket(fd);
+#else
+    close(fd);
+#endif
+    /* no-op if the accept handler is already registered */
+    AcceptOn();
+}
+
+static int AcceptHandler = -1;	/* handler id for accept, if turned on */
+
+/*
+ * Register SALVSYNC_newconnection as the handler for the listening socket
+ * AcceptSd, unless it is already registered.  Asserts if the handler table
+ * has no free slot.
+ */
+static void
+AcceptOn(void)
+{
+    if (AcceptHandler != -1)
+	return;			/* already accepting */
+
+    assert(AddHandler(AcceptSd, SALVSYNC_newconnection));
+    AcceptHandler = FindHandler(AcceptSd);
+}
+
+/*
+ * Unregister the handler for the listening socket AcceptSd, if one is
+ * currently registered.
+ */
+static void
+AcceptOff(void)
+{
+    if (AcceptHandler == -1)
+	return;			/* not accepting */
+
+    assert(RemoveHandler(AcceptSd));
+    AcceptHandler = -1;
+}
+
+/* The multiple FD handling code. */
+
+static int HandlerFD[MAXHANDLERS];
+static void (*HandlerProc[MAXHANDLERS]) (int);
+
+/*
+ * Reset the handler table: every slot gets fd -1 (free) and no callback.
+ * Done under the handler write lock.
+ */
+static void
+InitHandler(void)
+{
+    int slot = 0;
+
+    ObtainWriteLock(&SALVSYNC_handler_lock);
+    while (slot < MAXHANDLERS) {
+	HandlerFD[slot] = -1;
+	HandlerProc[slot] = NULL;
+	slot++;
+    }
+    ReleaseWriteLock(&SALVSYNC_handler_lock);
+}
+
+/*
+ * Invoke the registered callback for every handler slot whose descriptor
+ * is set in *fdsetp.
+ *
+ * The table is walked under the handler read lock, but the lock is dropped
+ * around each callback: a callback may itself change the table (for
+ * example SALVSYNC_Drop() calls RemoveHandler(), which takes the write
+ * lock).  Slots may therefore change between iterations.
+ */
+static void
+CallHandler(fd_set * fdsetp)
+{
+    register int i;
+    ObtainReadLock(&SALVSYNC_handler_lock);
+    for (i = 0; i < MAXHANDLERS; i++) {
+	if (HandlerFD[i] >= 0 && FD_ISSET(HandlerFD[i], fdsetp)) {
+	    /* release while the callback runs; it may need the write lock */
+	    ReleaseReadLock(&SALVSYNC_handler_lock);
+	    (*HandlerProc[i]) (HandlerFD[i]);
+	    ObtainReadLock(&SALVSYNC_handler_lock);
+	}
+    }
+    ReleaseReadLock(&SALVSYNC_handler_lock);
+}
+
+/*
+ * Register callback aproc for descriptor afd in the first free slot of the
+ * handler table.
+ *
+ * Returns 1 on success, 0 if all MAXHANDLERS slots are in use.
+ */
+static int
+AddHandler(int afd, void (*aproc) (int))
+{
+    int slot;
+    int added = 0;
+
+    ObtainWriteLock(&SALVSYNC_handler_lock);
+    for (slot = 0; slot < MAXHANDLERS; slot++) {
+	if (HandlerFD[slot] == -1) {
+	    /* free slot: claim it */
+	    HandlerFD[slot] = afd;
+	    HandlerProc[slot] = aproc;
+	    added = 1;
+	    break;
+	}
+    }
+    ReleaseWriteLock(&SALVSYNC_handler_lock);
+
+    return added;
+}
+
+/*
+ * Return the index of the handler slot registered for descriptor afd.
+ * Takes the handler read lock for the search.  Asserts if afd is not in
+ * the table.
+ */
+static int
+FindHandler(register int afd)
+{
+    int slot;
+    int found = -1;
+
+    ObtainReadLock(&SALVSYNC_handler_lock);
+    for (slot = 0; slot < MAXHANDLERS; slot++) {
+	if (HandlerFD[slot] == afd) {
+	    found = slot;
+	    break;
+	}
+    }
+    ReleaseReadLock(&SALVSYNC_handler_lock);
+
+    if (found < 0) {
+	/* a missing descriptor is a programming error */
+	assert(1 == 2);
+    }
+    return found;
+}
+
+/*
+ * Same search as FindHandler(), but takes no lock itself; RemoveHandler()
+ * calls it with the handler write lock held.  Asserts if afd is not in the
+ * table.
+ */
+static int
+FindHandler_r(register int afd)
+{
+    int slot = 0;
+
+    while (slot < MAXHANDLERS) {
+	if (HandlerFD[slot] == afd)
+	    return slot;
+	slot++;
+    }
+
+    assert(1 == 2);
+    return -1;			/* satisfy compiler */
+}
+
+/*
+ * Free the handler slot registered for descriptor afd.  The lookup asserts
+ * if afd is not registered.  Always returns 1.
+ */
+static int
+RemoveHandler(register int afd)
+{
+    int slot;
+
+    ObtainWriteLock(&SALVSYNC_handler_lock);
+    slot = FindHandler_r(afd);
+    HandlerFD[slot] = -1;	/* mark the slot free */
+    ReleaseWriteLock(&SALVSYNC_handler_lock);
+
+    return 1;
+}
+
+/*
+ * Build the descriptor set to wait on.
+ *
+ * fdsetp - cleared, then filled with every registered descriptor
+ * maxfdp - receives the largest registered descriptor, or -1 if none
+ */
+static void
+GetHandler(fd_set * fdsetp, int *maxfdp)
+{
+    int slot;
+    int fd;
+    int highest = -1;
+
+    FD_ZERO(fdsetp);
+    ObtainReadLock(&SALVSYNC_handler_lock);	/* just in case */
+    for (slot = 0; slot < MAXHANDLERS; slot++) {
+	fd = HandlerFD[slot];
+	if (fd == -1)
+	    continue;		/* free slot */
+	FD_SET(fd, fdsetp);
+	if (fd > highest)
+	    highest = fd;
+    }
+    *maxfdp = highest;
+    ReleaseReadLock(&SALVSYNC_handler_lock);	/* just in case */
+}
+
+/*
+ * Append node to the salvage queue of the partition named in its command,
+ * mark it SALVSYNC_STATE_QUEUED and wake any thread waiting for work.
+ *
+ * Returns 0 on success.  Returns 1, leaving the node untouched, if the
+ * partition name does not map to an id in [0, VOLMAXPARTS] or the
+ * partition is not known to VGetPartitionById_r().
+ */
+static int
+AddToSalvageQueue(struct SalvageQueueNode * node)
+{
+    afs_int32 part_id = volutil_GetPartitionID(node->command.sop.partName);
+
+    /* reject bad ids, and don't enqueue requests for unmounted partitions */
+    if (part_id < 0 || part_id > VOLMAXPARTS || !VGetPartitionById_r(part_id, 0)) {
+	return 1;
+    }
+
+    queue_Append(&salvageQueue.part[part_id], node);
+    salvageQueue.len[part_id]++;
+    salvageQueue.total_len++;
+    salvageQueue.last_insert = part_id;
+
+    node->partition_id = part_id;
+    node->state = SALVSYNC_STATE_QUEUED;
+
+    assert(pthread_cond_broadcast(&salvageQueue.cv) == 0);
+    return 0;
+}
+
+/*
+ * Take node off its partition's salvage queue, if it is on a queue:
+ * adjust the per-partition and total counts, set the node's state to
+ * SALVSYNC_STATE_UNKNOWN and broadcast on the salvage queue condition
+ * variable.  Does nothing for a node that is not queued.
+ */
+static void
+DeleteFromSalvageQueue(struct SalvageQueueNode * node)
+{
+    if (!queue_IsOnQueue(node))
+	return;
+
+    queue_Remove(node);
+    salvageQueue.len[node->partition_id]--;
+    salvageQueue.total_len--;
+    node->state = SALVSYNC_STATE_UNKNOWN;
+    assert(pthread_cond_broadcast(&salvageQueue.cv) == 0);
+}
+
+/*
+ * Append node to the pending queue, mark it SALVSYNC_STATE_SALVAGING and
+ * broadcast the queue change to any waiters.
+ */
+static void
+AddToPendingQueue(struct SalvageQueueNode * node)
+{
+    queue_Append(&pendingQueue, node);
+    pendingQueue.len++;
+    node->state = SALVSYNC_STATE_SALVAGING;
+    assert(pthread_cond_broadcast(&pendingQueue.queue_change_cv) == 0);
+}
+
+/*
+ * Take node off the pending queue, if it is on a queue: decrement the
+ * pending count, set the node's state to SALVSYNC_STATE_UNKNOWN and
+ * broadcast the queue change.  Does nothing for a node that is not queued.
+ */
+static void
+DeleteFromPendingQueue(struct SalvageQueueNode * node)
+{
+    if (!queue_IsOnQueue(node))
+	return;
+
+    queue_Remove(node);
+    pendingQueue.len--;
+    node->state = SALVSYNC_STATE_UNKNOWN;
+    assert(pthread_cond_broadcast(&pendingQueue.queue_change_cv) == 0);
+}
+
+/*
+ * Find the pending-queue node whose volume id and partition name match
+ * qry.  Returns the node, or NULL if there is no match.
+ */
+static struct SalvageQueueNode *
+LookupPendingCommand(SALVSYNC_command_hdr * qry)
+{
+    struct SalvageQueueNode * cur, * next;
+
+    for (queue_Scan(&pendingQueue, cur, next, SalvageQueueNode)) {
+	if (cur->command.sop.volume != qry->volume)
+	    continue;
+	if (strncmp(cur->command.sop.partName, qry->partName,
+		    sizeof(qry->partName)) == 0)
+	    return cur;
+    }
+
+    /* walked the whole queue without a match */
+    return NULL;
+}
+
+/*
+ * Find the pending-queue node whose pid field equals pid.
+ * Returns the node, or NULL if there is no match.
+ */
+static struct SalvageQueueNode *
+LookupPendingCommandByPid(int pid)
+{
+    struct SalvageQueueNode * cur, * next;
+
+    for (queue_Scan(&pendingQueue, cur, next, SalvageQueueNode)) {
+	if (cur->pid == pid)
+	    return cur;
+    }
+
+    /* walked the whole queue without a match */
+    return NULL;
+}
+
+
+/*
+ * raise the priority of a previously scheduled salvage
+ *
+ * node - a node that is currently on a salvage queue (asserted)
+ * com  - request carrying the new priority in com->prio
+ *
+ * The node's priority is overwritten with com->prio and the node is then
+ * repositioned within its partition's queue:
+ *  - if the current head has a lower priority, node becomes the new head;
+ *  - otherwise the queue is scanned backwards starting at node for the
+ *    first entry with a strictly higher priority and node is placed
+ *    directly after it (or at the head if no such entry exists).
+ */
+static void
+RaiseCommandPrio(struct SalvageQueueNode * node, SALVSYNC_command_hdr * com)
+{
+    struct SalvageQueueNode *np, *nnp;
+    afs_int32 id;
+
+    assert(queue_IsOnQueue(node));
+
+    node->command.sop.prio = com->prio;
+    id = node->partition_id;
+    if (queue_First(&salvageQueue.part[id], SalvageQueueNode)->command.sop.prio < com->prio) {
+	/* outranks the current head: move straight to the front */
+	queue_Remove(node);
+	queue_Prepend(&salvageQueue.part[id], node);
+    } else {
+	/* walk towards the head looking for the first higher-priority entry */
+	for (queue_ScanBackwardsFrom(&salvageQueue.part[id], node, np, nnp, SalvageQueueNode)) {
+	    if (np->command.sop.prio > com->prio)
+		break;
+	}
+	if (queue_IsEnd(&salvageQueue.part[id], np)) {
+	    /* nothing ahead of node outranks it */
+	    queue_Remove(node);
+	    queue_Prepend(&salvageQueue.part[id], node);
+	} else if (node != np) {
+	    queue_Remove(node);
+	    queue_InsertAfter(np, node);
+	}
+    }
+}
+
+/* this will need to be rearchitected if we ever want more than one thread
+ * to wait for new salvage nodes */
+/*
+ * Block until a salvage request is queued, then pick one, move it from
+ * its partition's salvage queue to the pending queue, and return it.
+ *
+ * Selection order:
+ *  1. if every queued request is on the partition that received the most
+ *     recent insert, take that queue's head;
+ *  2. otherwise, starting at next_part_sched, take the head of the first
+ *     partition that has queued work and no salvage currently running;
+ *  3. otherwise take the head of the first partition with any queued work.
+ *
+ * On return the node's pid is 0, its state is SALVSYNC_STATE_SALVAGING and
+ * partition_salvaging[] for its partition has been incremented.
+ * Takes and releases VOL_LOCK internally.
+ */
+struct SalvageQueueNode * 
+SALVSYNC_getWork(void)
+{
+    int i, ret;
+    struct DiskPartition * dp = NULL, * fdp;
+    static afs_int32 next_part_sched = 0;	/* partition id to start the next scan at */
+    struct SalvageQueueNode *node = NULL, *np;
+
+    VOL_LOCK;
+
+    /*
+     * wait for work to be scheduled
+     * if there are no disk partitions, just sit in this wait loop forever
+     */
+    while (!salvageQueue.total_len || !DiskPartitionList) {
+      assert(pthread_cond_wait(&salvageQueue.cv, &vol_glock_mutex) == 0);
+    }
+
+
+    /* 
+     * short circuit for simple case where only one partition has
+     * scheduled salvages
+     */
+    if (salvageQueue.last_insert >= 0 && salvageQueue.last_insert <= VOLMAXPARTS &&
+	(salvageQueue.total_len == salvageQueue.len[salvageQueue.last_insert])) {
+	node = queue_First(&salvageQueue.part[salvageQueue.last_insert], SalvageQueueNode);
+	goto have_node;
+    }
+
+
+    /* 
+     * ok, more than one partition has scheduled salvages.
+     * now search for partitions with scheduled salvages, but no pending salvages. 
+     */
+    dp = VGetPartitionById_r(next_part_sched, 0);
+    if (!dp) {
+	dp = DiskPartitionList;
+    }
+    fdp = dp;
+
+    /* one full lap around the partition list, wrapping at the tail;
+     * the !i test lets the first iteration run while dp == fdp */
+    for (i=0 ; 
+	 !i || dp != fdp ; 
+	 dp = (dp->next) ? dp->next : DiskPartitionList, i++ ) {
+	if (!partition_salvaging[dp->index] && salvageQueue.len[dp->index]) {
+	    node = queue_First(&salvageQueue.part[dp->index], SalvageQueueNode);
+	    goto have_node;
+	}
+    }
+
+
+    /*
+     * all partitions with scheduled salvages have at least one pending.
+     * now do an exhaustive search for a scheduled salvage.
+     */
+    dp = fdp;
+
+    for (i=0 ; 
+	 !i || dp != fdp ; 
+	 dp = (dp->next) ? dp->next : DiskPartitionList, i++ ) {
+	if (salvageQueue.len[dp->index]) {
+	    node = queue_First(&salvageQueue.part[dp->index], SalvageQueueNode);
+	    goto have_node;
+	}
+    }
+
+    /* we should never reach this line */
+    assert(1==2);
+
+ have_node:
+    assert(node != NULL);
+    node->pid = 0;
+    partition_salvaging[node->partition_id]++;
+    /* move the node from the scheduled queue to the pending queue */
+    DeleteFromSalvageQueue(node);
+    AddToPendingQueue(node);
+
+    /* dp is still NULL when the short-circuit path was taken, in which
+     * case next_part_sched is left unchanged */
+    if (dp) {
+	/* update next_part_sched field */
+	if (dp->next) {
+	    next_part_sched = dp->next->index;
+	} else if (DiskPartitionList) {
+	    next_part_sched = DiskPartitionList->index;
+	} else {
+	    next_part_sched = -1;
+	}
+    }
+
+ bail:
+    VOL_UNLOCK;
+    return node;
+}
+
+/*
+ * Record completion of a salvage: take node off the pending queue,
+ * decrement the running count for its partition (when the partition id
+ * is in range) and set the final state.
+ *
+ * result - 0 marks the node SALVSYNC_STATE_DONE; any other value marks it
+ *          SALVSYNC_STATE_ERROR.
+ *
+ * The callers in this file hold VOL_LOCK.
+ */
+static void
+SALVSYNC_doneWork_r(struct SalvageQueueNode * node, int result)
+{
+    afs_int32 part_id;
+
+    DeleteFromPendingQueue(node);
+
+    part_id = node->partition_id;
+    if (!(part_id < 0 || part_id > VOLMAXPARTS)) {
+	partition_salvaging[part_id]--;
+    }
+
+    node->state = (result == 0) ? SALVSYNC_STATE_DONE : SALVSYNC_STATE_ERROR;
+}
+
+/*
+ * Locking wrapper around SALVSYNC_doneWork_r(): report that the salvage
+ * described by node has finished with the given result (0 = success).
+ */
+void 
+SALVSYNC_doneWork(struct SalvageQueueNode * node, int result)
+{
+    VOL_LOCK;
+    SALVSYNC_doneWork_r(node, result);
+    VOL_UNLOCK;
+}
+
+/*
+ * Report completion of the pending salvage whose node carries the given
+ * pid.  Silently does nothing if no pending node has that pid.
+ *
+ * result - 0 for success, anything else for failure
+ */
+void
+SALVSYNC_doneWorkByPid(int pid, int result)
+{
+    struct SalvageQueueNode * finished;
+
+    VOL_LOCK;
+    finished = LookupPendingCommandByPid(pid);
+    if (finished) {
+	SALVSYNC_doneWork_r(finished, result);
+    }
+    VOL_UNLOCK;
+}
+
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/vol/salvsync.h b/src/vol/salvsync.h
new file mode 100644
index 0000000000..6611df6589
--- /dev/null
+++ b/src/vol/salvsync.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * demand attach fs
+ * salvage server interface
+ */
+#ifndef _AFS_VOL_SALVSYNC_H
+#define _AFS_VOL_SALVSYNC_H
+
+#ifdef AFS_DEMAND_ATTACH_FS
+#include "daemon_com.h"
+
+
+#define SALVSYNC_PROTO_VERSION        1
+
+
+/* SALVSYNC command codes */
+#define SALVSYNC_NOP            SYNC_COM_CODE_DECL(0)   /* just return stats */
+#define SALVSYNC_SALVAGE	SYNC_COM_CODE_DECL(1)	/* schedule a salvage */
+#define SALVSYNC_CANCEL         SYNC_COM_CODE_DECL(2)   /* cancel a queued salvage */
+#define SALVSYNC_RAISEPRIO      SYNC_COM_CODE_DECL(3)   /* raise the priority of a queued salvage,
+							 * moving it toward the head of the work queue */
+#define SALVSYNC_QUERY          SYNC_COM_CODE_DECL(4)   /* query the status of a salvage */
+#define SALVSYNC_CANCELALL      SYNC_COM_CODE_DECL(5)   /* cancel all queued (not yet started) salvages */
+
+/* SALVSYNC reason codes */
+#define SALVSYNC_WHATEVER	SYNC_REASON_CODE_DECL(0)  /* XXXX */
+#define SALVSYNC_ERROR		SYNC_REASON_CODE_DECL(1)  /* volume is in error state */
+#define SALVSYNC_OPERATOR	SYNC_REASON_CODE_DECL(2)  /* operator forced salvage */
+#define SALVSYNC_SHUTDOWN       SYNC_REASON_CODE_DECL(3)  /* cancel due to shutdown */
+#define SALVSYNC_NEEDED         SYNC_REASON_CODE_DECL(4)  /* needsSalvaged flag set */
+
+/* SALVSYNC response codes */
+
+/* SALVSYNC flags */
+#define SALVSYNC_FLAG_VOL_STATS_VALID SYNC_FLAG_CODE_DECL(0) /* volume stats in response are valid */
+
+/* SALVSYNC command state fields */
+#define SALVSYNC_STATE_UNKNOWN        0         /* unknown state */
+#define SALVSYNC_STATE_QUEUED         1         /* salvage request on queue */
+#define SALVSYNC_STATE_SALVAGING      2         /* salvage is happening now */
+#define SALVSYNC_STATE_ERROR          3         /* salvage ended in an error */
+#define SALVSYNC_STATE_DONE           4         /* last salvage ended successfully */
+
+
+/* SALVSYNC-specific part of a request: identifies the volume to act on */
+typedef struct SALVSYNC_command_hdr {
+    afs_uint32 prio;		/* requested salvage priority */
+    afs_uint32 volume;		/* volume id */
+    char partName[16];		/* partition name, e.g. /vicepa */
+} SALVSYNC_command_hdr;
+
+/* SALVSYNC-specific part of a response */
+typedef struct SALVSYNC_response_hdr {
+    afs_int32 state;		/* one of the SALVSYNC_STATE_* values */
+    afs_int32 prio;		/* priority recorded for the volume's salvage */
+    afs_int32 sq_len;		/* total number of queued salvages */
+    afs_int32 pq_len;		/* number of salvages on the pending queue */
+} SALVSYNC_response_hdr;
+
+/* decoded view of a request: pointers to its generic and SALVSYNC parts */
+typedef struct SALVSYNC_command {
+    SYNC_command_hdr * hdr;
+    SALVSYNC_command_hdr * sop;
+    SYNC_command * com;
+} SALVSYNC_command;
+
+/* decoded view of a response: pointers to its generic and SALVSYNC parts */
+typedef struct SALVSYNC_response {
+    SYNC_response_hdr * hdr;
+    SALVSYNC_response_hdr * sop;
+    SYNC_response * res;
+} SALVSYNC_response;
+
+/* by-value copy of a request's headers, kept inside a queue node */
+typedef struct SALVSYNC_command_info {
+    SYNC_command_hdr com;
+    SALVSYNC_command_hdr sop;
+} SALVSYNC_command_info;
+
+/* one salvage request tracked by the salvage server */
+struct SalvageQueueNode {
+    struct rx_queue q;		/* linkage on a salvage queue or the pending queue */
+    struct rx_queue hash_chain;	/* presumably linkage on the node hash table -- TODO confirm */
+    afs_uint32 state;		/* one of the SALVSYNC_STATE_* values */
+    struct SALVSYNC_command_info command;	/* copy of the request that scheduled it */
+    afs_int32 partition_id;	/* partition id, set when the node is queued */
+    int pid;			/* cleared by SALVSYNC_getWork(); key for
+				 * SALVSYNC_doneWorkByPid() */
+};
+
+
+/* Prototypes from salvsync.c */
+
+/* online salvager client interfaces */
+extern int SALVSYNC_clientFinis(void);
+extern int SALVSYNC_clientInit(void);
+extern int SALVSYNC_clientReconnect(void);
+extern afs_int32 SALVSYNC_askSalv(SYNC_command * com, SYNC_response * res);
+extern afs_int32 SALVSYNC_SalvageVolume(VolumeId volume, char *partName, int com, int reason,
+					afs_uint32 prio, SYNC_response * res);
+
+/* salvage server interfaces */
+extern void SALVSYNC_salvInit(void);
+extern struct SalvageQueueNode * SALVSYNC_getWork(void);
+extern void SALVSYNC_doneWork(struct SalvageQueueNode *, int result);
+extern void SALVSYNC_doneWorkByPid(int pid, int result);
+
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+#endif /* _AFS_VOL_SALVSYNC_H */
diff --git a/src/vol/test/listVicepx.c b/src/vol/test/listVicepx.c
index 7cb53d7d42..7e9307ee1a 100644
--- a/src/vol/test/listVicepx.c
+++ b/src/vol/test/listVicepx.c
@@ -102,6 +102,7 @@ RCSID
 #include "afs/assert.h"
 #include "filesignal.h"
 #include "vutils.h"
+#include "daemon_com.h"
 #include "fssync.h"
 #include <afs/auxinode.h>
 #include <afs/dir.h>
diff --git a/src/vol/test/updateDirInode.c b/src/vol/test/updateDirInode.c
index 1ebbcda15c..ff2d6b27d0 100644
--- a/src/vol/test/updateDirInode.c
+++ b/src/vol/test/updateDirInode.c
@@ -102,6 +102,7 @@ RCSID
 #include "afs/assert.h"
 #include "filesignal.h"
 #include "vutils.h"
+#include "daemon_com.h"
 #include "fssync.h"
 #include <afs/auxinode.h>
 #include <afs/dir.h>
diff --git a/src/vol/vnode.c b/src/vol/vnode.c
index c9a6c0c58c..75e90bd6ac 100644
--- a/src/vol/vnode.c
+++ b/src/vol/vnode.c
@@ -5,6 +5,8 @@
  * This software has been released under the terms of the IBM Public
  * License.  For details, see the LICENSE file in the top-level source
  * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
  */
 
 /*
@@ -46,6 +48,7 @@ RCSID
 #include "vnode.h"
 #include "volume.h"
 #include "partition.h"
+#include "salvsync.h"
 #if defined(AFS_SGI_ENV)
 #include "sys/types.h"
 #include "fcntl.h"
@@ -73,8 +76,8 @@ RCSID
 struct VnodeClassInfo VnodeClassInfo[nVNODECLASSES];
 
 private int moveHash(register Vnode * vnp, bit32 newHash);
-void StickOnLruChain_r(register Vnode * vnp,
-		       register struct VnodeClassInfo *vcp);
+private void StickOnLruChain_r(register Vnode * vnp,
+			       register struct VnodeClassInfo *vcp);
 
 #define BAD_IGET	-1000
 
@@ -162,6 +165,83 @@ private Vnode *VnodeHashTable[VNODE_HASH_TABLE_SIZE];
 #define VNODE_HASH(volumeptr,vnodenumber)\
     ((volumeptr->vnodeHashOffset + vnodenumber)&(VNODE_HASH_TABLE_SIZE-1))
 
+/*
+ * new support to secondarily hash vnodes by volume id
+ */
+#define VNVOLUME_HASH(volumeId) (volumeId&(VolumeHashTable.Mask))
+
+#include "rx/rx_queue.h"
+/* head of one chain in the by-volume vnode hash table */
+typedef struct VnodeHashByVolumeChainHead {
+    struct rx_queue queue;	/* list of vnodes hashed to this chain */
+    int len;			/* number of vnodes on the chain */
+    /* someday we could put a per-chain lock here... */
+#ifdef AFS_DEMAND_ATTACH_FS
+    int busy;			/* nonzero while a thread scans the chain
+				 * with the volume lock dropped */
+    pthread_cond_t chain_busy_cv;	/* broadcast when busy is cleared */
+#endif /* AFS_DEMAND_ATTACH_FS */
+} VnodeHashByVolumeChainHead;
+private VnodeHashByVolumeChainHead *VnodeHashByVolumeTable = NULL;
+
+/*
+ * Allocate the by-volume vnode hash table (one chain head per volume hash
+ * bucket) and initialise every chain to an empty queue.  Asserts on
+ * allocation failure.
+ */
+void
+VInitVnHashByVolume(void)
+{
+    int bucket;
+    VnodeHashByVolumeChainHead * head;
+
+    /* calloc leaves len (and busy, where present) at zero */
+    VnodeHashByVolumeTable = (VnodeHashByVolumeChainHead *)
+	calloc(VolumeHashTable.Size, sizeof(*VnodeHashByVolumeTable));
+    assert(VnodeHashByVolumeTable != NULL);
+
+    for (bucket = 0; bucket < VolumeHashTable.Size; bucket++) {
+	head = &VnodeHashByVolumeTable[bucket];
+	queue_Init(head);
+#ifdef AFS_DEMAND_ATTACH_FS
+	assert(pthread_cond_init(&head->chain_busy_cv, NULL) == 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
+    }
+}
+
+/*
+ * Put vnp on the by-volume hash chain selected by its volume's hashid.
+ * Does nothing if vnp is already on a queue.  vnp->volumePtr must be set.
+ */
+static void
+AddToVnHashByVolumeTable(register Vnode * vnp)
+{
+    VnodeHashByVolumeChainHead * head;
+
+    if (queue_IsOnQueue(vnp))
+	return;
+
+    head = &VnodeHashByVolumeTable[VNVOLUME_HASH(vnp->volumePtr->hashid)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* NOTE(review): the wait releases vol_glock_mutex, and the on-queue
+     * test above is not repeated afterwards -- confirm that is safe */
+    while (head->busy) {
+	/* if the hash table is busy, wait */
+	assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0);
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    head->len++;
+    queue_Append(head, vnp);
+}
+
+/*
+ * Take vnp off the by-volume hash chain selected by its volume's hashid.
+ * Does nothing if vnp is not on a queue.
+ *
+ * for demand-attach, caller MUST hold a ref count on vp
+ */
+static void
+DeleteFromVnHashByVolumeTable(register Vnode * vnp)
+{
+    VnodeHashByVolumeChainHead * head;
+
+    if (!queue_IsOnQueue(vnp))
+	return;
+
+    head = &VnodeHashByVolumeTable[VNVOLUME_HASH(vnp->volumePtr->hashid)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* NOTE(review): the wait releases vol_glock_mutex, and the on-queue
+     * test above is not repeated afterwards -- confirm that is safe */
+    while (head->busy) {
+	/* if the hash table is busy, wait */
+	assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0);
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    head->len--;
+    queue_Remove(vnp);
+}
+
 /* Code to invalidate a vnode entry.  Called when we've damaged a vnode, and want
     to prevent future VGetVnode's from applying to it.  Leaves it in the same hash bucket
     but that shouldn't be important.  */
@@ -305,7 +385,7 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
 	unique = vp->nextVnodeUnique++;
 
     if (vp->nextVnodeUnique > V_uniquifier(vp)) {
-	VUpdateVolume_r(ec, vp);
+	VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT);
 	if (*ec)
 	    return NULL;
     }
@@ -317,7 +397,8 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
     }
 
     /* Find a slot in the bit map */
-    bitNumber = VAllocBitmapEntry_r(ec, vp, &vp->vnodeIndex[class]);
+    bitNumber = VAllocBitmapEntry_r(ec, vp, &vp->vnodeIndex[class],
+				    VOL_ALLOC_BITMAP_WAIT);
     if (*ec)
 	return NULL;
     vnodeNumber = bitNumberToVnodeNumber(bitNumber, class);
@@ -376,7 +457,6 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
 	vnp->volumePtr = vp;
 	vnp->cacheCheck = vp->cacheCheck;
 	vnp->nUsers = 1;
-	moveHash(vnp, newHash);
 	/* This will never block */
 	ObtainWriteLock(&vnp->lock);
 #ifdef AFS_PTHREAD_ENV
@@ -391,18 +471,33 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
 	    FdHandle_t *fdP;
 	    off_t off = vnodeIndexOffset(vcp, vnodeNumber);
 
+	    /* XXX we have a potential race here if two threads
+	     * allocate new vnodes at the same time, and they
+	     * both decide it's time to extend the index
+	     * file size... */
+
 	    VOL_UNLOCK;
 	    fdP = IH_OPEN(ihP);
-	    if (fdP == NULL)
-		Abort("VAllocVnode: can't open index file!\n");
-	    if ((size = FDH_SIZE(fdP)) < 0)
-		Abort("VAllocVnode: can't stat index file!\n");
-	    if (FDH_SEEK(fdP, off, SEEK_SET) < 0)
-		Abort("VAllocVnode: can't seek on index file!\n");
-	    if (off < size) {
-		if (FDH_READ(fdP, &vnp->disk, vcp->diskSize) == vcp->diskSize) {
-		    if (vnp->disk.type != vNull)
-			Abort("VAllocVnode:  addled bitmap or index!\n");
+	    if (fdP == NULL) {
+		Log("VAllocVnode: can't open index file!\n");
+		goto error_encountered;
+	    }
+	    if ((size = FDH_SIZE(fdP)) < 0) {
+		Log("VAllocVnode: can't stat index file!\n");
+		goto error_encountered;
+	    }
+	    if (FDH_SEEK(fdP, off, SEEK_SET) < 0) {
+		Log("VAllocVnode: can't seek on index file!\n");
+		goto error_encountered;
+	    }
+	    if (off + vcp->diskSize <= size) {
+		if (FDH_READ(fdP, &vnp->disk, vcp->diskSize) != vcp->diskSize) {
+		    Log("VAllocVnode: can't read index file!\n");
+		    goto error_encountered;
+		}
+		if (vnp->disk.type != vNull) {
+		    Log("VAllocVnode:  addled bitmap or index!\n");
+		    goto error_encountered;
 		}
 	    } else {
 		/* growing file - grow in a reasonable increment */
@@ -414,9 +509,28 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
 		free(buf);
 	    }
 	    FDH_CLOSE(fdP);
+	    fdP = NULL;
 	    VOL_LOCK;
+	    goto sane;
+
+	error_encountered:
+#ifdef AFS_DEMAND_ATTACH_FS
+	    VOL_LOCK;
+	    VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+	    if (fdP)
+		FDH_CLOSE(fdP);
+	    VInvalidateVnode_r(vnp);
+	    StickOnLruChain_r(vnp, vcp);
+	    return NULL;
+#else
+	    assert(1 == 2);
+#endif
+
 	}
+    sane:
 	VNLog(4, 2, vnodeNumber, (afs_int32) vnp);
+	AddToVnHashByVolumeTable(vnp);
+	moveHash(vnp, newHash);
     }
 
     VNLog(5, 1, (afs_int32) vnp);
@@ -510,6 +624,8 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
 	vcp->reads++;
 	vnp = VGetFreeVnode_r(vcp);
 	/* Remove it from the old hash chain */
+	if (vnp->volumePtr)
+	    DeleteFromVnHashByVolumeTable(vnp);
 	moveHash(vnp, newHash);
 	/* Remove it from the LRU chain */
 	if (vnp == vcp->lruHead)
@@ -525,6 +641,7 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
 	vnp->volumePtr = vp;
 	vnp->cacheCheck = vp->cacheCheck;
 	vnp->nUsers = 1;
+	AddToVnHashByVolumeTable(vnp);
 
 	/* This will never block */
 	ObtainWriteLock(&vnp->lock);
@@ -540,11 +657,21 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
 	if (fdP == NULL) {
 	    Log("VGetVnode: can't open index dev=%u, i=%s\n", vp->device,
 		PrintInode(NULL, vp->vnodeIndex[class].handle->ih_ino));
+#ifdef AFS_DEMAND_ATTACH_FS
+	    VOL_LOCK;
+	    VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+	    VOL_UNLOCK;
+#endif
 	    *ec = VIO;
 	    mlkReason = 9;
 	} else if (FDH_SEEK(fdP, vnodeIndexOffset(vcp, vnodeNumber), SEEK_SET)
 		   < 0) {
 	    Log("VGetVnode: can't seek on index file vn=%u\n", vnodeNumber);
+#ifdef AFS_DEMAND_ATTACH_FS
+	    VOL_LOCK;
+	    VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+	    VOL_UNLOCK;
+#endif
 	    *ec = VIO;
 	    mlkReason = 10;
 	    FDH_REALLYCLOSE(fdP);
@@ -564,8 +691,18 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
 	     * is not allocated */
 	    if (n == -1 && errno == EIO) {
 		Log("VGetVnode: Couldn't read vnode %u, volume %u (%s); volume needs salvage\n", vnodeNumber, V_id(vp), V_name(vp));
-		VForceOffline_r(vp);
+#ifdef AFS_DEMAND_ATTACH_FS
+		if (programType == fileServer) {
+		    VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+		    *ec = VSALVAGING;
+		} else {
+		    VForceOffline_r(vp, 0);
+		    *ec = VSALVAGE;
+		}
+#else
+		VForceOffline_r(vp, 0);
 		*ec = VSALVAGE;
+#endif
 		mlkReason = 4;
 	    } else {
 		mlkReason = 5;
@@ -603,9 +740,19 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
 		    *ec = VNOVNODE;
 		} else {
 		    Log("VGetVnode: Bad magic number, vnode %u, volume %u (%s); volume needs salvage\n", vnodeNumber, V_id(vp), V_name(vp));
+#ifdef AFS_DEMAND_ATTACH_FS
+		    if (programType == fileServer) {
+			VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+			*ec = VSALVAGING;
+		    } else {
+			vp->goingOffline = 1;
+			*ec = VSALVAGE;
+		    }
+#else
 		    vp->goingOffline = 1;	/* used to call VOffline, but that would mess
 						 * up the volume ref count if called here */
 		    *ec = VSALVAGE;
+#endif
 		    mlkReason = 7;
 		}
 		VInvalidateVnode_r(vnp);
@@ -728,20 +875,27 @@ VPutVnode_r(Error * ec, register Vnode * vnp)
 
 	    /* The vnode has been changed. Write it out to disk */
 	    if (!V_inUse(vp)) {
+#ifdef AFS_DEMAND_ATTACH_FS
+		VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+		*ec = VSALVAGING;
+#else
 		assert(V_needsSalvaged(vp));
 		*ec = VSALVAGE;
+#endif
 	    } else {
 		IHandle_t *ihP = vp->vnodeIndex[class].handle;
 		FdHandle_t *fdP;
 		VOL_UNLOCK;
 		fdP = IH_OPEN(ihP);
-		if (fdP == NULL)
-		    Abort("VPutVnode: can't open index file!\n");
+		if (fdP == NULL) {
+		    Log("VPutVnode: can't open index file!\n");
+		    goto error_encountered;
+		}
 		offset = vnodeIndexOffset(vcp, vnp->vnodeNumber);
 		if (FDH_SEEK(fdP, offset, SEEK_SET) < 0) {
-		    Abort
-			("VPutVnode: can't seek on index file! fdp=0x%x offset=%d, errno=%d\n",
-			 fdP, offset, errno);
+		    Log("VPutVnode: can't seek on index file! fdp=0x%x offset=%d, errno=%d\n",
+			fdP, offset, errno);
+		    goto error_encountered;
 		}
 		code = FDH_WRITE(fdP, &vnp->disk, vcp->diskSize);
 		if (code != vcp->diskSize) {
@@ -756,8 +910,13 @@ VPutVnode_r(Error * ec, register Vnode * vnp)
 			*ec = VIO;
 		    } else {
 			Log("VPutVnode: Couldn't write vnode %u, volume %u (%s) (error %d)\n", vnp->vnodeNumber, V_id(vnp->volumePtr), V_name(vnp->volumePtr), code);
-			VForceOffline_r(vp);
+#ifdef AFS_DEMAND_ATTACH_FS
+			VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+			*ec = VSALVAGING;
+#else
+			VForceOffline_r(vp, 0);
 			*ec = VSALVAGE;
+#endif
 		    }
 		    VOL_UNLOCK;
 		    FDH_REALLYCLOSE(fdP);
@@ -765,6 +924,23 @@ VPutVnode_r(Error * ec, register Vnode * vnp)
 		    FDH_CLOSE(fdP);
 		}
 		VOL_LOCK;
+		goto sane;
+
+	    error_encountered:
+#ifdef AFS_DEMAND_ATTACH_FS
+		/* XXX instead of dumping core, let's try to request a salvage
+		 * and just fail the putvnode */
+		if (fdP)
+		    FDH_CLOSE(fdP);
+		VOL_LOCK;
+		VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+		*ec = VSALVAGING;
+		goto done;
+#else
+		assert(1 == 2);
+#endif
+
+	    sane:
 		/* If the vnode is to be deleted, and we wrote the vnode out,
 		 * free its bitmap entry. Do after the vnode is written so we
 		 * don't allocate from bitmap before the vnode is written
@@ -787,6 +963,7 @@ VPutVnode_r(Error * ec, register Vnode * vnp)
 		 vnp);
     }
 
+ done:
     /* Do not look at disk portion of vnode after this point; it may
      * have been deleted above */
     if (vnp->nUsers-- == 1)
@@ -865,19 +1042,28 @@ VVnodeWriteToRead_r(Error * ec, register Vnode * vnp)
 
 	/* The inode has been changed.  Write it out to disk */
 	if (!V_inUse(vp)) {
+#ifdef AFS_DEMAND_ATTACH_FS
+	    VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+	    *ec = VSALVAGING;
+#else
 	    assert(V_needsSalvaged(vp));
 	    *ec = VSALVAGE;
+#endif
 	} else {
 	    IHandle_t *ihP = vp->vnodeIndex[class].handle;
 	    FdHandle_t *fdP;
 	    off_t off = vnodeIndexOffset(vcp, vnp->vnodeNumber);
 	    VOL_UNLOCK;
 	    fdP = IH_OPEN(ihP);
-	    if (fdP == NULL)
-		Abort("VPutVnode: can't open index file!\n");
+	    if (fdP == NULL) {
+		Log("VPutVnode: can't open index file!\n");
+		goto error_encountered;
+	    }
 	    code = FDH_SEEK(fdP, off, SEEK_SET);
-	    if (code < 0)
-		Abort("VPutVnode: can't seek on index file!\n");
+	    if (code < 0) {
+		Log("VPutVnode: can't seek on index file!\n");
+		goto error_encountered;
+	    }
 	    code = FDH_WRITE(fdP, &vnp->disk, vcp->diskSize);
 	    if (code != vcp->diskSize) {
 		/*
@@ -892,14 +1078,33 @@ VVnodeWriteToRead_r(Error * ec, register Vnode * vnp)
 		    *ec = VIO;
 		} else {
 		    Log("VPutVnode: Couldn't write vnode %u, volume %u (%s)\n", vnp->vnodeNumber, V_id(vnp->volumePtr), V_name(vnp->volumePtr));
-		    VForceOffline_r(vp);
+#ifdef AFS_DEMAND_ATTACH_FS
+		    VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+		    *ec = VSALVAGING;
+#else
+		    VForceOffline_r(vp, 0);
 		    *ec = VSALVAGE;
+#endif
 		}
 		VOL_UNLOCK;
 	    }
 	    FDH_CLOSE(fdP);
 	    VOL_LOCK;
+	    goto sane;
+
+	error_encountered:
+#ifdef AFS_DEMAND_ATTACH_FS
+	    if (fdP)
+		FDH_CLOSE(fdP);
+	    VOL_LOCK;
+	    VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+	    *ec = VSALVAGING;
+#else
+	    assert(1 == 2);
+#endif
+
 	}
+    sane:
 	vcp->writes++;
 	vnp->changed_newTime = vnp->changed_oldTime = 0;
     }
@@ -931,7 +1136,7 @@ moveHash(register Vnode * vnp, bit32 newHash)
     return 0;
 }
 
-void
+private void
 StickOnLruChain_r(register Vnode * vnp, register struct VnodeClassInfo *vcp)
 {
     /* Add it to the circular LRU list */
@@ -950,8 +1155,10 @@ StickOnLruChain_r(register Vnode * vnp, register struct VnodeClassInfo *vcp)
 	vcp->lruHead = vnp->lruNext;
     /* If caching is turned off, set volumeptr to NULL to invalidate the
      * entry */
-    if (!TrustVnodeCacheEntry)
+    if (!TrustVnodeCacheEntry) {
+	DeleteFromVnHashByVolumeTable(vnp);
 	vnp->volumePtr = NULL;
+    }
 }
 
 /* VCloseVnodeFiles - called when a volume is going off line. All open
@@ -962,15 +1169,30 @@ void
 VCloseVnodeFiles_r(Volume * vp)
 {
     int i;
-    Vnode *vnp;
+    Vnode *vnp, *nvnp;		/* queue_Scan cursors */
+    VnodeHashByVolumeChainHead * head;	/* by-volume hash chain selected by vp->hashid */
 
-    for (i = 0; i < VNODE_HASH_TABLE_SIZE; i++) {
-	for (vnp = VnodeHashTable[i]; vnp; vnp = vnp->hashNext) {
-	    if (vnp->volumePtr == vp) {
-		IH_REALLYCLOSE(vnp->handle);
-	    }
+    head = &VnodeHashByVolumeTable[VNVOLUME_HASH(vp->hashid)];
+#ifdef AFS_DEMAND_ATTACH_FS
+    while (head->busy) {	/* chain is marked busy by another scanner; sleep on its cv */
+	assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0);
+    }
+    /* NOTE(review): the wait is an assert() operand; relies on assert() never being compiled out -- confirm */
+    head->busy = 1;		/* claim the chain before VOL_UNLOCK so the scan below runs unlocked */
+    VOL_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    for (queue_Scan(head, vnp, nvnp, Vnode)) {
+	if (vnp->volumePtr == vp) {	/* only touch vnodes that belong to vp */
+	    IH_REALLYCLOSE(vnp->handle);
 	}
     }
+    /* demand attach only: VOL_LOCK again, clear the busy mark and wake every waiter on the chain */
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_LOCK;
+    head->busy = 0;
+    assert(pthread_cond_broadcast(&head->chain_busy_cv) == 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
 }
 
 /* VReleaseVnodeFiles - called when a volume is going detached. All open
@@ -981,13 +1203,29 @@ void
 VReleaseVnodeFiles_r(Volume * vp)
 {
     int i;
-    Vnode *vnp;
+    Vnode *vnp, *nvnp;		/* queue_Scan cursors */
+    VnodeHashByVolumeChainHead * head;	/* by-volume hash chain selected by vp->hashid */
 
-    for (i = 0; i < VNODE_HASH_TABLE_SIZE; i++) {
-	for (vnp = VnodeHashTable[i]; vnp; vnp = vnp->hashNext) {
-	    if (vnp->volumePtr == vp) {
-		IH_RELEASE(vnp->handle);
-	    }
+    head = &VnodeHashByVolumeTable[VNVOLUME_HASH(vp->hashid)];
+    /* demand attach only: take exclusive use of the chain via its busy flag, then scan without the lock */
+#ifdef AFS_DEMAND_ATTACH_FS
+    while (head->busy) {	/* chain is marked busy by another scanner; sleep on its cv */
+	assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0);
+    }
+    /* NOTE(review): the wait is an assert() operand; relies on assert() never being compiled out -- confirm */
+    head->busy = 1;
+    VOL_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    for (queue_Scan(head, vnp, nvnp, Vnode)) {
+	if (vnp->volumePtr == vp) {	/* only touch vnodes that belong to vp */
+	    IH_RELEASE(vnp->handle);
 	}
     }
+    /* demand attach only: VOL_LOCK again, clear the busy mark and wake every waiter on the chain */
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_LOCK;
+    head->busy = 0;
+    assert(pthread_cond_broadcast(&head->chain_busy_cv) == 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
 }
diff --git a/src/vol/vnode.h b/src/vol/vnode.h
index 9446f79320..618cb83635 100644
--- a/src/vol/vnode.h
+++ b/src/vol/vnode.h
@@ -118,6 +118,7 @@ typedef struct VnodeDiskObject {
 #define SIZEOF_LARGEDISKVNODE	256
 
 typedef struct Vnode {
+    struct rx_queue vid_hash;   /* for vnode by volume id hash */
     struct Vnode *hashNext;	/* Next vnode on hash conflict chain */
     struct Vnode *lruNext;	/* Less recently used vnode than this one */
     struct Vnode *lruPrev;	/* More recently used vnode than this one */
@@ -216,3 +217,4 @@ extern Vnode *VAllocVnode(Error * ec, struct Volume *vp, VnodeType type);
 extern Vnode *VAllocVnode_r(Error * ec, struct Volume *vp, VnodeType type);
 /*extern VFreeVnode();*/
 extern Vnode *VGetFreeVnode_r(struct VnodeClassInfo *vcp);
+extern void VInitVnHashByVolume(void);
diff --git a/src/vol/vol-salvage.c b/src/vol/vol-salvage.c
index 04eb2694f9..eaaf6b96e2 100644
--- a/src/vol/vol-salvage.c
+++ b/src/vol/vol-salvage.c
@@ -83,11 +83,6 @@ Vnodes with 0 inode pointers in RW volumes are now deleted.
 */
 
 
-#define SalvageVersion "2.4"
-
-/* Main program file. Define globals. */
-#define MAIN 1
-
 #include <afsconfig.h>
 #include <afs/param.h>
 
@@ -186,10 +181,13 @@ RCSID
 #include "vnode.h"
 #include "volume.h"
 #include "partition.h"
+#include "daemon_com.h"
 #include "fssync.h"
+#include "salvsync.h"
 #include "viceinode.h"
 #include "salvage.h"
 #include "volinodes.h"		/* header magic number, etc. stuff */
+#include "vol-salvage.h"
 #ifdef AFS_NT40_ENV
 #include <pthread.h>
 #endif
@@ -221,10 +219,6 @@ extern void *calloc();
 #endif
 static char *TimeStamp(time_t clock, int precision);
 
-#define ORPH_IGNORE 0
-#define ORPH_REMOVE 1
-#define ORPH_ATTACH 2
-
 
 int debug;			/* -d flag */
 int Testing = 0;		/* -n flag */
@@ -251,7 +245,7 @@ int OKToZap;			/* -o flag */
 int ForceSalvage;		/* If salvage should occur despite the DONT_SALVAGE flag
 				 * in the volume header */
 
-static FILE *logFile = 0;	/* one of {/usr/afs/logs,/vice/file}/SalvageLog */
+FILE *logFile = 0;	/* one of {/usr/afs/logs,/vice/file}/SalvageLog */
 
 #define ROOTINODE	2	/* Root inode of a 4.2 Unix file system
 				 * partition */
@@ -279,201 +273,30 @@ int VolumeChanged;		/* Set by any routine which would change the volume in
 
 VolumeDiskData VolInfo;		/* A copy of the last good or salvaged volume header dealt with */
 
-struct InodeSummary {		/* Inode summary file--an entry for each
-				 * volume in the inode file for a partition */
-    VolId volumeId;		/* Volume id */
-    VolId RWvolumeId;		/* RW volume associated */
-    int index;			/* index into inode file (0, 1, 2 ...) */
-    int nInodes;		/* Number of inodes for this volume */
-    int nSpecialInodes;		/* Number of special inodes, i.e.  volume
-				 * header, index, etc.  These are all
-				 * marked (viceinode.h) and will all be sorted
-				 * to the beginning of the information for
-				 * this volume.  Read-only volumes should
-				 * ONLY have special inodes (all the other
-				 * inodes look as if they belong to the
-				 * original RW volume). */
-    Unique maxUniquifier;	/* The maximum uniquifier found in all the inodes.
-				 * This is only useful for RW volumes and is used
-				 * to compute a new volume uniquifier in the event
-				 * that the header needs to be recreated. The inode
-				 * uniquifier may be a truncated version of vnode
-				 * uniquifier (AFS_3DISPARES). The real maxUniquifer
-				 * is from the vnodes and later calcuated from it */
-    struct VolumeSummary *volSummary;
-    /* Either a pointer to the original volume
-     * header summary, or constructed summary
-     * information */
-} *inodeSummary;
-#define readOnly(isp)	((isp)->volumeId != (isp)->RWvolumeId)
 int nVolumesInInodeFile;	/* Number of read-write volumes summarized */
 int inodeFd;			/* File descriptor for inode file */
 
 
-struct VolumeSummary {		/* Volume summary an entry for each
-				 * volume in a volume directory.
-				 * Assumption: one volume directory per
-				 * partition */
-    char *fileName;		/* File name on the partition for the volume
-				 * header */
-    struct VolumeHeader header;
-    /* volume number, rw volume number, inode
-     * numbers of each major component of
-     * the volume */
-    IHandle_t *volumeInfoHandle;
-    byte wouldNeedCallback;	/* set if the file server should issue
-				 * call backs for all the files in this volume when
-				 * the volume goes back on line */
-};
-
-struct VnodeInfo {
-    IHandle_t *handle;		/* Inode containing this index */
-    int nVnodes;		/* Total number of vnodes in index */
-    int nAllocatedVnodes;	/* Total number actually used */
-    int volumeBlockCount;	/* Total number of blocks used by volume */
-    Inode *inodes;		/* Directory only */
-    struct VnodeEssence {
-	short count;		/* Number of references to vnode; MUST BE SIGNED */
-	unsigned claimed:1;	/* Set when a parent directory containing an entry
-				 * referencing this vnode is found.  The claim
-				 * is that the parent in "parent" can point to
-				 * this vnode, and no other */
-	unsigned changed:1;	/* Set if any parameters (other than the count)
-				 * in the vnode change.   It is determined if the
-				 * link count has changed by noting whether it is
-				 * 0 after scanning all directories */
-	unsigned salvaged:1;	/* Set if this directory vnode has already been salvaged. */
-	unsigned todelete:1;	/* Set if this vnode is to be deleted (should not be claimed) */
-	afs_fsize_t blockCount;
-	/* Number of blocks (1K) used by this vnode,
-	 * approximately */
-	VnodeId parent;		/* parent in vnode */
-	Unique unique;		/* Must match entry! */
-	char *name;		/* Name of directory entry */
-	int modeBits;		/* File mode bits */
-	Inode InodeNumber;	/* file's inode */
-	int type;		/* File type */
-	int author;		/* File author */
-	int owner;		/* File owner */
-	int group;		/* File group */
-    } *vnodes;
-} vnodeInfo[nVNODECLASSES];
-
-struct DirSummary {
-    struct DirHandle dirHandle;
-    VnodeId vnodeNumber;
-    Unique unique;
-    unsigned haveDot, haveDotDot;
-    VolumeId rwVid;
-    int copied;			/* If the copy-on-write stuff has been applied */
-    VnodeId parent;
-    char *name;
-    char *vname;
-    IHandle_t *ds_linkH;
-};
+struct VnodeInfo vnodeInfo[nVNODECLASSES];
 
 
 struct VolumeSummary *volumeSummaryp;	/* Holds all the volumes in a part */
 int nVolumes;			/* Number of volumes (read-write and read-only)
 				 * in volume summary */
 
-#ifdef AFS_NT40_ENV
-/* For NT, we can fork the per partition salvagers to gain the required
- * safety against Aborts. But there's too many complex data structures at
- * the per volume salvager layer to easilty copy the data across.
- * childJobNumber is resset from -1 to the job number if this is a
- * per partition child of the main salvager. This information is passed
- * out-of-band in the extra data area setup for the now unused parent/child
- * data transfer.
- */
-#define SALVAGER_MAGIC 0x00BBaaDD
-#define NOT_CHILD -1		/* job numbers start at 0 */
-/* If new options need to be passed to child, add them here. */
-typedef struct {
-    int cj_magic;
-    int cj_number;
-    char cj_part[32];
-} childJob_t;
+char *tmpdir = 0;		/* directory for salvager temporary files; 0 until one is supplied */
 
+
+#ifdef AFS_NT40_ENV
 /* Child job this process is running. */
 childJob_t myjob = { SALVAGER_MAGIC, NOT_CHILD, "" };
-
-int nt_SalvagePartition(char *partName, int jobn);
-int nt_SetupPartitionSalvage(void *datap, int len);
-
-typedef struct {
-    struct InodeSummary *svgp_inodeSummaryp;
-    int svgp_count;
-} SVGParms_t;
-#define canfork 0
-#else
-#define canfork 1
-#endif
+#endif /* AFS_NT40_ENV */
 
 
 
 /* Forward declarations */
 /*@printflike@*/ void Log(const char *format, ...);
 /*@printflike@*/ void Abort(const char *format, ...);
-void Exit(int code);
-int Fork(void);
-int Wait(char *prog);
-char *ToString(char *s);
-void AskOffline(VolumeId volumeId);
-void AskOnline(VolumeId volumeId, char *partition);
-void CheckLogFile(void);
-#ifndef AFS_NT40_ENV
-void TimeStampLogFile(void);
-#endif
-void ClearROInUseBit(struct VolumeSummary *summary);
-void CopyAndSalvage(register struct DirSummary *dir);
-int CopyInode(Device device, Inode inode1, Inode inode2, int rwvolume);
-void CopyOnWrite(register struct DirSummary *dir);
-void CountVolumeInodes(register struct ViceInodeInfo *ip, int maxInodes,
-		       register struct InodeSummary *summary);
-void DeleteExtraVolumeHeaderFile(register struct VolumeSummary *vsp);
-void DistilVnodeEssence(VolumeId vid, VnodeClass class, Inode ino,
-			Unique * maxu);
-int GetInodeSummary(char *path, VolumeId singleVolumeNumber);
-void GetVolumeSummary(VolumeId singleVolumeNumber);
-void JudgeEntry(struct DirSummary *dir, char *name, VnodeId vnodeNumber,
-		Unique unique);
-void MaybeZapVolume(register struct InodeSummary *isp, char *message,
-		    int deleteMe, int check);
-void ObtainSalvageLock(void);
-void PrintInodeList(void);
-void PrintInodeSummary(void);
-void PrintVolumeSummary(void);
-int QuickCheck(register struct InodeSummary *isp, int nVols);
-void RemoveTheForce(char *path);
-void SalvageDir(char *name, VolumeId rwVid, struct VnodeInfo *dirVnodeInfo,
-		IHandle_t * alinkH, int i, struct DirSummary *rootdir,
-		int *rootdirfound);
-void SalvageFileSysParallel(struct DiskPartition *partP);
-void SalvageFileSys(struct DiskPartition *partP, VolumeId singleVolumeNumber);
-void SalvageFileSys1(struct DiskPartition *partP,
-		     VolumeId singleVolumeNumber);
-int SalvageHeader(register struct stuff *sp, struct InodeSummary *isp,
-		  int check, int *deleteMe);
-int SalvageIndex(Inode ino, VnodeClass class, int RW,
-		 register struct ViceInodeInfo *ip, int nInodes,
-		 struct VolumeSummary *volSummary, int check);
-int SalvageVnodes(register struct InodeSummary *rwIsp,
-		  register struct InodeSummary *thisIsp,
-		  register struct ViceInodeInfo *inodes, int check);
-int SalvageVolume(register struct InodeSummary *rwIsp, IHandle_t * alinkH);
-void DoSalvageVolumeGroup(register struct InodeSummary *isp, int nVols);
-#ifdef AFS_NT40_ENV
-void SalvageVolumeGroup(register struct InodeSummary *isp, int nVols);
-#else
-#define SalvageVolumeGroup DoSalvageVolumeGroup
-#endif
-int SalvageVolumeHeaderFile(register struct InodeSummary *isp,
-			    register struct ViceInodeInfo *inodes, int RW,
-			    int check, int *deleteMe);
-void showlog(void);
-int UseTheForceLuke(char *path);
-
 static int IsVnodeOrphaned(VnodeId vnode);
 
 /* Uniquifier stored in the Inode */
@@ -500,207 +323,6 @@ BadError(register int aerror)
 }
 
 
-char *tmpdir = 0;
-static int
-handleit(struct cmd_syndesc *as)
-{
-    register struct cmd_item *ti;
-    char pname[100], *temp;
-    afs_int32 seenpart = 0, seenvol = 0, vid = 0, seenany = 0;
-    struct DiskPartition *partP;
-
-#ifdef AFS_SGI_VNODE_GLUE
-    if (afs_init_kernel_config(-1) < 0) {
-	printf
-	    ("Can't determine NUMA configuration, not starting salvager.\n");
-	exit(1);
-    }
-#endif
-
-#ifdef FAST_RESTART
-    {
-	afs_int32 i;
-	for (i = 0; i < CMD_MAXPARMS; i++) {
-	    if (as->parms[i].items) {
-		seenany = 1;
-		break;
-	    }
-	}
-    }
-    if (!seenany) {
-	char *msg =
-	    "Exiting immediately without salvage. Look into the FileLog to find volumes which really need to be salvaged!";
-
-	if (useSyslog)
-	    Log(msg);
-	else
-	    printf("%s\n", msg);
-
-	Exit(0);
-    }
-#endif /* FAST_RESTART */
-    if ((ti = as->parms[0].items)) {	/* -partition */
-	seenpart = 1;
-	strncpy(pname, ti->data, 100);
-    }
-    if ((ti = as->parms[1].items)) {	/* -volumeid */
-	if (!seenpart) {
-	    printf
-		("You must also specify '-partition' option with the '-volumeid' option\n");
-	    exit(-1);
-	}
-	seenvol = 1;
-	vid = atoi(ti->data);
-    }
-    if (as->parms[2].items)	/* -debug */
-	debug = 1;
-    if (as->parms[3].items)	/* -nowrite */
-	Testing = 1;
-    if (as->parms[4].items)	/* -inodes */
-	ListInodeOption = 1;
-    if (as->parms[5].items)	/* -force */
-	ForceSalvage = 1;
-    if (as->parms[6].items)	/* -oktozap */
-	OKToZap = 1;
-    if (as->parms[7].items)	/* -rootinodes */
-	ShowRootFiles = 1;
-    if (as->parms[8].items)	/* -RebuildDirs */
-	RebuildDirs = 1;
-    if (as->parms[9].items)	/* -ForceReads */
-	forceR = 1;
-    if ((ti = as->parms[10].items)) {	/* -Parallel # */
-	temp = ti->data;
-	if (strncmp(temp, "all", 3) == 0) {
-	    PartsPerDisk = 1;
-	    temp += 3;
-	}
-	if (strlen(temp) != 0) {
-	    Parallel = atoi(temp);
-	    if (Parallel < 1)
-		Parallel = 1;
-	    if (Parallel > MAXPARALLEL) {
-		printf("Setting parallel salvages to maximum of %d \n",
-		       MAXPARALLEL);
-		Parallel = MAXPARALLEL;
-	    }
-	}
-    }
-    if ((ti = as->parms[11].items)) {	/* -tmpdir */
-	DIR *dirp;
-
-	tmpdir = ti->data;
-	dirp = opendir(tmpdir);
-	if (!dirp) {
-	    printf
-		("Can't open temporary placeholder dir %s; using current partition \n",
-		 tmpdir);
-	    tmpdir = NULL;
-	} else
-	    closedir(dirp);
-    }
-    if ((ti = as->parms[12].items))	/* -showlog */
-	ShowLog = 1;
-    if ((ti = as->parms[13].items)) {	/* -log */
-	Testing = 1;
-	ShowSuid = 1;
-	Showmode = 1;
-    }
-    if ((ti = as->parms[14].items)) {	/* -showmounts */
-	Testing = 1;
-	Showmode = 1;
-	ShowMounts = 1;
-    }
-    if ((ti = as->parms[15].items)) {	/* -orphans */
-	if (Testing)
-	    orphans = ORPH_IGNORE;
-	else if (strcmp(ti->data, "remove") == 0
-		 || strcmp(ti->data, "r") == 0)
-	    orphans = ORPH_REMOVE;
-	else if (strcmp(ti->data, "attach") == 0
-		 || strcmp(ti->data, "a") == 0)
-	    orphans = ORPH_ATTACH;
-    }
-#ifndef AFS_NT40_ENV		/* ignore options on NT */
-    if ((ti = as->parms[16].items)) {	/* -syslog */
-	useSyslog = 1;
-	ShowLog = 0;
-    }
-    if ((ti = as->parms[17].items)) {	/* -syslogfacility */
-	useSyslogFacility = atoi(ti->data);
-    }
-
-    if ((ti = as->parms[18].items)) {	/* -datelogs */
-	TimeStampLogFile();
-    }
-#endif
-
-#ifdef FAST_RESTART
-    if (ti = as->parms[19].items) {	/* -DontSalvage */
-	char *msg =
-	    "Exiting immediately without salvage. Look into the FileLog to find volumes which really need to be salvaged!";
-
-	if (useSyslog)
-	    Log(msg);
-	else
-	    printf("%s\n", msg);
-	Exit(0);
-    }
-#endif /* FAST_RESTART */
-
-    /* Note:  if seemvol we initialize this as a standard volume utility:  this has the
-     * implication that the file server may be running; negotations have to be made with
-     * the file server in this case to take the read write volume and associated read-only
-     * volumes off line before salvaging */
-#ifdef AFS_NT40_ENV
-    if (seenvol) {
-	if (afs_winsockInit() < 0) {
-	    ReportErrorEventAlt(AFSEVT_SVR_WINSOCK_INIT_FAILED, 0,
-				AFSDIR_SALVAGER_FILE, 0);
-	    Log("Failed to initailize winsock, exiting.\n");
-	    Exit(1);
-	}
-    }
-#endif
-    VInitVolumePackage(seenvol ? volumeUtility : salvager, 5, 5,
-		       DONT_CONNECT_FS, 0);
-    DInit(10);
-#ifdef AFS_NT40_ENV
-    if (myjob.cj_number != NOT_CHILD) {
-	if (!seenpart) {
-	    seenpart = 1;
-	    (void)strcpy(pname, myjob.cj_part);
-	}
-    }
-#endif
-    if (seenpart == 0) {
-	for (partP = DiskPartitionList; partP; partP = partP->next) {
-	    SalvageFileSysParallel(partP);
-	}
-	SalvageFileSysParallel(0);
-    } else {
-	partP = VGetPartition(pname, 0);
-	if (!partP) {
-	    Log("salvage: Unknown or unmounted partition %s; salvage aborted\n", pname);
-	    Exit(1);
-	}
-	if (!seenvol)
-	    SalvageFileSys(partP, 0);
-	else {
-	    /* Salvage individual volume */
-	    if (vid <= 0) {
-		Log("salvage: invalid volume id specified; salvage aborted\n");
-		Exit(1);
-	    }
-	    SalvageFileSys(partP, vid);
-	}
-    }
-    return (0);
-}
-
-
-#ifndef AFS_NT40_ENV
-#include "AFS_component_version_number.c"
-#endif
 #define MAX_ARGS 128
 #ifdef AFS_NT40_ENV
 char *save_args[MAX_ARGS];
@@ -708,143 +330,6 @@ int n_save_args = 0;
 pthread_t main_thread;
 #endif
 
-int
-main(int argc, char **argv)
-{
-    struct cmd_syndesc *ts;
-    int err = 0;
-    char commandLine[150];
-
-    int i;
-    extern char cml_version_number[];
-
-#ifdef	AFS_AIX32_ENV
-    /*
-     * The following signal action for AIX is necessary so that in case of a 
-     * crash (i.e. core is generated) we can include the user's data section 
-     * in the core dump. Unfortunately, by default, only a partial core is
-     * generated which, in many cases, isn't too useful.
-     */
-    struct sigaction nsa;
-
-    sigemptyset(&nsa.sa_mask);
-    nsa.sa_handler = SIG_DFL;
-    nsa.sa_flags = SA_FULLDUMP;
-    sigaction(SIGABRT, &nsa, NULL);
-    sigaction(SIGSEGV, &nsa, NULL);
-#endif
-
-    /* Initialize directory paths */
-    if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) {
-#ifdef AFS_NT40_ENV
-	ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0);
-#endif
-	fprintf(stderr, "%s: Unable to obtain AFS server directory.\n",
-		argv[0]);
-	exit(2);
-    }
-#ifdef AFS_NT40_ENV
-    main_thread = pthread_self();
-    if (spawnDatap && spawnDataLen) {
-	/* This is a child per partition salvager. Don't setup log or
-	 * try to lock the salvager lock.
-	 */
-	if (nt_SetupPartitionSalvage(spawnDatap, spawnDataLen) < 0)
-	    exit(3);
-    } else {
-#endif
-	for (commandLine[0] = '\0', i = 0; i < argc; i++) {
-	    if (i > 0)
-		strcat(commandLine, " ");
-	    strcat(commandLine, argv[i]);
-	}
-
-	/* All entries to the log will be appended.  Useful if there are
-	 * multiple salvagers appending to the log.
-	 */
-
-	CheckLogFile();
-#ifndef AFS_NT40_ENV
-#ifdef AFS_LINUX20_ENV
-	fcntl(fileno(logFile), F_SETFL, O_APPEND);	/* Isn't this redundant? */
-#else
-	fcntl(fileno(logFile), F_SETFL, FAPPEND);	/* Isn't this redundant? */
-#endif
-#endif
-	setlinebuf(logFile);
-
-#ifndef AFS_NT40_ENV
-	if (geteuid() != 0) {
-	    printf("Salvager must be run as root.\n");
-	    fflush(stdout);
-	    Exit(0);
-	}
-#endif
-
-	/* bad for normal help flag processing, but can do nada */
-
-	fprintf(logFile, "%s\n", cml_version_number);
-	Log("STARTING AFS SALVAGER %s (%s)\n", SalvageVersion, commandLine);
-
-	/* Get and hold a lock for the duration of the salvage to make sure
-	 * that no other salvage runs at the same time.  The routine
-	 * VInitVolumePackage (called below) makes sure that a file server or
-	 * other volume utilities don't interfere with the salvage.
-	 */
-	ObtainSalvageLock();
-#ifdef AFS_NT40_ENV
-    }
-#endif
-
-    ts = cmd_CreateSyntax("initcmd", handleit, 0, "initialize the program");
-    cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL,
-		"Name of partition to salvage");
-    cmd_AddParm(ts, "-volumeid", CMD_SINGLE, CMD_OPTIONAL,
-		"Volume Id to salvage");
-    cmd_AddParm(ts, "-debug", CMD_FLAG, CMD_OPTIONAL,
-		"Run in Debugging mode");
-    cmd_AddParm(ts, "-nowrite", CMD_FLAG, CMD_OPTIONAL,
-		"Run readonly/test mode");
-    cmd_AddParm(ts, "-inodes", CMD_FLAG, CMD_OPTIONAL,
-		"Just list affected afs inodes - debugging flag");
-    cmd_AddParm(ts, "-force", CMD_FLAG, CMD_OPTIONAL, "Force full salvaging");
-    cmd_AddParm(ts, "-oktozap", CMD_FLAG, CMD_OPTIONAL,
-		"Give permission to destroy bogus inodes/volumes - debugging flag");
-    cmd_AddParm(ts, "-rootinodes", CMD_FLAG, CMD_OPTIONAL,
-		"Show inodes owned by root - debugging flag");
-    cmd_AddParm(ts, "-salvagedirs", CMD_FLAG, CMD_OPTIONAL,
-		"Force rebuild/salvage of all directories");
-    cmd_AddParm(ts, "-blockreads", CMD_FLAG, CMD_OPTIONAL,
-		"Read smaller blocks to handle IO/bad blocks");
-    cmd_AddParm(ts, "-parallel", CMD_SINGLE, CMD_OPTIONAL,
-		"# of max parallel partition salvaging");
-    cmd_AddParm(ts, "-tmpdir", CMD_SINGLE, CMD_OPTIONAL,
-		"Name of dir to place tmp files ");
-    cmd_AddParm(ts, "-showlog", CMD_FLAG, CMD_OPTIONAL,
-		"Show log file upon completion");
-    cmd_AddParm(ts, "-showsuid", CMD_FLAG, CMD_OPTIONAL,
-		"Report on suid/sgid files");
-    cmd_AddParm(ts, "-showmounts", CMD_FLAG, CMD_OPTIONAL,
-		"Report on mountpoints");
-    cmd_AddParm(ts, "-orphans", CMD_SINGLE, CMD_OPTIONAL,
-		"ignore | remove | attach");
-
-    /* note - syslog isn't avail on NT, but if we make it conditional, have
-     * to deal with screwy offsets for cmd params */
-    cmd_AddParm(ts, "-syslog", CMD_FLAG, CMD_OPTIONAL,
-		"Write salvage log to syslogs");
-    cmd_AddParm(ts, "-syslogfacility", CMD_SINGLE, CMD_OPTIONAL,
-		"Syslog facility number to use");
-    cmd_AddParm(ts, "-datelogs", CMD_FLAG, CMD_OPTIONAL,
-		"Include timestamp in logfile filename");
-
-#ifdef FAST_RESTART
-    cmd_AddParm(ts, "-DontSalvage", CMD_FLAG, CMD_OPTIONAL,
-		"Don't salvage. This my be set in BosConfig to let the fileserver restart immediately after a crash. Bad volumes will be taken offline");
-#endif /* FAST_RESTART */
-    err = cmd_Dispatch(argc, argv);
-    Exit(err);
-}
 
 /* Get the salvage lock if not already held. Hold until process exits. */
 void
@@ -1249,7 +734,8 @@ SalvageFileSys1(struct DiskPartition *partP, VolumeId singleVolumeNumber)
 	ForceSalvage = UseTheForceLuke(fileSysPath);
 
     if (singleVolumeNumber) {
-	if (!VConnectFS()) {
+	/* the salvageserver has already set up the fssync connection for us */
+	if ((programType != salvageServer) && !VConnectFS()) {
 	    Abort("Couldn't connect to file server\n");
 	}
 	AskOffline(singleVolumeNumber);
@@ -2554,7 +2040,7 @@ SalvageIndex(Inode ino, VnodeClass class, int RW,
 		     * if no such match, take the first determined by our sort
 		     * order */
 		    register struct ViceInodeInfo *lip = ip;
-		    register lnInodes = nInodes;
+		    register int lnInodes = nInodes;
 		    while (lnInodes
 			   && lip->u.vnode.vnodeNumber == vnodeNumber) {
 			if (VNDISK_GET_INO(vnode) == lip->inodeNumber) {
@@ -3628,8 +3114,38 @@ MaybeZapVolume(register struct InodeSummary *isp, char *message, int deleteMe,
 void
 AskOffline(VolumeId volumeId)
 {
-    if (FSYNC_askfs(volumeId, NULL, FSYNC_OFF, FSYNC_SALVAGE) == FSYNC_DENIED) {
-	Log("AskOffline:  file server denied offline request; a general salvage is required.\n");
+    afs_int32 code, i;
+    /* Send FSYNC_VOL_OFF (reason FSYNC_SALVAGE) for volumeId; at most 3 attempts, Abort() unless one returns SYNC_OK. */
+    for (i = 0; i < 3; i++) {
+	code = FSYNC_VolOp(volumeId, NULL, FSYNC_VOL_OFF, FSYNC_SALVAGE, NULL);
+
+	if (code == SYNC_OK) {
+	    break;
+	} else if (code == SYNC_DENIED) {	/* explicit refusal: not retried */
+#ifdef DEMAND_ATTACH_ENABLE
+	    Log("AskOffline:  file server denied offline request; a general salvage may be required.\n");
+#else
+	    Log("AskOffline:  file server denied offline request; a general salvage is required.\n");
+#endif
+	    Abort("Salvage aborted\n");
+	} else if (code == SYNC_BAD_COMMAND) {	/* command word rejected: not retried */
+	    Log("AskOffline:  fssync protocol mismatch (bad command word '%d'); salvage aborting.\n",
+		FSYNC_VOL_OFF);
+#ifdef DEMAND_ATTACH_ENABLE
+	    Log("AskOffline:  please make sure fileserver, volserver, salvageserver and salvager binaries are same version.\n");
+#else
+	    Log("AskOffline:  please make sure fileserver, volserver and salvager binaries are same version.\n");
+#endif
+	    Abort("Salvage aborted\n");
+	} else if (i < 2) {
+	    /* any other result: tear down and re-create the FSYNC client, then try again */
+	    Log("AskOffline:  request for fileserver to take volume offline failed; trying again...\n");
+	    FSYNC_clientFinis();
+	    FSYNC_clientInit();
+	}
+    }
+    if (code != SYNC_OK) {	/* all three attempts failed */
+	Log("AskOffline:  request for fileserver to take volume offline failed; salvage aborting.\n");
 	Abort("Salvage aborted\n");
     }
 }
@@ -3637,8 +3153,30 @@ AskOffline(VolumeId volumeId)
 void
 AskOnline(VolumeId volumeId, char *partition)
 {
-    if (FSYNC_askfs(volumeId, partition, FSYNC_ON, 0) == FSYNC_DENIED) {
-	Log("AskOnline:  file server denied online request to volume %u partition %s\n", volumeId, partition);
+    afs_int32 code, i;
+    /* Send FSYNC_VOL_ON for volumeId on partition; at most 3 attempts.  Failures are only logged, never Abort()ed. */
+    for (i = 0; i < 3; i++) {
+	code = FSYNC_VolOp(volumeId, partition, FSYNC_VOL_ON, FSYNC_WHATEVER, NULL);
+
+	if (code == SYNC_OK) {
+	    break;
+	} else if (code == SYNC_DENIED) {	/* logged; the loop retries on the same connection */
+	    Log("AskOnline:  file server denied online request to volume %u partition %s; trying again...\n", volumeId, partition);
+	} else if (code == SYNC_BAD_COMMAND) {	/* command word rejected: stop retrying */
+	    Log("AskOnline:  fssync protocol mismatch (bad command word '%d')\n",
+		FSYNC_VOL_ON);
+#ifdef DEMAND_ATTACH_ENABLE
+	    Log("AskOnline:  please make sure fileserver, volserver, salvageserver and salvager binaries are same version.\n");
+#else
+	    Log("AskOnline:  please make sure fileserver, volserver and salvager binaries are same version.\n");
+#endif
+	    break;
+	} else if (i < 2) {
+	    /* any other result: tear down and re-create the FSYNC client, then try again */
+	    Log("AskOnline:  request for fileserver to put volume online failed; trying again...\n");
+	    FSYNC_clientFinis();
+	    FSYNC_clientInit();
+	}
     }
 }
 
@@ -3772,7 +3310,7 @@ TimeStamp(time_t clock, int precision)
 }
 
 void
-CheckLogFile(void)
+CheckLogFile(char * log_path)
 {
     char oldSlvgLog[AFSDIR_PATH_MAX];
 
@@ -3783,11 +3321,11 @@ CheckLogFile(void)
     }
 #endif
 
-    strcpy(oldSlvgLog, AFSDIR_SERVER_SLVGLOG_FILEPATH);
+    strcpy(oldSlvgLog, log_path);
     strcat(oldSlvgLog, ".old");
     if (!logFile) {
-	renamefile(AFSDIR_SERVER_SLVGLOG_FILEPATH, oldSlvgLog);
-	logFile = afs_fopen(AFSDIR_SERVER_SLVGLOG_FILEPATH, "a");
+	renamefile(log_path, oldSlvgLog);
+	logFile = afs_fopen(log_path, "a");
 
 	if (!logFile) {		/* still nothing, use stdout */
 	    logFile = stdout;
@@ -3801,7 +3339,7 @@ CheckLogFile(void)
 
 #ifndef AFS_NT40_ENV
 void
-TimeStampLogFile(void)
+TimeStampLogFile(char * log_path)
 {
     char stampSlvgLog[AFSDIR_PATH_MAX];
     struct tm *lt;
@@ -3811,13 +3349,13 @@ TimeStampLogFile(void)
     lt = localtime(&now);
     (void)afs_snprintf(stampSlvgLog, sizeof stampSlvgLog,
 		       "%s.%04d-%02d-%02d.%02d:%02d:%02d",
-		       AFSDIR_SERVER_SLVGLOG_FILEPATH, lt->tm_year + 1900,
+		       log_path, lt->tm_year + 1900,
 		       lt->tm_mon + 1, lt->tm_mday, lt->tm_hour, lt->tm_min,
 		       lt->tm_sec);
 
     /* try to link the logfile to a timestamped filename */
     /* if it fails, oh well, nothing we can do */
-    link(AFSDIR_SERVER_SLVGLOG_FILEPATH, stampSlvgLog);
+    link(log_path, stampSlvgLog);
 }
 #endif
 
@@ -3937,7 +3475,7 @@ UseTheForceLuke(char *path)
  *
  * NOTE:
  *	The VRMIX fsck will not muck with the filesystem it is supposedly
- *	fixing and create a "FORCESAVAGE" file (by design).  Instead, we
+ *	fixing and create a "FORCESALVAGE" file (by design).  Instead, we
  *	muck directly with the root inode, which is within the normal
  *	domain of fsck.
  *	ListViceInodes() has a side effect of setting ForceSalvage if
diff --git a/src/vol/vol-salvage.h b/src/vol/vol-salvage.h
new file mode 100644
index 0000000000..c95ce249dc
--- /dev/null
+++ b/src/vol/vol-salvage.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ *      Module:		vol-salvage.h
+ */
+
+#ifndef __vol_salvage_h_
+#define __vol_salvage_h_
+
+#define SalvageVersion "2.4"
+
+#include "salvage.h"
+#include "volinodes.h"
+
+/* salvager data structures */
+struct InodeSummary {		/* Inode summary file--an entry for each
+				 * volume in the inode file for a partition */
+    VolId volumeId;		/* Volume id */
+    VolId RWvolumeId;		/* RW volume associated */
+    int index;			/* index into inode file (0, 1, 2 ...) */
+    int nInodes;		/* Number of inodes for this volume */
+    int nSpecialInodes;		/* Number of special inodes, i.e.  volume
+				 * header, index, etc.  These are all
+				 * marked (viceinode.h) and will all be sorted
+				 * to the beginning of the information for
+				 * this volume.  Read-only volumes should
+				 * ONLY have special inodes (all the other
+				 * inodes look as if they belong to the
+				 * original RW volume). */
+    Unique maxUniquifier;	/* The maximum uniquifier found in all the inodes.
+				 * This is only useful for RW volumes and is used
+				 * to compute a new volume uniquifier in the event
+				 * that the header needs to be recreated. The inode
+				 * uniquifier may be a truncated version of vnode
+				 * uniquifier (AFS_3DISPARES). The real maxUniquifier
+				 * is taken from the vnodes and calculated from them later */
+    struct VolumeSummary *volSummary;
+    /* Either a pointer to the original volume
+     * header summary, or constructed summary
+     * information */
+} *inodeSummary;
+#define readOnly(isp)	((isp)->volumeId != (isp)->RWvolumeId)
+
+struct VolumeSummary {		/* Volume summary an entry for each
+				 * volume in a volume directory.
+				 * Assumption: one volume directory per
+				 * partition */
+    char *fileName;		/* File name on the partition for the volume
+				 * header */
+    struct VolumeHeader header;
+    /* volume number, rw volume number, inode
+     * numbers of each major component of
+     * the volume */
+    IHandle_t *volumeInfoHandle;
+    byte wouldNeedCallback;	/* set if the file server should issue
+				 * call backs for all the files in this volume when
+				 * the volume goes back on line */
+};
+
+struct VnodeInfo {
+    IHandle_t *handle;		/* Inode containing this index */
+    int nVnodes;		/* Total number of vnodes in index */
+    int nAllocatedVnodes;	/* Total number actually used */
+    int volumeBlockCount;	/* Total number of blocks used by volume */
+    Inode *inodes;		/* Directory only */
+    struct VnodeEssence {
+	short count;		/* Number of references to vnode; MUST BE SIGNED */
+	unsigned claimed:1;	/* Set when a parent directory containing an entry
+				 * referencing this vnode is found.  The claim
+				 * is that the parent in "parent" can point to
+				 * this vnode, and no other */
+	unsigned changed:1;	/* Set if any parameters (other than the count)
+				 * in the vnode change.   It is determined if the
+				 * link count has changed by noting whether it is
+				 * 0 after scanning all directories */
+	unsigned salvaged:1;	/* Set if this directory vnode has already been salvaged. */
+	unsigned todelete:1;	/* Set if this vnode is to be deleted (should not be claimed) */
+	afs_fsize_t blockCount;
+	/* Number of blocks (1K) used by this vnode,
+	 * approximately */
+	VnodeId parent;		/* parent in vnode */
+	Unique unique;		/* Must match entry! */
+	char *name;		/* Name of directory entry */
+	int modeBits;		/* File mode bits */
+	Inode InodeNumber;	/* file's inode */
+	int type;		/* File type */
+	int author;		/* File author */
+	int owner;		/* File owner */
+	int group;		/* File group */
+    } *vnodes;
+};
+
+struct DirSummary {
+    struct DirHandle dirHandle;
+    VnodeId vnodeNumber;
+    Unique unique;
+    unsigned haveDot, haveDotDot;
+    VolumeId rwVid;
+    int copied;			/* If the copy-on-write stuff has been applied */
+    VnodeId parent;
+    char *name;
+    char *vname;
+    IHandle_t *ds_linkH;
+};
+
+#define ORPH_IGNORE 0
+#define ORPH_REMOVE 1
+#define ORPH_ATTACH 2
+
+
+/* command line options */
+extern int debug;			/* -d flag */
+extern int Testing;		        /* -n flag */
+extern int ListInodeOption;		/* -i flag */
+extern int ShowRootFiles;		/* -r flag */
+extern int RebuildDirs;		        /* -sal flag */
+extern int Parallel;		        /* -para X flag */
+extern int PartsPerDisk;		/* Salvage up to 8 partitions on same disk sequentially */
+extern int forceR;			/* -b flag */
+extern int ShowLog;		        /* -showlog flag */
+extern int ShowSuid;		        /* -showsuid flag */
+extern int ShowMounts;		        /* -showmounts flag */
+extern int orphans;	                /* -orphans option */
+extern int Showmode;
+
+#ifndef AFS_NT40_ENV
+extern int useSyslog;		        /* -syslog flag */
+extern int useSyslogFacility;	        /* -syslogfacility option */
+#endif
+
+#define	MAXPARALLEL	32
+
+extern int OKToZap;			/* -o flag */
+extern int ForceSalvage;		/* If salvage should occur despite the DONT_SALVAGE flag
+					 * in the volume header */
+
+
+#define ROOTINODE	2	/* Root inode of a 4.2 Unix file system
+				 * partition */
+extern Device fileSysDevice;	/* The device number of the current
+				 * partition being salvaged */
+#ifdef AFS_NT40_ENV
+extern char fileSysPath[8];
+#else
+extern char *fileSysPath;	/* The path of the mounted partition currently
+				 * being salvaged, i.e. the directory
+				 * containing the volume headers */
+#endif /* AFS_NT40_ENV */
+extern char *fileSysPathName;	/* NT needs this to make name pretty in log. */
+extern IHandle_t *VGLinkH;	/* Link handle for current volume group. */
+extern int VGLinkH_cnt;	        /* # of references to lnk handle. */
+extern struct DiskPartition *fileSysPartition;	/* Partition  being salvaged */
+#ifndef AFS_NT40_ENV
+extern char *fileSysDeviceName;	/* The block device where the file system
+				 * being salvaged was mounted */
+extern char *filesysfulldev;
+#endif /* AFS_NT40_ENV */
+extern int VolumeChanged;	/* Set by any routine which would change the volume in
+				 * a way which would require callbacks to be broken if the
+				 * volume was put back on line by an active file server */
+
+extern VolumeDiskData VolInfo;	/* A copy of the last good or salvaged volume header dealt with */
+
+extern int nVolumesInInodeFile;	/* Number of read-write volumes summarized */
+extern int inodeFd;     	/* File descriptor for inode file */
+
+
+extern struct VnodeInfo vnodeInfo[nVNODECLASSES];
+
+
+extern struct VolumeSummary *volumeSummaryp;	/* Holds all the volumes in a part */
+extern int nVolumes;		/* Number of volumes (read-write and read-only)
+				 * in volume summary */
+
+extern char * tmpdir;
+extern FILE *logFile;	        /* one of {/usr/afs/logs,/vice/file}/SalvageLog */
+
+
+#ifdef AFS_NT40_ENV
+/* For NT, we can fork the per partition salvagers to gain the required
+ * safety against Aborts. But there are too many complex data structures at
+ * the per volume salvager layer to easily copy the data across.
+ * childJobNumber is reset from -1 to the job number if this is a
+ * per partition child of the main salvager. This information is passed
+ * out-of-band in the extra data area setup for the now unused parent/child
+ * data transfer.
+ */
+#define SALVAGER_MAGIC 0x00BBaaDD
+#define NOT_CHILD -1		/* job numbers start at 0 */
+/* If new options need to be passed to child, add them here. */
+typedef struct {
+    int cj_magic;
+    int cj_number;
+    char cj_part[32];
+} childJob_t;
+
+/* Child job this process is running. */
+extern childJob_t myjob = { SALVAGER_MAGIC, NOT_CHILD, "" };
+
+extern int nt_SalvagePartition(char *partName, int jobn);
+extern int nt_SetupPartitionSalvage(void *datap, int len);
+
+typedef struct {
+    struct InodeSummary *svgp_inodeSummaryp;
+    int svgp_count;
+} SVGParms_t;
+#define canfork 0
+#else /* AFS_NT40_ENV */
+#define canfork 1
+#endif /* AFS_NT40_ENV */
+
+
+/* prototypes */
+extern void Exit(int code);
+extern int Fork(void);
+extern int Wait(char *prog);
+extern char *ToString(char *s);
+extern void AskOffline(VolumeId volumeId);
+extern void AskOnline(VolumeId volumeId, char *partition);
+extern void CheckLogFile(char * log_path);
+#ifndef AFS_NT40_ENV
+extern void TimeStampLogFile(char * log_path);
+#endif
+extern void ClearROInUseBit(struct VolumeSummary *summary);
+extern void CopyAndSalvage(register struct DirSummary *dir);
+extern int CopyInode(Device device, Inode inode1, Inode inode2, int rwvolume);
+extern void CopyOnWrite(register struct DirSummary *dir);
+extern void CountVolumeInodes(register struct ViceInodeInfo *ip, int maxInodes,
+		       register struct InodeSummary *summary);
+extern void DeleteExtraVolumeHeaderFile(register struct VolumeSummary *vsp);
+extern void DistilVnodeEssence(VolumeId vid, VnodeClass class, Inode ino,
+			       Unique * maxu);
+extern int GetInodeSummary(char *path, VolumeId singleVolumeNumber);
+extern void GetVolumeSummary(VolumeId singleVolumeNumber);
+extern void JudgeEntry(struct DirSummary *dir, char *name, VnodeId vnodeNumber,
+		       Unique unique);
+extern void MaybeZapVolume(register struct InodeSummary *isp, char *message,
+			   int deleteMe, int check);
+extern void ObtainSalvageLock(void);
+extern void PrintInodeList(void);
+extern void PrintInodeSummary(void);
+extern void PrintVolumeSummary(void);
+extern int QuickCheck(register struct InodeSummary *isp, int nVols);
+extern void RemoveTheForce(char *path);
+extern void SalvageDir(char *name, VolumeId rwVid, struct VnodeInfo *dirVnodeInfo,
+		       IHandle_t * alinkH, int i, struct DirSummary *rootdir,
+		       int *rootdirfound);
+extern void SalvageFileSysParallel(struct DiskPartition *partP);
+extern void SalvageFileSys(struct DiskPartition *partP, VolumeId singleVolumeNumber);
+extern void SalvageFileSys1(struct DiskPartition *partP,
+			    VolumeId singleVolumeNumber);
+extern int SalvageHeader(register struct stuff *sp, struct InodeSummary *isp,
+			 int check, int *deleteMe);
+extern int SalvageIndex(Inode ino, VnodeClass class, int RW,
+			register struct ViceInodeInfo *ip, int nInodes,
+			struct VolumeSummary *volSummary, int check);
+extern int SalvageVnodes(register struct InodeSummary *rwIsp,
+			 register struct InodeSummary *thisIsp,
+			 register struct ViceInodeInfo *inodes, int check);
+extern int SalvageVolume(register struct InodeSummary *rwIsp, IHandle_t * alinkH);
+extern void DoSalvageVolumeGroup(register struct InodeSummary *isp, int nVols);
+#ifdef AFS_NT40_ENV
+extern void SalvageVolumeGroup(register struct InodeSummary *isp, int nVols);
+#else
+#define SalvageVolumeGroup DoSalvageVolumeGroup
+#endif
+extern int SalvageVolumeHeaderFile(register struct InodeSummary *isp,
+				   register struct ViceInodeInfo *inodes, int RW,
+				   int check, int *deleteMe);
+extern void showlog(void);
+extern int UseTheForceLuke(char *path);
+
+
+
+#endif /* __vol_salvage_h_ */
diff --git a/src/vol/voldefs.h b/src/vol/voldefs.h
index 2094a0ca04..b546be24f3 100644
--- a/src/vol/voldefs.h
+++ b/src/vol/voldefs.h
@@ -25,6 +25,9 @@
 #define ROVOL			1
 #define BACKVOL			2
 
+/* maximum number of Vice partitions */
+#define	VOLMAXPARTS	255
+
 /* All volumes will have a volume header name in this format */
 #if	defined(AFS_AIX_ENV) || defined(AFS_HPUX_ENV)
 /* Note that <afs/param.h> must have been included before we get here... */
diff --git a/src/vol/volinodes.h b/src/vol/volinodes.h
index cb72b9c0b6..37b00fef6b 100644
--- a/src/vol/volinodes.h
+++ b/src/vol/volinodes.h
@@ -14,6 +14,9 @@
 
  */
 
+#ifndef __volinodes_h_
+#define __volinodes_h_
+
 /* Used by vutil.c and salvager.c */
 
 private struct VolumeHeader tempHeader;
@@ -56,3 +59,5 @@ LINKTABLEMAGIC, LINKTABLEVERSION}, VI_LINKTABLE,
 #define MAXINODETYPE VI_LINKTABLE
 
 Volume *VWaitAttachVolume();
+
+#endif /* __volinodes_h_ */
diff --git a/src/vol/volume.c b/src/vol/volume.c
index 7eb8854e86..fae9f87b56 100644
--- a/src/vol/volume.c
+++ b/src/vol/volume.c
@@ -5,6 +5,8 @@
  * This software has been released under the terms of the IBM Public
  * License.  For details, see the LICENSE file in the top-level source
  * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
  */
 
 /* 1/1/89: NB:  this stuff is all going to be replaced.  Don't take it too seriously */
@@ -121,6 +123,9 @@ RCSID
 #ifdef AFS_NT40_ENV
 #include <io.h>
 #endif
+#include "daemon_com.h"
+#include "fssync.h"
+#include "salvsync.h"
 #include "vnode.h"
 #include "volume.h"
 #include "partition.h"
@@ -130,11 +135,15 @@ RCSID
 #include "afs/assert.h"
 #endif /* AFS_PTHREAD_ENV */
 #include "vutils.h"
-#include "fssync.h"
+#include <dir/dir.h>
 #ifndef AFS_NT40_ENV
 #include <unistd.h>
 #endif
 
+#if !defined(offsetof)
+#include <stddef.h>
+#endif
+
 #ifdef O_LARGEFILE
 #define afs_stat	stat64
 #define afs_fstat	fstat64
@@ -147,14 +156,16 @@ RCSID
 
 #ifdef AFS_PTHREAD_ENV
 pthread_mutex_t vol_glock_mutex;
-pthread_mutex_t vol_attach_mutex;
-pthread_mutex_t vol_fsync_mutex;
 pthread_mutex_t vol_trans_mutex;
 pthread_cond_t vol_put_volume_cond;
 pthread_cond_t vol_sleep_cond;
 int vol_attach_threads = 1;
 #endif /* AFS_PTHREAD_ENV */
 
+#ifdef AFS_DEMAND_ATTACH_FS
+pthread_mutex_t vol_salvsync_mutex;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
 #ifdef	AFS_OSF_ENV
 extern void *calloc(), *realloc();
 #endif
@@ -162,12 +173,18 @@ extern void *calloc(), *realloc();
 /*@printflike@*/ extern void Log(const char *format, ...);
 
 /* Forward declarations */
-static Volume *attach2(Error * ec, char *path,
+static Volume *attach2(Error * ec, VolId vid, char *path,
 		       register struct VolumeHeader *header,
-		       struct DiskPartition *partp, int isbusy);
+		       struct DiskPartition *partp, Volume * vp, 
+		       int isbusy, int mode);
+static void ReallyFreeVolume(Volume * vp);
+#ifdef AFS_DEMAND_ATTACH_FS
 static void FreeVolume(Volume * vp);
+#else /* !AFS_DEMAND_ATTACH_FS */
+#define FreeVolume(vp) ReallyFreeVolume(vp)
 static void VScanUpdateList(void);
-static void InitLRU(int howMany);
+#endif /* !AFS_DEMAND_ATTACH_FS */
+static void VInitVolumeHeaderCache(afs_uint32 howMany);
 static int GetVolumeHeader(register Volume * vp);
 static void ReleaseVolumeHeader(register struct volHeader *hd);
 static void FreeVolumeHeader(register Volume * vp);
@@ -175,22 +192,72 @@ static void AddVolumeToHashTable(register Volume * vp, int hashid);
 static void DeleteVolumeFromHashTable(register Volume * vp);
 static int VHold(Volume * vp);
 static int VHold_r(Volume * vp);
-static void GetBitmap(Error * ec, Volume * vp, VnodeClass class);
+static void VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class);
 static void GetVolumePath(Error * ec, VolId volumeId, char **partitionp,
 			  char **namep);
 static void VReleaseVolumeHandles_r(Volume * vp);
 static void VCloseVolumeHandles_r(Volume * vp);
+static void LoadVolumeHeader(Error * ec, Volume * vp);
+static int VCheckOffline(register Volume * vp);
+static int VCheckDetach(register Volume * vp);
+static Volume * GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flags);
+static int VolumeExternalName_r(VolumeId volumeId, char * name, size_t len);
 
 int LogLevel;			/* Vice loglevel--not defined as extern so that it will be
 				 * defined when not linked with vice, XXXX */
 ProgramType programType;	/* The type of program using the package */
 
 
+/* extended volume package statistics */
+VolPkgStats VStats;
+
+
 #define VOLUME_BITMAP_GROWSIZE	16	/* bytes, => 128vnodes */
 					/* Must be a multiple of 4 (1 word) !! */
-#define VOLUME_HASH_TABLE_SIZE 128	/* Must be a power of 2!! */
-#define VOLUME_HASH(volumeId) (volumeId&(VOLUME_HASH_TABLE_SIZE-1))
-private Volume *VolumeHashTable[VOLUME_HASH_TABLE_SIZE];
+
+/* this parameter needs to be tunable at runtime.
+ * 128 was really inadequate for largish servers -- at 16384 volumes this
+ * puts average chain length at 128, thus an average 65 deref's to find a volptr.
+ * talk about bad spatial locality...
+ *
+ * an AVL or splay tree might work a lot better, but we'll just increase
+ * the default hash table size for now
+ */
+#define DEFAULT_VOLUME_HASH_SIZE 256   /* Must be a power of 2!! */
+#define DEFAULT_VOLUME_HASH_MASK (DEFAULT_VOLUME_HASH_SIZE-1)
+#define VOLUME_HASH(volumeId) (volumeId&(VolumeHashTable.Mask))
+
+/*
+ * turn volume hash chains into partially ordered lists.
+ * when the threshold is exceeded between two adjacent elements,
+ * perform a chain rebalancing operation.
+ *
+ * keep the threshold high in order to keep cache line invalidates
+ * low "enough" on SMPs
+ */
+#define VOLUME_HASH_REORDER_THRESHOLD 200
+
+/*
+ * when possible, don't just reorder single elements, but reorder
+ * entire chains of elements at once.  a chain of elements that
+ * exceed the element previous to the pivot by at least CHAIN_THRESH 
+ * accesses are moved in front of the chain whose elements have at
+ * least CHAIN_THRESH fewer accesses than the pivot element
+ */
+#define VOLUME_HASH_REORDER_CHAIN_THRESH (VOLUME_HASH_REORDER_THRESHOLD / 2)
+
+#include "rx/rx_queue.h"
+
+
+VolumeHashTable_t VolumeHashTable = {
+    DEFAULT_VOLUME_HASH_SIZE,
+    DEFAULT_VOLUME_HASH_MASK,
+    NULL
+};
+
+
+static void VInitVolumeHash(void);
+
 
 #ifndef AFS_HAVE_FFS
 /* This macro is used where an ffs() call does not exist. Was in util/ffs.c */
@@ -211,7 +278,6 @@ ffs(x)
 #endif /* !AFS_HAVE_FFS */
 
 #ifdef AFS_PTHREAD_ENV
-#include "rx/rx_queue.h"
 typedef struct diskpartition_queue_t {
     struct rx_queue queue;
     struct DiskPartition * diskP;
@@ -224,9 +290,120 @@ typedef struct vinitvolumepackage_thread_t {
 static void * VInitVolumePackageThread(void * args);
 #endif /* AFS_PTHREAD_ENV */
 
-struct Lock vol_listLock;	/* Lock obtained when listing volumes:  prevents a volume from being missed if the volume is attached during a list volumes */
+static int VAttachVolumesByPartition(struct DiskPartition *diskP, 
+				     int * nAttached, int * nUnattached);
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* demand attach fileserver extensions */
+
+/* XXX
+ * in the future we will support serialization of VLRU state into the fs_state
+ * disk dumps
+ *
+ * these structures are the beginning of that effort
+ */
+struct VLRU_DiskHeader {
+    struct versionStamp stamp;            /* magic and structure version number */
+    afs_uint32 mtime;                     /* time of dump to disk */
+    afs_uint32 num_records;               /* number of VLRU_DiskEntry records */
+};
+
+struct VLRU_DiskEntry {
+    afs_uint32 vid;                       /* volume ID */
+    afs_uint32 idx;                       /* generation */
+    afs_uint32 last_get;                  /* timestamp of last get */
+};
+
+struct VLRU_StartupQueue {
+    struct VLRU_DiskEntry * entry;
+    int num_entries;
+    int next_idx;
+};
+
+typedef struct vshutdown_thread_t {
+    struct rx_queue q;
+    pthread_mutex_t lock;
+    pthread_cond_t cv;
+    pthread_cond_t master_cv;
+    int n_threads;
+    int n_threads_complete;
+    int vol_remaining;
+    int schedule_version;
+    int pass;
+    byte n_parts;
+    byte n_parts_done_pass;
+    byte part_thread_target[VOLMAXPARTS+1];
+    byte part_done_pass[VOLMAXPARTS+1];
+    struct rx_queue * part_pass_head[VOLMAXPARTS+1];
+    int stats[4][VOLMAXPARTS+1];
+} vshutdown_thread_t;
+static void * VShutdownThread(void * args);
+
+
+static Volume * VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode);
+static int VCheckFree(Volume * vp);
+
+/* VByP List */
+static void AddVolumeToVByPList_r(Volume * vp);
+static void DeleteVolumeFromVByPList_r(Volume * vp);
+static void VVByPListBeginExclusive_r(struct DiskPartition * dp);
+static void VVByPListEndExclusive_r(struct DiskPartition * dp);
+static void VVByPListWait_r(struct DiskPartition * dp);
+
+/* online salvager */
+static int VCheckSalvage(register Volume * vp);
+static int VUpdateSalvagePriority_r(Volume * vp);
+static int VScheduleSalvage_r(Volume * vp);
+static int VCancelSalvage_r(Volume * vp, int reason);
+
+/* Volume hash table */
+static void VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp);
+static void VHashBeginExclusive_r(VolumeHashChainHead * head);
+static void VHashEndExclusive_r(VolumeHashChainHead * head);
+static void VHashWait_r(VolumeHashChainHead * head);
+
+/* Volume state machine */
+static void VCreateReservation_r(Volume * vp);
+static void VCancelReservation_r(Volume * vp);
+static void VWaitStateChange_r(Volume * vp);
+static void VWaitExclusiveState_r(Volume * vp);
+static int IsExclusiveState(VolState state);
+static int IsErrorState(VolState state);
+static int IsValidState(VolState state);
+
+/* shutdown */
+static int ShutdownVByPForPass_r(struct DiskPartition * dp, int pass);
+static int ShutdownVolumeWalk_r(struct DiskPartition * dp, int pass,
+				struct rx_queue ** idx);
+static void ShutdownController(vshutdown_thread_t * params);
+static void ShutdownCreateSchedule(vshutdown_thread_t * params);
+
+/* VLRU */
+static void VLRU_ComputeConstants(void);
+static void VInitVLRU(void);
+static void VLRU_Init_Node_r(volatile Volume * vp);
+static void VLRU_Add_r(volatile Volume * vp);
+static void VLRU_Delete_r(volatile Volume * vp);
+static void VLRU_UpdateAccess_r(volatile Volume * vp);
+static void * VLRU_ScannerThread(void * args);
+static void VLRU_Scan_r(int idx);
+static void VLRU_Promote_r(int idx);
+static void VLRU_Demote_r(int idx);
+static void VLRU_SwitchQueues(volatile Volume * vp, int new_idx, int append);
+
+/* soft detach */
+static int VCheckSoftDetach(volatile Volume * vp, afs_uint32 thresh);
+static int VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh);
+static int VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+struct Lock vol_listLock;	/* Lock obtained when listing volumes:  
+				 * prevents a volume from being missed 
+				 * if the volume is attached during a 
+				 * list volumes */
 
-extern struct Lock FSYNC_handler_lock;
 
 static int TimeZoneCorrection;	/* Number of seconds west of GMT */
 
@@ -247,12 +424,16 @@ bit32 VolumeCacheCheck;		/* Incremented everytime a volume goes on line--
 				 * vnode will be invalidated
 				 * access only with VOL_LOCK held */
 
-int VolumeCacheSize = 200, VolumeGets = 0, VolumeReplacements = 0, Vlooks = 0;
 
 
+
+/***************************************************/
+/* Startup routines                                */
+/***************************************************/
+
 int
-VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
-		   int connect, int volcache)
+VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes, afs_uint32 nSmallVnodes,
+		   int connect, afs_uint32 volcache)
 {
     int errors = 0;		/* Number of errors while finding vice partitions. */
     struct timeval tv;
@@ -260,10 +441,24 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
 
     programType = pt;
 
+    /* reset the stats and apply the default header cache size in every build:
+     * VStats.hdr_cache_size is read below even without demand attach */
+    memset(&VStats, 0, sizeof(VStats));
+    VStats.hdr_cache_size = 200;
+
+    VInitPartitionPackage();
+    VInitVolumeHash();
+    VInitVnHashByVolume();
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (programType == fileServer) {
+	VInitVLRU();
+    } else {
+	VLRU_SetOptions(VLRU_SET_ENABLED, 0);
+    }
+#endif
+
 #ifdef AFS_PTHREAD_ENV
     assert(pthread_mutex_init(&vol_glock_mutex, NULL) == 0);
-    assert(pthread_mutex_init(&vol_attach_mutex, NULL) == 0);
-    assert(pthread_mutex_init(&vol_fsync_mutex, NULL) == 0);
     assert(pthread_mutex_init(&vol_trans_mutex, NULL) == 0);
     assert(pthread_cond_init(&vol_put_volume_cond, NULL) == 0);
     assert(pthread_cond_init(&vol_sleep_cond, NULL) == 0);
@@ -271,25 +466,41 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
     IOMGR_Initialize();
 #endif /* AFS_PTHREAD_ENV */
     Lock_Init(&vol_listLock);
-    Lock_Init(&FSYNC_handler_lock);
+
     srandom(time(0));		/* For VGetVolumeInfo */
     gettimeofday(&tv, &tz);
     TimeZoneCorrection = tz.tz_minuteswest * 60;
 
+#ifdef AFS_DEMAND_ATTACH_FS
+    assert(pthread_mutex_init(&vol_salvsync_mutex, NULL) == 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
     /* Ok, we have done enough initialization that fileserver can 
      * start accepting calls, even though the volumes may not be 
      * available just yet.
      */
     VInit = 1;
 
+#if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_SERVER)
+    if (programType == salvageServer) {
+	SALVSYNC_salvInit();
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+#ifdef FSSYNC_BUILD_SERVER
     if (programType == fileServer) {
-	/* File server or "stand" */
 	FSYNC_fsInit();
     }
+#endif
+#if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_CLIENT)
+    if (programType == fileServer) {
+	/* establish a connection to the salvager at this point */
+	assert(VConnectSALV() != 0);
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
 
-    if (volcache > VolumeCacheSize)
-	VolumeCacheSize = volcache;
-    InitLRU(VolumeCacheSize);
+    if (volcache > VStats.hdr_cache_size)
+	VStats.hdr_cache_size = volcache;
+    VInitVolumeHeaderCache(VStats.hdr_cache_size);
 
     VInitVnodes(vLarge, nLargeVnodes);
     VInitVnodes(vSmall, nSmallVnodes);
@@ -304,7 +515,7 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
 #ifdef AFS_PTHREAD_ENV
 	struct vinitvolumepackage_thread_t params;
 	struct diskpartition_queue_t * dpq;
-	int i, len;
+	int i, threads, parts;
 	pthread_t tid;
 	pthread_attr_t attrs;
 
@@ -313,29 +524,56 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
 	params.n_threads_complete = 0;
 
 	/* create partition work queue */
-	for (len=0, diskP = DiskPartitionList; diskP; diskP = diskP->next, len++) {
+	for (parts=0, diskP = DiskPartitionList; diskP; diskP = diskP->next, parts++) {
 	    dpq = (diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t));
 	    assert(dpq != NULL);
 	    dpq->diskP = diskP;
 	    queue_Prepend(&params,dpq);
 	}
 
-	assert(pthread_attr_init(&attrs) == 0);
-	assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+	threads = MIN(parts, vol_attach_threads);
 
-	len = MIN(len, vol_attach_threads);
-	
-	VOL_LOCK;
-	for (i=0; i < len; i++) {
-	    assert(pthread_create
-		   (&tid, &attrs, &VInitVolumePackageThread,
-		    &params) == 0);
-	}
+	if (threads > 1) {
+	    /* spawn off a bunch of initialization threads */
+	    assert(pthread_attr_init(&attrs) == 0);
+	    assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
 
-	while(params.n_threads_complete < len) {
-  	    pthread_cond_wait(&params.thread_done_cv,&vol_glock_mutex);
+	    Log("VInitVolumePackage: beginning parallel fileserver startup\n");
+#ifdef AFS_DEMAND_ATTACH_FS
+	    Log("VInitVolumePackage: using %d threads to pre-attach volumes on %d partitions\n",
+		threads, parts);
+#else /* AFS_DEMAND_ATTACH_FS */
+	    Log("VInitVolumePackage: using %d threads to attach volumes on %d partitions\n",
+		threads, parts);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+	    VOL_LOCK;
+	    for (i=0; i < threads; i++) {
+		assert(pthread_create
+		       (&tid, &attrs, &VInitVolumePackageThread,
+			&params) == 0);
+	    }
+
+	    while(params.n_threads_complete < threads) {
+		pthread_cond_wait(&params.thread_done_cv,&vol_glock_mutex);
+	    }
+	    VOL_UNLOCK;
+
+	    assert(pthread_attr_destroy(&attrs) == 0);
+	} else {
+	    /* if we're only going to run one init thread, don't bother creating
+	     * another LWP */
+	    Log("VInitVolumePackage: beginning single-threaded fileserver startup\n");
+#ifdef AFS_DEMAND_ATTACH_FS
+	    Log("VInitVolumePackage: using 1 thread to pre-attach volumes on %d partition(s)\n",
+		parts);
+#else /* AFS_DEMAND_ATTACH_FS */
+	    Log("VInitVolumePackage: using 1 thread to attach volumes on %d partition(s)\n",
+		parts);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+	    VInitVolumePackageThread(&params);
 	}
-	VOL_UNLOCK;
 
 	assert(pthread_cond_destroy(&params.thread_done_cv) == 0);
 
@@ -346,44 +584,28 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
 	/* Attach all the volumes in this partition */
 	for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
 	    int nAttached = 0, nUnattached = 0;
-	    Log("Partition %s: attaching volumes\n", diskP->name);
-	    dirp = opendir(VPartitionPath(diskP));
-	    assert(dirp);
-	    while ((dp = readdir(dirp))) {
-		char *p;
-		p = strrchr(dp->d_name, '.');
-		if (p != NULL && strcmp(p, VHDREXT) == 0) {
-		    Error error;
-		    Volume *vp;
-		    vp = VAttachVolumeByName(&error, diskP->name, dp->d_name,
-					     V_VOLUPD);
-		    (*(vp ? &nAttached : &nUnattached))++;
-		    if (error == VOFFLINE)
-			Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name);
-		    else if (LogLevel >= 5) {
-			Log("Partition %s: attached volume %d (%s)\n",
-			    diskP->name, VolumeNumber(dp->d_name),
-			    dp->d_name);
-		    }
-		    if (vp) {
-			VPutVolume(vp);
-		    }
-		}
-	    }
-	    Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, nAttached, nUnattached);
-	    closedir(dirp);
+	    assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);
 	}
 #endif /* AFS_PTHREAD_ENV */
     }
 
     VInit = 2;			/* Initialized, and all volumes have been attached */
+#ifdef FSSYNC_BUILD_CLIENT
     if (programType == volumeUtility && connect) {
 	if (!VConnectFS()) {
 	    Log("Unable to connect to file server; aborted\n");
-	    Lock_Destroy(&FSYNC_handler_lock);
 	    exit(1);
 	}
     }
+#ifdef AFS_DEMAND_ATTACH_FS
+    else if (programType == salvageServer) {
+	if (!VConnectFS()) {
+	    Log("Unable to connect to file server; aborted\n");
+	    exit(1);
+	}
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+#endif /* FSSYNC_BUILD_CLIENT */
     return 0;
 }
 
@@ -412,32 +634,8 @@ VInitVolumePackageThread(void * args) {
 	diskP = dpq->diskP;
 	free(dpq);
 
-	Log("Partition %s: attaching volumes\n", diskP->name);
-	dirp = opendir(VPartitionPath(diskP));
-	assert(dirp);
-	while ((dp = readdir(dirp))) {
-	    char *p;
-	    p = strrchr(dp->d_name, '.');
-	    if (p != NULL && strcmp(p, VHDREXT) == 0) {
-	        Error error;
-		Volume *vp;
-		vp = VAttachVolumeByName(&error, diskP->name, dp->d_name,
-					 V_VOLUPD);
-		(*(vp ? &nAttached : &nUnattached))++;
-		if (error == VOFFLINE)
-		    Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name);
-		else if (LogLevel >= 5) {
-		    Log("Partition %s: attached volume %d (%s)\n",
-			diskP->name, VolumeNumber(dp->d_name),
-			dp->d_name);
-		}
-		if (vp) {
-		    VPutVolume(vp);
-		}
-	    }
-	}
-	Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, nAttached, nUnattached);
-	closedir(dirp);
+	assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);
+
 	VOL_LOCK;
     }
 
@@ -448,46 +646,114 @@ VInitVolumePackageThread(void * args) {
 }
 #endif /* AFS_PTHREAD_ENV */
 
-/* This must be called by any volume utility which needs to run while the
-   file server is also running.  This is separated from VInitVolumePackage so
-   that a utility can fork--and each of the children can independently
-   initialize communication with the file server */
-int
-VConnectFS(void)
+/*
+ * attach (or, under AFS_DEMAND_ATTACH_FS, pre-attach) all volumes on a given disk partition
+ */
+static int
+VAttachVolumesByPartition(struct DiskPartition *diskP, int * nAttached, int * nUnattached)
 {
-    int retVal;
-    VOL_LOCK;
-    retVal = VConnectFS_r();
-    VOL_UNLOCK;
-    return retVal;
+  DIR * dirp;
+  struct dirent * dp;
+  int ret = 0;
+
+  Log("Partition %s: attaching volumes\n", diskP->name);
+  dirp = opendir(VPartitionPath(diskP));
+  if (!dirp) {
+    Log("opendir on Partition %s failed!\n", diskP->name);
+    return 1;
+  }
+
+  while ((dp = readdir(dirp))) {
+    char *p;
+    p = strrchr(dp->d_name, '.');
+    if (p != NULL && strcmp(p, VHDREXT) == 0) {
+      Error error;
+      Volume *vp;
+#ifdef AFS_DEMAND_ATTACH_FS
+      vp = VPreAttachVolumeByName(&error, diskP->name, dp->d_name,
+                                  V_VOLUPD);
+#else /* AFS_DEMAND_ATTACH_FS */
+      vp = VAttachVolumeByName(&error, diskP->name, dp->d_name,
+			       V_VOLUPD);
+#endif /* AFS_DEMAND_ATTACH_FS */
+      (*(vp ? nAttached : nUnattached))++;
+      if (error == VOFFLINE)
+	Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name);
+      else if (LogLevel >= 5) {
+	Log("Partition %s: attached volume %d (%s)\n",
+	    diskP->name, VolumeNumber(dp->d_name),
+	    dp->d_name);
+      }
+#if !defined(AFS_DEMAND_ATTACH_FS)
+      if (vp) {
+	VPutVolume(vp);
+      }
+#endif /* AFS_DEMAND_ATTACH_FS */
+    }
+  }
+
+  Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, *nAttached, *nUnattached);
+  closedir(dirp);
+  return ret;
 }
 
-int
-VConnectFS_r(void)
-{
-    int rc;
-    assert(VInit == 2 && programType == volumeUtility);
-    rc = FSYNC_clientInit();
-    if (rc)
-	VInit = 3;
-    return rc;
-}
 
-void
-VDisconnectFS_r(void)
-{
-    assert(programType == volumeUtility);
-    FSYNC_clientFinis();
-    VInit = 2;
-}
+/***************************************************/
+/* Shutdown routines                               */
+/***************************************************/
 
-void
-VDisconnectFS(void)
-{
-    VOL_LOCK;
-    VDisconnectFS_r();
-    VOL_UNLOCK;
-}
+/*
+ * demand attach fs
+ * highly multithreaded volume package shutdown
+ *
+ * with the demand attach fileserver extensions,
+ * VShutdown has been modified to be multithreaded.
+ * In order to achieve optimal use of many threads,
+ * the shutdown code involves one control thread and
+ * n shutdown worker threads.  The control thread
+ * periodically examines the number of volumes available
+ * for shutdown on each partition, and produces a worker
+ * thread allocation schedule.  The idea is to eliminate
+ * redundant scheduling computation on the workers by
+ * having a single master scheduler.
+ *
+ * The scheduler's objectives are:
+ * (1) fairness
+ *   each partition with volumes remaining gets allocated
+ *   at least 1 thread (assuming sufficient threads)
+ * (2) performance
+ *   threads are allocated proportional to the number of
+ *   volumes remaining to be offlined.  This ensures that
+ *   the OS I/O scheduler has many requests to elevator
+ *   seek on partitions that will (presumably) take the
+ *   longest amount of time (from now) to finish shutdown
+ * (3) keep threads busy
+ *   when there are extra threads, they are assigned to
+ *   partitions using a simple round-robin algorithm
+ *
+ * In the future, we may wish to add the ability to adapt
+ * to the relative performance patterns of each disk
+ * partition.
+ *
+ *
+ * demand attach fs
+ * multi-step shutdown process
+ *
+ * demand attach shutdown is a four-step process. Each
+ * shutdown "pass" shuts down increasingly more difficult
+ * volumes.  The main purpose is to achieve better cache
+ * utilization during shutdown.
+ *
+ * pass 0
+ *   shutdown volumes in the unattached, pre-attached
+ *   and error states
+ * pass 1
+ *   shutdown attached volumes with cached volume headers
+ * pass 2
+ *   shutdown all volumes in non-exclusive states
+ * pass 3
+ *   shutdown all remaining volumes
+ */
 
 void
 VShutdown_r(void)
@@ -495,36 +761,139 @@ VShutdown_r(void)
     int i;
     register Volume *vp, *np;
     register afs_int32 code;
+#ifdef AFS_DEMAND_ATTACH_FS
+    struct DiskPartition * diskP;
+    struct diskpartition_queue_t * dpq;
+    vshutdown_thread_t params;
+    pthread_t tid;
+    pthread_attr_t attrs;
 
-    Log("VShutdown:  shutting down on-line volumes...\n");
-    for (i = 0; i < VOLUME_HASH_TABLE_SIZE; i++) {
-	/* try to hold first volume in the hash table */
-	for (vp = VolumeHashTable[i]; vp; vp = vp->hashNext) {
-	    code = VHold_r(vp);
-	    if (code == 0)
-		break;		/* got it */
-	    /* otherwise we go around again, trying another volume */
-	}
-	while (vp) {
-	    if (LogLevel >= 5)
-		Log("VShutdown:  Attempting to take volume %u offline.\n",
-		    vp->hashid);
-	    /* first compute np before releasing vp, in case vp disappears
-	     * after releasing.  Hold it, so it doesn't disapear.  If we
-	     * can't hold it, try the next one in the chain.  Invariant
-	     * at the top of this loop is that vp is held (has extra ref count).
-	     */
-	    for (np = vp->hashNext; np; np = np->hashNext) {
-		code = VHold_r(np);
-		if (code == 0)
-		    break;	/* got it */
+    memset(&params, 0, sizeof(vshutdown_thread_t));
+
+    for (params.n_parts=0, diskP = DiskPartitionList;
+	 diskP; diskP = diskP->next, params.n_parts++);
+
+    Log("VShutdown:  shutting down on-line volumes on %d partition%s...\n", 
+	params.n_parts, params.n_parts > 1 ? "s" : "");
+
+    if (vol_attach_threads > 1) {
+	/* prepare for parallel shutdown */
+	params.n_threads = vol_attach_threads;
+	assert(pthread_mutex_init(&params.lock, NULL) == 0);
+	assert(pthread_cond_init(&params.cv, NULL) == 0);
+	assert(pthread_cond_init(&params.master_cv, NULL) == 0);
+	assert(pthread_attr_init(&attrs) == 0);
+	assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+	queue_Init(&params);
+
+	/* setup the basic partition information structures for
+	 * parallel shutdown */
+	for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+	    /* XXX debug */
+	    struct rx_queue * qp, * nqp;
+	    Volume * vp;
+	    int count = 0;
+
+	    VVByPListWait_r(diskP);
+	    VVByPListBeginExclusive_r(diskP);
+
+	    /* XXX debug */
+	    for (queue_Scan(&diskP->vol_list, qp, nqp, rx_queue)) {
+		vp = (Volume *)((char *)qp - offsetof(Volume, vol_list));
+		if (vp->header)
+		    count++;
+	    }
+	    Log("VShutdown: partition %s has %d volumes with attached headers\n",
+		VPartitionPath(diskP), count);
+		
+
+	    /* build up the pass 0 shutdown work queue */
+	    dpq = (struct diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t));
+	    assert(dpq != NULL);
+	    dpq->diskP = diskP;
+	    queue_Prepend(&params, dpq);
+
+	    params.part_pass_head[diskP->device] = queue_First(&diskP->vol_list, rx_queue);
+	}
+
+	Log("VShutdown:  beginning parallel fileserver shutdown\n");
+	Log("VShutdown:  using %d threads to offline volumes on %d partition%s\n",
+	    vol_attach_threads, params.n_parts, params.n_parts > 1 ? "s" : "" );
+
+	/* do pass 0 shutdown */
+	assert(pthread_mutex_lock(&params.lock) == 0);
+	for (i=0; i < params.n_threads; i++) {
+	    assert(pthread_create
+		   (&tid, &attrs, &VShutdownThread,
+		    &params) == 0);
+	}
+	
+	/* wait for all the pass 0 shutdowns to complete */
+	while (params.n_threads_complete < params.n_threads) {
+	    assert(pthread_cond_wait(&params.master_cv, &params.lock) == 0);
+	}
+	params.n_threads_complete = 0;
+	params.pass = 1;
+	assert(pthread_cond_broadcast(&params.cv) == 0);
+	assert(pthread_mutex_unlock(&params.lock) == 0);
+
+	Log("VShutdown:  pass 0 completed using the 1 thread per partition algorithm\n");
+	Log("VShutdown:  starting passes 1 through 3 using finely-granular mp-fast algorithm\n");
+
+	/* run the parallel shutdown scheduler. it will drop the glock internally */
+	ShutdownController(&params);
+	
+	/* wait for all the workers to finish pass 3 and terminate */
+	while (params.pass < 4) {
+	    assert(pthread_cond_wait(&params.cv, &vol_glock_mutex) == 0);
+	}
+	
+	assert(pthread_attr_destroy(&attrs) == 0);
+	assert(pthread_cond_destroy(&params.cv) == 0);
+	assert(pthread_cond_destroy(&params.master_cv) == 0);
+	assert(pthread_mutex_destroy(&params.lock) == 0);
+
+	/* drop the VByPList exclusive reservations */
+	for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+	    VVByPListEndExclusive_r(diskP);
+	    Log("VShutdown:  %s stats : (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n",
+		VPartitionPath(diskP),
+		params.stats[0][diskP->device],
+		params.stats[1][diskP->device],
+		params.stats[2][diskP->device],
+		params.stats[3][diskP->device]);
+	}
+
+	Log("VShutdown:  shutdown finished using %d threads\n", params.n_threads);
+    } else {
+	/* if we're only going to run one shutdown thread, don't bother creating
+	 * another LWP */
+	Log("VShutdown:  beginning single-threaded fileserver shutdown\n");
+
+	for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+	    VShutdownByPartition_r(diskP);
+	}
+    }
+
+    Log("VShutdown:  complete.\n");
+#else /* AFS_DEMAND_ATTACH_FS */
+    Log("VShutdown:  shutting down on-line volumes...\n");
+    for (i = 0; i < VolumeHashTable.Size; i++) {
+	/* try to hold first volume in the hash table */
+	for (queue_Scan(&VolumeHashTable.Table[i],vp,np,Volume)) {
+	    code = VHold_r(vp);
+	    if (code == 0) {
+		if (LogLevel >= 5)
+		    Log("VShutdown:  Attempting to take volume %u offline.\n",
+			vp->hashid);
+		
+		/* next, take the volume offline (drops reference count) */
+		VOffline_r(vp, "File server was shut down");
 	    }
-	    /* next, take the volume offline (drops reference count) */
-	    VOffline_r(vp, "File server was shut down");
-	    vp = np;		/* next guy to try */
 	}
     }
     Log("VShutdown:  complete.\n");
+#endif /* AFS_DEMAND_ATTACH_FS */
 }
 
 void
@@ -535,7 +904,498 @@ VShutdown(void)
     VOL_UNLOCK;
 }
 
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs: shutdown controller, called directly from VShutdown_r.
+ * re-runs ShutdownCreateSchedule about once per second while pass < 4 and not all workers are complete
+ */
+static void
+ShutdownController(vshutdown_thread_t * params)
+{
+    /* XXX debug: diskP, id and shadow are only used by the logging in the loop below */
+    struct DiskPartition * diskP;
+    Device id;
+    vshutdown_thread_t shadow;
 
+    ShutdownCreateSchedule(params);
+
+    while ((params->pass < 4) &&
+	   (params->n_threads_complete < params->n_threads)) {
+	/* recompute schedule once per second */
+
+	memcpy(&shadow, params, sizeof(vshutdown_thread_t));
+
+	VOL_UNLOCK;
+	/* XXX debug: dump the snapshot (shadow) copied before the glock was dropped */
+	Log("ShutdownController:  schedule version=%d, vol_remaining=%d, pass=%d\n",
+	    shadow.schedule_version, shadow.vol_remaining, shadow.pass);
+	Log("ShutdownController:  n_threads_complete=%d, n_parts_done_pass=%d\n",
+	    shadow.n_threads_complete, shadow.n_parts_done_pass);
+	for (diskP = DiskPartitionList; diskP; diskP=diskP->next) {
+	    id = diskP->device;
+	    Log("ShutdownController:  part[%d] : (len=%d, thread_target=%d, done_pass=%d, pass_head=%p)\n",
+		id, 
+		diskP->vol_list.len,
+		shadow.part_thread_target[id], 
+		shadow.part_done_pass[id], 
+		shadow.part_pass_head[id]);
+	}
+
+	sleep(1);
+	VOL_LOCK;
+
+	ShutdownCreateSchedule(params);
+    }
+}
+
+/* create the shutdown thread work schedule.
+ * bumps params->schedule_version, records the total in
+ * params->vol_remaining, and (when volumes remain) fills
+ * params->part_thread_target[]: at least 1 thread for each
+ * partition with volumes to be shutdown (fairness), then
+ * the remaining threads based upon the amount of work left
+ */
+static void
+ShutdownCreateSchedule(vshutdown_thread_t * params)
+{
+    struct DiskPartition * diskP;
+    int sum, thr_workload, thr_left;
+    int part_residue[VOLMAXPARTS+1];
+    Device id;
+
+    /* compute the total number of outstanding volumes */
+    sum = 0;
+    for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+	sum += diskP->vol_list.len;
+    }
+    
+    params->schedule_version++;
+    params->vol_remaining = sum;
+
+    if (!sum)
+	return;
+
+    /* compute average per-thread workload (rounded up) */
+    thr_workload = sum / params->n_threads;
+    if (sum % params->n_threads)
+	thr_workload++;
+
+    thr_left = params->n_threads;
+    memset(&part_residue, 0, sizeof(part_residue));
+
+    /* for fairness, give every partition with volumes remaining
+     * at least one thread (the walk stops once thr_left hits 0) */
+    for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) {
+	id = diskP->device;
+	if (diskP->vol_list.len) {
+	    params->part_thread_target[id] = 1;
+	    thr_left--;
+	} else {
+	    params->part_thread_target[id] = 0;
+	}
+    }
+
+    if (thr_left && thr_workload) {
+	/* compute length-weighted workloads */
+	int delta;
+
+	for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) {
+	    id = diskP->device;
+	    delta = (diskP->vol_list.len / thr_workload) -
+		params->part_thread_target[id];
+	    if (delta < 0) {
+		continue;
+	    }
+	    if (delta < thr_left) {
+		params->part_thread_target[id] += delta;
+		thr_left -= delta;
+	    } else {
+		params->part_thread_target[id] += thr_left;
+		thr_left = 0;
+		break;
+	    }
+	}
+    }
+
+    if (thr_left) {
+	/* try to assign any leftover threads to partitions that
+	 * had volume lengths closer to needing thread_target+1 */
+	int max_residue, max_id;
+
+	/* compute the residues */
+	for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+	    id = diskP->device;
+	    part_residue[id] = diskP->vol_list.len - 
+		(params->part_thread_target[id] * thr_workload);
+	}
+
+	/* now try to allocate remaining threads to partitions with the
+	 * highest residues (a residue is zeroed once it has earned a thread) */
+	while (thr_left) {
+	    max_residue = 0;
+	    for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+		id = diskP->device;
+		if (part_residue[id] > max_residue) {
+		    max_residue = part_residue[id];
+		    max_id = id;
+		}
+	    }
+
+	    if (!max_residue) {	/* no positive residue found; max_id was not set this round */
+		break;
+	    }
+
+	    params->part_thread_target[max_id]++;
+	    thr_left--;
+	    part_residue[max_id] = 0;
+	}
+    }
+
+    if (thr_left) {
+	/* punt and give any remaining threads equally to each partition */
+	int alloc;
+	if (thr_left >= params->n_parts) {
+	    alloc = thr_left / params->n_parts;
+	    for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+		id = diskP->device;
+		params->part_thread_target[id] += alloc;
+		thr_left -= alloc;
+	    }
+	}
+
+	/* finish off the last of the threads */
+	for (diskP = DiskPartitionList; thr_left && diskP; diskP = diskP->next) {
+	    id = diskP->device;
+	    params->part_thread_target[id]++;
+	    thr_left--;
+	}
+    }
+}
+
+/* worker thread for parallel shutdown; args is the shared vshutdown_thread_t. always returns NULL */
+static void *
+VShutdownThread(void * args)
+{
+    struct rx_queue *qp;
+    Volume * vp;
+    vshutdown_thread_t * params;
+    int part, code, found, pass, schedule_version_save, count;
+    struct DiskPartition *diskP;
+    struct diskpartition_queue_t * dpq;
+    Device id;
+
+    params = (vshutdown_thread_t *) args;
+
+    /* acquire the shutdown pass 0 lock */
+    assert(pthread_mutex_lock(&params->lock) == 0);
+
+    /* if there's still pass 0 work to be done, get a work entry, and do a pass 0 shutdown.
+     * NOTE(review): the walk below runs after dropping params->lock and without this thread taking VOL_LOCK -- confirm intended */
+    if (queue_IsNotEmpty(params)) {
+	dpq = queue_First(params, diskpartition_queue_t);
+	queue_Remove(dpq);
+	assert(pthread_mutex_unlock(&params->lock) == 0);
+	diskP = dpq->diskP;
+	free(dpq);
+	id = diskP->device;
+
+	count = 0;
+	while (ShutdownVolumeWalk_r(diskP, 0, &params->part_pass_head[id]))
+	    count++;
+	params->stats[0][diskP->device] = count;
+	assert(pthread_mutex_lock(&params->lock) == 0);
+    }
+
+    params->n_threads_complete++;
+    if (params->n_threads_complete == params->n_threads) {
+      /* notify control thread that all workers have completed pass 0 */
+      assert(pthread_cond_signal(&params->master_cv) == 0);
+    }
+    while (params->pass == 0) {
+      assert(pthread_cond_wait(&params->cv, &params->lock) == 0);
+    }
+
+    /* switch locks: release params->lock, then take the glock (VOL_LOCK) */
+    assert(pthread_mutex_unlock(&params->lock) == 0);
+    VOL_LOCK;
+
+    pass = params->pass;
+    assert(pass > 0);
+
+    /* now escalate through the more complicated shutdowns */
+    while (pass <= 3) {
+	schedule_version_save = params->schedule_version;
+	found = 0;
+	/* find a disk partition to work on (claims one unit of its thread target) */
+	for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+	    id = diskP->device;
+	    if (params->part_thread_target[id] && !params->part_done_pass[id]) {
+		params->part_thread_target[id]--;
+		found = 1;
+		break;
+	    }
+	}
+	
+	if (!found) {
+	    /* hmm. for some reason the controller thread couldn't find anything for 
+	     * us to do. let's see if there's anything we can do */
+	    for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+		id = diskP->device;
+		if (diskP->vol_list.len && !params->part_done_pass[id]) {
+		    found = 1;
+		    break;
+		} else if (!params->part_done_pass[id]) {
+		    params->part_done_pass[id] = 1;
+		    params->n_parts_done_pass++;
+		    if (pass == 3) {
+			Log("VShutdown:  done shutting down volumes on partition %s.\n",
+			    VPartitionPath(diskP));
+		    }
+		}
+	    }
+	}
+	
+	/* do work on this partition until either the controller
+	 * creates a new schedule, or we run out of things to do
+	 * on this partition */
+	if (found) {
+	    count = 0;
+	    while (!params->part_done_pass[id] &&
+		   (schedule_version_save == params->schedule_version)) {
+		/* ShutdownVolumeWalk_r will drop the glock internally */
+		if (!ShutdownVolumeWalk_r(diskP, pass, &params->part_pass_head[id])) {
+		    if (!params->part_done_pass[id]) {
+			params->part_done_pass[id] = 1;
+			params->n_parts_done_pass++;
+			if (pass == 3) {
+			    Log("VShutdown:  done shutting down volumes on partition %s.\n",
+				VPartitionPath(diskP));
+			}
+		    }
+		    break;
+		}
+		count++;
+	    }
+
+	    params->stats[pass][id] += count;
+	} else {
+	    /* ok, everyone is done this pass, proceed */
+
+	    /* barrier: wait on params->cv until the last worker to arrive advances params->pass */
+	    params->n_threads_complete++;
+	    while (params->pass == pass) {
+		if (params->n_threads_complete == params->n_threads) {
+		    /* we are the last thread to complete, so we will
+		     * reinitialize worker pool state for the next pass */
+		    params->n_threads_complete = 0;
+		    params->n_parts_done_pass = 0;
+		    params->pass++;
+		    for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+			id = diskP->device;
+			params->part_done_pass[id] = 0;
+			params->part_pass_head[id] = queue_First(&diskP->vol_list, rx_queue);
+		    }
+
+		    /* compute a new thread schedule before releasing all the workers */
+		    ShutdownCreateSchedule(params);
+
+		    /* wake up all the workers */
+		    assert(pthread_cond_broadcast(&params->cv) == 0);
+
+		    VOL_UNLOCK;
+		    Log("VShutdown:  pass %d completed using %d threads on %d partitions\n",
+			pass, params->n_threads, params->n_parts);
+		    VOL_LOCK;
+		} else {
+		    assert(pthread_cond_wait(&params->cv, &vol_glock_mutex) == 0);
+		}
+	    }
+	    pass = params->pass;
+	}
+	
+	/* for fairness */
+	VOL_UNLOCK;
+	pthread_yield();
+	VOL_LOCK;
+    }
+
+    VOL_UNLOCK;
+
+    return NULL;
+}
+
+/* shut down all volumes on a given disk partition
+ * (runs passes 0-3 in order under exclusive VByPList access; returns 0)
+ * note that this function will not allow mp-fast
+ * shutdown of a partition */
+int
+VShutdownByPartition_r(struct DiskPartition * dp)
+{
+    int pass, retVal = 0;	/* nothing here can fail; was returned uninitialized */
+    int pass_stats[4];
+    int total;
+
+    /* wait for other exclusive ops to finish */
+    VVByPListWait_r(dp);
+
+    /* begin exclusive access */
+    VVByPListBeginExclusive_r(dp);
+
+    /* pick the low-hanging fruit first,
+     * then do the complicated ones last 
+     * (has the advantage of keeping
+     *  in-use volumes up until the bitter end) */
+    for (pass = 0, total=0; pass < 4; pass++) {
+	pass_stats[pass] = ShutdownVByPForPass_r(dp, pass);
+	total += pass_stats[pass];
+    }
+
+    /* end exclusive access */
+    VVByPListEndExclusive_r(dp);
+
+    Log("VShutdownByPartition:  shut down %d volumes on %s (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n",
+	total, VPartitionPath(dp), pass_stats[0], pass_stats[1], pass_stats[2], pass_stats[3]);
+
+    return retVal;
+}
+
+/* internal shutdown functionality
+ * returns the number of volumes shut down on dp during this pass
+ * for multi-pass shutdown:
+ * 0 to only "shutdown" {pre,un}attached and error state volumes
+ * 1 to also shutdown attached volumes w/ volume header loaded
+ * 2 to also shutdown attached volumes w/o volume header loaded
+ * 3 to also shutdown exclusive state volumes
+ *
+ * caller MUST hold exclusive access (callers here use VVByPListBeginExclusive_r)
+ * because we drop vol_glock_mutex internally
+ *
+ * this function is reentrant for passes 1--3
+ * (e.g. multiple threads can cooperate to
+ *  shutdown a partition mp-fast)
+ *
+ * pass 0 is not scalable because the volume state data is
+ * synchronized by vol_glock mutex, and the locking overhead
+ * is too high to drop the lock long enough to do linked list
+ * traversal
+ */
+static int
+ShutdownVByPForPass_r(struct DiskPartition * dp, int pass)
+{
+    struct rx_queue * q = queue_First(&dp->vol_list, rx_queue);
+    register int i = 0;
+
+    while (ShutdownVolumeWalk_r(dp, pass, &q))
+	i++;
+
+    return i;
+}
+
+/* conditionally shutdown one volume on partition dp, scanning from *idx
+ * returns 1 if a volume was shutdown in this pass (and sets *idx to the
+ * list entry that followed it), 0 otherwise */
+static int
+ShutdownVolumeWalk_r(struct DiskPartition * dp, int pass,
+		     struct rx_queue ** idx)
+{
+    struct rx_queue *qp, *nqp;
+    Volume * vp;
+
+    qp = *idx;
+
+    for (queue_ScanFrom(&dp->vol_list, qp, qp, nqp, rx_queue)) {
+	vp = (Volume *) (((char *)qp) - offsetof(Volume, vol_list));
+	
+	switch (pass) {
+	case 0:
+	    if ((V_attachState(vp) != VOL_STATE_UNATTACHED) &&
+		(V_attachState(vp) != VOL_STATE_ERROR) &&
+		(V_attachState(vp) != VOL_STATE_PREATTACHED)) {
+		break;		/* not eligible in this pass; move on to the next volume */
+	    }
+	case 1:			/* FALLTHROUGH from case 0 is intentional */
+	    if ((V_attachState(vp) == VOL_STATE_ATTACHED) &&
+		(vp->header == NULL)) {
+		break;
+	    }
+	case 2:			/* FALLTHROUGH from case 1 is intentional */
+	    if (IsExclusiveState(V_attachState(vp))) {
+		break;
+	    }
+	case 3:			/* FALLTHROUGH: volume passed every filter for this pass */
+	    *idx = nqp;
+	    DeleteVolumeFromVByPList_r(vp);
+	    VShutdownVolume_r(vp);
+	    vp = NULL;
+	    return 1;
+	}
+    }
+
+    return 0;
+}
+
+/*
+ * shutdown a specific volume; always returns 0
+ */
+/* caller MUST NOT hold a heavyweight ref on vp */
+int
+VShutdownVolume_r(Volume * vp)
+{
+    int code;
+
+    VCreateReservation_r(vp);
+
+    if (LogLevel >= 5) {
+	Log("VShutdownVolume_r:  vid=%u, device=%d, state=%hu\n",
+	    vp->hashid, vp->partition->device, V_attachState(vp));
+    }
+
+    /* wait for other blocking ops to finish */
+    VWaitExclusiveState_r(vp);
+
+    assert(IsValidState(V_attachState(vp)));
+    
+    switch(V_attachState(vp)) {
+    case VOL_STATE_SALVAGING:
+	/* make sure salvager knows we don't want
+	 * the volume back */
+	VCancelSalvage_r(vp, SALVSYNC_SHUTDOWN);	/* FALLTHROUGH */
+    case VOL_STATE_PREATTACHED:
+    case VOL_STATE_ERROR:
+	VChangeState_r(vp, VOL_STATE_UNATTACHED);	/* FALLTHROUGH */
+    case VOL_STATE_UNATTACHED:
+	break;
+    case VOL_STATE_GOING_OFFLINE:
+    case VOL_STATE_SHUTTING_DOWN:
+    case VOL_STATE_ATTACHED:
+	code = VHold_r(vp);
+	if (!code) {
+	    if (LogLevel >= 5)
+		Log("VShutdown:  Attempting to take volume %u offline.\n",
+		    vp->hashid);
+
+	    /* take the volume offline (drops reference count) */
+	    VOffline_r(vp, "File server was shut down");
+	}
+	break;
+    }
+    
+    VCancelReservation_r(vp);
+    vp = NULL;
+    return 0;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* Header I/O routines                             */
+/***************************************************/
+
+/* open a descriptor for the inode (h),
+ * read in an on-disk structure into buffer (to) of size (size),
+ * verify versionstamp in structure has magic (magic) and
+ * optionally verify version (version) if (version) is nonzero
+ */
 static void
 ReadHeader(Error * ec, IHandle_t * h, char *to, int size, bit32 magic,
 	   bit32 version)
@@ -574,10 +1434,39 @@ ReadHeader(Error * ec, IHandle_t * h, char *to, int size, bit32 magic,
     }
 }
 
+void
+WriteVolumeHeader_r(Error * ec, Volume * vp)
+{
+    IHandle_t *h = V_diskDataHandle(vp);
+    FdHandle_t *fdP;
+    /* write V_disk(vp) at offset 0 of the volume's disk data handle; *ec is 0 on success, VSALVAGE on any failure */
+    *ec = 0;
+
+    fdP = IH_OPEN(h);
+    if (fdP == NULL) {
+	*ec = VSALVAGE;
+	return;
+    }
+    if (FDH_SEEK(fdP, 0, SEEK_SET) < 0) {
+	*ec = VSALVAGE;
+	FDH_REALLYCLOSE(fdP);
+	return;
+    }
+    if (FDH_WRITE(fdP, (char *)&V_disk(vp), sizeof(V_disk(vp)))
+	!= sizeof(V_disk(vp))) {	/* a short write is treated as a failure */
+	*ec = VSALVAGE;
+	FDH_REALLYCLOSE(fdP);
+	return;
+    }
+    FDH_CLOSE(fdP);
+}
+
 /* VolumeHeaderToDisk
  * Allows for storing 64 bit inode numbers in on-disk volume header
  * file.
  */
+/* convert in-memory representation of a volume header to the
+ * on-disk representation of a volume header */
 void
 VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h)
 {
@@ -607,8 +1496,10 @@ VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h)
 }
 
 /* DiskToVolumeHeader
- * Reads volume header file from disk, convering 64 bit inodes
- * if required. Makes the assumption that AFS has *always* 
+ * Converts an on-disk representation of a volume header to
+ * the in-memory representation of a volume header.
+ *
+ * Makes the assumption that AFS has *always* 
  * zero'd the volume header file so that high parts of inode
  * numbers are 0 in older (SGI EFS) volume header files.
  */
@@ -642,34 +1533,137 @@ DiskToVolumeHeader(VolumeHeader_t * h, VolumeDiskHeader_t * dh)
 }
 
 
-void
-WriteVolumeHeader_r(ec, vp)
-     Error *ec;
-     Volume *vp;
+/***************************************************/
+/* Volume Attachment routines                      */
+/***************************************************/
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* pre-attach a volume given its path 
+ *
+ * a pre-attached volume will only have its partition
+ * and hashid fields initialized
+ *
+ * at first call to VGetVolume, the volume will be
+ * fully attached
+ */
+Volume *
+VPreAttachVolumeByName(Error * ec, char *partition, char *name, int mode)
 {
-    IHandle_t *h = V_diskDataHandle(vp);
-    FdHandle_t *fdP;
+    Volume * vp;
+    VOL_LOCK;
+    vp = VPreAttachVolumeByName_r(ec, partition, name, mode);
+    VOL_UNLOCK;
+    return vp;
+}
+
+/* variant called with the glock held (see VPreAttachVolumeByName): looks up the
+ * partition and any existing hash entry for the volume id parsed from name, then
+ * hands off to VPreAttachVolumeById_r.  fileserver only (asserted).  returns NULL
+ * with *ec set on error (VNOVOL if the partition is unknown).  mode is unused here. */
+Volume *
+VPreAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
+{
+    register Volume *vp = NULL;
+    struct DiskPartition *partp;
+    VolId volumeId;
+    *ec = 0;
+
+    assert(programType == fileServer);
+
+    if (!(partp = VGetPartition_r(partition, 0))) {
+	*ec = VNOVOL;
+	Log("VPreAttachVolume:  Error getting partition (%s)\n", partition);
+	return NULL;
+    }
+
+    volumeId = VolumeNumber(name);
+
+    vp = VLookupVolume_r(ec, volumeId, NULL);
+    if (*ec) {
+	return NULL;
+    }
+
+    return VPreAttachVolumeById_r(ec, partp, vp, volumeId);
+}
+
+/* pre-attach a volume given its partition and volume id
+ *
+ * if vp == NULL, then a new vp is created
+ * if vp != NULL, then we assume it is already on the hash chain
+ */
+Volume * 
+VPreAttachVolumeById_r(Error * ec, struct DiskPartition * partp, 
+		       Volume * vp, int vid)
+{
+    Volume *nvp = NULL;
 
     *ec = 0;
 
-    fdP = IH_OPEN(h);
-    if (fdP == NULL) {
-	*ec = VSALVAGE;
-	return;
+    /* check to see if pre-attach already happened */
+    if (vp && 
+	(V_attachState(vp) != VOL_STATE_UNATTACHED) && 
+	!IsErrorState(V_attachState(vp))) {
+	goto done;
+    } else if (vp) {
+	/* we're re-attaching a volume; clear out some old state */
+	memset(&vp->salvage, 0, sizeof(struct VolumeOnlineSalvage));
+    } else {
+	/* if we need to allocate a new Volume struct,
+	 * go ahead and drop the vol glock, otherwise
+	 * do the basic setup synchronised, as it's
+	 * probably not worth dropping the lock */
+	VOL_UNLOCK;
+
+	/* allocate the volume structure */
+	vp = nvp = (Volume *) malloc(sizeof(Volume));
+	assert(vp != NULL);
+	memset(vp, 0, sizeof(Volume));
+	assert(pthread_cond_init(&V_attachCV(vp), NULL) == 0);
     }
-    if (FDH_SEEK(fdP, 0, SEEK_SET) < 0) {
-	*ec = VSALVAGE;
-	FDH_REALLYCLOSE(fdP);
-	return;
+
+    /* link the volume with its associated vice partition */
+    vp->device = partp->device;
+    vp->partition = partp;
+    vp->hashid = vid;
+
+    /* if we dropped the lock, reacquire the lock,
+     * check for pre-attach races, and then add
+     * the volume to the hash table */
+    if (nvp) {
+	VOL_LOCK;
+	nvp = VLookupVolume_r(ec, vid, NULL);
+	if (*ec) {
+	    free(vp);
+	    vp = NULL;
+	    goto done;
+	} else if (nvp) { /* race detected */
+	    free(vp);
+	    vp = nvp;
+	    goto done;
+	} else {
+	  /* hack to make up for VChangeState_r() decrementing 
+	   * the old state counter */
+	  VStats.state_levels[0]++;
+	}
     }
-    if (FDH_WRITE(fdP, (char *)&V_disk(vp), sizeof(V_disk(vp)))
-	!= sizeof(V_disk(vp))) {
-	*ec = VSALVAGE;
-	FDH_REALLYCLOSE(fdP);
-	return;
-    }
-    FDH_CLOSE(fdP);
+
+    /* put pre-attached volume onto the hash table
+     * and bring it up to the pre-attached state */
+    AddVolumeToHashTable(vp, vp->hashid);
+    AddVolumeToVByPList_r(vp);
+    VLRU_Init_Node_r(vp);
+    VChangeState_r(vp, VOL_STATE_PREATTACHED);
+
+    if (LogLevel >= 5)
+	Log("VPreAttachVolumeById_r:  volume %u pre-attached\n", vp->hashid);
+
+  done:
+    if (*ec)
+	return NULL;
+    else
+	return vp;
 }
+#endif /* AFS_DEMAND_ATTACH_FS */
 
 /* Attach an existing volume, given its pathname, and return a
    pointer to the volume header information.  The volume also
@@ -679,18 +1673,16 @@ Volume *
 VAttachVolumeByName(Error * ec, char *partition, char *name, int mode)
 {
     Volume *retVal;
-    VATTACH_LOCK;
     VOL_LOCK;
     retVal = VAttachVolumeByName_r(ec, partition, name, mode);
     VOL_UNLOCK;
-    VATTACH_UNLOCK;
     return retVal;
 }
 
 Volume *
 VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
 {
-    register Volume *vp;
+    register Volume *vp = NULL, *svp = NULL;
     int fd, n;
     struct afs_stat status;
     struct VolumeDiskHeader diskHeader;
@@ -698,13 +1690,132 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
     struct DiskPartition *partp;
     char path[64];
     int isbusy = 0;
+    VolId volumeId;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolumeStats stats_save;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
     *ec = 0;
+   
+    volumeId = VolumeNumber(name);
+
+    if (!(partp = VGetPartition_r(partition, 0))) {
+	*ec = VNOVOL;
+	Log("VAttachVolume: Error getting partition (%s)\n", partition);
+	goto done;
+    }
+
     if (programType == volumeUtility) {
 	assert(VInit == 3);
 	VLockPartition_r(partition);
-    }
-    if (programType == fileServer) {
-	vp = VGetVolume_r(ec, VolumeNumber(name));
+    } else if (programType == fileServer) {
+#ifdef AFS_DEMAND_ATTACH_FS
+	/* lookup the volume in the hash table */
+	vp = VLookupVolume_r(ec, volumeId, NULL);
+	if (*ec) {
+	    return NULL;
+	}
+
+	if (vp) {
+	    /* save any counters that are supposed to
+	     * be monotonically increasing over the
+	     * lifetime of the fileserver */
+	    memcpy(&stats_save, &vp->stats, sizeof(VolumeStats));
+	} else {
+	    memset(&stats_save, 0, sizeof(VolumeStats));
+	}
+
+	/* if there's something in the hash table, and it's not
+	 * in the pre-attach state, then we may need to detach
+	 * it before proceeding */
+	if (vp && (V_attachState(vp) != VOL_STATE_PREATTACHED)) {
+	    VCreateReservation_r(vp);
+	    VWaitExclusiveState_r(vp);
+
+	    /* at this point state must be one of:
+	     *   UNATTACHED,
+	     *   ATTACHED,
+	     *   SHUTTING_DOWN,
+	     *   GOING_OFFLINE,
+	     *   SALVAGING,
+	     *   ERROR
+	     */
+
+	    if (vp->specialStatus == VBUSY)
+		isbusy = 1;
+	    
+	    /* if it's already attached, see if we can return it */
+	    if (V_attachState(vp) == VOL_STATE_ATTACHED) {
+		VGetVolumeByVp_r(ec, vp);
+		if (V_inUse(vp)) {
+		    VCancelReservation_r(vp);
+		    return vp;
+		}
+
+		/* otherwise, we need to detach, and attempt to re-attach */
+		VDetachVolume_r(ec, vp);
+		if (*ec) {
+		    Log("VAttachVolume: Error detaching old volume instance (%s)\n", name);
+		}
+	    } else {
+		/* if it isn't fully attached, delete from the hash tables,
+		   and let the refcounter handle the rest */
+		DeleteVolumeFromHashTable(vp);
+		DeleteVolumeFromVByPList_r(vp);
+	    }
+
+	    VCancelReservation_r(vp);
+	    vp = NULL;
+	}
+
+	/* pre-attach volume if it hasn't been done yet */
+	if (!vp || 
+	    (V_attachState(vp) == VOL_STATE_UNATTACHED) ||
+	    (V_attachState(vp) == VOL_STATE_ERROR)) {
+	    svp = vp;
+	    vp = VPreAttachVolumeById_r(ec, partp, vp, volumeId);
+	    if (*ec) {
+		return NULL;
+	    }
+	}
+
+	assert(vp != NULL);
+
+	/* handle pre-attach races 
+	 *
+	 * multiple threads can race to pre-attach a volume,
+	 * but we can't let them race beyond that
+	 * 
+	 * our solution is to let the first thread to bring
+	 * the volume into an exclusive state win; the other
+	 * threads just wait until it finishes bringing the
+	 * volume online, and then they do a VGetVolumeByVp_r
+	 */
+	if (svp && (svp != vp)) {
+	    /* wait for other exclusive ops to finish */
+	    VCreateReservation_r(vp);
+	    VWaitExclusiveState_r(vp);
+
+	    /* get a heavyweight ref, kill the lightweight ref, and return */
+	    VGetVolumeByVp_r(ec, vp);
+	    VCancelReservation_r(vp);
+	    return vp;
+	}
+
+	/* at this point, we are chosen as the thread to do
+	 * demand attachment for this volume. all other threads
+	 * doing a getvolume on vp->hashid will block until we finish */
+
+	/* make sure any old header cache entries are invalidated
+	 * before proceeding */
+	FreeVolumeHeader(vp);
+
+	VChangeState_r(vp, VOL_STATE_ATTACHING);
+
+	/* restore any saved counters */
+	memcpy(&vp->stats, &stats_save, sizeof(VolumeStats));
+#else /* AFS_DEMAND_ATTACH_FS */
+	vp = VGetVolume_r(ec, volumeId);
 	if (vp) {
 	    if (V_inUse(vp))
 		return vp;
@@ -714,55 +1825,80 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
 	    if (*ec) {
 		Log("VAttachVolume: Error detaching volume (%s)\n", name);
 	    }
+	    vp = NULL;
 	}
-    }
-
-    if (!(partp = VGetPartition_r(partition, 0))) {
-	*ec = VNOVOL;
-	Log("VAttachVolume: Error getting partition (%s)\n", partition);
-	goto done;
+#endif /* AFS_DEMAND_ATTACH_FS */
     }
 
     *ec = 0;
     strcpy(path, VPartitionPath(partp));
+
+    VOL_UNLOCK;
+
     strcat(path, "/");
     strcat(path, name);
-    VOL_UNLOCK;
     if ((fd = afs_open(path, O_RDONLY)) == -1 || afs_fstat(fd, &status) == -1) {
 	Log("VAttachVolume: Failed to open %s (errno %d)\n", path, errno);
 	if (fd > -1)
 	    close(fd);
-	VOL_LOCK;
 	*ec = VNOVOL;
+	VOL_LOCK;
 	goto done;
     }
     n = read(fd, &diskHeader, sizeof(diskHeader));
     close(fd);
-    VOL_LOCK;
     if (n != sizeof(diskHeader)
 	|| diskHeader.stamp.magic != VOLUMEHEADERMAGIC) {
 	Log("VAttachVolume: Error reading volume header %s\n", path);
 	*ec = VSALVAGE;
+	VOL_LOCK;
 	goto done;
     }
     if (diskHeader.stamp.version != VOLUMEHEADERVERSION) {
 	Log("VAttachVolume: Volume %s, version number is incorrect; volume needs salvaged\n", path);
 	*ec = VSALVAGE;
+	VOL_LOCK;
 	goto done;
     }
 
     DiskToVolumeHeader(&iheader, &diskHeader);
+#ifdef FSSYNC_BUILD_CLIENT
     if (programType == volumeUtility && mode != V_SECRETLY && mode != V_PEEK) {
-	if (FSYNC_askfs(iheader.id, partition, FSYNC_NEEDVOLUME, mode)
-	    == FSYNC_DENIED) {
+        VOL_LOCK;
+	if (FSYNC_VolOp(iheader.id, partition, FSYNC_VOL_NEEDVOLUME, mode, NULL)
+	    != SYNC_OK) {
 	    Log("VAttachVolume: attach of volume %u apparently denied by file server\n", iheader.id);
 	    *ec = VNOVOL;	/* XXXX */
 	    goto done;
 	}
+	VOL_UNLOCK;
+    }
+#endif
+
+    if (!vp) {
+      vp = (Volume *) calloc(1, sizeof(Volume));
+      assert(vp != NULL);
+      vp->device = partp->device;
+      vp->partition = partp;
+#ifdef AFS_DEMAND_ATTACH_FS
+      assert(pthread_cond_init(&V_attachCV(vp), NULL) == 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
     }
 
-    vp = attach2(ec, path, &iheader, partp, isbusy);
+    /* attach2 is entered without any locks, and returns
+     * with vol_glock_mutex held */
+    vp = attach2(ec, volumeId, path, &iheader, partp, vp, isbusy, mode);
+
     if (programType == volumeUtility && vp) {
+#ifdef AFS_DEMAND_ATTACH_FS
+	/* for dafs, we should tell the fileserver, except for V_PEEK
+         * where we know it is not necessary */
+	if (mode == V_PEEK) {
+	    vp->needsPutBack = 0;
+	} else {
+	    vp->needsPutBack = 1;
+	}
+#else /* !AFS_DEMAND_ATTACH_FS */
 	/* duplicate computation in fssync.c about whether the server
 	 * takes the volume offline or not.  If the volume isn't
 	 * offline, we must not return it when we detach the volume,
@@ -772,6 +1908,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
 	    vp->needsPutBack = 0;
 	else
 	    vp->needsPutBack = 1;
+#endif /* !AFS_DEMAND_ATTACH_FS */
     }
     /* OK, there's a problem here, but one that I don't know how to
      * fix right now, and that I don't think should arise often.
@@ -784,10 +1921,13 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
      * for all of that to happen, but if it does, probably the right
      * fix is for the server to allow the return of readonly volumes
      * that it doesn't think are really checked out. */
+#ifdef FSSYNC_BUILD_CLIENT
     if (programType == volumeUtility && vp == NULL &&
 	mode != V_SECRETLY && mode != V_PEEK) {
-	FSYNC_askfs(iheader.id, partition, FSYNC_ON, 0);
-    } else if (programType == fileServer && vp) {
+	FSYNC_VolOp(iheader.id, partition, FSYNC_VOL_ON, 0, NULL);
+    } else 
+#endif
+    if (programType == fileServer && vp) {
 	V_needsCallback(vp) = 0;
 #ifdef	notdef
 	if (VInit >= 2 && V_BreakVolumeCallbacks) {
@@ -795,7 +1935,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
 	    (*V_BreakVolumeCallbacks) (V_id(vp));
 	}
 #endif
-	VUpdateVolume_r(ec, vp);
+	VUpdateVolume_r(ec, vp, 0);
 	if (*ec) {
 	    Log("VAttachVolume: Error updating volume\n");
 	    if (vp)
@@ -803,7 +1943,8 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
 	    goto done;
 	}
 	if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) {
-	    /* This is a hack: by temporarily settint the incore
+#ifndef AFS_DEMAND_ATTACH_FS
+	    /* This is a hack: by temporarily setting the incore
 	     * dontSalvage flag ON, the volume will be put back on the
 	     * Update list (with dontSalvage OFF again).  It will then
 	     * come back in N minutes with DONT_SALVAGE eventually
@@ -812,6 +1953,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
 	     * offline without DONT SALVAGE having been set also
 	     * eventually get it set */
 	    V_dontSalvage(vp) = DONT_SALVAGE;
+#endif /* !AFS_DEMAND_ATTACH_FS */
 	    VAddToVolumeUpdateList_r(ec, vp);
 	    if (*ec) {
 		Log("VAttachVolume: Error adding volume to update list\n");
@@ -828,25 +1970,196 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
     if (programType == volumeUtility) {
 	VUnlockPartition_r(partition);
     }
-    if (*ec)
+    if (*ec) {
+#ifdef AFS_DEMAND_ATTACH_FS
+	if (vp) {
+	    V_attachState(vp) = VOL_STATE_ERROR;
+	    assert(pthread_cond_broadcast(&V_attachCV(vp)) == 0);
+	}
+#endif /* AFS_DEMAND_ATTACH_FS */
 	return NULL;
-    else
+    } else {
 	return vp;
+    }
 }
 
-private Volume *
-attach2(Error * ec, char *path, register struct VolumeHeader * header,
-	struct DiskPartition * partp, int isbusy)
+#ifdef AFS_DEMAND_ATTACH_FS
+/* VAttachVolumeByVp_r
+ *
+ * finish attaching a volume that is
+ * in a less than fully attached state
+ */
+/* caller MUST hold a ref count on vp */
+static Volume *
+VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
 {
-    register Volume *vp;
+    char name[VMAXPATHLEN];
+    int fd, n, reserve = 0;
+    struct afs_stat status;
+    struct VolumeDiskHeader diskHeader;
+    struct VolumeHeader iheader;
+    struct DiskPartition *partp;
+    char path[64];
+    int isbusy = 0;
+    VolId volumeId;
+    Volume * nvp;
+    VolumeStats stats_save;
+    *ec = 0;
+
+    /* volume utility should never call VAttachVolumeByVp_r */
+    assert(programType == fileServer);
+   
+    volumeId = vp->hashid;
+    partp = vp->partition;
+    VolumeExternalName_r(volumeId, name, sizeof(name));
+
+
+    /* if another thread is performing a blocking op, wait */
+    VWaitExclusiveState_r(vp);
+
+    memcpy(&stats_save, &vp->stats, sizeof(VolumeStats));
+
+    /* if it's already attached, see if we can return it */
+    if (V_attachState(vp) == VOL_STATE_ATTACHED) {
+	VGetVolumeByVp_r(ec, vp);
+	if (V_inUse(vp)) {
+	    return vp;
+	} else {
+	    if (vp->specialStatus == VBUSY)
+		isbusy = 1;
+	    VDetachVolume_r(ec, vp);
+	    if (*ec) {
+		Log("VAttachVolume: Error detaching volume (%s)\n", name);
+	    }
+	    vp = NULL;
+	}
+    }
+
+    /* pre-attach volume if it hasn't been done yet */
+    if (!vp || 
+	(V_attachState(vp) == VOL_STATE_UNATTACHED) ||
+	(V_attachState(vp) == VOL_STATE_ERROR)) {
+	nvp = VPreAttachVolumeById_r(ec, partp, vp, volumeId);
+	if (*ec) {
+	    return NULL;
+	}
+	if (nvp != vp) {
+	    reserve = 1;
+	    VCreateReservation_r(nvp);
+	    vp = nvp;
+	}
+    }
+    
+    assert(vp != NULL);
+    VChangeState_r(vp, VOL_STATE_ATTACHING);
+
+    /* restore monotonically increasing stats */
+    memcpy(&vp->stats, &stats_save, sizeof(VolumeStats));
+
+    *ec = 0;
+
+
+    /* compute path to disk header, 
+     * read in header, 
+     * and verify magic and version stamps */
+    strcpy(path, VPartitionPath(partp));
 
     VOL_UNLOCK;
 
-    vp = (Volume *) calloc(1, sizeof(Volume));
-    assert(vp != NULL);
+    strcat(path, "/");
+    strcat(path, name);
+    if ((fd = afs_open(path, O_RDONLY)) == -1 || afs_fstat(fd, &status) == -1) {
+	Log("VAttachVolume: Failed to open %s (errno %d)\n", path, errno);
+	if (fd > -1)
+	    close(fd);
+	*ec = VNOVOL;
+	VOL_LOCK;
+	goto done;
+    }
+    n = read(fd, &diskHeader, sizeof(diskHeader));
+    close(fd);
+    if (n != sizeof(diskHeader)
+	|| diskHeader.stamp.magic != VOLUMEHEADERMAGIC) {
+	Log("VAttachVolume: Error reading volume header %s\n", path);
+	*ec = VSALVAGE;
+	VOL_LOCK;
+	goto done;
+    }
+    if (diskHeader.stamp.version != VOLUMEHEADERVERSION) {
+	Log("VAttachVolume: Volume %s, version number is incorrect; volume needs salvaged\n", path);
+	*ec = VSALVAGE;
+	VOL_LOCK;
+	goto done;
+    }
+
+    /* convert on-disk header format to in-memory header format */
+    DiskToVolumeHeader(&iheader, &diskHeader);
+
+    /* do volume attach
+     *
+     * NOTE: attach2 is entered without any locks, and returns
+     * with vol_glock_mutex held */
+    vp = attach2(ec, volumeId, path, &iheader, partp, vp, isbusy, mode);
+
+    if (*ec || vp == NULL) {
+	goto done;
+    }
+
+    V_needsCallback(vp) = 0;
+    VUpdateVolume_r(ec, vp, 0);
+    if (*ec) {
+	Log("VAttachVolume: Error updating volume %u\n", vp->hashid);
+	VPutVolume_r(vp);
+	goto done;
+    }
+    if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) {
+#ifndef AFS_DEMAND_ATTACH_FS
+	/* This is a hack: by temporarily setting the incore
+	 * dontSalvage flag ON, the volume will be put back on the
+	 * Update list (with dontSalvage OFF again).  It will then
+	 * come back in N minutes with DONT_SALVAGE eventually
+	 * set.  This is the way that volumes that have never had
+	 * it set get it set; or that volumes that have been
+	 * offline without DONT SALVAGE having been set also
+	 * eventually get it set */
+	V_dontSalvage(vp) = DONT_SALVAGE;
+#endif /* !AFS_DEMAND_ATTACH_FS */
+	VAddToVolumeUpdateList_r(ec, vp);
+	if (*ec) {
+	    Log("VAttachVolume: Error adding volume %u to update list\n", vp->hashid);
+	    if (vp)
+		VPutVolume_r(vp);
+	    goto done;
+	}
+    }
+    if (LogLevel)
+	Log("VOnline:  volume %u (%s) attached and online\n", V_id(vp),
+	    V_name(vp));
+  done:
+    if (reserve) {
+	VCancelReservation_r(nvp);
+	reserve = 0;
+    }
+    if (*ec && (*ec != VOFFLINE) && (*ec != VSALVAGE)) {
+	if (vp && !IsErrorState(V_attachState(vp))) {
+	    VChangeState_r(vp, VOL_STATE_ERROR);
+	}
+	return NULL;
+    } else {
+	return vp;
+    }
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/*
+ * called without any locks held
+ * returns with vol_glock_mutex held
+ */
+private Volume * 
+attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * header,
+	struct DiskPartition * partp, register Volume * vp, int isbusy, int mode)
+{
     vp->specialStatus = (byte) (isbusy ? VBUSY : 0);
-    vp->device = partp->device;
-    vp->partition = partp;
     IH_INIT(vp->vnodeIndex[vLarge].handle, partp->device, header->parent,
 	    header->largeVnodeIndex);
     IH_INIT(vp->vnodeIndex[vSmall].handle, partp->device, header->parent,
@@ -857,8 +2170,15 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header,
     vp->shuttingDown = 0;
     vp->goingOffline = 0;
     vp->nUsers = 1;
+#ifdef AFS_DEMAND_ATTACH_FS
+    vp->stats.last_attach = FT_ApproxTime();
+    vp->stats.attaches++;
+#endif
 
     VOL_LOCK;
+#ifdef AFS_DEMAND_ATTACH_FS
+    IncUInt64(&VStats.attaches);
+#endif
     vp->cacheCheck = ++VolumeCacheCheck;
     /* just in case this ever rolls over */
     if (!vp->cacheCheck)
@@ -866,13 +2186,74 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header,
     GetVolumeHeader(vp);
     VOL_UNLOCK;
 
+#if defined(AFS_DEMAND_ATTACH_FS) && defined(FSSYNC_BUILD_CLIENT)
+    /* demand attach changes the V_PEEK mechanism
+     *
+     * we can now suck the current disk data structure over
+     * the fssync interface without going to disk
+     *
+     * (technically, we don't need to restrict this feature
+     *  to demand attach fileservers.  However, I'm trying
+     *  to limit the number of common code changes)
+     */
+    if (programType != fileServer && mode == V_PEEK) {
+	SYNC_response res;
+	res.payload.len = sizeof(VolumeDiskData);
+	res.payload.buf = &vp->header->diskstuff;
+
+	if (FSYNC_VolOp(volumeId,
+			VPartitionPath(partp),
+			FSYNC_VOL_QUERY_HDR,
+			FSYNC_WHATEVER,
+			&res) == SYNC_OK) {
+	    goto disk_header_loaded;
+	}
+    }
+#endif /* AFS_DEMAND_ATTACH_FS && FSSYNC_BUILD_CLIENT */
     (void)ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
 		     sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION);
 
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* update stats */
     VOL_LOCK;
+    IncUInt64(&VStats.hdr_loads);
+    IncUInt64(&vp->stats.hdr_loads);
+    VOL_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+    
     if (*ec) {
 	Log("VAttachVolume: Error reading diskDataHandle vol header %s; error=%u\n", path, *ec);
     }
+
+ disk_header_loaded:
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (!*ec) {
+
+	/* check for pending volume operations */
+	if (vp->pending_vol_op) {
+	    /* see if the pending volume op requires exclusive access */
+	    if (!VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) {
+		/* mark the volume down */
+		*ec = VOFFLINE;
+		VChangeState_r(vp, VOL_STATE_UNATTACHED);
+		if (V_offlineMessage(vp)[0] == '\0')
+		    strlcpy(V_offlineMessage(vp),
+			    "A volume utility is running.", 
+			    sizeof(V_offlineMessage(vp)));
+		V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0';
+
+		/* check to see if we should set the specialStatus flag */
+		if (VVolOpSetVBusy_r(vp, vp->pending_vol_op)) {
+		    vp->specialStatus = VBUSY;
+		}
+	    }
+	}
+
+	V_attachFlags(vp) |= VOL_HDR_LOADED;
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
     if (!*ec) {
 	struct IndexFileHeader iHead;
 
@@ -887,65 +2268,117 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header,
 	    V_stat_initialized(vp) = 1;
 	}
 #endif /* OPENAFS_VOL_STATS */
-	VOL_UNLOCK;
+
 	(void)ReadHeader(ec, vp->vnodeIndex[vSmall].handle,
 			 (char *)&iHead, sizeof(iHead),
 			 SMALLINDEXMAGIC, SMALLINDEXVERSION);
-	VOL_LOCK;
+
 	if (*ec) {
 	    Log("VAttachVolume: Error reading smallVnode vol header %s; error=%u\n", path, *ec);
 	}
     }
+
     if (!*ec) {
 	struct IndexFileHeader iHead;
-	VOL_UNLOCK;
+
 	(void)ReadHeader(ec, vp->vnodeIndex[vLarge].handle,
 			 (char *)&iHead, sizeof(iHead),
 			 LARGEINDEXMAGIC, LARGEINDEXVERSION);
-	VOL_LOCK;
+
 	if (*ec) {
 	    Log("VAttachVolume: Error reading largeVnode vol header %s; error=%u\n", path, *ec);
 	}
     }
+
 #ifdef AFS_NAMEI_ENV
     if (!*ec) {
 	struct versionStamp stamp;
-	VOL_UNLOCK;
+
 	(void)ReadHeader(ec, V_linkHandle(vp), (char *)&stamp,
 			 sizeof(stamp), LINKTABLEMAGIC, LINKTABLEVERSION);
-	VOL_LOCK;
+
 	if (*ec) {
 	    Log("VAttachVolume: Error reading namei vol header %s; error=%u\n", path, *ec);
 	}
     }
-#endif
+#endif /* AFS_NAMEI_ENV */
+
+#if defined(AFS_DEMAND_ATTACH_FS)
+    if (*ec && ((*ec != VOFFLINE) || (V_attachState(vp) != VOL_STATE_UNATTACHED))) {
+        VOL_LOCK;
+	if (programType == fileServer) {
+	    VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+	    vp->nUsers = 0;
+	    *ec = VSALVAGING;
+	} else {
+	    Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec);
+	    FreeVolume(vp);
+	    *ec = VSALVAGE;
+	}
+	return NULL;
+    } else if (*ec) {
+	/* volume operation in progress */
+	VOL_LOCK;
+	return NULL;
+    }
+#else /* AFS_DEMAND_ATTACH_FS */
     if (*ec) {
 	Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec);
+        VOL_LOCK;
 	FreeVolume(vp);
 	return NULL;
     }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
     if (V_needsSalvaged(vp)) {
 	if (vp->specialStatus)
 	    vp->specialStatus = 0;
-	Log("VAttachVolume: volume salvage flag is ON for %s; volume needs salvage\n", path);
-	*ec = VSALVAGE;
+        VOL_LOCK;
+#if defined(AFS_DEMAND_ATTACH_FS)
+	if (programType == fileServer) {
+	    VRequestSalvage_r(vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER);
+	    vp->nUsers = 0;
+	    *ec = VSALVAGING;
+	} else {
+	    Log("VAttachVolume: volume salvage flag is ON for %s; volume needs salvage\n", path);
+	    FreeVolume(vp);
+	    *ec = VSALVAGE;
+	}
+#else /* AFS_DEMAND_ATTACH_FS */
 	FreeVolume(vp);
+	*ec = VSALVAGE;
+#endif /* AFS_DEMAND_ATTACH_FS */
 	return NULL;
     }
+
+    VOL_LOCK;
     if (programType == fileServer) {
 #ifndef FAST_RESTART
 	if (V_inUse(vp) && VolumeWriteable(vp)) {
 	    if (!V_needsSalvaged(vp)) {
 		V_needsSalvaged(vp) = 1;
-		VUpdateVolume_r(ec, vp);
+		VUpdateVolume_r(ec, vp, 0);
 	    }
-	    FreeVolume(vp);
+#if defined(AFS_DEMAND_ATTACH_FS)
+	    VRequestSalvage_r(vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER);
+	    vp->nUsers = 0;
+	    *ec = VSALVAGING;
+#else /* AFS_DEMAND_ATTACH_FS */
 	    Log("VAttachVolume: volume %s needs to be salvaged; not attached.\n", path);
+	    FreeVolume(vp);
 	    *ec = VSALVAGE;
+#endif /* AFS_DEMAND_ATTACH_FS */
 	    return NULL;
 	}
 #endif /* FAST_RESTART */
+
 	if (V_destroyMe(vp) == DESTROY_ME) {
+#if defined(AFS_DEMAND_ATTACH_FS)
+	    /* schedule a salvage so the volume goes away on disk */
+	    VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+	    VChangeState_r(vp, VOL_STATE_ERROR);
+	    vp->nUsers = 0;
+#endif /* AFS_DEMAND_ATTACH_FS */
 	    FreeVolume(vp);
 	    Log("VAttachVolume: volume %s is junk; it should be destroyed at next salvage\n", path);
 	    *ec = VNOVOL;
@@ -953,18 +2386,21 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header,
 	}
     }
 
-    AddVolumeToHashTable(vp, V_id(vp));
     vp->nextVnodeUnique = V_uniquifier(vp);
     vp->vnodeIndex[vSmall].bitmap = vp->vnodeIndex[vLarge].bitmap = NULL;
 #ifndef BITMAP_LATER
     if (programType == fileServer && VolumeWriteable(vp)) {
 	int i;
 	for (i = 0; i < nVNODECLASSES; i++) {
-	    VOL_UNLOCK;
-	    GetBitmap(ec, vp, i);
-	    VOL_LOCK;
+	    VGetBitmap_r(ec, vp, i);
 	    if (*ec) {
+#ifdef AFS_DEMAND_ATTACH_FS
+		VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+		vp->nUsers = 0;
+		*ec = VSALVAGING;
+#else /* AFS_DEMAND_ATTACH_FS */
 		FreeVolume(vp);
+#endif /* AFS_DEMAND_ATTACH_FS */
 		Log("VAttachVolume: error getting bitmap for volume (%s)\n",
 		    path);
 		return NULL;
@@ -982,6 +2418,12 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header,
 	}
     }
 
+    AddVolumeToHashTable(vp, V_id(vp));
+#ifdef AFS_DEMAND_ATTACH_FS
+    AddVolumeToVByPList_r(vp);
+    VLRU_Add_r(vp);
+    VChangeState_r(vp, VOL_STATE_ATTACHED);
+#endif
     return vp;
 }
 
@@ -994,11 +2436,9 @@ Volume *
 VAttachVolume(Error * ec, VolumeId volumeId, int mode)
 {
     Volume *retVal;
-    VATTACH_LOCK;
     VOL_LOCK;
     retVal = VAttachVolume_r(ec, volumeId, mode);
     VOL_UNLOCK;
-    VATTACH_UNLOCK;
     return retVal;
 }
 
@@ -1028,21 +2468,39 @@ VAttachVolume_r(Error * ec, VolumeId volumeId, int mode)
  * we still guarantee we won't context swap, but the ref count won't be
  * incremented (otherwise we'd violate the invariant).
  */
+/* NOTE: with the demand attach fileserver extensions, the global lock
+ * is dropped within VHold */
+#ifdef AFS_DEMAND_ATTACH_FS
 static int
 VHold_r(register Volume * vp)
 {
     Error error;
 
-    if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
-	VolumeReplacements++;
-	ReadHeader(&error, V_diskDataHandle(vp), (char *)&V_disk(vp),
-		   sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION);
-	if (error)
-	    return error;
+    VCreateReservation_r(vp);
+    VWaitExclusiveState_r(vp);
+
+    LoadVolumeHeader(&error, vp);
+    if (error) {
+	VCancelReservation_r(vp);
+	return error;
     }
     vp->nUsers++;
+    VCancelReservation_r(vp);
+    return 0;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+static int
+VHold_r(register Volume * vp)
+{
+    Error error;
+
+    LoadVolumeHeader(&error, vp);
+    if (error)
+	return error;
+    vp->nUsers++;
     return 0;
 }
+#endif /* AFS_DEMAND_ATTACH_FS */
 
 static int
 VHold(register Volume * vp)
@@ -1054,59 +2512,26 @@ VHold(register Volume * vp)
     return retVal;
 }
 
-void
-VTakeOffline_r(register Volume * vp)
-{
-    assert(vp->nUsers > 0);
-    assert(programType == fileServer);
-    vp->goingOffline = 1;
-    V_needsSalvaged(vp) = 1;
-}
 
-void
-VTakeOffline(register Volume * vp)
-{
-    VOL_LOCK;
-    VTakeOffline_r(vp);
-    VOL_UNLOCK;
-}
+/***************************************************/
+/* get and put volume routines                     */
+/***************************************************/
 
 void
 VPutVolume_r(register Volume * vp)
 {
     assert(--vp->nUsers >= 0);
     if (vp->nUsers == 0) {
+	VCheckOffline(vp);
 	ReleaseVolumeHeader(vp->header);
-	if (vp->goingOffline) {
-	    Error error;
-	    assert(programType == fileServer);
-	    vp->goingOffline = 0;
-	    V_inUse(vp) = 0;
-	    VUpdateVolume_r(&error, vp);
-	    VCloseVolumeHandles_r(vp);
-	    if (LogLevel) {
-		Log("VOffline: Volume %u (%s) is now offline", V_id(vp),
-		    V_name(vp));
-		if (V_offlineMessage(vp)[0])
-		    Log(" (%s)", V_offlineMessage(vp));
-		Log("\n");
-	    }
-#ifdef AFS_PTHREAD_ENV
-	    assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
-#else /* AFS_PTHREAD_ENV */
-	    LWP_NoYieldSignal(VPutVolume);
-#endif /* AFS_PTHREAD_ENV */
-	}
-	if (vp->shuttingDown) {
-	    VReleaseVolumeHandles_r(vp);
-	    FreeVolume(vp);
-	    if (programType == fileServer)
-#ifdef AFS_PTHREAD_ENV
-		assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
-#else /* AFS_PTHREAD_ENV */
-		LWP_NoYieldSignal(VPutVolume);
-#endif /* AFS_PTHREAD_ENV */
+#ifdef AFS_DEMAND_ATTACH_FS
+	if (!VCheckDetach(vp)) {
+	    VCheckSalvage(vp);
+	    VCheckFree(vp);
 	}
+#else /* AFS_DEMAND_ATTACH_FS */
+	VCheckDetach(vp);
+#endif /* AFS_DEMAND_ATTACH_FS */
     }
 }
 
@@ -1118,15 +2543,16 @@ VPutVolume(register Volume * vp)
     VOL_UNLOCK;
 }
 
+
 /* Get a pointer to an attached volume.  The pointer is returned regardless
    of whether or not the volume is in service or on/off line.  An error
    code, however, is returned with an indication of the volume's status */
 Volume *
-VGetVolume(Error * ec, VolId volumeId)
+VGetVolume(Error * ec, Error * client_ec, VolId volumeId)
 {
     Volume *retVal;
     VOL_LOCK;
-    retVal = VGetVolume_r(ec, volumeId);
+    retVal = GetVolume(ec, client_ec, volumeId, NULL, 0);
     VOL_UNLOCK;
     return retVal;
 }
@@ -1134,22 +2560,69 @@ VGetVolume(Error * ec, VolId volumeId)
 Volume *
 VGetVolume_r(Error * ec, VolId volumeId)
 {
-    Volume *vp;
-    unsigned short V0 = 0, V1 = 0, V2 = 0, V3 = 0, V4 = 0, V5 = 0, V6 =
+    return GetVolume(ec, NULL, volumeId, NULL, 0);
+}
+
+/* try to get a volume we've previously looked up */
+/* for demand attach fs, caller MUST NOT hold a ref count on vp */
+Volume * 
+VGetVolumeByVp_r(Error * ec, Volume * vp)
+{
+    return GetVolume(ec, NULL, vp->hashid, vp, 0);
+}
+
+/* private interface for getting a volume handle
+ * volumeId must be provided; client_ec may be NULL.
+ * hint is an optional parameter to speed up hash lookups
+ * flags is not used at this time
+ */
+/* for demand attach fs, caller MUST NOT hold a ref count on hint */
+static Volume *
+GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flags)
+{
+    Volume *vp = hint;
+    /* pull this profiling/debugging code out of regular builds */
+#ifdef notdef
+#define VGET_CTR_INC(x) x++
+    unsigned short V0 = 0, V1 = 0, V2 = 0, V3 = 0, V5 = 0, V6 =
 	0, V7 = 0, V8 = 0, V9 = 0;
     unsigned short V10 = 0, V11 = 0, V12 = 0, V13 = 0, V14 = 0, V15 = 0;
+#else
+#define VGET_CTR_INC(x)
+#endif
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    Volume *avp, * rvp = hint;
+
+    if (rvp) {
+	VCreateReservation_r(rvp);
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
 
     for (;;) {
 	*ec = 0;
-	V0++;
-	for (vp = VolumeHashTable[VOLUME_HASH(volumeId)];
-	     vp && vp->hashid != volumeId; vp = vp->hashNext)
-	    Vlooks++;
+	if (client_ec)
+	    *client_ec = 0;
+	VGET_CTR_INC(V0);
+
+	vp = VLookupVolume_r(ec, volumeId, vp);
+	if (*ec) {
+	    vp = NULL;
+	    break;
+	}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+	if (rvp && (rvp != vp)) {
+	    /* break reservation on old vp */
+	    VCancelReservation_r(rvp);
+	    rvp = NULL;
+	}
+#endif /* AFS_DEMAND_ATTACH_FS */
 
 	if (!vp) {
-	    V1++;
+	    VGET_CTR_INC(V1);
 	    if (VInit < 2) {
-		V2++;
+		VGET_CTR_INC(V2);
 		/* Until we have reached an initialization level of 2
 		 * we don't know whether this volume exists or not.
 		 * We can't sleep and retry later because before a volume
@@ -1164,99 +2637,255 @@ VGetVolume_r(Error * ec, VolId volumeId)
 	    break;
 	}
 
-	V3++;
-	VolumeGets++;
-	if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
-	    V5++;
-	    VolumeReplacements++;
-	    ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
-		       sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
-		       VOLUMEINFOVERSION);
-	    if (*ec) {
-		V6++;
-		/* Only log the error if it was a totally unexpected error.  Simply
-		 * a missing inode is likely to be caused by the volume being deleted */
-		if (errno != ENXIO || LogLevel)
-		    Log("Volume %u: couldn't reread volume header\n",
-			vp->hashid);
-		FreeVolume(vp);
-		vp = NULL;
-		break;
-	    }
+	VGET_CTR_INC(V3);
+	IncUInt64(&VStats.hdr_gets);
+	
+#ifdef AFS_DEMAND_ATTACH_FS
+	/* block if someone else is performing an exclusive op on this volume */
+	if (rvp != vp) {
+	    rvp = vp;
+	    VCreateReservation_r(rvp);
 	}
-	V7++;
-	if (vp->shuttingDown) {
-	    V8++;
+	VWaitExclusiveState_r(vp);
+
+	/* short circuit with VNOVOL in the following circumstances:
+	 *
+	 *   VOL_STATE_ERROR
+	 *   VOL_STATE_SHUTTING_DOWN
+	 */
+	if ((V_attachState(vp) == VOL_STATE_ERROR) ||
+	    (V_attachState(vp) == VOL_STATE_SHUTTING_DOWN)) {
 	    *ec = VNOVOL;
 	    vp = NULL;
 	    break;
 	}
+
+	/* allowable states:
+	 *   UNATTACHED
+	 *   PREATTACHED
+	 *   ATTACHED
+	 *   GOING_OFFLINE
+	 *   SALVAGING
+	 */
+
+	if (vp->salvage.requested) {
+	    VUpdateSalvagePriority_r(vp);
+	}
+
+	if (V_attachState(vp) == VOL_STATE_PREATTACHED) {
+	    avp = VAttachVolumeByVp_r(ec, vp, 0);
+	    if (avp) {
+		if (vp != avp) {
+		    /* VAttachVolumeByVp_r can return a pointer
+		     * != the vp passed to it under certain
+		     * conditions; make sure we don't leak
+		     * reservations if that happens */
+		    vp = avp;
+		    VCancelReservation_r(rvp);
+		    rvp = avp;
+		    VCreateReservation_r(rvp);
+		}
+		VPutVolume_r(avp);
+	    }
+	    if (*ec) {
+		int endloop = 0;
+		switch (*ec) {
+		case VSALVAGING:
+		    break;
+		case VOFFLINE:
+		    if (!vp->pending_vol_op) {
+			endloop = 1;
+		    }
+		    break;
+		default:
+		    *ec = VNOVOL;
+		    endloop = 1;
+		}
+		if (endloop) {
+		    vp = NULL;
+		    break;
+		}
+	    }
+	}
+
+	if ((V_attachState(vp) == VOL_STATE_SALVAGING) ||
+	    (*ec == VSALVAGING)) {
+	    if (client_ec) {
+		/* see CheckVnode() in afsfileprocs.c for an explanation
+		 * of this error code logic */
+		afs_uint32 now = FT_ApproxTime();
+		if ((vp->stats.last_salvage + (10 * 60)) >= now) {
+		    *client_ec = VBUSY;
+		} else {
+		    *client_ec = VRESTARTING;
+		}
+	    }
+	    *ec = VSALVAGING;
+	    vp = NULL;
+	    break;
+	}
+
+	if (vp->pending_vol_op && !VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) {
+	    if (client_ec) {
+		/* see CheckVnode() in afsfileprocs.c for an explanation
+		 * of this error code logic */
+		afs_uint32 now = FT_ApproxTime();
+		if ((vp->stats.last_vol_op + (10 * 60)) >= now) {
+		    *client_ec = VBUSY;
+		} else {
+		    *client_ec = VRESTARTING;
+		}
+	    }
+	    *ec = VOFFLINE;
+	    vp = NULL;
+	    break;
+	}
+
+	if (V_attachState(vp) == VOL_STATE_UNATTACHED) {
+	    *ec = VOFFLINE;
+	    vp = NULL;
+	    break;
+	}
+#endif /* AFS_DEMAND_ATTACH_FS */
+	
+	LoadVolumeHeader(ec, vp);
+	if (*ec) {
+	    VGET_CTR_INC(V6);
+	    /* Only log the error if it was a totally unexpected error.  Simply
+	     * a missing inode is likely to be caused by the volume being deleted */
+	    if (errno != ENXIO || LogLevel)
+		Log("Volume %u: couldn't reread volume header\n",
+		    vp->hashid);
+#ifdef AFS_DEMAND_ATTACH_FS
+	    if (programType == fileServer) {
+		VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+		*ec = VSALVAGING;
+	    } else {
+		FreeVolume(vp);
+		vp = NULL;
+	    }
+#else /* AFS_DEMAND_ATTACH_FS */
+	    FreeVolume(vp);
+	    vp = NULL;
+#endif /* AFS_DEMAND_ATTACH_FS */
+	    break;
+	}
+
+	VGET_CTR_INC(V7);
+	if (vp->shuttingDown) {
+	    VGET_CTR_INC(V8);
+	    *ec = VNOVOL;
+	    vp = NULL;
+	    break;
+	}
+
 	if (programType == fileServer) {
-	    V9++;
+	    VGET_CTR_INC(V9);
 	    if (vp->goingOffline) {
-		V10++;
-#ifdef AFS_PTHREAD_ENV
-		pthread_cond_wait(&vol_put_volume_cond, &vol_glock_mutex);
+		VGET_CTR_INC(V10);
+#ifdef AFS_DEMAND_ATTACH_FS
+		/* wait for the volume to go offline */
+		if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) {
+		    VWaitStateChange_r(vp);
+		}
+#elif defined(AFS_PTHREAD_ENV)
+		assert(pthread_cond_wait(&vol_put_volume_cond, &vol_glock_mutex) == 0);
 #else /* AFS_PTHREAD_ENV */
 		LWP_WaitProcess(VPutVolume);
 #endif /* AFS_PTHREAD_ENV */
 		continue;
 	    }
 	    if (vp->specialStatus) {
-		V11++;
+		VGET_CTR_INC(V11);
 		*ec = vp->specialStatus;
 	    } else if (V_inService(vp) == 0 || V_blessed(vp) == 0) {
-		V12++;
+		VGET_CTR_INC(V12);
 		*ec = VNOVOL;
 	    } else if (V_inUse(vp) == 0) {
-		V13++;
+		VGET_CTR_INC(V13);
 		*ec = VOFFLINE;
 	    } else {
-		V14++;
+		VGET_CTR_INC(V14);
 	    }
 	}
 	break;
     }
-    V15++;
+    VGET_CTR_INC(V15);
+
+#ifdef AFS_DEMAND_ATTACH_FS
     /* if no error, bump nUsers */
-    if (vp)
+    if (vp) {
 	vp->nUsers++;
+	VLRU_UpdateAccess_r(vp);
+    }
+    if (rvp) {
+	VCancelReservation_r(rvp);
+	rvp = NULL;
+    }
+    if (client_ec && !*client_ec) {
+	*client_ec = *ec;
+    }
+#else /* AFS_DEMAND_ATTACH_FS */
+    /* if no error, bump nUsers */
+    if (vp) {
+	vp->nUsers++;
+    }
+    if (client_ec) {
+	*client_ec = *ec;
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
 
     assert(vp || *ec);
     return vp;
 }
 
 
-/* For both VForceOffline and VOffline, we close all relevant handles.
- * For VOffline, if we re-attach the volume, the files may possible be
- * different than before. 
- */
-static void
-VReleaseVolumeHandles_r(Volume * vp)
+/***************************************************/
+/* Volume offline/detach routines                  */
+/***************************************************/
+
+/* caller MUST hold a heavyweight ref on vp */
+#ifdef AFS_DEMAND_ATTACH_FS
+void
+VTakeOffline_r(register Volume * vp)
 {
-    DFlushVolume(V_id(vp));
-    VReleaseVnodeFiles_r(vp);
+    assert(vp->nUsers > 0);
+    assert(programType == fileServer);
 
-    /* Too time consuming and unnecessary for the volserver */
-    if (programType != volumeUtility) {
-	IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
-	IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
-	IH_CONDSYNC(vp->diskDataHandle);
-#ifdef AFS_NT40_ENV
-	IH_CONDSYNC(vp->linkHandle);
-#endif /* AFS_NT40_ENV */
-    }
+    VCreateReservation_r(vp);
+    VWaitExclusiveState_r(vp);
 
-    IH_RELEASE(vp->vnodeIndex[vLarge].handle);
-    IH_RELEASE(vp->vnodeIndex[vSmall].handle);
-    IH_RELEASE(vp->diskDataHandle);
-    IH_RELEASE(vp->linkHandle);
+    vp->goingOffline = 1;
+    V_needsSalvaged(vp) = 1;
+
+    VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+    VCancelReservation_r(vp);
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+void
+VTakeOffline_r(register Volume * vp)
+{
+    assert(vp->nUsers > 0);
+    assert(programType == fileServer);
+
+    vp->goingOffline = 1;
+    V_needsSalvaged(vp) = 1;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+void
+VTakeOffline(register Volume * vp)
+{
+    VOL_LOCK;
+    VTakeOffline_r(vp);
+    VOL_UNLOCK;
 }
 
 /* Force the volume offline, set the salvage flag.  No further references to
  * the volume through the volume package will be honored. */
+/* for demand attach, caller MUST hold ref count on vp */
 void
-VForceOffline_r(Volume * vp)
+VForceOffline_r(Volume * vp, int flags)
 {
     Error error;
     if (!V_inUse(vp))
@@ -1267,7 +2896,17 @@ VForceOffline_r(Volume * vp)
     V_inUse(vp) = 0;
     vp->goingOffline = 0;
     V_needsSalvaged(vp) = 1;
-    VUpdateVolume_r(&error, vp);
+    if (!(flags & VOL_FORCEOFF_NOUPDATE)) {
+	VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT | VOL_UPDATE_NOFORCEOFF);
+    }
+#ifdef AFS_DEMAND_ATTACH_FS
+#ifdef SALVSYNC_BUILD_CLIENT
+    if (programType == fileServer) {
+	VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+    }
+#endif
+    VChangeState_r(vp, VOL_STATE_ERROR);
+#endif /* AFS_DEMAND_ATTACH_FS */
 #ifdef AFS_PTHREAD_ENV
     assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
 #else /* AFS_PTHREAD_ENV */
@@ -1275,14 +2914,13 @@ VForceOffline_r(Volume * vp)
 #endif /* AFS_PTHREAD_ENV */
 
     VReleaseVolumeHandles_r(vp);
-
 }
 
 void
 VForceOffline(Volume * vp)
 {
     VOL_LOCK;
-    VForceOffline_r(vp);
+    VForceOffline_r(vp, 0);
     VOL_UNLOCK;
 }
 
@@ -1295,6 +2933,7 @@ VOffline_r(Volume * vp, char *message)
 {
     Error error;
     VolumeId vid = V_id(vp);
+
     assert(programType != volumeUtility);
     if (!V_inUse(vp)) {
 	VPutVolume_r(vp);
@@ -1303,11 +2942,24 @@ VOffline_r(Volume * vp, char *message)
     if (V_offlineMessage(vp)[0] == '\0')
 	strncpy(V_offlineMessage(vp), message, sizeof(V_offlineMessage(vp)));
     V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0';
+
     vp->goingOffline = 1;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VChangeState_r(vp, VOL_STATE_GOING_OFFLINE);
+    VCreateReservation_r(vp);
+    VPutVolume_r(vp);
+
+    /* wait for the volume to go offline */
+    if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) {
+	VWaitStateChange_r(vp);
+    }
+    VCancelReservation_r(vp);
+#else /* AFS_DEMAND_ATTACH_FS */
     VPutVolume_r(vp);
     vp = VGetVolume_r(&error, vid);	/* Wait for it to go offline */
     if (vp)			/* In case it was reattached... */
 	VPutVolume_r(vp);
+#endif /* AFS_DEMAND_ATTACH_FS */
 }
 
 void
@@ -1318,36 +2970,12 @@ VOffline(Volume * vp, char *message)
     VOL_UNLOCK;
 }
 
-/* For VDetachVolume, we close all cached file descriptors, but keep
- * the Inode handles in case we need to read from a busy volume.
- */
-static void
-VCloseVolumeHandles_r(Volume * vp)
-{
-    DFlushVolume(V_id(vp));
-    VCloseVnodeFiles_r(vp);
-
-    /* Too time consuming and unnecessary for the volserver */
-    if (programType != volumeUtility) {
-	IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
-	IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
-	IH_CONDSYNC(vp->diskDataHandle);
-#ifdef AFS_NT40_ENV
-	IH_CONDSYNC(vp->linkHandle);
-#endif /* AFS_NT40_ENV */
-    }
-
-    IH_REALLYCLOSE(vp->vnodeIndex[vLarge].handle);
-    IH_REALLYCLOSE(vp->vnodeIndex[vSmall].handle);
-    IH_REALLYCLOSE(vp->diskDataHandle);
-    IH_REALLYCLOSE(vp->linkHandle);
-}
-
 /* This gets used for the most part by utility routines that don't want
  * to keep all the volume headers around.  Generally, the file server won't
  * call this routine, because then the offline message in the volume header
- * (or other information) will still be available to clients. For NAMEI, also
- * close the file handles.
+ * (or other information) won't be available to clients. For NAMEI, also
+ * close the file handles.  However, the fileserver does call this during
+ * an attach following a volume operation.
  */
 void
 VDetachVolume_r(Error * ec, Volume * vp)
@@ -1365,9 +2993,18 @@ VDetachVolume_r(Error * ec, Volume * vp)
     volume = V_id(vp);
     DeleteVolumeFromHashTable(vp);
     vp->shuttingDown = 1;
+#ifdef AFS_DEMAND_ATTACH_FS
+    DeleteVolumeFromVByPList_r(vp);
+    VLRU_Delete_r(vp);
+    VChangeState_r(vp, VOL_STATE_SHUTTING_DOWN);
+#endif /* AFS_DEMAND_ATTACH_FS */
     VPutVolume_r(vp);
     /* Will be detached sometime in the future--this is OK since volume is offline */
 
+    /* XXX the following code should really be moved to VCheckDetach() since the volume
+     * is not technically detached until the refcounts reach zero
+     */
+#ifdef FSSYNC_BUILD_CLIENT
     if (programType == volumeUtility && notifyServer) {
 	/* 
 	 * Note:  The server is not notified in the case of a bogus volume 
@@ -1378,19 +3015,26 @@ VDetachVolume_r(Error * ec, Volume * vp)
 	 * would be two instances of the same volume, one of them bogus, 
 	 * which the file server would attempt to put on line 
 	 */
-	if (useDone)
+	if (useDone) {
 	    /* don't put online */
-	    FSYNC_askfs(volume, tpartp->name, FSYNC_DONE, 0);
-	else {
+	    FSYNC_VolOp(volume, tpartp->name, FSYNC_VOL_DONE, 0, NULL);
+	} else {
 	    /* fs can use it again */
-	    FSYNC_askfs(volume, tpartp->name, FSYNC_ON, 0);
+	    FSYNC_VolOp(volume, tpartp->name, FSYNC_VOL_ON, 0, NULL);
+
+	    /* XXX this code path is only hit by volume utilities, thus
+	     * V_BreakVolumeCallbacks will always be NULL.  if we really
+	     * want to break callbacks in this path we need to use FSYNC_VolOp() */
+#ifdef notdef
 	    /* Dettaching it so break all callbacks on it */
 	    if (V_BreakVolumeCallbacks) {
 		Log("volume %u detached; breaking all call backs\n", volume);
 		(*V_BreakVolumeCallbacks) (volume);
 	    }
+#endif
 	}
     }
+#endif /* FSSYNC_BUILD_CLIENT */
 }
 
 void
@@ -1402,20 +3046,848 @@ VDetachVolume(Error * ec, Volume * vp)
 }
 
 
-VnodeId
-VAllocBitmapEntry_r(Error * ec, Volume * vp, register struct vnodeIndex
-		    *index)
+/***************************************************/
+/* Volume fd/inode handle closing routines         */
+/***************************************************/
+
+/* For VDetachVolume, we close all cached file descriptors, but keep
+ * the Inode handles in case we need to read from a busy volume.
+ */
+/* for demand attach, caller MUST hold ref count on vp */
+static void
+VCloseVolumeHandles_r(Volume * vp)
 {
-    register byte *bp, *ep;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+
+    state_save = VChangeState_r(vp, VOL_STATE_OFFLINING);	/* previous state is put back at the end */
+#endif
+
+    /* demand attach fs
+     *
+     * XXX need to investigate whether we can perform
+     * DFlushVolume outside of vol_glock_mutex... 
+     *
+     * VCloseVnodeFiles_r drops the glock internally */
+    DFlushVolume(V_id(vp));
+    VCloseVnodeFiles_r(vp);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_UNLOCK;	/* demand attach only: the sync/close calls below run between VOL_UNLOCK and VOL_LOCK */
+#endif
+
+    /* Too time consuming and unnecessary for the volserver */
+    if (programType != volumeUtility) {
+	IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
+	IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
+	IH_CONDSYNC(vp->diskDataHandle);
+#ifdef AFS_NT40_ENV
+	IH_CONDSYNC(vp->linkHandle);
+#endif /* AFS_NT40_ENV */
+    }
+
+    IH_REALLYCLOSE(vp->vnodeIndex[vLarge].handle);
+    IH_REALLYCLOSE(vp->vnodeIndex[vSmall].handle);
+    IH_REALLYCLOSE(vp->diskDataHandle);
+    IH_REALLYCLOSE(vp->linkHandle);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_LOCK;
+    VChangeState_r(vp, state_save);	/* restore the state saved on entry */
+#endif
+}
+
+/* For both VForceOffline and VOffline, we close all relevant handles.
+ * For VOffline, if we re-attach the volume, the files may possibly be
+ * different than before. 
+ */
+/* for demand attach, caller MUST hold a ref count on vp */
+static void
+VReleaseVolumeHandles_r(Volume * vp)
+{
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+
+    state_save = VChangeState_r(vp, VOL_STATE_DETACHING);	/* previous state is put back at the end */
+#endif
+
+    /* XXX need to investigate whether we can perform
+     * DFlushVolume outside of vol_glock_mutex... */
+    DFlushVolume(V_id(vp));
+
+    VReleaseVnodeFiles_r(vp); /* releases the glock internally */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_UNLOCK;	/* demand attach only: the sync/release calls below run between VOL_UNLOCK and VOL_LOCK */
+#endif
+
+    /* Too time consuming and unnecessary for the volserver */
+    if (programType != volumeUtility) {
+	IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
+	IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
+	IH_CONDSYNC(vp->diskDataHandle);
+#ifdef AFS_NT40_ENV
+	IH_CONDSYNC(vp->linkHandle);
+#endif /* AFS_NT40_ENV */
+    }
+
+    IH_RELEASE(vp->vnodeIndex[vLarge].handle);
+    IH_RELEASE(vp->vnodeIndex[vSmall].handle);
+    IH_RELEASE(vp->diskDataHandle);
+    IH_RELEASE(vp->linkHandle);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_LOCK;
+    VChangeState_r(vp, state_save);	/* restore the state saved on entry */
+#endif
+}
+
+
+/***************************************************/
+/* Volume write and fsync routines                 */
+/***************************************************/
+
+void
+VUpdateVolume_r(Error * ec, Volume * vp, int flags)	/* writes the volume header; *ec is 0 on success */
+{
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+
+    if (flags & VOL_UPDATE_WAIT) {	/* without this flag no reservation is taken and no wait is done here */
+	VCreateReservation_r(vp);
+	VWaitExclusiveState_r(vp);
+    }
+#endif
+
     *ec = 0;
+    if (programType == fileServer)
+	V_uniquifier(vp) =
+	    (V_inUse(vp) ? V_nextVnodeUnique(vp) +
+	     200 : V_nextVnodeUnique(vp));	/* an in-use volume records nextVnodeUnique + 200 */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    state_save = VChangeState_r(vp, VOL_STATE_UPDATING);
+    VOL_UNLOCK;	/* the header write below happens between VOL_UNLOCK and VOL_LOCK */
+#endif
+
+    WriteVolumeHeader_r(ec, vp);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_LOCK;
+    VChangeState_r(vp, state_save);
+    if (flags & VOL_UPDATE_WAIT) {
+	VCancelReservation_r(vp);	/* NOTE(review): this can end in ReallyFreeVolume(); vp is still used below - confirm callers hold another ref */
+    }
+#endif
+
+    if (*ec) {
+	Log("VUpdateVolume: error updating volume header, volume %u (%s)\n",
+	    V_id(vp), V_name(vp));
+	/* try to update on-disk header, 
+	 * while preventing infinite recursion */
+	if (!(flags & VOL_UPDATE_NOFORCEOFF)) {
+	    VForceOffline_r(vp, VOL_FORCEOFF_NOUPDATE);	/* NOUPDATE stops VForceOffline_r from calling back into this function */
+	}
+    }
+}
+
+void
+VUpdateVolume(Error * ec, Volume * vp)	/* locked wrapper around VUpdateVolume_r */
+{
+    VOL_LOCK;
+    VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT);	/* always uses the waiting variant */
+    VOL_UNLOCK;
+}
+
+void
+VSyncVolume_r(Error * ec, Volume * vp, int flags)	/* header update followed by a sync of the disk data handle */
+{
+    FdHandle_t *fdP;
+    int code;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+#endif
+
+    if (flags & VOL_SYNC_WAIT) {	/* VOL_SYNC_WAIT is translated to VOL_UPDATE_WAIT */
+	VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT);
+    } else {
+	VUpdateVolume_r(ec, vp, 0);
+    }
+    if (!*ec) {	/* the sync is skipped when the header update failed */
+#ifdef AFS_DEMAND_ATTACH_FS
+	state_save = VChangeState_r(vp, VOL_STATE_UPDATING);
+	VOL_UNLOCK;
+#endif
+	fdP = IH_OPEN(V_diskDataHandle(vp));
+	assert(fdP != NULL);	/* open failure is treated as fatal, not reported through *ec */
+	code = FDH_SYNC(fdP);
+	assert(code == 0);	/* likewise for a failed sync */
+	FDH_CLOSE(fdP);
+#ifdef AFS_DEMAND_ATTACH_FS
+	VOL_LOCK;
+	VChangeState_r(vp, state_save);
+#endif
+    }
+}
+
+void
+VSyncVolume(Error * ec, Volume * vp)	/* locked wrapper around VSyncVolume_r */
+{
+    VOL_LOCK;
+    VSyncVolume_r(ec, vp, VOL_SYNC_WAIT);	/* always uses the waiting variant */
+    VOL_UNLOCK;
+}
+
+
+/***************************************************/
+/* Volume deallocation routines                    */
+/***************************************************/
+
+#ifdef AFS_DEMAND_ATTACH_FS
+static void
+FreeVolume(Volume * vp)	/* demand attach variant */
+{
+    /* free the heap space, iff it's safe.
+     * otherwise, pull it out of the hash table, so it
+     * will get deallocated when all refs to it go away */
+    if (!VCheckFree(vp)) {	/* a return of 1 means vp was already freed by VCheckFree */
+	DeleteVolumeFromHashTable(vp);
+	DeleteVolumeFromVByPList_r(vp);
+
+	/* make sure we invalidate the header cache entry */
+	FreeVolumeHeader(vp);
+    }
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+static void
+ReallyFreeVolume(Volume * vp)	/* releases vp and the allocations it owns */
+{
+    int i;
+    if (!vp)	/* NULL is accepted and ignored */
+	return;
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* debug: mark the struct as freed before releasing it */
+    VChangeState_r(vp, VOL_STATE_FREED);
+    if (vp->pending_vol_op)
+	free(vp->pending_vol_op);
+#endif /* AFS_DEMAND_ATTACH_FS */
+    for (i = 0; i < nVNODECLASSES; i++)
+	if (vp->vnodeIndex[i].bitmap)
+	    free(vp->vnodeIndex[i].bitmap);
+    FreeVolumeHeader(vp);
+#ifndef AFS_DEMAND_ATTACH_FS
+    DeleteVolumeFromHashTable(vp);	/* only non-demand-attach builds unhash here */
+#endif /* !AFS_DEMAND_ATTACH_FS */
+    free(vp);
+}
+
+/* check to see if we should shutdown this volume
+ * returns 1 if volume was freed, 0 otherwise */
+#ifdef AFS_DEMAND_ATTACH_FS
+static int
+VCheckDetach(register Volume * vp)
+{
+    int ret = 0;
+
+    if (vp->nUsers || vp->nWaiters)	/* still referenced by a user or a reservation */
+	return ret;
+
+    if (vp->shuttingDown) {
+	ret = 1;
+	VReleaseVolumeHandles_r(vp);
+	VCheckSalvage(vp);
+	ReallyFreeVolume(vp);	/* vp is free()d here; callers must not touch it after a return of 1 */
+	if (programType == fileServer) {
+	    assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
+	}
+    }
+    return ret;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+static int
+VCheckDetach(register Volume * vp)
+{
+    int ret = 0;
+
+    if (vp->nUsers)	/* this variant checks nUsers only */
+	return ret;
+
+    if (vp->shuttingDown) {
+	ret = 1;
+	VReleaseVolumeHandles_r(vp);
+	ReallyFreeVolume(vp);	/* vp is free()d here; callers must not touch it after a return of 1 */
+	if (programType == fileServer) {
+#if defined(AFS_PTHREAD_ENV)
+	    assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
+#else /* AFS_PTHREAD_ENV */
+	    LWP_NoYieldSignal(VPutVolume);
+#endif /* AFS_PTHREAD_ENV */
+	}
+    }
+    return ret;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/* check to see if we should offline this volume
+ * return 1 if volume went offline, 0 otherwise */
+#ifdef AFS_DEMAND_ATTACH_FS
+static int
+VCheckOffline(register Volume * vp)
+{
+    Volume * rvp = NULL;	/* not referenced anywhere in this function */
+    int ret = 0;
+
+    if (vp->goingOffline && !vp->nUsers) {
+	Error error;
+	assert(programType == fileServer);
+	assert((V_attachState(vp) != VOL_STATE_ATTACHED) &&
+	       (V_attachState(vp) != VOL_STATE_FREED) &&
+	       (V_attachState(vp) != VOL_STATE_PREATTACHED) &&
+	       (V_attachState(vp) != VOL_STATE_UNATTACHED));
+
+	/* valid states:
+	 *
+	 * VOL_STATE_GOING_OFFLINE
+	 * VOL_STATE_SHUTTING_DOWN
+	 * IsErrorState(V_attachState(vp))
+	 * IsExclusiveState(V_attachState(vp))
+	 */
+
+	VCreateReservation_r(vp);	/* held until the VCancelReservation_r at the bottom of this branch */
+	VChangeState_r(vp, VOL_STATE_OFFLINING);
+
+	ret = 1;
+	/* must clear the goingOffline flag before we drop the glock */
+	vp->goingOffline = 0;
+	V_inUse(vp) = 0;
+
+	VLRU_Delete_r(vp);
+
+	/* perform async operations */
+	VUpdateVolume_r(&error, vp, 0);	/* error is not examined afterwards */
+	VCloseVolumeHandles_r(vp);
+
+	/* invalidate the volume header cache entry */
+	FreeVolumeHeader(vp);
+
+	if (LogLevel) {	/* NOTE(review): V_id/V_name are read after FreeVolumeHeader(vp) - confirm the header is still readable */
+	    Log("VOffline: Volume %u (%s) is now offline", V_id(vp),
+		V_name(vp));
+	    if (V_offlineMessage(vp)[0])
+		Log(" (%s)", V_offlineMessage(vp));
+	    Log("\n");
+	}
+
+	/* if nothing changed state to error or salvaging,
+	 * drop state to unattached */
+	if (!IsErrorState(V_attachState(vp))) {
+	    VChangeState_r(vp, VOL_STATE_UNATTACHED);
+	}
+	VCancelReservation_r(vp);
+    }
+    return ret;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+static int
+VCheckOffline(register Volume * vp)
+{
+    Volume * rvp = NULL;	/* not referenced anywhere in this function */
+    int ret = 0;
+
+    if (vp->goingOffline && !vp->nUsers) {
+	Error error;
+	assert(programType == fileServer);
+
+	ret = 1;
+	vp->goingOffline = 0;
+	V_inUse(vp) = 0;
+	VUpdateVolume_r(&error, vp, 0);	/* error is not examined afterwards */
+	VCloseVolumeHandles_r(vp);
+	FreeVolumeHeader(vp);
+	if (LogLevel) {	/* NOTE(review): V_id/V_name are read after FreeVolumeHeader(vp) - confirm the header is still readable */
+	    Log("VOffline: Volume %u (%s) is now offline", V_id(vp),
+		V_name(vp));
+	    if (V_offlineMessage(vp)[0])
+		Log(" (%s)", V_offlineMessage(vp));
+	    Log("\n");
+	}
+#ifdef AFS_PTHREAD_ENV
+	assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
+#else /* AFS_PTHREAD_ENV */
+	LWP_NoYieldSignal(VPutVolume);
+#endif /* AFS_PTHREAD_ENV */
+    }
+    return ret;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/***************************************************/
+/* demand attach fs ref counting routines          */
+/***************************************************/
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* the following two functions handle reference counting for
+ * asynchronous operations on volume structs.
+ *
+ * their purpose is to prevent a VDetachVolume or VShutdown
+ * from free()ing the Volume struct during an async i/o op */
+
+/* register with the async volume op ref counter (vp->nWaiters) */
+static void
+VCreateReservation_r(Volume * vp)
+{
+    vp->nWaiters++;	/* undone by VCancelReservation_r(vp) */
+}
+
+/* unregister with the async volume op ref counter; dropping the last reservation runs the deferred checks */
+static void
+VCancelReservation_r(Volume * vp)
+{
+    assert(--vp->nWaiters >= 0);	/* NOTE: the decrement itself lives inside assert(); it is lost if assert() is ever compiled out */
+    if (vp->nWaiters == 0) {
+	VCheckOffline(vp);	/* acts only when goingOffline is set and nUsers is 0 */
+	if (!VCheckDetach(vp)) {	/* a return of 1 means vp was freed, so it is not touched again */
+	    VCheckSalvage(vp);
+	    VCheckFree(vp);
+	}
+    }
+}
+
+/* check to see if we should free this volume now
+ * return 1 if volume was freed, 0 otherwise */
+static int
+VCheckFree(Volume * vp)
+{
+    int ret = 0;
+    if ((vp->nUsers == 0) &&
+	(vp->nWaiters == 0) &&
+	!(V_attachFlags(vp) & (VOL_IN_HASH | 
+			       VOL_ON_VBYP_LIST | 
+			       VOL_IS_BUSY |
+			       VOL_ON_VLRU))) {	/* any one of these attach flags blocks the free */
+	ReallyFreeVolume(vp);	/* vp is free()d here; callers must not touch it after a return of 1 */
+	ret = 1;
+    }
+    return ret;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* online volume operations routines               */
+/***************************************************/
+
+#ifdef AFS_DEMAND_ATTACH_FS
+int
+VRegisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)	/* always returns 0 */
+{
+    FSSYNC_VolOp_info * info;
+
+    /* attach a vol op info node to the volume struct */
+    info = (FSSYNC_VolOp_info *) malloc(sizeof(FSSYNC_VolOp_info));
+    assert(info != NULL);	/* allocation failure is fatal rather than reported */
+    memcpy(info, vopinfo, sizeof(FSSYNC_VolOp_info));	/* vp keeps its own heap copy of *vopinfo */
+    vp->pending_vol_op = info;	/* NOTE(review): an existing pending_vol_op is overwritten without free() - confirm callers deregister first */
+
+    /* update stats */
+    vp->stats.last_vol_op = FT_ApproxTime();
+    vp->stats.vol_ops++;
+    IncUInt64(&VStats.vol_ops);
+
+    return 0;
+}
+
+int
+VDeregisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)	/* vopinfo is not used; always returns 0 */
+{
+    if (vp->pending_vol_op) {
+	free(vp->pending_vol_op);
+	vp->pending_vol_op = NULL;	/* cleared so a later ReallyFreeVolume() does not free it again */
+    }
+    return 0;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+int
+VVolOpLeaveOnline_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)	/* returns 0 or 1 */
+{
+    return (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&	/* only NEEDVOLUME requests can qualify */
+	    (vopinfo->com.reason == V_READONLY ||	/* V_READONLY qualifies on any volume */
+	     (!VolumeWriteable(vp) &&	/* clone and dump qualify only on a non-writeable volume */
+	      (vopinfo->com.reason == V_CLONE ||
+	       vopinfo->com.reason == V_DUMP))));
+}
+
+int
+VVolOpSetVBusy_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)	/* returns 0 or 1; vp is not used */
+{
+    return (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&	/* true only for a NEEDVOLUME request... */
+	    (vopinfo->com.reason == V_CLONE ||	/* ...whose reason is clone or dump */
+	     vopinfo->com.reason == V_DUMP));
+}
+
+
+/***************************************************/
+/* online salvager routines                        */
+/***************************************************/
+#if defined(AFS_DEMAND_ATTACH_FS)
+#define SALVAGE_PRIO_UPDATE_INTERVAL 3      /* number of seconds between prio updates */
+#define SALVAGE_COUNT_MAX 16                /* number of online salvages we
+					     * allow before moving the volume
+					     * into a permanent error state
+					     *
+					     * once this threshold is reached,
+					     * the operator will have to manually
+					     * issue a 'bos salvage' to bring
+					     * the volume back online
+					     */
+
+/* check to see if we should salvage this volume
+ * returns 1 if a salvage was requested and scheduling was attempted, 0 otherwise */
+static int
+VCheckSalvage(register Volume * vp)
+{
+    int ret = 0;
+#ifdef SALVSYNC_BUILD_CLIENT
+    if (vp->nUsers || vp->nWaiters)	/* still referenced: leave the request pending */
+	return ret;
+    if (vp->salvage.requested) {
+	VScheduleSalvage_r(vp);	/* its result is ignored, so ret is 1 even when scheduling fails */
+	ret = 1;
+    }
+#endif /* SALVSYNC_BUILD_CLIENT */
+    return ret;	/* always 0 when SALVSYNC_BUILD_CLIENT is not defined */
+}
+
+/*
+ * request that a salvage be performed once
+ * ref counts reach zero; returns 1 when not running as the fileserver, else 0
+ */
+int
+VRequestSalvage_r(Volume * vp, int reason, int flags)
+{
+#ifdef SALVSYNC_BUILD_CLIENT
+    if (programType != fileServer)
+	return 1;
+
+    if (!vp->salvage.requested) {	/* first request wins: repeat calls leave reason, timestamp and state untouched */
+	vp->salvage.requested = 1;
+	vp->salvage.reason = reason;
+	vp->stats.last_salvage = FT_ApproxTime();
+	if (flags & VOL_SALVAGE_INVALIDATE_HEADER) {
+	    ReleaseVolumeHeader(vp->header);
+	}
+	if (vp->stats.salvages < SALVAGE_COUNT_MAX) {
+	    VChangeState_r(vp, VOL_STATE_SALVAGING);
+	} else {	/* salvage budget used up: park the volume in the error state instead */
+	    Log("VRequestSalvage: volume %u online salvaged too many times; forced offline.\n", vp->hashid);
+	    VChangeState_r(vp, VOL_STATE_ERROR);
+	}
+    }
+#endif /* SALVSYNC_BUILD_CLIENT */
+    return 0;	/* also the result when SALVSYNC_BUILD_CLIENT is not defined (nothing is done) */
+}
+
+/*
+ * update salvage priority; returns 1 if a RAISEPRIO request was sent and did not return SYNC_OK, else 0
+ */
+static int
+VUpdateSalvagePriority_r(Volume * vp)
+{
+    int code, ret=0;
+    afs_uint32 now;
+
+#ifdef SALVSYNC_BUILD_CLIENT
+    vp->salvage.prio++;	/* bumped on every call, whether or not a request is sent */
+    now = FT_ApproxTime();
+
+    /* update the salvageserver priority queue occasionally so that
+     * frequently requested volumes get moved to the head of the queue 
+     */
+    if ((vp->salvage.scheduled) &&
+	(vp->stats.last_salvage_req < (now-SALVAGE_PRIO_UPDATE_INTERVAL))) {
+	code = SALVSYNC_SalvageVolume(vp->hashid,	/* no VOL_UNLOCK around this call, unlike VScheduleSalvage_r */
+				      VPartitionPath(vp->partition),
+				      SALVSYNC_RAISEPRIO,
+				      vp->salvage.reason,
+				      vp->salvage.prio,
+				      NULL);
+	vp->stats.last_salvage_req = now;	/* recorded even when the request failed */
+	if (code != SYNC_OK) {
+	    ret = 1;
+	}
+    }
+#endif /* SALVSYNC_BUILD_CLIENT */
+    return ret;
+}
+
+
+/*
+ * schedule a salvage with the salvage server; returns 0 on success or if already scheduled, 1 otherwise
+ */
+static int
+VScheduleSalvage_r(Volume * vp)
+{
+    int code, ret=0;
+#ifdef SALVSYNC_BUILD_CLIENT
+    VolState state_save;
+    char partName[16];
+
+    if (vp->nWaiters || vp->nUsers) {	/* still referenced: not attempted */
+	return 1;
+    }
+
+    /* prevent endless salvage,attach,salvage,attach,... loops */
+    if (vp->stats.salvages >= SALVAGE_COUNT_MAX)
+	return 1;
+
+    if (!vp->salvage.scheduled) {
+	/* if we haven't previously scheduled a salvage, do so now 
+	 *
+	 * set the volume to an exclusive state and drop the lock
+	 * around the SALVSYNC call
+	 */
+	strlcpy(partName, VPartitionPath(vp->partition), sizeof(partName));	/* copied before VOL_UNLOCK */
+	state_save = VChangeState_r(vp, VOL_STATE_SALVSYNC_REQ);
+	V_attachFlags(vp) |= VOL_IS_BUSY;	/* VOL_IS_BUSY is one of the flags that makes VCheckFree() refuse to free vp */
+	VOL_UNLOCK;
+
+	/* can't use V_id() since there's no guarantee
+	 * we have the disk data header at this point */
+	code = SALVSYNC_SalvageVolume(vp->hashid,
+				      partName,
+				      SALVSYNC_SALVAGE,
+				      vp->salvage.reason,
+				      vp->salvage.prio,
+				      NULL);
+	VOL_LOCK;
+	VChangeState_r(vp, state_save);	/* restore the state saved before the request */
+	V_attachFlags(vp) &= ~(VOL_IS_BUSY);
+
+	if (code == SYNC_OK) {
+	    vp->salvage.scheduled = 1;
+	    vp->stats.salvages++;	/* counts toward SALVAGE_COUNT_MAX */
+	    vp->stats.last_salvage_req = FT_ApproxTime();
+	    IncUInt64(&VStats.salvages);
+	} else {
+	    ret = 1;
+	    switch(code) {
+	    case SYNC_BAD_COMMAND:
+	    case SYNC_COM_ERROR:	/* these two codes are not logged here */
+		break;
+	    case SYNC_DENIED:
+		Log("VScheduleSalvage_r:  SALVSYNC request denied\n");
+		break;
+	    default:
+		Log("VScheduleSalvage_r:  SALVSYNC unknown protocol error\n");
+		break;
+	    }
+	}
+    }
+#endif /* SALVSYNC_BUILD_CLIENT */
+    return ret;	/* always 0 when SALVSYNC_BUILD_CLIENT is not defined */
+}
+
+/*
+ * cancel a scheduled salvage operation; returns 1 if the cancel request did not return SYNC_OK, else 0
+ */
+static int
+VCancelSalvage_r(Volume * vp, int reason)
+{
+    int code, ret = 0;
+
+#ifdef SALVSYNC_BUILD_CLIENT
+    if (vp->salvage.scheduled) {	/* nothing is sent when no salvage is scheduled */
+	code = SALVSYNC_SalvageVolume(vp->hashid,
+				      VPartitionPath(vp->partition),
+				      SALVSYNC_CANCEL,
+				      reason,
+				      0,
+				      NULL);
+	if (code == SYNC_OK) {
+	    vp->salvage.scheduled = 0;	/* salvage.requested is left as it was */
+	} else {
+	    ret = 1;
+	}
+    }
+#endif /* SALVSYNC_BUILD_CLIENT */
+    return ret;
+}
+
+/* SALVSYNC client connection management.  Each public routine wraps its
+   _r counterpart in VOL_LOCK/VOL_UNLOCK; the _r routines assert that they
+   are not running in the salvageserver or in a volume utility before
+   calling SALVSYNC_clientInit, SALVSYNC_clientFinis or SALVSYNC_clientReconnect. */
+#ifdef SALVSYNC_BUILD_CLIENT
+int
+VConnectSALV(void)
+{
+    int retVal;
+    VOL_LOCK;
+    retVal = VConnectSALV_r();
+    VOL_UNLOCK;
+    return retVal;	/* whatever VConnectSALV_r() returned */
+}
+
+int
+VConnectSALV_r(void)
+{
+    assert((programType != salvageServer) &&	/* refused in the salvageserver and in volume utilities */
+	   (programType != volumeUtility));
+    return SALVSYNC_clientInit();	/* result passed through unchanged */
+}
+
+int
+VDisconnectSALV(void)	/* locked wrapper around VDisconnectSALV_r */
+{
+    int retVal;
+    VOL_LOCK;
+    retVal = VDisconnectSALV_r();	/* capture the result so the return below is not an uninitialized value */
+    VOL_UNLOCK;
+    return retVal;
+}
+
+int
+VDisconnectSALV_r(void)
+{ 
+    assert((programType != salvageServer) &&	/* refused in the salvageserver and in volume utilities */
+	   (programType != volumeUtility));
+    return SALVSYNC_clientFinis();	/* result passed through unchanged */
+}
+
+int
+VReconnectSALV(void)	/* locked wrapper around VReconnectSALV_r */
+{
+    int retVal;
+    VOL_LOCK;
+    retVal = VReconnectSALV_r();
+    VOL_UNLOCK;
+    return retVal;	/* whatever VReconnectSALV_r() returned */
+}
+
+int
+VReconnectSALV_r(void)
+{
+    assert((programType != salvageServer) &&	/* refused in the salvageserver and in volume utilities */
+	   (programType != volumeUtility));
+    return SALVSYNC_clientReconnect();	/* result passed through unchanged */
+}
+#endif /* SALVSYNC_BUILD_CLIENT */
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* FSSYNC routines                                 */
+/***************************************************/
+
+/* This must be called by any volume utility which needs to run while the
+   file server is also running.  This is separated from VInitVolumePackage so
+   that a utility can fork--and each of the children can independently
+   initialize communication with the file server */
+#ifdef FSSYNC_BUILD_CLIENT
+int
+VConnectFS(void)	/* locked wrapper around VConnectFS_r */
+{
+    int retVal;
+    VOL_LOCK;
+    retVal = VConnectFS_r();
+    VOL_UNLOCK;
+    return retVal;	/* whatever VConnectFS_r() returned */
+}
+
+int
+VConnectFS_r(void)
+{
+    int rc;
+    assert((VInit == 2) && 
+	   (programType != fileServer) &&	/* refused in the fileserver and the salvager */
+	   (programType != salvager));
+    rc = FSYNC_clientInit();
+    if (rc)	/* a nonzero result is what advances VInit, i.e. it is treated as success here */
+	VInit = 3;
+    return rc;
+}
+
+void
+VDisconnectFS_r(void)
+{
+    assert((programType != fileServer) &&	/* refused in the fileserver and the salvager */
+	   (programType != salvager));
+    FSYNC_clientFinis();
+    VInit = 2;	/* back to the level VConnectFS_r asserts on entry */
+}
+
+void
+VDisconnectFS(void)	/* locked wrapper around VDisconnectFS_r */
+{
+    VOL_LOCK;
+    VDisconnectFS_r();
+    VOL_UNLOCK;
+}
+
+static int
+VChildProcReconnectFS_r(void)
+{
+    return FSYNC_clientChildProcReconnect();	/* result passed through unchanged; no programType assert here */
+}
+
+int
+VChildProcReconnectFS(void)	/* locked wrapper around VChildProcReconnectFS_r */
+{
+    int ret;
+    VOL_LOCK;
+    ret = VChildProcReconnectFS_r();
+    VOL_UNLOCK;
+    return ret;	/* whatever VChildProcReconnectFS_r() returned */
+}
+#endif /* FSSYNC_BUILD_CLIENT */
+
+
+/***************************************************/
+/* volume bitmap routines                          */
+/***************************************************/
+
+/*
+ * For demand attach fs, flags parameter controls
+ * locking behavior.  If (flags & VOL_ALLOC_BITMAP_WAIT)
+ * is set, then this function will create a reservation
+ * and block on any other exclusive operations.  Otherwise,
+ * this function assumes the caller already has exclusive
+ * access to vp, and we just change the volume state.
+ */
+VnodeId
+VAllocBitmapEntry_r(Error * ec, Volume * vp, 
+		    struct vnodeIndex *index, int flags)
+{
+    VnodeId ret;
+    register byte *bp, *ep;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    *ec = 0;
+
     /* This test is probably redundant */
     if (!VolumeWriteable(vp)) {
 	*ec = (bit32) VREADONLY;
 	return 0;
     }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (flags & VOL_ALLOC_BITMAP_WAIT) {
+	VCreateReservation_r(vp);
+	VWaitExclusiveState_r(vp);
+    }
+    state_save = VChangeState_r(vp, VOL_STATE_GET_BITMAP);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
 #ifdef BITMAP_LATER
     if ((programType == fileServer) && !index->bitmap) {
 	int i;
+#ifndef AFS_DEMAND_ATTACH_FS
+	/* demand attach fs uses the volume state to avoid races.
+	 * specialStatus field is not used at all */
 	int wasVBUSY = 0;
 	if (vp->specialStatus == VBUSY) {
 	    if (vp->goingOffline) {	/* vos dump waiting for the volume to
@@ -1423,33 +3895,49 @@ VAllocBitmapEntry_r(Error * ec, Volume * vp, register struct vnodeIndex
 					 * from AddNewReadableResidency */
 		wasVBUSY = 1;
 	    } else {
-		VOL_UNLOCK;
-		while (vp->specialStatus == VBUSY)
+		while (vp->specialStatus == VBUSY) {
 #ifdef AFS_PTHREAD_ENV
+		    VOL_UNLOCK;
 		    sleep(2);
+		    VOL_LOCK;
 #else /* AFS_PTHREAD_ENV */
 		    IOMGR_Sleep(2);
-#endif /* AFS_PTHREAD_ENV */
-		VOL_LOCK;
-	    }
-	}
-	if (!index->bitmap) {
-	    vp->specialStatus = VBUSY;	/* Stop anyone else from using it. */
-	    for (i = 0; i < nVNODECLASSES; i++) {
-		VOL_UNLOCK;
-		GetBitmap(ec, vp, i);
-		VOL_LOCK;
-		if (*ec) {
-		    vp->specialStatus = 0;
-		    vp->shuttingDown = 1;	/* Let who has it free it. */
-		    return NULL;
+#endif /* AFS_PTHREAD_ENV */
 		}
 	    }
+	}
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
+	if (!index->bitmap) {	/* re-checked: the non-DAFS wait above can drop the glock */
+#ifndef AFS_DEMAND_ATTACH_FS
+	    vp->specialStatus = VBUSY;	/* Stop anyone else from using it. */
+#endif /* !AFS_DEMAND_ATTACH_FS */
+	    for (i = 0; i < nVNODECLASSES; i++) {
+		VGetBitmap_r(ec, vp, i);
+		if (*ec) {
+#ifdef AFS_DEMAND_ATTACH_FS
+		    VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+		    *ec = VSALVAGING;
+#else /* AFS_DEMAND_ATTACH_FS */
+		    DeleteVolumeFromHashTable(vp);
+		    vp->shuttingDown = 1;	/* Let who has it free it. */
+		    vp->specialStatus = 0;
+#endif /* AFS_DEMAND_ATTACH_FS */
+		    ret = 0;	/* VnodeId is an integer type; 0 is the failure value, not NULL */
+		    goto done;
+		}
+	    }
+#ifndef AFS_DEMAND_ATTACH_FS
 	    if (!wasVBUSY)
 		vp->specialStatus = 0;	/* Allow others to have access. */
+#endif /* !AFS_DEMAND_ATTACH_FS */
 	}
     }
 #endif /* BITMAP_LATER */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
     bp = index->bitmap + index->bitmapOffset;
     ep = index->bitmap + index->bitmapSize;
     while (bp < ep) {
@@ -1460,7 +3948,11 @@ VAllocBitmapEntry_r(Error * ec, Volume * vp, register struct vnodeIndex
 		bp++;
 	    o = ffs(~*bp) - 1;	/* ffs is documented in BSTRING(3) */
 	    *bp |= (1 << o);
-	    return (VnodeId) ((bp - index->bitmap) * 8 + o);
+	    ret = (VnodeId) ((bp - index->bitmap) * 8 + o);
+#ifdef AFS_DEMAND_ATTACH_FS
+	    VOL_LOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+	    goto done;
 	}
 	bp += sizeof(bit32) /* i.e. 4 */ ;
     }
@@ -1474,7 +3966,19 @@ VAllocBitmapEntry_r(Error * ec, Volume * vp, register struct vnodeIndex
     index->bitmapOffset = index->bitmapSize;
     index->bitmapSize += VOLUME_BITMAP_GROWSIZE;
     *bp = 1;
-    return index->bitmapOffset * 8;
+    ret = index->bitmapOffset * 8;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_LOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ done:
+#ifdef AFS_DEMAND_ATTACH_FS
+    VChangeState_r(vp, state_save);
+    if (flags & VOL_ALLOC_BITMAP_WAIT) {
+	VCancelReservation_r(vp);
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+    return ret;
 }
 
 VnodeId
@@ -1482,7 +3986,7 @@ VAllocBitmapEntry(Error * ec, Volume * vp, register struct vnodeIndex * index)
 {
     VnodeId retVal;
     VOL_LOCK;
-    retVal = VAllocBitmapEntry_r(ec, vp, index);
+    retVal = VAllocBitmapEntry_r(ec, vp, index, VOL_ALLOC_BITMAP_WAIT);
     VOL_UNLOCK;
     return retVal;
 }
@@ -1492,6 +3996,7 @@ VFreeBitMapEntry_r(Error * ec, register struct vnodeIndex *index,
 		   unsigned bitNumber)
 {
     unsigned int offset;
+
     *ec = 0;
 #ifdef BITMAP_LATER
     if (!index->bitmap)
@@ -1516,70 +4021,13 @@ VFreeBitMapEntry(Error * ec, register struct vnodeIndex *index,
     VOL_UNLOCK;
 }
 
-void
-VUpdateVolume_r(Error * ec, Volume * vp)
-{
-    *ec = 0;
-    if (programType == fileServer)
-	V_uniquifier(vp) =
-	    (V_inUse(vp) ? V_nextVnodeUnique(vp) +
-	     200 : V_nextVnodeUnique(vp));
-    /*printf("Writing volume header for '%s'\n", V_name(vp)); */
-    WriteVolumeHeader_r(ec, vp);
-    if (*ec) {
-	Log("VUpdateVolume: error updating volume header, volume %u (%s)\n",
-	    V_id(vp), V_name(vp));
-	VForceOffline_r(vp);
-    }
-}
-
-void
-VUpdateVolume(Error * ec, Volume * vp)
-{
-    VOL_LOCK;
-    VUpdateVolume_r(ec, vp);
-    VOL_UNLOCK;
-}
-
-void
-VSyncVolume_r(Error * ec, Volume * vp)
-{
-    FdHandle_t *fdP;
-    VUpdateVolume_r(ec, vp);
-    if (!ec) {
-	int code;
-	fdP = IH_OPEN(V_diskDataHandle(vp));
-	assert(fdP != NULL);
-	code = FDH_SYNC(fdP);
-	assert(code == 0);
-	FDH_CLOSE(fdP);
-    }
-}
-
-void
-VSyncVolume(Error * ec, Volume * vp)
-{
-    VOL_LOCK;
-    VSyncVolume_r(ec, vp);
-    VOL_UNLOCK;
-}
-
+/* this function will drop the glock internally.
+ * for old pthread fileservers, this is safe thanks to vbusy.
+ *
+ * for demand attach fs, caller must have already called
+ * VCreateReservation_r and VWaitExclusiveState_r */
 static void
-FreeVolume(Volume * vp)
-{
-    int i;
-    if (!vp)
-	return;
-    for (i = 0; i < nVNODECLASSES; i++)
-	if (vp->vnodeIndex[i].bitmap)
-	    free(vp->vnodeIndex[i].bitmap);
-    FreeVolumeHeader(vp);
-    DeleteVolumeFromHashTable(vp);
-    free(vp);
-}
-
-static void
-GetBitmap(Error * ec, Volume * vp, VnodeClass class)
+VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class)
 {
     StreamHandle_t *file;
     int nVnodes;
@@ -1592,9 +4040,17 @@ GetBitmap(Error * ec, Volume * vp, VnodeClass class)
 #ifdef BITMAP_LATER
     byte *BitMap = 0;
 #endif /* BITMAP_LATER */
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+#endif /* AFS_DEMAND_ATTACH_FS */
 
     *ec = 0;
 
+#ifdef AFS_DEMAND_ATTACH_FS
+    state_save = VChangeState_r(vp, VOL_STATE_GET_BITMAP);
+#endif /* AFS_DEMAND_ATTACH_FS */
+    VOL_UNLOCK;
+
     fdP = IH_OPEN(vip->handle);
     assert(fdP != NULL);
     file = FDH_FDOPEN(fdP, "r");
@@ -1655,6 +4111,8 @@ GetBitmap(Error * ec, Volume * vp, VnodeClass class)
     STREAM_CLOSE(file);
     FDH_CLOSE(fdP);
     free(vnode);
+
+    VOL_LOCK;
 #ifdef BITMAP_LATER
     /* There may have been a racing condition with some other thread, both
      * creating the bitmaps for this volume. If the other thread was faster
@@ -1666,8 +4124,106 @@ GetBitmap(Error * ec, Volume * vp, VnodeClass class)
     } else
 	free((byte *) BitMap);
 #endif /* BITMAP_LATER */
+#ifdef AFS_DEMAND_ATTACH_FS
+    VChangeState_r(vp, state_save);
+#endif /* AFS_DEMAND_ATTACH_FS */
 }
 
+
+/***************************************************/
+/* demand attach fs state machine routines         */
+/***************************************************/
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* block on vp's attach CV until its attach state differs from the state seen on entry */
+static void
+VWaitStateChange_r(Volume * vp)
+{
+    VolState state_save = V_attachState(vp);
+
+    assert(vp->nWaiters || vp->nUsers);	/* vp must have at least one waiter or user */
+    do {
+	assert(pthread_cond_wait(&V_attachCV(vp), &vol_glock_mutex) == 0);	/* waits with vol_glock_mutex as the CV's mutex */
+    } while (V_attachState(vp) == state_save);
+    assert(V_attachState(vp) != VOL_STATE_FREED);
+}
+
+/* block on vp's attach CV for as long as IsExclusiveState() reports its attach state as exclusive */
+static void
+VWaitExclusiveState_r(Volume * vp)
+{
+    assert(vp->nWaiters || vp->nUsers);	/* vp must have at least one waiter or user */
+    while (IsExclusiveState(V_attachState(vp))) {
+	assert(pthread_cond_wait(&V_attachCV(vp), &vol_glock_mutex) == 0);	/* waits with vol_glock_mutex as the CV's mutex */
+    }
+    assert(V_attachState(vp) != VOL_STATE_FREED);
+}
+
+/* move vp into new_state, wake every thread waiting on the
+ * volume's attach CV, and hand back the state vp was in before */
+VolState
+VChangeState_r(Volume * vp, VolState new_state)
+{
+    const VolState prev_state = V_attachState(vp);
+
+    /* XXX profiling need to make sure these counters
+     * don't kill performance... */
+    VStats.state_levels[new_state]++;
+    VStats.state_levels[prev_state]--;
+
+    V_attachState(vp) = new_state;
+    assert(pthread_cond_broadcast(&V_attachCV(vp)) == 0);
+    return prev_state;
+}
+
+/* report whether a volume state is one that requires
+ * exclusive access without holding glock */
+static int
+IsExclusiveState(VolState state)
+{
+    int exclusive;
+
+    exclusive = (state == VOL_STATE_UPDATING) ||
+	(state == VOL_STATE_ATTACHING) ||
+	(state == VOL_STATE_GET_BITMAP) ||
+	(state == VOL_STATE_HDR_LOADING) ||
+	(state == VOL_STATE_HDR_ATTACHING) ||
+	(state == VOL_STATE_OFFLINING) ||
+	(state == VOL_STATE_DETACHING);
+
+    return exclusive;
+}
+
+/* report whether a V_attachState value is an error condition
+ * (the error state or the salvaging state) */
+static int
+IsErrorState(VolState state)
+{
+    if (state == VOL_STATE_ERROR)
+	return 1;
+    if (state == VOL_STATE_SALVAGING)
+	return 1;
+    return 0;
+}
+
+/* report whether a V_attachState value is valid: it must lie
+ * in [0, VOL_STATE_COUNT) and must not be VOL_STATE_FREED */
+static int
+IsValidState(VolState state)
+{
+    if (state < 0 || state >= VOL_STATE_COUNT)
+	return 0;
+    if (state == VOL_STATE_FREED)
+	return 0;
+    return 1;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* Volume Path and Volume Number utility routines  */
+/***************************************************/
+
 static void
 GetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep)
 {
@@ -1714,6 +4270,17 @@ VolumeExternalName(VolumeId volumeId)
     return name;
 }
 
+static int
+VolumeExternalName_r(VolumeId volumeId, char * name, size_t len)
+{
+    return afs_snprintf(name, len, VFORMAT, volumeId);	/* formats volumeId with VFORMAT into the caller's buffer; returns afs_snprintf's result */
+}
+
+
+/***************************************************/
+/* Volume Usage Statistics routines                */
+/***************************************************/
+
 #if OPENAFS_VOL_STATS
 #define OneDay	(86400)		/* 24 hours' worth of seconds */
 #else
@@ -1750,7 +4317,7 @@ VAdjustVolumeStatistics_r(register Volume * vp)
     unsigned int now = FT_ApproxTime();
 
     if (now - V_dayUseDate(vp) > OneDay) {
-	register ndays, i;
+	register int ndays, i;
 
 	ndays = (now - V_dayUseDate(vp)) / OneDay;
 	for (i = 6; i > ndays - 1; i--)
@@ -1799,7 +4366,7 @@ VBumpVolumeUsage_r(register Volume * vp)
      */
     if ((V_dayUse(vp)++ & 127) == 0) {
 	Error error;
-	VUpdateVolume_r(&error, vp);
+	VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT);
     }
 }
 
@@ -1814,7 +4381,9 @@ VBumpVolumeUsage(register Volume * vp)
 void
 VSetDiskUsage_r(void)
 {
+#ifndef AFS_DEMAND_ATTACH_FS
     static int FifteenMinuteCounter = 0;
+#endif
 
     while (VInit < 2) {
 	/* NOTE: Don't attempt to access the partitions list until the
@@ -1828,10 +4397,13 @@ VSetDiskUsage_r(void)
     }
 
     VResetDiskUsage_r();
+
+#ifndef AFS_DEMAND_ATTACH_FS
     if (++FifteenMinuteCounter == 3) {
 	FifteenMinuteCounter = 0;
 	VScanUpdateList();
     }
+#endif /* !AFS_DEMAND_ATTACH_FS */
 }
 
 void
@@ -1842,14 +4414,28 @@ VSetDiskUsage(void)
     VOL_UNLOCK;
 }
 
+
+/***************************************************/
+/* Volume Update List routines                     */
+/***************************************************/
+
 /* The number of minutes that a volume hasn't been updated before the
  * "Dont salvage" flag in the volume header will be turned on */
 #define SALVAGE_INTERVAL	(10*60)
 
-static VolumeId *UpdateList;	/* Pointer to array of Volume ID's */
-static int nUpdatedVolumes;	/* Updated with entry in UpdateList, salvage after crash flag on */
-static int updateSize;		/* number of entries possible */
-#define UPDATE_LIST_SIZE 100	/* size increment */
+/*
+ * demand attach fs
+ *
+ * volume update list functionality has been moved into the VLRU
+ * the DONT_SALVAGE flag is now set during VLRU demotion
+ */
+
+#ifndef AFS_DEMAND_ATTACH_FS
+static VolumeId *UpdateList = NULL;	/* Pointer to array of Volume ID's */
+static int nUpdatedVolumes = 0;	        /* Updated with entry in UpdateList, salvage after crash flag on */
+static int updateSize = 0;		/* number of entries possible */
+#define UPDATE_LIST_SIZE 128	        /* initial size increment (must be a power of 2!) */
+#endif /* !AFS_DEMAND_ATTACH_FS */
 
 void
 VAddToVolumeUpdateList_r(Error * ec, Volume * vp)
@@ -1859,15 +4445,22 @@ VAddToVolumeUpdateList_r(Error * ec, Volume * vp)
     if (V_dontSalvage(vp) == 0)
 	return;
     V_dontSalvage(vp) = 0;
-    VSyncVolume_r(ec, vp);
+    VSyncVolume_r(ec, vp, 0);
+#ifdef AFS_DEMAND_ATTACH_FS
+    V_attachFlags(vp) &= ~(VOL_HDR_DONTSALV);
+#else /* !AFS_DEMAND_ATTACH_FS */
     if (*ec)
 	return;
-    if (!UpdateList) {
+    if (UpdateList == NULL) {
 	updateSize = UPDATE_LIST_SIZE;
 	UpdateList = (VolumeId *) malloc(sizeof(VolumeId) * updateSize);
     } else {
 	if (nUpdatedVolumes == updateSize) {
-	    updateSize += UPDATE_LIST_SIZE;
+	    if (updateSize >= 524288) {	/* cap reached: refuse to grow, and leave updateSize equal to the real allocation */
+		Log("warning: there is likely a bug in the volume update scanner\n");
+		return;
+	    }
+	    updateSize <<= 1;	/* double the capacity; a bare "updateSize << 1" discards the result and never grows the list */
 	    UpdateList =
 		(VolumeId *) realloc(UpdateList,
 				     sizeof(VolumeId) * updateSize);
@@ -1875,8 +4468,10 @@ VAddToVolumeUpdateList_r(Error * ec, Volume * vp)
     }
     assert(UpdateList != NULL);
     UpdateList[nUpdatedVolumes++] = V_id(vp);
+#endif /* !AFS_DEMAND_ATTACH_FS */
 }
 
+#ifndef AFS_DEMAND_ATTACH_FS
 static void
 VScanUpdateList(void)
 {
@@ -1886,41 +4481,994 @@ VScanUpdateList(void)
     afs_uint32 now = FT_ApproxTime();
     /* Be careful with this code, since it works with interleaved calls to AddToVolumeUpdateList */
     for (i = gap = 0; i < nUpdatedVolumes; i++) {
+	if (gap)
+	    UpdateList[i - gap] = UpdateList[i];
+
+	/* XXX this routine needlessly messes up the Volume LRU by
+	 * breaking the LRU temporal-locality assumptions.....
+	 * we should use a special volume header allocator here */
 	vp = VGetVolume_r(&error, UpdateList[i - gap] = UpdateList[i]);
 	if (error) {
 	    gap++;
 	} else if (vp->nUsers == 1 && now - vp->updateTime > SALVAGE_INTERVAL) {
 	    V_dontSalvage(vp) = DONT_SALVAGE;
-	    VUpdateVolume_r(&error, vp);	/* No need to fsync--not critical */
+	    VUpdateVolume_r(&error, vp, 0);	/* No need to fsync--not critical */
 	    gap++;
 	}
-	if (vp)
+
+	if (vp) {
 	    VPutVolume_r(vp);
+	}
+
 #ifndef AFS_PTHREAD_ENV
 	IOMGR_Poll();
 #endif /* !AFS_PTHREAD_ENV */
     }
     nUpdatedVolumes -= gap;
 }
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
 
 /***************************************************/
-/* Add on routines to manage a volume header cache */
+/* Volume LRU routines                             */
 /***************************************************/
 
-static struct volHeader *volumeLRU;
+/* demand attach fs
+ * volume LRU
+ *
+ * with demand attach fs, we attempt to soft detach(1)
+ * volumes which have not been accessed in a long time
+ * in order to speed up fileserver shutdown
+ *
+ * (1) by soft detach we mean a process very similar
+ *     to VOffline, except the final state of the 
+ *     Volume will be VOL_STATE_PREATTACHED, instead
+ *     of the usual VOL_STATE_UNATTACHED
+ */
+#ifdef AFS_DEMAND_ATTACH_FS
+
+/* implementation is reminiscent of a generational GC
+ *
+ * queue 0 is newly attached volumes. this queue is
+ * sorted by attach timestamp
+ *
+ * queue 1 is volumes that have been around a bit
+ * longer than queue 0. this queue is sorted by
+ * attach timestamp
+ *
+ * queue 2 is volumes that have been around the longest.
+ * this queue is unsorted
+ *
+ * queue 3 is volumes that have been marked as
+ * candidates for soft detachment. this queue is
+ * unsorted
+ */
+#define VLRU_GENERATIONS  3   /* number of generations in VLRU */
+#define VLRU_QUEUES       5   /* total number of VLRU queues */
+struct VLRU_q {
+    volatile struct rx_queue q;	/* queue head; set up by queue_Init in VInitVLRU */
+    volatile int len;		/* number of volumes on this queue; adjusted alongside each insert/remove */
+    volatile int busy;		/* zeroed in VInitVLRU; presumably the exclusive-access flag -- TODO confirm */
+    pthread_cond_t cv;		/* initialized in VInitVLRU; presumably waited on by VLRU_Wait_r -- TODO confirm */
+};
+struct VLRU {
+    struct VLRU_q q[VLRU_QUEUES];                       /* the VLRU queues, indexed by a volume's vlru.idx */
+
+    /* VLRU config (intervals are logged in seconds by VInitVLRU) */
+    afs_uint32 promotion_interval[VLRU_GENERATIONS-1];  /* interval between promotions */
+    afs_uint32 scan_interval[VLRU_GENERATIONS+1];       /* interval between scans for candidates */
+
+    /* state */
+    int next_idx;                                       /* NOTE(review): not referenced in the code visible here -- confirm use */
+    afs_uint32 last_promotion[VLRU_GENERATIONS-1];      /* timestamp of last promotion scan */
+    afs_uint32 last_scan[VLRU_GENERATIONS+1];           /* timestamp of last detach scan */
+
+    int scanner_state;                                  /* state of scanner thread (a VLRU_SCANNER_STATE_* value) */
+    pthread_cond_t cv;                                  /* state transition CV */
+};
+
+static struct VLRU volume_LRU;
+
+/* valid scanner states */
+#define VLRU_SCANNER_STATE_OFFLINE        0
+#define VLRU_SCANNER_STATE_ONLINE         1
+#define VLRU_SCANNER_STATE_SHUTTING_DOWN  2
+#define VLRU_SCANNER_STATE_PAUSING        3
+#define VLRU_SCANNER_STATE_PAUSED         4
+
+/* vlru disk data header stuff */
+#define VLRU_DISK_MAGIC      0x7a8b9cad
+#define VLRU_DISK_VERSION    1
+
+/* vlru default expiration time (for eventual fs state serialization of vlru data) */
+#define VLRU_DUMP_EXPIRATION_TIME   (60*60*24*7)  /* expire vlru data after 1 week */
+
+
+static afs_uint32 VLRU_offline_thresh = VLRU_DEFAULT_OFFLINE_THRESH;
+static afs_uint32 VLRU_offline_interval = VLRU_DEFAULT_OFFLINE_INTERVAL;
+static afs_uint32 VLRU_offline_max = VLRU_DEFAULT_OFFLINE_MAX;
+static afs_uint32 VLRU_enabled = 1;
+
+/* queue synchronization routines */
+static void VLRU_BeginExclusive_r(struct VLRU_q * q);
+static void VLRU_EndExclusive_r(struct VLRU_q * q);
+static void VLRU_Wait_r(struct VLRU_q * q);
+
+/* set one VLRU tuning parameter, then recompute the derived
+ * timing constants
+ *
+ * valid options are:
+ *  VLRU_SET_THRESH -- the period of inactivity after which
+ *    volumes are eligible for being detached
+ *  VLRU_SET_INTERVAL -- the time interval between calls to
+ *    the volume LRU "garbage collector"
+ *  VLRU_SET_MAX -- the max number of volumes to deallocate
+ *    in one GC pass
+ *  VLRU_SET_ENABLED -- nonzero turns the VLRU on, zero off
+ * any other option value leaves all four settings untouched
+ */
+void
+VLRU_SetOptions(int option, afs_uint32 val)
+{
+    switch (option) {
+    case VLRU_SET_THRESH:   VLRU_offline_thresh = val;   break;
+    case VLRU_SET_INTERVAL: VLRU_offline_interval = val; break;
+    case VLRU_SET_MAX:      VLRU_offline_max = val;      break;
+    case VLRU_SET_ENABLED:  VLRU_enabled = val;          break;
+    }
+    VLRU_ComputeConstants();
+}
+
+/* compute the VLRU internal timing parameters based upon the user's inputs */
+static void
+VLRU_ComputeConstants(void)
+{
+    /* thresh:interval ratio; a zero interval is treated as an unbounded ratio instead of being divided by */
+    afs_uint32 factor = VLRU_offline_interval ? (VLRU_offline_thresh / VLRU_offline_interval) : ~((afs_uint32) 0);
+
+    /* compute the candidate scan interval */
+    volume_LRU.scan_interval[VLRU_QUEUE_CANDIDATE] = VLRU_offline_interval;
+
+    /* compute the promotion intervals */
+    volume_LRU.promotion_interval[VLRU_QUEUE_NEW] = VLRU_offline_thresh * 2;
+    volume_LRU.promotion_interval[VLRU_QUEUE_MID] = VLRU_offline_thresh * 4;
+
+    /* compute the gen 0 scan interval */
+    if (factor > 16) {
+	volume_LRU.scan_interval[VLRU_QUEUE_NEW] = VLRU_offline_thresh / 8;
+    } else {
+	volume_LRU.scan_interval[VLRU_QUEUE_NEW] = VLRU_offline_interval * 2;
+    }
+}
+
+/* initialize the VLRU queues and timing constants; in the fileserver, also start the scanner thread */
+static void
+VInitVLRU(void)
+{
+    pthread_t tid;
+    pthread_attr_t attrs;
+    int i;
+
+    if (!VLRU_enabled) {
+	Log("VLRU: disabled\n");
+	return;
+    }
+
+    /* initialize each of the VLRU queues */
+    for (i = 0; i < VLRU_QUEUES; i++) {
+	queue_Init(&volume_LRU.q[i]);
+	volume_LRU.q[i].len = 0;
+	volume_LRU.q[i].busy = 0;
+	assert(pthread_cond_init(&volume_LRU.q[i].cv, NULL) == 0);
+    }
+
+    /* setup the timing constants */
+    VLRU_ComputeConstants();
+
+    /* XXX put inside LogLevel check? (the values are afs_uint32, hence %u) */
+    Log("VLRU: starting scanner with the following configuration parameters:\n");
+    Log("VLRU:  offlining volumes after minimum of %u seconds of inactivity\n", VLRU_offline_thresh);
+    Log("VLRU:  running VLRU soft detach pass every %u seconds\n", VLRU_offline_interval);
+    Log("VLRU:  taking up to %u volumes offline per pass\n", VLRU_offline_max);
+    Log("VLRU:  scanning generation 0 for inactive volumes every %u seconds\n", volume_LRU.scan_interval[0]);
+    Log("VLRU:  scanning for promotion/demotion between generations 0 and 1 every %u seconds\n", volume_LRU.promotion_interval[0]);
+    Log("VLRU:  scanning for promotion/demotion between generations 1 and 2 every %u seconds\n", volume_LRU.promotion_interval[1]);
+
+    /* start up the VLRU scanner (fileserver only; created detached) */
+    volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE;
+    if (programType == fileServer) {
+	assert(pthread_cond_init(&volume_LRU.cv, NULL) == 0);
+	assert(pthread_attr_init(&attrs) == 0);
+	assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+	assert(pthread_create(&tid, &attrs, &VLRU_ScannerThread, NULL) == 0);
+    }
+}
+
+/* initialize LRU support for a volume: mark it as being on no VLRU queue (no-op when the VLRU is disabled) */
+static void
+VLRU_Init_Node_r(volatile Volume * vp)
+{
+    if (!VLRU_enabled)
+	return;
+
+    assert(queue_IsNotOnQueue(&vp->vlru));	/* vp must not already be linked into a queue */
+    vp->vlru.idx = VLRU_QUEUE_INVALID;
+}
+
+/* add volume to the VLRU queue named by vp->vlru.idx, or to the
+ * new queue when that index is out of range (queues other than
+ * new are supported for vlru state restore)
+ * caller MUST hold a ref count on vp */
+static void
+VLRU_Add_r(volatile Volume * vp)
+{
+    int idx;
+
+    if (!VLRU_enabled)
+	return;
+
+    if (queue_IsOnQueue(&vp->vlru))
+	return;
+
+    VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_NEW]);	/* NOTE(review): waits on the NEW queue even when idx below names another queue -- confirm */
+
+    /* repeat check since VLRU_Wait_r may have dropped
+     * the glock */
+    if (queue_IsNotOnQueue(&vp->vlru)) {
+	idx = vp->vlru.idx;
+	if ((idx < 0) || (idx >= VLRU_QUEUE_INVALID)) {
+	    idx = vp->vlru.idx = VLRU_QUEUE_NEW;	/* out-of-range index: fall back to the new queue */
+	}
+	queue_Prepend(&volume_LRU.q[idx], &vp->vlru);
+	volume_LRU.q[idx].len++;
+	V_attachFlags(vp) |= VOL_ON_VLRU;
+	vp->stats.last_promote = FT_ApproxTime();	/* the promotion clock starts at insertion time */
+    }
+}
+
+/* delete volume from whichever VLRU queue it is on
+ * caller MUST hold a ref count on vp */
+static void
+VLRU_Delete_r(volatile Volume * vp)
+{
+    int idx;
+
+    if (!VLRU_enabled)
+	return;
+
+    if (queue_IsNotOnQueue(&vp->vlru))
+	return;
+
+    /* handle races: retry if vp changed queues while we waited */
+    do {
+      idx = vp->vlru.idx;
+      if (idx == VLRU_QUEUE_INVALID)
+	  return;
+      VLRU_Wait_r(&volume_LRU.q[idx]);
+    } while (idx != vp->vlru.idx);
+
+    /* now remove from the VLRU and update 
+     * the appropriate counter */
+    queue_Remove(&vp->vlru);
+    volume_LRU.q[idx].len--;
+    vp->vlru.idx = VLRU_QUEUE_INVALID;
+    V_attachFlags(vp) &= ~(VOL_ON_VLRU);
+}
+
+/* signal that volume was just accessed.
+ * caller MUST hold a ref count on vp */
+static void
+VLRU_UpdateAccess_r(volatile Volume * vp)
+{
+    /* non-NULL only when this call took a reservation and the queue locks */
+    Volume * rvp = NULL;
+
+    if (!VLRU_enabled)
+	return;
+
+    if (queue_IsNotOnQueue(&vp->vlru))
+	return;
+
+    assert(V_attachFlags(vp) & VOL_ON_VLRU);
+
+    /* update the access timestamp */
+    vp->stats.last_get = FT_ApproxTime();
+
+    /*
+     * if the volume is on the soft detach candidate
+     * list, we need to safely move it back to a
+     * regular generation.  this has to be done
+     * carefully so we don't race against the scanner
+     * thread.
+     */
+
+    /* if this volume is on the soft detach candidate queue,
+     * then grab exclusive access to the necessary queues */
+    if (vp->vlru.idx == VLRU_QUEUE_CANDIDATE) {
+	rvp = (Volume *) vp;	/* explicit cast: the reservation routines take a non-volatile Volume * */
+	VCreateReservation_r(rvp);
+
+	VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
+	VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
+	VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+	VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+    }
+
+    /* make sure multiple threads don't race to update */
+    if (vp->vlru.idx == VLRU_QUEUE_CANDIDATE) {
+	VLRU_SwitchQueues(vp, VLRU_QUEUE_NEW, 1);
+    }
+
+    if (rvp) {
+	VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+	VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
+	VCancelReservation_r(rvp);
+    }
+}
+
+/* switch a volume between two VLRU queues; append nonzero adds at the tail of new_idx, zero at the head */
+static void
+VLRU_SwitchQueues(volatile Volume * vp, int new_idx, int append)
+{
+    if (queue_IsNotOnQueue(&vp->vlru))
+	return;
+
+    queue_Remove(&vp->vlru);
+    volume_LRU.q[vp->vlru.idx].len--;	/* vlru.idx still names the old queue at this point */
+    
+    /* put the volume back on the correct generational queue */
+    if (append) {
+	queue_Append(&volume_LRU.q[new_idx], &vp->vlru);
+    } else {
+	queue_Prepend(&volume_LRU.q[new_idx], &vp->vlru);
+    }
+
+    volume_LRU.q[new_idx].len++;
+    vp->vlru.idx = new_idx;
+}
+
+/* VLRU GC thread: repeatedly works out which scan or promotion pass is
+ * due next, sleeps until then, and runs it under the glock; exits once
+ * scanner_state becomes VLRU_SCANNER_STATE_SHUTTING_DOWN */
+static void *
+VLRU_ScannerThread(void * args)
+{
+    afs_uint32 now, min_delay, delay;
+    int i, min_idx, min_op, overdue, state;
+
+    /* set t=0 for promotion cycle to be 
+     * fileserver startup */
+    now = FT_ApproxTime();
+    for (i=0; i < VLRU_GENERATIONS-1; i++) {
+	volume_LRU.last_promotion[i] = now;
+    }
+
+    /* don't start the scanner until VLRU_offline_thresh
+     * plus a small delay for VInitVolumePackage to finish
+     * has gone by */
+
+    sleep(VLRU_offline_thresh + 60);
+
+    /* set t=0 for scan cycle to be now */
+    now = FT_ApproxTime();
+    for (i=0; i < VLRU_GENERATIONS+1; i++) {
+	volume_LRU.last_scan[i] = now;
+    }
+
+    VOL_LOCK;
+    if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_OFFLINE) {
+	volume_LRU.scanner_state = VLRU_SCANNER_STATE_ONLINE;
+    }
+
+    while ((state = volume_LRU.scanner_state) != VLRU_SCANNER_STATE_SHUTTING_DOWN) {
+	/* check to see if we've been asked to pause */
+	if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSING) {
+	    volume_LRU.scanner_state = VLRU_SCANNER_STATE_PAUSED;
+	    assert(pthread_cond_broadcast(&volume_LRU.cv) == 0);
+	    do {
+		assert(pthread_cond_wait(&volume_LRU.cv, &vol_glock_mutex) == 0);
+	    } while (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSED);
+	}
+
+	/* scheduling can happen outside the glock */
+	VOL_UNLOCK;
+
+	/* figure out what is next on the schedule */
+
+	/* figure out a potential schedule for the new generation first */
+	overdue = 0;
+	min_delay = volume_LRU.scan_interval[0] + volume_LRU.last_scan[0] - now;
+	min_idx = 0;
+	min_op = 0;
+	if (min_delay > volume_LRU.scan_interval[0]) {
+	    /* unsigned overflow -- we're overdue to run this scan */
+	    min_delay = 0;
+	    overdue = 1;
+	}
+
+	/* if we're not overdue for gen 0, figure out schedule for candidate gen */
+	if (!overdue) {
+	    i = VLRU_QUEUE_CANDIDATE;
+	    delay = volume_LRU.scan_interval[i] + volume_LRU.last_scan[i] - now;
+	    if (delay < min_delay) {
+		min_delay = delay;
+		min_idx = i;
+	    }
+	    if (delay > volume_LRU.scan_interval[i]) {
+		/* unsigned overflow -- we're overdue to run this scan */
+		min_delay = 0;
+		min_idx = i;
+		overdue = 1;
+		/* fall through to run the scan: a break here would leave the while loop without the glock held */
+	    }
+	}
+
+	/* if we're still not overdue for something, figure out schedules for promotions */
+	for (i=0; !overdue && i < VLRU_GENERATIONS-1; i++) {
+	    delay = volume_LRU.promotion_interval[i] + volume_LRU.last_promotion[i] - now;
+	    if (delay < min_delay) {
+		min_delay = delay;
+		min_idx = i;
+		min_op = 1;
+	    }
+	    if (delay > volume_LRU.promotion_interval[i]) {
+		/* unsigned overflow -- we're overdue to run this promotion */
+		min_delay = 0;
+		min_idx = i;
+		min_op = 1;
+		overdue = 1;
+		break;
+	    }
+	}
+
+	/* sleep as needed */
+	if (min_delay) {
+	    sleep(min_delay);
+	}
+
+	/* do whatever is next */
+	VOL_LOCK;
+	if (min_op) {
+	    VLRU_Promote_r(min_idx);
+	    VLRU_Demote_r(min_idx+1);
+	} else {
+	    VLRU_Scan_r(min_idx);
+	}
+	now = FT_ApproxTime();
+    }
+
+    Log("VLRU scanner asked to go offline (scanner_state=%d)\n", state);
+
+    /* signal that scanner is down */
+    volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE;
+    assert(pthread_cond_broadcast(&volume_LRU.cv) == 0);
+    VOL_UNLOCK;
+    return NULL;
+}
+
+/* run the promotions: move eligible volumes from VLRU queue idx to queue idx+1 (glock is dropped during the walk and re-taken before return) */
+static void
+VLRU_Promote_r(int idx)
+{
+    int len, chaining, promote;
+    afs_uint32 now, thresh;
+    struct rx_queue *qp, *nqp;
+    Volume * vp, *start, *end;	/* start/end bound the current run of promotable volumes */
+
+    /* get exclusive access to two chains, and drop the glock */
+    VLRU_Wait_r(&volume_LRU.q[idx]);
+    VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
+    VLRU_Wait_r(&volume_LRU.q[idx+1]);
+    VLRU_BeginExclusive_r(&volume_LRU.q[idx+1]);
+    VOL_UNLOCK;
+
+    thresh = volume_LRU.promotion_interval[idx];
+    now = FT_ApproxTime();
+
+    len = chaining = 0;	/* len counts promoted volumes; chaining is set while a run is open */
+    for (queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
+	vp = (Volume *)((char *)qp - offsetof(Volume, vlru));	/* recover the Volume from its embedded vlru link */
+	promote = (((vp->stats.last_promote + thresh) <= now) &&
+		   (vp->stats.last_get >= vp->stats.last_promote));	/* old enough, and accessed since its last promotion */
+
+	if (chaining) {
+	    if (promote) {
+		vp->vlru.idx++;
+		len++;
+		start = vp;	/* scanning backwards, so the run grows at its front */
+	    } else {
+		/* promote and prepend chain */
+		queue_MoveChainAfter(&volume_LRU.q[idx+1], &start->vlru, &end->vlru);
+		chaining = 0;
+	    }
+	} else {
+	    if (promote) {
+		vp->vlru.idx++;
+		len++;
+		chaining = 1;
+		start = end = vp;	/* open a new run of one volume */
+	    }
+	}
+    }
+
+    if (chaining) {
+	/* promote and prepend the run still open at the end of the scan */
+	queue_MoveChainAfter(&volume_LRU.q[idx+1], &start->vlru, &end->vlru);
+    }
+
+    if (len) {
+	volume_LRU.q[idx].len -= len;
+	volume_LRU.q[idx+1].len += len;
+    }
+
+    /* release exclusive access to the two chains */
+    VOL_LOCK;
+    volume_LRU.last_promotion[idx] = now;
+    VLRU_EndExclusive_r(&volume_LRU.q[idx+1]);
+    VLRU_EndExclusive_r(&volume_LRU.q[idx]);
+}
+
+/* run the demotions: move idle volumes from VLRU queue idx to queue idx-1, then set DONT_SALVAGE on demoted volumes that qualify */
+static void
+VLRU_Demote_r(int idx)
+{
+    Error ec;
+    int len, chaining, demote;
+    afs_uint32 now, thresh;
+    struct rx_queue *qp, *nqp;
+    Volume * vp, *start, *end;	/* start/end bound the current run of demotable volumes */
+    Volume ** salv_flag_vec = NULL;	/* volumes to revisit for DONT_SALVAGE once the glock is re-taken */
+    int salv_vec_offset = 0;	/* number of entries used in salv_flag_vec */
+
+    assert(idx == VLRU_QUEUE_MID || idx == VLRU_QUEUE_OLD);
+
+    /* get exclusive access to two chains, and drop the glock */
+    VLRU_Wait_r(&volume_LRU.q[idx-1]);
+    VLRU_BeginExclusive_r(&volume_LRU.q[idx-1]);
+    VLRU_Wait_r(&volume_LRU.q[idx]);
+    VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
+    VOL_UNLOCK;
+
+    /* no big deal if this allocation fails: DONT_SALVAGE flagging is then skipped for this pass */
+    if (volume_LRU.q[idx].len) {
+	salv_flag_vec = (Volume **) malloc(volume_LRU.q[idx].len * sizeof(Volume *));
+    }
+
+    now = FT_ApproxTime();
+    thresh = volume_LRU.promotion_interval[idx-1];
+
+    len = chaining = 0;	/* len counts demoted volumes; chaining is set while a run is open */
+    for (queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
+	vp = (Volume *)((char *)qp - offsetof(Volume, vlru));	/* recover the Volume from its embedded vlru link */
+	demote = (((vp->stats.last_promote + thresh) <= now) &&
+		  (vp->stats.last_get < (now - thresh)));	/* old enough, and not accessed within thresh */
+
+	/* we now do volume update list DONT_SALVAGE flag setting during
+	 * demotion passes */
+	if (salv_flag_vec &&
+	    !(V_attachFlags(vp) & VOL_HDR_DONTSALV) &&
+	    demote && 
+	    (vp->updateTime < (now - SALVAGE_INTERVAL)) &&
+	    (V_attachState(vp) == VOL_STATE_ATTACHED)) {
+	    salv_flag_vec[salv_vec_offset++] = vp;
+	    VCreateReservation_r(vp);	/* NOTE(review): called after VOL_UNLOCK above -- confirm this is safe without the glock */
+	}
+
+	if (chaining) {
+	    if (demote) {
+		vp->vlru.idx--;
+		len++;
+		start = vp;	/* scanning backwards, so the run grows at its front */
+	    } else {
+		/* demote and append chain */
+		queue_MoveChainBefore(&volume_LRU.q[idx-1], &start->vlru, &end->vlru);
+		chaining = 0;
+	    }
+	} else {
+	    if (demote) {
+		vp->vlru.idx--;
+		len++;
+		chaining = 1;
+		start = end = vp;	/* open a new run of one volume */
+	    }
+	}
+    }
+
+    if (chaining) {
+	queue_MoveChainBefore(&volume_LRU.q[idx-1], &start->vlru, &end->vlru);	/* flush the run still open at the end of the scan */
+    }
+
+    if (len) {
+	volume_LRU.q[idx].len -= len;
+	volume_LRU.q[idx-1].len += len;
+    }
+
+    /* release exclusive access to the two chains */
+    VOL_LOCK;
+    VLRU_EndExclusive_r(&volume_LRU.q[idx]);
+    VLRU_EndExclusive_r(&volume_LRU.q[idx-1]);
+
+    /* now go back and set the DONT_SALVAGE flags as appropriate */
+    if (salv_flag_vec) {
+	int i;
+	for (i = 0; i < salv_vec_offset; i++) {
+	    vp = salv_flag_vec[i];
+	    if (!(V_attachFlags(vp) & VOL_HDR_DONTSALV) &&
+		(vp->updateTime < (now - SALVAGE_INTERVAL)) &&
+		(V_attachState(vp) == VOL_STATE_ATTACHED)) {	/* conditions are re-checked now that the glock is held */
+		ec = VHold_r(vp);
+		if (!ec) {
+		    V_attachFlags(vp) |= VOL_HDR_DONTSALV;
+		    V_dontSalvage(vp) = DONT_SALVAGE;
+		    VUpdateVolume_r(&ec, vp, 0);
+		    VPutVolume_r(vp);
+		}
+	    }
+	    VCancelReservation_r(vp);	/* pairs with the VCreateReservation_r made during the scan */
+	}
+	free(salv_flag_vec);
+    }
+}
+
+/* run a pass of the VLRU GC scanner over queue idx (the new queue or the candidate queue) */
+static void
+VLRU_Scan_r(int idx)
+{
+    afs_uint32 now, thresh;
+    struct rx_queue *qp, *nqp;
+    volatile Volume * vp;
+    int i, locked = 1;	/* locked records whether the glock is held when the scan finishes */
+
+    assert(idx == VLRU_QUEUE_NEW || idx == VLRU_QUEUE_CANDIDATE);
+
+    /* gain exclusive access to the idx VLRU */
+    VLRU_Wait_r(&volume_LRU.q[idx]);
+    VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
+
+    if (idx != VLRU_QUEUE_CANDIDATE) {
+	/* gain exclusive access to the candidate VLRU */
+	VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+	VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+    }
+
+    now = FT_ApproxTime();
+    thresh = now - VLRU_offline_thresh;	/* volumes last accessed at or before this time are eligible */
+
+    /* perform candidate selection and soft detaching */
+    if (idx == VLRU_QUEUE_CANDIDATE) {
+	/* soft detach some volumes from the candidate pool */
+	VOL_UNLOCK;
+	locked = 0;
+
+	for (i=0,queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) {	/* i=0 joins the macro's init clause; i counts detached volumes */
+	    vp = (Volume *)((char *)qp - offsetof(Volume, vlru));
+	    if (i >= VLRU_offline_max) {
+		break;
+	    }
+	    /* check timestamp to see if it's a candidate for soft detaching */
+	    if (vp->stats.last_get <= thresh) {
+		VOL_LOCK;
+		if (VCheckSoftDetach(vp, thresh))
+		    i++;
+		VOL_UNLOCK;
+	    }
+	}
+    } else {
+	/* scan for volumes to become soft detach candidates */
+	for (i=1,queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue),i++) {	/* i joins the macro's init and step clauses; counts visited volumes */
+	    vp = (Volume *)((char *)qp - offsetof(Volume, vlru));
+
+	    /* check timestamp to see if it's a candidate for soft detaching */
+	    if (vp->stats.last_get <= thresh) {
+		VCheckSoftDetachCandidate(vp, thresh);
+	    }
+
+	    if (!(i&0x7f)) {   /* lock coarsening optimization: yield the glock once every 128 volumes */
+		VOL_UNLOCK;
+		pthread_yield();
+		VOL_LOCK;
+	    }
+	}
+    }
+
+    /* relinquish exclusive access to the VLRU chains */
+    if (!locked) {
+	VOL_LOCK;
+    }
+    volume_LRU.last_scan[idx] = now;
+    if (idx != VLRU_QUEUE_CANDIDATE) {
+	VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+    }
+    VLRU_EndExclusive_r(&volume_LRU.q[idx]);
+}
+
+/* check whether volume is safe to soft detach
+ * caller MUST NOT hold a ref count on vp */
+static int
+VCheckSoftDetach(volatile Volume * vp, afs_uint32 thresh)
+{
+    /* a volume with users or waiters is never soft detached */
+    if (vp->nUsers || vp->nWaiters)
+	return 0;
+
+    /* touched since the cutoff: leave it attached */
+    if (vp->stats.last_get > thresh) {
+	return 0;
+    }
+
+    return VSoftDetachVolume_r(vp, thresh);
+}
+
+/* check whether volume should be made a 
+ * soft detach candidate */
+static int
+VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh)
+{
+    int queue_idx;
+    /* volumes with users or waiters stay where they are */
+    if (vp->nUsers || vp->nWaiters)
+	return 0;
+
+    /* only volumes on the new queue are expected here */
+    queue_idx = vp->vlru.idx;
+    assert(queue_idx == VLRU_QUEUE_NEW);
+
+    /* used since the cutoff: not a candidate */
+    if (vp->stats.last_get > thresh)
+	return 0;
+    /* move from the new queue to the head of the candidate queue */
+    queue_Remove(&vp->vlru);
+    volume_LRU.q[VLRU_QUEUE_NEW].len--;
+    queue_Prepend(&volume_LRU.q[VLRU_QUEUE_CANDIDATE], &vp->vlru);
+    vp->vlru.idx = VLRU_QUEUE_CANDIDATE;
+    volume_LRU.q[VLRU_QUEUE_CANDIDATE].len++;
+    return 1;
+}
+
+
+/* begin exclusive access on VLRU */
+static void
+VLRU_BeginExclusive_r(struct VLRU_q * q)
+{
+    assert(q->busy == 0);
+    q->busy = 1;
+}
+
+/* end exclusive access on VLRU */
+static void
+VLRU_EndExclusive_r(struct VLRU_q * q)
+{
+    assert(q->busy);
+    q->busy = 0;
+    assert(pthread_cond_broadcast(&q->cv) == 0);
+}
+
+/* wait for another thread to end exclusive access on VLRU */
+static void
+VLRU_Wait_r(struct VLRU_q * q)
+{
+    while(q->busy) {
+	assert(pthread_cond_wait(&q->cv, &vol_glock_mutex) == 0);
+    }
+}
+
+/* demand attach fs
+ * volume soft detach
+ *
+ * caller MUST NOT hold a ref count on vp */
+static int
+VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh)
+{
+    afs_uint32 ts_save;
+    int ret = 0;
+
+    assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE);
+
+    ts_save = vp->stats.last_get;
+    if (ts_save > thresh)
+	return 0;
+
+    if (vp->nUsers || vp->nWaiters)
+	return 0;
+
+    if (IsExclusiveState(V_attachState(vp))) {
+	return 0;
+    }
+
+    switch (V_attachState(vp)) {
+    case VOL_STATE_UNATTACHED:
+    case VOL_STATE_PREATTACHED:
+    case VOL_STATE_ERROR:
+    case VOL_STATE_GOING_OFFLINE:
+    case VOL_STATE_SHUTTING_DOWN:
+    case VOL_STATE_SALVAGING:
+	volume_LRU.q[vp->vlru.idx].len--;
+
+	/* create and cancel a reservation to
+	 * give the volume an opportunity to
+	 * be deallocated */
+	VCreateReservation_r(vp);
+	queue_Remove(&vp->vlru);
+	vp->vlru.idx = VLRU_QUEUE_INVALID;
+	V_attachFlags(vp) &= ~(VOL_ON_VLRU);
+	VCancelReservation_r(vp);
+	return 0;
+    }
+
+    /* hold the volume and take it offline.
+     * no need for reservations, as VHold_r
+     * takes care of that internally. */
+    if (VHold_r(vp) == 0) {
+	/* vhold drops the glock, so now we should
+	 * check to make sure we aren't racing against
+	 * other threads.  if we are racing, offlining vp
+	 * would be wasteful, and block the scanner for a while
+	 */
+	if (vp->nWaiters || 
+	    (vp->nUsers > 1) ||
+	    (vp->shuttingDown) ||
+	    (vp->goingOffline) ||
+	    (vp->stats.last_get != ts_save)) {
+	    /* looks like we're racing someone else. bail */
+	    VPutVolume_r(vp);
+	    vp = NULL;
+	} else {
+	    /* pull it off the VLRU */
+	    assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE);
+	    volume_LRU.q[VLRU_QUEUE_CANDIDATE].len--;
+	    queue_Remove(&vp->vlru);
+	    vp->vlru.idx = VLRU_QUEUE_INVALID;
+	    V_attachFlags(vp) &= ~(VOL_ON_VLRU);
+
+	    /* take it offline */
+	    VOffline_r(vp, "volume has been soft detached");
+
+	    /* invalidate the volume header cache */
+	    FreeVolumeHeader(vp);
+
+	    /* update stats */
+	    IncUInt64(&VStats.soft_detaches);
+	    vp->stats.soft_detaches++;
+
+	    /* put in pre-attached state so demand
+	     * attacher can work on it */
+	    VChangeState_r(vp, VOL_STATE_PREATTACHED);
+	    ret = 1;
+	}
+    }
+    return ret;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* Volume Header Cache routines                    */
+/***************************************************/
+
+struct volume_hdr_LRU_t volume_hdr_LRU;
 
 /* Allocate a bunch of headers; string them together */
 static void
-InitLRU(int howMany)
+VInitVolumeHeaderCache(afs_uint32 howMany)
 {
     register struct volHeader *hp;
     if (programType != fileServer)
 	return;
+    queue_Init(&volume_hdr_LRU);
+#ifdef AFS_DEMAND_ATTACH_FS
+    volume_hdr_LRU.stats.free = 0;
+    volume_hdr_LRU.stats.used = howMany;
+    volume_hdr_LRU.stats.attached = 0;
+#endif
     hp = (struct volHeader *)(calloc(howMany, sizeof(struct volHeader)));
     while (howMany--)
 	ReleaseVolumeHeader(hp++);
 }
 
+#ifdef AFS_DEMAND_ATTACH_FS
+/* Get a volume header from the LRU list; update the old one if necessary */
+/* Returns 1 if there was already a header, which is removed from the LRU list */
+/* caller MUST hold a ref count on vp */
+static int
+GetVolumeHeader(register Volume * vp)
+{
+    Error error;
+    register struct volHeader *hd;
+    int old;
+    static int everLogged = 0;
+
+    /* XXX debug 9/19/05 we've apparently got
+     * a ref counting bug somewhere that's
+     * breaking the nUsers == 0 => header on LRU
+     * assumption */
+    if (vp->header && queue_IsNotOnQueue(vp->header)) {
+	Log("nUsers == 0, but header not on LRU\n");
+	return 1;
+    }
+
+    old = (vp->header != NULL);	/* old == volume already has a header */
+
+    if (programType != fileServer) {
+	/* for volume utilities, we allocate volHeaders as needed */
+	if (!vp->header) {
+	    hd = (struct volHeader *)calloc(1, sizeof(*vp->header));
+	    assert(hd != NULL);
+	    vp->header = hd;
+	    hd->back = vp;
+	    V_attachFlags(vp) |= VOL_HDR_ATTACHED;
+	}
+    } else {
+	if (old) {
+	    /* the header we previously dropped in the lru is
+	     * still available. pull it off the lru and return */
+	    hd = vp->header;
+	    queue_Remove(hd);
+	    assert(hd->back == vp);
+	} else {
+	    /* we need to grab a new element off the LRU */
+	    if (queue_IsNotEmpty(&volume_hdr_LRU)) {
+		/* grab an element and pull off of LRU */
+		hd = queue_First(&volume_hdr_LRU, volHeader);
+		queue_Remove(hd);
+	    } else {
+		/* LRU is empty, so allocate a new volHeader 
+		 * this is probably indicative of a leak, so let the user know */
+		hd = (struct volHeader *)calloc(1, sizeof(struct volHeader));
+		assert(hd != NULL);
+		if (!everLogged) {
+		    Log("****Allocated more volume headers, probably leak****\n");
+		    everLogged = 1;
+		}
+		volume_hdr_LRU.stats.free++;
+	    }
+	    if (hd->back) {
+		VolState vp_save, back_save;
+		/* this header used to belong to someone else. 
+		 * we'll need to check if the header needs to
+		 * be sync'd out to disk */
+
+		/* if hd->back were in an exclusive state, then
+		 * its volHeader would not be on the LRU... */
+		assert(!IsExclusiveState(V_attachState(hd->back)));
+
+		if (hd->diskstuff.inUse) {
+		    /* volume was in use, so we'll need to sync
+		     * its header to disk */
+		    back_save = VChangeState_r(hd->back, VOL_STATE_UPDATING);
+		    vp_save = VChangeState_r(vp, VOL_STATE_HDR_ATTACHING);
+		    VCreateReservation_r(hd->back);
+		    VOL_UNLOCK;
+
+		    WriteVolumeHeader_r(&error, hd->back);
+		    /* Ignore errors; catch them later */
+
+		    VOL_LOCK;
+		}
+
+		V_attachFlags(hd->back) &= ~(VOL_HDR_ATTACHED | VOL_HDR_LOADED | VOL_HDR_IN_LRU);
+		hd->back->header = NULL;
+
+		if (hd->diskstuff.inUse) {
+		    VChangeState_r(hd->back, back_save);
+		    VCancelReservation_r(hd->back);
+		    VChangeState_r(vp, vp_save);
+		}
+	    } else {
+		volume_hdr_LRU.stats.attached++;
+	    }
+	    hd->back = vp;
+	    vp->header = hd;
+	    V_attachFlags(vp) |= VOL_HDR_ATTACHED;
+	}
+	volume_hdr_LRU.stats.free--;
+	volume_hdr_LRU.stats.used++;
+    }
+    IncUInt64(&VStats.hdr_gets);
+    IncUInt64(&vp->stats.hdr_gets);
+    vp->stats.last_hdr_get = FT_ApproxTime();
+    return old;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
 /* Get a volume header from the LRU list; update the old one if necessary */
 /* Returns 1 if there was already a header, which is removed from the LRU list */
 static int
@@ -1932,7 +5480,9 @@ GetVolumeHeader(register Volume * vp)
     static int everLogged = 0;
 
     old = (vp->header != NULL);	/* old == volume already has a header */
+
     if (programType != fileServer) {
+	/* for volume utilities, we allocate volHeaders as needed */
 	if (!vp->header) {
 	    hd = (struct volHeader *)calloc(1, sizeof(*vp->header));
 	    assert(hd != NULL);
@@ -1940,45 +5490,98 @@ GetVolumeHeader(register Volume * vp)
 	    hd->back = vp;
 	}
     } else {
+	/* for the fileserver, we keep a volume header cache */
 	if (old) {
+	    /* the header we previously dropped in the lru is
+	     * still available. pull it off the lru and return */
 	    hd = vp->header;
-	    if (volumeLRU == hd)
-		volumeLRU = hd->next;
+	    queue_Remove(hd);
 	    assert(hd->back == vp);
 	} else {
-	    if (volumeLRU)
-		/* not currently in use and least recently used */
-		hd = volumeLRU->prev;
-	    else {
-		hd = (struct volHeader *)calloc(1, sizeof(*vp->header));
-		/* make it look like single elt LRU */
-		hd->prev = hd->next = hd;
+	    /* we need to grab a new element off the LRU */
+	    if (queue_IsNotEmpty(&volume_hdr_LRU)) {
+		/* grab an element */
+		hd = queue_First(&volume_hdr_LRU, volHeader);
+		queue_Remove(hd);
+	    } else {
+		/* LRU is empty, so allocate a new volHeader 
+		 * this is probably indicative of a leak, so let the user know */
+		hd = (struct volHeader *)calloc(1, sizeof(struct volHeader));
+		assert(hd != NULL);
 		if (!everLogged) {
 		    Log("****Allocated more volume headers, probably leak****\n");
 		    everLogged = 1;
 		}
 	    }
 	    if (hd->back) {
+		/* this header used to belong to someone else. 
+		 * we'll need to check if the header needs to
+		 * be sync'd out to disk */
+
 		if (hd->diskstuff.inUse) {
 		    WriteVolumeHeader_r(&error, hd->back);
 		    /* Ignore errors; catch them later */
 		}
-		hd->back->header = 0;
+		hd->back->header = NULL;
 	    }
 	    hd->back = vp;
 	    vp->header = hd;
 	}
-	if (hd->next) {		/* hd->next != 0 --> in LRU chain (we zero it later) */
-	    hd->prev->next = hd->next;	/* pull hd out of LRU list */
-	    hd->next->prev = hd->prev;	/* if hd only element, this is noop */
-	}
-	hd->next = hd->prev = 0;
-	/* if not in LRU chain, next test won't be true */
-	if (hd == volumeLRU)	/* last header item, turn into empty list */
-	    volumeLRU = NULL;
     }
     return old;
 }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/* make sure a volume header is attached to
+ * vp, and has the correct data loaded from
+ * disk. */
+#ifdef AFS_DEMAND_ATTACH_FS
+/* caller MUST hold a ref count on vp */
+static void
+LoadVolumeHeader(Error * ec, Volume * vp)
+{
+    VolState state_save;
+    *ec = 0;
+
+    if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
+	IncUInt64(&VStats.hdr_loads);
+	state_save = VChangeState_r(vp, VOL_STATE_HDR_LOADING);
+	VOL_UNLOCK;
+
+	ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
+		   sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
+		   VOLUMEINFOVERSION);
+	IncUInt64(&vp->stats.hdr_loads);
+
+	VOL_LOCK;
+	if (!*ec)
+	    V_attachFlags(vp) |= VOL_HDR_LOADED;
+	VChangeState_r(vp, state_save);
+    }
+    if (*ec) {
+	/* maintain (nUsers==0) => header in LRU invariant */
+	ReleaseVolumeHeader(vp->header);
+    }
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+static void
+LoadVolumeHeader(Error * ec, Volume * vp)
+{
+    *ec = 0;
+    if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
+	IncUInt64(&VStats.hdr_loads);
+
+	ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
+		   sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
+		   VOLUMEINFOVERSION);
+    }
+    if (*ec) {
+	/* maintain (nUsers==0) => header in LRU invariant */
+	ReleaseVolumeHeader(vp->header);
+    }
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
 
 /* Put it at the top of the LRU chain */
 static void
@@ -1986,18 +5589,22 @@ ReleaseVolumeHeader(register struct volHeader *hd)
 {
     if (programType != fileServer)
 	return;
-    if (!hd || hd->next)	/* no header, or header already released */
+    if (!hd || queue_IsOnQueue(hd))	/* no header, or header already released */
 	return;
-    if (!volumeLRU) {
-	hd->next = hd->prev = hd;
-    } else {
-	hd->prev = volumeLRU->prev;
-	hd->next = volumeLRU;
-	hd->prev->next = hd->next->prev = hd;
+    queue_Append(&volume_hdr_LRU, hd);
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (hd->back) {
+	V_attachFlags(hd->back) |= VOL_HDR_IN_LRU;
     }
-    volumeLRU = hd;
+    volume_hdr_LRU.stats.free++;
+    volume_hdr_LRU.stats.used--;
+#endif
 }
 
+/* for fileserver, return header to LRU, and
+ * invalidate it as a cache entry.
+ *
+ * for volume utilities, free the heap space */
 static void
 FreeVolumeHeader(register Volume * vp)
 {
@@ -2006,57 +5613,349 @@ FreeVolumeHeader(register Volume * vp)
 	return;
     if (programType == fileServer) {
 	ReleaseVolumeHeader(hd);
-	hd->back = 0;
+	hd->back = NULL;
     } else {
 	free(hd);
     }
-    vp->header = 0;
+#ifdef AFS_DEMAND_ATTACH_FS
+    V_attachFlags(vp) &= ~(VOL_HDR_ATTACHED | VOL_HDR_IN_LRU | VOL_HDR_LOADED);
+    volume_hdr_LRU.stats.attached--;
+#endif
+    vp->header = NULL;
 }
 
 
 /***************************************************/
-/* Routines to add volume to hash chain, delete it */
+/* Volume Hash Table routines                      */
 /***************************************************/
 
+int
+VSetVolHashSize(int logsize)
+{
+    /* accept only 2^6 (64) through 2^14 (16384) hash buckets */
+    if ((logsize < 6) || (logsize > 14)) {
+	return -1;
+    }
+
+    /* we can't yet support modification of this parameter
+     * once VInit is set. we'll need a configuration rwlock
+     * to make runtime modification feasible.... */
+    if (VInit) {
+	return -1;
+    }
+
+    VolumeHashTable.Size = 1 << logsize;
+    VolumeHashTable.Mask = VolumeHashTable.Size - 1;
+    return 0;
+}
+
+static void
+VInitVolumeHash(void)
+{
+    register int i;
+
+    VolumeHashTable.Table = (VolumeHashChainHead *) calloc(VolumeHashTable.Size, 
+							   sizeof(VolumeHashChainHead));
+    assert(VolumeHashTable.Table != NULL);
+    
+    for (i=0; i < VolumeHashTable.Size; i++) {
+	queue_Init(&VolumeHashTable.Table[i]);
+#ifdef AFS_DEMAND_ATTACH_FS
+	assert(pthread_cond_init(&VolumeHashTable.Table[i].chain_busy_cv, NULL) == 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
+    }
+}
+
+/* for demand-attach, caller MUST hold a ref count on vp */
 static void
 AddVolumeToHashTable(register Volume * vp, int hashid)
 {
-    int hash = VOLUME_HASH(hashid);
+    VolumeHashChainHead * head;
+
+    if (queue_IsOnQueue(vp))
+	return;
+
+    head = &VolumeHashTable.Table[VOLUME_HASH(hashid)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* wait for the hash chain to become available */
+    VHashWait_r(head);
+
+    V_attachFlags(vp) |= VOL_IN_HASH;
+    vp->chainCacheCheck = ++head->cacheCheck;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    head->len++;
     vp->hashid = hashid;
-    vp->hashNext = VolumeHashTable[hash];
-    VolumeHashTable[hash] = vp;
+    queue_Append(head, vp);
     vp->vnodeHashOffset = VolumeHashOffset_r();
 }
 
+/* for demand-attach, caller MUST hold a ref count on vp */
 static void
 DeleteVolumeFromHashTable(register Volume * vp)
 {
-    int hash = VOLUME_HASH(vp->hashid);
-    if (VolumeHashTable[hash] == vp)
-	VolumeHashTable[hash] = vp->hashNext;
-    else {
-	Volume *tvp = VolumeHashTable[hash];
-	if (tvp == NULL)
-	    return;
-	while (tvp->hashNext && tvp->hashNext != vp)
-	    tvp = tvp->hashNext;
-	if (tvp->hashNext == NULL)
-	    return;
-	tvp->hashNext = vp->hashNext;
-    }
-    vp->hashid = 0;
+    VolumeHashChainHead * head;
+
+    if (!queue_IsOnQueue(vp))
+	return;
+
+    head = &VolumeHashTable.Table[VOLUME_HASH(vp->hashid)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* wait for the hash chain to become available */
+    VHashWait_r(head);
+
+    V_attachFlags(vp) &= ~(VOL_IN_HASH);
+    head->cacheCheck++;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    head->len--;
+    queue_Remove(vp);
+    /* do NOT reset hashid to zero, as the online
+     * salvager package may need to know the volume id
+     * after the volume is removed from the hash */
 }
 
+/* - look up a volume id in the hash table
+ * - occasionally rebalance hash chains
+ * - update lookup statistics accordingly
+ */
+/* the hint parameter allows us to short-circuit on
+ * DEMAND_ATTACH_FS if the cacheChecks match between
+ * the hash chain head and hint
+ * caller MUST hold a refcount on hint */
+Volume *
+VLookupVolume_r(Error * ec, VolId volumeId, Volume * hint)
+{
+    register int looks = 0;
+    Volume * vp, *np, *pp;
+    VolumeHashChainHead * head;
+    *ec = 0;
+
+    head = &VolumeHashTable.Table[VOLUME_HASH(volumeId)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* wait for the hash chain to become available */
+    VHashWait_r(head);
+
+    /* check to see if we can short circuit without walking the hash chain */
+    if (hint && (hint->chainCacheCheck == head->cacheCheck)) {
+	IncUInt64(&hint->stats.hash_short_circuits);
+	return hint;
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    /* someday we need to either do per-chain locks, RWlocks,
+     * or both for volhash access. 
+     * (and move to a data structure with better cache locality) */
+
+    /* search the chain for this volume id */
+    for(queue_Scan(head, vp, np, Volume)) {
+	looks++;
+	if ((vp->hashid == volumeId)) {
+	    break;
+	}
+    }
+
+    if (queue_IsEnd(head, vp)) {
+	vp = NULL;
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* update hash chain statistics */
+    {
+	afs_uint64 lks;
+	FillInt64(lks, 0, looks);
+	AddUInt64(head->looks, lks, &head->looks);
+	AddUInt64(VStats.hash_looks, lks, &VStats.hash_looks);
+	IncUInt64(&head->gets);
+    }
+
+    if (vp) {
+	afs_uint64 thresh;
+	IncUInt64(&vp->stats.hash_lookups);
+
+	/* for demand attach fileserver, we permit occasional hash chain reordering
+	 * so that frequently looked up volumes move towards the head of the chain */
+	pp = queue_Prev(vp, Volume);
+	if (!queue_IsEnd(head, pp)) {
+	    FillInt64(thresh, 0, VOLUME_HASH_REORDER_THRESHOLD);
+	    AddUInt64(thresh, pp->stats.hash_lookups, &thresh);
+	    if (GEInt64(vp->stats.hash_lookups, thresh)) {
+		VReorderHash_r(head, pp, vp);
+	    }
+	}
+
+	/* update the short-circuit cache check */
+	vp->chainCacheCheck = head->cacheCheck;
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */    
+
+    return vp;
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* perform volume hash chain reordering.
+ *
+ * advance a subchain beginning at vp ahead of
+ * the adjacent subchain ending at pp */
+static void
+VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp)
+{
+    Volume *tp, *np, *lp;
+    afs_uint64 move_thresh;
+
+    /* this should never be called if the chain is already busy, so
+     * no need to wait for other exclusive chain ops to finish */
+
+    /* this is a rather heavy set of operations,
+     * so let's set the chain busy flag and drop
+     * the vol_glock */
+    VHashBeginExclusive_r(head);
+    VOL_UNLOCK;
+
+    /* scan forward in the chain from vp looking for the last element
+     * in the chain we want to advance */
+    FillInt64(move_thresh, 0, VOLUME_HASH_REORDER_CHAIN_THRESH);
+    AddUInt64(move_thresh, pp->stats.hash_lookups, &move_thresh);
+    for(queue_ScanFrom(head, vp, tp, np, Volume)) {
+	if (LTInt64(tp->stats.hash_lookups, move_thresh)) {
+	    break;
+	}
+    }
+    lp = queue_Prev(tp, Volume);
+
+    /* scan backwards from pp to determine where to splice and
+     * insert the subchain we're advancing */
+    for(queue_ScanBackwardsFrom(head, pp, tp, np, Volume)) {
+	if (GTInt64(tp->stats.hash_lookups, move_thresh)) {
+	    break;
+	}
+    }
+    tp = queue_Next(tp, Volume);
+
+    /* rebalance chain(vp,...,lp) ahead of chain(tp,...,pp) */
+    queue_MoveChainBefore(tp,vp,lp);
+
+    VOL_LOCK;
+    IncUInt64(&VStats.hash_reorders);
+    head->cacheCheck++;
+    IncUInt64(&head->reorders);
+
+    /* wake up any threads waiting for the hash chain */
+    VHashEndExclusive_r(head);
+}
+
+
+/* demand-attach fs volume hash
+ * asynchronous exclusive operations */
+
+/* take exclusive control over the hash chain */
+static void
+VHashBeginExclusive_r(VolumeHashChainHead * head)
+{
+    assert(head->busy == 0);
+    head->busy = 1;
+}
+
+/* relinquish exclusive control over the hash chain */
+static void
+VHashEndExclusive_r(VolumeHashChainHead * head)
+{
+    assert(head->busy);
+    head->busy = 0;
+    assert(pthread_cond_broadcast(&head->chain_busy_cv) == 0);
+}
+
+/* wait for another thread to finish its exclusive ops */
+static void
+VHashWait_r(VolumeHashChainHead * head)
+{
+    while (head->busy) {
+	assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0);
+    }
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* Volume by Partition List routines               */
+/***************************************************/
+
+/*
+ * demand attach fileserver adds a
+ * linked list of volumes to each
+ * partition object, thus allowing
+ * for quick enumeration of all
+ * volumes on a partition
+ */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+static void
+AddVolumeToVByPList_r(Volume * vp)
+{
+    if (queue_IsNotOnQueue(&vp->vol_list)) {
+	queue_Append(&vp->partition->vol_list, &vp->vol_list);
+	V_attachFlags(vp) |= VOL_ON_VBYP_LIST;
+	vp->partition->vol_list.len++;
+    }
+}
+
+static void
+DeleteVolumeFromVByPList_r(Volume * vp)
+{
+    if (queue_IsOnQueue(&vp->vol_list)) {
+	queue_Remove(&vp->vol_list);
+	V_attachFlags(vp) &= ~(VOL_ON_VBYP_LIST);
+	vp->partition->vol_list.len--;
+    }
+}
+
+/* take exclusive control over the list */
+static void
+VVByPListBeginExclusive_r(struct DiskPartition * dp)
+{
+    assert(dp->vol_list.busy == 0);
+    dp->vol_list.busy = 1;
+}
+
+/* relinquish exclusive control over the list */
+static void
+VVByPListEndExclusive_r(struct DiskPartition * dp)
+{
+    assert(dp->vol_list.busy);
+    dp->vol_list.busy = 0;
+    assert(pthread_cond_broadcast(&dp->vol_list.cv) == 0);
+}
+
+/* wait for another thread to finish its exclusive ops */
+static void
+VVByPListWait_r(struct DiskPartition * dp)
+{
+    while (dp->vol_list.busy) {
+	assert(pthread_cond_wait(&dp->vol_list.cv, &vol_glock_mutex) == 0);
+    }
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/***************************************************/
+/* Volume Cache Statistics routines                */
+/***************************************************/
+
 void
 VPrintCacheStats_r(void)
 {
+    afs_uint32 get_hi, get_lo, load_hi, load_lo;
     register struct VnodeClassInfo *vcp;
     vcp = &VnodeClassInfo[vLarge];
     Log("Large vnode cache, %d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes);
     vcp = &VnodeClassInfo[vSmall];
     Log("Small vnode cache,%d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes);
+    SplitInt64(VStats.hdr_gets, get_hi, get_lo);
+    SplitInt64(VStats.hdr_loads, load_hi, load_lo);
     Log("Volume header cache, %d entries, %d gets, %d replacements\n",
-	VolumeCacheSize, VolumeGets, VolumeReplacements);
+	VStats.hdr_cache_size, get_lo, load_lo);
 }
 
 void
@@ -2067,3 +5966,259 @@ VPrintCacheStats(void)
     VOL_UNLOCK;
 }
 
+#ifdef AFS_DEMAND_ATTACH_FS
+static double
+UInt64ToDouble(afs_uint64 * x)
+{
+    static double two_to_32 = 4.0 * 1.073741824 * 1000000000.0;
+    afs_uint32 hi, lo;
+    SplitInt64(*x, hi, lo);
+    return ((double) lo) + (two_to_32 * ((double) hi));
+}
+
+static char *
+DoubleToPrintable(double x, char * buf, int len)
+{
+    static double billion = 1000000000.0;
+    afs_uint32 y[3];
+    /* split x into base-10^9 digits: y[0] is most significant, y[2] least */
+    y[0] = (afs_uint32) (x / (billion * billion));
+    y[1] = (afs_uint32) ((x - (((double)y[0]) * billion * billion)) / billion);
+    y[2] = (afs_uint32) (x - ((((double)y[0]) * billion * billion) + (((double)y[1]) * billion)));
+    /* y[] is unsigned, so print with %u; lower digits are zero padded to 9 */
+    if (y[0]) {
+	snprintf(buf, len, "%u%09u%09u", y[0], y[1], y[2]);
+    } else if (y[1]) {
+	snprintf(buf, len, "%u%09u", y[1], y[2]);
+    } else {
+	snprintf(buf, len, "%u", y[2]);
+    }
+    buf[len-1] = '\0';
+    return buf;
+}
+
+static void
+VPrintExtendedCacheStats_r(int flags)
+{
+    int i;
+    struct stats {
+	double min;
+	double max;
+	double sum;
+	double avg;
+    };
+    struct stats looks, gets, reorders, len;
+    struct stats ch_looks, ch_gets, ch_reorders;
+    char pr_buf[4][32];
+    VolumeHashChainHead *head;
+    Volume *vp, *np;
+    /* log hash chain and partition stats; entered and left with VOL_LOCK held */
+    /* zero out stats */
+    memset(&looks, 0, sizeof(struct stats));
+    memset(&gets, 0, sizeof(struct stats));
+    memset(&reorders, 0, sizeof(struct stats));
+    memset(&len, 0, sizeof(struct stats));
+    memset(&ch_looks, 0, sizeof(struct stats));
+    memset(&ch_gets, 0, sizeof(struct stats));
+    memset(&ch_reorders, 0, sizeof(struct stats));
+
+    for (i = 0; i < VolumeHashTable.Size; i++) {
+	head = &VolumeHashTable.Table[i];
+
+	VHashWait_r(head);
+	VHashBeginExclusive_r(head);
+	VOL_UNLOCK;
+
+	ch_looks.sum    = UInt64ToDouble(&head->looks);
+	ch_gets.sum     = UInt64ToDouble(&head->gets);
+	ch_reorders.sum = UInt64ToDouble(&head->reorders);
+
+	/* update global statistics */
+	{
+	    looks.sum    += ch_looks.sum;
+	    gets.sum     += ch_gets.sum;
+	    reorders.sum += ch_reorders.sum;
+	    len.sum      += (double)head->len;
+	    
+	    if (i == 0) {
+		len.min      = (double) head->len;
+		len.max      = (double) head->len;
+		looks.min    = ch_looks.sum;
+		looks.max    = ch_looks.sum;
+		gets.min     = ch_gets.sum;
+		gets.max     = ch_gets.sum;
+		reorders.min = ch_reorders.sum;
+		reorders.max = ch_reorders.sum;
+	    } else {
+		if (((double)head->len) < len.min)
+		    len.min = (double) head->len;
+		if (((double)head->len) > len.max)
+		    len.max = (double) head->len;
+		if (ch_looks.sum < looks.min)
+		    looks.min = ch_looks.sum;
+		else if (ch_looks.sum > looks.max)
+		    looks.max = ch_looks.sum;
+		if (ch_gets.sum < gets.min)
+		    gets.min = ch_gets.sum;
+		else if (ch_gets.sum > gets.max)
+		    gets.max = ch_gets.sum;
+		if (ch_reorders.sum < reorders.min)
+		    reorders.min = ch_reorders.sum;
+		else if (ch_reorders.sum > reorders.max)
+		    reorders.max = ch_reorders.sum;
+	    }
+	}
+
+	if ((flags & VOL_STATS_PER_CHAIN2) && queue_IsNotEmpty(head)) {
+	    /* compute detailed per-chain stats */
+	    struct stats hdr_loads, hdr_gets;
+	    double v_looks, v_loads, v_gets;
+
+	    /* seed min, max and sum from the first element; the scan below
+	     * starts at the second element, so the sums must not start at 0 */
+	    vp = queue_First(head, Volume);
+	    v_looks = UInt64ToDouble(&vp->stats.hash_lookups);
+	    v_loads = UInt64ToDouble(&vp->stats.hdr_loads);
+	    v_gets  = UInt64ToDouble(&vp->stats.hdr_gets);
+	    ch_gets.min = ch_gets.max = v_looks;
+	    hdr_loads.min = hdr_loads.max = hdr_loads.sum = v_loads;
+	    hdr_gets.min = hdr_gets.max = hdr_gets.sum = v_gets;
+
+	    vp = queue_Next(vp, Volume);
+
+	    /* pull in stats from remaining elements in chain */
+	    for (queue_ScanFrom(head, vp, vp, np, Volume)) {
+		v_looks = UInt64ToDouble(&vp->stats.hash_lookups);
+		v_loads = UInt64ToDouble(&vp->stats.hdr_loads);
+		v_gets  = UInt64ToDouble(&vp->stats.hdr_gets);
+
+		hdr_loads.sum += v_loads;
+		hdr_gets.sum += v_gets;
+
+		if (v_looks < ch_gets.min)
+		    ch_gets.min = v_looks;
+		else if (v_looks > ch_gets.max)
+		    ch_gets.max = v_looks;
+
+		if (v_loads < hdr_loads.min)
+		    hdr_loads.min = v_loads;
+		else if (v_loads > hdr_loads.max)
+		    hdr_loads.max = v_loads;
+
+		if (v_gets < hdr_gets.min)
+		    hdr_gets.min = v_gets;
+		else if (v_gets > hdr_gets.max)
+		    hdr_gets.max = v_gets;
+	    }
+
+	    /* compute per-chain averages */
+	    ch_gets.avg = ch_gets.sum / ((double)head->len);
+	    hdr_loads.avg = hdr_loads.sum / ((double)head->len);
+	    hdr_gets.avg = hdr_gets.sum / ((double)head->len);
+
+	    /* dump per-chain stats */
+	    Log("Volume hash chain %d : len=%d, looks=%s, reorders=%s\n",
+		i, head->len, 
+		DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])),
+		DoubleToPrintable(ch_reorders.sum, pr_buf[1], sizeof(pr_buf[1])));
+	    Log("\tVolume gets : min=%s, max=%s, avg=%s, total=%s\n",
+		DoubleToPrintable(ch_gets.min, pr_buf[0], sizeof(pr_buf[0])),
+		DoubleToPrintable(ch_gets.max, pr_buf[1], sizeof(pr_buf[1])),
+		DoubleToPrintable(ch_gets.avg, pr_buf[2], sizeof(pr_buf[2])),
+		DoubleToPrintable(ch_gets.sum, pr_buf[3], sizeof(pr_buf[3])));
+	    Log("\tHDR gets : min=%s, max=%s, avg=%s, total=%s\n",
+		DoubleToPrintable(hdr_gets.min, pr_buf[0], sizeof(pr_buf[0])),
+		DoubleToPrintable(hdr_gets.max, pr_buf[1], sizeof(pr_buf[1])),
+		DoubleToPrintable(hdr_gets.avg, pr_buf[2], sizeof(pr_buf[2])),
+		DoubleToPrintable(hdr_gets.sum, pr_buf[3], sizeof(pr_buf[3])));
+	    Log("\tHDR loads : min=%s, max=%s, avg=%s, total=%s\n",
+		DoubleToPrintable(hdr_loads.min, pr_buf[0], sizeof(pr_buf[0])),
+		DoubleToPrintable(hdr_loads.max, pr_buf[1], sizeof(pr_buf[1])),
+		DoubleToPrintable(hdr_loads.avg, pr_buf[2], sizeof(pr_buf[2])),
+		DoubleToPrintable(hdr_loads.sum, pr_buf[3], sizeof(pr_buf[3])));
+	} else if (flags & VOL_STATS_PER_CHAIN) {
+	    /* dump simple per-chain stats */
+	    Log("Volume hash chain %d : len=%d, looks=%s, gets=%s, reorders=%s\n",
+		i, head->len, 
+		DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])),
+		DoubleToPrintable(ch_gets.sum, pr_buf[1], sizeof(pr_buf[1])),
+		DoubleToPrintable(ch_reorders.sum, pr_buf[2], sizeof(pr_buf[2])));
+	}
+
+	VOL_LOCK;
+	VHashEndExclusive_r(head);
+    }
+
+    VOL_UNLOCK;
+
+    /* compute global averages */
+    len.avg      = len.sum      / ((double)VolumeHashTable.Size);
+    looks.avg    = looks.sum    / ((double)VolumeHashTable.Size);
+    gets.avg     = gets.sum     / ((double)VolumeHashTable.Size);
+    reorders.avg = reorders.sum / ((double)VolumeHashTable.Size);
+
+    /* dump global stats */
+    Log("Volume hash summary: %d buckets\n", VolumeHashTable.Size);
+    Log(" chain length : min=%s, max=%s, avg=%s, total=%s\n",
+	DoubleToPrintable(len.min, pr_buf[0], sizeof(pr_buf[0])),
+	DoubleToPrintable(len.max, pr_buf[1], sizeof(pr_buf[1])),
+	DoubleToPrintable(len.avg, pr_buf[2], sizeof(pr_buf[2])),
+	DoubleToPrintable(len.sum, pr_buf[3], sizeof(pr_buf[3])));
+    Log(" looks : min=%s, max=%s, avg=%s, total=%s\n",
+	DoubleToPrintable(looks.min, pr_buf[0], sizeof(pr_buf[0])),
+	DoubleToPrintable(looks.max, pr_buf[1], sizeof(pr_buf[1])),
+	DoubleToPrintable(looks.avg, pr_buf[2], sizeof(pr_buf[2])),
+	DoubleToPrintable(looks.sum, pr_buf[3], sizeof(pr_buf[3])));
+    Log(" gets : min=%s, max=%s, avg=%s, total=%s\n",
+	DoubleToPrintable(gets.min, pr_buf[0], sizeof(pr_buf[0])),
+	DoubleToPrintable(gets.max, pr_buf[1], sizeof(pr_buf[1])),
+	DoubleToPrintable(gets.avg, pr_buf[2], sizeof(pr_buf[2])),
+	DoubleToPrintable(gets.sum, pr_buf[3], sizeof(pr_buf[3])));
+    Log(" reorders : min=%s, max=%s, avg=%s, total=%s\n",
+	DoubleToPrintable(reorders.min, pr_buf[0], sizeof(pr_buf[0])),
+	DoubleToPrintable(reorders.max, pr_buf[1], sizeof(pr_buf[1])),
+	DoubleToPrintable(reorders.avg, pr_buf[2], sizeof(pr_buf[2])),
+	DoubleToPrintable(reorders.sum, pr_buf[3], sizeof(pr_buf[3])));
+
+    /* print extended disk related statistics */
+    {
+	struct DiskPartition * diskP;
+	afs_uint32 vol_count[VOLMAXPARTS+1];
+	byte part_exists[VOLMAXPARTS+1];
+	Device id;
+	int i;
+
+	memset(vol_count, 0, sizeof(vol_count));
+	memset(part_exists, 0, sizeof(part_exists));
+
+	VOL_LOCK;
+
+	for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+	    id = diskP->device;
+	    vol_count[id] = diskP->vol_list.len;
+	    part_exists[id] = 1;
+	}
+	/* NOTE(review): the lookups below run without VOL_LOCK -- confirm safe */
+	VOL_UNLOCK;
+	for (i = 0; i <= VOLMAXPARTS; i++) {
+	    if (part_exists[i]) {
+		diskP = VGetPartitionById_r(i, 0);
+		if (diskP) {
+		    Log("Partition %s has %d online volumes\n", 
+			VPartitionPath(diskP), diskP->vol_list.len);
+		}
+	    }
+	}
+	VOL_LOCK;
+    }
+
+}
+
+void
+VPrintExtendedCacheStats(int flags)
+{
+    VOL_LOCK;
+    VPrintExtendedCacheStats_r(flags);
+    VOL_UNLOCK;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/vol/volume.h b/src/vol/volume.h
index c66a09b7c5..09190bc310 100644
--- a/src/vol/volume.h
+++ b/src/vol/volume.h
@@ -5,6 +5,8 @@
  * This software has been released under the terms of the IBM Public
  * License.  For details, see the LICENSE file in the top-level source
  * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
  */
 
 /*
@@ -24,44 +26,44 @@
 #define VolumeWriteable2(vol)		(vol.type == readwriteVolume)
 typedef bit32 FileOffset;	/* Offset in this file */
 #define Date afs_uint32
+#include "daemon_com.h"
+#include "fssync.h"
 
 #ifdef AFS_PTHREAD_ENV
 #include <assert.h>
 #include <pthread.h>
 extern pthread_mutex_t vol_glock_mutex;
-extern pthread_mutex_t vol_attach_mutex;
-extern pthread_mutex_t vol_fsync_mutex;
 extern pthread_mutex_t vol_trans_mutex;
 extern pthread_cond_t vol_put_volume_cond;
 extern pthread_cond_t vol_sleep_cond;
 extern int vol_attach_threads;
-/* this lock has been deprecated */
-#define VATTACH_LOCK
-#define VATTACH_UNLOCK
 #define VOL_LOCK \
     assert(pthread_mutex_lock(&vol_glock_mutex) == 0)
 #define VOL_UNLOCK \
     assert(pthread_mutex_unlock(&vol_glock_mutex) == 0)
-#define VFSYNC_LOCK \
-    assert(pthread_mutex_lock(&vol_fsync_mutex) == 0)
-#define VFSYNC_UNLOCK \
-    assert(pthread_mutex_unlock(&vol_fsync_mutex) == 0)
+#define VSALVSYNC_LOCK \
+    assert(pthread_mutex_lock(&vol_salvsync_mutex) == 0)
+#define VSALVSYNC_UNLOCK \
+    assert(pthread_mutex_unlock(&vol_salvsync_mutex) == 0)
 #define VTRANS_LOCK \
     assert(pthread_mutex_lock(&vol_trans_mutex) == 0)
 #define VTRANS_UNLOCK \
     assert(pthread_mutex_unlock(&vol_trans_mutex) == 0)
 #else /* AFS_PTHREAD_ENV */
-#define VATTACH_LOCK
-#define VATTACH_UNLOCK
 #define VOL_LOCK
 #define VOL_UNLOCK
-#define VFSYNC_LOCK
-#define VFSYNC_UNLOCK
+#define VSALVSYNC_LOCK
+#define VSALVSYNC_UNLOCK
 #define VTRANS_LOCK
 #define VTRANS_UNLOCK
 #endif /* AFS_PTHREAD_ENV */
 
-typedef enum { fileServer, volumeUtility, salvager } ProgramType;
+typedef enum { fileServer,       /* the fileserver process */
+	       volumeUtility,    /* volserver, or a single volume salvager (non-dafs) */
+	       salvager,         /* standalone whole-partition salvager */
+	       salvageServer,    /* dafs online salvager */
+	       debugUtility      /* fssync-debug or similar utility */
+} ProgramType;
 extern ProgramType programType;	/* The type of program using the package */
 
 /* Some initialization parameters for the volume package */
@@ -76,6 +78,70 @@ struct versionStamp {		/* Version stamp for critical volume files */
 				 * that created this file */
 };
 
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * volume state machine
+ *
+ * these values must stay contiguous (0 through VOL_STATE_COUNT-1) in order for IsValidState() to work correctly
+ */
+#define VOL_STATE_UNATTACHED     0       /* volume is unattached */
+#define VOL_STATE_PREATTACHED    1       /* volume has been pre-attached */
+#define VOL_STATE_ATTACHING      2       /* volume is transitioning to fully attached */
+#define VOL_STATE_ATTACHED       3       /* volume has been fully attached */
+#define VOL_STATE_UPDATING       4       /* volume is updating on-disk structures */
+#define VOL_STATE_GET_BITMAP     5       /* volume is getting bitmap entries */
+#define VOL_STATE_HDR_LOADING    6       /* volume is loading disk header */
+#define VOL_STATE_HDR_ATTACHING  7       /* volume is getting a header from the LRU */
+#define VOL_STATE_SHUTTING_DOWN  8       /* volume is shutting down */
+#define VOL_STATE_GOING_OFFLINE  9       /* volume is going offline */
+#define VOL_STATE_OFFLINING      10      /* volume is transitioning to offline */
+#define VOL_STATE_DETACHING      11      /* volume is transitioning to detached */
+#define VOL_STATE_SALVSYNC_REQ   12      /* volume is blocked on a salvsync request */
+#define VOL_STATE_SALVAGING      13      /* volume is being salvaged */
+#define VOL_STATE_ERROR          14      /* volume is in an error state */
+#define VOL_STATE_FREED          15      /* debugging aid */
+
+#define VOL_STATE_COUNT          16      /* total number of valid states */
+
+/* V_attachFlags bits */
+#define VOL_HDR_ATTACHED   0x1     /* volume header is attached to Volume struct */
+#define VOL_HDR_LOADED     0x2     /* volume header contents are valid */
+#define VOL_HDR_IN_LRU     0x4     /* volume header is in LRU */
+#define VOL_IN_HASH        0x8     /* volume is in hash table */
+#define VOL_ON_VBYP_LIST   0x10    /* volume is on VByP list */
+#define VOL_IS_BUSY        0x20    /* volume is not to be free()d */
+#define VOL_ON_VLRU        0x40    /* volume is on the VLRU */
+#define VOL_HDR_DONTSALV   0x80    /* volume header DONTSALVAGE flag is set */
+
+/* VPrintExtendedCacheStats flags */
+#define VOL_STATS_PER_CHAIN   0x1  /* compute simple per-chain stats */
+#define VOL_STATS_PER_CHAIN2  0x2  /* compute per-chain stats that require scanning
+				    * every element of the chain */
+
+/* VLRU_SetOptions options */
+#define VLRU_SET_THRESH       1
+#define VLRU_SET_INTERVAL     2
+#define VLRU_SET_MAX          3
+#define VLRU_SET_ENABLED      4
+
+/* valid VLRU queue names */
+#define VLRU_QUEUE_NEW 0            /* LRU queue for new volumes */
+#define VLRU_QUEUE_MID 1            /* survivor generation */
+#define VLRU_QUEUE_OLD 2            /* old generation */
+#define VLRU_QUEUE_CANDIDATE 3      /* soft detach candidate pool */
+#define VLRU_QUEUE_HELD 4           /* volumes which are not allowed
+				     * to be soft detached */
+#define VLRU_QUEUE_INVALID 5        /* invalid queue id */
+
+/* default scanner timing parameters */
+#define VLRU_DEFAULT_OFFLINE_THRESH (60*60*2) /* 2 hours */
+#define VLRU_DEFAULT_OFFLINE_INTERVAL (60*2) /* 2 minutes */
+#define VLRU_DEFAULT_OFFLINE_MAX 8 /* 8 volumes */
+
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
 /* Magic numbers and version stamps for each type of file */
 #define VOLUMEHEADERMAGIC	((bit32)0x88a1bb3c)
 #define VOLUMEINFOMAGIC		((bit32)0x78a1b2c5)
@@ -297,8 +363,144 @@ typedef struct VolumeDiskData {
 /**************************************/
 /* Memory resident volume information */
 /**************************************/
+
+/* global volume package stats */
+typedef struct VolPkgStats {
+#ifdef AFS_DEMAND_ATTACH_FS
+    /*
+     * demand attach fs
+     * extended volume package statistics
+     */
+
+    /* levels: one slot per VOL_STATE_* value (array is sized by VOL_STATE_COUNT) */
+    afs_uint32 state_levels[VOL_STATE_COUNT];
+
+    /* counters */
+    afs_uint64 hash_looks;           /* number of hash chain element traversals */
+    afs_uint64 hash_reorders;        /* number of hash chain reorders */
+    afs_uint64 salvages;             /* online salvages since fileserver start */
+    afs_uint64 vol_ops;              /* volume operations since fileserver start */
+#endif /* AFS_DEMAND_ATTACH_FS */
+    /* counters present in every build (not conditional on AFS_DEMAND_ATTACH_FS) */
+    afs_uint64 hdr_loads;            /* header loads from disk */
+    afs_uint64 hdr_gets;             /* header pulls out of LRU */
+    afs_uint64 attaches;             /* volume attaches since fileserver start */
+    afs_uint64 soft_detaches;        /* soft detach ops since fileserver start */
+
+    /* configuration parameters */
+    afs_uint32 hdr_cache_size;       /* size of volume header cache */
+} VolPkgStats;
+extern VolPkgStats VStats;           /* package-wide instance; defined outside this header */
+
+/*
+ * volume header cache supporting structures
+ */
+#ifdef AFS_DEMAND_ATTACH_FS
+struct volume_hdr_LRU_stats {       /* volume header counts; only built under AFS_DEMAND_ATTACH_FS */
+    afs_uint32 free;                /* NOTE(review): presumably headers not bound to any volume -- confirm */
+    afs_uint32 used;                /* NOTE(review): presumably headers currently handed out -- confirm */
+    afs_uint32 attached;            /* NOTE(review): presumably headers with VOL_HDR_ATTACHED set -- confirm */
+};
+#endif
+
+struct volume_hdr_LRU_t {
+    struct rx_queue lru;                /* queue head; struct volHeader links in through its own lru member */
+#ifdef AFS_DEMAND_ATTACH_FS
+    struct volume_hdr_LRU_stats stats;  /* header counts; demand attach builds only */
+#endif
+};
+extern struct volume_hdr_LRU_t volume_hdr_LRU; /* single global header LRU; defined outside this header */
+
+/*
+ * volume hash chain supporting structures
+ */
+typedef struct VolumeHashChainHead {
+    struct rx_queue queue;      /* head of this chain; Volume.q holds the per-volume chain pointers */
+    int len;                    /* NOTE(review): presumably the number of volumes on this chain -- confirm */
+    /* someday we could put a per-chain lock here... */
+#ifdef AFS_DEMAND_ATTACH_FS
+    int busy;                   /* NOTE(review): presumably nonzero while the chain is in exclusive use -- confirm */
+    int cacheCheck;             /* NOTE(review): presumably compared against Volume.chainCacheCheck -- confirm */
+
+    /* per-chain statistics */
+    afs_uint64 looks;
+    afs_uint64 gets;
+    afs_uint64 reorders;
+
+    pthread_cond_t chain_busy_cv; /* NOTE(review): presumably waited on while busy is set -- confirm */
+#endif /* AFS_DEMAND_ATTACH_FS */
+} VolumeHashChainHead;
+
+typedef struct VolumeHashTable {
+    int Size;                       /* NOTE(review): presumably the number of chains in Table -- confirm */
+    int Mask;                       /* NOTE(review): presumably Size-1, masking a volume id to a chain index -- confirm */
+    VolumeHashChainHead * Table;    /* pointer to the chain heads */
+} VolumeHashTable_t;
+extern VolumeHashTable_t VolumeHashTable; /* single global table; defined outside this header */
+
+struct VolumeHashChainStats {       /* per-chain statistics record; chain_* names mirror VolumeHashChainHead */
+    afs_int32 table_size;
+    afs_int32 chain_len;
+#ifdef AFS_DEMAND_ATTACH_FS
+    afs_int32 chain_cacheCheck;
+    afs_int32 chain_busy;
+    afs_uint64 chain_looks;
+    afs_uint64 chain_gets;
+    afs_uint64 chain_reorders;
+#endif /* AFS_DEMAND_ATTACH_FS */
+};
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* demand attach fs
+ * extended per-volume statistics 
+ *
+ * please note that this structure lives across the entire
+ * lifetime of the fileserver process
+ */
+typedef struct VolumeStats {
+    /* counters (the last four are only 16 bits wide, unlike the 64-bit VolPkgStats totals) */
+    afs_uint64 hash_lookups;         /* hash table lookups */
+    afs_uint64 hash_short_circuits;  /* short circuited hash lookups (due to cacheCheck) */
+    afs_uint64 hdr_loads;            /* header loads from disk */
+    afs_uint64 hdr_gets;             /* header pulls out of LRU */
+    afs_uint16 attaches;             /* attaches of this volume since fileserver start */
+    afs_uint16 soft_detaches;        /* soft detaches of this volume */
+    afs_uint16 salvages;             /* online salvages since fileserver start */
+    afs_uint16 vol_ops;              /* volume operations since fileserver start */
+
+    /* timestamps (unix time held in 32 bits) */
+    afs_uint32 last_attach;      /* unix timestamp of last VAttach */
+    afs_uint32 last_get;         /* unix timestamp of last VGet/VHold */
+    afs_uint32 last_promote;     /* unix timestamp of last VLRU promote/demote */
+    afs_uint32 last_hdr_get;     /* unix timestamp of last GetVolumeHeader() */
+    afs_uint32 last_salvage;     /* unix timestamp of last initiation of an online salvage */
+    afs_uint32 last_salvage_req; /* unix timestamp of last SALVSYNC request */
+    afs_uint32 last_vol_op;      /* unix timestamp of last volume operation */
+} VolumeStats;
+
+/* demand attach fs
+ * online salvager state */
+typedef struct VolumeOnlineSalvage {
+    afs_uint32 prio;            /* number of VGetVolume's since salvage requested */
+    int reason;                 /* reason for requesting online salvage */
+    byte requested;             /* flag specifying that salvage should be scheduled */
+    byte scheduled;             /* flag specifying whether online salvage scheduled */
+    byte reserved[2];           /* padding */
+} VolumeOnlineSalvage;
+
+/* demand attach fs
+ * volume LRU state */
+typedef struct VolumeVLRUState {
+    struct rx_queue lru;        /* VLRU queue pointers */
+    int idx;                    /* VLRU generation index; presumably one of the VLRU_QUEUE_* ids -- TODO confirm */
+} VolumeVLRUState;
+
+typedef afs_uint16 VolState;    /* attachment state type */
+#endif /* AFS_DEMAND_ATTACH_FS */
+
 typedef struct Volume {
-    struct Volume *hashNext;	/* Next in hash resolution table */
+    struct rx_queue q;          /* Volume hash chain pointers */
     VolumeId hashid;		/* Volume number -- for hash table lookup */
     struct volHeader *header;	/* Cached disk data */
     Device device;		/* Unix device for the volume */
@@ -339,10 +541,23 @@ typedef struct Volume {
     afs_uint32 updateTime;	/* Time that this volume was put on the updated
 				 * volume list--the list of volumes that will be
 				 * salvaged should the file server crash */
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState attach_state;      /* what stage of attachment has been completed */
+    afs_uint16 attach_flags;    /* flags related to attachment state */
+    pthread_cond_t attach_cv;   /* state change condition variable */
+    short nWaiters;             /* volume package internal ref count */
+    int chainCacheCheck;        /* Volume hash chain cache check */
+    struct rx_queue vol_list;   /* per-partition volume list (VByPList) */
+
+    VolumeOnlineSalvage salvage;  /* online salvager state */
+    VolumeStats stats;            /* per-volume statistics */
+    VolumeVLRUState vlru;         /* state specific to the VLRU */
+    FSSYNC_VolOp_info * pending_vol_op;  /* fssync command info for any pending vol ops */
+#endif /* AFS_DEMAND_ATTACH_FS */
 } Volume;
 
 struct volHeader {
-    struct volHeader *prev, *next;	/* LRU pointers */
+    struct rx_queue lru;	/* LRU pointers */
     VolumeDiskData diskstuff;	/* General volume info read from disk */
     Volume *back;		/* back pointer to current volume structure */
 };
@@ -356,6 +571,11 @@ struct volHeader {
 #define V_vnodeIndex(vp)	((vp)->vnodeIndex)
 #define V_nextVnodeUnique(vp)	((vp)->nextVnodeUnique)
 #define V_linkHandle(vp)	((vp)->linkHandle)
+#ifdef AFS_DEMAND_ATTACH_FS
+#define V_attachState(vp)       ((vp)->attach_state)
+#define V_attachFlags(vp)       ((vp)->attach_flags)
+#define V_attachCV(vp)          ((vp)->attach_cv)
+#endif /* AFS_DEMAND_ATTACH_FS */
 
 /* N.B. V_id must be this, rather than vp->id, or some programs will break, probably */
 #define V_stamp(vp)		((vp)->header->diskstuff.stamp)
@@ -414,7 +634,7 @@ struct volHeader {
 
 extern char *VSalvageMessage;	/* Canonical message when a volume is forced
 				 * offline */
-extern Volume *VGetVolume(Error * ec, VolId volumeId);
+extern Volume *VGetVolume(Error * ec, Error * client_ec, VolId volumeId);
 extern Volume *VGetVolume_r(Error * ec, VolId volumeId);
 extern void VPutVolume(Volume *);
 extern void VPutVolume_r(Volume *);
@@ -422,6 +642,9 @@ extern void VOffline(Volume * vp, char *message);
 extern void VOffline_r(Volume * vp, char *message);
 extern int VConnectFS(void);
 extern int VConnectFS_r(void);
+extern void VDisconnectFS(void);
+extern void VDisconnectFS_r(void);
+extern int VChildProcReconnectFS(void);
 extern Volume *VAttachVolume(Error * ec, VolumeId volumeId, int mode);
 extern Volume *VAttachVolume_r(Error * ec, VolumeId volumeId, int mode);
 extern Volume *VCreateVolume(Error * ec, char *partname, VolId volumeId,
@@ -431,7 +654,7 @@ extern Volume *VCreateVolume_r(Error * ec, char *partname, VolId volumeId,
 extern VnodeId VAllocBitmapEntry(Error * ec, Volume * vp,
 				 struct vnodeIndex *index);
 extern VnodeId VAllocBitmapEntry_r(Error * ec, Volume * vp,
-				   struct vnodeIndex *index);
+				   struct vnodeIndex *index, int flags);
 extern void VFreeBitMapEntry(Error * ec, register struct vnodeIndex *index,
 			     unsigned bitNumber);
 extern void VFreeBitMapEntry_r(Error * ec, register struct vnodeIndex *index,
@@ -444,13 +667,13 @@ extern Volume *VAttachVolumeByName_r(Error * ec, char *partition, char *name,
 				     int mode);
 extern void VShutdown(void);
 extern void VUpdateVolume(Error * ec, Volume * vp);
-extern void VUpdateVolume_r(Error * ec, Volume * vp);
+extern void VUpdateVolume_r(Error * ec, Volume * vp, int flags);
 extern void VAddToVolumeUpdateList(Error * ec, Volume * vp);
 extern void VAddToVolumeUpdateList_r(Error * ec, Volume * vp);
 extern void VDetachVolume(Error * ec, Volume * vp);
 extern void VDetachVolume_r(Error * ec, Volume * vp);
 extern void VForceOffline(Volume * vp);
-extern void VForceOffline_r(Volume * vp);
+extern void VForceOffline_r(Volume * vp, int flags);
 extern void VBumpVolumeUsage(register Volume * vp);
 extern void VBumpVolumeUsage_r(register Volume * vp);
 extern void VSetDiskUsage(void);
@@ -459,12 +682,41 @@ extern void VReleaseVnodeFiles_r(Volume * vp);
 extern void VCloseVnodeFiles_r(Volume * vp);
 extern struct DiskPartition *VGetPartition(char *name, int abortp);
 extern struct DiskPartition *VGetPartition_r(char *name, int abortp);
-extern int VInitVolumePackage(ProgramType pt, int nLargeVnodes,
-			      int nSmallVnodes, int connect, int volcache);
+extern int VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes,
+			      afs_uint32 nSmallVnodes, int connect, afs_uint32 volcache);
 extern void DiskToVolumeHeader(VolumeHeader_t * h, VolumeDiskHeader_t * dh);
 extern void VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h);
 extern void VTakeOffline_r(register Volume * vp);
 extern void VTakeOffline(register Volume * vp);
+extern Volume * VLookupVolume_r(Error * ec, VolId volumeId, Volume * hint);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+extern Volume *VPreAttachVolumeByName(Error * ec, char *partition, char *name, 
+				       int mode);
+extern Volume *VPreAttachVolumeByName_r(Error * ec, char *partition, char *name,
+				     int mode);
+extern Volume *VPreAttachVolumeById_r(Error * ec, struct DiskPartition * partp, 
+				      Volume * vp, int volume_id);
+extern Volume *VGetVolumeByVp_r(Error * ec, Volume * vp);
+extern int VShutdownByPartition_r(struct DiskPartition * dp);
+extern int VShutdownVolume_r(Volume * vp);
+extern int VConnectSALV(void);
+extern int VConnectSALV_r(void);
+extern int VReconnectSALV(void);
+extern int VReconnectSALV_r(void);
+extern int VDisconnectSALV(void);
+extern int VDisconnectSALV_r(void);
+extern void VPrintExtendedCacheStats(int flags);
+extern void VPrintExtendedCacheStats_r(int flags);
+extern VolState VChangeState_r(Volume * vp, VolState new_state);
+extern void VLRU_SetOptions(int option, afs_uint32 val);
+extern int VSetVolHashSize(int logsize);
+extern int VRequestSalvage_r(Volume * vp, int reason, int flags);
+extern int VRegisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo);
+extern int VDeregisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo);
+#endif /* AFS_DEMAND_ATTACH_FS */
+extern int VVolOpLeaveOnline_r(Volume * vp, FSSYNC_VolOp_info * vopinfo);
+extern int VVolOpSetVBusy_r(Volume * vp, FSSYNC_VolOp_info * vopinfo);
 
 
 /* Naive formula relating number of file size to number of 1K blocks in file */
@@ -500,6 +752,26 @@ extern void VTakeOffline(register Volume * vp);
 				 * getting the most recent data. */
 
 
+
+/* VUpdateVolume_r flags */
+#define VOL_UPDATE_WAIT          0x1  /* for demand attach, wait for other exclusive ops to end */
+#define VOL_UPDATE_NOFORCEOFF    0x2  /* don't force offline on failure. this is to prevent
+				       * infinite recursion between vupdate and vforceoff */
+
+/* VForceOffline_r flags */
+#define VOL_FORCEOFF_NOUPDATE    0x1  /* don't force update on forceoff. this is to prevent
+				       * infinite recursion between vupdate and vforceoff */
+
+/* VSyncVolume_r flags */
+#define VOL_SYNC_WAIT            0x1  /* for demand attach, wait for other exclusive ops to end */
+
+/* VAllocBitmapEntry_r flags */
+#define VOL_ALLOC_BITMAP_WAIT    0x1  /* for demand attach, wait for other exclusive ops to end */
+
+/* VRequestSalvage_r flags */
+#define VOL_SALVAGE_INVALIDATE_HEADER 0x1 /* for demand attach fs, invalidate volume header cache */
+
+
 #if	defined(NEARINODE_HINT)
 #define V_pref(vp,nearInode)  nearInodeHash(V_id(vp),(nearInode)); (nearInode) %= V_partition(vp)->f_files
 #else
diff --git a/src/volser/NTMakefile b/src/volser/NTMakefile
index 5e6fa35e93..ded4d73634 100644
--- a/src/volser/NTMakefile
+++ b/src/volser/NTMakefile
@@ -5,6 +5,8 @@
 # License.  For details, see the LICENSE file in the top-level source
 # directory or online at http://www.openafs.org/dl/license10.html
 
+AFSDEV_AUXCDEFINES = -DFSSYNC_BUILD_CLIENT
+
 RELDIR=volser
 !INCLUDE ..\config\NTMakefile.$(SYS_NAME)
 !INCLUDE ..\config\NTMakefile.version
diff --git a/src/volser/dumpstuff.c b/src/volser/dumpstuff.c
index 911c35ae44..fc16c52793 100644
--- a/src/volser/dumpstuff.c
+++ b/src/volser/dumpstuff.c
@@ -51,6 +51,7 @@ RCSID
 #include <afs/volume.h>
 #include <afs/partition.h>
 #include "dump.h"
+#include <afs/daemon_com.h>
 #include <afs/fssync.h>
 #include <afs/acl.h>
 #include "volser.h"
diff --git a/src/volser/volprocs.c b/src/volser/volprocs.c
index ae1664fd27..5bba7c10f5 100644
--- a/src/volser/volprocs.c
+++ b/src/volser/volprocs.c
@@ -61,6 +61,7 @@ RCSID
 #include <afs/volume.h>
 #include <afs/partition.h>
 #include "vol.h"
+#include <afs/daemon_com.h>
 #include <afs/fssync.h>
 #include <afs/acl.h>
 #include "afs/audit.h"
@@ -844,7 +845,7 @@ VolReClone(struct rx_call *acid, afs_int32 atrans, afs_int32 cloneId)
 
     {
 	struct DiskPartition *tpartp = originalvp->partition;
-	FSYNC_askfs(cloneId, tpartp->name, FSYNC_RESTOREVOLUME, 0);
+	FSYNC_VolOp(cloneId, tpartp->name, FSYNC_VOL_BREAKCBKS, 0, NULL);
     }
     return 0;
 
@@ -1355,8 +1356,7 @@ VolRestore(struct rx_call *acid, afs_int32 atrans, afs_int32 aflags,
     DFlushVolume(V_parentId(tt->volume)); /* Ensure dir buffers get dropped */
 
     code = RestoreVolume(acid, tt->volume, (aflags & 1), cookie);	/* last is incrementalp */
-    FSYNC_askfs(tt->volid, NULL, FSYNC_RESTOREVOLUME, 0l);	/*break call backs on the
-								 * restored volume */
+    FSYNC_VolOp(tt->volid, NULL, FSYNC_VOL_BREAKCBKS, 0l, NULL);
     tt->rxCallPtr = (struct rx_call *)0;
     tcode = TRELE(tt);
 
@@ -1422,7 +1422,7 @@ VolSetForwarding(struct rx_call *acid, afs_int32 atid, afs_int32 anewsite)
     }
     strcpy(tt->lastProcName, "SetForwarding");
     tt->rxCallPtr = acid;
-    FSYNC_askfs(tt->volid, NULL, FSYNC_MOVEVOLUME, anewsite);
+    FSYNC_VolOp(tt->volid, NULL, FSYNC_VOL_MOVE, anewsite, NULL);
     tt->rxCallPtr = (struct rx_call *)0;
     if (TRELE(tt))
 	return VOLSERTRELE_ERROR;
@@ -1672,6 +1672,9 @@ XVolListPartitions(struct rx_call *acid, struct partEntries *pEntries)
 
     /* Only report attached partitions */
     for (i = 0; i < VOLMAXPARTS; i++) {
+#ifdef AFS_DEMAND_ATTACH_FS
+	dp = VGetPartitionById(i, 0);
+#else
 	if (i < 26) {
 	    namehead[6] = i + 'a';
 	    namehead[7] = '\0';
@@ -1682,6 +1685,7 @@ XVolListPartitions(struct rx_call *acid, struct partEntries *pEntries)
 	    namehead[8] = '\0';
 	}
 	dp = VGetPartition(namehead, 0);
+#endif
 	if (dp)
 	    partList.partId[j++] = i;
     }
@@ -1792,7 +1796,7 @@ VolListOneVolume(struct rx_call *acid, afs_int32 partid, afs_int32
 		pntr->volid = volid;
 		goto drop;
 	    }
-	    tv = VAttachVolumeByName(&error, pname, volname, V_READONLY);
+	    tv = VAttachVolumeByName(&error, pname, volname, V_PEEK);
 	    if (error) {
 		pntr->status = 0;	/*things are messed up */
 		strcpy(pntr->name, volname);
@@ -2007,7 +2011,7 @@ VolXListOneVolume(struct rx_call *a_rxCidP, afs_int32 a_partID,
 	    /*
 	     * Attach the volume, give up on the volume if we can't.
 	     */
-	    tv = VAttachVolumeByName(&error, pname, volname, V_READONLY);
+	    tv = VAttachVolumeByName(&error, pname, volname, V_PEEK);
 	    if (error) {
 		xInfoP->status = 0;	/*things are messed up */
 		strcpy(xInfoP->name, volname);
@@ -2819,7 +2823,7 @@ SAFSVolConvertROtoRWvolume(struct rx_call *acid, afs_int32 partId,
 	return EIO;
     }
     close(fd);
-    FSYNC_askfs(volumeId, pname, FSYNC_RESTOREVOLUME, 0);
+    FSYNC_VolOp(volumeId, pname, FSYNC_VOL_BREAKCBKS, 0, NULL);
 
     for (dp = DiskPartitionList; dp && strcmp(dp->name, pname);
 	 dp = dp->next);
@@ -2854,8 +2858,8 @@ SAFSVolConvertROtoRWvolume(struct rx_call *acid, afs_int32 partId,
     if (unlink(opath) < 0) {
 	Log("1 SAFS_VolConvertROtoRWvolume: Couldn't unlink RO header, error = %d\n", error);
     }
-    FSYNC_askfs(volumeId, pname, FSYNC_DONE, 0);
-    FSYNC_askfs(h.id, pname, FSYNC_ON, 0);
+    FSYNC_VolOp(volumeId, pname, FSYNC_VOL_DONE, 0, NULL);
+    FSYNC_VolOp(h.id, pname, FSYNC_VOL_ON, 0, NULL);
     return 0;
 #else /* AFS_NAMEI_ENV */
     return EINVAL;
diff --git a/src/volser/volser.p.h b/src/volser/volser.p.h
index 9e5b015c7c..e0111f0e4a 100644
--- a/src/volser/volser.p.h
+++ b/src/volser/volser.p.h
@@ -15,6 +15,8 @@
 #include <pthread.h>
 #endif
 
+#include <afs/voldefs.h>
+
 /* vflags, representing state of the volume */
 #define	VTDeleteOnSalvage	1	/* delete on next salvage */
 #define	VTOutOfService		2	/* never put this volume online */
@@ -110,7 +112,6 @@ extern struct volser_trans *QI_GlobalWriteTrans;
 #define INVALID_BID 0
 #define VOLSER_MAXVOLNAME 65
 #define VOLSER_OLDMAXVOLNAME 32
-#define	VOLMAXPARTS	255
 
 /*flags used for interfacing with the  backup system */
 struct volDescription {		/*used for interfacing with the backup system */