diff --git a/doc/man-pages/pod8/fragments/fileserver-options.pod b/doc/man-pages/pod8/fragments/fileserver-options.pod index 0215a5a47c..89cad7da02 100644 --- a/doc/man-pages/pod8/fragments/fileserver-options.pod +++ b/doc/man-pages/pod8/fragments/fileserver-options.pod @@ -296,3 +296,108 @@ consequences. This option is not supported on platforms other than AIX. Prevents any portion of the fileserver binary from being paged (swapped) out of memory on a file server machine running the IRIX operating system. This option is not supported on platforms other than IRIX. + +=item B<-sync> + +This option changes how hard the fileserver tries to ensure that data written +to volumes actually hits the physical disk. + +Normally, when the fileserver writes to disk, the underlying filesystem or +Operating System may delay writes from actually going to disk, and reorder +which writes hit the disk first. So, during an unclean shutdown of the machine +(if the power goes out, or the machine crashes, etc), or if the physical disk +backing store becomes unavailable, file data may become lost that the server +previously told clients was already successfully written. + +To try to mitigate this, the fileserver will try to "sync" file data to the +physical disk at numerous points during various I/O. However, this can result +in significantly reduced performance. Depending on the usage patterns, this may +or may not be acceptable. This option dictates specifically what the fileserver +does when it wants to perform a "sync". + +There are several options; pass one of these as the argument to -sync. The +default is C. + +=over 4 + +=item always + +This causes a sync operation to always sync immediately and synchronously. +This is the slowest option that provides the greatest protection against data +loss in the event of a crash or backing store unavailability. + +Note that this is still not a 100% guarantee that data will not be lost or +corrupted during a crash. The underlying filesystem itself may cause data to +be lost or corrupt in such a situation. And OpenAFS itself does not (yet) even +guarantee that all data is consistent at any point in time; so even if the +filesystem and OS do not buffer or reorder any writes, you are not guaranteed +that all data will be okay after a crash. + +This option may be appropriate if you have reason to believe a server is prone +to data loss failures, such as if the server encounters frequent power failures +or connectivity issues with network attached storage. Or if the backend storage +is temporarily degraded in some way (for example, a battery on a caching +controller fails), it may make sense to temporarily use the C option +until the situation is fixed. Some servers may also allow for sync operations +to occur very quickly, such that the C option is not noticeably slower +than any other option. In such a case, there is no downside to specifying +C. + +This was the only behavior allowed in OpenAFS releases prior to 1.4.5. + +=item delayed + +This causes a sync to do nothing immediately, but the sync will happen sometime +in the background, within approximately the next 10 seconds. This works by +having a separate thread that goes through all open file handles every 10 +seconds, and it syncs the ones that have been marked as needing a sync. File +handles flagged for sync may also get synced on volume detachment, according to +the same behavior as with the C option. + +This option is currently not recommended, since in the past the code +implementing this option has caused rare data corruption during normal +operation. However, it is currently the default option to allow consistent +behavior from previous OpenAFS releases. + +This was the only behavior allowed in OpenAFS releases starting from 1.4.5 up +to and including 1.6.2. It is the default starting in OpenAFS 1.6.3. This +option will be removed in a future version of OpenAFS, and the default behavior +will likely change to the C behavior. + +=item onclose + +This causes a sync to do nothing immediately, but causes the relevant file to +be flagged as potentially needing a sync. When a volume is detached, flagged +volume metadata files are synced, as well as data files that have been accessed +recently. Events that cause a volume to detach include: performing certain +volume operations (restore, salvage, offline, et al), detection of volume +consistency errors, a clean shutdown of the fileserver, or during DAFS "soft +detachment". + +Effectively this option is the same as C while a volume is attached and +actively being used, but if a volume is detached, there is an additional +guarantee for the data's consistency. + +=item never + +This causes all syncs to never do anything. This is the fastest option, with +the weakest guarantees for data consistency. + +Depending on the underlying filesystem and Operating System, there may be +guarantees that any data written to disk will hit the physical media after a +certain amount of time. For example, Linux's pdflush process usually makes this +guarantee, and ext3 can make certain various consistency guarantees according +to the options given. ZFS on Solaris can also provide similar guarantees, as +can various other platforms and filesystems. Consult the documentation for +your platform if you are unsure. + +=back + +Which option you choose is not an easy decision to make. Various developers and +experts sometimes disagree on which option is the most reasonable, and it may +depend on the specific scenario and workload involved. Some argue that the +C option does not provide significantly greater guarantees over any +other option, whereas others argue that choosing anything besides the C +option allows for an unacceptable risk of data loss. This may depend on your +usage patterns, your hardware, your platform and filesystem, and who you talk +to about this topic. diff --git a/doc/man-pages/pod8/fragments/fileserver-synopsis.pod b/doc/man-pages/pod8/fragments/fileserver-synopsis.pod index eb4c846680..ba20687a5b 100644 --- a/doc/man-pages/pod8/fragments/fileserver-synopsis.pod +++ b/doc/man-pages/pod8/fragments/fileserver-synopsis.pod @@ -45,3 +45,4 @@ B S<<< [B<-vattachpar> >] >>> S<<< [B<-m> >] >>> S<<< [B<-lock>] >>> + S<<< [B<-sync> >] >>> diff --git a/doc/man-pages/pod8/fragments/volserver-options.pod b/doc/man-pages/pod8/fragments/volserver-options.pod index 3c61666ee3..a2b98e25be 100644 --- a/doc/man-pages/pod8/fragments/volserver-options.pod +++ b/doc/man-pages/pod8/fragments/volserver-options.pod @@ -74,6 +74,11 @@ user.admin PTS entry. Sites whose Kerberos realms don't have these collisions between principal names may disable this check by starting the server with this option. +=item B<-sync> > + +This is the same as the B<-sync> option in L. See +L. + =item B<-help> Prints the online help for this command. All other valid options are diff --git a/doc/man-pages/pod8/fragments/volserver-synopsis.pod b/doc/man-pages/pod8/fragments/volserver-synopsis.pod index 03ca28a3ef..d5fca73534 100644 --- a/doc/man-pages/pod8/fragments/volserver-synopsis.pod +++ b/doc/man-pages/pod8/fragments/volserver-synopsis.pod @@ -6,3 +6,4 @@ B [B<-nojumbo>] [B<-jumbo>] [B<-enable_peer_stats>] [B<-enable_process_stats>] [B<-allow-dotted-principals>] [B<-help>] + [B<-sync> >] diff --git a/src/viced/viced.c b/src/viced/viced.c index e580c17ecd..7173b10c2b 100644 --- a/src/viced/viced.c +++ b/src/viced/viced.c @@ -964,6 +964,7 @@ FlagMsg(void) fputs("[-abortthreshold ] ", stdout); fputs("[-nojumbo (disable jumbogram network packets - deprecated)] ", stdout); fputs("[-jumbo (enable jumbogram network packets)] ", stdout); + fputs("[-sync ]", stdout); /* fputs("[-enable_peer_stats] ", stdout); */ /* fputs("[-enable_process_stats] ", stdout); */ fputs("[-help]\n", stdout); @@ -1436,6 +1437,16 @@ ParseArgs(int argc, char *argv[]) else if (strcmp(argv[i], "-saneacls") == 0) { saneacls = 1; } + else if (strcmp(argv[i], "-sync") == 0) { + if ((i + 1) >= argc) { + printf("You have to specify -sync \n"); + return -1; + } + if (ih_SetSyncBehavior(argv[++i])) { + printf("Invalid -sync value %s\n", argv[i]); + return -1; + } + } else { return (-1); } diff --git a/src/vol/ihandle.c b/src/vol/ihandle.c index 4ae1441a70..f8db2cd152 100644 --- a/src/vol/ihandle.c +++ b/src/vol/ihandle.c @@ -114,6 +114,34 @@ void ih_PkgDefaults(void) /* fd cache size that will be used if/when ih_UseLargeCache() * is called */ vol_io_params.fd_max_cachesize = FD_MAX_CACHESIZE; + + vol_io_params.sync_behavior = IH_SYNC_DELAYED; +} + +int +ih_SetSyncBehavior(const char *behavior) +{ + int val; + + if (strcmp(behavior, "always") == 0) { + val = IH_SYNC_ALWAYS; + + } else if (strcmp(behavior, "delayed") == 0) { + val = IH_SYNC_DELAYED; + + } else if (strcmp(behavior, "onclose") == 0) { + val = IH_SYNC_ONCLOSE; + + } else if (strcmp(behavior, "never") == 0) { + val = IH_SYNC_NEVER; + + } else { + /* invalid behavior name */ + return -1; + } + + vol_io_params.sync_behavior = val; + return 0; } #ifdef AFS_PTHREAD_ENV @@ -173,7 +201,7 @@ ih_Initialize(void) #endif fdCacheSize = MIN(fdMaxCacheSize, vol_io_params.fd_initial_cachesize); - { + if (vol_io_params.sync_behavior == IH_SYNC_DELAYED) { #ifdef AFS_PTHREAD_ENV pthread_t syncer; pthread_attr_t tattr; @@ -886,6 +914,8 @@ ih_reallyclose(IHandle_t * ihP) ihP->ih_refcnt++; /* must not disappear over unlock */ if (ihP->ih_synced) { FdHandle_t *fdP; + osi_Assert(vol_io_params.sync_behavior != IH_SYNC_ALWAYS); + osi_Assert(vol_io_params.sync_behavior != IH_SYNC_NEVER); ihP->ih_synced = 0; IH_UNLOCK; @@ -1104,3 +1134,23 @@ ih_isunlinked(int fd) return 0; } #endif /* !AFS_NT40_ENV */ + +int +ih_fdsync(FdHandle_t *fdP) +{ + switch (vol_io_params.sync_behavior) { + case IH_SYNC_ALWAYS: + return OS_SYNC(fdP->fd_fd); + case IH_SYNC_DELAYED: + case IH_SYNC_ONCLOSE: + if (fdP->fd_ih) { + fdP->fd_ih->ih_synced = 1; + return 0; + } + return 1; + case IH_SYNC_NEVER: + return 0; + default: + osi_Assert(0); + } +} diff --git a/src/vol/ihandle.h b/src/vol/ihandle.h index cb6adcf37c..718d9ae427 100644 --- a/src/vol/ihandle.h +++ b/src/vol/ihandle.h @@ -200,6 +200,26 @@ typedef struct StreamHandle_s { #define FD_HANDLE_MALLOCSIZE ((size_t)((4096/sizeof(FdHandle_t)))) #define STREAM_HANDLE_MALLOCSIZE 1 +/* Possible values for the vol_io_params.sync_behavior option. + * These dictate what actually happens when you call FDH_SYNC or IH_CONDSYNC. */ +#define IH_SYNC_ALWAYS (1) /* This makes FDH_SYNCs do what you'd probably + * expect: a synchronous fsync() */ +#define IH_SYNC_ONCLOSE (2) /* This makes FDH_SYNCs just flag the ih as "I + * need to sync", and does not perform the actual + * fsync() until we IH_REALLYCLOSE. This provides a + * little assurance over IH_SYNC_NEVER when a volume + * has gone offline, and a few other situations. */ +#define IH_SYNC_NEVER (3) /* This makes FDH_SYNCs do nothing. Faster, but + * obviously less durable. The OS may ensure that + * our data hits the disk eventually, depending on + * the platform and various OS-specific tuning + * parameters. */ +#define IH_SYNC_DELAYED (4) /* This makes FDH_SYNCs set a flag in the ih that + * says "I need to sync". And in a separate thread, + * ih_sync_thread finds all IHs that have this + * flag set, and it syncs them. Such IHs are also + * synced when closed, as in IH_SYNC_ONCLOSE. */ + /* READ THIS. * @@ -218,8 +238,9 @@ typedef struct ih_init_params afs_uint32 fd_handle_setaside; /* for non-cached i/o, trad. was 128 */ afs_uint32 fd_initial_cachesize; /* what was 'default' */ afs_uint32 fd_max_cachesize; /* max open files if large-cache activated */ -} ih_init_params; + int sync_behavior; /* one of the IH_SYNC_* constants */ +} ih_init_params; /* Number of file descriptors needed for non-cached I/O */ #define FD_HANDLE_SETASIDE 128 /* Match to MAX_FILESERVER_THREAD */ @@ -301,6 +322,7 @@ extern FILE *ih_fdopen(FdHandle_t * h, char *fdperms); extern void ih_PkgDefaults(void); extern void ih_Initialize(void); extern void ih_UseLargeCache(void); +extern int ih_SetSyncBehavior(const char *behavior); extern IHandle_t *ih_init(int /*@alt Device@ */ dev, int /*@alt VolId@ */ vid, Inode ino); extern IHandle_t *ih_copy(IHandle_t * ihP); @@ -550,11 +572,13 @@ extern afs_sfsize_t ih_size(FD_t); #define FDH_WRITE(H, B, S) OS_WRITE((H)->fd_fd, B, S) #define FDH_SEEK(H, O, F) OS_SEEK((H)->fd_fd, O, F) -#define FDH_SYNC(H) ((H->fd_ih!=NULL) ? ( H->fd_ih->ih_synced = 1) - 1 : 1) +#define FDH_SYNC(H) ih_fdsync(H) #define FDH_TRUNC(H, L) OS_TRUNC((H)->fd_fd, L) #define FDH_SIZE(H) OS_SIZE((H)->fd_fd) #define FDH_LOCKFILE(H, O) OS_LOCKFILE((H)->fd_fd, O) #define FDH_UNLOCKFILE(H, O) OS_UNLOCKFILE((H)->fd_fd, O) #define FDH_ISUNLINKED(H) OS_ISUNLINKED((H)->fd_fd) +extern int ih_fdsync(FdHandle_t *fdP); + #endif /* _IHANDLE_H_ */ diff --git a/src/vol/vol-salvage.c b/src/vol/vol-salvage.c index 152effcd81..55afa95b76 100644 --- a/src/vol/vol-salvage.c +++ b/src/vol/vol-salvage.c @@ -2985,17 +2985,8 @@ CopyAndSalvage(struct SalvInfo *salvinfo, struct DirSummary *dir) vnodeIndexOffset(vcp, dir->vnodeNumber), (char *)&vnode, sizeof(vnode)); osi_Assert(lcode == sizeof(vnode)); -#if 0 -#ifdef AFS_NT40_ENV - nt_sync(salvinfo->fileSysDevice); -#else - sync(); /* this is slow, but hopefully rarely called. We don't have - * an open FD on the file itself to fsync. - */ -#endif -#else - salvinfo->vnodeInfo[vLarge].handle->ih_synced = 1; -#endif + IH_CONDSYNC(salvinfo->vnodeInfo[vLarge].handle); + /* make sure old directory file is really closed */ fdP = IH_OPEN(dir->dirHandle.dirh_handle); FDH_REALLYCLOSE(fdP); diff --git a/src/volser/volmain.c b/src/volser/volmain.c index 52e9b043e2..ea0af1bd54 100644 --- a/src/volser/volmain.c +++ b/src/volser/volmain.c @@ -374,6 +374,16 @@ main(int argc, char **argv) rx_enablePeerRPCStats(); } else if (strcmp(argv[code], "-enable_process_stats") == 0) { rx_enableProcessRPCStats(); + } else if (strcmp(argv[code], "-sync") == 0) { + if ((code + 1) >= argc) { + printf("You have to specify -sync \n"); + exit(1); + } + ih_PkgDefaults(); + if (ih_SetSyncBehavior(argv[++code])) { + printf("Invalid -sync value %s\n", argv[code]); + exit(1); + } } #ifndef AFS_NT40_ENV else if (strcmp(argv[code], "-syslog") == 0) { @@ -394,6 +404,7 @@ main(int argc, char **argv) "[-udpsize ] " "[-syslog[=FACILITY]] " "[-enable_peer_stats] [-enable_process_stats] " + "[-sync ] " "[-help]\n"); #else printf("Usage: volserver [-log] [-p ] " @@ -401,6 +412,7 @@ main(int argc, char **argv) "[-nojumbo] [-jumbo] [-rxmaxmtu ] [-rxbind] [-allow-dotted-principals] " "[-udpsize ] " "[-enable_peer_stats] [-enable_process_stats] " + "[-sync ] " "[-help]\n"); #endif VS_EXIT(1);