From 154bbe416cea47360e9b50a57dd1f7e6705ebab0 Mon Sep 17 00:00:00 2001
From: "Christian S.J. Peron"
Date: Tue, 16 Sep 2008 20:32:29 +0000
Subject: [PATCH] Implement zero-copy bpf(4) buffer or "zbuf" support for libpcap.

A slightly different version has been committed upstream in the libpcap
vendor branch.  This will allow people to experiment with zero-copy bpf(4)
without requiring external patches.

Note that to enable this functionality you must set:

  sysctl net.bpf.zerocopy_enable=1

By default, libpcap will use the legacy buffering method unless this sysctl
variable is set to 1.  For details about the zero-copy bpf(4) implementation,
see svn change r177548.

Requested by:		many
Discussed with:		sam
In collaboration with:	rwatson
---
 contrib/libpcap/pcap-bpf.c | 346 +++++++++++++++++++++++++++++++++----
 contrib/libpcap/pcap-int.h |  26 ++-
 contrib/libpcap/pcap.c     |  19 ++
 3 files changed, 354 insertions(+), 37 deletions(-)

diff --git a/contrib/libpcap/pcap-bpf.c b/contrib/libpcap/pcap-bpf.c
index 91bfdcb283cd..d398ec728d26 100644
--- a/contrib/libpcap/pcap-bpf.c
+++ b/contrib/libpcap/pcap-bpf.c
@@ -30,6 +30,7 @@ static const char rcsid[] _U_ =
 #endif
 
 #include <sys/param.h>			/* optionally get BSD define */
+#include <sys/mman.h>
 #include 
 #include 
 #include 
@@ -86,6 +87,10 @@ static int odmlockid = 0;
 
 #endif /* _AIX */
 
+#ifdef BIOCSETBUFMODE
+#include <machine/atomic.h>
+#endif
+
 #include 
 #include 
 #include 
@@ -139,6 +144,159 @@ pcap_stats_bpf(pcap_t *p, struct pcap_stat *ps)
 	return (0);
 }
 
+#ifdef BIOCGETBUFMODE
+/*
+ * Zero-copy BPF buffer routines to check for and acknowledge BPF data in
+ * shared memory buffers.
+ *
+ * pcap_next_zbuf_shm(): Check for a newly available shared memory buffer,
+ * and set up p->buffer and cc to reflect one if available.  Notice that if
+ * there was no prior buffer, we select zbuf1 as this will be the first
+ * buffer filled for a fresh BPF session.
+ */
+static int
+pcap_next_zbuf_shm(pcap_t *p, int *cc)
+{
+	struct bpf_zbuf_header *bzh;
+
+	if (p->zbuffer == p->zbuf2 || p->zbuffer == NULL) {
+		bzh = (struct bpf_zbuf_header *)p->zbuf1;
+		if (bzh->bzh_user_gen !=
+		    atomic_load_acq_int(&bzh->bzh_kernel_gen)) {
+			p->bzh = bzh;
+			p->zbuffer = (u_char *)p->zbuf1;
+			p->buffer = p->zbuffer + sizeof(*bzh);
+			*cc = bzh->bzh_kernel_len;
+			return (1);
+		}
+	} else if (p->zbuffer == p->zbuf1) {
+		bzh = (struct bpf_zbuf_header *)p->zbuf2;
+		if (bzh->bzh_user_gen !=
+		    atomic_load_acq_int(&bzh->bzh_kernel_gen)) {
+			p->bzh = bzh;
+			p->zbuffer = (u_char *)p->zbuf2;
+			p->buffer = p->zbuffer + sizeof(*bzh);
+			*cc = bzh->bzh_kernel_len;
+			return (1);
+		}
+	}
+	*cc = 0;
+	return (0);
+}
+
+/*
+ * pcap_next_zbuf() -- Similar to pcap_next_zbuf_shm(), except wait using
+ * select() for data or a timeout, and possibly force rotation of the buffer
+ * in the event we time out or are in immediate mode.  Invoke the shared
+ * memory check before doing system calls in order to avoid doing avoidable
+ * work.
+ */
+static int
+pcap_next_zbuf(pcap_t *p, int *cc)
+{
+	struct bpf_zbuf bz;
+	struct timeval tv;
+	struct timespec cur;
+	fd_set r_set;
+	int data, r;
+	int tmout, expire;
+
+#define TSTOMILLI(ts) (((ts)->tv_sec * 1000) + ((ts)->tv_nsec / 1000000))
+	/*
+	 * Start out by seeing whether anything is waiting by checking the
+	 * next shared memory buffer for data.
+	 */
+	data = pcap_next_zbuf_shm(p, cc);
+	if (data)
+		return (data);
+	/*
+	 * If a previous sleep was interrupted due to signal delivery, make
+	 * sure that the timeout gets adjusted accordingly.  This requires
+	 * that we analyze when the timeout should have expired, and
+	 * subtract the current time from that.  If, after this operation,
+	 * our timeout is less than or equal to zero, handle it like a
+	 * regular timeout.
+	 */
+	tmout = p->to_ms;
+	if (tmout)
+		(void) clock_gettime(CLOCK_MONOTONIC, &cur);
+	if (p->interrupted && p->to_ms) {
+		expire = TSTOMILLI(&p->firstsel) + p->to_ms;
+		tmout = expire - TSTOMILLI(&cur);
+#undef TSTOMILLI
+		if (tmout <= 0) {
+			p->interrupted = 0;
+			data = pcap_next_zbuf_shm(p, cc);
+			if (data)
+				return (data);
+			if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
+				(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+				    "BIOCROTZBUF: %s", strerror(errno));
+				return (-1);
+			}
+			return (pcap_next_zbuf_shm(p, cc));
+		}
+	}
+	/*
+	 * No data in the buffer, so must use select() to wait for data or
+	 * the next timeout.
+	 */
+	FD_ZERO(&r_set);
+	FD_SET(p->fd, &r_set);
+	if (tmout != 0) {
+		tv.tv_sec = tmout / 1000;
+		tv.tv_usec = (tmout * 1000) % 1000000;
+	}
+	r = select(p->fd + 1, &r_set, NULL, NULL, p->to_ms != 0 ? &tv :
+	    NULL);
+	if (r < 0 && errno == EINTR) {
+		if (!p->interrupted && p->to_ms) {
+			p->interrupted = 1;
+			p->firstsel = cur;
+		}
+		return (0);
+	} else if (r < 0) {
+		(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+		    "select: %s", strerror(errno));
+		return (-1);
+	}
+	p->interrupted = 0;
+	/*
+	 * Check again for data, which may exist now that we've either been
+	 * woken up as a result of data or timed out.  Try the "there's data"
+	 * case first since it doesn't require a system call.
+	 */
+	data = pcap_next_zbuf_shm(p, cc);
+	if (data)
+		return (data);
+
+	/*
+	 * Try forcing a buffer rotation to dislodge timed out or immediate
+	 * data.
+	 */
+	if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
+		(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+		    "BIOCROTZBUF: %s", strerror(errno));
+		return (-1);
+	}
+	return (pcap_next_zbuf_shm(p, cc));
+}
+
+/*
+ * Notify kernel that we are done with the buffer.  We don't reset zbuffer so
+ * that we know which buffer to use next time around.
+ */
+static int
+pcap_ack_zbuf(pcap_t *p)
+{
+
+	atomic_store_rel_int(&p->bzh->bzh_user_gen, p->bzh->bzh_kernel_gen);
+	p->bzh = NULL;
+	p->buffer = NULL;
+	return (0);
+}
+#endif
+
 static int
 pcap_read_bpf(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
 {
@@ -147,6 +305,9 @@ pcap_read_bpf(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
 	register u_char *bp, *ep;
 	u_char *datap;
 	struct bpf_insn *fcode;
+#ifdef BIOCSETBUFMODE
+	int i;
+#endif
 #ifdef PCAP_FDDIPAD
 	register int pad;
 #endif
@@ -167,7 +328,27 @@ pcap_read_bpf(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
 	}
 	cc = p->cc;
 	if (p->cc == 0) {
-		cc = read(p->fd, (char *)p->buffer, p->bufsize);
+		/*
+		 * When reading without zero-copy from a file descriptor, we
+		 * use a single buffer and return a length of data in the
+		 * buffer.  With zero-copy, we update the p->buffer pointer
+		 * to point at whatever underlying buffer contains the next
+		 * data and update cc to reflect the data found in the
+		 * buffer.
+		 */
+#ifdef BIOCSETBUFMODE
+		if (p->zerocopy) {
+			if (p->buffer != NULL)
+				pcap_ack_zbuf(p);
+			i = pcap_next_zbuf(p, &cc);
+			if (i == 0)
+				goto again;
+			if (i < 0)
+				return (-1);
+		} else
+#endif
+			cc = read(p->fd, (char *)p->buffer, p->bufsize);
+
 		if (cc < 0) {
 			/* Don't choke when we get ptraced */
 			switch (errno) {
@@ -609,6 +790,10 @@ pcap_open_live(const char *device, int snaplen, int promisc, int to_ms,
 	struct bpf_insn total_insn;
 	struct bpf_program total_prog;
 	struct utsname osinfo;
+#ifdef BIOCSETBUFMODE
+	struct bpf_zbuf bz;
+	u_int bufmode, zbufmax;
+#endif
 
 #ifdef HAVE_DAG_API
 	if (strstr(device, "dag")) {
@@ -646,41 +831,105 @@ pcap_open_live(const char *device, int snaplen, int promisc, int to_ms,
 		goto bad;
 	}
 
+#ifdef BIOCSETBUFMODE
 	/*
-	 * Try finding a good size for the buffer; 32768 may be too
-	 * big, so keep cutting it in half until we find a size
-	 * that works, or run out of sizes to try.  If the default
-	 * is larger, don't make it smaller.
-	 *
-	 * XXX - there should be a user-accessible hook to set the
-	 * initial buffer size.
+	 * If the BPF extension to set buffer mode is present, try setting
+	 * the mode to zero-copy.  If that fails, use regular buffering.  If
+	 * it succeeds but other setup fails, return an error to the user.
 	 */
-	if ((ioctl(fd, BIOCGBLEN, (caddr_t)&v) < 0) || v < 32768)
-		v = 32768;
-	for ( ; v != 0; v >>= 1) {
-		/* Ignore the return value - this is because the call fails
-		 * on BPF systems that don't have kernel malloc.  And if
-		 * the call fails, it's no big deal, we just continue to
-		 * use the standard buffer size.
+	bufmode = BPF_BUFMODE_ZBUF;
+	if (ioctl(fd, BIOCSETBUFMODE, (caddr_t)&bufmode) == 0) {
+		p->zerocopy = 1;
+
+		/*
+		 * How to pick a buffer size: first, query the maximum buffer
+		 * size supported by zero-copy.  This also lets us quickly
+		 * determine whether the kernel generally supports zero-copy.
+		 * Then, query the default buffer size, which reflects kernel
+		 * policy for a desired default.  Round to the nearest page
+		 * size.
		 */
-		(void) ioctl(fd, BIOCSBLEN, (caddr_t)&v);
-
+		if (ioctl(fd, BIOCGETZMAX, (caddr_t)&zbufmax) < 0) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCGETZMAX: %s",
+			    pcap_strerror(errno));
+			goto bad;
+		}
+		if ((ioctl(fd, BIOCGBLEN, (caddr_t)&v) < 0) || v < 32768)
+			v = 32768;
+#ifndef roundup
+#define roundup(x, y)	((((x)+((y)-1))/(y))*(y))  /* to any y */
+#endif
+		p->zbufsize = roundup(v, getpagesize());
+		if (p->zbufsize > zbufmax)
+			p->zbufsize = zbufmax;
+		p->zbuf1 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE,
+		    MAP_ANON, -1, 0);
+		p->zbuf2 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE,
+		    MAP_ANON, -1, 0);
+		if (p->zbuf1 == MAP_FAILED || p->zbuf2 == MAP_FAILED) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "mmap: %s",
+			    pcap_strerror(errno));
+			goto bad;
+		}
+		bzero(&bz, sizeof(bz));
+		bz.bz_bufa = p->zbuf1;
+		bz.bz_bufb = p->zbuf2;
+		bz.bz_buflen = p->zbufsize;
+		if (ioctl(fd, BIOCSETZBUF, (caddr_t)&bz) < 0) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETZBUF: %s",
+			    pcap_strerror(errno));
+			goto bad;
+		}
 		(void)strncpy(ifr.ifr_name, device, sizeof(ifr.ifr_name));
-		if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) >= 0)
-			break;	/* that size worked; we're done */
-
-		if (errno != ENOBUFS) {
+		if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) < 0) {
 			snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETIF: %s: %s",
 			    device, pcap_strerror(errno));
 			goto bad;
 		}
-	}
+		v = p->zbufsize - sizeof(struct bpf_zbuf_header);
+	} else {
+#endif
 
-	if (v == 0) {
-		snprintf(ebuf, PCAP_ERRBUF_SIZE,
-		    "BIOCSBLEN: %s: No buffer size worked", device);
-		goto bad;
+		/*
+		 * Try finding a good size for the buffer; 32768 may be too
+		 * big, so keep cutting it in half until we find a size
+		 * that works, or run out of sizes to try.  If the default
+		 * is larger, don't make it smaller.
+		 *
+		 * XXX - there should be a user-accessible hook to set the
+		 * initial buffer size.
+		 */
+		if ((ioctl(fd, BIOCGBLEN, (caddr_t)&v) < 0) || v < 32768)
+			v = 32768;
+		for ( ; v != 0; v >>= 1) {
+			/* Ignore the return value - this is because the call
+			 * fails on BPF systems that don't have kernel
+			 * malloc.  And if the call fails, it's no big deal,
+			 * we just continue to use the standard buffer size.
+			 */
+			(void) ioctl(fd, BIOCSBLEN, (caddr_t)&v);
+
+			(void)strncpy(ifr.ifr_name, device,
+			    sizeof(ifr.ifr_name));
+			if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) >= 0)
+				break;	/* that size worked; we're done */
+
+			if (errno != ENOBUFS) {
+				snprintf(ebuf, PCAP_ERRBUF_SIZE,
+				    "BIOCSETIF: %s: %s",
+				    device, pcap_strerror(errno));
+				goto bad;
+			}
+		}
+
+		if (v == 0) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE,
+			    "BIOCSBLEN: %s: No buffer size worked", device);
+			goto bad;
+		}
+#ifdef BIOCSETBUFMODE
 	}
+#endif
 
 	/* Get the data link layer type. */
 	if (ioctl(fd, BIOCGDLT, (caddr_t)&v) < 0) {
@@ -855,7 +1104,8 @@ pcap_open_live(const char *device, int snaplen, int promisc, int to_ms,
 	}
 #endif
 	/* set timeout */
-	if (to_ms != 0) {
+	p->to_ms = to_ms;
+	if (to_ms != 0 && !p->zerocopy) {
 		/*
 		 * XXX - is this seconds/nanoseconds in AIX?
		 * (Treating it as such doesn't fix the timeout
@@ -870,6 +1120,9 @@ pcap_open_live(const char *device, int snaplen, int promisc, int to_ms,
 			goto bad;
 		}
 	}
+#ifdef BIOCSETBUFMODE
+	p->timeout = to_ms;
+#endif
 
 #ifdef _AIX
 #ifdef BIOCIMMEDIATE
@@ -942,16 +1195,22 @@ pcap_open_live(const char *device, int snaplen, int promisc, int to_ms,
 		goto bad;
 	}
 	p->bufsize = v;
-	p->buffer = (u_char *)malloc(p->bufsize);
-	if (p->buffer == NULL) {
-		snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s",
-		    pcap_strerror(errno));
-		goto bad;
-	}
+#ifdef BIOCSETBUFMODE
+	if (!p->zerocopy) {
+#endif
+		p->buffer = (u_char *)malloc(p->bufsize);
+		if (p->buffer == NULL) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s",
+			    pcap_strerror(errno));
+			goto bad;
+		}
 #ifdef _AIX
-	/* For some strange reason this seems to prevent the EFAULT
-	 * problems we have experienced from AIX BPF. */
-	memset(p->buffer, 0x0, p->bufsize);
+		/* For some strange reason this seems to prevent the EFAULT
+		 * problems we have experienced from AIX BPF. */
+		memset(p->buffer, 0x0, p->bufsize);
+#endif
+#ifdef BIOCSETBUFMODE
+	}
 #endif
 
 	/*
@@ -1036,7 +1295,22 @@ pcap_open_live(const char *device, int snaplen, int promisc, int to_ms,
 	return (p);
 
 bad:
+	(void)close(fd);
+#ifdef BIOCSETBUFMODE
+	/*
+	 * In zero-copy mode, p->buffer is just a pointer into one of the two
+	 * memory-mapped buffers, so no need to free it.
+	 */
+	if (p->zerocopy) {
+		if (p->zbuf1 != MAP_FAILED && p->zbuf1 != NULL)
+			munmap(p->zbuf1, p->zbufsize);
+		if (p->zbuf2 != MAP_FAILED && p->zbuf2 != NULL)
+			munmap(p->zbuf2, p->zbufsize);
+	} else
+#endif
+	if (p->buffer != NULL)
+		free(p->buffer);
 	if (p->dlt_list != NULL)
 		free(p->dlt_list);
 	free(p);
diff --git a/contrib/libpcap/pcap-int.h b/contrib/libpcap/pcap-int.h
index fbab8e9cac34..aad7369462a8 100644
--- a/contrib/libpcap/pcap-int.h
+++ b/contrib/libpcap/pcap-int.h
@@ -167,12 +167,36 @@ struct pcap {
 	struct pcap_md md;
 
 	/*
-	 * Read buffer.
+	 * Read buffer -- for file descriptor read buffer model.
 	 */
 	int bufsize;
 	u_char *buffer;
 	u_char *bp;
 	int cc;
+	int to_ms;
+
+	/*
+	 * Zero-copy read buffer -- for zero-copy BPF.  'buffer' above will
+	 * alternate between these two actual mmap'd buffers as required.
+	 * As there is a header on the front of the mmap'd buffer, only
+	 * some of the buffer is exposed to libpcap as a whole via bufsize;
+	 * zbufsize is the true size.  zbuffer tracks the current zbuf
+	 * associated with buffer so that it can be used to decide which
+	 * buffer to read next.
+	 */
+	u_char *zbuf1, *zbuf2, *zbuffer;
+	u_int zbufsize;
+	u_int timeout;
+	u_int zerocopy;
+	u_int interrupted;
+	struct timespec firstsel;
+
+	/*
+	 * If there's currently a buffer being actively processed, then it is
+	 * referenced here; 'buffer' is also pointed at it, but offset by the
+	 * size of the header.
+	 */
+	struct bpf_zbuf_header *bzh;
 
 	/*
 	 * Place holder for pcap_next().
diff --git a/contrib/libpcap/pcap.c b/contrib/libpcap/pcap.c
index 0822e1adf0dc..1a3c6b8d5827 100644
--- a/contrib/libpcap/pcap.c
+++ b/contrib/libpcap/pcap.c
@@ -44,6 +44,7 @@ static const char rcsid[] _U_ =
 #include 
 #else /* WIN32 */
 #include 
+#include 
 #endif /* WIN32 */
 
 #include 
@@ -738,6 +739,24 @@ pcap_stats_dead(pcap_t *p, struct pcap_stat *ps _U_)
 void
 pcap_close_common(pcap_t *p)
 {
+#ifdef BIOCSETBUFMODE
+	/*
+	 * Check to see if this pcap instance was using the zerocopy buffer
+	 * mode.  If it was, delete the mappings.  Note that p->buffer
+	 * gets initialized to one of the mmapped regions in this case, so
+	 * do not try to free it directly.
+	 *
+	 * If the regular buffer mode was selected, then it is safe to free
+	 * this memory.
+	 */
+	if (p->zerocopy) {
+		if (p->zbuf1 != MAP_FAILED && p->zbuf1 != NULL)
+			munmap(p->zbuf1, p->zbufsize);
+		if (p->zbuf2 != MAP_FAILED && p->zbuf2 != NULL)
+			munmap(p->zbuf2, p->zbufsize);
+		p->buffer = NULL;
+	} else
+#endif
 	if (p->buffer != NULL)
 		free(p->buffer);
 #if !defined(WIN32) && !defined(MSDOS)
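
For reference, a minimal sketch of how an application would exercise the new
read path.  This example is not part of the patch: the interface name "em0"
and the snaplen/timeout values are arbitrary placeholders, and it uses only
the standard libpcap API.  With the patched library and
"sysctl net.bpf.zerocopy_enable=1" set, pcap_open_live() negotiates the
zero-copy buffer mode (BIOCSETBUFMODE) internally, so the application code is
identical whether the legacy or zbuf path ends up being used.

	/* Illustrative only -- not part of the patch. */
	#include <pcap.h>
	#include <stdio.h>

	/* Print the captured and on-the-wire lengths of each packet. */
	static void
	handler(u_char *user, const struct pcap_pkthdr *h, const u_char *bytes)
	{
		(void)user;
		(void)bytes;
		printf("captured %u bytes (wire length %u)\n", h->caplen, h->len);
	}

	int
	main(void)
	{
		char errbuf[PCAP_ERRBUF_SIZE];
		pcap_t *p;

		/* 65535-byte snaplen, promiscuous mode, 1000 ms read timeout. */
		p = pcap_open_live("em0", 65535, 1, 1000, errbuf);
		if (p == NULL) {
			fprintf(stderr, "pcap_open_live: %s\n", errbuf);
			return (1);
		}
		/* Capture ten packets, then clean up; munmap/free happens in pcap_close(). */
		if (pcap_loop(p, 10, handler, NULL) < 0)
			fprintf(stderr, "pcap_loop: %s\n", pcap_geterr(p));
		pcap_close(p);
		return (0);
	}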