mirror of
https://github.com/freebsd/freebsd-src.git
synced 2024-11-29 11:02:44 +00:00
Implement zero-copy bpf(4) buffer or "zbuf" support for libpcap. A slightly
different version has been committed upstream in the libpcap vendor branch. This will allow people to experiment with zero-copy bpf(4) without requiring external patches. Note to enable this functionality: sysctl net.bpf.zerocopy_enable=1 By default, libpcap will use the legacy buffering method unless this sysctl variable is set to 1. For the details about zero-copy bpf(4) implementation see svn change r177548. Requested by: many Discussed with: sam In collaboration with: rwatson
This commit is contained in:
parent
db2529820a
commit
154bbe416c
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=183102
@ -30,6 +30,7 @@ static const char rcsid[] _U_ =
|
||||
#endif
|
||||
|
||||
#include <sys/param.h> /* optionally get BSD define */
|
||||
#include <sys/mman.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/timeb.h>
|
||||
#include <sys/socket.h>
|
||||
@ -86,6 +87,10 @@ static int odmlockid = 0;
|
||||
|
||||
#endif /* _AIX */
|
||||
|
||||
#ifdef BIOCSETBUFMODE
|
||||
#include <machine/atomic.h>
|
||||
#endif
|
||||
|
||||
#include <ctype.h>
|
||||
#include <errno.h>
|
||||
#include <netdb.h>
|
||||
@ -139,6 +144,159 @@ pcap_stats_bpf(pcap_t *p, struct pcap_stat *ps)
|
||||
return (0);
|
||||
}
|
||||
|
||||
#ifdef BIOCGETBUFMODE
|
||||
/*
|
||||
* Zero-copy BPF buffer routines to check for and acknowledge BPF data in
|
||||
* shared memory buffers.
|
||||
*
|
||||
* pcap_next_zbuf_shm(): Check for a newly available shared memory buffer,
|
||||
* and set up p->buffer and cc to reflect one if available. Notice that if
|
||||
* there was no prior buffer, we select zbuf1 as this will be the first
|
||||
* buffer filled for a fresh BPF session.
|
||||
*/
|
||||
static int
|
||||
pcap_next_zbuf_shm(pcap_t *p, int *cc)
|
||||
{
|
||||
struct bpf_zbuf_header *bzh;
|
||||
|
||||
if (p->zbuffer == p->zbuf2 || p->zbuffer == NULL) {
|
||||
bzh = (struct bpf_zbuf_header *)p->zbuf1;
|
||||
if (bzh->bzh_user_gen !=
|
||||
atomic_load_acq_int(&bzh->bzh_kernel_gen)) {
|
||||
p->bzh = bzh;
|
||||
p->zbuffer = (u_char *)p->zbuf1;
|
||||
p->buffer = p->zbuffer + sizeof(*bzh);
|
||||
*cc = bzh->bzh_kernel_len;
|
||||
return (1);
|
||||
}
|
||||
} else if (p->zbuffer == p->zbuf1) {
|
||||
bzh = (struct bpf_zbuf_header *)p->zbuf2;
|
||||
if (bzh->bzh_user_gen !=
|
||||
atomic_load_acq_int(&bzh->bzh_kernel_gen)) {
|
||||
p->bzh = bzh;
|
||||
p->zbuffer = (u_char *)p->zbuf2;
|
||||
p->buffer = p->zbuffer + sizeof(*bzh);
|
||||
*cc = bzh->bzh_kernel_len;
|
||||
return (1);
|
||||
}
|
||||
}
|
||||
*cc = 0;
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* pcap_next_zbuf() -- Similar to pcap_next_zbuf_shm(), except wait using
|
||||
* select() for data or a timeout, and possibly force rotation of the buffer
|
||||
* in the event we time out or are in immediate mode. Invoke the shared
|
||||
* memory check before doing system calls in order to avoid doing avoidable
|
||||
* work.
|
||||
*/
|
||||
static int
|
||||
pcap_next_zbuf(pcap_t *p, int *cc)
|
||||
{
|
||||
struct bpf_zbuf bz;
|
||||
struct timeval tv;
|
||||
struct timespec cur;
|
||||
fd_set r_set;
|
||||
int data, r;
|
||||
int tmout, expire;
|
||||
|
||||
#define TSTOMILLI(ts) (((ts)->tv_sec * 1000) + ((ts)->tv_nsec / 1000000))
|
||||
/*
|
||||
* Start out by seeing whether anything is waiting by checking the
|
||||
* next shared memory buffer for data.
|
||||
*/
|
||||
data = pcap_next_zbuf_shm(p, cc);
|
||||
if (data)
|
||||
return (data);
|
||||
/*
|
||||
* If a previous sleep was interrupted due to signal delivery, make
|
||||
* sure that the timeout gets adjusted accordingly. This requires
|
||||
* that we analyze when the timeout should be been expired, and
|
||||
* subtract the current time from that. If after this operation,
|
||||
* our timeout is less then or equal to zero, handle it like a
|
||||
* regular timeout.
|
||||
*/
|
||||
tmout = p->to_ms;
|
||||
if (tmout)
|
||||
(void) clock_gettime(CLOCK_MONOTONIC, &cur);
|
||||
if (p->interrupted && p->to_ms) {
|
||||
expire = TSTOMILLI(&p->firstsel) + p->to_ms;
|
||||
tmout = expire - TSTOMILLI(&cur);
|
||||
#undef TSTOMILLI
|
||||
if (tmout <= 0) {
|
||||
p->interrupted = 0;
|
||||
data = pcap_next_zbuf_shm(p, cc);
|
||||
if (data)
|
||||
return (data);
|
||||
if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
|
||||
(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
|
||||
"BIOCROTZBUF: %s", strerror(errno));
|
||||
return (-1);
|
||||
}
|
||||
return (pcap_next_zbuf_shm(p, cc));
|
||||
}
|
||||
}
|
||||
/*
|
||||
* No data in the buffer, so must use select() to wait for data or
|
||||
* the next timeout.
|
||||
*/
|
||||
FD_ZERO(&r_set);
|
||||
FD_SET(p->fd, &r_set);
|
||||
if (tmout != 0) {
|
||||
tv.tv_sec = tmout / 1000;
|
||||
tv.tv_usec = (tmout * 1000) % 1000000;
|
||||
}
|
||||
r = select(p->fd + 1, &r_set, NULL, NULL, p->to_ms != 0 ? &tv :
|
||||
NULL);
|
||||
if (r < 0 && errno == EINTR) {
|
||||
if (!p->interrupted && p->to_ms) {
|
||||
p->interrupted = 1;
|
||||
p->firstsel = cur;
|
||||
}
|
||||
return (0);
|
||||
} else if (r < 0) {
|
||||
(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
|
||||
"select: %s", strerror(errno));
|
||||
return (-1);
|
||||
}
|
||||
p->interrupted = 0;
|
||||
/*
|
||||
* Check again for data, which may exist now that we've either been
|
||||
* woken up as a result of data or timed out. Try the "there's data"
|
||||
* case first since it doesn't require a system call.
|
||||
*/
|
||||
data = pcap_next_zbuf_shm(p, cc);
|
||||
if (data)
|
||||
return (data);
|
||||
|
||||
/*
|
||||
* Try forcing a buffer rotation to dislodge timed out or immediate
|
||||
* data.
|
||||
*/
|
||||
if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
|
||||
(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
|
||||
"BIOCROTZBUF: %s", strerror(errno));
|
||||
return (-1);
|
||||
}
|
||||
return (pcap_next_zbuf_shm(p, cc));
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify kernel that we are done with the buffer. We don't reset zbuffer so
|
||||
* that we know which buffer to use next time around.
|
||||
*/
|
||||
static int
|
||||
pcap_ack_zbuf(pcap_t *p)
|
||||
{
|
||||
|
||||
atomic_store_rel_int(&p->bzh->bzh_user_gen, p->bzh->bzh_kernel_gen);
|
||||
p->bzh = NULL;
|
||||
p->buffer = NULL;
|
||||
return (0);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int
|
||||
pcap_read_bpf(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
|
||||
{
|
||||
@ -147,6 +305,9 @@ pcap_read_bpf(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
|
||||
register u_char *bp, *ep;
|
||||
u_char *datap;
|
||||
struct bpf_insn *fcode;
|
||||
#ifdef BIOCSETBUFMODE
|
||||
int i;
|
||||
#endif
|
||||
#ifdef PCAP_FDDIPAD
|
||||
register int pad;
|
||||
#endif
|
||||
@ -167,7 +328,27 @@ pcap_read_bpf(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
|
||||
}
|
||||
cc = p->cc;
|
||||
if (p->cc == 0) {
|
||||
cc = read(p->fd, (char *)p->buffer, p->bufsize);
|
||||
/*
|
||||
* When reading without zero-copy from a file descriptor, we
|
||||
* use a single buffer and return a length of data in the
|
||||
* buffer. With zero-copy, we update the p->buffer pointer
|
||||
* to point at whatever underlying buffer contains the next
|
||||
* data and update cc to reflect the data found in the
|
||||
* buffer.
|
||||
*/
|
||||
#ifdef BIOCSETBUFMODE
|
||||
if (p->zerocopy) {
|
||||
if (p->buffer != NULL)
|
||||
pcap_ack_zbuf(p);
|
||||
i = pcap_next_zbuf(p, &cc);
|
||||
if (i == 0)
|
||||
goto again;
|
||||
if (i < 0)
|
||||
return (-1);
|
||||
} else
|
||||
#endif
|
||||
cc = read(p->fd, (char *)p->buffer, p->bufsize);
|
||||
|
||||
if (cc < 0) {
|
||||
/* Don't choke when we get ptraced */
|
||||
switch (errno) {
|
||||
@ -609,6 +790,10 @@ pcap_open_live(const char *device, int snaplen, int promisc, int to_ms,
|
||||
struct bpf_insn total_insn;
|
||||
struct bpf_program total_prog;
|
||||
struct utsname osinfo;
|
||||
#ifdef BIOCSETBUFMODE
|
||||
struct bpf_zbuf bz;
|
||||
u_int bufmode, zbufmax;
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_DAG_API
|
||||
if (strstr(device, "dag")) {
|
||||
@ -646,41 +831,105 @@ pcap_open_live(const char *device, int snaplen, int promisc, int to_ms,
|
||||
goto bad;
|
||||
}
|
||||
|
||||
#ifdef BIOCSETBUFMODE
|
||||
/*
|
||||
* Try finding a good size for the buffer; 32768 may be too
|
||||
* big, so keep cutting it in half until we find a size
|
||||
* that works, or run out of sizes to try. If the default
|
||||
* is larger, don't make it smaller.
|
||||
*
|
||||
* XXX - there should be a user-accessible hook to set the
|
||||
* initial buffer size.
|
||||
* If the BPF extension to set buffer mode is present, try setting
|
||||
* the mode to zero-copy. If that fails, use regular buffering. If
|
||||
* it succeeds but other setup fails, return an error to the user.
|
||||
*/
|
||||
if ((ioctl(fd, BIOCGBLEN, (caddr_t)&v) < 0) || v < 32768)
|
||||
v = 32768;
|
||||
for ( ; v != 0; v >>= 1) {
|
||||
/* Ignore the return value - this is because the call fails
|
||||
* on BPF systems that don't have kernel malloc. And if
|
||||
* the call fails, it's no big deal, we just continue to
|
||||
* use the standard buffer size.
|
||||
bufmode = BPF_BUFMODE_ZBUF;
|
||||
if (ioctl(fd, BIOCSETBUFMODE, (caddr_t)&bufmode) == 0) {
|
||||
p->zerocopy = 1;
|
||||
|
||||
/*
|
||||
* How to pick a buffer size: first, query the maximum buffer
|
||||
* size supported by zero-copy. This also lets us quickly
|
||||
* determine whether the kernel generally supports zero-copy.
|
||||
* Then, query the default buffer size, which reflects kernel
|
||||
* policy for a desired default. Round to the nearest page
|
||||
* size.
|
||||
*/
|
||||
(void) ioctl(fd, BIOCSBLEN, (caddr_t)&v);
|
||||
|
||||
if (ioctl(fd, BIOCGETZMAX, (caddr_t)&zbufmax) < 0) {
|
||||
snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCGETZMAX: %s",
|
||||
pcap_strerror(errno));
|
||||
goto bad;
|
||||
}
|
||||
if ((ioctl(fd, BIOCGBLEN, (caddr_t)&v) < 0) || v < 32768)
|
||||
v = 32768;
|
||||
#ifndef roundup
|
||||
#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */
|
||||
#endif
|
||||
p->zbufsize = roundup(v, getpagesize());
|
||||
if (p->zbufsize > zbufmax)
|
||||
p->zbufsize = zbufmax;
|
||||
p->zbuf1 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE,
|
||||
MAP_ANON, -1, 0);
|
||||
p->zbuf2 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE,
|
||||
MAP_ANON, -1, 0);
|
||||
if (p->zbuf1 == MAP_FAILED || p->zbuf2 == MAP_FAILED) {
|
||||
snprintf(ebuf, PCAP_ERRBUF_SIZE, "mmap: %s",
|
||||
pcap_strerror(errno));
|
||||
goto bad;
|
||||
}
|
||||
bzero(&bz, sizeof(bz));
|
||||
bz.bz_bufa = p->zbuf1;
|
||||
bz.bz_bufb = p->zbuf2;
|
||||
bz.bz_buflen = p->zbufsize;
|
||||
if (ioctl(fd, BIOCSETZBUF, (caddr_t)&bz) < 0) {
|
||||
snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETZBUF: %s",
|
||||
pcap_strerror(errno));
|
||||
goto bad;
|
||||
}
|
||||
(void)strncpy(ifr.ifr_name, device, sizeof(ifr.ifr_name));
|
||||
if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) >= 0)
|
||||
break; /* that size worked; we're done */
|
||||
|
||||
if (errno != ENOBUFS) {
|
||||
if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) < 0) {
|
||||
snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETIF: %s: %s",
|
||||
device, pcap_strerror(errno));
|
||||
goto bad;
|
||||
}
|
||||
}
|
||||
v = p->zbufsize - sizeof(struct bpf_zbuf_header);
|
||||
} else {
|
||||
#endif
|
||||
|
||||
if (v == 0) {
|
||||
snprintf(ebuf, PCAP_ERRBUF_SIZE,
|
||||
"BIOCSBLEN: %s: No buffer size worked", device);
|
||||
goto bad;
|
||||
/*
|
||||
* Try finding a good size for the buffer; 32768 may be too
|
||||
* big, so keep cutting it in half until we find a size
|
||||
* that works, or run out of sizes to try. If the default
|
||||
* is larger, don't make it smaller.
|
||||
*
|
||||
* XXX - there should be a user-accessible hook to set the
|
||||
* initial buffer size.
|
||||
*/
|
||||
if ((ioctl(fd, BIOCGBLEN, (caddr_t)&v) < 0) || v < 32768)
|
||||
v = 32768;
|
||||
for ( ; v != 0; v >>= 1) {
|
||||
/* Ignore the return value - this is because the call
|
||||
* fails on BPF systems that don't have kernel
|
||||
* malloc. And if the call fails, it's no big deal,
|
||||
* we just continue to use the standard buffer size.
|
||||
*/
|
||||
(void) ioctl(fd, BIOCSBLEN, (caddr_t)&v);
|
||||
|
||||
(void)strncpy(ifr.ifr_name, device,
|
||||
sizeof(ifr.ifr_name));
|
||||
if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) >= 0)
|
||||
break; /* that size worked; we're done */
|
||||
|
||||
if (errno != ENOBUFS) {
|
||||
snprintf(ebuf, PCAP_ERRBUF_SIZE,
|
||||
"BIOCSETIF: %s: %s",
|
||||
device, pcap_strerror(errno));
|
||||
goto bad;
|
||||
}
|
||||
}
|
||||
|
||||
if (v == 0) {
|
||||
snprintf(ebuf, PCAP_ERRBUF_SIZE,
|
||||
"BIOCSBLEN: %s: No buffer size worked", device);
|
||||
goto bad;
|
||||
}
|
||||
#ifdef BIOCSETBUFMODE
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Get the data link layer type. */
|
||||
if (ioctl(fd, BIOCGDLT, (caddr_t)&v) < 0) {
|
||||
@ -855,7 +1104,8 @@ pcap_open_live(const char *device, int snaplen, int promisc, int to_ms,
|
||||
}
|
||||
#endif
|
||||
/* set timeout */
|
||||
if (to_ms != 0) {
|
||||
p->to_ms = to_ms;
|
||||
if (to_ms != 0 && !p->zerocopy) {
|
||||
/*
|
||||
* XXX - is this seconds/nanoseconds in AIX?
|
||||
* (Treating it as such doesn't fix the timeout
|
||||
@ -870,6 +1120,9 @@ pcap_open_live(const char *device, int snaplen, int promisc, int to_ms,
|
||||
goto bad;
|
||||
}
|
||||
}
|
||||
#ifdef BIOCSETBUFMODE
|
||||
p->timeout = to_ms;
|
||||
#endif
|
||||
|
||||
#ifdef _AIX
|
||||
#ifdef BIOCIMMEDIATE
|
||||
@ -942,16 +1195,22 @@ pcap_open_live(const char *device, int snaplen, int promisc, int to_ms,
|
||||
goto bad;
|
||||
}
|
||||
p->bufsize = v;
|
||||
p->buffer = (u_char *)malloc(p->bufsize);
|
||||
if (p->buffer == NULL) {
|
||||
snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s",
|
||||
pcap_strerror(errno));
|
||||
goto bad;
|
||||
}
|
||||
#ifdef BIOCSETBUFMODE
|
||||
if (!p->zerocopy) {
|
||||
#endif
|
||||
p->buffer = (u_char *)malloc(p->bufsize);
|
||||
if (p->buffer == NULL) {
|
||||
snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s",
|
||||
pcap_strerror(errno));
|
||||
goto bad;
|
||||
}
|
||||
#ifdef _AIX
|
||||
/* For some strange reason this seems to prevent the EFAULT
|
||||
* problems we have experienced from AIX BPF. */
|
||||
memset(p->buffer, 0x0, p->bufsize);
|
||||
/* For some strange reason this seems to prevent the EFAULT
|
||||
* problems we have experienced from AIX BPF. */
|
||||
memset(p->buffer, 0x0, p->bufsize);
|
||||
#endif
|
||||
#ifdef BIOCSETBUFMODE
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
@ -1036,7 +1295,22 @@ pcap_open_live(const char *device, int snaplen, int promisc, int to_ms,
|
||||
|
||||
return (p);
|
||||
bad:
|
||||
|
||||
(void)close(fd);
|
||||
#ifdef BIOCSETBUFMODE
|
||||
/*
|
||||
* In zero-copy mode, p->buffer is just a pointer into one of the two
|
||||
* memory-mapped buffers, so no need to free it.
|
||||
*/
|
||||
if (p->zerocopy) {
|
||||
if (p->zbuf1 != MAP_FAILED && p->zbuf1 != NULL)
|
||||
munmap(p->zbuf1, p->zbufsize);
|
||||
if (p->zbuf2 != MAP_FAILED && p->zbuf2 != NULL)
|
||||
munmap(p->zbuf2, p->zbufsize);
|
||||
} else
|
||||
#endif
|
||||
if (p->buffer != NULL)
|
||||
free(p->buffer);
|
||||
if (p->dlt_list != NULL)
|
||||
free(p->dlt_list);
|
||||
free(p);
|
||||
|
@ -167,12 +167,36 @@ struct pcap {
|
||||
struct pcap_md md;
|
||||
|
||||
/*
|
||||
* Read buffer.
|
||||
* Read buffer -- for file descriptor read buffer model.
|
||||
*/
|
||||
int bufsize;
|
||||
u_char *buffer;
|
||||
u_char *bp;
|
||||
int cc;
|
||||
int to_ms;
|
||||
|
||||
/*
|
||||
* Zero-copy read buffer -- for zero-copy BPF. 'buffer' above will
|
||||
* alternative between these two actual mmap'd buffers as required.
|
||||
* As there is a header on the front size of the mmap'd buffer, only
|
||||
* some of the buffer is exposed to libpcap as a whole via bufsize;
|
||||
* zbufsize is the true size. zbuffer tracks the current zbuf
|
||||
* assocated with buffer so that it can be used to decide which the
|
||||
* next buffer to read will be.
|
||||
*/
|
||||
u_char *zbuf1, *zbuf2, *zbuffer;
|
||||
u_int zbufsize;
|
||||
u_int timeout;
|
||||
u_int zerocopy;
|
||||
u_int interrupted;
|
||||
struct timespec firstsel;
|
||||
|
||||
/*
|
||||
* If there's currently a buffer being actively processed, then it is
|
||||
* referenced here; 'buffer' is also pointed at it, but offset by the
|
||||
* size of the header.
|
||||
*/
|
||||
struct bpf_zbuf_header *bzh;
|
||||
|
||||
/*
|
||||
* Place holder for pcap_next().
|
||||
|
@ -44,6 +44,7 @@ static const char rcsid[] _U_ =
|
||||
#include <pcap-stdinc.h>
|
||||
#else /* WIN32 */
|
||||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
#endif /* WIN32 */
|
||||
|
||||
#include <stdio.h>
|
||||
@ -738,6 +739,24 @@ pcap_stats_dead(pcap_t *p, struct pcap_stat *ps _U_)
|
||||
void
|
||||
pcap_close_common(pcap_t *p)
|
||||
{
|
||||
#ifdef BIOCSETBUFMODE
|
||||
/*
|
||||
* Check to see if this pcap instance was using the zerocopy buffer
|
||||
* mode. If it was, delete the mappings. Note that p->buffer
|
||||
* gets initialized to one of the mmaped regions in this case, so
|
||||
* do not try and free it directly.
|
||||
*
|
||||
* If the regular buffer mode was selected, then it is safe to free
|
||||
* this memory.
|
||||
*/
|
||||
if (p->zerocopy) {
|
||||
if (p->zbuf1 != MAP_FAILED && p->zbuf1 != NULL)
|
||||
munmap(p->zbuf1, p->zbufsize);
|
||||
if (p->zbuf2 != MAP_FAILED && p->zbuf2 != NULL)
|
||||
munmap(p->zbuf2, p->zbufsize);
|
||||
p->buffer = NULL;
|
||||
} else
|
||||
#endif
|
||||
if (p->buffer != NULL)
|
||||
free(p->buffer);
|
||||
#if !defined(WIN32) && !defined(MSDOS)
|
||||
|
Loading…
Reference in New Issue
Block a user