diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index bf0dda5a0f43..34db6e8218ac 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -804,8 +804,11 @@ struct adapter { void *tom_softc; /* (struct tom_data *) */ struct tom_tunables tt; - struct iw_tunables iwt; + struct t4_offload_policy *policy; + struct rwlock policy_lock; + void *iwarp_softc; /* (struct c4iw_dev *) */ + struct iw_tunables iwt; void *iscsi_ulp_softc; /* (struct cxgbei_data *) */ void *ccr_softc; /* (struct ccr_softc *) */ struct l2t_data *l2t; /* L2 table */ diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h index 431e486e2afa..cd27d7de6e24 100644 --- a/sys/dev/cxgbe/offload.h +++ b/sys/dev/cxgbe/offload.h @@ -156,6 +156,7 @@ struct tom_tunables { int num_tls_rx_ports; int tx_align; int tx_zcopy; + int cop_managed_offloading; }; /* iWARP driver tunables */ struct iw_tunables { diff --git a/sys/dev/cxgbe/t4_ioctl.h b/sys/dev/cxgbe/t4_ioctl.h index 8fa92935c62b..b7b8ce5fbe24 100644 --- a/sys/dev/cxgbe/t4_ioctl.h +++ b/sys/dev/cxgbe/t4_ioctl.h @@ -35,6 +35,7 @@ #include #include +#include /* * Ioctl commands specific to this driver. @@ -344,6 +345,44 @@ struct t4_cudbg_dump { uint8_t *data; }; +enum { + OPEN_TYPE_LISTEN = 'L', + OPEN_TYPE_ACTIVE = 'A', + OPEN_TYPE_PASSIVE = 'P', + OPEN_TYPE_DONTCARE = 'D', +}; + +struct offload_settings { + int8_t offload; + int8_t rx_coalesce; + int8_t cong_algo; + int8_t sched_class; + int8_t tstamp; + int8_t sack; + int8_t nagle; + int8_t ecn; + int8_t ddp; + int8_t tls; + int16_t txq; + int16_t rxq; + int16_t mss; +}; + +struct offload_rule { + char open_type; + struct offload_settings settings; + struct bpf_program bpf_prog; /* compiled program/filter */ +}; + +/* + * An offload policy consists of a set of rules matched in sequence. The + * settings of the first rule that matches are applied to that connection. 
+ */ +struct t4_offload_policy { + uint32_t nrules; + struct offload_rule *rule; +}; + #define CHELSIO_T4_GETREG _IOWR('f', T4_GETREG, struct t4_reg) #define CHELSIO_T4_SETREG _IOW('f', T4_SETREG, struct t4_reg) #define CHELSIO_T4_REGDUMP _IOWR('f', T4_REGDUMP, struct t4_regdump) @@ -368,4 +407,5 @@ struct t4_cudbg_dump { #define CHELSIO_T4_LOAD_BOOT _IOW('f', T4_LOAD_BOOT, struct t4_bootrom) #define CHELSIO_T4_LOAD_BOOTCFG _IOW('f', T4_LOAD_BOOTCFG, struct t4_data) #define CHELSIO_T4_CUDBG_DUMP _IOWR('f', T4_CUDBG_DUMP, struct t4_cudbg_dump) +#define CHELSIO_T4_SET_OFLD_POLICY _IOW('f', T4_SET_OFLD_POLICY, struct t4_offload_policy) #endif diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 9fa4e9efc20c..7ab5c4d76e27 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -470,6 +470,14 @@ static int pcie_relaxed_ordering = -1; TUNABLE_INT("hw.cxgbe.pcie_relaxed_ordering", &pcie_relaxed_ordering); +#ifdef TCP_OFFLOAD +/* + * TOE tunables. + */ +static int t4_cop_managed_offloading = 0; +TUNABLE_INT("hw.cxgbe.cop_managed_offloading", &t4_cop_managed_offloading); +#endif + /* Functions used by VIs to obtain unique MAC addresses for each VI. 
*/ static int vi_mac_funcs[] = { FW_VI_FUNC_ETH, @@ -617,6 +625,8 @@ static int load_cfg(struct adapter *, struct t4_data *); static int load_boot(struct adapter *, struct t4_bootrom *); static int load_bootcfg(struct adapter *, struct t4_data *); static int cudbg_dump(struct adapter *, struct t4_cudbg_dump *); +static void free_offload_policy(struct t4_offload_policy *); +static int set_offload_policy(struct adapter *, struct t4_offload_policy *); static int read_card_mem(struct adapter *, int, struct t4_mem_range *); static int read_i2c(struct adapter *, struct t4_i2c_data *); #ifdef TCP_OFFLOAD @@ -897,6 +907,9 @@ t4_attach(device_t dev) mtx_init(&sc->reg_lock, "indirect register access", 0, MTX_DEF); + sc->policy = NULL; + rw_init(&sc->policy_lock, "connection offload policy"); + rc = t4_map_bars_0_and_4(sc); if (rc != 0) goto done; /* error message displayed already */ @@ -1405,6 +1418,14 @@ t4_detach_common(device_t dev) if (mtx_initialized(&sc->reg_lock)) mtx_destroy(&sc->reg_lock); + if (rw_initialized(&sc->policy_lock)) { + rw_destroy(&sc->policy_lock); +#ifdef TCP_OFFLOAD + if (sc->policy != NULL) + free_offload_policy(sc->policy); +#endif + } + for (i = 0; i < NUM_MEMWIN; i++) { struct memwin *mw = &sc->memwin[i]; @@ -5440,6 +5461,12 @@ t4_sysctls(struct adapter *sc) CTLFLAG_RW, &sc->tt.tx_zcopy, 0, "Enable zero-copy aio_write(2)"); + sc->tt.cop_managed_offloading = !!t4_cop_managed_offloading; + SYSCTL_ADD_INT(ctx, children, OID_AUTO, + "cop_managed_offloading", CTLFLAG_RW, + &sc->tt.cop_managed_offloading, 0, + "COP (Connection Offload Policy) controls all TOE offload"); + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_tick, "A", "TP timer tick (us)"); @@ -9385,6 +9412,113 @@ done: return (rc); } +static void +free_offload_policy(struct t4_offload_policy *op) +{ + struct offload_rule *r; + int i; + + if (op == NULL) + return; + + r = &op->rule[0]; + for (i = 0; i < op->nrules; i++, r++) { + 
free(r->bpf_prog.bf_insns, M_CXGBE); + } + free(op->rule, M_CXGBE); + free(op, M_CXGBE); +} + +static int +set_offload_policy(struct adapter *sc, struct t4_offload_policy *uop) +{ + int i, rc, len; + struct t4_offload_policy *op, *old; + struct bpf_program *bf; + const struct offload_settings *s; + struct offload_rule *r; + void *u; + + if (!is_offload(sc)) + return (ENODEV); + + if (uop->nrules == 0) { + /* Delete installed policies. */ + op = NULL; + goto set_policy; + } if (uop->nrules > 256) { /* arbitrary */ + return (E2BIG); + } + + /* Copy userspace offload policy to kernel */ + op = malloc(sizeof(*op), M_CXGBE, M_ZERO | M_WAITOK); + op->nrules = uop->nrules; + len = op->nrules * sizeof(struct offload_rule); + op->rule = malloc(len, M_CXGBE, M_ZERO | M_WAITOK); + rc = copyin(uop->rule, op->rule, len); + if (rc) { + free(op->rule, M_CXGBE); + free(op, M_CXGBE); + return (rc); + } + + r = &op->rule[0]; + for (i = 0; i < op->nrules; i++, r++) { + + /* Validate open_type */ + if (r->open_type != OPEN_TYPE_LISTEN && + r->open_type != OPEN_TYPE_ACTIVE && + r->open_type != OPEN_TYPE_PASSIVE && + r->open_type != OPEN_TYPE_DONTCARE) { +error: + /* + * Rules 0 to i have malloc'd filters that need to be + * freed. Rules i+1 to nrules have userspace pointers + * and should be left alone. 
+ */ + op->nrules = i; + free_offload_policy(op); + return (rc); + } + + /* Validate settings */ + s = &r->settings; + if ((s->offload != 0 && s->offload != 1) || + s->cong_algo < -1 || s->cong_algo > CONG_ALG_HIGHSPEED || + s->sched_class < -1 || + s->sched_class >= sc->chip_params->nsched_cls) { + rc = EINVAL; + goto error; + } + + bf = &r->bpf_prog; + u = bf->bf_insns; /* userspace ptr */ + bf->bf_insns = NULL; + if (bf->bf_len == 0) { + /* legal, matches everything */ + continue; + } + len = bf->bf_len * sizeof(*bf->bf_insns); + bf->bf_insns = malloc(len, M_CXGBE, M_ZERO | M_WAITOK); + rc = copyin(u, bf->bf_insns, len); + if (rc != 0) + goto error; + + if (!bpf_validate(bf->bf_insns, bf->bf_len)) { + rc = EINVAL; + goto error; + } + } +set_policy: + rw_wlock(&sc->policy_lock); + old = sc->policy; + sc->policy = op; + rw_wunlock(&sc->policy_lock); + free_offload_policy(old); + + return (0); +} + #define MAX_READ_BUF_SIZE (128 * 1024) static int read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr) @@ -9743,6 +9877,9 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, case CHELSIO_T4_CUDBG_DUMP: rc = cudbg_dump(sc, (struct t4_cudbg_dump *)data); break; + case CHELSIO_T4_SET_OFLD_POLICY: + rc = set_offload_policy(sc, (struct t4_offload_policy *)data); + break; default: rc = ENOTTY; } diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index 7edd17f6df69..b2fbfe381918 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -963,8 +963,10 @@ mtu_to_max_payload(struct adapter *sc, int mtu, const int toe) #ifdef TCP_OFFLOAD if (toe) { - payload = sc->tt.rx_coalesce ? - G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu; + int rxcs = G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)); + + /* Note that COP can set rx_coalesce on/off per connection. 
*/ + payload = max(mtu, rxcs); } else { #endif /* large enough even when hw VLAN extraction is disabled */ diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c index 7b0267007923..9d1b6add679f 100644 --- a/sys/dev/cxgbe/tom/t4_connect.c +++ b/sys/dev/cxgbe/tom/t4_connect.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -55,6 +56,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include "common/common.h" #include "common/t4_msg.h" @@ -233,47 +235,85 @@ do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss, * Options2 for active open. */ static uint32_t -calc_opt2a(struct socket *so, struct toepcb *toep) +calc_opt2a(struct socket *so, struct toepcb *toep, + const struct offload_settings *s) { struct tcpcb *tp = so_sototcpcb(so); struct port_info *pi = toep->vi->pi; struct adapter *sc = pi->adapter; - uint32_t opt2; + uint32_t opt2 = 0; - opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]) | - F_RSS_QUEUE_VALID | V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id); + /* + * rx flow control, rx coalesce, congestion control, and tx pace are all + * explicitly set by the driver. On T5+ the ISS is also set by the + * driver to the value picked by the kernel. + */ + if (is_t4(sc)) { + opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; + opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; + } else { + opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ + opt2 |= F_T5_ISS; /* ISS provided in CPL */ + } - if (tp->t_flags & TF_SACK_PERMIT) + if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT))) opt2 |= F_SACK_EN; - if (tp->t_flags & TF_REQ_TSTMP) + if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP))) opt2 |= F_TSTAMPS_EN; if (tp->t_flags & TF_REQ_SCALE) opt2 |= F_WND_SCALE_EN; - if (V_tcp_do_ecn) + if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1)) opt2 |= F_CCTRL_ECN; - /* RX_COALESCE is always a valid value (M_RX_COALESCE). 
*/ - if (is_t4(sc)) - opt2 |= F_RX_COALESCE_VALID; + /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */ + + opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); + + /* These defaults are subject to ULP specific fixups later. */ + opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); + + opt2 |= V_PACE(0); + + if (s->cong_algo >= 0) + opt2 |= V_CONG_CNTRL(s->cong_algo); + else if (sc->tt.cong_algorithm >= 0) + opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL); else { - opt2 |= F_T5_OPT_2_VALID; - opt2 |= F_T5_ISS; + struct cc_algo *cc = CC_ALGO(tp); + + if (strcasecmp(cc->name, "reno") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_RENO); + else if (strcasecmp(cc->name, "tahoe") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); + if (strcasecmp(cc->name, "newreno") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); + if (strcasecmp(cc->name, "highspeed") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED); + else { + /* + * Use newreno in case the algorithm selected by the + * host stack is not supported by the hardware. + */ + opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); + } } - if (sc->tt.rx_coalesce) + + if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce)) opt2 |= V_RX_COALESCE(M_RX_COALESCE); - if (sc->tt.cong_algorithm != -1) - opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL); + /* Note that ofld_rxq is already set according to s->rxq. 
*/ + opt2 |= F_RSS_QUEUE_VALID; + opt2 |= V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id); #ifdef USE_DDP_RX_FLOW_CONTROL if (toep->ulp_mode == ULP_MODE_TCPDDP) - opt2 |= F_RX_FC_VALID | F_RX_FC_DDP; + opt2 |= F_RX_FC_DDP; #endif + if (toep->ulp_mode == ULP_MODE_TLS) { - opt2 |= F_RX_FC_VALID; opt2 &= ~V_RX_COALESCE(M_RX_COALESCE); opt2 |= F_RX_FC_DISABLE; } @@ -348,10 +388,12 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, struct wrqe *wr = NULL; struct ifnet *rt_ifp = rt->rt_ifp; struct vi_info *vi; - int mtu_idx, rscale, qid_atid, rc, isipv6; + int mtu_idx, rscale, qid_atid, rc, isipv6, txqid, rxqid; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); int reason; + struct offload_settings settings; + uint16_t vid = 0xffff; INP_WLOCK_ASSERT(inp); KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6, @@ -363,12 +405,30 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, struct ifnet *ifp = VLAN_COOKIE(rt_ifp); vi = ifp->if_softc; + VLAN_TAG(ifp, &vid); } else if (rt_ifp->if_type == IFT_IEEE8023ADLAG) DONT_OFFLOAD_ACTIVE_OPEN(ENOSYS); /* XXX: implement lagg+TOE */ else DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); - toep = alloc_toepcb(vi, -1, -1, M_NOWAIT | M_ZERO); + rw_rlock(&sc->policy_lock); + settings = *lookup_offload_policy(sc, OPEN_TYPE_ACTIVE, NULL, vid, inp); + rw_runlock(&sc->policy_lock); + if (!settings.offload) + DONT_OFFLOAD_ACTIVE_OPEN(EPERM); + + if (settings.txq >= 0 && settings.txq < vi->nofldtxq) + txqid = settings.txq; + else + txqid = arc4random() % vi->nofldtxq; + txqid += vi->first_ofld_txq; + if (settings.rxq >= 0 && settings.rxq < vi->nofldrxq) + rxqid = settings.rxq; + else + rxqid = arc4random() % vi->nofldrxq; + rxqid += vi->first_ofld_rxq; + + toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT | M_ZERO); if (toep == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); @@ -387,7 +447,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); 
toep->vnet = so->so_vnet; - set_ulp_mode(toep, select_ulp_mode(so, sc)); + set_ulp_mode(toep, select_ulp_mode(so, sc, &settings)); SOCKBUF_LOCK(&so->so_rcv); /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); @@ -402,7 +462,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, rscale = tp->request_r_scale = select_rcv_wscale(); else rscale = 0; - mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0); + mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, &settings); qid_atid = (toep->ofld_rxq->iq.abs_id << 14) | toep->tid; if (isipv6) { @@ -443,8 +503,8 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0]; cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8]; cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale, - toep->rx_credits, toep->ulp_mode); - cpl->opt2 = calc_opt2a(so, toep); + toep->rx_credits, toep->ulp_mode, &settings); + cpl->opt2 = calc_opt2a(so, toep, &settings); } else { struct cpl_act_open_req *cpl = wrtod(wr); struct cpl_t5_act_open_req *cpl5 = (void *)cpl; @@ -472,8 +532,8 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale, - toep->rx_credits, toep->ulp_mode); - cpl->opt2 = calc_opt2a(so, toep); + toep->rx_credits, toep->ulp_mode, &settings); + cpl->opt2 = calc_opt2a(so, toep, &settings); } CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__, diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index 351a2e3c42d8..11693371c17b 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -121,6 +121,11 @@ send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp) nparams++; if (toep->tls.fcplenmax != 0) nparams++; + if (toep->tc_idx != -1) { + 
MPASS(toep->tc_idx >= 0 && + toep->tc_idx < sc->chip_params->nsched_cls); + nparams++; + } flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); @@ -172,6 +177,8 @@ send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp) FLOWC_PARAM(ULP_MODE, toep->ulp_mode); if (toep->tls.fcplenmax != 0) FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax); + if (toep->tc_idx != -1) + FLOWC_PARAM(SCHEDCLASS, toep->tc_idx); #undef FLOWC_PARAM KASSERT(paramidx == nparams, ("nparams mismatch")); @@ -333,19 +340,19 @@ assign_rxopt(struct tcpcb *tp, unsigned int opt) n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else n = sizeof(struct ip) + sizeof(struct tcphdr); - if (V_tcp_do_rfc1323) - n += TCPOLEN_TSTAMP_APPA; tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n; - CTR4(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u)", __func__, toep->tid, - G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)]); - if (G_TCPOPT_TSTAMP(opt)) { tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ tp->ts_recent = 0; /* hmmm */ tp->ts_recent_age = tcp_ts_getticks(); + tp->t_maxseg -= TCPOLEN_TSTAMP_APPA; } + CTR5(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), mss %u", __func__, + toep->tid, G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)], + tp->t_maxseg); + if (G_TCPOPT_SACK(opt)) tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ else diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index 7b7428c9bcb3..7571c353dc3b 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -62,6 +63,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include "common/common.h" #include "common/t4_msg.h" @@ -84,7 +86,8 @@ static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); static struct inpcb *release_lctx(struct adapter *, 
struct listen_ctx *); -static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *); +static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *, + struct offload_settings *); static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *); static void send_reset_synqe(struct toedev *, struct synq_entry *); @@ -513,9 +516,17 @@ t4_listen_start(struct toedev *tod, struct tcpcb *tp) struct inpcb *inp = tp->t_inpcb; struct listen_ctx *lctx; int i, rc, v; + struct offload_settings settings; INP_WLOCK_ASSERT(inp); + rw_rlock(&sc->policy_lock); + settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 0xffff, + inp); + rw_runlock(&sc->policy_lock); + if (!settings.offload) + return (0); + /* Don't start a hardware listener for any loopback address. */ if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) return (0); @@ -948,12 +959,22 @@ t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) } static inline void -save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi) +save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi, + struct offload_settings *s) { uint32_t txqid, rxqid; - txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq; - rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq; + if (s->txq >= 0 && s->txq < vi->nofldtxq) + txqid = s->txq; + else + txqid = arc4random() % vi->nofldtxq; + txqid += vi->first_ofld_txq; + + if (s->rxq >= 0 && s->rxq < vi->nofldrxq) + rxqid = s->rxq; + else + rxqid = arc4random() % vi->nofldrxq; + rxqid += vi->first_ofld_rxq; m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff); } @@ -1019,50 +1040,88 @@ t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) */ static uint32_t calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid, - const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode) + const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode, + struct cc_algo *cc, const struct offload_settings *s) { struct sge_ofld_rxq 
*ofld_rxq = &sc->sge.ofld_rxq[rxqid]; - uint32_t opt2; + uint32_t opt2 = 0; - opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]) | - F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id); - - if (V_tcp_do_rfc1323) { - if (tcpopt->tstamp) - opt2 |= F_TSTAMPS_EN; - if (tcpopt->sack) - opt2 |= F_SACK_EN; - if (tcpopt->wsf <= 14) - opt2 |= F_WND_SCALE_EN; + /* + * rx flow control, rx coalesce, congestion control, and tx pace are all + * explicitly set by the driver. On T5+ the ISS is also set by the + * driver to the value picked by the kernel. + */ + if (is_t4(sc)) { + opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; + opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; + } else { + opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ + opt2 |= F_T5_ISS; /* ISS provided in CPL */ } - if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR)) + if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_rfc1323))) + opt2 |= F_SACK_EN; + + if (tcpopt->tstamp && + (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323))) + opt2 |= F_TSTAMPS_EN; + + if (tcpopt->wsf < 15 && V_tcp_do_rfc1323) + opt2 |= F_WND_SCALE_EN; + + if (th->th_flags & (TH_ECE | TH_CWR) && + (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn))) opt2 |= F_CCTRL_ECN; - /* RX_COALESCE is always a valid value (0 or M_RX_COALESCE). */ - if (is_t4(sc)) - opt2 |= F_RX_COALESCE_VALID; + /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */ + + opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); + + /* These defaults are subject to ULP specific fixups later. 
*/ + opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); + + opt2 |= V_PACE(0); + + if (s->cong_algo >= 0) + opt2 |= V_CONG_CNTRL(s->cong_algo); + else if (sc->tt.cong_algorithm >= 0) + opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL); else { - opt2 |= F_T5_OPT_2_VALID; - opt2 |= F_T5_ISS; + if (strcasecmp(cc->name, "reno") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_RENO); + else if (strcasecmp(cc->name, "tahoe") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); + if (strcasecmp(cc->name, "newreno") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); + if (strcasecmp(cc->name, "highspeed") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED); + else { + /* + * Use newreno in case the algorithm selected by the + * host stack is not supported by the hardware. + */ + opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); + } } - if (sc->tt.rx_coalesce) + + if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce)) opt2 |= V_RX_COALESCE(M_RX_COALESCE); - if (sc->tt.cong_algorithm != -1) - opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL); + /* Note that ofld_rxq is already set according to s->rxq. 
*/ + opt2 |= F_RSS_QUEUE_VALID; + opt2 |= V_RSS_QUEUE(ofld_rxq->iq.abs_id); #ifdef USE_DDP_RX_FLOW_CONTROL if (ulp_mode == ULP_MODE_TCPDDP) - opt2 |= F_RX_FC_VALID | F_RX_FC_DDP; + opt2 |= F_RX_FC_DDP; #endif + if (ulp_mode == ULP_MODE_TLS) { - opt2 |= F_RX_FC_VALID; opt2 &= ~V_RX_COALESCE(M_RX_COALESCE); opt2 |= F_RX_FC_DISABLE; } - return htobe32(opt2); + return (htobe32(opt2)); } static void @@ -1199,6 +1258,7 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif + struct offload_settings settings; KASSERT(opcode == CPL_PASS_ACCEPT_REQ, ("%s: unexpected opcode 0x%x", __func__, opcode)); @@ -1334,15 +1394,23 @@ found: REJECT_PASS_ACCEPT(); } so = inp->inp_socket; + rw_rlock(&sc->policy_lock); + settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 0xffff, inp); + rw_runlock(&sc->policy_lock); + if (!settings.offload) { + INP_WUNLOCK(inp); + free(wr, M_CXGBE); + REJECT_PASS_ACCEPT(); + } - mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss)); + mtu_idx = find_best_mtu_idx(sc, &inc, &settings); rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? 
select_rcv_wscale() : 0; /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND); wnd = min(wnd, MAX_RCV_WND); rx_credits = min(wnd >> 10, M_RCV_BUFSIZ); - save_qids_in_mbuf(m, vi); + save_qids_in_mbuf(m, vi, &settings); get_qids_from_mbuf(m, NULL, &rxqid); if (is_t4(sc)) @@ -1352,7 +1420,7 @@ found: INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); } - ulp_mode = select_ulp_mode(so, sc); + ulp_mode = select_ulp_mode(so, sc, &settings); switch (ulp_mode) { case ULP_MODE_TCPDDP: synqe->flags |= TPF_SYNQE_TCPDDP; @@ -1361,8 +1429,10 @@ found: synqe->flags |= TPF_SYNQE_TLS; break; } - rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode); - rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode); + rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode, + &settings); + rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode, + CC_ALGO(intotcpcb(inp)), &settings); synqe->tid = tid; synqe->lctx = lctx; diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c index f36d07461305..ce714e14e381 100644 --- a/sys/dev/cxgbe/tom/t4_tom.c +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -51,6 +51,8 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include #include #include @@ -137,15 +139,11 @@ alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, int flags) txsd_total = tx_credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16); - if (txqid < 0) - txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq; KASSERT(txqid >= vi->first_ofld_txq && txqid < vi->first_ofld_txq + vi->nofldtxq, ("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi, vi->first_ofld_txq, vi->nofldtxq)); - if (rxqid < 0) - rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq; KASSERT(rxqid >= vi->first_ofld_rxq && rxqid < vi->first_ofld_rxq + vi->nofldrxq, ("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi, @@ -569,27 +567,28 @@ 
queue_tid_release(struct adapter *sc, int tid) } /* - * What mtu_idx to use, given a 4-tuple and/or an MSS cap + * What mtu_idx to use, given a 4-tuple. Note that both s->mss and tcp_mssopt + * have the MSS that we should advertise in our SYN. Advertised MSS doesn't + * account for any TCP options so the effective MSS (only payload, no headers or + * options) could be different. We fill up tp->t_maxseg with the effective MSS + * at the end of the 3-way handshake. */ int -find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss) +find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, + struct offload_settings *s) { unsigned short *mtus = &sc->params.mtus[0]; - int i, mss, n; + int i, mss, mtu; - KASSERT(inc != NULL || pmss > 0, - ("%s: at least one of inc/pmss must be specified", __func__)); - - mss = inc ? tcp_mssopt(inc) : pmss; - if (pmss > 0 && mss > pmss) - mss = pmss; + MPASS(inc != NULL); + mss = s->mss > 0 ? s->mss : tcp_mssopt(inc); if (inc->inc_flags & INC_ISIPV6) - n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else - n = sizeof(struct ip) + sizeof(struct tcphdr); + mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr); - for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mss + n; i++) + for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++) continue; return (i); @@ -632,33 +631,32 @@ select_rcv_wscale(void) */ uint64_t calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e, - int mtu_idx, int rscale, int rx_credits, int ulp_mode) + int mtu_idx, int rscale, int rx_credits, int ulp_mode, + struct offload_settings *s) { + int keepalive; uint64_t opt0; + MPASS(so != NULL); + MPASS(vi != NULL); KASSERT(rx_credits <= M_RCV_BUFSIZ, ("%s: rcv_bufsiz too high", __func__)); opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) | - V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits); + V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits) | + V_L2T_IDX(e->idx) | 
V_SMAC_SEL(vi->smt_idx) | + V_TX_CHAN(vi->pi->tx_chan); - if (so != NULL) { + keepalive = tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE; + opt0 |= V_KEEP_ALIVE(keepalive != 0); + + if (s->nagle < 0) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); - int keepalive = tcp_always_keepalive || - so_options_get(so) & SO_KEEPALIVE; opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0); - opt0 |= V_KEEP_ALIVE(keepalive != 0); - } - - if (e != NULL) - opt0 |= V_L2T_IDX(e->idx); - - if (vi != NULL) { - opt0 |= V_SMAC_SEL(vi->smt_idx); - opt0 |= V_TX_CHAN(vi->pi->tx_chan); - } + } else + opt0 |= V_NAGLE(s->nagle != 0); return htobe64(opt0); } @@ -720,12 +718,15 @@ is_tls_sock(struct socket *so, struct adapter *sc) } int -select_ulp_mode(struct socket *so, struct adapter *sc) +select_ulp_mode(struct socket *so, struct adapter *sc, + struct offload_settings *s) { - if (can_tls_offload(sc) && is_tls_sock(so, sc)) + if (can_tls_offload(sc) && + (s->tls > 0 || (s->tls < 0 && is_tls_sock(so, sc)))) return (ULP_MODE_TLS); - else if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) + else if (s->ddp > 0 || + (s->ddp < 0 && sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0)) return (ULP_MODE_TCPDDP); else return (ULP_MODE_NONE); @@ -1093,6 +1094,181 @@ free_tom_data(struct adapter *sc, struct tom_data *td) free(td, M_CXGBE); } +static char * +prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen, + int *buflen) +{ + char *pkt; + struct tcphdr *th; + int ipv6, len; + const int maxlen = + max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) + + max(sizeof(struct ip), sizeof(struct ip6_hdr)) + + sizeof(struct tcphdr); + + MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN); + + pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT); + if (pkt == NULL) + return (NULL); + + ipv6 = inp->inp_vflag & INP_IPV6; + len = 0; + + if (vtag == 0xffff) { + struct ether_header *eh = (void *)pkt; + + if (ipv6) + eh->ether_type = 
htons(ETHERTYPE_IPV6); + else + eh->ether_type = htons(ETHERTYPE_IP); + + len += sizeof(*eh); + } else { + struct ether_vlan_header *evh = (void *)pkt; + + evh->evl_encap_proto = htons(ETHERTYPE_VLAN); + evh->evl_tag = htons(vtag); + if (ipv6) + evh->evl_proto = htons(ETHERTYPE_IPV6); + else + evh->evl_proto = htons(ETHERTYPE_IP); + + len += sizeof(*evh); + } + + if (ipv6) { + struct ip6_hdr *ip6 = (void *)&pkt[len]; + + ip6->ip6_vfc = IPV6_VERSION; + ip6->ip6_plen = htons(sizeof(struct tcphdr)); + ip6->ip6_nxt = IPPROTO_TCP; + if (open_type == OPEN_TYPE_ACTIVE) { + ip6->ip6_src = inp->in6p_laddr; + ip6->ip6_dst = inp->in6p_faddr; + } else if (open_type == OPEN_TYPE_LISTEN) { + ip6->ip6_src = inp->in6p_laddr; + ip6->ip6_dst = ip6->ip6_src; + } + + len += sizeof(*ip6); + } else { + struct ip *ip = (void *)&pkt[len]; + + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + ip->ip_tos = inp->inp_ip_tos; + ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr)); + ip->ip_ttl = inp->inp_ip_ttl; + ip->ip_p = IPPROTO_TCP; + if (open_type == OPEN_TYPE_ACTIVE) { + ip->ip_src = inp->inp_laddr; + ip->ip_dst = inp->inp_faddr; + } else if (open_type == OPEN_TYPE_LISTEN) { + ip->ip_src = inp->inp_laddr; + ip->ip_dst = ip->ip_src; + } + + len += sizeof(*ip); + } + + th = (void *)&pkt[len]; + if (open_type == OPEN_TYPE_ACTIVE) { + th->th_sport = inp->inp_lport; /* network byte order already */ + th->th_dport = inp->inp_fport; /* ditto */ + } else if (open_type == OPEN_TYPE_LISTEN) { + th->th_sport = inp->inp_lport; /* network byte order already */ + th->th_dport = th->th_sport; + } + len += sizeof(th); + + *pktlen = *buflen = len; + return (pkt); +} + +const struct offload_settings * +lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m, + uint16_t vtag, struct inpcb *inp) +{ + const struct t4_offload_policy *op; + char *pkt; + struct offload_rule *r; + int i, matched, pktlen, buflen; + static const struct offload_settings allow_offloading_settings = { 
+ .offload = 1, + .rx_coalesce = -1, + .cong_algo = -1, + .sched_class = -1, + .tstamp = -1, + .sack = -1, + .nagle = -1, + .ecn = -1, + .ddp = -1, + .tls = -1, + .txq = -1, + .rxq = -1, + .mss = -1, + }; + static const struct offload_settings disallow_offloading_settings = { + .offload = 0, + /* rest is irrelevant when offload is off. */ + }; + + rw_assert(&sc->policy_lock, RA_LOCKED); + + /* + * If there's no Connection Offloading Policy attached to the device + * then we need to return a default static policy. If + * "cop_managed_offloading" is true, then we need to disallow + * offloading until a COP is attached to the device. Otherwise we + * allow offloading ... + */ + op = sc->policy; + if (op == NULL) { + if (sc->tt.cop_managed_offloading) + return (&disallow_offloading_settings); + else + return (&allow_offloading_settings); + } + + switch (open_type) { + case OPEN_TYPE_ACTIVE: + case OPEN_TYPE_LISTEN: + pkt = prepare_pkt(open_type, 0xffff, inp, &pktlen, &buflen); + break; + case OPEN_TYPE_PASSIVE: + MPASS(m != NULL); + pkt = mtod(m, char *); + MPASS(*pkt == CPL_PASS_ACCEPT_REQ); + pkt += sizeof(struct cpl_pass_accept_req); + pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req); + buflen = m->m_len - sizeof(struct cpl_pass_accept_req); + break; + default: + MPASS(0); + return (&disallow_offloading_settings); + } + + if (pkt == NULL || pktlen == 0 || buflen == 0) + return (&disallow_offloading_settings); + + r = &op->rule[0]; + for (i = 0; i < op->nrules; i++, r++) { + if (r->open_type != open_type && + r->open_type != OPEN_TYPE_DONTCARE) { + continue; + } + matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen); + if (matched) + break; + } + + if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN) + free(pkt, M_CXGBE); + + return (matched ? 
&r->settings : &disallow_offloading_settings); +} + static void reclaim_wr_resources(void *arg, int count) { diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h index 0192238c05cc..b875b0bcf93e 100644 --- a/sys/dev/cxgbe/tom/t4_tom.h +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -87,6 +87,7 @@ enum { }; struct sockopt; +struct offload_settings; struct ofld_tx_sdesc { uint32_t plen; /* payload length */ @@ -333,13 +334,15 @@ void *lookup_tid(struct adapter *, int); void update_tid(struct adapter *, int, void *); void remove_tid(struct adapter *, int, int); void release_tid(struct adapter *, int, struct sge_wrq *); -int find_best_mtu_idx(struct adapter *, struct in_conninfo *, int); +int find_best_mtu_idx(struct adapter *, struct in_conninfo *, + struct offload_settings *); u_long select_rcv_wnd(struct socket *); int select_rcv_wscale(void); uint64_t calc_opt0(struct socket *, struct vi_info *, struct l2t_entry *, - int, int, int, int); + int, int, int, int, struct offload_settings *); uint64_t select_ntuple(struct vi_info *, struct l2t_entry *); -int select_ulp_mode(struct socket *, struct adapter *); +int select_ulp_mode(struct socket *, struct adapter *, + struct offload_settings *); void set_ulp_mode(struct toepcb *, int); int negative_advice(int); struct clip_entry *hold_lip(struct tom_data *, struct in6_addr *, @@ -416,6 +419,8 @@ void handle_ddp_close(struct toepcb *, struct tcpcb *, uint32_t); void handle_ddp_indicate(struct toepcb *); void handle_ddp_tcb_rpl(struct toepcb *, const struct cpl_set_tcb_rpl *); void insert_ddp_data(struct toepcb *, uint32_t); +const struct offload_settings *lookup_offload_policy(struct adapter *, int, + struct mbuf *, uint16_t, struct inpcb *); /* t4_tls.c */ bool can_tls_offload(struct adapter *); diff --git a/usr.sbin/cxgbetool/Makefile b/usr.sbin/cxgbetool/Makefile index da1ef0049b7c..ee31cdda5858 100644 --- a/usr.sbin/cxgbetool/Makefile +++ b/usr.sbin/cxgbetool/Makefile @@ -8,6 +8,7 @@ SRCS+= tcbinfot4.c tcbshowt4.c 
SRCS+= tcbinfot5.c tcbshowt5.c
SRCS+= tcbinfot6.c tcbshowt6.c
CFLAGS+= -I${SRCTOP}/sys/dev/cxgbe -I${SRCTOP}/sys -I.
+LIBADD=	pcap
WARNS?= 2
.include
diff --git a/usr.sbin/cxgbetool/cxgbetool.8 b/usr.sbin/cxgbetool/cxgbetool.8
index 81affe139d11..87c4a019c9b6 100644
--- a/usr.sbin/cxgbetool/cxgbetool.8
+++ b/usr.sbin/cxgbetool/cxgbetool.8
@@ -31,7 +31,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd March 6, 2017
+.Dd April 13, 2018
 .Dt CXGBETOOL 8
 .Os
 .Sh NAME
@@ -64,6 +64,10 @@
 .It
 .Nm Ar nexus Cm memdump Ar addr len
 .It
+.Nm Ar nexus Cm policy Ar cop.txt
+.It
+.Nm Ar nexus Cm policy clear
+.It
 .Nm Ar nexus Bro Cm reg | reg64 Brc Ar addr Ns Op Ar =val
 .It
 .Nm Ar nexus Cm regdump Op Ar register-block ...
@@ -378,6 +382,144 @@ bytes of data of the card's memory starting at
 .Ar addr Ns .
 The card's memory map is available in
 .Va dev.t4nex.%d.misc.meminfo Ns .
+.It Cm policy Ar cop.txt
+Install the Connection Offload Policy (COP) in
+.Ar cop.txt Ns .
+A COP offers fine-grained control over which connections get offloaded and with
+what parameters.
+Set
+.Cm hw.cxgbe.cop_managed_offloading="1"
+in loader.conf to ensure that t4_tom will not offload any connection before a
+COP is installed.
+Note that t4_tom must be loaded and operational (IFCAP_TOE enabled) as always
+for any kind of offload based on the hardware TOE.
+.Bl -column -offset indent "COP installed" "cop_managed_offloading" "Behavior"
+.It Sy COP installed Ta Sy cop_managed_offloading Ta Sy Behavior
+.It NO Ta 0 Ta offload all [Default]
+.It NO Ta 1 Ta no offload
+.It YES Ta Don't Care Ta Rule based offload
+.El
+.Pp
+The policy file consists of empty lines, comments (lines beginning with #) and
+any number of rules.
+Rules are applied in the order they appear in the file and processing stops at
+the first match.
+There is an implicit rule that disables offload for connections that do not
+match anything in the policy.
+.Pp
+Each rule consists of a filter part, which determines what connections the
+rule applies to, and a settings part, which determines whether matching
+connections will be offloaded and, if so, with what settings.
+The general form of a rule is
+.Bl -ohang -offset indent
+.It Cm \&[ Ar socket-type Cm \&] Ar pcap-filter Cm => Ar settings
+.Pp
+.Ar socket-type
+is one of the following.
+.Bl -tag -width "X" -compact
+.It Sy A
+Active open.
+Connection is being opened by this host.
+.It Sy P
+Passive open.
+Connection was requested by a peer.
+.It Sy L
+Listen called on a socket.
+Disabling offload in such a rule will prevent a hardware listener from being started.
+.It Sy D
+Don't care.
+Matches all of the above.
+.El
+.Pp
+.Ar pcap-filter
+is an expression that follows the
+.Xr pcap-filter 7
+syntax, or it is the keyword
+.Cm all
+that matches everything.
+.Pp
+.Ar settings
+determine whether connections matching
+.Ar socket-type
+and
+.Ar pcap-filter
+are offloaded and optionally sets some per-connection properties if they are.
+A combination of the following is allowed.
+.Bl -tag -width "timestamp" -compact
+.It Cm offload
+Connection should be offloaded.
+Use
+.Cm !offload
+or
+.Cm not offload
+to disable offload instead.
+.It Cm coalesce
+Enable rx payload coalescing.
+Negate to disable.
+.It Cm timestamp
+Enable TCP timestamp option.
+Negate to disable.
+.It Cm sack
+Enable TCP Selective Acknowledgements (SACK).
+Negate to disable.
+.It Cm nagle
+Enable Nagle's algorithm.
+Negate to disable.
+.It Cm ecn
+Enable Explicit Congestion Notification (ECN).
+Negate to disable.
+.It Cm ddp
+Use Direct Data Placement (zero copy receive) and zero copy transmit on the
+connection to service AIO requests on the socket.
+Negate to disable.
+.It Cm tls
+Set ULP mode to ULP_MODE_TLS.
+.It Cm cong Ar algo
+Use the specified congestion control algorithm.
+.Ar algo
+must be one of
+.Cm reno Ns , Cm tahoe Ns , Cm newreno Ns , or Cm highspeed Ns .
+.It Cm class Ar sc
+Bind the connection to the specified tx scheduling class.
+Valid range is 0 to 14 (for T4) and 0 to 15 (T5 onwards).
+.It Cm rxq Ar qid
+Use the specified offload rx queue.
+.Ar qid
+should be between 0 and nofldrxq for the ifnet.
+.It Cm txq Ar qid
+Use the specified offload tx queue.
+.Ar qid
+should be between 0 and nofldtxq for the ifnet.
+.It Cm bind Ar qnum
+Shorthand for
+.Cm rxq Ar qnum Cm txq Ar qnum Ns .
+Use only when nofldrxq is the same as nofldtxq.
+.It Cm mss Ar val
+Set the advertised TCP MSS in the SYN for this connection to
+.Ar val
+(in bytes).
+The hardware MTU table must already have an entry that is suitable for the MSS.
+.El
+.Pp
+.It Example of a COP.
+Note that the hardware listener for port 22 will be IPv4 only because the rule
+before it will prevent any IPv6 servers other than the first two. Also note
+that outgoing connections to 192.168/16 are the only outgoing connections that
+will get offloaded.
+.Bd -literal
+[L] port 80 => offload
+[L] port 443 => offload
+[L] ip6 => !offload
+[L] port 22 => offload
+[P] dst port 80 => offload cong highspeed !sack !ecn
+[P] dst port 443 => offload tls
+[A] dst net 192.168/16 => offload
+[A] all => !offload
+[D] port 22 => offload !nagle
+.Ed
+.El
+.It Cm policy clear
+Remove the Connection Offload Policy (COP) if one is in use.
.It Bro Cm reg | reg64 Brc Ar addr Ns Op Ar =val
.It Cm regdump Op Ar register-block ...
Display contents of device registers.
diff --git a/usr.sbin/cxgbetool/cxgbetool.c b/usr.sbin/cxgbetool/cxgbetool.c index ddadf337a477..8a49872a9d98 100644 --- a/usr.sbin/cxgbetool/cxgbetool.c +++ b/usr.sbin/cxgbetool/cxgbetool.c @@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include "t4_ioctl.h" #include "tcb_common.h" @@ -106,6 +107,8 @@ usage(FILE *fp) "\tloadfw install firmware\n" "\tmemdump dump a memory range\n" "\tmodinfo [raw] optics/cable information\n" + "\tpolicy install offload policy\n" + "\tpolicy clear remove offload policy\n" "\treg
[=] read/write register\n" "\treg64
[=] read/write 64 bit register\n" "\tregdump [] ... dump registers\n" @@ -2889,6 +2892,453 @@ sched_queue(int argc, const char *argv[]) return doit(CHELSIO_T4_SCHED_QUEUE, &op); } +static int +parse_offload_settings_word(const char *s, char **pnext, const char *ws, + int *pneg, struct offload_settings *os) +{ + + while (*s == '!') { + (*pneg)++; + s++; + } + + if (!strcmp(s, "not")) { + (*pneg)++; + return (0); + } + + if (!strcmp(s, "offload")) { + os->offload = (*pneg + 1) & 1; + *pneg = 0; + } else if (!strcmp(s , "coalesce")) { + os->rx_coalesce = (*pneg + 1) & 1; + *pneg = 0; + } else if (!strcmp(s, "timestamp") || !strcmp(s, "tstamp")) { + os->tstamp = (*pneg + 1) & 1; + *pneg = 0; + } else if (!strcmp(s, "sack")) { + os->sack = (*pneg + 1) & 1; + *pneg = 0; + } else if (!strcmp(s, "nagle")) { + os->nagle = (*pneg + 1) & 1; + *pneg = 0; + } else if (!strcmp(s, "ecn")) { + os->ecn = (*pneg + 1) & 1; + *pneg = 0; + } else if (!strcmp(s, "ddp")) { + os->ddp = (*pneg + 1) & 1; + *pneg = 0; + } else if (!strcmp(s, "tls")) { + os->tls = (*pneg + 1) & 1; + *pneg = 0; + } else { + char *param, *p; + long val; + + /* Settings with additional parameter handled here. 
*/ + + if (*pneg) { + warnx("\"%s\" is not a valid keyword, or it does not " + "support negation.", s); + return (EINVAL); + } + + while ((param = strsep(pnext, ws)) != NULL) { + if (*param != '\0') + break; + } + if (param == NULL) { + warnx("\"%s\" is not a valid keyword, or it requires a " + "parameter that has not been provided.", s); + return (EINVAL); + } + + if (!strcmp(s, "cong")) { + if (!strcmp(param, "reno")) + os->cong_algo = 0; + else if (!strcmp(param, "tahoe")) + os->cong_algo = 1; + else if (!strcmp(param, "newreno")) + os->cong_algo = 2; + else if (!strcmp(param, "highspeed")) + os->cong_algo = 3; + else { + warnx("unknown congestion algorithm \"%s\".", s); + return (EINVAL); + } + } else if (!strcmp(s, "class")) { + val = -1; + p = str_to_number(param, &val, NULL); + /* (nsched_cls - 1) is spelled 15 here. */ + if (*p || val < 0 || val > 15) { + warnx("invalid scheduling class \"%s\". " + "\"class\" needs an integer value where " + "0 <= value <= 15", param); + return (EINVAL); + } + os->sched_class = val; + } else if (!strcmp(s, "bind") || !strcmp(s, "txq") || + !strcmp(s, "rxq")) { + val = -1; + if (strcmp(param, "random")) { + p = str_to_number(param, &val, NULL); + if (*p || val < 0 || val > 0xffff) { + warnx("invalid queue specification " + "\"%s\". \"%s\" needs an integer" + " value, or \"random\".", + param, s); + return (EINVAL); + } + } + if (!strcmp(s, "bind")) { + os->txq = val; + os->rxq = val; + } else if (!strcmp(s, "txq")) { + os->txq = val; + } else if (!strcmp(s, "rxq")) { + os->rxq = val; + } else { + return (EDOOFUS); + } + } else if (!strcmp(s, "mss")) { + val = -1; + p = str_to_number(param, &val, NULL); + if (*p || val <= 0) { + warnx("invalid MSS specification \"%s\". 
" + "\"mss\" needs a positive integer value", + param); + return (EINVAL); + } + os->mss = val; + } else { + warnx("unknown settings keyword: \"%s\"", s); + return (EINVAL); + } + } + + return (0); +} + +static int +parse_offload_settings(const char *settings_ro, struct offload_settings *os) +{ + const char *ws = " \f\n\r\v\t"; + char *settings, *s, *next; + int rc, nsettings, neg; + static const struct offload_settings default_settings = { + .offload = 0, /* No settings imply !offload */ + .rx_coalesce = -1, + .cong_algo = -1, + .sched_class = -1, + .tstamp = -1, + .sack = -1, + .nagle = -1, + .ecn = -1, + .ddp = -1, + .tls = -1, + .txq = -1, + .rxq = -1, + .mss = -1, + }; + + *os = default_settings; + + next = settings = strdup(settings_ro); + if (settings == NULL) { + warn (NULL); + return (errno); + } + + nsettings = 0; + rc = 0; + neg = 0; + while ((s = strsep(&next, ws)) != NULL) { + if (*s == '\0') + continue; + nsettings++; + rc = parse_offload_settings_word(s, &next, ws, &neg, os); + if (rc != 0) + goto done; + } + if (nsettings == 0) { + warnx("no settings provided"); + rc = EINVAL; + goto done; + } + if (neg > 0) { + warnx("%d stray negation(s) at end of offload settings", neg); + rc = EINVAL; + goto done; + } +done: + free(settings); + return (rc); +} + +static int +isempty_line(char *line, size_t llen) +{ + + /* skip leading whitespace */ + while (isspace(*line)) { + line++; + llen--; + } + if (llen == 0 || *line == '#' || *line == '\n') + return (1); + + return (0); +} + +static int +special_offload_rule(char *str) +{ + + /* skip leading whitespaces */ + while (isspace(*str)) + str++; + + /* check for special strings: "-", "all", "any" */ + if (*str == '-') { + str++; + } else if (!strncmp(str, "all", 3) || !strncmp(str, "any", 3)) { + str += 3; + } else { + return (0); + } + + /* skip trailing whitespaces */ + while (isspace(*str)) + str++; + + return (*str == '\0'); +} + +/* + * A rule has 3 parts: an open-type, a match expression, and offload 
settings. + * + * [] => + */ +static int +parse_offload_policy_line(size_t lno, char *line, size_t llen, pcap_t *pd, + struct offload_rule *r) +{ + char *expr, *settings, *s; + + bzero(r, sizeof(*r)); + + /* Skip leading whitespace. */ + while (isspace(*line)) + line++; + /* Trim trailing whitespace */ + s = &line[llen - 1]; + while (isspace(*s)) { + *s-- = '\0'; + llen--; + } + + /* + * First part of the rule: '[X]' where X = A/D/L/P + */ + if (*line++ != '[') { + warnx("missing \"[\" on line %zd", lno); + return (EINVAL); + } + switch (*line) { + case 'A': + case 'D': + case 'L': + case 'P': + r->open_type = *line; + break; + default: + warnx("invalid socket-type \"%c\" on line %zd.", *line, lno); + return (EINVAL); + } + line++; + if (*line++ != ']') { + warnx("missing \"]\" after \"[%c\" on line %zd", + r->open_type, lno); + return (EINVAL); + } + + /* Skip whitespace. */ + while (isspace(*line)) + line++; + + /* + * Rest of the rule: => + */ + expr = line; + s = strstr(line, "=>"); + if (s == NULL) + return (EINVAL); + settings = s + 2; + while (isspace(*settings)) + settings++; + *s = '\0'; + + /* + * is either a special name (all, any) or a pcap-filter(7). + * In case of a special name the bpf_prog stays all-zero. + */ + if (!special_offload_rule(expr)) { + if (pcap_compile(pd, &r->bpf_prog, expr, 1, + PCAP_NETMASK_UNKNOWN) < 0) { + warnx("failed to compile \"%s\" on line %zd: %s", expr, + lno, pcap_geterr(pd)); + return (EINVAL); + } + } + + /* settings to apply on a match. */ + if (parse_offload_settings(settings, &r->settings) != 0) { + warnx("failed to parse offload settings \"%s\" on line %zd", + settings, lno); + pcap_freecode(&r->bpf_prog); + return (EINVAL); + } + + return (0); + +} + +/* + * Note that op itself is not dynamically allocated. 
+ */
+static void
+free_offload_policy(struct t4_offload_policy *op)
+{
+	int i;
+
+	for (i = 0; i < op->nrules; i++) {
+		/*
+		 * pcap_freecode can cope with empty bpf_prog, which is the case
+		 * for a rule that matches on 'any/all/-'.
+		 */
+		pcap_freecode(&op->rule[i].bpf_prog);
+	}
+	free(op->rule);
+	op->nrules = 0;
+	op->rule = NULL;
+}
+
+#define REALLOC_STRIDE 32
+
+/*
+ * Fills up op->nrules and op->rule.
+ */
+static int
+parse_offload_policy(const char *fname, struct t4_offload_policy *op)
+{
+	FILE *fp;
+	char *line;
+	int lno, maxrules, rc;
+	size_t lcap, llen;
+	struct offload_rule *r;
+	pcap_t *pd;
+
+	fp = fopen(fname, "r");
+	if (fp == NULL) {
+		warn("Unable to open file \"%s\"", fname);
+		return (errno);
+	}
+	pd = pcap_open_dead(DLT_EN10MB, 128);
+	if (pd == NULL) {
+		warnx("Failed to open pcap device");
+		fclose(fp);
+		return (EIO);
+	}
+
+	rc = 0;
+	lno = 0;
+	lcap = 0;
+	maxrules = 0;
+	op->nrules = 0;
+	op->rule = NULL;
+	line = NULL;
+
+	while ((llen = getline(&line, &lcap, fp)) != -1) {
+		lno++;
+
+		/* Skip empty lines. 
*/ + if (isempty_line(line, llen)) + continue; + + if (op->nrules == maxrules) { + maxrules += REALLOC_STRIDE; + r = realloc(op->rule, + maxrules * sizeof(struct offload_rule)); + if (r == NULL) { + warnx("failed to allocate memory for %d rules", + maxrules); + rc = ENOMEM; + goto done; + } + op->rule = r; + } + + r = &op->rule[op->nrules]; + rc = parse_offload_policy_line(lno, line, llen, pd, r); + if (rc != 0) { + warnx("Error parsing line %d of \"%s\"", lno, fname); + goto done; + } + + op->nrules++; + } + free(line); + + if (!feof(fp)) { + warn("Error while reading from file \"%s\" at line %d", + fname, lno); + rc = errno; + goto done; + } + + if (op->nrules == 0) { + warnx("No valid rules found in \"%s\"", fname); + rc = EINVAL; + } +done: + pcap_close(pd); + fclose(fp); + if (rc != 0) { + free_offload_policy(op); + } + + return (rc); +} + +static int +load_offload_policy(int argc, const char *argv[]) +{ + int rc = 0; + const char *fname = argv[0]; + struct t4_offload_policy op = {0}; + + if (argc != 1) { + warnx("incorrect number of arguments."); + return (EINVAL); + } + + if (!strcmp(fname, "clear") || !strcmp(fname, "none")) { + /* op.nrules is 0 and that means clear policy */ + return (doit(CHELSIO_T4_SET_OFLD_POLICY, &op)); + } + + rc = parse_offload_policy(fname, &op); + if (rc != 0) { + /* Error message displayed already */ + return (EINVAL); + } + + rc = doit(CHELSIO_T4_SET_OFLD_POLICY, &op); + free_offload_policy(&op); + + return (rc); +} + static int run_cmd(int argc, const char *argv[]) { @@ -2935,6 +3385,8 @@ run_cmd(int argc, const char *argv[]) rc = loadbootcfg(argc, argv); else if (!strcmp(cmd, "dumpstate")) rc = dumpstate(argc, argv); + else if (!strcmp(cmd, "policy")) + rc = load_offload_policy(argc, argv); else { rc = EINVAL; warnx("invalid command \"%s\"", cmd);