1 /************************************************************************** 2 3 Copyright (c) 2007, Chelsio Inc. 4 All rights reserved. 5 6 Redistribution and use in source and binary forms, with or without 7 modification, are permitted provided that the following conditions are met: 8 9 1. Redistributions of source code must retain the above copyright notice, 10 this list of conditions and the following disclaimer. 11 12 2. Neither the name of the Chelsio Corporation nor the names of its 13 contributors may be used to endorse or promote products derived from 14 this software without specific prior written permission. 15 16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 POSSIBILITY OF SUCH DAMAGE. 27 28 ***************************************************************************/ 29 30 #include <sys/cdefs.h> 31 __KERNEL_RCSID(0, "$NetBSD: cxgb_l2t.c,v 1.5 2018/12/22 14:28:56 maxv Exp $"); 32 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/kernel.h> 36 #include <sys/lock.h> 37 #include <sys/mutex.h> 38 39 #include <sys/socket.h> 40 #include <sys/socketvar.h> 41 #include <net/if.h> 42 #include <netinet/in.h> 43 #include <netinet/in_var.h> 44 #include <netinet/if_inarp.h> 45 #include <net/if_dl.h> 46 #include <net/route.h> 47 #include <netinet/in.h> 48 49 #ifdef CONFIG_DEFINED 50 #include <cxgb_include.h> 51 #else 52 #include "cxgb_include.h" 53 #endif 54 55 #define VLAN_NONE 0xfff 56 #define SDL(s) ((struct sockaddr_dl *)s) 57 #define RT_ENADDR(rt) ((u_char *)LLADDR(SDL((rt)))) 58 #define rt_expire rt_rmx.rmx_expire 59 60 /* 61 * Module locking notes: There is a RW lock protecting the L2 table as a 62 * whole plus a spinlock per L2T entry. Entry lookups and allocations happen 63 * under the protection of the table lock, individual entry changes happen 64 * while holding that entry's spinlock. The table lock nests outside the 65 * entry locks. Allocations of new entries take the table lock as writers so 66 * no other lookups can happen while allocating new entries. Entry updates 67 * take the table lock as readers so multiple entries can be updated in 68 * parallel. An L2T entry can be dropped by decrementing its reference count 69 * and therefore can happen in parallel with entry allocation but no entry 70 * can change state or increment its ref count during allocation as both of 71 * these perform lookups. 72 */ 73 74 static inline unsigned int 75 vlan_prio(const struct l2t_entry *e) 76 { 77 return e->vlan >> 13; 78 } 79 80 static inline unsigned int 81 arp_hash(u32 key, int ifindex, const struct l2t_data *d) 82 { 83 return jhash_2words(key, ifindex, 0) & (d->nentries - 1); 84 } 85 86 static inline void 87 neigh_replace(struct l2t_entry *e, struct rtentry *rt) 88 { 89 RT_LOCK(rt); 90 RT_ADDREF(rt); 91 RT_UNLOCK(rt); 92 93 if (e->neigh) { 94 RT_LOCK(e->neigh); 95 RT_REMREF(e->neigh); 96 RT_UNLOCK(e->neigh); 97 } 98 e->neigh = rt; 99 } 100 101 /* 102 * Set up an L2T entry and send any packets waiting in the arp queue. The 103 * supplied mbuf is used for the CPL_L2T_WRITE_REQ. Must be called with the 104 * entry locked. 105 */ 106 static int 107 setup_l2e_send_pending(struct toedev *dev, struct mbuf *m, 108 struct l2t_entry *e) 109 { 110 struct cpl_l2t_write_req *req; 111 112 if (!m) { 113 if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) 114 return (ENOMEM); 115 } 116 /* 117 * XXX m_align 118 */ 119 req = mtod(m, struct cpl_l2t_write_req *); 120 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 121 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx)); 122 req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) | 123 V_L2T_W_VLAN(e->vlan & EVL_VLID_MASK) | 124 V_L2T_W_PRIO(vlan_prio(e))); 125 126 memcpy(e->dmac, RT_ENADDR(e->neigh), sizeof(e->dmac)); 127 memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); 128 m_set_priority(m, CPL_PRIORITY_CONTROL); 129 while (e->arpq_head) { 130 m = e->arpq_head; /* XXX XXX XXX: Memory leak? */ 131 e->arpq_head = m->m_next; 132 m->m_next = NULL; 133 } 134 e->arpq_tail = NULL; 135 e->state = L2T_STATE_VALID; 136 137 return 0; 138 } 139 140 /* 141 * Add a packet to the an L2T entry's queue of packets awaiting resolution. 142 * Must be called with the entry's lock held. 143 */ 144 static inline void 145 arpq_enqueue(struct l2t_entry *e, struct mbuf *m) 146 { 147 m->m_next = NULL; 148 if (e->arpq_head) 149 e->arpq_tail->m_next = m; 150 else 151 e->arpq_head = m; 152 e->arpq_tail = m; 153 } 154 155 int 156 t3_l2t_send_slow(struct toedev *dev, struct mbuf *m, 157 struct l2t_entry *e) 158 { 159 struct rtentry *rt; 160 struct mbuf *m0; 161 162 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) 163 return (ENOMEM); 164 165 rt = e->neigh; 166 167 again: 168 switch (e->state) { 169 case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ 170 arpresolve(rt->rt_ifp, rt, m0, rt->rt_gateway, RT_ENADDR(rt)); 171 mtx_lock(&e->lock); 172 if (e->state == L2T_STATE_STALE) 173 e->state = L2T_STATE_VALID; 174 mtx_unlock(&e->lock); 175 case L2T_STATE_VALID: /* fast-path, send the packet on */ 176 case L2T_STATE_RESOLVING: 177 mtx_lock(&e->lock); 178 if (e->state != L2T_STATE_RESOLVING) { // ARP already completed 179 mtx_unlock(&e->lock); 180 goto again; 181 } 182 arpq_enqueue(e, m); 183 mtx_unlock(&e->lock); 184 185 /* XXX XXX XXX: Memory leak? */ 186 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) 187 return (ENOMEM); 188 /* 189 * Only the first packet added to the arpq should kick off 190 * resolution. However, because the m_gethdr below can fail, 191 * we allow each packet added to the arpq to retry resolution 192 * as a way of recovering from transient memory exhaustion. 193 * A better way would be to use a work request to retry L2T 194 * entries when there's no memory. 195 */ 196 if (arpresolve(rt->rt_ifp, rt, m0, rt->rt_gateway, RT_ENADDR(rt)) == 0) { 197 198 mtx_lock(&e->lock); 199 if (e->arpq_head) 200 setup_l2e_send_pending(dev, m, e); 201 else 202 m_freem(m); 203 mtx_unlock(&e->lock); 204 } 205 } 206 return 0; 207 } 208 209 void 210 t3_l2t_send_event(struct toedev *dev, struct l2t_entry *e) 211 { 212 struct rtentry *rt; 213 struct mbuf *m0; 214 215 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) 216 return; 217 218 rt = e->neigh; 219 again: 220 switch (e->state) { 221 case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ 222 arpresolve(rt->rt_ifp, rt, m0, rt->rt_gateway, RT_ENADDR(rt)); 223 mtx_lock(&e->lock); 224 if (e->state == L2T_STATE_STALE) { 225 e->state = L2T_STATE_VALID; 226 } 227 mtx_unlock(&e->lock); 228 return; 229 case L2T_STATE_VALID: /* fast-path, send the packet on */ 230 return; 231 case L2T_STATE_RESOLVING: 232 mtx_lock(&e->lock); 233 if (e->state != L2T_STATE_RESOLVING) { // ARP already completed 234 mtx_unlock(&e->lock); 235 goto again; 236 } 237 mtx_unlock(&e->lock); 238 239 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) 240 return; 241 /* 242 * Only the first packet added to the arpq should kick off 243 * resolution. However, because the alloc_skb below can fail, 244 * we allow each packet added to the arpq to retry resolution 245 * as a way of recovering from transient memory exhaustion. 246 * A better way would be to use a work request to retry L2T 247 * entries when there's no memory. 248 */ 249 arpresolve(rt->rt_ifp, rt, m0, rt->rt_gateway, RT_ENADDR(rt)); 250 251 } 252 return; 253 } 254 /* 255 * Allocate a free L2T entry. Must be called with l2t_data.lock held. 256 */ 257 static struct l2t_entry * 258 alloc_l2e(struct l2t_data *d) 259 { 260 struct l2t_entry *end, *e, **p; 261 262 if (!atomic_load_acq_int(&d->nfree)) 263 return NULL; 264 265 /* there's definitely a free entry */ 266 for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e) 267 if (atomic_load_acq_int(&e->refcnt) == 0) 268 goto found; 269 270 for (e = &d->l2tab[1]; atomic_load_acq_int(&e->refcnt); ++e) ; 271 found: 272 d->rover = e + 1; 273 atomic_add_int(&d->nfree, -1); 274 275 /* 276 * The entry we found may be an inactive entry that is 277 * presently in the hash table. We need to remove it. 278 */ 279 if (e->state != L2T_STATE_UNUSED) { 280 int hash = arp_hash(e->addr, e->ifindex, d); 281 282 for (p = &d->l2tab[hash].first; *p; p = &(*p)->next) 283 if (*p == e) { 284 *p = e->next; 285 break; 286 } 287 e->state = L2T_STATE_UNUSED; 288 } 289 return e; 290 } 291 292 /* 293 * Called when an L2T entry has no more users. The entry is left in the hash 294 * table since it is likely to be reused but we also bump nfree to indicate 295 * that the entry can be reallocated for a different neighbor. We also drop 296 * the existing neighbor reference in case the neighbor is going away and is 297 * waiting on our reference. 298 * 299 * Because entries can be reallocated to other neighbors once their ref count 300 * drops to 0 we need to take the entry's lock to avoid races with a new 301 * incarnation. 302 */ 303 void 304 t3_l2e_free(struct l2t_data *d, struct l2t_entry *e) 305 { 306 mtx_lock(&e->lock); 307 if (atomic_load_acq_int(&e->refcnt) == 0) { /* hasn't been recycled */ 308 if (e->neigh) { 309 RT_LOCK(e->neigh); 310 RT_REMREF(e->neigh); 311 RT_UNLOCK(e->neigh); 312 e->neigh = NULL; 313 } 314 } 315 mtx_unlock(&e->lock); 316 atomic_add_int(&d->nfree, 1); 317 } 318 319 /* 320 * Update an L2T entry that was previously used for the same next hop as neigh. 321 * Must be called with softirqs disabled. 322 */ 323 static inline void 324 reuse_entry(struct l2t_entry *e, struct rtentry *neigh) 325 { 326 struct llinfo_arp *la; 327 328 la = (struct llinfo_arp *)neigh->rt_llinfo; 329 330 mtx_lock(&e->lock); /* avoid race with t3_l2t_free */ 331 if (neigh != e->neigh) 332 neigh_replace(e, neigh); 333 334 if (memcmp(e->dmac, RT_ENADDR(neigh), sizeof(e->dmac)) || 335 (neigh->rt_expire > time_uptime)) 336 e->state = L2T_STATE_RESOLVING; 337 else if (la->la_hold == NULL) 338 e->state = L2T_STATE_VALID; 339 else 340 e->state = L2T_STATE_STALE; 341 mtx_unlock(&e->lock); 342 } 343 344 struct l2t_entry * 345 t3_l2t_get(struct toedev *dev, struct rtentry *neigh, 346 unsigned int smt_idx) 347 { 348 struct l2t_entry *e; 349 struct l2t_data *d = L2DATA(dev); 350 u32 addr = ((struct sockaddr_in *)rt_getkey(neigh))->sin_addr.s_addr; 351 int ifidx = neigh->rt_ifp->if_index; 352 int hash = arp_hash(addr, ifidx, d); 353 354 rw_wlock(&d->lock); 355 for (e = d->l2tab[hash].first; e; e = e->next) 356 if (e->addr == addr && e->ifindex == ifidx && 357 e->smt_idx == smt_idx) { 358 l2t_hold(d, e); 359 if (atomic_load_acq_int(&e->refcnt) == 1) 360 reuse_entry(e, neigh); 361 goto done; 362 } 363 364 /* Need to allocate a new entry */ 365 e = alloc_l2e(d); 366 if (e) { 367 mtx_lock(&e->lock); /* avoid race with t3_l2t_free */ 368 e->next = d->l2tab[hash].first; 369 d->l2tab[hash].first = e; 370 e->state = L2T_STATE_RESOLVING; 371 e->addr = addr; 372 e->ifindex = ifidx; 373 e->smt_idx = smt_idx; 374 atomic_store_rel_int(&e->refcnt, 1); 375 neigh_replace(e, neigh); 376 #ifdef notyet 377 /* 378 * XXX need to add accessor function for vlan tag 379 */ 380 if (neigh->rt_ifp->if_vlantrunk) 381 e->vlan = VLAN_DEV_INFO(neigh->dev)->vlan_id; 382 else 383 #endif 384 e->vlan = VLAN_NONE; 385 mtx_unlock(&e->lock); 386 } 387 done: 388 rw_wunlock(&d->lock); 389 return e; 390 } 391 392 /* 393 * Called when address resolution fails for an L2T entry to handle packets 394 * on the arpq head. If a packet specifies a failure handler it is invoked, 395 * otherwise the packets is sent to the TOE. 396 * 397 * XXX: maybe we should abandon the latter behavior and just require a failure 398 * handler. 399 */ 400 static void 401 handle_failed_resolution(struct toedev *dev, struct mbuf *arpq) 402 { 403 404 while (arpq) { 405 struct mbuf *m = arpq; 406 #ifdef notyet 407 struct l2t_mbuf_cb *cb = L2T_MBUF_CB(m); 408 #endif 409 arpq = m->m_next; 410 m->m_next = NULL; 411 #ifdef notyet 412 if (cb->arp_failure_handler) 413 cb->arp_failure_handler(dev, m); 414 else 415 #endif 416 } 417 418 } 419 420 #if defined(NETEVENT) || !defined(CONFIG_CHELSIO_T3_MODULE) 421 /* 422 * Called when the host's ARP layer makes a change to some entry that is 423 * loaded into the HW L2 table. 424 */ 425 void 426 t3_l2t_update(struct toedev *dev, struct rtentry *neigh) 427 { 428 struct l2t_entry *e; 429 struct mbuf *arpq = NULL; 430 struct l2t_data *d = L2DATA(dev); 431 u32 addr = ((struct sockaddr_in *)rt_getkey(neigh))->sin_addr.s_addr; 432 int ifidx = neigh->rt_ifp->if_index; 433 int hash = arp_hash(addr, ifidx, d); 434 struct llinfo_arp *la; 435 436 rw_rlock(&d->lock); 437 for (e = d->l2tab[hash].first; e; e = e->next) 438 if (e->addr == addr && e->ifindex == ifidx) { 439 mtx_lock(&e->lock); 440 goto found; 441 } 442 rw_runlock(&d->lock); 443 return; 444 445 found: 446 rw_runlock(&d->lock); 447 if (atomic_load_acq_int(&e->refcnt)) { 448 if (neigh != e->neigh) 449 neigh_replace(e, neigh); 450 451 la = (struct llinfo_arp *)neigh->rt_llinfo; 452 if (e->state == L2T_STATE_RESOLVING) { 453 454 if (la->la_asked >= 5 /* arp_maxtries */) { 455 arpq = e->arpq_head; 456 e->arpq_head = e->arpq_tail = NULL; 457 } else if (la->la_hold == NULL) 458 setup_l2e_send_pending(dev, NULL, e); 459 } else { 460 e->state = (la->la_hold == NULL) ? 461 L2T_STATE_VALID : L2T_STATE_STALE; 462 if (memcmp(e->dmac, RT_ENADDR(neigh), 6)) 463 setup_l2e_send_pending(dev, NULL, e); 464 } 465 } 466 mtx_unlock(&e->lock); 467 468 if (arpq) 469 handle_failed_resolution(dev, arpq); 470 } 471 #else 472 /* 473 * Called from a kprobe, interrupts are off. 474 */ 475 void 476 t3_l2t_update(struct toedev *dev, struct rtentry *neigh) 477 { 478 struct l2t_entry *e; 479 struct l2t_data *d = L2DATA(dev); 480 u32 addr = *(u32 *) rt_key(neigh); 481 int ifidx = neigh->dev->ifindex; 482 int hash = arp_hash(addr, ifidx, d); 483 484 rw_rlock(&d->lock); 485 for (e = d->l2tab[hash].first; e; e = e->next) 486 if (e->addr == addr && e->ifindex == ifidx) { 487 mtx_lock(&e->lock); 488 if (atomic_load_acq_int(&e->refcnt)) { 489 if (neigh != e->neigh) 490 neigh_replace(e, neigh); 491 e->tdev = dev; 492 mod_timer(&e->update_timer, jiffies + 1); 493 } 494 mtx_unlock(&e->lock); 495 break; 496 } 497 rw_runlock(&d->lock); 498 } 499 500 static void 501 update_timer_cb(unsigned long data) 502 { 503 struct mbuf *arpq = NULL; 504 struct l2t_entry *e = (struct l2t_entry *)data; 505 struct rtentry *neigh = e->neigh; 506 struct toedev *dev = e->tdev; 507 508 barrier(); 509 if (!atomic_load_acq_int(&e->refcnt)) 510 return; 511 512 rw_rlock(&neigh->lock); 513 mtx_lock(&e->lock); 514 515 if (atomic_load_acq_int(&e->refcnt)) { 516 if (e->state == L2T_STATE_RESOLVING) { 517 if (neigh->nud_state & NUD_FAILED) { 518 arpq = e->arpq_head; 519 e->arpq_head = e->arpq_tail = NULL; 520 } else if (neigh_is_connected(neigh) && e->arpq_head) 521 setup_l2e_send_pending(dev, NULL, e); 522 } else { 523 e->state = neigh_is_connected(neigh) ? 524 L2T_STATE_VALID : L2T_STATE_STALE; 525 if (memcmp(e->dmac, RT_ENADDR(neigh), sizeof(e->dmac))) 526 setup_l2e_send_pending(dev, NULL, e); 527 } 528 } 529 mtx_unlock(&e->lock); 530 rw_runlock(&neigh->lock); 531 532 if (arpq) 533 handle_failed_resolution(dev, arpq); 534 } 535 #endif 536 537 struct l2t_data * 538 t3_init_l2t(unsigned int l2t_capacity) 539 { 540 struct l2t_data *d; 541 int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry); 542 543 d = cxgb_alloc_mem(size); 544 if (!d) 545 return NULL; 546 547 d->nentries = l2t_capacity; 548 d->rover = &d->l2tab[1]; /* entry 0 is not used */ 549 atomic_store_rel_int(&d->nfree, l2t_capacity - 1); 550 rw_init(&d->lock, "L2T"); 551 552 for (i = 0; i < l2t_capacity; ++i) { 553 d->l2tab[i].idx = i; 554 d->l2tab[i].state = L2T_STATE_UNUSED; 555 mtx_init(&d->l2tab[i].lock, "L2TAB", NULL, MTX_DEF); 556 atomic_store_rel_int(&d->l2tab[i].refcnt, 0); 557 #ifndef NETEVENT 558 #ifdef CONFIG_CHELSIO_T3_MODULE 559 setup_timer(&d->l2tab[i].update_timer, update_timer_cb, 560 (unsigned long)&d->l2tab[i]); 561 #endif 562 #endif 563 } 564 return d; 565 } 566 567 void 568 t3_free_l2t(struct l2t_data *d) 569 { 570 #ifndef NETEVENT 571 #ifdef CONFIG_CHELSIO_T3_MODULE 572 int i; 573 574 /* Stop all L2T timers */ 575 for (i = 0; i < d->nentries; ++i) 576 del_timer_sync(&d->l2tab[i].update_timer); 577 #endif 578 #endif 579 cxgb_free_mem(d); 580 } 581 582 #ifdef CONFIG_PROC_FS 583 #include <linux/module.h> 584 #include <linux/proc_fs.h> 585 #include <linux/seq_file.h> 586 587 static inline void * 588 l2t_get_idx(struct seq_file *seq, loff_t pos) 589 { 590 struct l2t_data *d = seq->private; 591 592 return pos >= d->nentries ? NULL : &d->l2tab[pos]; 593 } 594 595 static void * 596 l2t_seq_start(struct seq_file *seq, loff_t *pos) 597 { 598 return *pos ? l2t_get_idx(seq, *pos) : SEQ_START_TOKEN; 599 } 600 601 static void * 602 l2t_seq_next(struct seq_file *seq, void *v, loff_t *pos) 603 { 604 v = l2t_get_idx(seq, *pos + 1); 605 if (v) 606 ++*pos; 607 return v; 608 } 609 610 static void 611 l2t_seq_stop(struct seq_file *seq, void *v) 612 { 613 } 614 615 static char 616 l2e_state(const struct l2t_entry *e) 617 { 618 switch (e->state) { 619 case L2T_STATE_VALID: return 'V'; /* valid, fast-path entry */ 620 case L2T_STATE_STALE: return 'S'; /* needs revalidation, but usable */ 621 case L2T_STATE_RESOLVING: 622 return e->arpq_head ? 'A' : 'R'; 623 default: 624 return 'U'; 625 } 626 } 627 628 static int 629 l2t_seq_show(struct seq_file *seq, void *v) 630 { 631 if (v == SEQ_START_TOKEN) 632 seq_puts(seq, "Index IP address Ethernet address VLAN " 633 "Prio State Users SMTIDX Port\n"); 634 else { 635 char ip[20]; 636 struct l2t_entry *e = v; 637 638 mtx_lock(&e->lock); 639 snprintf(ip, sizeof(ip), "%u.%u.%u.%u", NIPQUAD(e->addr)); 640 seq_printf(seq, "%-5u %-15s %02x:%02x:%02x:%02x:%02x:%02x %4d" 641 " %3u %c %7u %4u %s\n", 642 e->idx, ip, e->dmac[0], e->dmac[1], e->dmac[2], 643 e->dmac[3], e->dmac[4], e->dmac[5], 644 e->vlan & EVL_VLID_MASK, vlan_prio(e), 645 l2e_state(e), atomic_load_acq_int(&e->refcnt), e->smt_idx, 646 e->neigh ? e->neigh->dev->name : ""); 647 mtx_unlock(&e->lock); 648 } 649 return 0; 650 } 651 652 #endif 653