/* Evaluate MSG_ZEROCOPY
 *
 * Send traffic between two processes over one of the supported
 * protocols and modes:
 *
 * PF_INET/PF_INET6
 * - SOCK_STREAM
 * - SOCK_DGRAM
 * - SOCK_DGRAM with UDP_CORK
 * - SOCK_RAW
 * - SOCK_RAW with IP_HDRINCL
 *
 * PF_PACKET
 * - SOCK_DGRAM
 * - SOCK_RAW
 *
 * PF_RDS
 * - SOCK_SEQPACKET
 *
 * Start this program on two connected hosts, one in send mode and
 * the other with option '-r' to put it in receiver mode.
 *
 * If zerocopy mode ('-z') is enabled, the sender will verify that
 * the kernel queues completions on the error queue for all zerocopy
 * transfers.
 */
27
28#define _GNU_SOURCE
29
30#include <arpa/inet.h>
31#include <error.h>
32#include <errno.h>
33#include <limits.h>
34#include <linux/errqueue.h>
35#include <linux/if_packet.h>
36#include <linux/ipv6.h>
37#include <linux/socket.h>
38#include <linux/sockios.h>
39#include <net/ethernet.h>
40#include <net/if.h>
41#include <netinet/ip.h>
42#include <netinet/ip6.h>
43#include <netinet/tcp.h>
44#include <netinet/udp.h>
45#include <poll.h>
46#include <sched.h>
47#include <stdbool.h>
48#include <stdio.h>
49#include <stdint.h>
50#include <stdlib.h>
51#include <string.h>
52#include <sys/ioctl.h>
53#include <sys/socket.h>
54#include <sys/stat.h>
55#include <sys/time.h>
56#include <sys/types.h>
57#include <sys/wait.h>
58#include <unistd.h>
59#include <linux/rds.h>
60
/* Fallback values for zerocopy constants that older system headers lack. */
#if !defined(SO_EE_ORIGIN_ZEROCOPY)
#define SO_EE_ORIGIN_ZEROCOPY 5
#endif

#if !defined(SO_ZEROCOPY)
#define SO_ZEROCOPY 60
#endif

#if !defined(SO_EE_CODE_ZEROCOPY_COPIED)
#define SO_EE_CODE_ZEROCOPY_COPIED 1
#endif

#if !defined(MSG_ZEROCOPY)
#define MSG_ZEROCOPY 0x4000000
#endif
76
/* Run configuration; defaults below are overridden by parse_opts(). */
static int cfg_cork;			/* -c: sends per corked datagram, 0 = no corking */
static bool cfg_cork_mixed;		/* -m: alternate copy and zerocopy sends */
static int cfg_cpu = -1;		/* -C: cpu handed to do_setcpu() */
static int cfg_family = PF_UNSPEC;	/* -4 / -6 */
static int cfg_ifindex = 1;		/* -i: interface index for PF_PACKET */
static int cfg_payload_len;		/* -s: payload bytes per send */
static int cfg_port = 8000;		/* -p */
static bool cfg_rx;			/* -r: run as receiver */
static int cfg_runtime_ms = 4200;	/* -t: derived from seconds */
static int cfg_verbose;			/* -v: incremented per occurrence */
static int cfg_waittime_ms = 500;	/* poll timeout, completion wait */
static bool cfg_zerocopy;		/* -z: send with MSG_ZEROCOPY */

/* Addresses built by setup_sockaddr(); cfg_alen is their length. */
static socklen_t cfg_alen;
static struct sockaddr_storage cfg_dst_addr;
static struct sockaddr_storage cfg_src_addr;

/* Data buffer; do_test() fills it with a repeating 'a'..'z' pattern. */
static char payload[IP_MAXPACKET];

/* Counters reported at the end of a run. */
static long packets;
static long bytes;
static long completions;
static long expected_completions;

/* -1 until the first notification, then 1 if zerocopy, 0 if copied. */
static int zerocopied = -1;

/* Low end of the notification range expected next. */
static uint32_t next_completion;
98
/* Return the current gettimeofday() time expressed in milliseconds. */
static unsigned long gettimeofday_ms(void)
{
	struct timeval now;
	unsigned long ms;

	gettimeofday(&now, NULL);

	ms = now.tv_sec * 1000;
	ms += now.tv_usec / 1000;

	return ms;
}
106
/* Compute the ones' complement checksum over num_words 16-bit words.
 *
 * The words are summed, carries above bit 15 are folded back into the
 * low 16 bits, and the inverted result is returned.
 */
static uint16_t get_ip_csum(const uint16_t *start, int num_words)
{
	const uint16_t *word = start;
	unsigned long acc = 0;
	int left;

	for (left = num_words; left > 0; left--)
		acc += *word++;

	/* fold until the sum fits in 16 bits */
	while (acc > 0xFFFF)
		acc = (acc >> 16) + (acc & 0xFFFF);

	return ~acc;
}
120
121static int do_setcpu(int cpu)
122{
123 cpu_set_t mask;
124
125 CPU_ZERO(&mask);
126 CPU_SET(cpu, &mask);
127 if (sched_setaffinity(0, sizeof(mask), &mask))
128 fprintf(stderr, "cpu: unable to pin, may increase variance.\n");
129 else if (cfg_verbose)
130 fprintf(stderr, "cpu: %u\n", cpu);
131
132 return 0;
133}
134
/* Set an integer socket option; exit with an error message on failure. */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc == 0)
		return;

	error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
140
141static int do_poll(int fd, int events)
142{
143 struct pollfd pfd;
144 int ret;
145
146 pfd.events = events;
147 pfd.revents = 0;
148 pfd.fd = fd;
149
150 ret = poll(&pfd, 1, cfg_waittime_ms);
151 if (ret == -1)
152 error(1, errno, "poll");
153
154 return ret && (pfd.revents & events);
155}
156
/* Accept one connection on a listening socket, then close the listener.
 *
 * Returns the connected descriptor. Exits if accept() or close() fails.
 */
static int do_accept(int fd)
{
	int conn;

	conn = accept(fd, NULL, NULL);
	if (conn == -1)
		error(1, errno, "accept");

	if (close(fd) != 0)
		error(1, errno, "close listen sock");

	return conn;
}
169
/* Write an RDS zerocopy cookie control message into msg->msg_control.
 *
 * @msg:    message whose control buffer must already point at storage
 *          of at least CMSG_SPACE(sizeof(cookie)) bytes
 * @cookie: value copied into the control message payload
 *
 * Exits if no control buffer is attached.
 */
static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie)
{
	struct cmsghdr *cm;

	/* a caller error, not a failed syscall: do not report errno */
	if (!msg->msg_control)
		error(1, 0, "NULL cookie");

	cm = (void *)msg->msg_control;
	cm->cmsg_len = CMSG_LEN(sizeof(cookie));
	cm->cmsg_level = SOL_RDS;
	cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE;
	memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie));
}
182
183static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain)
184{
185 int ret, len, i, flags;
186 static uint32_t cookie;
187 char ckbuf[CMSG_SPACE(sizeof(cookie))];
188
189 len = 0;
190 for (i = 0; i < msg->msg_iovlen; i++)
191 len += msg->msg_iov[i].iov_len;
192
193 flags = MSG_DONTWAIT;
194 if (do_zerocopy) {
195 flags |= MSG_ZEROCOPY;
196 if (domain == PF_RDS) {
197 memset(&msg->msg_control, 0, sizeof(msg->msg_control));
198 msg->msg_controllen = CMSG_SPACE(sizeof(cookie));
199 msg->msg_control = (struct cmsghdr *)ckbuf;
200 add_zcopy_cookie(msg, ++cookie);
201 }
202 }
203
204 ret = sendmsg(fd, msg, flags);
205 if (ret == -1 && errno == EAGAIN)
206 return false;
207 if (ret == -1)
208 error(1, errno, "send");
209 if (cfg_verbose && ret != len)
210 fprintf(stderr, "send: ret=%u != %u\n", ret, len);
211
212 if (len) {
213 packets++;
214 bytes += ret;
215 if (do_zerocopy && ret)
216 expected_completions++;
217 }
218 if (do_zerocopy && domain == PF_RDS) {
219 msg->msg_control = NULL;
220 msg->msg_controllen = 0;
221 }
222
223 return true;
224}
225
226static void do_sendmsg_corked(int fd, struct msghdr *msg)
227{
228 bool do_zerocopy = cfg_zerocopy;
229 int i, payload_len, extra_len;
230
231 /* split up the packet. for non-multiple, make first buffer longer */
232 payload_len = cfg_payload_len / cfg_cork;
233 extra_len = cfg_payload_len - (cfg_cork * payload_len);
234
235 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
236
237 for (i = 0; i < cfg_cork; i++) {
238
239 /* in mixed-frags mode, alternate zerocopy and copy frags
240 * start with non-zerocopy, to ensure attach later works
241 */
242 if (cfg_cork_mixed)
243 do_zerocopy = (i & 1);
244
245 msg->msg_iov[0].iov_len = payload_len + extra_len;
246 extra_len = 0;
247
248 do_sendmsg(fd, msg, do_zerocopy,
249 (cfg_dst_addr.ss_family == AF_INET ?
250 PF_INET : PF_INET6));
251 }
252
253 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
254}
255
256static int setup_iph(struct iphdr *iph, uint16_t payload_len)
257{
258 struct sockaddr_in *daddr = (void *) &cfg_dst_addr;
259 struct sockaddr_in *saddr = (void *) &cfg_src_addr;
260
261 memset(iph, 0, sizeof(*iph));
262
263 iph->version = 4;
264 iph->tos = 0;
265 iph->ihl = 5;
266 iph->ttl = 2;
267 iph->saddr = saddr->sin_addr.s_addr;
268 iph->daddr = daddr->sin_addr.s_addr;
269 iph->protocol = IPPROTO_EGP;
270 iph->tot_len = htons(sizeof(*iph) + payload_len);
271 iph->check = get_ip_csum((void *) iph, iph->ihl << 1);
272
273 return sizeof(*iph);
274}
275
276static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len)
277{
278 struct sockaddr_in6 *daddr = (void *) &cfg_dst_addr;
279 struct sockaddr_in6 *saddr = (void *) &cfg_src_addr;
280
281 memset(ip6h, 0, sizeof(*ip6h));
282
283 ip6h->version = 6;
284 ip6h->payload_len = htons(payload_len);
285 ip6h->nexthdr = IPPROTO_EGP;
286 ip6h->hop_limit = 2;
287 ip6h->saddr = saddr->sin6_addr;
288 ip6h->daddr = daddr->sin6_addr;
289
290 return sizeof(*ip6h);
291}
292
293
294static void setup_sockaddr(int domain, const char *str_addr,
295 struct sockaddr_storage *sockaddr)
296{
297 struct sockaddr_in6 *addr6 = (void *) sockaddr;
298 struct sockaddr_in *addr4 = (void *) sockaddr;
299
300 switch (domain) {
301 case PF_INET:
302 memset(addr4, 0, sizeof(*addr4));
303 addr4->sin_family = AF_INET;
304 addr4->sin_port = htons(cfg_port);
305 if (str_addr &&
306 inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
307 error(1, 0, "ipv4 parse error: %s", str_addr);
308 break;
309 case PF_INET6:
310 memset(addr6, 0, sizeof(*addr6));
311 addr6->sin6_family = AF_INET6;
312 addr6->sin6_port = htons(cfg_port);
313 if (str_addr &&
314 inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
315 error(1, 0, "ipv6 parse error: %s", str_addr);
316 break;
317 default:
318 error(1, 0, "illegal domain");
319 }
320}
321
322static int do_setup_tx(int domain, int type, int protocol)
323{
324 int fd;
325
326 fd = socket(domain, type, protocol);
327 if (fd == -1)
328 error(1, errno, "socket t");
329
330 do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
331 if (cfg_zerocopy)
332 do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
333
334 if (domain != PF_PACKET && domain != PF_RDS)
335 if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
336 error(1, errno, "connect");
337
338 if (domain == PF_RDS) {
339 if (bind(fd, (void *) &cfg_src_addr, cfg_alen))
340 error(1, errno, "bind");
341 }
342
343 return fd;
344}
345
346static uint32_t do_process_zerocopy_cookies(struct rds_zcopy_cookies *ck)
347{
348 int i;
349
350 if (ck->num > RDS_MAX_ZCOOKIES)
351 error(1, 0, "Returned %d cookies, max expected %d\n",
352 ck->num, RDS_MAX_ZCOOKIES);
353 for (i = 0; i < ck->num; i++)
354 if (cfg_verbose >= 2)
355 fprintf(stderr, "%d\n", ck->cookies[i]);
356 return ck->num;
357}
358
359static bool do_recvmsg_completion(int fd)
360{
361 char cmsgbuf[CMSG_SPACE(sizeof(struct rds_zcopy_cookies))];
362 struct rds_zcopy_cookies *ck;
363 struct cmsghdr *cmsg;
364 struct msghdr msg;
365 bool ret = false;
366
367 memset(&msg, 0, sizeof(msg));
368 msg.msg_control = cmsgbuf;
369 msg.msg_controllen = sizeof(cmsgbuf);
370
371 if (recvmsg(fd, &msg, MSG_DONTWAIT))
372 return ret;
373
374 if (msg.msg_flags & MSG_CTRUNC)
375 error(1, errno, "recvmsg notification: truncated");
376
377 for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
378 if (cmsg->cmsg_level == SOL_RDS &&
379 cmsg->cmsg_type == RDS_CMSG_ZCOPY_COMPLETION) {
380
381 ck = (struct rds_zcopy_cookies *)CMSG_DATA(cmsg);
382 completions += do_process_zerocopy_cookies(ck);
383 ret = true;
384 break;
385 }
386 error(0, 0, "ignoring cmsg at level %d type %d\n",
387 cmsg->cmsg_level, cmsg->cmsg_type);
388 }
389 return ret;
390}
391
392static bool do_recv_completion(int fd, int domain)
393{
394 struct sock_extended_err *serr;
395 struct msghdr msg = {};
396 struct cmsghdr *cm;
397 uint32_t hi, lo, range;
398 int ret, zerocopy;
399 char control[100];
400
401 if (domain == PF_RDS)
402 return do_recvmsg_completion(fd);
403
404 msg.msg_control = control;
405 msg.msg_controllen = sizeof(control);
406
407 ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
408 if (ret == -1 && errno == EAGAIN)
409 return false;
410 if (ret == -1)
411 error(1, errno, "recvmsg notification");
412 if (msg.msg_flags & MSG_CTRUNC)
413 error(1, errno, "recvmsg notification: truncated");
414
415 cm = CMSG_FIRSTHDR(&msg);
416 if (!cm)
417 error(1, 0, "cmsg: no cmsg");
418 if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
419 (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR) ||
420 (cm->cmsg_level == SOL_PACKET && cm->cmsg_type == PACKET_TX_TIMESTAMP)))
421 error(1, 0, "serr: wrong type: %d.%d",
422 cm->cmsg_level, cm->cmsg_type);
423
424 serr = (void *) CMSG_DATA(cm);
425
426 if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
427 error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
428 if (serr->ee_errno != 0)
429 error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
430
431 hi = serr->ee_data;
432 lo = serr->ee_info;
433 range = hi - lo + 1;
434
435 /* Detect notification gaps. These should not happen often, if at all.
436 * Gaps can occur due to drops, reordering and retransmissions.
437 */
438 if (lo != next_completion)
439 fprintf(stderr, "gap: %u..%u does not append to %u\n",
440 lo, hi, next_completion);
441 next_completion = hi + 1;
442
443 zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED);
444 if (zerocopied == -1)
445 zerocopied = zerocopy;
446 else if (zerocopied != zerocopy) {
447 fprintf(stderr, "serr: inconsistent\n");
448 zerocopied = zerocopy;
449 }
450
451 if (cfg_verbose >= 2)
452 fprintf(stderr, "completed: %u (h=%u l=%u)\n",
453 range, hi, lo);
454
455 completions += range;
456 return true;
457}
458
/* Drain the error queue: keep reading until no notification is left */
static void do_recv_completions(int fd, int domain)
{
	for (;;) {
		if (!do_recv_completion(fd, domain))
			break;
	}
}
464
465/* Wait for all remaining completions on the errqueue */
466static void do_recv_remaining_completions(int fd, int domain)
467{
468 int64_t tstop = gettimeofday_ms() + cfg_waittime_ms;
469
470 while (completions < expected_completions &&
471 gettimeofday_ms() < tstop) {
472 if (do_poll(fd, domain == PF_RDS ? POLLIN : POLLERR))
473 do_recv_completions(fd, domain);
474 }
475
476 if (completions < expected_completions)
477 fprintf(stderr, "missing notifications: %lu < %lu\n",
478 completions, expected_completions);
479}
480
481static void do_tx(int domain, int type, int protocol)
482{
483 struct iovec iov[3] = { {0} };
484 struct sockaddr_ll laddr;
485 struct msghdr msg = {0};
486 struct ethhdr eth;
487 union {
488 struct ipv6hdr ip6h;
489 struct iphdr iph;
490 } nh;
491 uint64_t tstop;
492 int fd;
493
494 fd = do_setup_tx(domain, type, protocol);
495
496 if (domain == PF_PACKET) {
497 uint16_t proto = cfg_family == PF_INET ? ETH_P_IP : ETH_P_IPV6;
498
499 /* sock_raw passes ll header as data */
500 if (type == SOCK_RAW) {
501 memset(eth.h_dest, 0x06, ETH_ALEN);
502 memset(eth.h_source, 0x02, ETH_ALEN);
503 eth.h_proto = htons(proto);
504 iov[0].iov_base = ð
505 iov[0].iov_len = sizeof(eth);
506 msg.msg_iovlen++;
507 }
508
509 /* both sock_raw and sock_dgram expect name */
510 memset(&laddr, 0, sizeof(laddr));
511 laddr.sll_family = AF_PACKET;
512 laddr.sll_ifindex = cfg_ifindex;
513 laddr.sll_protocol = htons(proto);
514 laddr.sll_halen = ETH_ALEN;
515
516 memset(laddr.sll_addr, 0x06, ETH_ALEN);
517
518 msg.msg_name = &laddr;
519 msg.msg_namelen = sizeof(laddr);
520 }
521
522 /* packet and raw sockets with hdrincl must pass network header */
523 if (domain == PF_PACKET || protocol == IPPROTO_RAW) {
524 if (cfg_family == PF_INET)
525 iov[1].iov_len = setup_iph(&nh.iph, cfg_payload_len);
526 else
527 iov[1].iov_len = setup_ip6h(&nh.ip6h, cfg_payload_len);
528
529 iov[1].iov_base = (void *) &nh;
530 msg.msg_iovlen++;
531 }
532
533 if (domain == PF_RDS) {
534 msg.msg_name = &cfg_dst_addr;
535 msg.msg_namelen = (cfg_dst_addr.ss_family == AF_INET ?
536 sizeof(struct sockaddr_in) :
537 sizeof(struct sockaddr_in6));
538 }
539
540 iov[2].iov_base = payload;
541 iov[2].iov_len = cfg_payload_len;
542 msg.msg_iovlen++;
543 msg.msg_iov = &iov[3 - msg.msg_iovlen];
544
545 tstop = gettimeofday_ms() + cfg_runtime_ms;
546 do {
547 if (cfg_cork)
548 do_sendmsg_corked(fd, &msg);
549 else
550 do_sendmsg(fd, &msg, cfg_zerocopy, domain);
551
552 while (!do_poll(fd, POLLOUT)) {
553 if (cfg_zerocopy)
554 do_recv_completions(fd, domain);
555 }
556
557 } while (gettimeofday_ms() < tstop);
558
559 if (cfg_zerocopy)
560 do_recv_remaining_completions(fd, domain);
561
562 if (close(fd))
563 error(1, errno, "close");
564
565 fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n",
566 packets, bytes >> 20, completions,
567 zerocopied == 1 ? 'y' : 'n');
568}
569
570static int do_setup_rx(int domain, int type, int protocol)
571{
572 int fd;
573
574 /* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW,
575 * to recv the only copy of the packet, not a clone
576 */
577 if (domain == PF_PACKET)
578 error(1, 0, "Use PF_INET/SOCK_RAW to read");
579
580 if (type == SOCK_RAW && protocol == IPPROTO_RAW)
581 error(1, 0, "IPPROTO_RAW: not supported on Rx");
582
583 fd = socket(domain, type, protocol);
584 if (fd == -1)
585 error(1, errno, "socket r");
586
587 do_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 1 << 21);
588 do_setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, 1 << 16);
589 do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
590
591 if (bind(fd, (void *) &cfg_dst_addr, cfg_alen))
592 error(1, errno, "bind");
593
594 if (type == SOCK_STREAM) {
595 if (listen(fd, 1))
596 error(1, errno, "listen");
597 fd = do_accept(fd);
598 }
599
600 return fd;
601}
602
603/* Flush all outstanding bytes for the tcp receive queue */
604static void do_flush_tcp(int fd)
605{
606 int ret;
607
608 /* MSG_TRUNC flushes up to len bytes */
609 ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
610 if (ret == -1 && errno == EAGAIN)
611 return;
612 if (ret == -1)
613 error(1, errno, "flush");
614 if (!ret)
615 return;
616
617 packets++;
618 bytes += ret;
619}
620
621/* Flush all outstanding datagrams. Verify first few bytes of each. */
622static void do_flush_datagram(int fd, int type)
623{
624 int ret, off = 0;
625 char buf[64];
626
627 /* MSG_TRUNC will return full datagram length */
628 ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC);
629 if (ret == -1 && errno == EAGAIN)
630 return;
631
632 /* raw ipv4 return with header, raw ipv6 without */
633 if (cfg_family == PF_INET && type == SOCK_RAW) {
634 off += sizeof(struct iphdr);
635 ret -= sizeof(struct iphdr);
636 }
637
638 if (ret == -1)
639 error(1, errno, "recv");
640 if (ret != cfg_payload_len)
641 error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
642 if (ret > sizeof(buf) - off)
643 ret = sizeof(buf) - off;
644 if (memcmp(buf + off, payload, ret))
645 error(1, 0, "recv: data mismatch");
646
647 packets++;
648 bytes += cfg_payload_len;
649}
650
651static void do_rx(int domain, int type, int protocol)
652{
653 const int cfg_receiver_wait_ms = 400;
654 uint64_t tstop;
655 int fd;
656
657 fd = do_setup_rx(domain, type, protocol);
658
659 tstop = gettimeofday_ms() + cfg_runtime_ms + cfg_receiver_wait_ms;
660 do {
661 if (type == SOCK_STREAM)
662 do_flush_tcp(fd);
663 else
664 do_flush_datagram(fd, type);
665
666 do_poll(fd, POLLIN);
667
668 } while (gettimeofday_ms() < tstop);
669
670 if (close(fd))
671 error(1, errno, "close");
672
673 fprintf(stderr, "rx=%lu (%lu MB)\n", packets, bytes >> 20);
674}
675
676static void do_test(int domain, int type, int protocol)
677{
678 int i;
679
680 if (cfg_cork && (domain == PF_PACKET || type != SOCK_DGRAM))
681 error(1, 0, "can only cork udp sockets");
682
683 do_setcpu(cfg_cpu);
684
685 for (i = 0; i < IP_MAXPACKET; i++)
686 payload[i] = 'a' + (i % 26);
687
688 if (cfg_rx)
689 do_rx(domain, type, protocol);
690 else
691 do_tx(domain, type, protocol);
692}
693
694static void usage(const char *filepath)
695{
696 error(1, 0, "Usage: %s [options] <test>", filepath);
697}
698
699static void parse_opts(int argc, char **argv)
700{
701 const int max_payload_len = sizeof(payload) -
702 sizeof(struct ipv6hdr) -
703 sizeof(struct tcphdr) -
704 40 /* max tcp options */;
705 int c;
706 char *daddr = NULL, *saddr = NULL;
707 char *cfg_test;
708
709 cfg_payload_len = max_payload_len;
710
711 while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
712 switch (c) {
713 case '4':
714 if (cfg_family != PF_UNSPEC)
715 error(1, 0, "Pass one of -4 or -6");
716 cfg_family = PF_INET;
717 cfg_alen = sizeof(struct sockaddr_in);
718 break;
719 case '6':
720 if (cfg_family != PF_UNSPEC)
721 error(1, 0, "Pass one of -4 or -6");
722 cfg_family = PF_INET6;
723 cfg_alen = sizeof(struct sockaddr_in6);
724 break;
725 case 'c':
726 cfg_cork = strtol(optarg, NULL, 0);
727 break;
728 case 'C':
729 cfg_cpu = strtol(optarg, NULL, 0);
730 break;
731 case 'D':
732 daddr = optarg;
733 break;
734 case 'i':
735 cfg_ifindex = if_nametoindex(optarg);
736 if (cfg_ifindex == 0)
737 error(1, errno, "invalid iface: %s", optarg);
738 break;
739 case 'm':
740 cfg_cork_mixed = true;
741 break;
742 case 'p':
743 cfg_port = strtoul(optarg, NULL, 0);
744 break;
745 case 'r':
746 cfg_rx = true;
747 break;
748 case 's':
749 cfg_payload_len = strtoul(optarg, NULL, 0);
750 break;
751 case 'S':
752 saddr = optarg;
753 break;
754 case 't':
755 cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
756 break;
757 case 'v':
758 cfg_verbose++;
759 break;
760 case 'z':
761 cfg_zerocopy = true;
762 break;
763 }
764 }
765
766 cfg_test = argv[argc - 1];
767 if (strcmp(cfg_test, "rds") == 0) {
768 if (!daddr)
769 error(1, 0, "-D <server addr> required for PF_RDS\n");
770 if (!cfg_rx && !saddr)
771 error(1, 0, "-S <client addr> required for PF_RDS\n");
772 }
773 setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
774 setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
775
776 if (cfg_payload_len > max_payload_len)
777 error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
778 if (cfg_cork_mixed && (!cfg_zerocopy || !cfg_cork))
779 error(1, 0, "-m: cork_mixed requires corking and zerocopy");
780
781 if (optind != argc - 1)
782 usage(argv[0]);
783}
784
785int main(int argc, char **argv)
786{
787 const char *cfg_test;
788
789 parse_opts(argc, argv);
790
791 cfg_test = argv[argc - 1];
792
793 if (!strcmp(cfg_test, "packet"))
794 do_test(PF_PACKET, SOCK_RAW, 0);
795 else if (!strcmp(cfg_test, "packet_dgram"))
796 do_test(PF_PACKET, SOCK_DGRAM, 0);
797 else if (!strcmp(cfg_test, "raw"))
798 do_test(cfg_family, SOCK_RAW, IPPROTO_EGP);
799 else if (!strcmp(cfg_test, "raw_hdrincl"))
800 do_test(cfg_family, SOCK_RAW, IPPROTO_RAW);
801 else if (!strcmp(cfg_test, "tcp"))
802 do_test(cfg_family, SOCK_STREAM, 0);
803 else if (!strcmp(cfg_test, "udp"))
804 do_test(cfg_family, SOCK_DGRAM, 0);
805 else if (!strcmp(cfg_test, "rds"))
806 do_test(PF_RDS, SOCK_SEQPACKET, 0);
807 else
808 error(1, 0, "unknown cfg_test %s", cfg_test);
809
810 return 0;
811}
/* Evaluate MSG_ZEROCOPY
 *
 * Send traffic between two processes over one of the supported
 * protocols and modes:
 *
 * PF_INET/PF_INET6
 * - SOCK_STREAM
 * - SOCK_DGRAM
 * - SOCK_DGRAM with UDP_CORK
 * - SOCK_RAW
 * - SOCK_RAW with IP_HDRINCL
 *
 * PF_PACKET
 * - SOCK_DGRAM
 * - SOCK_RAW
 *
 * PF_RDS
 * - SOCK_SEQPACKET
 *
 * Start this program on two connected hosts, one in send mode and
 * the other with option '-r' to put it in receiver mode.
 *
 * If zerocopy mode ('-z') is enabled, the sender will verify that
 * the kernel queues completions on the error queue for all zerocopy
 * transfers.
 */
27
28#define _GNU_SOURCE
29
30#include <arpa/inet.h>
31#include <error.h>
32#include <errno.h>
33#include <limits.h>
34#include <linux/errqueue.h>
35#include <linux/if_packet.h>
36#include <linux/ipv6.h>
37#include <linux/socket.h>
38#include <linux/sockios.h>
39#include <net/ethernet.h>
40#include <net/if.h>
41#include <netinet/ip.h>
42#include <netinet/ip6.h>
43#include <netinet/tcp.h>
44#include <netinet/udp.h>
45#include <poll.h>
46#include <sched.h>
47#include <stdbool.h>
48#include <stdio.h>
49#include <stdint.h>
50#include <stdlib.h>
51#include <string.h>
52#include <sys/ioctl.h>
53#include <sys/socket.h>
54#include <sys/stat.h>
55#include <sys/time.h>
56#include <sys/types.h>
57#include <sys/wait.h>
58#include <unistd.h>
59#include <linux/rds.h>
60
/* Fallback values for zerocopy constants that older system headers lack. */
#if !defined(SO_EE_ORIGIN_ZEROCOPY)
#define SO_EE_ORIGIN_ZEROCOPY 5
#endif

#if !defined(SO_ZEROCOPY)
#define SO_ZEROCOPY 60
#endif

#if !defined(SO_EE_CODE_ZEROCOPY_COPIED)
#define SO_EE_CODE_ZEROCOPY_COPIED 1
#endif

#if !defined(MSG_ZEROCOPY)
#define MSG_ZEROCOPY 0x4000000
#endif
76
/* Run configuration; cfg_cork through cfg_waittime_ms and cfg_zerocopy
 * are read by the functions below.
 */
static int cfg_cork;			/* sends per corked datagram */
static bool cfg_cork_mixed;		/* alternate copy and zerocopy sends */
static int cfg_cpu = -1;
static int cfg_family = PF_UNSPEC;
static int cfg_ifindex = 1;
static int cfg_payload_len;		/* payload bytes per send */
static int cfg_port = 8000;
static bool cfg_rx;
static int cfg_runtime_ms = 4200;
static int cfg_verbose;
static int cfg_waittime_ms = 500;	/* poll timeout, completion wait */
/* NOTE(review): not referenced in the visible part of this file;
 * presumably a bound on sends between notification reads - confirm.
 */
static int cfg_notification_limit = 32;
static bool cfg_zerocopy;		/* send with MSG_ZEROCOPY */

/* Addresses built by setup_sockaddr(); cfg_alen is their length. */
static socklen_t cfg_alen;
static struct sockaddr_storage cfg_dst_addr;
static struct sockaddr_storage cfg_src_addr;

/* Data buffer handed to the kernel by the sender. */
static char payload[IP_MAXPACKET];

/* Counters updated by the send and completion paths. */
static long packets;
static long bytes;
static long completions;
static long expected_completions;

/* -1 until the first notification, then 1 if zerocopy, 0 if copied. */
static int zerocopied = -1;

/* Low end of the notification range expected next. */
static uint32_t next_completion;

/* Successful sends since do_recv_completions() last ran. */
static uint32_t sends_since_notify;
100
/* Return the current gettimeofday() time expressed in milliseconds. */
static unsigned long gettimeofday_ms(void)
{
	struct timeval now;
	unsigned long ms;

	gettimeofday(&now, NULL);

	ms = now.tv_sec * 1000;
	ms += now.tv_usec / 1000;

	return ms;
}
108
/* Compute the ones' complement checksum over num_words 16-bit words.
 *
 * The words are summed, carries above bit 15 are folded back into the
 * low 16 bits, and the inverted result is returned.
 */
static uint16_t get_ip_csum(const uint16_t *start, int num_words)
{
	const uint16_t *word = start;
	unsigned long acc = 0;
	int left;

	for (left = num_words; left > 0; left--)
		acc += *word++;

	/* fold until the sum fits in 16 bits */
	while (acc > 0xFFFF)
		acc = (acc >> 16) + (acc & 0xFFFF);

	return ~acc;
}
122
123static int do_setcpu(int cpu)
124{
125 cpu_set_t mask;
126
127 CPU_ZERO(&mask);
128 CPU_SET(cpu, &mask);
129 if (sched_setaffinity(0, sizeof(mask), &mask))
130 fprintf(stderr, "cpu: unable to pin, may increase variance.\n");
131 else if (cfg_verbose)
132 fprintf(stderr, "cpu: %u\n", cpu);
133
134 return 0;
135}
136
/* Set an integer socket option; exit with an error message on failure. */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc == 0)
		return;

	error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
142
143static int do_poll(int fd, int events)
144{
145 struct pollfd pfd;
146 int ret;
147
148 pfd.events = events;
149 pfd.revents = 0;
150 pfd.fd = fd;
151
152 ret = poll(&pfd, 1, cfg_waittime_ms);
153 if (ret == -1)
154 error(1, errno, "poll");
155
156 return ret && (pfd.revents & events);
157}
158
/* Accept one connection on a listening socket, then close the listener.
 *
 * Returns the connected descriptor. Exits if accept() or close() fails.
 */
static int do_accept(int fd)
{
	int conn;

	conn = accept(fd, NULL, NULL);
	if (conn == -1)
		error(1, errno, "accept");

	if (close(fd) != 0)
		error(1, errno, "close listen sock");

	return conn;
}
171
/* Write an RDS zerocopy cookie control message into msg->msg_control.
 *
 * @msg:    message whose control buffer must already point at storage
 *          of at least CMSG_SPACE(sizeof(cookie)) bytes
 * @cookie: value copied into the control message payload
 *
 * Exits if no control buffer is attached.
 */
static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie)
{
	struct cmsghdr *cm;

	/* a caller error, not a failed syscall: do not report errno */
	if (!msg->msg_control)
		error(1, 0, "NULL cookie");

	cm = (void *)msg->msg_control;
	cm->cmsg_len = CMSG_LEN(sizeof(cookie));
	cm->cmsg_level = SOL_RDS;
	cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE;
	memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie));
}
184
185static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain)
186{
187 int ret, len, i, flags;
188 static uint32_t cookie;
189 char ckbuf[CMSG_SPACE(sizeof(cookie))];
190
191 len = 0;
192 for (i = 0; i < msg->msg_iovlen; i++)
193 len += msg->msg_iov[i].iov_len;
194
195 flags = MSG_DONTWAIT;
196 if (do_zerocopy) {
197 flags |= MSG_ZEROCOPY;
198 if (domain == PF_RDS) {
199 memset(&msg->msg_control, 0, sizeof(msg->msg_control));
200 msg->msg_controllen = CMSG_SPACE(sizeof(cookie));
201 msg->msg_control = (struct cmsghdr *)ckbuf;
202 add_zcopy_cookie(msg, ++cookie);
203 }
204 }
205
206 ret = sendmsg(fd, msg, flags);
207 if (ret == -1 && errno == EAGAIN)
208 return false;
209 if (ret == -1)
210 error(1, errno, "send");
211 if (cfg_verbose && ret != len)
212 fprintf(stderr, "send: ret=%u != %u\n", ret, len);
213 sends_since_notify++;
214
215 if (len) {
216 packets++;
217 bytes += ret;
218 if (do_zerocopy && ret)
219 expected_completions++;
220 }
221 if (do_zerocopy && domain == PF_RDS) {
222 msg->msg_control = NULL;
223 msg->msg_controllen = 0;
224 }
225
226 return true;
227}
228
229static void do_sendmsg_corked(int fd, struct msghdr *msg)
230{
231 bool do_zerocopy = cfg_zerocopy;
232 int i, payload_len, extra_len;
233
234 /* split up the packet. for non-multiple, make first buffer longer */
235 payload_len = cfg_payload_len / cfg_cork;
236 extra_len = cfg_payload_len - (cfg_cork * payload_len);
237
238 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
239
240 for (i = 0; i < cfg_cork; i++) {
241
242 /* in mixed-frags mode, alternate zerocopy and copy frags
243 * start with non-zerocopy, to ensure attach later works
244 */
245 if (cfg_cork_mixed)
246 do_zerocopy = (i & 1);
247
248 msg->msg_iov[0].iov_len = payload_len + extra_len;
249 extra_len = 0;
250
251 do_sendmsg(fd, msg, do_zerocopy,
252 (cfg_dst_addr.ss_family == AF_INET ?
253 PF_INET : PF_INET6));
254 }
255
256 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
257}
258
259static int setup_iph(struct iphdr *iph, uint16_t payload_len)
260{
261 struct sockaddr_in *daddr = (void *) &cfg_dst_addr;
262 struct sockaddr_in *saddr = (void *) &cfg_src_addr;
263
264 memset(iph, 0, sizeof(*iph));
265
266 iph->version = 4;
267 iph->tos = 0;
268 iph->ihl = 5;
269 iph->ttl = 2;
270 iph->saddr = saddr->sin_addr.s_addr;
271 iph->daddr = daddr->sin_addr.s_addr;
272 iph->protocol = IPPROTO_EGP;
273 iph->tot_len = htons(sizeof(*iph) + payload_len);
274 iph->check = get_ip_csum((void *) iph, iph->ihl << 1);
275
276 return sizeof(*iph);
277}
278
279static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len)
280{
281 struct sockaddr_in6 *daddr = (void *) &cfg_dst_addr;
282 struct sockaddr_in6 *saddr = (void *) &cfg_src_addr;
283
284 memset(ip6h, 0, sizeof(*ip6h));
285
286 ip6h->version = 6;
287 ip6h->payload_len = htons(payload_len);
288 ip6h->nexthdr = IPPROTO_EGP;
289 ip6h->hop_limit = 2;
290 ip6h->saddr = saddr->sin6_addr;
291 ip6h->daddr = daddr->sin6_addr;
292
293 return sizeof(*ip6h);
294}
295
296
297static void setup_sockaddr(int domain, const char *str_addr,
298 struct sockaddr_storage *sockaddr)
299{
300 struct sockaddr_in6 *addr6 = (void *) sockaddr;
301 struct sockaddr_in *addr4 = (void *) sockaddr;
302
303 switch (domain) {
304 case PF_INET:
305 memset(addr4, 0, sizeof(*addr4));
306 addr4->sin_family = AF_INET;
307 addr4->sin_port = htons(cfg_port);
308 if (str_addr &&
309 inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
310 error(1, 0, "ipv4 parse error: %s", str_addr);
311 break;
312 case PF_INET6:
313 memset(addr6, 0, sizeof(*addr6));
314 addr6->sin6_family = AF_INET6;
315 addr6->sin6_port = htons(cfg_port);
316 if (str_addr &&
317 inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
318 error(1, 0, "ipv6 parse error: %s", str_addr);
319 break;
320 default:
321 error(1, 0, "illegal domain");
322 }
323}
324
325static int do_setup_tx(int domain, int type, int protocol)
326{
327 int fd;
328
329 fd = socket(domain, type, protocol);
330 if (fd == -1)
331 error(1, errno, "socket t");
332
333 do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
334 if (cfg_zerocopy)
335 do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
336
337 if (domain != PF_PACKET && domain != PF_RDS)
338 if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
339 error(1, errno, "connect");
340
341 if (domain == PF_RDS) {
342 if (bind(fd, (void *) &cfg_src_addr, cfg_alen))
343 error(1, errno, "bind");
344 }
345
346 return fd;
347}
348
349static uint32_t do_process_zerocopy_cookies(struct rds_zcopy_cookies *ck)
350{
351 int i;
352
353 if (ck->num > RDS_MAX_ZCOOKIES)
354 error(1, 0, "Returned %d cookies, max expected %d\n",
355 ck->num, RDS_MAX_ZCOOKIES);
356 for (i = 0; i < ck->num; i++)
357 if (cfg_verbose >= 2)
358 fprintf(stderr, "%d\n", ck->cookies[i]);
359 return ck->num;
360}
361
362static bool do_recvmsg_completion(int fd)
363{
364 char cmsgbuf[CMSG_SPACE(sizeof(struct rds_zcopy_cookies))];
365 struct rds_zcopy_cookies *ck;
366 struct cmsghdr *cmsg;
367 struct msghdr msg;
368 bool ret = false;
369
370 memset(&msg, 0, sizeof(msg));
371 msg.msg_control = cmsgbuf;
372 msg.msg_controllen = sizeof(cmsgbuf);
373
374 if (recvmsg(fd, &msg, MSG_DONTWAIT))
375 return ret;
376
377 if (msg.msg_flags & MSG_CTRUNC)
378 error(1, errno, "recvmsg notification: truncated");
379
380 for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
381 if (cmsg->cmsg_level == SOL_RDS &&
382 cmsg->cmsg_type == RDS_CMSG_ZCOPY_COMPLETION) {
383
384 ck = (struct rds_zcopy_cookies *)CMSG_DATA(cmsg);
385 completions += do_process_zerocopy_cookies(ck);
386 ret = true;
387 break;
388 }
389 error(0, 0, "ignoring cmsg at level %d type %d\n",
390 cmsg->cmsg_level, cmsg->cmsg_type);
391 }
392 return ret;
393}
394
395static bool do_recv_completion(int fd, int domain)
396{
397 struct sock_extended_err *serr;
398 struct msghdr msg = {};
399 struct cmsghdr *cm;
400 uint32_t hi, lo, range;
401 int ret, zerocopy;
402 char control[100];
403
404 if (domain == PF_RDS)
405 return do_recvmsg_completion(fd);
406
407 msg.msg_control = control;
408 msg.msg_controllen = sizeof(control);
409
410 ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
411 if (ret == -1 && errno == EAGAIN)
412 return false;
413 if (ret == -1)
414 error(1, errno, "recvmsg notification");
415 if (msg.msg_flags & MSG_CTRUNC)
416 error(1, errno, "recvmsg notification: truncated");
417
418 cm = CMSG_FIRSTHDR(&msg);
419 if (!cm)
420 error(1, 0, "cmsg: no cmsg");
421 if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
422 (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR) ||
423 (cm->cmsg_level == SOL_PACKET && cm->cmsg_type == PACKET_TX_TIMESTAMP)))
424 error(1, 0, "serr: wrong type: %d.%d",
425 cm->cmsg_level, cm->cmsg_type);
426
427 serr = (void *) CMSG_DATA(cm);
428
429 if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
430 error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
431 if (serr->ee_errno != 0)
432 error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
433
434 hi = serr->ee_data;
435 lo = serr->ee_info;
436 range = hi - lo + 1;
437
438 /* Detect notification gaps. These should not happen often, if at all.
439 * Gaps can occur due to drops, reordering and retransmissions.
440 */
441 if (cfg_verbose && lo != next_completion)
442 fprintf(stderr, "gap: %u..%u does not append to %u\n",
443 lo, hi, next_completion);
444 next_completion = hi + 1;
445
446 zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED);
447 if (zerocopied == -1)
448 zerocopied = zerocopy;
449 else if (zerocopied != zerocopy) {
450 fprintf(stderr, "serr: inconsistent\n");
451 zerocopied = zerocopy;
452 }
453
454 if (cfg_verbose >= 2)
455 fprintf(stderr, "completed: %u (h=%u l=%u)\n",
456 range, hi, lo);
457
458 completions += range;
459 return true;
460}
461
462/* Read all outstanding messages on the errqueue */
463static void do_recv_completions(int fd, int domain)
464{
465 while (do_recv_completion(fd, domain)) {}
466 sends_since_notify = 0;
467}
468
469/* Wait for all remaining completions on the errqueue */
470static void do_recv_remaining_completions(int fd, int domain)
471{
472 int64_t tstop = gettimeofday_ms() + cfg_waittime_ms;
473
474 while (completions < expected_completions &&
475 gettimeofday_ms() < tstop) {
476 if (do_poll(fd, domain == PF_RDS ? POLLIN : POLLERR))
477 do_recv_completions(fd, domain);
478 }
479
480 if (completions < expected_completions)
481 fprintf(stderr, "missing notifications: %lu < %lu\n",
482 completions, expected_completions);
483}
484
485static void do_tx(int domain, int type, int protocol)
486{
487 struct iovec iov[3] = { {0} };
488 struct sockaddr_ll laddr;
489 struct msghdr msg = {0};
490 struct ethhdr eth;
491 union {
492 struct ipv6hdr ip6h;
493 struct iphdr iph;
494 } nh;
495 uint64_t tstop;
496 int fd;
497
498 fd = do_setup_tx(domain, type, protocol);
499
500 if (domain == PF_PACKET) {
501 uint16_t proto = cfg_family == PF_INET ? ETH_P_IP : ETH_P_IPV6;
502
503 /* sock_raw passes ll header as data */
504 if (type == SOCK_RAW) {
505 memset(eth.h_dest, 0x06, ETH_ALEN);
506 memset(eth.h_source, 0x02, ETH_ALEN);
507 eth.h_proto = htons(proto);
508 iov[0].iov_base = ð
509 iov[0].iov_len = sizeof(eth);
510 msg.msg_iovlen++;
511 }
512
513 /* both sock_raw and sock_dgram expect name */
514 memset(&laddr, 0, sizeof(laddr));
515 laddr.sll_family = AF_PACKET;
516 laddr.sll_ifindex = cfg_ifindex;
517 laddr.sll_protocol = htons(proto);
518 laddr.sll_halen = ETH_ALEN;
519
520 memset(laddr.sll_addr, 0x06, ETH_ALEN);
521
522 msg.msg_name = &laddr;
523 msg.msg_namelen = sizeof(laddr);
524 }
525
526 /* packet and raw sockets with hdrincl must pass network header */
527 if (domain == PF_PACKET || protocol == IPPROTO_RAW) {
528 if (cfg_family == PF_INET)
529 iov[1].iov_len = setup_iph(&nh.iph, cfg_payload_len);
530 else
531 iov[1].iov_len = setup_ip6h(&nh.ip6h, cfg_payload_len);
532
533 iov[1].iov_base = (void *) &nh;
534 msg.msg_iovlen++;
535 }
536
537 if (domain == PF_RDS) {
538 msg.msg_name = &cfg_dst_addr;
539 msg.msg_namelen = (cfg_dst_addr.ss_family == AF_INET ?
540 sizeof(struct sockaddr_in) :
541 sizeof(struct sockaddr_in6));
542 }
543
544 iov[2].iov_base = payload;
545 iov[2].iov_len = cfg_payload_len;
546 msg.msg_iovlen++;
547 msg.msg_iov = &iov[3 - msg.msg_iovlen];
548
549 tstop = gettimeofday_ms() + cfg_runtime_ms;
550 do {
551 if (cfg_cork)
552 do_sendmsg_corked(fd, &msg);
553 else
554 do_sendmsg(fd, &msg, cfg_zerocopy, domain);
555
556 if (cfg_zerocopy && sends_since_notify >= cfg_notification_limit)
557 do_recv_completions(fd, domain);
558
559 while (!do_poll(fd, POLLOUT)) {
560 if (cfg_zerocopy)
561 do_recv_completions(fd, domain);
562 }
563
564 } while (gettimeofday_ms() < tstop);
565
566 if (cfg_zerocopy)
567 do_recv_remaining_completions(fd, domain);
568
569 if (close(fd))
570 error(1, errno, "close");
571
572 fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n",
573 packets, bytes >> 20, completions,
574 zerocopied == 1 ? 'y' : 'n');
575}
576
577static int do_setup_rx(int domain, int type, int protocol)
578{
579 int fd;
580
581 /* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW,
582 * to recv the only copy of the packet, not a clone
583 */
584 if (domain == PF_PACKET)
585 error(1, 0, "Use PF_INET/SOCK_RAW to read");
586
587 if (type == SOCK_RAW && protocol == IPPROTO_RAW)
588 error(1, 0, "IPPROTO_RAW: not supported on Rx");
589
590 fd = socket(domain, type, protocol);
591 if (fd == -1)
592 error(1, errno, "socket r");
593
594 do_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 1 << 21);
595 do_setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, 1 << 16);
596 do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
597
598 if (bind(fd, (void *) &cfg_dst_addr, cfg_alen))
599 error(1, errno, "bind");
600
601 if (type == SOCK_STREAM) {
602 if (listen(fd, 1))
603 error(1, errno, "listen");
604 fd = do_accept(fd);
605 }
606
607 return fd;
608}
609
610/* Flush all outstanding bytes for the tcp receive queue */
611static void do_flush_tcp(int fd)
612{
613 int ret;
614
615 /* MSG_TRUNC flushes up to len bytes */
616 ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
617 if (ret == -1 && errno == EAGAIN)
618 return;
619 if (ret == -1)
620 error(1, errno, "flush");
621 if (!ret)
622 return;
623
624 packets++;
625 bytes += ret;
626}
627
628/* Flush all outstanding datagrams. Verify first few bytes of each. */
629static void do_flush_datagram(int fd, int type)
630{
631 int ret, off = 0;
632 char buf[64];
633
634 /* MSG_TRUNC will return full datagram length */
635 ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC);
636 if (ret == -1 && errno == EAGAIN)
637 return;
638
639 /* raw ipv4 return with header, raw ipv6 without */
640 if (cfg_family == PF_INET && type == SOCK_RAW) {
641 off += sizeof(struct iphdr);
642 ret -= sizeof(struct iphdr);
643 }
644
645 if (ret == -1)
646 error(1, errno, "recv");
647 if (ret != cfg_payload_len)
648 error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
649 if (ret > sizeof(buf) - off)
650 ret = sizeof(buf) - off;
651 if (memcmp(buf + off, payload, ret))
652 error(1, 0, "recv: data mismatch");
653
654 packets++;
655 bytes += cfg_payload_len;
656}
657
658static void do_rx(int domain, int type, int protocol)
659{
660 const int cfg_receiver_wait_ms = 400;
661 uint64_t tstop;
662 int fd;
663
664 fd = do_setup_rx(domain, type, protocol);
665
666 tstop = gettimeofday_ms() + cfg_runtime_ms + cfg_receiver_wait_ms;
667 do {
668 if (type == SOCK_STREAM)
669 do_flush_tcp(fd);
670 else
671 do_flush_datagram(fd, type);
672
673 do_poll(fd, POLLIN);
674
675 } while (gettimeofday_ms() < tstop);
676
677 if (close(fd))
678 error(1, errno, "close");
679
680 fprintf(stderr, "rx=%lu (%lu MB)\n", packets, bytes >> 20);
681}
682
683static void do_test(int domain, int type, int protocol)
684{
685 int i;
686
687 if (cfg_cork && (domain == PF_PACKET || type != SOCK_DGRAM))
688 error(1, 0, "can only cork udp sockets");
689
690 do_setcpu(cfg_cpu);
691
692 for (i = 0; i < IP_MAXPACKET; i++)
693 payload[i] = 'a' + (i % 26);
694
695 if (cfg_rx)
696 do_rx(domain, type, protocol);
697 else
698 do_tx(domain, type, protocol);
699}
700
701static void usage(const char *filepath)
702{
703 error(1, 0, "Usage: %s [options] <test>", filepath);
704}
705
706static void parse_opts(int argc, char **argv)
707{
708 const int max_payload_len = sizeof(payload) -
709 sizeof(struct ipv6hdr) -
710 sizeof(struct tcphdr) -
711 40 /* max tcp options */;
712 int c;
713 char *daddr = NULL, *saddr = NULL;
714 char *cfg_test;
715
716 cfg_payload_len = max_payload_len;
717
718 while ((c = getopt(argc, argv, "46c:C:D:i:l:mp:rs:S:t:vz")) != -1) {
719 switch (c) {
720 case '4':
721 if (cfg_family != PF_UNSPEC)
722 error(1, 0, "Pass one of -4 or -6");
723 cfg_family = PF_INET;
724 cfg_alen = sizeof(struct sockaddr_in);
725 break;
726 case '6':
727 if (cfg_family != PF_UNSPEC)
728 error(1, 0, "Pass one of -4 or -6");
729 cfg_family = PF_INET6;
730 cfg_alen = sizeof(struct sockaddr_in6);
731 break;
732 case 'c':
733 cfg_cork = strtol(optarg, NULL, 0);
734 break;
735 case 'C':
736 cfg_cpu = strtol(optarg, NULL, 0);
737 break;
738 case 'D':
739 daddr = optarg;
740 break;
741 case 'i':
742 cfg_ifindex = if_nametoindex(optarg);
743 if (cfg_ifindex == 0)
744 error(1, errno, "invalid iface: %s", optarg);
745 break;
746 case 'l':
747 cfg_notification_limit = strtoul(optarg, NULL, 0);
748 break;
749 case 'm':
750 cfg_cork_mixed = true;
751 break;
752 case 'p':
753 cfg_port = strtoul(optarg, NULL, 0);
754 break;
755 case 'r':
756 cfg_rx = true;
757 break;
758 case 's':
759 cfg_payload_len = strtoul(optarg, NULL, 0);
760 break;
761 case 'S':
762 saddr = optarg;
763 break;
764 case 't':
765 cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
766 break;
767 case 'v':
768 cfg_verbose++;
769 break;
770 case 'z':
771 cfg_zerocopy = true;
772 break;
773 }
774 }
775
776 cfg_test = argv[argc - 1];
777 if (strcmp(cfg_test, "rds") == 0) {
778 if (!daddr)
779 error(1, 0, "-D <server addr> required for PF_RDS\n");
780 if (!cfg_rx && !saddr)
781 error(1, 0, "-S <client addr> required for PF_RDS\n");
782 }
783 setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
784 setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
785
786 if (cfg_payload_len > max_payload_len)
787 error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
788 if (cfg_cork_mixed && (!cfg_zerocopy || !cfg_cork))
789 error(1, 0, "-m: cork_mixed requires corking and zerocopy");
790
791 if (optind != argc - 1)
792 usage(argv[0]);
793}
794
795int main(int argc, char **argv)
796{
797 const char *cfg_test;
798
799 parse_opts(argc, argv);
800
801 cfg_test = argv[argc - 1];
802
803 if (!strcmp(cfg_test, "packet"))
804 do_test(PF_PACKET, SOCK_RAW, 0);
805 else if (!strcmp(cfg_test, "packet_dgram"))
806 do_test(PF_PACKET, SOCK_DGRAM, 0);
807 else if (!strcmp(cfg_test, "raw"))
808 do_test(cfg_family, SOCK_RAW, IPPROTO_EGP);
809 else if (!strcmp(cfg_test, "raw_hdrincl"))
810 do_test(cfg_family, SOCK_RAW, IPPROTO_RAW);
811 else if (!strcmp(cfg_test, "tcp"))
812 do_test(cfg_family, SOCK_STREAM, 0);
813 else if (!strcmp(cfg_test, "udp"))
814 do_test(cfg_family, SOCK_DGRAM, 0);
815 else if (!strcmp(cfg_test, "rds"))
816 do_test(PF_RDS, SOCK_SEQPACKET, 0);
817 else
818 error(1, 0, "unknown cfg_test %s", cfg_test);
819
820 return 0;
821}