Linux Audio

Check our new training course

Loading...
v6.2
  1/* Evaluate MSG_ZEROCOPY
  2 *
  3 * Send traffic between two processes over one of the supported
  4 * protocols and modes:
  5 *
  6 * PF_INET/PF_INET6
  7 * - SOCK_STREAM
  8 * - SOCK_DGRAM
  9 * - SOCK_DGRAM with UDP_CORK
 10 * - SOCK_RAW
 11 * - SOCK_RAW with IP_HDRINCL
 12 *
 13 * PF_PACKET
 14 * - SOCK_DGRAM
 15 * - SOCK_RAW
 16 *
 17 * PF_RDS
 18 * - SOCK_SEQPACKET
 19 *
 20 * Start this program on two connected hosts, one in send mode and
 21 * the other with option '-r' to put it in receiver mode.
 22 *
 23 * If zerocopy mode ('-z') is enabled, the sender will verify that
 24 * the kernel queues completions on the error queue for all zerocopy
 25 * transfers.
 26 */
 27
 28#define _GNU_SOURCE
 29
 30#include <arpa/inet.h>
 31#include <error.h>
 32#include <errno.h>
 33#include <limits.h>
 34#include <linux/errqueue.h>
 35#include <linux/if_packet.h>
 36#include <linux/ipv6.h>
 37#include <linux/socket.h>
 38#include <linux/sockios.h>
 39#include <net/ethernet.h>
 40#include <net/if.h>
 41#include <netinet/ip.h>
 42#include <netinet/ip6.h>
 43#include <netinet/tcp.h>
 44#include <netinet/udp.h>
 45#include <poll.h>
 46#include <sched.h>
 47#include <stdbool.h>
 48#include <stdio.h>
 49#include <stdint.h>
 50#include <stdlib.h>
 51#include <string.h>
 52#include <sys/ioctl.h>
 53#include <sys/socket.h>
 54#include <sys/stat.h>
 55#include <sys/time.h>
 56#include <sys/types.h>
 57#include <sys/wait.h>
 58#include <unistd.h>
 59#include <linux/rds.h>
 60
 61#ifndef SO_EE_ORIGIN_ZEROCOPY
 62#define SO_EE_ORIGIN_ZEROCOPY		5
 63#endif
 64
 65#ifndef SO_ZEROCOPY
 66#define SO_ZEROCOPY	60
 67#endif
 68
 69#ifndef SO_EE_CODE_ZEROCOPY_COPIED
 70#define SO_EE_CODE_ZEROCOPY_COPIED	1
 71#endif
 72
 73#ifndef MSG_ZEROCOPY
 74#define MSG_ZEROCOPY	0x4000000
 75#endif
 76
 77static int  cfg_cork;
 78static bool cfg_cork_mixed;
 79static int  cfg_cpu		= -1;		/* default: pin to last cpu */
 80static int  cfg_family		= PF_UNSPEC;
 81static int  cfg_ifindex		= 1;
 82static int  cfg_payload_len;
 83static int  cfg_port		= 8000;
 84static bool cfg_rx;
 85static int  cfg_runtime_ms	= 4200;
 86static int  cfg_verbose;
 87static int  cfg_waittime_ms	= 500;
 
 88static bool cfg_zerocopy;
 89
 90static socklen_t cfg_alen;
 91static struct sockaddr_storage cfg_dst_addr;
 92static struct sockaddr_storage cfg_src_addr;
 93
 94static char payload[IP_MAXPACKET];
 95static long packets, bytes, completions, expected_completions;
 96static int  zerocopied = -1;
 97static uint32_t next_completion;
 
 98
 99static unsigned long gettimeofday_ms(void)
100{
101	struct timeval tv;
102
103	gettimeofday(&tv, NULL);
104	return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
105}
106
/* Compute the 16-bit ones' complement checksum over @num_words 16-bit
 * words starting at @start. Returns the inverted, folded sum.
 */
static uint16_t get_ip_csum(const uint16_t *start, int num_words)
{
	const uint16_t *word = start;
	unsigned long acc = 0;
	int left;

	for (left = num_words; left > 0; left--)
		acc += *word++;

	/* fold any carries back into the low 16 bits */
	while (acc > 0xFFFF)
		acc = (acc >> 16) + (acc & 0xFFFF);

	return (uint16_t)~acc;
}
120
121static int do_setcpu(int cpu)
122{
123	cpu_set_t mask;
124
125	CPU_ZERO(&mask);
126	CPU_SET(cpu, &mask);
127	if (sched_setaffinity(0, sizeof(mask), &mask))
128		fprintf(stderr, "cpu: unable to pin, may increase variance.\n");
129	else if (cfg_verbose)
130		fprintf(stderr, "cpu: %u\n", cpu);
131
132	return 0;
133}
134
/* Set an integer socket option; exit with an error message on failure. */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc == 0)
		return;

	error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
140
141static int do_poll(int fd, int events)
142{
143	struct pollfd pfd;
144	int ret;
145
146	pfd.events = events;
147	pfd.revents = 0;
148	pfd.fd = fd;
149
150	ret = poll(&pfd, 1, cfg_waittime_ms);
151	if (ret == -1)
152		error(1, errno, "poll");
153
154	return ret && (pfd.revents & events);
155}
156
/* Accept one connection on listening socket @fd, then close @fd.
 * Returns the accepted socket; any failure terminates the program.
 */
static int do_accept(int fd)
{
	int conn;

	conn = accept(fd, NULL, NULL);
	if (conn == -1)
		error(1, errno, "accept");

	if (close(fd))
		error(1, errno, "close listen sock");

	return conn;
}
169
/* Write an RDS zerocopy cookie control message into @msg's control buffer.
 *
 * @msg:    message whose msg_control already points at a buffer large
 *          enough for one cmsg carrying a uint32_t
 * @cookie: value stored as the cmsg payload
 *
 * Terminates the program if no control buffer is attached.
 */
static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie)
{
	struct cmsghdr *cm;

	/* a missing buffer is a caller bug, not a system error: no errno */
	if (!msg->msg_control)
		error(1, 0, "NULL cookie");
	cm = (void *)msg->msg_control;
	cm->cmsg_len = CMSG_LEN(sizeof(cookie));
	cm->cmsg_level = SOL_RDS;
	cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE;
	memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie));
}
182
183static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain)
184{
185	int ret, len, i, flags;
186	static uint32_t cookie;
187	char ckbuf[CMSG_SPACE(sizeof(cookie))];
188
189	len = 0;
190	for (i = 0; i < msg->msg_iovlen; i++)
191		len += msg->msg_iov[i].iov_len;
192
193	flags = MSG_DONTWAIT;
194	if (do_zerocopy) {
195		flags |= MSG_ZEROCOPY;
196		if (domain == PF_RDS) {
197			memset(&msg->msg_control, 0, sizeof(msg->msg_control));
198			msg->msg_controllen = CMSG_SPACE(sizeof(cookie));
199			msg->msg_control = (struct cmsghdr *)ckbuf;
200			add_zcopy_cookie(msg, ++cookie);
201		}
202	}
203
204	ret = sendmsg(fd, msg, flags);
205	if (ret == -1 && errno == EAGAIN)
206		return false;
207	if (ret == -1)
208		error(1, errno, "send");
209	if (cfg_verbose && ret != len)
210		fprintf(stderr, "send: ret=%u != %u\n", ret, len);
 
211
212	if (len) {
213		packets++;
214		bytes += ret;
215		if (do_zerocopy && ret)
216			expected_completions++;
217	}
218	if (do_zerocopy && domain == PF_RDS) {
219		msg->msg_control = NULL;
220		msg->msg_controllen = 0;
221	}
222
223	return true;
224}
225
226static void do_sendmsg_corked(int fd, struct msghdr *msg)
227{
228	bool do_zerocopy = cfg_zerocopy;
229	int i, payload_len, extra_len;
230
231	/* split up the packet. for non-multiple, make first buffer longer */
232	payload_len = cfg_payload_len / cfg_cork;
233	extra_len = cfg_payload_len - (cfg_cork * payload_len);
234
235	do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
236
237	for (i = 0; i < cfg_cork; i++) {
238
239		/* in mixed-frags mode, alternate zerocopy and copy frags
240		 * start with non-zerocopy, to ensure attach later works
241		 */
242		if (cfg_cork_mixed)
243			do_zerocopy = (i & 1);
244
245		msg->msg_iov[0].iov_len = payload_len + extra_len;
246		extra_len = 0;
247
248		do_sendmsg(fd, msg, do_zerocopy,
249			   (cfg_dst_addr.ss_family == AF_INET ?
250			    PF_INET : PF_INET6));
251	}
252
253	do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
254}
255
256static int setup_iph(struct iphdr *iph, uint16_t payload_len)
257{
258	struct sockaddr_in *daddr = (void *) &cfg_dst_addr;
259	struct sockaddr_in *saddr = (void *) &cfg_src_addr;
260
261	memset(iph, 0, sizeof(*iph));
262
263	iph->version	= 4;
264	iph->tos	= 0;
265	iph->ihl	= 5;
266	iph->ttl	= 2;
267	iph->saddr	= saddr->sin_addr.s_addr;
268	iph->daddr	= daddr->sin_addr.s_addr;
269	iph->protocol	= IPPROTO_EGP;
270	iph->tot_len	= htons(sizeof(*iph) + payload_len);
271	iph->check	= get_ip_csum((void *) iph, iph->ihl << 1);
272
273	return sizeof(*iph);
274}
275
276static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len)
277{
278	struct sockaddr_in6 *daddr = (void *) &cfg_dst_addr;
279	struct sockaddr_in6 *saddr = (void *) &cfg_src_addr;
280
281	memset(ip6h, 0, sizeof(*ip6h));
282
283	ip6h->version		= 6;
284	ip6h->payload_len	= htons(payload_len);
285	ip6h->nexthdr		= IPPROTO_EGP;
286	ip6h->hop_limit		= 2;
287	ip6h->saddr		= saddr->sin6_addr;
288	ip6h->daddr		= daddr->sin6_addr;
289
290	return sizeof(*ip6h);
291}
292
293
294static void setup_sockaddr(int domain, const char *str_addr,
295			   struct sockaddr_storage *sockaddr)
296{
297	struct sockaddr_in6 *addr6 = (void *) sockaddr;
298	struct sockaddr_in *addr4 = (void *) sockaddr;
299
300	switch (domain) {
301	case PF_INET:
302		memset(addr4, 0, sizeof(*addr4));
303		addr4->sin_family = AF_INET;
304		addr4->sin_port = htons(cfg_port);
305		if (str_addr &&
306		    inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
307			error(1, 0, "ipv4 parse error: %s", str_addr);
308		break;
309	case PF_INET6:
310		memset(addr6, 0, sizeof(*addr6));
311		addr6->sin6_family = AF_INET6;
312		addr6->sin6_port = htons(cfg_port);
313		if (str_addr &&
314		    inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
315			error(1, 0, "ipv6 parse error: %s", str_addr);
316		break;
317	default:
318		error(1, 0, "illegal domain");
319	}
320}
321
322static int do_setup_tx(int domain, int type, int protocol)
323{
324	int fd;
325
326	fd = socket(domain, type, protocol);
327	if (fd == -1)
328		error(1, errno, "socket t");
329
330	do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
331	if (cfg_zerocopy)
332		do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
333
334	if (domain != PF_PACKET && domain != PF_RDS)
335		if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
336			error(1, errno, "connect");
337
338	if (domain == PF_RDS) {
339		if (bind(fd, (void *) &cfg_src_addr, cfg_alen))
340			error(1, errno, "bind");
341	}
342
343	return fd;
344}
345
346static uint32_t do_process_zerocopy_cookies(struct rds_zcopy_cookies *ck)
347{
348	int i;
349
350	if (ck->num > RDS_MAX_ZCOOKIES)
351		error(1, 0, "Returned %d cookies, max expected %d\n",
352		      ck->num, RDS_MAX_ZCOOKIES);
353	for (i = 0; i < ck->num; i++)
354		if (cfg_verbose >= 2)
355			fprintf(stderr, "%d\n", ck->cookies[i]);
356	return ck->num;
357}
358
359static bool do_recvmsg_completion(int fd)
360{
361	char cmsgbuf[CMSG_SPACE(sizeof(struct rds_zcopy_cookies))];
362	struct rds_zcopy_cookies *ck;
363	struct cmsghdr *cmsg;
364	struct msghdr msg;
365	bool ret = false;
366
367	memset(&msg, 0, sizeof(msg));
368	msg.msg_control = cmsgbuf;
369	msg.msg_controllen = sizeof(cmsgbuf);
370
371	if (recvmsg(fd, &msg, MSG_DONTWAIT))
372		return ret;
373
374	if (msg.msg_flags & MSG_CTRUNC)
375		error(1, errno, "recvmsg notification: truncated");
376
377	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
378		if (cmsg->cmsg_level == SOL_RDS &&
379		    cmsg->cmsg_type == RDS_CMSG_ZCOPY_COMPLETION) {
380
381			ck = (struct rds_zcopy_cookies *)CMSG_DATA(cmsg);
382			completions += do_process_zerocopy_cookies(ck);
383			ret = true;
384			break;
385		}
386		error(0, 0, "ignoring cmsg at level %d type %d\n",
387			    cmsg->cmsg_level, cmsg->cmsg_type);
388	}
389	return ret;
390}
391
392static bool do_recv_completion(int fd, int domain)
393{
394	struct sock_extended_err *serr;
395	struct msghdr msg = {};
396	struct cmsghdr *cm;
397	uint32_t hi, lo, range;
398	int ret, zerocopy;
399	char control[100];
400
401	if (domain == PF_RDS)
402		return do_recvmsg_completion(fd);
403
404	msg.msg_control = control;
405	msg.msg_controllen = sizeof(control);
406
407	ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
408	if (ret == -1 && errno == EAGAIN)
409		return false;
410	if (ret == -1)
411		error(1, errno, "recvmsg notification");
412	if (msg.msg_flags & MSG_CTRUNC)
413		error(1, errno, "recvmsg notification: truncated");
414
415	cm = CMSG_FIRSTHDR(&msg);
416	if (!cm)
417		error(1, 0, "cmsg: no cmsg");
418	if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
419	      (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR) ||
420	      (cm->cmsg_level == SOL_PACKET && cm->cmsg_type == PACKET_TX_TIMESTAMP)))
421		error(1, 0, "serr: wrong type: %d.%d",
422		      cm->cmsg_level, cm->cmsg_type);
423
424	serr = (void *) CMSG_DATA(cm);
425
426	if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
427		error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
428	if (serr->ee_errno != 0)
429		error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
430
431	hi = serr->ee_data;
432	lo = serr->ee_info;
433	range = hi - lo + 1;
434
435	/* Detect notification gaps. These should not happen often, if at all.
436	 * Gaps can occur due to drops, reordering and retransmissions.
437	 */
438	if (lo != next_completion)
439		fprintf(stderr, "gap: %u..%u does not append to %u\n",
440			lo, hi, next_completion);
441	next_completion = hi + 1;
442
443	zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED);
444	if (zerocopied == -1)
445		zerocopied = zerocopy;
446	else if (zerocopied != zerocopy) {
447		fprintf(stderr, "serr: inconsistent\n");
448		zerocopied = zerocopy;
449	}
450
451	if (cfg_verbose >= 2)
452		fprintf(stderr, "completed: %u (h=%u l=%u)\n",
453			range, hi, lo);
454
455	completions += range;
456	return true;
457}
458
/* Drain the errqueue: keep reading until no notification is returned */
static void do_recv_completions(int fd, int domain)
{
	for (;;) {
		if (!do_recv_completion(fd, domain))
			break;
	}
}
464
465/* Wait for all remaining completions on the errqueue */
466static void do_recv_remaining_completions(int fd, int domain)
467{
468	int64_t tstop = gettimeofday_ms() + cfg_waittime_ms;
469
470	while (completions < expected_completions &&
471	       gettimeofday_ms() < tstop) {
472		if (do_poll(fd, domain == PF_RDS ? POLLIN : POLLERR))
473			do_recv_completions(fd, domain);
474	}
475
476	if (completions < expected_completions)
477		fprintf(stderr, "missing notifications: %lu < %lu\n",
478			completions, expected_completions);
479}
480
481static void do_tx(int domain, int type, int protocol)
482{
483	struct iovec iov[3] = { {0} };
484	struct sockaddr_ll laddr;
485	struct msghdr msg = {0};
486	struct ethhdr eth;
487	union {
488		struct ipv6hdr ip6h;
489		struct iphdr iph;
490	} nh;
491	uint64_t tstop;
492	int fd;
493
494	fd = do_setup_tx(domain, type, protocol);
495
496	if (domain == PF_PACKET) {
497		uint16_t proto = cfg_family == PF_INET ? ETH_P_IP : ETH_P_IPV6;
498
499		/* sock_raw passes ll header as data */
500		if (type == SOCK_RAW) {
501			memset(eth.h_dest, 0x06, ETH_ALEN);
502			memset(eth.h_source, 0x02, ETH_ALEN);
503			eth.h_proto = htons(proto);
504			iov[0].iov_base = &eth;
505			iov[0].iov_len = sizeof(eth);
506			msg.msg_iovlen++;
507		}
508
509		/* both sock_raw and sock_dgram expect name */
510		memset(&laddr, 0, sizeof(laddr));
511		laddr.sll_family	= AF_PACKET;
512		laddr.sll_ifindex	= cfg_ifindex;
513		laddr.sll_protocol	= htons(proto);
514		laddr.sll_halen		= ETH_ALEN;
515
516		memset(laddr.sll_addr, 0x06, ETH_ALEN);
517
518		msg.msg_name		= &laddr;
519		msg.msg_namelen		= sizeof(laddr);
520	}
521
522	/* packet and raw sockets with hdrincl must pass network header */
523	if (domain == PF_PACKET || protocol == IPPROTO_RAW) {
524		if (cfg_family == PF_INET)
525			iov[1].iov_len = setup_iph(&nh.iph, cfg_payload_len);
526		else
527			iov[1].iov_len = setup_ip6h(&nh.ip6h, cfg_payload_len);
528
529		iov[1].iov_base = (void *) &nh;
530		msg.msg_iovlen++;
531	}
532
533	if (domain == PF_RDS) {
534		msg.msg_name = &cfg_dst_addr;
535		msg.msg_namelen =  (cfg_dst_addr.ss_family == AF_INET ?
536				    sizeof(struct sockaddr_in) :
537				    sizeof(struct sockaddr_in6));
538	}
539
540	iov[2].iov_base = payload;
541	iov[2].iov_len = cfg_payload_len;
542	msg.msg_iovlen++;
543	msg.msg_iov = &iov[3 - msg.msg_iovlen];
544
545	tstop = gettimeofday_ms() + cfg_runtime_ms;
546	do {
547		if (cfg_cork)
548			do_sendmsg_corked(fd, &msg);
549		else
550			do_sendmsg(fd, &msg, cfg_zerocopy, domain);
551
 
 
 
552		while (!do_poll(fd, POLLOUT)) {
553			if (cfg_zerocopy)
554				do_recv_completions(fd, domain);
555		}
556
557	} while (gettimeofday_ms() < tstop);
558
559	if (cfg_zerocopy)
560		do_recv_remaining_completions(fd, domain);
561
562	if (close(fd))
563		error(1, errno, "close");
564
565	fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n",
566		packets, bytes >> 20, completions,
567		zerocopied == 1 ? 'y' : 'n');
568}
569
570static int do_setup_rx(int domain, int type, int protocol)
571{
572	int fd;
573
574	/* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW,
575	 * to recv the only copy of the packet, not a clone
576	 */
577	if (domain == PF_PACKET)
578		error(1, 0, "Use PF_INET/SOCK_RAW to read");
579
580	if (type == SOCK_RAW && protocol == IPPROTO_RAW)
581		error(1, 0, "IPPROTO_RAW: not supported on Rx");
582
583	fd = socket(domain, type, protocol);
584	if (fd == -1)
585		error(1, errno, "socket r");
586
587	do_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 1 << 21);
588	do_setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, 1 << 16);
589	do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
590
591	if (bind(fd, (void *) &cfg_dst_addr, cfg_alen))
592		error(1, errno, "bind");
593
594	if (type == SOCK_STREAM) {
595		if (listen(fd, 1))
596			error(1, errno, "listen");
597		fd = do_accept(fd);
598	}
599
600	return fd;
601}
602
603/* Flush all outstanding bytes for the tcp receive queue */
604static void do_flush_tcp(int fd)
605{
606	int ret;
607
608	/* MSG_TRUNC flushes up to len bytes */
609	ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
610	if (ret == -1 && errno == EAGAIN)
611		return;
612	if (ret == -1)
613		error(1, errno, "flush");
614	if (!ret)
615		return;
616
617	packets++;
618	bytes += ret;
619}
620
621/* Flush all outstanding datagrams. Verify first few bytes of each. */
622static void do_flush_datagram(int fd, int type)
623{
624	int ret, off = 0;
625	char buf[64];
626
627	/* MSG_TRUNC will return full datagram length */
628	ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC);
629	if (ret == -1 && errno == EAGAIN)
630		return;
631
632	/* raw ipv4 return with header, raw ipv6 without */
633	if (cfg_family == PF_INET && type == SOCK_RAW) {
634		off += sizeof(struct iphdr);
635		ret -= sizeof(struct iphdr);
636	}
637
638	if (ret == -1)
639		error(1, errno, "recv");
640	if (ret != cfg_payload_len)
641		error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
642	if (ret > sizeof(buf) - off)
643		ret = sizeof(buf) - off;
644	if (memcmp(buf + off, payload, ret))
645		error(1, 0, "recv: data mismatch");
646
647	packets++;
648	bytes += cfg_payload_len;
649}
650
651static void do_rx(int domain, int type, int protocol)
652{
653	const int cfg_receiver_wait_ms = 400;
654	uint64_t tstop;
655	int fd;
656
657	fd = do_setup_rx(domain, type, protocol);
658
659	tstop = gettimeofday_ms() + cfg_runtime_ms + cfg_receiver_wait_ms;
660	do {
661		if (type == SOCK_STREAM)
662			do_flush_tcp(fd);
663		else
664			do_flush_datagram(fd, type);
665
666		do_poll(fd, POLLIN);
667
668	} while (gettimeofday_ms() < tstop);
669
670	if (close(fd))
671		error(1, errno, "close");
672
673	fprintf(stderr, "rx=%lu (%lu MB)\n", packets, bytes >> 20);
674}
675
676static void do_test(int domain, int type, int protocol)
677{
678	int i;
679
680	if (cfg_cork && (domain == PF_PACKET || type != SOCK_DGRAM))
681		error(1, 0, "can only cork udp sockets");
682
683	do_setcpu(cfg_cpu);
684
685	for (i = 0; i < IP_MAXPACKET; i++)
686		payload[i] = 'a' + (i % 26);
687
688	if (cfg_rx)
689		do_rx(domain, type, protocol);
690	else
691		do_tx(domain, type, protocol);
692}
693
694static void usage(const char *filepath)
695{
696	error(1, 0, "Usage: %s [options] <test>", filepath);
697}
698
699static void parse_opts(int argc, char **argv)
700{
701	const int max_payload_len = sizeof(payload) -
702				    sizeof(struct ipv6hdr) -
703				    sizeof(struct tcphdr) -
704				    40 /* max tcp options */;
705	int c;
706	char *daddr = NULL, *saddr = NULL;
707	char *cfg_test;
708
709	cfg_payload_len = max_payload_len;
710
711	while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
712		switch (c) {
713		case '4':
714			if (cfg_family != PF_UNSPEC)
715				error(1, 0, "Pass one of -4 or -6");
716			cfg_family = PF_INET;
717			cfg_alen = sizeof(struct sockaddr_in);
718			break;
719		case '6':
720			if (cfg_family != PF_UNSPEC)
721				error(1, 0, "Pass one of -4 or -6");
722			cfg_family = PF_INET6;
723			cfg_alen = sizeof(struct sockaddr_in6);
724			break;
725		case 'c':
726			cfg_cork = strtol(optarg, NULL, 0);
727			break;
728		case 'C':
729			cfg_cpu = strtol(optarg, NULL, 0);
730			break;
731		case 'D':
732			daddr = optarg;
733			break;
734		case 'i':
735			cfg_ifindex = if_nametoindex(optarg);
736			if (cfg_ifindex == 0)
737				error(1, errno, "invalid iface: %s", optarg);
 
 
 
738			break;
739		case 'm':
740			cfg_cork_mixed = true;
741			break;
742		case 'p':
743			cfg_port = strtoul(optarg, NULL, 0);
744			break;
745		case 'r':
746			cfg_rx = true;
747			break;
748		case 's':
749			cfg_payload_len = strtoul(optarg, NULL, 0);
750			break;
751		case 'S':
752			saddr = optarg;
753			break;
754		case 't':
755			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
756			break;
757		case 'v':
758			cfg_verbose++;
759			break;
760		case 'z':
761			cfg_zerocopy = true;
762			break;
763		}
764	}
765
766	cfg_test = argv[argc - 1];
767	if (strcmp(cfg_test, "rds") == 0) {
768		if (!daddr)
769			error(1, 0, "-D <server addr> required for PF_RDS\n");
770		if (!cfg_rx && !saddr)
771			error(1, 0, "-S <client addr> required for PF_RDS\n");
772	}
773	setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
774	setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
775
776	if (cfg_payload_len > max_payload_len)
777		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
778	if (cfg_cork_mixed && (!cfg_zerocopy || !cfg_cork))
779		error(1, 0, "-m: cork_mixed requires corking and zerocopy");
780
781	if (optind != argc - 1)
782		usage(argv[0]);
783}
784
785int main(int argc, char **argv)
786{
787	const char *cfg_test;
788
789	parse_opts(argc, argv);
790
791	cfg_test = argv[argc - 1];
792
793	if (!strcmp(cfg_test, "packet"))
794		do_test(PF_PACKET, SOCK_RAW, 0);
795	else if (!strcmp(cfg_test, "packet_dgram"))
796		do_test(PF_PACKET, SOCK_DGRAM, 0);
797	else if (!strcmp(cfg_test, "raw"))
798		do_test(cfg_family, SOCK_RAW, IPPROTO_EGP);
799	else if (!strcmp(cfg_test, "raw_hdrincl"))
800		do_test(cfg_family, SOCK_RAW, IPPROTO_RAW);
801	else if (!strcmp(cfg_test, "tcp"))
802		do_test(cfg_family, SOCK_STREAM, 0);
803	else if (!strcmp(cfg_test, "udp"))
804		do_test(cfg_family, SOCK_DGRAM, 0);
805	else if (!strcmp(cfg_test, "rds"))
806		do_test(PF_RDS, SOCK_SEQPACKET, 0);
807	else
808		error(1, 0, "unknown cfg_test %s", cfg_test);
809
810	return 0;
811}
v6.13.7
  1/* Evaluate MSG_ZEROCOPY
  2 *
  3 * Send traffic between two processes over one of the supported
  4 * protocols and modes:
  5 *
  6 * PF_INET/PF_INET6
  7 * - SOCK_STREAM
  8 * - SOCK_DGRAM
  9 * - SOCK_DGRAM with UDP_CORK
 10 * - SOCK_RAW
 11 * - SOCK_RAW with IP_HDRINCL
 12 *
 13 * PF_PACKET
 14 * - SOCK_DGRAM
 15 * - SOCK_RAW
 16 *
 17 * PF_RDS
 18 * - SOCK_SEQPACKET
 19 *
 20 * Start this program on two connected hosts, one in send mode and
 21 * the other with option '-r' to put it in receiver mode.
 22 *
 23 * If zerocopy mode ('-z') is enabled, the sender will verify that
 24 * the kernel queues completions on the error queue for all zerocopy
 25 * transfers.
 26 */
 27
 28#define _GNU_SOURCE
 29
 30#include <arpa/inet.h>
 31#include <error.h>
 32#include <errno.h>
 33#include <limits.h>
 34#include <linux/errqueue.h>
 35#include <linux/if_packet.h>
 36#include <linux/ipv6.h>
 37#include <linux/socket.h>
 38#include <linux/sockios.h>
 39#include <net/ethernet.h>
 40#include <net/if.h>
 41#include <netinet/ip.h>
 42#include <netinet/ip6.h>
 43#include <netinet/tcp.h>
 44#include <netinet/udp.h>
 45#include <poll.h>
 46#include <sched.h>
 47#include <stdbool.h>
 48#include <stdio.h>
 49#include <stdint.h>
 50#include <stdlib.h>
 51#include <string.h>
 52#include <sys/ioctl.h>
 53#include <sys/socket.h>
 54#include <sys/stat.h>
 55#include <sys/time.h>
 56#include <sys/types.h>
 57#include <sys/wait.h>
 58#include <unistd.h>
 59#include <linux/rds.h>
 60
 61#ifndef SO_EE_ORIGIN_ZEROCOPY
 62#define SO_EE_ORIGIN_ZEROCOPY		5
 63#endif
 64
 65#ifndef SO_ZEROCOPY
 66#define SO_ZEROCOPY	60
 67#endif
 68
 69#ifndef SO_EE_CODE_ZEROCOPY_COPIED
 70#define SO_EE_CODE_ZEROCOPY_COPIED	1
 71#endif
 72
 73#ifndef MSG_ZEROCOPY
 74#define MSG_ZEROCOPY	0x4000000
 75#endif
 76
 77static int  cfg_cork;
 78static bool cfg_cork_mixed;
 79static int  cfg_cpu		= -1;		/* default: pin to last cpu */
 80static int  cfg_family		= PF_UNSPEC;
 81static int  cfg_ifindex		= 1;
 82static int  cfg_payload_len;
 83static int  cfg_port		= 8000;
 84static bool cfg_rx;
 85static int  cfg_runtime_ms	= 4200;
 86static int  cfg_verbose;
 87static int  cfg_waittime_ms	= 500;
 88static int  cfg_notification_limit = 32;
 89static bool cfg_zerocopy;
 90
 91static socklen_t cfg_alen;
 92static struct sockaddr_storage cfg_dst_addr;
 93static struct sockaddr_storage cfg_src_addr;
 94
 95static char payload[IP_MAXPACKET];
 96static long packets, bytes, completions, expected_completions;
 97static int  zerocopied = -1;
 98static uint32_t next_completion;
 99static uint32_t sends_since_notify;
100
/* Return the current gettimeofday() time expressed in milliseconds. */
static unsigned long gettimeofday_ms(void)
{
	struct timeval now;
	unsigned long msec;

	gettimeofday(&now, NULL);

	msec = now.tv_sec * 1000;
	msec += now.tv_usec / 1000;

	return msec;
}
108
/* Compute the 16-bit ones' complement checksum over @num_words 16-bit
 * words starting at @start. Returns the inverted, folded sum.
 */
static uint16_t get_ip_csum(const uint16_t *start, int num_words)
{
	const uint16_t *word = start;
	unsigned long acc = 0;
	int left;

	for (left = num_words; left > 0; left--)
		acc += *word++;

	/* fold any carries back into the low 16 bits */
	while (acc > 0xFFFF)
		acc = (acc >> 16) + (acc & 0xFFFF);

	return (uint16_t)~acc;
}
122
123static int do_setcpu(int cpu)
124{
125	cpu_set_t mask;
126
127	CPU_ZERO(&mask);
128	CPU_SET(cpu, &mask);
129	if (sched_setaffinity(0, sizeof(mask), &mask))
130		fprintf(stderr, "cpu: unable to pin, may increase variance.\n");
131	else if (cfg_verbose)
132		fprintf(stderr, "cpu: %u\n", cpu);
133
134	return 0;
135}
136
/* Set an integer socket option; exit with an error message on failure. */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc == 0)
		return;

	error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
142
143static int do_poll(int fd, int events)
144{
145	struct pollfd pfd;
146	int ret;
147
148	pfd.events = events;
149	pfd.revents = 0;
150	pfd.fd = fd;
151
152	ret = poll(&pfd, 1, cfg_waittime_ms);
153	if (ret == -1)
154		error(1, errno, "poll");
155
156	return ret && (pfd.revents & events);
157}
158
/* Accept one connection on listening socket @fd, then close @fd.
 * Returns the accepted socket; any failure terminates the program.
 */
static int do_accept(int fd)
{
	int conn;

	conn = accept(fd, NULL, NULL);
	if (conn == -1)
		error(1, errno, "accept");

	if (close(fd))
		error(1, errno, "close listen sock");

	return conn;
}
171
/* Write an RDS zerocopy cookie control message into @msg's control buffer.
 *
 * @msg:    message whose msg_control already points at a buffer large
 *          enough for one cmsg carrying a uint32_t
 * @cookie: value stored as the cmsg payload
 *
 * Terminates the program if no control buffer is attached.
 */
static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie)
{
	struct cmsghdr *cm;

	/* a missing buffer is a caller bug, not a system error: no errno */
	if (!msg->msg_control)
		error(1, 0, "NULL cookie");
	cm = (void *)msg->msg_control;
	cm->cmsg_len = CMSG_LEN(sizeof(cookie));
	cm->cmsg_level = SOL_RDS;
	cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE;
	memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie));
}
184
185static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain)
186{
187	int ret, len, i, flags;
188	static uint32_t cookie;
189	char ckbuf[CMSG_SPACE(sizeof(cookie))];
190
191	len = 0;
192	for (i = 0; i < msg->msg_iovlen; i++)
193		len += msg->msg_iov[i].iov_len;
194
195	flags = MSG_DONTWAIT;
196	if (do_zerocopy) {
197		flags |= MSG_ZEROCOPY;
198		if (domain == PF_RDS) {
199			memset(&msg->msg_control, 0, sizeof(msg->msg_control));
200			msg->msg_controllen = CMSG_SPACE(sizeof(cookie));
201			msg->msg_control = (struct cmsghdr *)ckbuf;
202			add_zcopy_cookie(msg, ++cookie);
203		}
204	}
205
206	ret = sendmsg(fd, msg, flags);
207	if (ret == -1 && errno == EAGAIN)
208		return false;
209	if (ret == -1)
210		error(1, errno, "send");
211	if (cfg_verbose && ret != len)
212		fprintf(stderr, "send: ret=%u != %u\n", ret, len);
213	sends_since_notify++;
214
215	if (len) {
216		packets++;
217		bytes += ret;
218		if (do_zerocopy && ret)
219			expected_completions++;
220	}
221	if (do_zerocopy && domain == PF_RDS) {
222		msg->msg_control = NULL;
223		msg->msg_controllen = 0;
224	}
225
226	return true;
227}
228
229static void do_sendmsg_corked(int fd, struct msghdr *msg)
230{
231	bool do_zerocopy = cfg_zerocopy;
232	int i, payload_len, extra_len;
233
234	/* split up the packet. for non-multiple, make first buffer longer */
235	payload_len = cfg_payload_len / cfg_cork;
236	extra_len = cfg_payload_len - (cfg_cork * payload_len);
237
238	do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
239
240	for (i = 0; i < cfg_cork; i++) {
241
242		/* in mixed-frags mode, alternate zerocopy and copy frags
243		 * start with non-zerocopy, to ensure attach later works
244		 */
245		if (cfg_cork_mixed)
246			do_zerocopy = (i & 1);
247
248		msg->msg_iov[0].iov_len = payload_len + extra_len;
249		extra_len = 0;
250
251		do_sendmsg(fd, msg, do_zerocopy,
252			   (cfg_dst_addr.ss_family == AF_INET ?
253			    PF_INET : PF_INET6));
254	}
255
256	do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
257}
258
259static int setup_iph(struct iphdr *iph, uint16_t payload_len)
260{
261	struct sockaddr_in *daddr = (void *) &cfg_dst_addr;
262	struct sockaddr_in *saddr = (void *) &cfg_src_addr;
263
264	memset(iph, 0, sizeof(*iph));
265
266	iph->version	= 4;
267	iph->tos	= 0;
268	iph->ihl	= 5;
269	iph->ttl	= 2;
270	iph->saddr	= saddr->sin_addr.s_addr;
271	iph->daddr	= daddr->sin_addr.s_addr;
272	iph->protocol	= IPPROTO_EGP;
273	iph->tot_len	= htons(sizeof(*iph) + payload_len);
274	iph->check	= get_ip_csum((void *) iph, iph->ihl << 1);
275
276	return sizeof(*iph);
277}
278
279static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len)
280{
281	struct sockaddr_in6 *daddr = (void *) &cfg_dst_addr;
282	struct sockaddr_in6 *saddr = (void *) &cfg_src_addr;
283
284	memset(ip6h, 0, sizeof(*ip6h));
285
286	ip6h->version		= 6;
287	ip6h->payload_len	= htons(payload_len);
288	ip6h->nexthdr		= IPPROTO_EGP;
289	ip6h->hop_limit		= 2;
290	ip6h->saddr		= saddr->sin6_addr;
291	ip6h->daddr		= daddr->sin6_addr;
292
293	return sizeof(*ip6h);
294}
295
296
297static void setup_sockaddr(int domain, const char *str_addr,
298			   struct sockaddr_storage *sockaddr)
299{
300	struct sockaddr_in6 *addr6 = (void *) sockaddr;
301	struct sockaddr_in *addr4 = (void *) sockaddr;
302
303	switch (domain) {
304	case PF_INET:
305		memset(addr4, 0, sizeof(*addr4));
306		addr4->sin_family = AF_INET;
307		addr4->sin_port = htons(cfg_port);
308		if (str_addr &&
309		    inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
310			error(1, 0, "ipv4 parse error: %s", str_addr);
311		break;
312	case PF_INET6:
313		memset(addr6, 0, sizeof(*addr6));
314		addr6->sin6_family = AF_INET6;
315		addr6->sin6_port = htons(cfg_port);
316		if (str_addr &&
317		    inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
318			error(1, 0, "ipv6 parse error: %s", str_addr);
319		break;
320	default:
321		error(1, 0, "illegal domain");
322	}
323}
324
325static int do_setup_tx(int domain, int type, int protocol)
326{
327	int fd;
328
329	fd = socket(domain, type, protocol);
330	if (fd == -1)
331		error(1, errno, "socket t");
332
333	do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
334	if (cfg_zerocopy)
335		do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
336
337	if (domain != PF_PACKET && domain != PF_RDS)
338		if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
339			error(1, errno, "connect");
340
341	if (domain == PF_RDS) {
342		if (bind(fd, (void *) &cfg_src_addr, cfg_alen))
343			error(1, errno, "bind");
344	}
345
346	return fd;
347}
348
349static uint32_t do_process_zerocopy_cookies(struct rds_zcopy_cookies *ck)
350{
351	int i;
352
353	if (ck->num > RDS_MAX_ZCOOKIES)
354		error(1, 0, "Returned %d cookies, max expected %d\n",
355		      ck->num, RDS_MAX_ZCOOKIES);
356	for (i = 0; i < ck->num; i++)
357		if (cfg_verbose >= 2)
358			fprintf(stderr, "%d\n", ck->cookies[i]);
359	return ck->num;
360}
361
362static bool do_recvmsg_completion(int fd)
363{
364	char cmsgbuf[CMSG_SPACE(sizeof(struct rds_zcopy_cookies))];
365	struct rds_zcopy_cookies *ck;
366	struct cmsghdr *cmsg;
367	struct msghdr msg;
368	bool ret = false;
369
370	memset(&msg, 0, sizeof(msg));
371	msg.msg_control = cmsgbuf;
372	msg.msg_controllen = sizeof(cmsgbuf);
373
374	if (recvmsg(fd, &msg, MSG_DONTWAIT))
375		return ret;
376
377	if (msg.msg_flags & MSG_CTRUNC)
378		error(1, errno, "recvmsg notification: truncated");
379
380	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
381		if (cmsg->cmsg_level == SOL_RDS &&
382		    cmsg->cmsg_type == RDS_CMSG_ZCOPY_COMPLETION) {
383
384			ck = (struct rds_zcopy_cookies *)CMSG_DATA(cmsg);
385			completions += do_process_zerocopy_cookies(ck);
386			ret = true;
387			break;
388		}
389		error(0, 0, "ignoring cmsg at level %d type %d\n",
390			    cmsg->cmsg_level, cmsg->cmsg_type);
391	}
392	return ret;
393}
394
395static bool do_recv_completion(int fd, int domain)
396{
397	struct sock_extended_err *serr;
398	struct msghdr msg = {};
399	struct cmsghdr *cm;
400	uint32_t hi, lo, range;
401	int ret, zerocopy;
402	char control[100];
403
404	if (domain == PF_RDS)
405		return do_recvmsg_completion(fd);
406
407	msg.msg_control = control;
408	msg.msg_controllen = sizeof(control);
409
410	ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
411	if (ret == -1 && errno == EAGAIN)
412		return false;
413	if (ret == -1)
414		error(1, errno, "recvmsg notification");
415	if (msg.msg_flags & MSG_CTRUNC)
416		error(1, errno, "recvmsg notification: truncated");
417
418	cm = CMSG_FIRSTHDR(&msg);
419	if (!cm)
420		error(1, 0, "cmsg: no cmsg");
421	if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
422	      (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR) ||
423	      (cm->cmsg_level == SOL_PACKET && cm->cmsg_type == PACKET_TX_TIMESTAMP)))
424		error(1, 0, "serr: wrong type: %d.%d",
425		      cm->cmsg_level, cm->cmsg_type);
426
427	serr = (void *) CMSG_DATA(cm);
428
429	if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
430		error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
431	if (serr->ee_errno != 0)
432		error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
433
434	hi = serr->ee_data;
435	lo = serr->ee_info;
436	range = hi - lo + 1;
437
438	/* Detect notification gaps. These should not happen often, if at all.
439	 * Gaps can occur due to drops, reordering and retransmissions.
440	 */
441	if (cfg_verbose && lo != next_completion)
442		fprintf(stderr, "gap: %u..%u does not append to %u\n",
443			lo, hi, next_completion);
444	next_completion = hi + 1;
445
446	zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED);
447	if (zerocopied == -1)
448		zerocopied = zerocopy;
449	else if (zerocopied != zerocopy) {
450		fprintf(stderr, "serr: inconsistent\n");
451		zerocopied = zerocopy;
452	}
453
454	if (cfg_verbose >= 2)
455		fprintf(stderr, "completed: %u (h=%u l=%u)\n",
456			range, hi, lo);
457
458	completions += range;
459	return true;
460}
461
462/* Read all outstanding messages on the errqueue */
463static void do_recv_completions(int fd, int domain)
464{
465	while (do_recv_completion(fd, domain)) {}
466	sends_since_notify = 0;
467}
468
469/* Wait for all remaining completions on the errqueue */
470static void do_recv_remaining_completions(int fd, int domain)
471{
472	int64_t tstop = gettimeofday_ms() + cfg_waittime_ms;
473
474	while (completions < expected_completions &&
475	       gettimeofday_ms() < tstop) {
476		if (do_poll(fd, domain == PF_RDS ? POLLIN : POLLERR))
477			do_recv_completions(fd, domain);
478	}
479
480	if (completions < expected_completions)
481		fprintf(stderr, "missing notifications: %lu < %lu\n",
482			completions, expected_completions);
483}
484
485static void do_tx(int domain, int type, int protocol)
486{
487	struct iovec iov[3] = { {0} };
488	struct sockaddr_ll laddr;
489	struct msghdr msg = {0};
490	struct ethhdr eth;
491	union {
492		struct ipv6hdr ip6h;
493		struct iphdr iph;
494	} nh;
495	uint64_t tstop;
496	int fd;
497
498	fd = do_setup_tx(domain, type, protocol);
499
500	if (domain == PF_PACKET) {
501		uint16_t proto = cfg_family == PF_INET ? ETH_P_IP : ETH_P_IPV6;
502
503		/* sock_raw passes ll header as data */
504		if (type == SOCK_RAW) {
505			memset(eth.h_dest, 0x06, ETH_ALEN);
506			memset(eth.h_source, 0x02, ETH_ALEN);
507			eth.h_proto = htons(proto);
508			iov[0].iov_base = &eth;
509			iov[0].iov_len = sizeof(eth);
510			msg.msg_iovlen++;
511		}
512
513		/* both sock_raw and sock_dgram expect name */
514		memset(&laddr, 0, sizeof(laddr));
515		laddr.sll_family	= AF_PACKET;
516		laddr.sll_ifindex	= cfg_ifindex;
517		laddr.sll_protocol	= htons(proto);
518		laddr.sll_halen		= ETH_ALEN;
519
520		memset(laddr.sll_addr, 0x06, ETH_ALEN);
521
522		msg.msg_name		= &laddr;
523		msg.msg_namelen		= sizeof(laddr);
524	}
525
526	/* packet and raw sockets with hdrincl must pass network header */
527	if (domain == PF_PACKET || protocol == IPPROTO_RAW) {
528		if (cfg_family == PF_INET)
529			iov[1].iov_len = setup_iph(&nh.iph, cfg_payload_len);
530		else
531			iov[1].iov_len = setup_ip6h(&nh.ip6h, cfg_payload_len);
532
533		iov[1].iov_base = (void *) &nh;
534		msg.msg_iovlen++;
535	}
536
537	if (domain == PF_RDS) {
538		msg.msg_name = &cfg_dst_addr;
539		msg.msg_namelen =  (cfg_dst_addr.ss_family == AF_INET ?
540				    sizeof(struct sockaddr_in) :
541				    sizeof(struct sockaddr_in6));
542	}
543
544	iov[2].iov_base = payload;
545	iov[2].iov_len = cfg_payload_len;
546	msg.msg_iovlen++;
547	msg.msg_iov = &iov[3 - msg.msg_iovlen];
548
549	tstop = gettimeofday_ms() + cfg_runtime_ms;
550	do {
551		if (cfg_cork)
552			do_sendmsg_corked(fd, &msg);
553		else
554			do_sendmsg(fd, &msg, cfg_zerocopy, domain);
555
556		if (cfg_zerocopy && sends_since_notify >= cfg_notification_limit)
557			do_recv_completions(fd, domain);
558
559		while (!do_poll(fd, POLLOUT)) {
560			if (cfg_zerocopy)
561				do_recv_completions(fd, domain);
562		}
563
564	} while (gettimeofday_ms() < tstop);
565
566	if (cfg_zerocopy)
567		do_recv_remaining_completions(fd, domain);
568
569	if (close(fd))
570		error(1, errno, "close");
571
572	fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n",
573		packets, bytes >> 20, completions,
574		zerocopied == 1 ? 'y' : 'n');
575}
576
577static int do_setup_rx(int domain, int type, int protocol)
578{
579	int fd;
580
581	/* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW,
582	 * to recv the only copy of the packet, not a clone
583	 */
584	if (domain == PF_PACKET)
585		error(1, 0, "Use PF_INET/SOCK_RAW to read");
586
587	if (type == SOCK_RAW && protocol == IPPROTO_RAW)
588		error(1, 0, "IPPROTO_RAW: not supported on Rx");
589
590	fd = socket(domain, type, protocol);
591	if (fd == -1)
592		error(1, errno, "socket r");
593
594	do_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 1 << 21);
595	do_setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, 1 << 16);
596	do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
597
598	if (bind(fd, (void *) &cfg_dst_addr, cfg_alen))
599		error(1, errno, "bind");
600
601	if (type == SOCK_STREAM) {
602		if (listen(fd, 1))
603			error(1, errno, "listen");
604		fd = do_accept(fd);
605	}
606
607	return fd;
608}
609
610/* Flush all outstanding bytes for the tcp receive queue */
611static void do_flush_tcp(int fd)
612{
613	int ret;
614
615	/* MSG_TRUNC flushes up to len bytes */
616	ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
617	if (ret == -1 && errno == EAGAIN)
618		return;
619	if (ret == -1)
620		error(1, errno, "flush");
621	if (!ret)
622		return;
623
624	packets++;
625	bytes += ret;
626}
627
628/* Flush all outstanding datagrams. Verify first few bytes of each. */
629static void do_flush_datagram(int fd, int type)
630{
631	int ret, off = 0;
632	char buf[64];
633
634	/* MSG_TRUNC will return full datagram length */
635	ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC);
636	if (ret == -1 && errno == EAGAIN)
637		return;
638
639	/* raw ipv4 return with header, raw ipv6 without */
640	if (cfg_family == PF_INET && type == SOCK_RAW) {
641		off += sizeof(struct iphdr);
642		ret -= sizeof(struct iphdr);
643	}
644
645	if (ret == -1)
646		error(1, errno, "recv");
647	if (ret != cfg_payload_len)
648		error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
649	if (ret > sizeof(buf) - off)
650		ret = sizeof(buf) - off;
651	if (memcmp(buf + off, payload, ret))
652		error(1, 0, "recv: data mismatch");
653
654	packets++;
655	bytes += cfg_payload_len;
656}
657
658static void do_rx(int domain, int type, int protocol)
659{
660	const int cfg_receiver_wait_ms = 400;
661	uint64_t tstop;
662	int fd;
663
664	fd = do_setup_rx(domain, type, protocol);
665
666	tstop = gettimeofday_ms() + cfg_runtime_ms + cfg_receiver_wait_ms;
667	do {
668		if (type == SOCK_STREAM)
669			do_flush_tcp(fd);
670		else
671			do_flush_datagram(fd, type);
672
673		do_poll(fd, POLLIN);
674
675	} while (gettimeofday_ms() < tstop);
676
677	if (close(fd))
678		error(1, errno, "close");
679
680	fprintf(stderr, "rx=%lu (%lu MB)\n", packets, bytes >> 20);
681}
682
683static void do_test(int domain, int type, int protocol)
684{
685	int i;
686
687	if (cfg_cork && (domain == PF_PACKET || type != SOCK_DGRAM))
688		error(1, 0, "can only cork udp sockets");
689
690	do_setcpu(cfg_cpu);
691
692	for (i = 0; i < IP_MAXPACKET; i++)
693		payload[i] = 'a' + (i % 26);
694
695	if (cfg_rx)
696		do_rx(domain, type, protocol);
697	else
698		do_tx(domain, type, protocol);
699}
700
701static void usage(const char *filepath)
702{
703	error(1, 0, "Usage: %s [options] <test>", filepath);
704}
705
706static void parse_opts(int argc, char **argv)
707{
708	const int max_payload_len = sizeof(payload) -
709				    sizeof(struct ipv6hdr) -
710				    sizeof(struct tcphdr) -
711				    40 /* max tcp options */;
712	int c;
713	char *daddr = NULL, *saddr = NULL;
714	char *cfg_test;
715
716	cfg_payload_len = max_payload_len;
717
718	while ((c = getopt(argc, argv, "46c:C:D:i:l:mp:rs:S:t:vz")) != -1) {
719		switch (c) {
720		case '4':
721			if (cfg_family != PF_UNSPEC)
722				error(1, 0, "Pass one of -4 or -6");
723			cfg_family = PF_INET;
724			cfg_alen = sizeof(struct sockaddr_in);
725			break;
726		case '6':
727			if (cfg_family != PF_UNSPEC)
728				error(1, 0, "Pass one of -4 or -6");
729			cfg_family = PF_INET6;
730			cfg_alen = sizeof(struct sockaddr_in6);
731			break;
732		case 'c':
733			cfg_cork = strtol(optarg, NULL, 0);
734			break;
735		case 'C':
736			cfg_cpu = strtol(optarg, NULL, 0);
737			break;
738		case 'D':
739			daddr = optarg;
740			break;
741		case 'i':
742			cfg_ifindex = if_nametoindex(optarg);
743			if (cfg_ifindex == 0)
744				error(1, errno, "invalid iface: %s", optarg);
745			break;
746		case 'l':
747			cfg_notification_limit = strtoul(optarg, NULL, 0);
748			break;
749		case 'm':
750			cfg_cork_mixed = true;
751			break;
752		case 'p':
753			cfg_port = strtoul(optarg, NULL, 0);
754			break;
755		case 'r':
756			cfg_rx = true;
757			break;
758		case 's':
759			cfg_payload_len = strtoul(optarg, NULL, 0);
760			break;
761		case 'S':
762			saddr = optarg;
763			break;
764		case 't':
765			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
766			break;
767		case 'v':
768			cfg_verbose++;
769			break;
770		case 'z':
771			cfg_zerocopy = true;
772			break;
773		}
774	}
775
776	cfg_test = argv[argc - 1];
777	if (strcmp(cfg_test, "rds") == 0) {
778		if (!daddr)
779			error(1, 0, "-D <server addr> required for PF_RDS\n");
780		if (!cfg_rx && !saddr)
781			error(1, 0, "-S <client addr> required for PF_RDS\n");
782	}
783	setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
784	setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
785
786	if (cfg_payload_len > max_payload_len)
787		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
788	if (cfg_cork_mixed && (!cfg_zerocopy || !cfg_cork))
789		error(1, 0, "-m: cork_mixed requires corking and zerocopy");
790
791	if (optind != argc - 1)
792		usage(argv[0]);
793}
794
795int main(int argc, char **argv)
796{
797	const char *cfg_test;
798
799	parse_opts(argc, argv);
800
801	cfg_test = argv[argc - 1];
802
803	if (!strcmp(cfg_test, "packet"))
804		do_test(PF_PACKET, SOCK_RAW, 0);
805	else if (!strcmp(cfg_test, "packet_dgram"))
806		do_test(PF_PACKET, SOCK_DGRAM, 0);
807	else if (!strcmp(cfg_test, "raw"))
808		do_test(cfg_family, SOCK_RAW, IPPROTO_EGP);
809	else if (!strcmp(cfg_test, "raw_hdrincl"))
810		do_test(cfg_family, SOCK_RAW, IPPROTO_RAW);
811	else if (!strcmp(cfg_test, "tcp"))
812		do_test(cfg_family, SOCK_STREAM, 0);
813	else if (!strcmp(cfg_test, "udp"))
814		do_test(cfg_family, SOCK_DGRAM, 0);
815	else if (!strcmp(cfg_test, "rds"))
816		do_test(PF_RDS, SOCK_SEQPACKET, 0);
817	else
818		error(1, 0, "unknown cfg_test %s", cfg_test);
819
820	return 0;
821}