   1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
   2/* -
   3 * net/sched/act_ct.c  Connection Tracking action
   4 *
   5 * Authors:   Paul Blakey <paulb@mellanox.com>
   6 *            Yossi Kuperman <yossiku@mellanox.com>
   7 *            Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
   8 */
   9
  10#include <linux/module.h>
  11#include <linux/init.h>
  12#include <linux/kernel.h>
  13#include <linux/skbuff.h>
  14#include <linux/rtnetlink.h>
  15#include <linux/pkt_cls.h>
  16#include <linux/ip.h>
  17#include <linux/ipv6.h>
  18#include <linux/rhashtable.h>
  19#include <net/netlink.h>
  20#include <net/pkt_sched.h>
  21#include <net/pkt_cls.h>
  22#include <net/act_api.h>
  23#include <net/ip.h>
  24#include <net/ipv6_frag.h>
  25#include <uapi/linux/tc_act/tc_ct.h>
  26#include <net/tc_act/tc_ct.h>
  27#include <net/tc_wrapper.h>
  28
  29#include <net/netfilter/nf_flow_table.h>
  30#include <net/netfilter/nf_conntrack.h>
  31#include <net/netfilter/nf_conntrack_core.h>
  32#include <net/netfilter/nf_conntrack_zones.h>
  33#include <net/netfilter/nf_conntrack_helper.h>
  34#include <net/netfilter/nf_conntrack_acct.h>
  35#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
  36#include <net/netfilter/nf_conntrack_act_ct.h>
  37#include <net/netfilter/nf_conntrack_seqadj.h>
  38#include <uapi/linux/netfilter/nf_nat.h>
  39
  40static struct workqueue_struct *act_ct_wq;
  41static struct rhashtable zones_ht;
  42static DEFINE_MUTEX(zones_mutex);
  43
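/* Flow tables are kept per (netns, zone) pair and shared by every ct action
 * using that zone.  They are refcounted and looked up through zones_ht,
 * keyed by struct zones_ht_key below.
 */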
  44struct zones_ht_key {
  45	struct net *net;
  46	u16 zone;
  47};
  48
  49struct tcf_ct_flow_table {
  50	struct rhash_head node; /* In zones tables */
  51
  52	struct rcu_work rwork;
  53	struct nf_flowtable nf_ft;
  54	refcount_t ref;
  55	struct zones_ht_key key;
  56
  57	bool dying;
  58};
  59
  60static const struct rhashtable_params zones_params = {
  61	.head_offset = offsetof(struct tcf_ct_flow_table, node),
  62	.key_offset = offsetof(struct tcf_ct_flow_table, key),
  63	.key_len = offsetofend(struct zones_ht_key, zone),
  64	.automatic_shrinking = true,
  65};
  66
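/* Claim the next unused entry in the flow_action array and bump num_entries;
 * the caller fills in the returned entry.
 */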
  67static struct flow_action_entry *
  68tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action)
  69{
  70	int i = flow_action->num_entries++;
  71
  72	return &flow_action->entries[i];
  73}
  74
  75static void tcf_ct_add_mangle_action(struct flow_action *action,
  76				     enum flow_action_mangle_base htype,
  77				     u32 offset,
  78				     u32 mask,
  79				     u32 val)
  80{
  81	struct flow_action_entry *entry;
  82
  83	entry = tcf_ct_flow_table_flow_action_get_next(action);
  84	entry->id = FLOW_ACTION_MANGLE;
  85	entry->mangle.htype = htype;
  86	entry->mangle.mask = ~mask;
  87	entry->mangle.offset = offset;
  88	entry->mangle.val = val;
  89}
  90
   91/* The following NAT helper functions check whether the inverted reverse tuple
   92 * (target) differs from the current direction's tuple - meaning NAT of the
   93 * ports and/or IP addresses is needed - and add the relevant mangle actions.
   94 */
  95static void
  96tcf_ct_flow_table_add_action_nat_ipv4(const struct nf_conntrack_tuple *tuple,
  97				      struct nf_conntrack_tuple target,
  98				      struct flow_action *action)
  99{
 100	if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
 101		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
 102					 offsetof(struct iphdr, saddr),
 103					 0xFFFFFFFF,
 104					 be32_to_cpu(target.src.u3.ip));
 105	if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
 106		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
 107					 offsetof(struct iphdr, daddr),
 108					 0xFFFFFFFF,
 109					 be32_to_cpu(target.dst.u3.ip));
 110}
 111
 112static void
 113tcf_ct_add_ipv6_addr_mangle_action(struct flow_action *action,
 114				   union nf_inet_addr *addr,
 115				   u32 offset)
 116{
 117	int i;
 118
 119	for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++)
 120		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
 121					 i * sizeof(u32) + offset,
 122					 0xFFFFFFFF, be32_to_cpu(addr->ip6[i]));
 123}
 124
 125static void
 126tcf_ct_flow_table_add_action_nat_ipv6(const struct nf_conntrack_tuple *tuple,
 127				      struct nf_conntrack_tuple target,
 128				      struct flow_action *action)
 129{
 130	if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
 131		tcf_ct_add_ipv6_addr_mangle_action(action, &target.src.u3,
 132						   offsetof(struct ipv6hdr,
 133							    saddr));
 134	if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
 135		tcf_ct_add_ipv6_addr_mangle_action(action, &target.dst.u3,
 136						   offsetof(struct ipv6hdr,
 137							    daddr));
 138}
 139
 140static void
 141tcf_ct_flow_table_add_action_nat_tcp(const struct nf_conntrack_tuple *tuple,
 142				     struct nf_conntrack_tuple target,
 143				     struct flow_action *action)
 144{
 145	__be16 target_src = target.src.u.tcp.port;
 146	__be16 target_dst = target.dst.u.tcp.port;
 147
 148	if (target_src != tuple->src.u.tcp.port)
 149		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
 150					 offsetof(struct tcphdr, source),
 151					 0xFFFF, be16_to_cpu(target_src));
 152	if (target_dst != tuple->dst.u.tcp.port)
 153		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
 154					 offsetof(struct tcphdr, dest),
 155					 0xFFFF, be16_to_cpu(target_dst));
 156}
 157
 158static void
 159tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple,
 160				     struct nf_conntrack_tuple target,
 161				     struct flow_action *action)
 162{
 163	__be16 target_src = target.src.u.udp.port;
 164	__be16 target_dst = target.dst.u.udp.port;
 165
 166	if (target_src != tuple->src.u.udp.port)
 167		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
 168					 offsetof(struct udphdr, source),
 169					 0xFFFF, be16_to_cpu(target_src));
 170	if (target_dst != tuple->dst.u.udp.port)
 171		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
 172					 offsetof(struct udphdr, dest),
 173					 0xFFFF, be16_to_cpu(target_dst));
 174}
 175
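/* Emit a FLOW_ACTION_CT_METADATA entry carrying the conntrack mark, labels,
 * direction and a cookie (ct pointer | ctinfo, the same encoding nf_ct_set()
 * uses) for drivers that offload the flow to hardware.
 */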
 176static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
 177					      enum ip_conntrack_dir dir,
 178					      enum ip_conntrack_info ctinfo,
 179					      struct flow_action *action)
 180{
 181	struct nf_conn_labels *ct_labels;
 182	struct flow_action_entry *entry;
 183	u32 *act_ct_labels;
 184
 185	entry = tcf_ct_flow_table_flow_action_get_next(action);
 186	entry->id = FLOW_ACTION_CT_METADATA;
 187#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
 188	entry->ct_metadata.mark = READ_ONCE(ct->mark);
 189#endif
  190	/* aligns with how nf_ct_set() stores the CT reference in skb->_nfct */
 191	entry->ct_metadata.cookie = (unsigned long)ct | ctinfo;
 192	entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL;
 193
 194	act_ct_labels = entry->ct_metadata.labels;
 195	ct_labels = nf_ct_labels_find(ct);
 196	if (ct_labels)
 197		memcpy(act_ct_labels, ct_labels->bits, NF_CT_LABELS_MAX_SIZE);
 198	else
 199		memset(act_ct_labels, 0, NF_CT_LABELS_MAX_SIZE);
 200}
 201
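/* If the connection is NATed, compare the inverted reply tuple against the
 * tuple of @dir and emit the mangle actions needed to rewrite the addresses
 * and ports.  Only IPv4/IPv6 with TCP or UDP is supported.
 */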
 202static int tcf_ct_flow_table_add_action_nat(struct net *net,
 203					    struct nf_conn *ct,
 204					    enum ip_conntrack_dir dir,
 205					    struct flow_action *action)
 206{
 207	const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
 208	struct nf_conntrack_tuple target;
 209
 210	if (!(ct->status & IPS_NAT_MASK))
 211		return 0;
 212
 213	nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);
 214
 215	switch (tuple->src.l3num) {
 216	case NFPROTO_IPV4:
 217		tcf_ct_flow_table_add_action_nat_ipv4(tuple, target,
 218						      action);
 219		break;
 220	case NFPROTO_IPV6:
 221		tcf_ct_flow_table_add_action_nat_ipv6(tuple, target,
 222						      action);
 223		break;
 224	default:
 225		return -EOPNOTSUPP;
 226	}
 227
 228	switch (nf_ct_protonum(ct)) {
 229	case IPPROTO_TCP:
 230		tcf_ct_flow_table_add_action_nat_tcp(tuple, target, action);
 231		break;
 232	case IPPROTO_UDP:
 233		tcf_ct_flow_table_add_action_nat_udp(tuple, target, action);
 234		break;
 235	default:
 236		return -EOPNOTSUPP;
 237	}
 238
 239	return 0;
 240}
 241
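/* ->action() callback of the flowtable type below: translate one direction
 * of a flow_offload entry into NAT mangle actions plus a CT metadata action.
 * On failure, entries added here are cleared again.
 */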
 242static int tcf_ct_flow_table_fill_actions(struct net *net,
 243					  struct flow_offload *flow,
 244					  enum flow_offload_tuple_dir tdir,
 245					  struct nf_flow_rule *flow_rule)
 246{
 247	struct flow_action *action = &flow_rule->rule->action;
 248	int num_entries = action->num_entries;
 249	struct nf_conn *ct = flow->ct;
 250	enum ip_conntrack_info ctinfo;
 251	enum ip_conntrack_dir dir;
 252	int i, err;
 253
 254	switch (tdir) {
 255	case FLOW_OFFLOAD_DIR_ORIGINAL:
 256		dir = IP_CT_DIR_ORIGINAL;
 257		ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
 258			IP_CT_ESTABLISHED : IP_CT_NEW;
 259		if (ctinfo == IP_CT_ESTABLISHED)
 260			set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
 261		break;
 262	case FLOW_OFFLOAD_DIR_REPLY:
 263		dir = IP_CT_DIR_REPLY;
 264		ctinfo = IP_CT_ESTABLISHED_REPLY;
 265		break;
 266	default:
 267		return -EOPNOTSUPP;
 268	}
 269
 270	err = tcf_ct_flow_table_add_action_nat(net, ct, dir, action);
 271	if (err)
 272		goto err_nat;
 273
 274	tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action);
 275	return 0;
 276
 277err_nat:
 278	/* Clear filled actions */
 279	for (i = num_entries; i < action->num_entries; i++)
 280		memset(&action->entries[i], 0, sizeof(action->entries[i]));
 281	action->num_entries = num_entries;
 282
 283	return err;
 284}
 285
 286static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow)
 287{
 288	return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) &&
 289	       test_bit(IPS_HW_OFFLOAD_BIT, &flow->ct->status) &&
 290	       !test_bit(NF_FLOW_HW_PENDING, &flow->flags) &&
 291	       !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
 292}
 293
 294static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft);
 295
 296static void tcf_ct_nf_get(struct nf_flowtable *ft)
 297{
 298	struct tcf_ct_flow_table *ct_ft =
 299		container_of(ft, struct tcf_ct_flow_table, nf_ft);
 300
 301	tcf_ct_flow_table_get_ref(ct_ft);
 302}
 303
 304static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft);
 305
 306static void tcf_ct_nf_put(struct nf_flowtable *ft)
 307{
 308	struct tcf_ct_flow_table *ct_ft =
 309		container_of(ft, struct tcf_ct_flow_table, nf_ft);
 310
 311	tcf_ct_flow_table_put(ct_ft);
 312}
 313
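/* Callbacks registered with the netfilter flow table core.  ->gc flags an
 * offloaded entry as outdated once the connection has seen a reply but the
 * entry was never refreshed to the established state.
 */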
 314static struct nf_flowtable_type flowtable_ct = {
 315	.gc		= tcf_ct_flow_is_outdated,
 316	.action		= tcf_ct_flow_table_fill_actions,
 317	.get		= tcf_ct_nf_get,
 318	.put		= tcf_ct_nf_put,
 319	.owner		= THIS_MODULE,
 320};
 321
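/* Look up the flow table for (net, zone) under zones_mutex, or allocate and
 * register a new one.  A reference is taken either way and stored in @params.
 */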
 322static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params)
 323{
 324	struct zones_ht_key key = { .net = net, .zone = params->zone };
 325	struct tcf_ct_flow_table *ct_ft;
 326	int err = -ENOMEM;
 327
 328	mutex_lock(&zones_mutex);
 329	ct_ft = rhashtable_lookup_fast(&zones_ht, &key, zones_params);
 330	if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
 331		goto out_unlock;
 332
 333	ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
 334	if (!ct_ft)
 335		goto err_alloc;
 336	refcount_set(&ct_ft->ref, 1);
 337
 338	ct_ft->key = key;
 339	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
 340	if (err)
 341		goto err_insert;
 342
 343	ct_ft->nf_ft.type = &flowtable_ct;
 344	ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD |
 345			      NF_FLOWTABLE_COUNTER;
 346	err = nf_flow_table_init(&ct_ft->nf_ft);
 347	if (err)
 348		goto err_init;
 349	write_pnet(&ct_ft->nf_ft.net, net);
 350
 351	__module_get(THIS_MODULE);
 352out_unlock:
 353	params->ct_ft = ct_ft;
 354	params->nf_ft = &ct_ft->nf_ft;
 355	mutex_unlock(&zones_mutex);
 356
 357	return 0;
 358
 359err_init:
 360	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
 361err_insert:
 362	kfree(ct_ft);
 363err_alloc:
 364	mutex_unlock(&zones_mutex);
 365	return err;
 366}
 367
 368static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft)
 369{
 370	refcount_inc(&ct_ft->ref);
 371}
 372
 373static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
 374{
 375	struct tcf_ct_flow_table *ct_ft;
 376	struct flow_block *block;
 377
 378	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
 379			     rwork);
 380	nf_flow_table_free(&ct_ft->nf_ft);
 381
 382	block = &ct_ft->nf_ft.flow_block;
 383	down_write(&ct_ft->nf_ft.flow_block_lock);
 384	WARN_ON(!list_empty(&block->cb_list));
 385	up_write(&ct_ft->nf_ft.flow_block_lock);
 386	kfree(ct_ft);
 387
 388	module_put(THIS_MODULE);
 389}
 390
 391static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft)
 392{
 393	if (refcount_dec_and_test(&ct_ft->ref)) {
 394		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
 395		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
 396		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
 397	}
 398}
 399
 400static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry,
 401				 struct nf_conn_act_ct_ext *act_ct_ext, u8 dir)
 402{
 403	entry->tuplehash[dir].tuple.xmit_type = FLOW_OFFLOAD_XMIT_TC;
 404	entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir];
 405}
 406
 407static void tcf_ct_flow_ct_ext_ifidx_update(struct flow_offload *entry)
 408{
 409	struct nf_conn_act_ct_ext *act_ct_ext;
 410
 411	act_ct_ext = nf_conn_act_ct_ext_find(entry->ct);
 412	if (act_ct_ext) {
 413		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL);
 414		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY);
 415	}
 416}
 417
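/* Install @ct into the zone's flow table.  IPS_OFFLOAD_BIT guards against
 * double insertion; for TCP, window tracking is made liberal because
 * offloaded packets no longer update the conntrack state.
 */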
 418static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
 419				  struct nf_conn *ct,
 420				  bool tcp, bool bidirectional)
 421{
 422	struct nf_conn_act_ct_ext *act_ct_ext;
 423	struct flow_offload *entry;
 424	int err;
 425
 426	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
 427		return;
 428
 429	entry = flow_offload_alloc(ct);
 430	if (!entry) {
 431		WARN_ON_ONCE(1);
 432		goto err_alloc;
 433	}
 434
 435	if (tcp) {
 436		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
 437		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
 438	}
 439	if (bidirectional)
 440		__set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags);
 441
 442	act_ct_ext = nf_conn_act_ct_ext_find(ct);
 443	if (act_ct_ext) {
 444		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL);
 445		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY);
 446	}
 447
 448	err = flow_offload_add(&ct_ft->nf_ft, entry);
 449	if (err)
 450		goto err_add;
 451
 452	return;
 453
 454err_add:
 455	flow_offload_free(entry);
 456err_alloc:
 457	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
 458}
 459
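/* Decide whether a connection is eligible for flow table offload:
 * established and assured TCP, confirmed UDP (bidirectional only once
 * assured) or keyless GRE without NAT, and never connections that use a
 * helper or sequence adjustment.
 */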
 460static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
 461					   struct nf_conn *ct,
 462					   enum ip_conntrack_info ctinfo)
 463{
 464	bool tcp = false, bidirectional = true;
 465
 466	switch (nf_ct_protonum(ct)) {
 467	case IPPROTO_TCP:
 468		if ((ctinfo != IP_CT_ESTABLISHED &&
 469		     ctinfo != IP_CT_ESTABLISHED_REPLY) ||
 470		    !test_bit(IPS_ASSURED_BIT, &ct->status) ||
 471		    ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
 472			return;
 473
 474		tcp = true;
 475		break;
 476	case IPPROTO_UDP:
 477		if (!nf_ct_is_confirmed(ct))
 478			return;
 479		if (!test_bit(IPS_ASSURED_BIT, &ct->status))
 480			bidirectional = false;
 481		break;
 482#ifdef CONFIG_NF_CT_PROTO_GRE
 483	case IPPROTO_GRE: {
 484		struct nf_conntrack_tuple *tuple;
 485
 486		if ((ctinfo != IP_CT_ESTABLISHED &&
 487		     ctinfo != IP_CT_ESTABLISHED_REPLY) ||
 488		    !test_bit(IPS_ASSURED_BIT, &ct->status) ||
 489		    ct->status & IPS_NAT_MASK)
 490			return;
 491
 492		tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
 493		/* No support for GRE v1 */
 494		if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
 495			return;
 496		break;
 497	}
 498#endif
 499	default:
 500		return;
 501	}
 502
 503	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
 504	    ct->status & IPS_SEQ_ADJUST)
 505		return;
 506
 507	tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional);
 508}
 509
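/* Parse the IPv4 and L4 headers into a flow_offload_tuple for the flow table
 * lookup.  Fragments, IP options, TTL <= 1 and unsupported protocols make the
 * packet ineligible for the fast path.
 */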
 510static bool
 511tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
 512				  struct flow_offload_tuple *tuple,
 513				  struct tcphdr **tcph)
 514{
 515	struct flow_ports *ports;
 516	unsigned int thoff;
 517	struct iphdr *iph;
 518	size_t hdrsize;
 519	u8 ipproto;
 520
 521	if (!pskb_network_may_pull(skb, sizeof(*iph)))
 522		return false;
 523
 524	iph = ip_hdr(skb);
 525	thoff = iph->ihl * 4;
 526
 527	if (ip_is_fragment(iph) ||
 528	    unlikely(thoff != sizeof(struct iphdr)))
 529		return false;
 530
 531	ipproto = iph->protocol;
 532	switch (ipproto) {
 533	case IPPROTO_TCP:
 534		hdrsize = sizeof(struct tcphdr);
 535		break;
 536	case IPPROTO_UDP:
 537		hdrsize = sizeof(*ports);
 538		break;
 539#ifdef CONFIG_NF_CT_PROTO_GRE
 540	case IPPROTO_GRE:
 541		hdrsize = sizeof(struct gre_base_hdr);
 542		break;
 543#endif
 544	default:
 545		return false;
 546	}
 547
 548	if (iph->ttl <= 1)
 549		return false;
 550
 551	if (!pskb_network_may_pull(skb, thoff + hdrsize))
 552		return false;
 553
 554	switch (ipproto) {
 555	case IPPROTO_TCP:
 556		*tcph = (void *)(skb_network_header(skb) + thoff);
 557		fallthrough;
 558	case IPPROTO_UDP:
 559		ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
 560		tuple->src_port = ports->source;
 561		tuple->dst_port = ports->dest;
 562		break;
 563	case IPPROTO_GRE: {
 564		struct gre_base_hdr *greh;
 565
 566		greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
 567		if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
 568			return false;
 569		break;
 570	}
 571	}
 572
 573	iph = ip_hdr(skb);
 574
 575	tuple->src_v4.s_addr = iph->saddr;
 576	tuple->dst_v4.s_addr = iph->daddr;
 577	tuple->l3proto = AF_INET;
 578	tuple->l4proto = ipproto;
 579
 580	return true;
 581}
 582
 583static bool
 584tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
 585				  struct flow_offload_tuple *tuple,
 586				  struct tcphdr **tcph)
 587{
 588	struct flow_ports *ports;
 589	struct ipv6hdr *ip6h;
 590	unsigned int thoff;
 591	size_t hdrsize;
 592	u8 nexthdr;
 593
 594	if (!pskb_network_may_pull(skb, sizeof(*ip6h)))
 595		return false;
 596
 597	ip6h = ipv6_hdr(skb);
 598	thoff = sizeof(*ip6h);
 599
 600	nexthdr = ip6h->nexthdr;
 601	switch (nexthdr) {
 602	case IPPROTO_TCP:
 603		hdrsize = sizeof(struct tcphdr);
 604		break;
 605	case IPPROTO_UDP:
 606		hdrsize = sizeof(*ports);
 607		break;
 608#ifdef CONFIG_NF_CT_PROTO_GRE
 609	case IPPROTO_GRE:
 610		hdrsize = sizeof(struct gre_base_hdr);
 611		break;
 612#endif
 613	default:
 614		return false;
 615	}
 616
 617	if (ip6h->hop_limit <= 1)
 618		return false;
 619
 620	if (!pskb_network_may_pull(skb, thoff + hdrsize))
 621		return false;
 622
 623	switch (nexthdr) {
 624	case IPPROTO_TCP:
 625		*tcph = (void *)(skb_network_header(skb) + thoff);
 626		fallthrough;
 627	case IPPROTO_UDP:
 628		ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
 629		tuple->src_port = ports->source;
 630		tuple->dst_port = ports->dest;
 631		break;
 632	case IPPROTO_GRE: {
 633		struct gre_base_hdr *greh;
 634
 635		greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
 636		if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
 637			return false;
 638		break;
 639	}
 640	}
 641
 642	ip6h = ipv6_hdr(skb);
 643
 644	tuple->src_v6 = ip6h->saddr;
 645	tuple->dst_v6 = ip6h->daddr;
 646	tuple->l3proto = AF_INET6;
 647	tuple->l4proto = nexthdr;
 648
 649	return true;
 650}
 651
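/* Fast path: look the packet up in the zone's flow table and, on a hit,
 * attach the cached conntrack entry to the skb without going through
 * nf_conntrack_in().  Returns false when the packet must take the slow path.
 */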
 652static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
 653				     struct sk_buff *skb,
 654				     u8 family)
 655{
 656	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
 657	struct flow_offload_tuple_rhash *tuplehash;
 658	struct flow_offload_tuple tuple = {};
 659	enum ip_conntrack_info ctinfo;
 660	struct tcphdr *tcph = NULL;
 661	bool force_refresh = false;
 662	struct flow_offload *flow;
 663	struct nf_conn *ct;
 664	u8 dir;
 665
 666	switch (family) {
 667	case NFPROTO_IPV4:
 668		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph))
 669			return false;
 670		break;
 671	case NFPROTO_IPV6:
 672		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph))
 673			return false;
 674		break;
 675	default:
 676		return false;
 677	}
 678
 679	tuplehash = flow_offload_lookup(nf_ft, &tuple);
 680	if (!tuplehash)
 681		return false;
 682
 683	dir = tuplehash->tuple.dir;
 684	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
 685	ct = flow->ct;
 686
 687	if (dir == FLOW_OFFLOAD_DIR_REPLY &&
 688	    !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) {
  689		/* Only offload the reply direction after the connection has
  690		 * become assured.
  691		 */
 692		if (test_bit(IPS_ASSURED_BIT, &ct->status))
 693			set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
 694		else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags))
 695			/* If flow_table flow has already been updated to the
 696			 * established state, then don't refresh.
 697			 */
 698			return false;
 699		force_refresh = true;
 700	}
 701
 702	if (tcph && (unlikely(tcph->fin || tcph->rst))) {
 703		flow_offload_teardown(flow);
 704		return false;
 705	}
 706
 707	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL)
 708		ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
 709			IP_CT_ESTABLISHED : IP_CT_NEW;
 710	else
 711		ctinfo = IP_CT_ESTABLISHED_REPLY;
 712
 713	nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
 714	tcf_ct_flow_ct_ext_ifidx_update(flow);
 715	flow_offload_refresh(nf_ft, flow, force_refresh);
 716	if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
 717		/* Process this flow in SW to allow promoting to ASSURED */
 718		return false;
 719	}
 720
 721	nf_conntrack_get(&ct->ct_general);
 722	nf_ct_set(skb, ct, ctinfo);
 723	if (nf_ft->flags & NF_FLOWTABLE_COUNTER)
 724		nf_ct_acct_update(ct, dir, skb->len);
 725
 726	return true;
 727}
 728
 729static int tcf_ct_flow_tables_init(void)
 730{
 731	return rhashtable_init(&zones_ht, &zones_params);
 732}
 733
 734static void tcf_ct_flow_tables_uninit(void)
 735{
 736	rhashtable_destroy(&zones_ht);
 737}
 738
 739static struct tc_action_ops act_ct_ops;
 740
 741struct tc_ct_action_net {
 742	struct tc_action_net tn; /* Must be first */
 743};
 744
 745/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
 746static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
 747				   struct tcf_ct_params *p)
 748{
 749	enum ip_conntrack_info ctinfo;
 750	struct nf_conn *ct;
 751
 752	ct = nf_ct_get(skb, &ctinfo);
 753	if (!ct)
 754		return false;
 755	if (!net_eq(net, read_pnet(&ct->ct_net)))
 756		goto drop_ct;
 757	if (nf_ct_zone(ct)->id != p->zone)
 758		goto drop_ct;
 759	if (p->helper) {
 760		struct nf_conn_help *help;
 761
 762		help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
 763		if (help && rcu_access_pointer(help->helper) != p->helper)
 764			goto drop_ct;
 765	}
 766
 767	/* Force conntrack entry direction. */
 768	if ((p->ct_action & TCA_CT_ACT_FORCE) &&
 769	    CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
 770		if (nf_ct_is_confirmed(ct))
 771			nf_ct_kill(ct);
 772
 773		goto drop_ct;
 774	}
 775
 776	return true;
 777
 778drop_ct:
 779	nf_ct_put(ct);
 780	nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
 781
 782	return false;
 783}
 784
 785static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
 786{
 787	u8 family = NFPROTO_UNSPEC;
 788
 789	switch (skb_protocol(skb, true)) {
 790	case htons(ETH_P_IP):
 791		family = NFPROTO_IPV4;
 792		break;
 793	case htons(ETH_P_IPV6):
 794		family = NFPROTO_IPV6;
 795		break;
 796	default:
 797		break;
 798	}
 799
 800	return family;
 801}
 802
 803static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag)
 804{
 805	unsigned int len;
 806
 807	len =  skb_network_offset(skb) + sizeof(struct iphdr);
 808	if (unlikely(skb->len < len))
 809		return -EINVAL;
 810	if (unlikely(!pskb_may_pull(skb, len)))
 811		return -ENOMEM;
 812
 813	*frag = ip_is_fragment(ip_hdr(skb));
 814	return 0;
 815}
 816
 817static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag)
 818{
 819	unsigned int flags = 0, len, payload_ofs = 0;
 820	unsigned short frag_off;
 821	int nexthdr;
 822
 823	len =  skb_network_offset(skb) + sizeof(struct ipv6hdr);
 824	if (unlikely(skb->len < len))
 825		return -EINVAL;
 826	if (unlikely(!pskb_may_pull(skb, len)))
 827		return -ENOMEM;
 828
 829	nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
 830	if (unlikely(nexthdr < 0))
 831		return -EPROTO;
 832
 833	*frag = flags & IP6_FH_F_FRAG;
 834	return 0;
 835}
 836
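/* Defragment IPv4/IPv6 packets before they reach conntrack.  When a
 * fragmented packet is reassembled, *defrag is set and the MRU is recorded
 * in the tc skb cb so the packet can be re-fragmented on transmit.
 */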
 837static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 838				   u8 family, u16 zone, bool *defrag)
 839{
 840	enum ip_conntrack_info ctinfo;
 841	struct nf_conn *ct;
 842	int err = 0;
 843	bool frag;
 844	u8 proto;
 845	u16 mru;
 846
 847	/* Previously seen (loopback)? Ignore. */
 848	ct = nf_ct_get(skb, &ctinfo);
 849	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
 850		return 0;
 851
 852	if (family == NFPROTO_IPV4)
 853		err = tcf_ct_ipv4_is_fragment(skb, &frag);
 854	else
 855		err = tcf_ct_ipv6_is_fragment(skb, &frag);
 856	if (err || !frag)
 857		return err;
 858
 859	err = nf_ct_handle_fragments(net, skb, zone, family, &proto, &mru);
 860	if (err)
 861		return err;
 862
 863	*defrag = true;
 864	tc_skb_cb(skb)->mru = mru;
 865
 866	return 0;
 867}
 868
 869static void tcf_ct_params_free(struct tcf_ct_params *params)
 870{
 871	if (params->helper) {
 872#if IS_ENABLED(CONFIG_NF_NAT)
 873		if (params->ct_action & TCA_CT_ACT_NAT)
 874			nf_nat_helper_put(params->helper);
 875#endif
 876		nf_conntrack_helper_put(params->helper);
 877	}
 878	if (params->ct_ft)
 879		tcf_ct_flow_table_put(params->ct_ft);
 880	if (params->tmpl) {
 881		if (params->put_labels)
 882			nf_connlabels_put(nf_ct_net(params->tmpl));
 883
 884		nf_ct_put(params->tmpl);
 885	}
 886
 887	kfree(params);
 888}
 889
 890static void tcf_ct_params_free_rcu(struct rcu_head *head)
 891{
 892	struct tcf_ct_params *params;
 893
 894	params = container_of(head, struct tcf_ct_params, rcu);
 895	tcf_ct_params_free(params);
 896}
 897
 898static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
 899{
 900#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
 901	u32 new_mark;
 902
 903	if (!mask)
 904		return;
 905
 906	new_mark = mark | (READ_ONCE(ct->mark) & ~(mask));
 907	if (READ_ONCE(ct->mark) != new_mark) {
 908		WRITE_ONCE(ct->mark, new_mark);
 909		if (nf_ct_is_confirmed(ct))
 910			nf_conntrack_event_cache(IPCT_MARK, ct);
 911	}
 912#endif
 913}
 914
 915static void tcf_ct_act_set_labels(struct nf_conn *ct,
 916				  u32 *labels,
 917				  u32 *labels_m)
 918{
 919#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
 920	size_t labels_sz = sizeof_field(struct tcf_ct_params, labels);
 921
 922	if (!memchr_inv(labels_m, 0, labels_sz))
 923		return;
 924
 925	nf_connlabels_replace(ct, labels, labels_m, 4);
 926#endif
 927}
 928
 929static int tcf_ct_act_nat(struct sk_buff *skb,
 930			  struct nf_conn *ct,
 931			  enum ip_conntrack_info ctinfo,
 932			  int ct_action,
 933			  struct nf_nat_range2 *range,
 934			  bool commit)
 935{
 936#if IS_ENABLED(CONFIG_NF_NAT)
 937	int err, action = 0;
 938
 939	if (!(ct_action & TCA_CT_ACT_NAT))
 940		return NF_ACCEPT;
 941	if (ct_action & TCA_CT_ACT_NAT_SRC)
 942		action |= BIT(NF_NAT_MANIP_SRC);
 943	if (ct_action & TCA_CT_ACT_NAT_DST)
 944		action |= BIT(NF_NAT_MANIP_DST);
 945
 946	err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit);
 947	if (err != NF_ACCEPT)
 948		return err & NF_VERDICT_MASK;
 949
 950	if (action & BIT(NF_NAT_MANIP_SRC))
 951		tc_skb_cb(skb)->post_ct_snat = 1;
 952	if (action & BIT(NF_NAT_MANIP_DST))
 953		tc_skb_cb(skb)->post_ct_dnat = 1;
 954
 955	return err;
 956#else
 957	return NF_ACCEPT;
 958#endif
 959}
 960
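/* Main packet path of the ct action: defragment, run (or reuse) conntrack in
 * the configured zone, apply NAT, mark and labels, optionally commit, and
 * consider the connection for flow table offload.
 *
 * Illustrative userspace usage (the exact syntax depends on the installed
 * iproute2 version):
 *
 *   tc filter add dev eth0 ingress proto ip flower ct_state -trk \
 *       action ct zone 1 pipe action goto chain 1
 *   tc filter add dev eth0 ingress chain 1 proto ip flower \
 *       ct_state +trk+new action ct zone 1 commit pipe \
 *       action mirred egress redirect dev eth1
 */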
 961TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 962				 struct tcf_result *res)
 963{
 964	struct net *net = dev_net(skb->dev);
 965	enum ip_conntrack_info ctinfo;
 966	struct tcf_ct *c = to_ct(a);
 967	struct nf_conn *tmpl = NULL;
 968	struct nf_hook_state state;
 969	bool cached, commit, clear;
 970	int nh_ofs, err, retval;
 971	struct tcf_ct_params *p;
 972	bool add_helper = false;
 973	bool skip_add = false;
 974	bool defrag = false;
 975	struct nf_conn *ct;
 976	u8 family;
 977
 978	p = rcu_dereference_bh(c->params);
 979
 980	retval = READ_ONCE(c->tcf_action);
 981	commit = p->ct_action & TCA_CT_ACT_COMMIT;
 982	clear = p->ct_action & TCA_CT_ACT_CLEAR;
 983	tmpl = p->tmpl;
 984
 985	tcf_lastuse_update(&c->tcf_tm);
 986	tcf_action_update_bstats(&c->common, skb);
 987
 988	if (clear) {
 989		tc_skb_cb(skb)->post_ct = false;
 990		ct = nf_ct_get(skb, &ctinfo);
 991		if (ct) {
 992			nf_ct_put(ct);
 993			nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
 994		}
 995
 996		goto out_clear;
 997	}
 998
 999	family = tcf_ct_skb_nf_family(skb);
1000	if (family == NFPROTO_UNSPEC)
1001		goto drop;
1002
 1003	/* The conntrack module expects to be working at L3.
 1004	 * We also try to pull the IPv4/6 header into the linear area.
1005	 */
1006	nh_ofs = skb_network_offset(skb);
1007	skb_pull_rcsum(skb, nh_ofs);
1008	err = tcf_ct_handle_fragments(net, skb, family, p->zone, &defrag);
1009	if (err)
1010		goto out_frag;
1011
1012	err = nf_ct_skb_network_trim(skb, family);
1013	if (err)
1014		goto drop;
1015
1016	/* If we are recirculating packets to match on ct fields and
1017	 * committing with a separate ct action, then we don't need to
1018	 * actually run the packet through conntrack twice unless it's for a
1019	 * different zone.
1020	 */
1021	cached = tcf_ct_skb_nfct_cached(net, skb, p);
1022	if (!cached) {
1023		if (tcf_ct_flow_table_lookup(p, skb, family)) {
1024			skip_add = true;
1025			goto do_nat;
1026		}
1027
1028		/* Associate skb with specified zone. */
1029		if (tmpl) {
1030			nf_conntrack_put(skb_nfct(skb));
1031			nf_conntrack_get(&tmpl->ct_general);
1032			nf_ct_set(skb, tmpl, IP_CT_NEW);
1033		}
1034
1035		state.hook = NF_INET_PRE_ROUTING;
1036		state.net = net;
1037		state.pf = family;
1038		err = nf_conntrack_in(skb, &state);
1039		if (err != NF_ACCEPT)
1040			goto nf_error;
1041	}
1042
1043do_nat:
1044	ct = nf_ct_get(skb, &ctinfo);
1045	if (!ct)
1046		goto out_push;
1047	nf_ct_deliver_cached_events(ct);
1048	nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
1049
1050	err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit);
1051	if (err != NF_ACCEPT)
1052		goto nf_error;
1053
1054	if (!nf_ct_is_confirmed(ct) && commit && p->helper && !nfct_help(ct)) {
1055		err = __nf_ct_try_assign_helper(ct, p->tmpl, GFP_ATOMIC);
1056		if (err)
1057			goto drop;
1058		add_helper = true;
1059		if (p->ct_action & TCA_CT_ACT_NAT && !nfct_seqadj(ct)) {
1060			if (!nfct_seqadj_ext_add(ct))
1061				goto drop;
1062		}
1063	}
1064
1065	if (nf_ct_is_confirmed(ct) ? ((!cached && !skip_add) || add_helper) : commit) {
1066		err = nf_ct_helper(skb, ct, ctinfo, family);
1067		if (err != NF_ACCEPT)
1068			goto nf_error;
1069	}
1070
1071	if (commit) {
1072		tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
1073		tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);
1074
1075		if (!nf_ct_is_confirmed(ct))
1076			nf_conn_act_ct_ext_add(skb, ct, ctinfo);
1077
1078		/* This will take care of sending queued events
1079		 * even if the connection is already confirmed.
1080		 */
1081		err = nf_conntrack_confirm(skb);
1082		if (err != NF_ACCEPT)
1083			goto nf_error;
1084
1085		/* The ct may be dropped if a clash has been resolved,
1086		 * so it's necessary to retrieve it from skb again to
1087		 * prevent UAF.
1088		 */
1089		ct = nf_ct_get(skb, &ctinfo);
1090		if (!ct)
1091			skip_add = true;
1092	}
1093
1094	if (!skip_add)
1095		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
1096
1097out_push:
1098	skb_push_rcsum(skb, nh_ofs);
1099
1100	tc_skb_cb(skb)->post_ct = true;
1101	tc_skb_cb(skb)->zone = p->zone;
1102out_clear:
1103	if (defrag)
1104		qdisc_skb_cb(skb)->pkt_len = skb->len;
1105	return retval;
1106
1107out_frag:
1108	if (err != -EINPROGRESS)
1109		tcf_action_inc_drop_qstats(&c->common);
1110	return TC_ACT_CONSUMED;
1111
1112drop:
1113	tcf_action_inc_drop_qstats(&c->common);
1114	return TC_ACT_SHOT;
1115
1116nf_error:
 1117	/* Some verdicts store extra data in the upper bits, such
 1118	 * as an errno or a queue number.
1119	 */
1120	switch (err & NF_VERDICT_MASK) {
1121	case NF_DROP:
1122		goto drop;
1123	case NF_STOLEN:
1124		tcf_action_inc_drop_qstats(&c->common);
1125		return TC_ACT_CONSUMED;
1126	default:
1127		DEBUG_NET_WARN_ON_ONCE(1);
1128		goto drop;
1129	}
1130}
1131
1132static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
1133	[TCA_CT_ACTION] = { .type = NLA_U16 },
1134	[TCA_CT_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ct)),
1135	[TCA_CT_ZONE] = { .type = NLA_U16 },
1136	[TCA_CT_MARK] = { .type = NLA_U32 },
1137	[TCA_CT_MARK_MASK] = { .type = NLA_U32 },
1138	[TCA_CT_LABELS] = { .type = NLA_BINARY,
1139			    .len = 128 / BITS_PER_BYTE },
1140	[TCA_CT_LABELS_MASK] = { .type = NLA_BINARY,
1141				 .len = 128 / BITS_PER_BYTE },
1142	[TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
1143	[TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
1144	[TCA_CT_NAT_IPV6_MIN] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
1145	[TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
1146	[TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
1147	[TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
1148	[TCA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN },
1149	[TCA_CT_HELPER_FAMILY] = { .type = NLA_U8 },
1150	[TCA_CT_HELPER_PROTO] = { .type = NLA_U8 },
1151};
1152
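/* Parse the TCA_CT_NAT_* attributes into an nf_nat_range2.  Configurations
 * enabling both SNAT and DNAT, or NAT on kernels without NAT support, are
 * rejected.
 */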
1153static int tcf_ct_fill_params_nat(struct tcf_ct_params *p,
1154				  struct tc_ct *parm,
1155				  struct nlattr **tb,
1156				  struct netlink_ext_ack *extack)
1157{
1158	struct nf_nat_range2 *range;
1159
1160	if (!(p->ct_action & TCA_CT_ACT_NAT))
1161		return 0;
1162
1163	if (!IS_ENABLED(CONFIG_NF_NAT)) {
1164		NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel");
1165		return -EOPNOTSUPP;
1166	}
1167
1168	if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
1169		return 0;
1170
1171	if ((p->ct_action & TCA_CT_ACT_NAT_SRC) &&
1172	    (p->ct_action & TCA_CT_ACT_NAT_DST)) {
1173		NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time");
1174		return -EOPNOTSUPP;
1175	}
1176
1177	range = &p->range;
1178	if (tb[TCA_CT_NAT_IPV4_MIN]) {
1179		struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX];
1180
1181		p->ipv4_range = true;
1182		range->flags |= NF_NAT_RANGE_MAP_IPS;
1183		range->min_addr.ip =
1184			nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]);
1185
1186		range->max_addr.ip =
1187			nla_get_in_addr_default(max_attr, range->min_addr.ip);
1188	} else if (tb[TCA_CT_NAT_IPV6_MIN]) {
1189		struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX];
1190
1191		p->ipv4_range = false;
1192		range->flags |= NF_NAT_RANGE_MAP_IPS;
1193		range->min_addr.in6 =
1194			nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]);
1195
1196		range->max_addr.in6 = max_attr ?
1197				      nla_get_in6_addr(max_attr) :
1198				      range->min_addr.in6;
1199	}
1200
1201	if (tb[TCA_CT_NAT_PORT_MIN]) {
1202		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
1203		range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]);
1204
1205		range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ?
1206				       nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) :
1207				       range->min_proto.all;
1208	}
1209
1210	return 0;
1211}
1212
1213static void tcf_ct_set_key_val(struct nlattr **tb,
1214			       void *val, int val_type,
1215			       void *mask, int mask_type,
1216			       int len)
1217{
1218	if (!tb[val_type])
1219		return;
1220	nla_memcpy(val, tb[val_type], len);
1221
1222	if (!mask)
1223		return;
1224
1225	if (mask_type == TCA_CT_UNSPEC || !tb[mask_type])
1226		memset(mask, 0xff, len);
1227	else
1228		nla_memcpy(mask, tb[mask_type], len);
1229}
1230
1231static int tcf_ct_fill_params(struct net *net,
1232			      struct tcf_ct_params *p,
1233			      struct tc_ct *parm,
1234			      struct nlattr **tb,
1235			      struct netlink_ext_ack *extack)
1236{
1237	struct nf_conntrack_zone zone;
1238	int err, family, proto, len;
1239	bool put_labels = false;
1240	struct nf_conn *tmpl;
1241	char *name;
1242
1243	p->zone = NF_CT_DEFAULT_ZONE_ID;
1244
1245	tcf_ct_set_key_val(tb,
1246			   &p->ct_action, TCA_CT_ACTION,
1247			   NULL, TCA_CT_UNSPEC,
1248			   sizeof(p->ct_action));
1249
1250	if (p->ct_action & TCA_CT_ACT_CLEAR)
1251		return 0;
1252
1253	err = tcf_ct_fill_params_nat(p, parm, tb, extack);
1254	if (err)
1255		return err;
1256
1257	if (tb[TCA_CT_MARK]) {
1258		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
1259			NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled.");
1260			return -EOPNOTSUPP;
1261		}
1262		tcf_ct_set_key_val(tb,
1263				   &p->mark, TCA_CT_MARK,
1264				   &p->mark_mask, TCA_CT_MARK_MASK,
1265				   sizeof(p->mark));
1266	}
1267
1268	if (tb[TCA_CT_LABELS]) {
1269		unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8;
1270
1271		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
1272			NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled.");
1273			return -EOPNOTSUPP;
1274		}
1275
1276		if (nf_connlabels_get(net, n_bits - 1)) {
1277			NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length");
1278			return -EOPNOTSUPP;
1279		} else {
1280			put_labels = true;
1281		}
1282
1283		tcf_ct_set_key_val(tb,
1284				   p->labels, TCA_CT_LABELS,
1285				   p->labels_mask, TCA_CT_LABELS_MASK,
1286				   sizeof(p->labels));
1287	}
1288
1289	if (tb[TCA_CT_ZONE]) {
1290		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
1291			NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled.");
1292			return -EOPNOTSUPP;
1293		}
1294
1295		tcf_ct_set_key_val(tb,
1296				   &p->zone, TCA_CT_ZONE,
1297				   NULL, TCA_CT_UNSPEC,
1298				   sizeof(p->zone));
1299	}
1300
1301	nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0);
1302	tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
1303	if (!tmpl) {
1304		NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
1305		return -ENOMEM;
1306	}
1307	p->tmpl = tmpl;
1308	if (tb[TCA_CT_HELPER_NAME]) {
1309		name = nla_data(tb[TCA_CT_HELPER_NAME]);
1310		len = nla_len(tb[TCA_CT_HELPER_NAME]);
1311		if (len > 16 || name[len - 1] != '\0') {
1312			NL_SET_ERR_MSG_MOD(extack, "Failed to parse helper name.");
1313			err = -EINVAL;
1314			goto err;
1315		}
1316		family = nla_get_u8_default(tb[TCA_CT_HELPER_FAMILY], AF_INET);
1317		proto = nla_get_u8_default(tb[TCA_CT_HELPER_PROTO],
1318					   IPPROTO_TCP);
1319		err = nf_ct_add_helper(tmpl, name, family, proto,
1320				       p->ct_action & TCA_CT_ACT_NAT, &p->helper);
1321		if (err) {
1322			NL_SET_ERR_MSG_MOD(extack, "Failed to add helper");
1323			goto err;
1324		}
1325	}
1326
1327	p->put_labels = put_labels;
1328
1329	if (p->ct_action & TCA_CT_ACT_COMMIT)
1330		__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
1331	return 0;
1332err:
1333	if (put_labels)
1334		nf_connlabels_put(net);
1335
1336	nf_ct_put(p->tmpl);
1337	p->tmpl = NULL;
1338	return err;
1339}
1340
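/* Netlink ->init(): parse the ct attributes, create or update the action
 * instance, build its tcf_ct_params and acquire the per-zone flow table.
 */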
1341static int tcf_ct_init(struct net *net, struct nlattr *nla,
1342		       struct nlattr *est, struct tc_action **a,
1343		       struct tcf_proto *tp, u32 flags,
1344		       struct netlink_ext_ack *extack)
1345{
1346	struct tc_action_net *tn = net_generic(net, act_ct_ops.net_id);
1347	bool bind = flags & TCA_ACT_FLAGS_BIND;
1348	struct tcf_ct_params *params = NULL;
1349	struct nlattr *tb[TCA_CT_MAX + 1];
1350	struct tcf_chain *goto_ch = NULL;
1351	struct tc_ct *parm;
1352	struct tcf_ct *c;
1353	int err, res = 0;
1354	u32 index;
1355
1356	if (!nla) {
1357		NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
1358		return -EINVAL;
1359	}
1360
1361	err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
1362	if (err < 0)
1363		return err;
1364
1365	if (!tb[TCA_CT_PARMS]) {
1366		NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters");
1367		return -EINVAL;
1368	}
1369	parm = nla_data(tb[TCA_CT_PARMS]);
1370	index = parm->index;
1371	err = tcf_idr_check_alloc(tn, &index, a, bind);
1372	if (err < 0)
1373		return err;
1374
1375	if (!err) {
1376		err = tcf_idr_create_from_flags(tn, index, est, a,
1377						&act_ct_ops, bind, flags);
1378		if (err) {
1379			tcf_idr_cleanup(tn, index);
1380			return err;
1381		}
1382		res = ACT_P_CREATED;
1383	} else {
1384		if (bind)
1385			return ACT_P_BOUND;
1386
1387		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
1388			tcf_idr_release(*a, bind);
1389			return -EEXIST;
1390		}
1391	}
1392	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
1393	if (err < 0)
1394		goto cleanup;
1395
1396	c = to_ct(*a);
1397
1398	params = kzalloc(sizeof(*params), GFP_KERNEL);
1399	if (unlikely(!params)) {
1400		err = -ENOMEM;
1401		goto cleanup;
1402	}
1403
1404	err = tcf_ct_fill_params(net, params, parm, tb, extack);
1405	if (err)
1406		goto cleanup;
1407
1408	err = tcf_ct_flow_table_get(net, params);
1409	if (err)
1410		goto cleanup;
1411
1412	spin_lock_bh(&c->tcf_lock);
1413	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
1414	params = rcu_replace_pointer(c->params, params,
1415				     lockdep_is_held(&c->tcf_lock));
1416	spin_unlock_bh(&c->tcf_lock);
1417
1418	if (goto_ch)
1419		tcf_chain_put_by_act(goto_ch);
1420	if (params)
1421		call_rcu(&params->rcu, tcf_ct_params_free_rcu);
1422
1423	return res;
1424
1425cleanup:
1426	if (goto_ch)
1427		tcf_chain_put_by_act(goto_ch);
1428	if (params)
1429		tcf_ct_params_free(params);
1430	tcf_idr_release(*a, bind);
1431	return err;
1432}
1433
1434static void tcf_ct_cleanup(struct tc_action *a)
1435{
1436	struct tcf_ct_params *params;
1437	struct tcf_ct *c = to_ct(a);
1438
1439	params = rcu_dereference_protected(c->params, 1);
1440	if (params)
1441		call_rcu(&params->rcu, tcf_ct_params_free_rcu);
1442}
1443
1444static int tcf_ct_dump_key_val(struct sk_buff *skb,
1445			       void *val, int val_type,
1446			       void *mask, int mask_type,
1447			       int len)
1448{
1449	int err;
1450
1451	if (mask && !memchr_inv(mask, 0, len))
1452		return 0;
1453
1454	err = nla_put(skb, val_type, len, val);
1455	if (err)
1456		return err;
1457
1458	if (mask_type != TCA_CT_UNSPEC) {
1459		err = nla_put(skb, mask_type, len, mask);
1460		if (err)
1461			return err;
1462	}
1463
1464	return 0;
1465}
1466
1467static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
1468{
1469	struct nf_nat_range2 *range = &p->range;
1470
1471	if (!(p->ct_action & TCA_CT_ACT_NAT))
1472		return 0;
1473
1474	if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
1475		return 0;
1476
1477	if (range->flags & NF_NAT_RANGE_MAP_IPS) {
1478		if (p->ipv4_range) {
1479			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN,
1480					    range->min_addr.ip))
1481				return -1;
1482			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX,
1483					    range->max_addr.ip))
1484				return -1;
1485		} else {
1486			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN,
1487					     &range->min_addr.in6))
1488				return -1;
1489			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX,
1490					     &range->max_addr.in6))
1491				return -1;
1492		}
1493	}
1494
1495	if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
1496		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN,
1497				 range->min_proto.all))
1498			return -1;
1499		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX,
1500				 range->max_proto.all))
1501			return -1;
1502	}
1503
1504	return 0;
1505}
1506
1507static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper)
1508{
1509	if (!helper)
1510		return 0;
1511
1512	if (nla_put_string(skb, TCA_CT_HELPER_NAME, helper->name) ||
1513	    nla_put_u8(skb, TCA_CT_HELPER_FAMILY, helper->tuple.src.l3num) ||
1514	    nla_put_u8(skb, TCA_CT_HELPER_PROTO, helper->tuple.dst.protonum))
1515		return -1;
1516
1517	return 0;
1518}
1519
1520static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
1521			      int bind, int ref)
1522{
1523	unsigned char *b = skb_tail_pointer(skb);
1524	struct tcf_ct *c = to_ct(a);
1525	struct tcf_ct_params *p;
1526
1527	struct tc_ct opt = {
1528		.index   = c->tcf_index,
1529		.refcnt  = refcount_read(&c->tcf_refcnt) - ref,
1530		.bindcnt = atomic_read(&c->tcf_bindcnt) - bind,
1531	};
1532	struct tcf_t t;
1533
1534	spin_lock_bh(&c->tcf_lock);
1535	p = rcu_dereference_protected(c->params,
1536				      lockdep_is_held(&c->tcf_lock));
1537	opt.action = c->tcf_action;
1538
1539	if (tcf_ct_dump_key_val(skb,
1540				&p->ct_action, TCA_CT_ACTION,
1541				NULL, TCA_CT_UNSPEC,
1542				sizeof(p->ct_action)))
1543		goto nla_put_failure;
1544
1545	if (p->ct_action & TCA_CT_ACT_CLEAR)
1546		goto skip_dump;
1547
1548	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
1549	    tcf_ct_dump_key_val(skb,
1550				&p->mark, TCA_CT_MARK,
1551				&p->mark_mask, TCA_CT_MARK_MASK,
1552				sizeof(p->mark)))
1553		goto nla_put_failure;
1554
1555	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
1556	    tcf_ct_dump_key_val(skb,
1557				p->labels, TCA_CT_LABELS,
1558				p->labels_mask, TCA_CT_LABELS_MASK,
1559				sizeof(p->labels)))
1560		goto nla_put_failure;
1561
1562	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
1563	    tcf_ct_dump_key_val(skb,
1564				&p->zone, TCA_CT_ZONE,
1565				NULL, TCA_CT_UNSPEC,
1566				sizeof(p->zone)))
1567		goto nla_put_failure;
1568
1569	if (tcf_ct_dump_nat(skb, p))
1570		goto nla_put_failure;
1571
1572	if (tcf_ct_dump_helper(skb, p->helper))
1573		goto nla_put_failure;
1574
1575skip_dump:
1576	if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
1577		goto nla_put_failure;
1578
1579	tcf_tm_dump(&t, &c->tcf_tm);
1580	if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
1581		goto nla_put_failure;
1582	spin_unlock_bh(&c->tcf_lock);
1583
1584	return skb->len;
1585nla_put_failure:
1586	spin_unlock_bh(&c->tcf_lock);
1587	nlmsg_trim(skb, b);
1588	return -1;
1589}
1590
1591static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets,
1592			     u64 drops, u64 lastuse, bool hw)
1593{
1594	struct tcf_ct *c = to_ct(a);
1595
1596	tcf_action_update_stats(a, bytes, packets, drops, hw);
1597	c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
1598}
1599
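/* Translate the software ct action into a FLOW_ACTION_CT entry for drivers
 * doing hardware offload.  Actions that use a conntrack helper cannot be
 * offloaded.
 */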
1600static int tcf_ct_offload_act_setup(struct tc_action *act, void *entry_data,
1601				    u32 *index_inc, bool bind,
1602				    struct netlink_ext_ack *extack)
1603{
1604	if (bind) {
1605		struct flow_action_entry *entry = entry_data;
1606
1607		if (tcf_ct_helper(act))
1608			return -EOPNOTSUPP;
1609
1610		entry->id = FLOW_ACTION_CT;
1611		entry->ct.action = tcf_ct_action(act);
1612		entry->ct.zone = tcf_ct_zone(act);
1613		entry->ct.flow_table = tcf_ct_ft(act);
1614		*index_inc = 1;
1615	} else {
1616		struct flow_offload_action *fl_action = entry_data;
1617
1618		fl_action->id = FLOW_ACTION_CT;
1619	}
1620
1621	return 0;
1622}
1623
1624static struct tc_action_ops act_ct_ops = {
1625	.kind		=	"ct",
1626	.id		=	TCA_ID_CT,
1627	.owner		=	THIS_MODULE,
1628	.act		=	tcf_ct_act,
1629	.dump		=	tcf_ct_dump,
1630	.init		=	tcf_ct_init,
1631	.cleanup	=	tcf_ct_cleanup,
1632	.stats_update	=	tcf_stats_update,
1633	.offload_act_setup =	tcf_ct_offload_act_setup,
1634	.size		=	sizeof(struct tcf_ct),
1635};
1636MODULE_ALIAS_NET_ACT("ct");
1637
1638static __net_init int ct_init_net(struct net *net)
1639{
1640	struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);
1641
1642	return tc_action_net_init(net, &tn->tn, &act_ct_ops);
1643}
1644
1645static void __net_exit ct_exit_net(struct list_head *net_list)
1646{
1647	tc_action_net_exit(net_list, act_ct_ops.net_id);
1648}
1649
1650static struct pernet_operations ct_net_ops = {
1651	.init = ct_init_net,
1652	.exit_batch = ct_exit_net,
1653	.id   = &act_ct_ops.net_id,
1654	.size = sizeof(struct tc_ct_action_net),
1655};
1656
1657static int __init ct_init_module(void)
1658{
1659	int err;
1660
1661	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
1662	if (!act_ct_wq)
1663		return -ENOMEM;
1664
1665	err = tcf_ct_flow_tables_init();
1666	if (err)
1667		goto err_tbl_init;
1668
1669	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
1670	if (err)
1671		goto err_register;
1672
1673	static_branch_inc(&tcf_frag_xmit_count);
1674
1675	return 0;
1676
1677err_register:
1678	tcf_ct_flow_tables_uninit();
1679err_tbl_init:
1680	destroy_workqueue(act_ct_wq);
1681	return err;
1682}
1683
1684static void __exit ct_cleanup_module(void)
1685{
1686	static_branch_dec(&tcf_frag_xmit_count);
1687	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
1688	tcf_ct_flow_tables_uninit();
1689	destroy_workqueue(act_ct_wq);
1690}
1691
1692module_init(ct_init_module);
1693module_exit(ct_cleanup_module);
1694MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>");
1695MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>");
1696MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>");
1697MODULE_DESCRIPTION("Connection tracking action");
1698MODULE_LICENSE("GPL v2");