aboutsummaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-06-06 18:39:49 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-06-06 18:39:49 -0700
commit1c8c5a9d38f607c0b6fd12c91cbe1a4418762a21 (patch)
treedcc97181d4d187252e0cc8fdf29d9b365fa3ffd0 /net
parent285767604576148fc1be7fcd112e4a90eb0d6ad2 (diff)
parent7170e6045a6a8b33f4fa5753589dc77b16198e2d (diff)
downloadlinux-stericsson-1c8c5a9d38f607c0b6fd12c91cbe1a4418762a21.tar.gz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: 1) Add Maglev hashing scheduler to IPVS, from Inju Song. 2) Lots of new TC subsystem tests from Roman Mashak. 3) Add TCP zero copy receive and fix delayed acks and autotuning with SO_RCVLOWAT, from Eric Dumazet. 4) Add XDP_REDIRECT support to mlx5 driver, from Jesper Dangaard Brouer. 5) Add ttl inherit support to vxlan, from Hangbin Liu. 6) Properly separate ipv6 routes into their logically independent components. fib6_info for the routing table, and fib6_nh for sets of nexthops, which thus can be shared. From David Ahern. 7) Add bpf_xdp_adjust_tail helper, which can be used to generate ICMP messages from XDP programs. From Nikita V. Shirokov. 8) Lots of long overdue cleanups to the r8169 driver, from Heiner Kallweit. 9) Add BTF ("BPF Type Format"), from Martin KaFai Lau. 10) Add traffic condition monitoring to iwlwifi, from Luca Coelho. 11) Plumb extack down into fib_rules, from Roopa Prabhu. 12) Add Flower classifier offload support to igb, from Vinicius Costa Gomes. 13) Add UDP GSO support, from Willem de Bruijn. 14) Add documentation for eBPF helpers, from Quentin Monnet. 15) Add TLS tx offload to mlx5, from Ilya Lesokhin. 16) Allow applications to be given the number of bytes available to read on a socket via a control message returned from recvmsg(), from Soheil Hassas Yeganeh. 17) Add x86_32 eBPF JIT compiler, from Wang YanQing. 18) Add AF_XDP sockets, with zerocopy support infrastructure as well. From Björn Töpel. 19) Remove indirect load support from all of the BPF JITs and handle these operations in the verifier by translating them into native BPF instead. From Daniel Borkmann. 20) Add GRO support to ipv6 gre tunnels, from Eran Ben Elisha. 21) Allow XDP programs to do lookups in the main kernel routing tables for forwarding. From David Ahern. 22) Allow drivers to store hardware state into an ELF section of kernel dump vmcore files, and use it in cxgb4. From Rahul Lakkireddy. 
23) Various RACK and loss detection improvements in TCP, from Yuchung Cheng. 24) Add TCP SACK compression, from Eric Dumazet. 25) Add User Mode Helper support and basic bpfilter infrastructure, from Alexei Starovoitov. 26) Support ports and protocol values in RTM_GETROUTE, from Roopa Prabhu. 27) Support bulking in ->ndo_xdp_xmit() API, from Jesper Dangaard Brouer. 28) Add lots of forwarding selftests, from Petr Machata. 29) Add generic network device failover driver, from Sridhar Samudrala. * ra.kernel.org:/pub/scm/linux/kernel/git/davem/net-next: (1959 commits) strparser: Add __strp_unpause and use it in ktls. rxrpc: Fix terminal retransmission connection ID to include the channel net: hns3: Optimize PF CMDQ interrupt switching process net: hns3: Fix for VF mailbox receiving unknown message net: hns3: Fix for VF mailbox cannot receiving PF response bnx2x: use the right constant Revert "net: sched: cls: Fix offloading when ingress dev is vxlan" net: dsa: b53: Fix for brcm tag issue in Cygnus SoC enic: fix UDP rss bits netdev-FAQ: clarify DaveM's position for stable backports rtnetlink: validate attributes in do_setlink() mlxsw: Add extack messages for port_{un, }split failures netdevsim: Add extack error message for devlink reload devlink: Add extack to reload and port_{un, }split operations net: metrics: add proper netlink validation ipmr: fix error path when ipmr_new_table fails ip6mr: only set ip6mr_table from setsockopt when ip6mr_new_table succeeds net: hns3: remove unused hclgevf_cfg_func_mta_filter netfilter: provide udp*_lib_lookup for nf_tproxy qed*: Utilize FW 8.37.2.0 ...
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan.c11
-rw-r--r--net/8021q/vlan.h3
-rw-r--r--net/8021q/vlan_dev.c4
-rw-r--r--net/8021q/vlan_netlink.c45
-rw-r--r--net/9p/mod.c2
-rw-r--r--net/Kconfig22
-rw-r--r--net/Makefile6
-rw-r--r--net/batman-adv/Kconfig6
-rw-r--r--net/batman-adv/bat_v_elp.c15
-rw-r--r--net/batman-adv/main.h2
-rw-r--r--net/batman-adv/multicast.c29
-rw-r--r--net/batman-adv/soft-interface.c5
-rw-r--r--net/batman-adv/types.h23
-rw-r--r--net/bluetooth/hci_core.c54
-rw-r--r--net/bluetooth/hci_debugfs.c24
-rw-r--r--net/bluetooth/hci_event.c12
-rw-r--r--net/bluetooth/hci_request.c30
-rw-r--r--net/bluetooth/smp.c12
-rw-r--r--net/bpf/test_run.c3
-rw-r--r--net/bpfilter/Kconfig16
-rw-r--r--net/bpfilter/Makefile32
-rw-r--r--net/bpfilter/bpfilter_kern.c114
-rw-r--r--net/bpfilter/main.c63
-rw-r--r--net/bpfilter/msgfmt.h17
-rw-r--r--net/bridge/br.c16
-rw-r--r--net/bridge/br_fdb.c69
-rw-r--r--net/bridge/br_forward.c6
-rw-r--r--net/bridge/br_if.c11
-rw-r--r--net/bridge/br_input.c1
-rw-r--r--net/bridge/br_netlink.c9
-rw-r--r--net/bridge/br_private.h41
-rw-r--r--net/bridge/br_switchdev.c37
-rw-r--r--net/bridge/br_sysfs_if.c2
-rw-r--r--net/bridge/br_vlan.c144
-rw-r--r--net/bridge/netfilter/Kconfig7
-rw-r--r--net/bridge/netfilter/Makefile1
-rw-r--r--net/bridge/netfilter/ebtables.c63
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c135
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/dev.c148
-rw-r--r--net/core/devlink.c111
-rw-r--r--net/core/dst.c1
-rw-r--r--net/core/ethtool.c63
-rw-r--r--net/core/failover.c315
-rw-r--r--net/core/fib_rules.c495
-rw-r--r--net/core/filter.c1423
-rw-r--r--net/core/flow_dissector.c19
-rw-r--r--net/core/neighbour.c8
-rw-r--r--net/core/net-traces.c4
-rw-r--r--net/core/page_pool.c317
-rw-r--r--net/core/rtnetlink.c34
-rw-r--r--net/core/skbuff.c25
-rw-r--r--net/core/sock.c40
-rw-r--r--net/core/xdp.c299
-rw-r--r--net/dcb/dcbnl.c20
-rw-r--r--net/dccp/minisocks.c1
-rw-r--r--net/decnet/dn_rules.c7
-rw-r--r--net/dsa/Kconfig2
-rw-r--r--net/dsa/dsa2.c24
-rw-r--r--net/dsa/dsa_priv.h9
-rw-r--r--net/dsa/master.c62
-rw-r--r--net/dsa/port.c96
-rw-r--r--net/dsa/slave.c307
-rw-r--r--net/ethernet/eth.c6
-rw-r--r--net/ipv4/Makefile5
-rw-r--r--net/ipv4/af_inet.c5
-rw-r--r--net/ipv4/bpfilter/Makefile2
-rw-r--r--net/ipv4/bpfilter/sockopt.c43
-rw-r--r--net/ipv4/devinet.c15
-rw-r--r--net/ipv4/fib_frontend.c58
-rw-r--r--net/ipv4/fib_rules.c7
-rw-r--r--net/ipv4/fib_semantics.c45
-rw-r--r--net/ipv4/fib_trie.c14
-rw-r--r--net/ipv4/inet_connection_sock.c5
-rw-r--r--net/ipv4/ip_gre.c12
-rw-r--r--net/ipv4/ip_output.c45
-rw-r--r--net/ipv4/ip_sockglue.c17
-rw-r--r--net/ipv4/ip_tunnel_core.c6
-rw-r--r--net/ipv4/ipconfig.c150
-rw-r--r--net/ipv4/ipmr.c3
-rw-r--r--net/ipv4/ipmr_base.c8
-rw-r--r--net/ipv4/metrics.c55
-rw-r--r--net/ipv4/netfilter/Kconfig10
-rw-r--r--net/ipv4/netfilter/Makefile5
-rw-r--r--net/ipv4/netfilter/ip_tables.c7
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c2
-rw-r--r--net/ipv4/netfilter/iptable_nat.c88
-rw-r--r--net/ipv4/netfilter/nf_flow_table_ipv4.c255
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c143
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c8
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c2
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c147
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c53
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c2
-rw-r--r--net/ipv4/netlink.c23
-rw-r--r--net/ipv4/proc.c3
-rw-r--r--net/ipv4/route.c177
-rw-r--r--net/ipv4/sysctl_net_ipv4.c22
-rw-r--r--net/ipv4/tcp.c204
-rw-r--r--net/ipv4/tcp_input.c271
-rw-r--r--net/ipv4/tcp_ipv4.c55
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_output.c112
-rw-r--r--net/ipv4/tcp_recovery.c80
-rw-r--r--net/ipv4/tcp_timer.c27
-rw-r--r--net/ipv4/udp.c120
-rw-r--r--net/ipv4/udp_offload.c101
-rw-r--r--net/ipv6/Kconfig5
-rw-r--r--net/ipv6/addrconf.c527
-rw-r--r--net/ipv6/addrconf_core.c41
-rw-r--r--net/ipv6/af_inet6.c65
-rw-r--r--net/ipv6/anycast.c33
-rw-r--r--net/ipv6/exthdrs.c55
-rw-r--r--net/ipv6/exthdrs_core.c2
-rw-r--r--net/ipv6/fib6_rules.c145
-rw-r--r--net/ipv6/ip6_fib.c639
-rw-r--r--net/ipv6/ip6_gre.c51
-rw-r--r--net/ipv6/ip6_input.c2
-rw-r--r--net/ipv6/ip6_offload.c6
-rw-r--r--net/ipv6/ip6_output.c96
-rw-r--r--net/ipv6/ip6_vti.c2
-rw-r--r--net/ipv6/ip6mr.c24
-rw-r--r--net/ipv6/ndisc.c48
-rw-r--r--net/ipv6/netfilter/Kconfig10
-rw-r--r--net/ipv6/netfilter/Makefile3
-rw-r--r--net/ipv6/netfilter/ip6_tables.c6
-rw-r--r--net/ipv6/netfilter/ip6t_MASQUERADE.c2
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c2
-rw-r--r--net/ipv6/netfilter/ip6t_srh.c173
-rw-r--r--net/ipv6/netfilter/ip6table_nat.c87
-rw-r--r--net/ipv6/netfilter/nf_flow_table_ipv6.c246
-rw-r--r--net/ipv6/netfilter/nf_nat_l3proto_ipv6.c137
-rw-r--r--net/ipv6/netfilter/nf_nat_masquerade_ipv6.c8
-rw-r--r--net/ipv6/netfilter/nf_nat_proto_icmpv6.c2
-rw-r--r--net/ipv6/netfilter/nf_tproxy_ipv6.c146
-rw-r--r--net/ipv6/netfilter/nft_chain_nat_ipv6.c51
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c2
-rw-r--r--net/ipv6/netfilter/nft_redir_ipv6.c2
-rw-r--r--net/ipv6/reassembly.c25
-rw-r--r--net/ipv6/route.c1895
-rw-r--r--net/ipv6/seg6.c1
-rw-r--r--net/ipv6/seg6_iptunnel.c24
-rw-r--r--net/ipv6/seg6_local.c190
-rw-r--r--net/ipv6/sysctl_net_ipv6.c8
-rw-r--r--net/ipv6/tcp_ipv6.c8
-rw-r--r--net/ipv6/udp.c72
-rw-r--r--net/ipv6/udp_offload.c5
-rw-r--r--net/ipv6/xfrm6_policy.c2
-rw-r--r--net/ipv6/xfrm6_state.c6
-rw-r--r--net/l2tp/l2tp_debugfs.c20
-rw-r--r--net/l2tp/l2tp_ppp.c56
-rw-r--r--net/mac80211/cfg.c103
-rw-r--r--net/mac80211/driver-ops.h8
-rw-r--r--net/mac80211/ethtool.c13
-rw-r--r--net/mac80211/ht.c44
-rw-r--r--net/mac80211/ieee80211_i.h3
-rw-r--r--net/mac80211/main.c3
-rw-r--r--net/mac80211/mlme.c17
-rw-r--r--net/mac80211/rx.c40
-rw-r--r--net/mac80211/sta_info.c38
-rw-r--r--net/mac80211/sta_info.h5
-rw-r--r--net/mac80211/status.c2
-rw-r--r--net/mac80211/trace.h25
-rw-r--r--net/mac80211/tx.c45
-rw-r--r--net/mac80211/util.c6
-rw-r--r--net/ncsi/internal.h34
-rw-r--r--net/ncsi/ncsi-manage.c226
-rw-r--r--net/ncsi/ncsi-netlink.c21
-rw-r--r--net/ncsi/ncsi-rsp.c179
-rw-r--r--net/netfilter/Kconfig51
-rw-r--r--net/netfilter/Makefile12
-rw-r--r--net/netfilter/core.c102
-rw-r--r--net/netfilter/ipvs/Kconfig37
-rw-r--r--net/netfilter/ipvs/Makefile1
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c24
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c467
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c540
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c101
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c5
-rw-r--r--net/netfilter/nf_conncount.c36
-rw-r--r--net/netfilter/nf_conntrack_core.c92
-rw-r--r--net/netfilter/nf_conntrack_ftp.c3
-rw-r--r--net/netfilter/nf_conntrack_irc.c6
-rw-r--r--net/netfilter/nf_conntrack_netlink.c13
-rw-r--r--net/netfilter/nf_conntrack_sane.c3
-rw-r--r--net/netfilter/nf_conntrack_sip.c2
-rw-r--r--net/netfilter/nf_conntrack_tftp.c2
-rw-r--r--net/netfilter/nf_flow_table_core.c (renamed from net/netfilter/nf_flow_table.c)309
-rw-r--r--net/netfilter/nf_flow_table_inet.c3
-rw-r--r--net/netfilter/nf_flow_table_ip.c489
-rw-r--r--net/netfilter/nf_internals.h5
-rw-r--r--net/netfilter/nf_nat_core.c321
-rw-r--r--net/netfilter/nf_nat_helper.c2
-rw-r--r--net/netfilter/nf_nat_proto_common.c9
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c2
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c2
-rw-r--r--net/netfilter/nf_nat_proto_tcp.c2
-rw-r--r--net/netfilter/nf_nat_proto_udp.c4
-rw-r--r--net/netfilter/nf_nat_proto_unknown.c2
-rw-r--r--net/netfilter/nf_nat_redirect.c10
-rw-r--r--net/netfilter/nf_nat_sip.c2
-rw-r--r--net/netfilter/nf_osf.c218
-rw-r--r--net/netfilter/nf_tables_api.c1325
-rw-r--r--net/netfilter/nf_tables_core.c72
-rw-r--r--net/netfilter/nfnetlink.c44
-rw-r--r--net/netfilter/nfnetlink_log.c8
-rw-r--r--net/netfilter/nfnetlink_queue.c28
-rw-r--r--net/netfilter/nft_compat.c29
-rw-r--r--net/netfilter/nft_connlimit.c297
-rw-r--r--net/netfilter/nft_counter.c4
-rw-r--r--net/netfilter/nft_ct.c3
-rw-r--r--net/netfilter/nft_dynset.c16
-rw-r--r--net/netfilter/nft_exthdr.c23
-rw-r--r--net/netfilter/nft_flow_offload.c5
-rw-r--r--net/netfilter/nft_fwd_netdev.c146
-rw-r--r--net/netfilter/nft_hash.c127
-rw-r--r--net/netfilter/nft_immediate.c27
-rw-r--r--net/netfilter/nft_log.c92
-rw-r--r--net/netfilter/nft_lookup.c47
-rw-r--r--net/netfilter/nft_meta.c112
-rw-r--r--net/netfilter/nft_nat.c2
-rw-r--r--net/netfilter/nft_numgen.c158
-rw-r--r--net/netfilter/nft_objref.c4
-rw-r--r--net/netfilter/nft_rt.c22
-rw-r--r--net/netfilter/nft_set_bitmap.c34
-rw-r--r--net/netfilter/nft_set_hash.c174
-rw-r--r--net/netfilter/nft_set_rbtree.c109
-rw-r--r--net/netfilter/nft_socket.c144
-rw-r--r--net/netfilter/xt_NETMAP.c8
-rw-r--r--net/netfilter/xt_NFLOG.c15
-rw-r--r--net/netfilter/xt_REDIRECT.c2
-rw-r--r--net/netfilter/xt_TPROXY.c366
-rw-r--r--net/netfilter/xt_nat.c72
-rw-r--r--net/netfilter/xt_osf.c202
-rw-r--r--net/netfilter/xt_socket.c4
-rw-r--r--net/nfc/netlink.c17
-rw-r--r--net/openvswitch/Kconfig3
-rw-r--r--net/openvswitch/conntrack.c555
-rw-r--r--net/openvswitch/conntrack.h9
-rw-r--r--net/openvswitch/datapath.c7
-rw-r--r--net/openvswitch/datapath.h3
-rw-r--r--net/packet/af_packet.c44
-rw-r--r--net/qrtr/Kconfig7
-rw-r--r--net/qrtr/Makefile2
-rw-r--r--net/qrtr/tun.c161
-rw-r--r--net/rfkill/core.c66
-rw-r--r--net/rxrpc/ar-internal.h2
-rw-r--r--net/rxrpc/call_event.c8
-rw-r--r--net/rxrpc/conn_event.c2
-rw-r--r--net/rxrpc/input.c10
-rw-r--r--net/sched/act_api.c20
-rw-r--r--net/sched/act_csum.c6
-rw-r--r--net/sched/cls_api.c443
-rw-r--r--net/sched/cls_basic.c24
-rw-r--r--net/sched/cls_bpf.c22
-rw-r--r--net/sched/cls_cgroup.c23
-rw-r--r--net/sched/cls_flow.c24
-rw-r--r--net/sched/cls_flower.c317
-rw-r--r--net/sched/cls_fw.c24
-rw-r--r--net/sched/cls_matchall.c21
-rw-r--r--net/sched/cls_route.c23
-rw-r--r--net/sched/cls_rsvp.h20
-rw-r--r--net/sched/cls_tcindex.c41
-rw-r--r--net/sched/cls_u32.c37
-rw-r--r--net/sched/sch_generic.c49
-rw-r--r--net/sched/sch_mq.c37
-rw-r--r--net/sctp/associola.c85
-rw-r--r--net/sctp/chunk.c12
-rw-r--r--net/sctp/output.c28
-rw-r--r--net/sctp/outqueue.c660
-rw-r--r--net/sctp/sm_make_chunk.c143
-rw-r--r--net/sctp/socket.c43
-rw-r--r--net/sctp/transport.c39
-rw-r--r--net/smc/af_smc.c803
-rw-r--r--net/smc/smc.h68
-rw-r--r--net/smc/smc_cdc.c101
-rw-r--r--net/smc/smc_cdc.h15
-rw-r--r--net/smc/smc_clc.c6
-rw-r--r--net/smc/smc_clc.h2
-rw-r--r--net/smc/smc_core.c199
-rw-r--r--net/smc/smc_core.h29
-rw-r--r--net/smc/smc_diag.c44
-rw-r--r--net/smc/smc_ib.c13
-rw-r--r--net/smc/smc_llc.c242
-rw-r--r--net/smc/smc_llc.h8
-rw-r--r--net/smc/smc_rx.c308
-rw-r--r--net/smc/smc_rx.h11
-rw-r--r--net/smc/smc_tx.c111
-rw-r--r--net/smc/smc_tx.h5
-rw-r--r--net/smc/smc_wr.c1
-rw-r--r--net/strparser/strparser.c13
-rw-r--r--net/tipc/bearer.c29
-rw-r--r--net/tipc/bearer.h3
-rw-r--r--net/tipc/name_table.c103
-rw-r--r--net/tipc/node.c33
-rw-r--r--net/tipc/node.h3
-rw-r--r--net/tipc/socket.c13
-rw-r--r--net/tipc/udp_media.c4
-rw-r--r--net/tipc/udp_media.h14
-rw-r--r--net/tls/Kconfig10
-rw-r--r--net/tls/Makefile2
-rw-r--r--net/tls/tls_device.c766
-rw-r--r--net/tls/tls_device_fallback.c450
-rw-r--r--net/tls/tls_main.c139
-rw-r--r--net/tls/tls_sw.c143
-rw-r--r--net/wireless/core.c4
-rw-r--r--net/wireless/nl80211.c304
-rw-r--r--net/wireless/rdev-ops.h12
-rw-r--r--net/wireless/reg.c39
-rw-r--r--net/wireless/sme.c88
-rw-r--r--net/wireless/trace.h14
-rw-r--r--net/wireless/util.c11
-rw-r--r--net/xdp/Kconfig7
-rw-r--r--net/xdp/Makefile1
-rw-r--r--net/xdp/xdp_umem.c361
-rw-r--r--net/xdp/xdp_umem.h30
-rw-r--r--net/xdp/xdp_umem_props.h14
-rw-r--r--net/xdp/xsk.c788
-rw-r--r--net/xdp/xsk_queue.c63
-rw-r--r--net/xdp/xsk_queue.h265
-rw-r--r--net/xfrm/xfrm_state.c9
332 files changed, 20336 insertions, 8128 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 5505ee6ebdbe..73a65789271b 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -118,17 +118,21 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
}
int vlan_check_real_dev(struct net_device *real_dev,
- __be16 protocol, u16 vlan_id)
+ __be16 protocol, u16 vlan_id,
+ struct netlink_ext_ack *extack)
{
const char *name = real_dev->name;
if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
pr_info("VLANs not supported on %s\n", name);
+ NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device");
return -EOPNOTSUPP;
}
- if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL)
+ if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) {
+ NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists");
return -EEXIST;
+ }
return 0;
}
@@ -215,7 +219,8 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
if (vlan_id >= VLAN_VID_MASK)
return -ERANGE;
- err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id);
+ err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id,
+ NULL);
if (err < 0)
return err;
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index e23aac3e4d37..44df1c3df02d 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -109,7 +109,8 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask);
void vlan_dev_get_realdev_name(const struct net_device *dev, char *result);
int vlan_check_real_dev(struct net_device *real_dev,
- __be16 protocol, u16 vlan_id);
+ __be16 protocol, u16 vlan_id,
+ struct netlink_ext_ack *extack);
void vlan_setup(struct net_device *dev);
int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack);
void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 236452ebbd9e..546af0e73ac3 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -215,7 +215,9 @@ int vlan_dev_set_egress_priority(const struct net_device *dev,
return 0;
}
-/* Flags are defined in the vlan_flags enum in include/linux/if_vlan.h file. */
+/* Flags are defined in the vlan_flags enum in
+ * include/uapi/linux/if_vlan.h file.
+ */
int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask)
{
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 6689c0b272a7..9b60c1e399e2 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -47,14 +47,20 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
int err;
if (tb[IFLA_ADDRESS]) {
- if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid link address");
return -EINVAL;
- if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ }
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid link address");
return -EADDRNOTAVAIL;
+ }
}
- if (!data)
+ if (!data) {
+ NL_SET_ERR_MSG_MOD(extack, "VLAN properties not specified");
return -EINVAL;
+ }
if (data[IFLA_VLAN_PROTOCOL]) {
switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) {
@@ -62,29 +68,38 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
case htons(ETH_P_8021AD):
break;
default:
+ NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN protocol");
return -EPROTONOSUPPORT;
}
}
if (data[IFLA_VLAN_ID]) {
id = nla_get_u16(data[IFLA_VLAN_ID]);
- if (id >= VLAN_VID_MASK)
+ if (id >= VLAN_VID_MASK) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN id");
return -ERANGE;
+ }
}
if (data[IFLA_VLAN_FLAGS]) {
flags = nla_data(data[IFLA_VLAN_FLAGS]);
if ((flags->flags & flags->mask) &
~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
- VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP))
+ VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags");
return -EINVAL;
+ }
}
err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid ingress QOS map");
return err;
+ }
err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid egress QOS map");
return err;
+ }
return 0;
}
@@ -126,14 +141,21 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
__be16 proto;
int err;
- if (!data[IFLA_VLAN_ID])
+ if (!data[IFLA_VLAN_ID]) {
+ NL_SET_ERR_MSG_MOD(extack, "VLAN id not specified");
return -EINVAL;
+ }
- if (!tb[IFLA_LINK])
+ if (!tb[IFLA_LINK]) {
+ NL_SET_ERR_MSG_MOD(extack, "link not specified");
return -EINVAL;
+ }
+
real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
- if (!real_dev)
+ if (!real_dev) {
+ NL_SET_ERR_MSG_MOD(extack, "link does not exist");
return -ENODEV;
+ }
if (data[IFLA_VLAN_PROTOCOL])
proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]);
@@ -146,7 +168,8 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE);
vlan->flags = VLAN_FLAG_REORDER_HDR;
- err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id);
+ err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id,
+ extack);
if (err < 0)
return err;
diff --git a/net/9p/mod.c b/net/9p/mod.c
index 6ab36aea7727..eb9777f05755 100644
--- a/net/9p/mod.c
+++ b/net/9p/mod.c
@@ -104,7 +104,7 @@ EXPORT_SYMBOL(v9fs_unregister_trans);
/**
* v9fs_get_trans_by_name - get transport with the matching name
- * @name: string identifying transport
+ * @s: string identifying transport
*
*/
struct p9_trans_module *v9fs_get_trans_by_name(char *s)
diff --git a/net/Kconfig b/net/Kconfig
index 0428f12c25c2..f738a6f27665 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -59,6 +59,7 @@ source "net/tls/Kconfig"
source "net/xfrm/Kconfig"
source "net/iucv/Kconfig"
source "net/smc/Kconfig"
+source "net/xdp/Kconfig"
config INET
bool "TCP/IP networking"
@@ -201,6 +202,8 @@ source "net/bridge/netfilter/Kconfig"
endif
+source "net/bpfilter/Kconfig"
+
source "net/dccp/Kconfig"
source "net/sctp/Kconfig"
source "net/rds/Kconfig"
@@ -407,6 +410,9 @@ config GRO_CELLS
bool
default n
+config SOCK_VALIDATE_XMIT
+ bool
+
config NET_DEVLINK
tristate "Network physical/parent device Netlink interface"
help
@@ -423,6 +429,22 @@ config MAY_USE_DEVLINK
on MAY_USE_DEVLINK to ensure they do not cause link errors when
devlink is a loadable module and the driver using it is built-in.
+config PAGE_POOL
+ bool
+
+config FAILOVER
+ tristate "Generic failover module"
+ help
+ The failover module provides a generic interface for paravirtual
+ drivers to register a netdev and a set of ops with a failover
+ instance. The ops are used as event handlers that get called to
+ handle netdev register/unregister/link change/name change events
+ on slave pci ethernet devices with the same mac address as the
+ failover netdev. This enables paravirtual drivers to use a
+ VF as an accelerated low latency datapath. It also allows live
+ migration of VMs with direct attached VFs by failing over to the
+ paravirtual datapath when the VF is unplugged.
+
endif # if NET
# Used by archs to tell that they support BPF JIT compiler plus which flavour.
diff --git a/net/Makefile b/net/Makefile
index a6147c61b174..13ec0d5415c7 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,6 +20,11 @@ obj-$(CONFIG_TLS) += tls/
obj-$(CONFIG_XFRM) += xfrm/
obj-$(CONFIG_UNIX) += unix/
obj-$(CONFIG_NET) += ipv6/
+ifneq ($(CC_CAN_LINK),y)
+$(warning CC cannot link executables. Skipping bpfilter.)
+else
+obj-$(CONFIG_BPFILTER) += bpfilter/
+endif
obj-$(CONFIG_PACKET) += packet/
obj-$(CONFIG_NET_KEY) += key/
obj-$(CONFIG_BRIDGE) += bridge/
@@ -85,3 +90,4 @@ obj-y += l3mdev/
endif
obj-$(CONFIG_QRTR) += qrtr/
obj-$(CONFIG_NET_NCSI) += ncsi/
+obj-$(CONFIG_XDP_SOCKETS) += xdp/
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index e4e2e02b7380..de8034d80623 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -35,7 +35,7 @@ config BATMAN_ADV
config BATMAN_ADV_BATMAN_V
bool "B.A.T.M.A.N. V protocol (experimental)"
depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
- default n
+ default y
help
This option enables the B.A.T.M.A.N. V protocol, the successor
of the currently used B.A.T.M.A.N. IV protocol. The main
@@ -94,13 +94,13 @@ config BATMAN_ADV_DEBUGFS
bool "batman-adv debugfs entries"
depends on BATMAN_ADV
depends on DEBUG_FS
- default y
+ default n
help
Enable this to export routing related debug tables via debugfs.
The information for each soft-interface and used hard-interface can be
found under batman_adv/
- If unsure, say Y.
+ If unsure, say N.
config BATMAN_ADV_DEBUG
bool "B.A.T.M.A.N. debugging"
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 28687493599f..71c20c1d4002 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -127,7 +127,20 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
rtnl_lock();
ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings);
rtnl_unlock();
- if (ret == 0) {
+
+ /* Virtual interface drivers such as tun / tap interfaces, VLAN, etc
+ * tend to initialize the interface throughput with some value for the
+ * sake of having a throughput number to export via ethtool. This
+ * exported throughput leaves batman-adv to conclude the interface
+ * throughput is genuine (reflecting reality), thus no measurements
+ * are necessary.
+ *
+ * Based on the observation that those interface types also tend to set
+ * the link auto-negotiation to 'off', batman-adv shall check this
+ * setting to differentiate between genuine link throughput information
+ * and placeholders installed by virtual interfaces.
+ */
+ if (ret == 0 && link_settings.base.autoneg == AUTONEG_ENABLE) {
/* link characteristics might change over time */
if (link_settings.base.duplex == DUPLEX_FULL)
hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX;
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 057a28a9fe88..8da3c9336111 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -25,7 +25,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2018.1"
+#define BATADV_SOURCE_VERSION "2018.2"
#endif
/* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index a35f597e8c8b..86725d792e15 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -815,9 +815,6 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
if (!atomic_read(&bat_priv->multicast_mode))
return -EINVAL;
- if (atomic_read(&bat_priv->mcast.num_disabled))
- return -EINVAL;
-
switch (ntohs(ethhdr->h_proto)) {
case ETH_P_IP:
return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb,
@@ -1183,33 +1180,23 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
{
bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
u8 mcast_flags = BATADV_NO_FLAGS;
- bool orig_initialized;
if (orig_mcast_enabled && tvlv_value &&
tvlv_value_len >= sizeof(mcast_flags))
mcast_flags = *(u8 *)tvlv_value;
+ if (!orig_mcast_enabled) {
+ mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4;
+ mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6;
+ }
+
spin_lock_bh(&orig->mcast_handler_lock);
- orig_initialized = test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
- &orig->capa_initialized);
- /* If mcast support is turned on decrease the disabled mcast node
- * counter only if we had increased it for this node before. If this
- * is a completely new orig_node no need to decrease the counter.
- */
if (orig_mcast_enabled &&
!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) {
- if (orig_initialized)
- atomic_dec(&bat_priv->mcast.num_disabled);
set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities);
- /* If mcast support is being switched off or if this is an initial
- * OGM without mcast support then increase the disabled mcast
- * node counter.
- */
} else if (!orig_mcast_enabled &&
- (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) ||
- !orig_initialized)) {
- atomic_inc(&bat_priv->mcast.num_disabled);
+ test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) {
clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities);
}
@@ -1595,10 +1582,6 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig)
spin_lock_bh(&orig->mcast_handler_lock);
- if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) &&
- test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized))
- atomic_dec(&bat_priv->mcast.num_disabled);
-
batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS);
batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS);
batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index edeffcb9f3a2..1485263a348b 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -188,8 +188,8 @@ static void batadv_interface_set_rx_mode(struct net_device *dev)
{
}
-static int batadv_interface_tx(struct sk_buff *skb,
- struct net_device *soft_iface)
+static netdev_tx_t batadv_interface_tx(struct sk_buff *skb,
+ struct net_device *soft_iface)
{
struct ethhdr *ethhdr;
struct batadv_priv *bat_priv = netdev_priv(soft_iface);
@@ -796,7 +796,6 @@ static int batadv_softif_init_late(struct net_device *dev)
bat_priv->mcast.querier_ipv6.shadowing = false;
bat_priv->mcast.flags = BATADV_NO_FLAGS;
atomic_set(&bat_priv->multicast_mode, 1);
- atomic_set(&bat_priv->mcast.num_disabled, 0);
atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0);
atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0);
atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 476b052ad982..360357f83f20 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -215,10 +215,12 @@ struct batadv_hard_iface {
struct batadv_hard_iface_bat_v bat_v;
#endif
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
*/
struct dentry *debug_dir;
+#endif
/**
* @neigh_list: list of unique single hop neighbors via this interface
@@ -1160,13 +1162,13 @@ struct batadv_priv_dat {
*/
struct batadv_mcast_querier_state {
/** @exists: whether a querier exists in the mesh */
- bool exists;
+ unsigned char exists:1;
/**
* @shadowing: if a querier exists, whether it is potentially shadowing
* multicast listeners (i.e. querier is behind our own bridge segment)
*/
- bool shadowing;
+ unsigned char shadowing:1;
};
/**
@@ -1207,13 +1209,10 @@ struct batadv_priv_mcast {
u8 flags;
/** @enabled: whether the multicast tvlv is currently enabled */
- bool enabled;
+ unsigned char enabled:1;
/** @bridged: whether the soft interface has a bridge on top */
- bool bridged;
-
- /** @num_disabled: number of nodes that have no mcast tvlv */
- atomic_t num_disabled;
+ unsigned char bridged:1;
/**
* @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP
@@ -1245,10 +1244,12 @@ struct batadv_priv_nc {
/** @work: work queue callback item for cleanup */
struct delayed_work work;
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
*/
struct dentry *debug_dir;
+#endif
/**
* @min_tq: only consider neighbors for encoding if neigh_tq > min_tq
@@ -1392,7 +1393,7 @@ struct batadv_tp_vars {
atomic_t dup_acks;
/** @fast_recovery: true if in Fast Recovery mode */
- bool fast_recovery;
+ unsigned char fast_recovery:1;
/** @recover: last sent seqno when entering Fast Recovery */
u32 recover;
@@ -1601,8 +1602,10 @@ struct batadv_priv {
/** @mesh_obj: kobject for sysfs mesh subdirectory */
struct kobject *mesh_obj;
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/** @debug_dir: dentry for debugfs batman-adv subdirectory */
struct dentry *debug_dir;
+#endif
/** @forw_bat_list: list of aggregated OGMs that will be forwarded */
struct hlist_head forw_bat_list;
@@ -2049,10 +2052,10 @@ struct batadv_skb_cb {
* @decoded: Marks a skb as decoded, which is checked when searching for
* coding opportunities in network-coding.c
*/
- bool decoded;
+ unsigned char decoded:1;
/** @num_bcasts: Counter for broadcast packet retransmissions */
- unsigned int num_bcasts;
+ unsigned char num_bcasts;
};
/**
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 40d260f2bea5..1dec33790198 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -76,19 +76,15 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf,
{
struct hci_dev *hdev = file->private_data;
struct sk_buff *skb;
- char buf[32];
- size_t buf_size = min(count, (sizeof(buf)-1));
bool enable;
+ int err;
if (!test_bit(HCI_UP, &hdev->flags))
return -ENETDOWN;
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- if (strtobool(buf, &enable))
- return -EINVAL;
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE))
return -EALREADY;
@@ -135,17 +131,12 @@ static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
{
struct hci_dev *hdev = file->private_data;
- char buf[32];
- size_t buf_size = min(count, (sizeof(buf)-1));
bool enable;
int err;
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- if (strtobool(buf, &enable))
- return -EINVAL;
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
/* When the diagnostic flags are not persistent and the transport
* is not active or in user channel operation, then there is no need
@@ -3422,6 +3413,37 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen,
return 0;
}
+/* Send a vendor-specific HCI command without waiting for any event.
+ *
+ * Only opcodes whose OGF is 0x3f are accepted; anything else is rejected
+ * with -EINVAL. Returns -ENOMEM if the command skb cannot be prepared and
+ * 0 once the frame has been handed to hci_send_frame().
+ */
+int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen,
+		   const void *param)
+{
+	struct sk_buff *cmd_skb;
+
+	/* A controller receiving a command shall respond with either
+	 * a Command Status Event or a Command Complete Event.
+	 * Therefore, all standard HCI commands must be sent via the
+	 * standard API, using hci_send_cmd or hci_cmd_sync helpers.
+	 * Some vendors do not comply with this rule for vendor-specific
+	 * commands and do not return any event. We want to support
+	 * unresponded commands for such cases only.
+	 */
+	if (hci_opcode_ogf(opcode) == 0x3f) {
+		cmd_skb = hci_prepare_cmd(hdev, opcode, plen, param);
+		if (cmd_skb) {
+			hci_send_frame(hdev, cmd_skb);
+			return 0;
+		}
+
+		bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)",
+			   opcode);
+		return -ENOMEM;
+	}
+
+	bt_dev_err(hdev, "unresponded command not supported");
+	return -EINVAL;
+}
+EXPORT_SYMBOL(__hci_cmd_send);
+
/* Get data from the previously sent command */
void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
{
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 418b76e557b0..0d8ab5b3c177 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -47,19 +47,15 @@ static ssize_t __name ## _write(struct file *file, \
size_t count, loff_t *ppos) \
{ \
struct hci_dev *hdev = file->private_data; \
- char buf[32]; \
- size_t buf_size = min(count, (sizeof(buf) - 1)); \
bool enable; \
+ int err; \
\
if (test_bit(HCI_UP, &hdev->flags)) \
return -EBUSY; \
\
- if (copy_from_user(buf, user_buf, buf_size)) \
- return -EFAULT; \
- \
- buf[buf_size] = '\0'; \
- if (strtobool(buf, &enable)) \
- return -EINVAL; \
+ err = kstrtobool_from_user(user_buf, count, &enable); \
+ if (err) \
+ return err; \
\
if (enable == test_bit(__quirk, &hdev->quirks)) \
return -EALREADY; \
@@ -658,19 +654,15 @@ static ssize_t force_static_address_write(struct file *file,
size_t count, loff_t *ppos)
{
struct hci_dev *hdev = file->private_data;
- char buf[32];
- size_t buf_size = min(count, (sizeof(buf)-1));
bool enable;
+ int err;
if (test_bit(HCI_UP, &hdev->flags))
return -EBUSY;
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- if (strtobool(buf, &enable))
- return -EINVAL;
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
if (enable == hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR))
return -EALREADY;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 139707cd9d35..235b5aaab23d 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4942,10 +4942,14 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
struct hci_ev_le_advertising_info *ev = ptr;
s8 rssi;
- rssi = ev->data[ev->length];
- process_adv_report(hdev, ev->evt_type, &ev->bdaddr,
- ev->bdaddr_type, NULL, 0, rssi,
- ev->data, ev->length);
+ if (ev->length <= HCI_MAX_AD_LENGTH) {
+ rssi = ev->data[ev->length];
+ process_adv_report(hdev, ev->evt_type, &ev->bdaddr,
+ ev->bdaddr_type, NULL, 0, rssi,
+ ev->data, ev->length);
+ } else {
+ bt_dev_err(hdev, "Dropping invalid advertising data");
+ }
ptr += sizeof(*ev) + ev->length + 1;
}
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 66c0781773df..e44d34734834 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -122,7 +122,6 @@ void hci_req_sync_cancel(struct hci_dev *hdev, int err)
struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
const void *param, u8 event, u32 timeout)
{
- DECLARE_WAITQUEUE(wait, current);
struct hci_request req;
struct sk_buff *skb;
int err = 0;
@@ -135,21 +134,14 @@ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
hdev->req_status = HCI_REQ_PEND;
- add_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_INTERRUPTIBLE);
-
err = hci_req_run_skb(&req, hci_req_sync_complete);
- if (err < 0) {
- remove_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_RUNNING);
+ if (err < 0)
return ERR_PTR(err);
- }
- schedule_timeout(timeout);
+ err = wait_event_interruptible_timeout(hdev->req_wait_q,
+ hdev->req_status != HCI_REQ_PEND, timeout);
- remove_wait_queue(&hdev->req_wait_q, &wait);
-
- if (signal_pending(current))
+ if (err == -ERESTARTSYS)
return ERR_PTR(-EINTR);
switch (hdev->req_status) {
@@ -197,7 +189,6 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
unsigned long opt, u32 timeout, u8 *hci_status)
{
struct hci_request req;
- DECLARE_WAITQUEUE(wait, current);
int err = 0;
BT_DBG("%s start", hdev->name);
@@ -213,16 +204,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
return err;
}
- add_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_INTERRUPTIBLE);
-
err = hci_req_run_skb(&req, hci_req_sync_complete);
if (err < 0) {
hdev->req_status = 0;
- remove_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_RUNNING);
-
/* ENODATA means the HCI request command queue is empty.
* This can happen when a request with conditionals doesn't
* trigger any commands to be sent. This is normal behavior
@@ -240,11 +225,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
return err;
}
- schedule_timeout(timeout);
-
- remove_wait_queue(&hdev->req_wait_q, &wait);
+ err = wait_event_interruptible_timeout(hdev->req_wait_q,
+ hdev->req_status != HCI_REQ_PEND, timeout);
- if (signal_pending(current))
+ if (err == -ERESTARTSYS)
return -EINTR;
switch (hdev->req_status) {
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index a2ddae2f37d7..ae91e2d40056 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -3315,16 +3315,12 @@ static ssize_t force_bredr_smp_write(struct file *file,
size_t count, loff_t *ppos)
{
struct hci_dev *hdev = file->private_data;
- char buf[32];
- size_t buf_size = min(count, (sizeof(buf)-1));
bool enable;
+ int err;
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- if (strtobool(buf, &enable))
- return -EINVAL;
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
return -EALREADY;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 2ced48662c1f..68c3578343b4 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -170,7 +170,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
xdp.rxq = &rxqueue->xdp_rxq;
retval = bpf_test_run(prog, &xdp, repeat, &duration);
- if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN)
+ if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN ||
+ xdp.data_end != xdp.data + size)
size = xdp.data_end - xdp.data;
ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
kfree(data);
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
new file mode 100644
index 000000000000..a948b072c28f
--- /dev/null
+++ b/net/bpfilter/Kconfig
@@ -0,0 +1,16 @@
+menuconfig BPFILTER
+ bool "BPF based packet filtering framework (BPFILTER)"
+ default n
+ depends on NET && BPF && INET
+ help
+ This builds the experimental bpfilter framework, which aims to
+ provide netfilter-compatible functionality via BPF.
+
+if BPFILTER
+config BPFILTER_UMH
+ tristate "bpfilter kernel module with user mode helper"
+ default m
+ help
+ This builds the bpfilter kernel module with an embedded user mode helper.
+endif
+
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
new file mode 100644
index 000000000000..aafa72001fcd
--- /dev/null
+++ b/net/bpfilter/Makefile
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux BPFILTER layer.
+#
+
+hostprogs-y := bpfilter_umh
+bpfilter_umh-objs := main.o
+HOSTCFLAGS += -I. -Itools/include/ -Itools/include/uapi
+HOSTCC := $(CC)
+
+ifeq ($(CONFIG_BPFILTER_UMH), y)
+# A built-in bpfilter_umh must be linked with -static: the rootfs isn't
+# mounted yet when the __init function is called, so do_execv wouldn't
+# find the ELF interpreter.
+HOSTLDFLAGS += -static
+endif
+
+# a bit of elf magic to convert bpfilter_umh binary into a binary blob
+# inside bpfilter_umh.o elf file referenced by
+# _binary_net_bpfilter_bpfilter_umh_start symbol
+# which bpfilter_kern.c passes further into umh blob loader at run-time
+quiet_cmd_copy_umh = GEN $@
+ cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
+ $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
+ -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
+ --rename-section .data=.init.rodata $< $@
+
+$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
+ $(call cmd,copy_umh)
+
+obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
+bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
new file mode 100644
index 000000000000..b13d058f8c34
--- /dev/null
+++ b/net/bpfilter/bpfilter_kern.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/umh.h>
+#include <linux/bpfilter.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include "msgfmt.h"
+
+#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
+#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
+
+extern char UMH_start;
+extern char UMH_end;
+
+static struct umh_info info;
+/* since ip_getsockopt() can run in parallel, serialize access to umh */
+static DEFINE_MUTEX(bpfilter_lock);
+
+/*
+ * Kill the user mode helper identified by info->pid, if that task can still
+ * be found, and release both pipe files with fput().
+ */
+static void shutdown_umh(struct umh_info *info)
+{
+	struct task_struct *tsk;
+
+	/*
+	 * pid_task() returns the task pointer without taking a reference.
+	 * Hold the RCU read lock across the lookup and the signal so the
+	 * task cannot be freed in between.
+	 */
+	rcu_read_lock();
+	tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
+	if (tsk)
+		force_sig(SIGKILL, tsk);
+	rcu_read_unlock();
+
+	fput(info->pipe_to_umh);
+	fput(info->pipe_from_umh);
+}
+
+/*
+ * Unhook the sockopt handler and shut the helper down. Does nothing when
+ * CONFIG_INET is disabled or when the hook is not currently set.
+ * Callers in this file other than stop_umh() already hold bpfilter_lock.
+ */
+static void __stop_umh(void)
+{
+	if (!IS_ENABLED(CONFIG_INET) || !bpfilter_process_sockopt)
+		return;
+
+	bpfilter_process_sockopt = NULL;
+	shutdown_umh(&info);
+}
+
+/* Locked wrapper: run __stop_umh() with bpfilter_lock held. */
+static void stop_umh(void)
+{
+ mutex_lock(&bpfilter_lock);
+ __stop_umh();
+ mutex_unlock(&bpfilter_lock);
+}
+
+/*
+ * Send one get/setsockopt request to the user mode helper and return the
+ * status carried in its reply.
+ *
+ * @sk:      not used by this function
+ * @optname: copied into the request as the command
+ * @optval:  user pointer, passed to the helper as an integer address
+ * @optlen:  length passed along with @optval
+ * @is_set:  true for a set request, false for a get request
+ *
+ * The write to info.pipe_to_umh and the read from info.pipe_from_umh both
+ * happen under bpfilter_lock. A short write or read is logged, stops the
+ * helper through __stop_umh() and makes the call return -EFAULT.
+ */
+static int __bpfilter_process_sockopt(struct sock *sk, int optname,
+				      char __user *optval,
+				      unsigned int optlen, bool is_set)
+{
+	struct mbox_request req;
+	struct mbox_reply reply;
+	/* Must be initialized: &pos is handed to __kernel_write() below. */
+	loff_t pos = 0;
+	ssize_t n;
+	int ret;
+
+	req.is_set = is_set;
+	req.pid = current->pid;
+	req.cmd = optname;
+	req.addr = (long)optval;
+	req.len = optlen;
+	mutex_lock(&bpfilter_lock);
+	n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
+	if (n != sizeof(req)) {
+		pr_err("write fail %zd\n", n);
+		__stop_umh();
+		ret = -EFAULT;
+		goto out;
+	}
+	/* Start the read from a fresh offset regardless of the write. */
+	pos = 0;
+	n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
+	if (n != sizeof(reply)) {
+		pr_err("read fail %zd\n", n);
+		__stop_umh();
+		ret = -EFAULT;
+		goto out;
+	}
+	ret = reply.status;
+out:
+	mutex_unlock(&bpfilter_lock);
+	return ret;
+}
+
+/*
+ * Module init: start the embedded user mode helper, check that it answers
+ * a request, and only then install the sockopt hook.
+ *
+ * Returns 0 on success, the error from fork_usermode_blob(), or -EFAULT
+ * when the health check fails.
+ */
+static int __init load_umh(void)
+{
+	int err;
+
+	/* fork usermode process */
+	err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
+	if (err)
+		return err;
+	pr_info("Loaded bpfilter_umh pid %d\n", info.pid);
+
+	/* health check that usermode process started correctly */
+	if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, false) != 0) {
+		/*
+		 * The hook is only assigned below, and stop_umh() acts only
+		 * when the hook is set, so it would leave the helper running
+		 * and its pipes open. Tear the helper down directly instead.
+		 * NOTE(review): assumes bpfilter_process_sockopt is still
+		 * NULL at this point - confirm nothing else assigns it.
+		 */
+		shutdown_umh(&info);
+		return -EFAULT;
+	}
+	if (IS_ENABLED(CONFIG_INET))
+		bpfilter_process_sockopt = &__bpfilter_process_sockopt;
+
+	return 0;
+}
+
+/* Module exit handler: stop the user mode helper via stop_umh(). */
+static void __exit fini_umh(void)
+{
+ stop_umh();
+}
+module_init(load_umh);
+module_exit(fini_umh);
+MODULE_LICENSE("GPL");
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
new file mode 100644
index 000000000000..1317f108df8a
--- /dev/null
+++ b/net/bpfilter/main.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/uio.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "include/uapi/linux/bpf.h"
+#include <asm/unistd.h>
+#include "msgfmt.h"
+
+int debug_fd;
+
+/*
+ * Handle a get request. Command 0 succeeds with status 0; every other
+ * command is answered with -ENOPROTOOPT.
+ */
+static int handle_get_cmd(struct mbox_request *cmd)
+{
+	if (cmd->cmd == 0)
+		return 0;
+
+	return -ENOPROTOOPT;
+}
+
+/* Handle a set request: none are implemented, so always -ENOPROTOOPT. */
+static int handle_set_cmd(struct mbox_request *cmd)
+{
+ return -ENOPROTOOPT;
+}
+
+/*
+ * Serve requests: read one request from fd 0, dispatch it on is_set, and
+ * write the reply to fd 1. Returns after logging as soon as a read or a
+ * write transfers anything other than one full message.
+ */
+static void loop(void)
+{
+	for (;;) {
+		struct mbox_request request;
+		struct mbox_reply answer;
+		int len;
+
+		len = read(0, &request, sizeof(request));
+		if (len != sizeof(request)) {
+			dprintf(debug_fd, "invalid request %d\n", len);
+			return;
+		}
+
+		if (request.is_set)
+			answer.status = handle_set_cmd(&request);
+		else
+			answer.status = handle_get_cmd(&request);
+
+		len = write(1, &answer, sizeof(answer));
+		if (len != sizeof(answer)) {
+			dprintf(debug_fd, "reply failed %d\n", len);
+			return;
+		}
+	}
+}
+
+/*
+ * Helper entry point: open /dev/console for debug output, announce
+ * startup, serve requests in loop() until it returns, then close the
+ * descriptor. The result of open() is not checked before use.
+ */
+int main(void)
+{
+ /* NOTE(review): 00000002 is presumably O_RDWR spelled numerically - confirm */
+ debug_fd = open("/dev/console", 00000002);
+ dprintf(debug_fd, "Started bpfilter\n");
+ loop();
+ close(debug_fd);
+ return 0;
+}
diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
new file mode 100644
index 000000000000..98d121c62945
--- /dev/null
+++ b/net/bpfilter/msgfmt.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NET_BPFILTER_MSGFMT_H
+#define _NET_BPFILTER_MSGFMT_H
+
+/* Request message the kernel side writes to the helper's input pipe. */
+struct mbox_request {
+ __u64 addr; /* user-space optval pointer, cast to an integer */
+ __u32 len; /* optlen supplied with the request */
+ __u32 is_set; /* non-zero for a set request, zero for a get request */
+ __u32 cmd; /* socket option name */
+ __u32 pid; /* pid of the task that issued the request */
+};
+
+/* Reply message the helper writes back for each request. */
+struct mbox_reply {
+ __u32 status; /* result of the command handler: 0 or a negative errno value */
+};
+
+#endif
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 671d13c10f6f..b0a0b82e2d91 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -34,6 +34,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net_bridge_port *p;
struct net_bridge *br;
+ bool notified = false;
bool changed_addr;
int err;
@@ -67,7 +68,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
break;
case NETDEV_CHANGE:
- br_port_carrier_check(p);
+ br_port_carrier_check(p, &notified);
break;
case NETDEV_FEAT_CHANGE:
@@ -76,8 +77,10 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
case NETDEV_DOWN:
spin_lock_bh(&br->lock);
- if (br->dev->flags & IFF_UP)
+ if (br->dev->flags & IFF_UP) {
br_stp_disable_port(p);
+ notified = true;
+ }
spin_unlock_bh(&br->lock);
break;
@@ -85,6 +88,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
if (netif_running(br->dev) && netif_oper_up(dev)) {
spin_lock_bh(&br->lock);
br_stp_enable_port(p);
+ notified = true;
spin_unlock_bh(&br->lock);
}
break;
@@ -110,8 +114,8 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
}
/* Events that may cause spanning tree to refresh */
- if (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
- event == NETDEV_CHANGE || event == NETDEV_DOWN)
+ if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
+ event == NETDEV_CHANGE || event == NETDEV_DOWN))
br_ifinfo_notify(RTM_NEWLINK, NULL, p);
return NOTIFY_DONE;
@@ -141,7 +145,7 @@ static int br_switchdev_event(struct notifier_block *unused,
case SWITCHDEV_FDB_ADD_TO_BRIDGE:
fdb_info = ptr;
err = br_fdb_external_learn_add(br, p, fdb_info->addr,
- fdb_info->vid);
+ fdb_info->vid, false);
if (err) {
err = notifier_from_errno(err);
break;
@@ -152,7 +156,7 @@ static int br_switchdev_event(struct notifier_block *unused,
case SWITCHDEV_FDB_DEL_TO_BRIDGE:
fdb_info = ptr;
err = br_fdb_external_learn_del(br, p, fdb_info->addr,
- fdb_info->vid);
+ fdb_info->vid, false);
if (err)
err = notifier_from_errno(err);
break;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index d9e69e4514be..b19e3104afd6 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -40,7 +40,7 @@ static struct kmem_cache *br_fdb_cache __read_mostly;
static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr, u16 vid);
static void fdb_notify(struct net_bridge *br,
- const struct net_bridge_fdb_entry *, int);
+ const struct net_bridge_fdb_entry *, int, bool);
int __init br_fdb_init(void)
{
@@ -121,6 +121,28 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
return fdb;
}
+/* Look up the device behind the FDB entry for (addr, vid) on a bridge.
+ *
+ * Asserts that RTNL is held. Returns NULL when br_dev is not a bridge
+ * master, when no entry matches, or when the entry has no destination
+ * port; otherwise returns the destination port's net_device.
+ */
+struct net_device *br_fdb_find_port(const struct net_device *br_dev,
+				    const unsigned char *addr,
+				    __u16 vid)
+{
+	struct net_bridge_fdb_entry *entry;
+
+	ASSERT_RTNL();
+
+	if (!netif_is_bridge_master(br_dev))
+		return NULL;
+
+	entry = br_fdb_find(netdev_priv(br_dev), addr, vid);
+	if (!entry || !entry->dst)
+		return NULL;
+
+	return entry->dst->dev;
+}
+EXPORT_SYMBOL_GPL(br_fdb_find_port);
+
struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
const unsigned char *addr,
__u16 vid)
@@ -173,7 +195,8 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr)
}
}
-static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
+static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
+ bool swdev_notify)
{
trace_fdb_delete(br, f);
@@ -183,7 +206,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
hlist_del_init_rcu(&f->fdb_node);
rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode,
br_fdb_rht_params);
- fdb_notify(br, f, RTM_DELNEIGH);
+ fdb_notify(br, f, RTM_DELNEIGH, swdev_notify);
call_rcu(&f->rcu, fdb_rcu_free);
}
@@ -219,7 +242,7 @@ static void fdb_delete_local(struct net_bridge *br,
return;
}
- fdb_delete(br, f);
+ fdb_delete(br, f, true);
}
void br_fdb_find_delete_local(struct net_bridge *br,
@@ -334,7 +357,7 @@ void br_fdb_cleanup(struct work_struct *work)
} else {
spin_lock_bh(&br->hash_lock);
if (!hlist_unhashed(&f->fdb_node))
- fdb_delete(br, f);
+ fdb_delete(br, f, true);
spin_unlock_bh(&br->hash_lock);
}
}
@@ -354,7 +377,7 @@ void br_fdb_flush(struct net_bridge *br)
spin_lock_bh(&br->hash_lock);
hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
if (!f->is_static)
- fdb_delete(br, f);
+ fdb_delete(br, f, true);
}
spin_unlock_bh(&br->hash_lock);
}
@@ -383,7 +406,7 @@ void br_fdb_delete_by_port(struct net_bridge *br,
if (f->is_local)
fdb_delete_local(br, p, f);
else
- fdb_delete(br, f);
+ fdb_delete(br, f, true);
}
spin_unlock_bh(&br->hash_lock);
}
@@ -509,7 +532,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
return 0;
br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
source ? source->dev->name : br->dev->name, addr, vid);
- fdb_delete(br, fdb);
+ fdb_delete(br, fdb, true);
}
fdb = fdb_create(br, source, addr, vid, 1, 1);
@@ -517,7 +540,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
return -ENOMEM;
fdb_add_hw_addr(br, addr);
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
return 0;
}
@@ -572,7 +595,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
fdb->added_by_user = 1;
if (unlikely(fdb_modified)) {
trace_br_fdb_update(br, source, addr, vid, added_by_user);
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
}
} else {
@@ -583,7 +606,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
fdb->added_by_user = 1;
trace_br_fdb_update(br, source, addr, vid,
added_by_user);
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
/* else we lose race and someone else inserts
* it first, don't bother updating
@@ -665,13 +688,15 @@ static inline size_t fdb_nlmsg_size(void)
}
static void fdb_notify(struct net_bridge *br,
- const struct net_bridge_fdb_entry *fdb, int type)
+ const struct net_bridge_fdb_entry *fdb, int type,
+ bool swdev_notify)
{
struct net *net = dev_net(br->dev);
struct sk_buff *skb;
int err = -ENOBUFS;
- br_switchdev_fdb_notify(fdb, type);
+ if (swdev_notify)
+ br_switchdev_fdb_notify(fdb, type);
skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
@@ -810,7 +835,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
fdb->used = jiffies;
if (modified) {
fdb->updated = jiffies;
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
return 0;
@@ -834,7 +859,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
rcu_read_unlock();
local_bh_enable();
} else if (ndm->ndm_flags & NTF_EXT_LEARNED) {
- err = br_fdb_external_learn_add(br, p, addr, vid);
+ err = br_fdb_external_learn_add(br, p, addr, vid, true);
} else {
spin_lock_bh(&br->hash_lock);
err = fdb_add_entry(br, p, addr, ndm->ndm_state,
@@ -923,7 +948,7 @@ static int fdb_delete_by_addr_and_port(struct net_bridge *br,
if (!fdb || fdb->dst != p)
return -ENOENT;
- fdb_delete(br, fdb);
+ fdb_delete(br, fdb, true);
return 0;
}
@@ -1043,7 +1068,8 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
}
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid)
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify)
{
struct net_bridge_fdb_entry *fdb;
bool modified = false;
@@ -1061,7 +1087,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
goto err_unlock;
}
fdb->added_by_external_learn = 1;
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
} else {
fdb->updated = jiffies;
@@ -1080,7 +1106,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
}
if (modified)
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
}
err_unlock:
@@ -1090,7 +1116,8 @@ err_unlock:
}
int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid)
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify)
{
struct net_bridge_fdb_entry *fdb;
int err = 0;
@@ -1099,7 +1126,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
fdb = br_fdb_find(br, addr, vid);
if (fdb && fdb->added_by_external_learn)
- fdb_delete(br, fdb);
+ fdb_delete(br, fdb, swdev_notify);
else
err = -ENOENT;
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index b4eed113d2ec..9019f326fe81 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -30,7 +30,8 @@ static inline int should_deliver(const struct net_bridge_port *p,
vg = nbp_vlan_group_rcu(p);
return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING &&
- nbp_switchdev_allowed_egress(p, skb);
+ nbp_switchdev_allowed_egress(p, skb) &&
+ !br_skb_isolated(p, skb);
}
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -274,8 +275,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
struct net_bridge_port *port, *lport, *rport;
lport = p ? p->port : NULL;
- rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
- NULL;
+ rport = hlist_entry_safe(rp, struct net_bridge_port, rlist);
if ((unsigned long)lport > (unsigned long)rport) {
port = lport;
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 5bb6681fa91e..05e42d86882d 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -64,7 +64,7 @@ static int port_cost(struct net_device *dev)
/* Check for port carrier transitions. */
-void br_port_carrier_check(struct net_bridge_port *p)
+void br_port_carrier_check(struct net_bridge_port *p, bool *notified)
{
struct net_device *dev = p->dev;
struct net_bridge *br = p->br;
@@ -73,16 +73,21 @@ void br_port_carrier_check(struct net_bridge_port *p)
netif_running(dev) && netif_oper_up(dev))
p->path_cost = port_cost(dev);
+ *notified = false;
if (!netif_running(br->dev))
return;
spin_lock_bh(&br->lock);
if (netif_running(dev) && netif_oper_up(dev)) {
- if (p->state == BR_STATE_DISABLED)
+ if (p->state == BR_STATE_DISABLED) {
br_stp_enable_port(p);
+ *notified = true;
+ }
} else {
- if (p->state != BR_STATE_DISABLED)
+ if (p->state != BR_STATE_DISABLED) {
br_stp_disable_port(p);
+ *notified = true;
+ }
}
spin_unlock_bh(&br->lock);
}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7f98a7d25866..72074276c088 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -114,6 +114,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
goto drop;
BR_INPUT_SKB_CB(skb)->brdev = br->dev;
+ BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED);
if (IS_ENABLED(CONFIG_INET) &&
(skb->protocol == htons(ETH_P_ARP) ||
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 015f465c514b..9f5eb05b0373 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -139,6 +139,7 @@ static inline size_t br_port_info_size(void)
+ nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */
+ nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */
+ nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */
+ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */
+ nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
+ nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
+ nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */
@@ -213,7 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
BR_VLAN_TUNNEL)) ||
nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) ||
nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS,
- !!(p->flags & BR_NEIGH_SUPPRESS)))
+ !!(p->flags & BR_NEIGH_SUPPRESS)) ||
+ nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)))
return -EMSGSIZE;
timerval = br_timer_value(&p->message_age_timer);
@@ -660,6 +662,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
[IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 },
[IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
[IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
+ [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 },
};
/* Change the state of the port and notify spanning tree */
@@ -810,6 +813,10 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
if (err)
return err;
+ err = br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
+ if (err)
+ return err;
+
br_port_flags_change(p, old_flags ^ p->flags);
return 0;
}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a7cb3ece5031..5216a524b537 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -423,6 +423,7 @@ struct br_input_skb_cb {
#endif
bool proxyarp_replied;
+ bool src_port_isolated;
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
bool vlan_filtered;
@@ -553,9 +554,11 @@ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid);
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify);
int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid);
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify);
void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
const unsigned char *addr, u16 vid);
@@ -572,8 +575,16 @@ int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
void br_flood(struct net_bridge *br, struct sk_buff *skb,
enum br_pkt_type pkt_type, bool local_rcv, bool local_orig);
+/* An skb is isolated from port @to only when the port it arrived on was
+ * marked isolated (recorded in the skb control block) and @to itself has
+ * BR_ISOLATED set.
+ */
+static inline bool br_skb_isolated(const struct net_bridge_port *to,
+				   const struct sk_buff *skb)
+{
+	if (!BR_INPUT_SKB_CB(skb)->src_port_isolated)
+		return false;
+
+	return !!(to->flags & BR_ISOLATED);
+}
+
/* br_if.c */
-void br_port_carrier_check(struct net_bridge_port *p);
+void br_port_carrier_check(struct net_bridge_port *p, bool *notified);
int br_add_bridge(struct net *net, const char *name);
int br_del_bridge(struct net *net, const char *name);
int br_add_if(struct net_bridge *br, struct net_device *dev,
@@ -594,11 +605,22 @@ static inline bool br_rx_handler_check_rcu(const struct net_device *dev)
return rcu_dereference(dev->rx_handler) == br_handle_frame;
}
+/* Return true if dev's rx_handler, read with rcu_dereference_rtnl(), is
+ * br_handle_frame.
+ */
+static inline bool br_rx_handler_check_rtnl(const struct net_device *dev)
+{
+ return rcu_dereference_rtnl(dev->rx_handler) == br_handle_frame;
+}
+
static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev)
{
return br_rx_handler_check_rcu(dev) ? br_port_get_rcu(dev) : NULL;
}
+/* Return the bridge port attached to dev, or NULL when dev's rx_handler
+ * is not the bridge's br_handle_frame.
+ */
+static inline struct net_bridge_port *
+br_port_get_check_rtnl(const struct net_device *dev)
+{
+	if (!br_rx_handler_check_rtnl(dev))
+		return NULL;
+
+	return br_port_get_rtnl_rcu(dev);
+}
+
/* br_ioctl.c */
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
@@ -1117,6 +1139,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
unsigned long mask);
void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb,
int type);
+int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags);
+int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid);
static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
{
@@ -1146,6 +1170,17 @@ static inline int br_switchdev_set_port_flag(struct net_bridge_port *p,
return 0;
}
+/* Stub variant: always reports -EOPNOTSUPP.
+ * NOTE(review): presumably the build without switchdev support - confirm
+ * against the surrounding #ifdef, which is not visible here.
+ */
+static inline int br_switchdev_port_vlan_add(struct net_device *dev,
+ u16 vid, u16 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Stub variant: always reports -EOPNOTSUPP.
+ * NOTE(review): presumably the build without switchdev support - confirm
+ * against the surrounding #ifdef, which is not visible here.
+ */
+static inline int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
+{
+ return -EOPNOTSUPP;
+}
+
static inline void
br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
{
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index ee775f4ff76c..d77f807420c4 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -102,13 +102,15 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
static void
br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
- u16 vid, struct net_device *dev)
+ u16 vid, struct net_device *dev,
+ bool added_by_user)
{
struct switchdev_notifier_fdb_info info;
unsigned long notifier_type;
info.addr = mac;
info.vid = vid;
+ info.added_by_user = added_by_user;
notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE;
call_switchdev_notifiers(notifier_type, dev, &info.info);
}
@@ -116,19 +118,46 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
void
br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
{
- if (!fdb->added_by_user || !fdb->dst)
+ if (!fdb->dst)
return;
switch (type) {
case RTM_DELNEIGH:
br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
fdb->key.vlan_id,
- fdb->dst->dev);
+ fdb->dst->dev,
+ fdb->added_by_user);
break;
case RTM_NEWNEIGH:
br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
fdb->key.vlan_id,
- fdb->dst->dev);
+ fdb->dst->dev,
+ fdb->added_by_user);
break;
}
}
+
+int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags)
+{
+ struct switchdev_obj_port_vlan v = {
+ .obj.orig_dev = dev,
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .flags = flags,
+ .vid_begin = vid,
+ .vid_end = vid,
+ };
+
+ return switchdev_port_obj_add(dev, &v.obj);
+}
+
+int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
+{
+ struct switchdev_obj_port_vlan v = {
+ .obj.orig_dev = dev,
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .vid_begin = vid,
+ .vid_end = vid,
+ };
+
+ return switchdev_port_obj_del(dev, &v.obj);
+}
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index fd31ad83ec7b..f99c5bf5c906 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -192,6 +192,7 @@ BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD);
BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD);
BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS);
+BRPORT_ATTR_FLAG(isolated, BR_ISOLATED);
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -243,6 +244,7 @@ static const struct brport_attribute *brport_attrs[] = {
&brport_attr_broadcast_flood,
&brport_attr_group_fwd_mask,
&brport_attr_neigh_suppress,
+ &brport_attr_isolated,
NULL
};
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 9896f4975353..7df269092103 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -82,19 +82,12 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags)
static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
u16 vid, u16 flags)
{
- struct switchdev_obj_port_vlan v = {
- .obj.orig_dev = dev,
- .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
- .flags = flags,
- .vid_begin = vid,
- .vid_end = vid,
- };
int err;
/* Try switchdev op first. In case it is not supported, fallback to
* 8021q add.
*/
- err = switchdev_port_obj_add(dev, &v.obj);
+ err = br_switchdev_port_vlan_add(dev, vid, flags);
if (err == -EOPNOTSUPP)
return vlan_vid_add(dev, br->vlan_proto, vid);
return err;
@@ -130,18 +123,12 @@ static void __vlan_del_list(struct net_bridge_vlan *v)
static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
u16 vid)
{
- struct switchdev_obj_port_vlan v = {
- .obj.orig_dev = dev,
- .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
- .vid_begin = vid,
- .vid_end = vid,
- };
int err;
/* Try switchdev op first. In case it is not supported, fallback to
* 8021q del.
*/
- err = switchdev_port_obj_del(dev, &v.obj);
+ err = br_switchdev_port_vlan_del(dev, vid);
if (err == -EOPNOTSUPP) {
vlan_vid_del(dev, br->vlan_proto, vid);
return 0;
@@ -259,6 +246,10 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
goto out_filt;
v->brvlan = masterv;
v->stats = masterv->stats;
+ } else {
+ err = br_switchdev_port_vlan_add(dev, v->vid, flags);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
}
/* Add the dev mac and count the vlan only if it's usable */
@@ -294,6 +285,8 @@ out_filt:
br_vlan_put_master(masterv);
v->brvlan = NULL;
}
+ } else {
+ br_switchdev_port_vlan_del(dev, v->vid);
}
goto out;
@@ -319,6 +312,11 @@ static int __vlan_del(struct net_bridge_vlan *v)
err = __vlan_vid_del(p->dev, p->br, v->vid);
if (err)
goto out;
+ } else {
+ err = br_switchdev_port_vlan_del(v->br->dev, v->vid);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ err = 0;
}
if (br_vlan_should_use(v)) {
@@ -564,6 +562,48 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
return false;
}
+static int br_vlan_add_existing(struct net_bridge *br,
+ struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan,
+ u16 flags, bool *changed)
+{
+ int err;
+
+ err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ if (!br_vlan_is_brentry(vlan)) {
+ /* Trying to change flags of non-existent bridge vlan */
+ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) {
+ err = -EINVAL;
+ goto err_flags;
+ }
+ /* It was only kept for port vlans, now make it real */
+ err = br_fdb_insert(br, NULL, br->dev->dev_addr,
+ vlan->vid);
+ if (err) {
+ br_err(br, "failed to insert local address into bridge forwarding table\n");
+ goto err_fdb_insert;
+ }
+
+ refcount_inc(&vlan->refcnt);
+ vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY;
+ vg->num_vlans++;
+ *changed = true;
+ }
+
+ if (__vlan_add_flags(vlan, flags))
+ *changed = true;
+
+ return 0;
+
+err_fdb_insert:
+err_flags:
+ br_switchdev_port_vlan_del(br->dev, vlan->vid);
+ return err;
+}
+
/* Must be protected by RTNL.
* Must be called with vid in range from 1 to 4094 inclusive.
* changed must be true only if the vlan was created or updated
@@ -579,28 +619,8 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed)
*changed = false;
vg = br_vlan_group(br);
vlan = br_vlan_find(vg, vid);
- if (vlan) {
- if (!br_vlan_is_brentry(vlan)) {
- /* Trying to change flags of non-existent bridge vlan */
- if (!(flags & BRIDGE_VLAN_INFO_BRENTRY))
- return -EINVAL;
- /* It was only kept for port vlans, now make it real */
- ret = br_fdb_insert(br, NULL, br->dev->dev_addr,
- vlan->vid);
- if (ret) {
- br_err(br, "failed insert local address into bridge forwarding table\n");
- return ret;
- }
- refcount_inc(&vlan->refcnt);
- vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY;
- vg->num_vlans++;
- *changed = true;
- }
- if (__vlan_add_flags(vlan, flags))
- *changed = true;
-
- return 0;
- }
+ if (vlan)
+ return br_vlan_add_existing(br, vg, vlan, flags, changed);
vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
if (!vlan)
@@ -1053,13 +1073,6 @@ err_vlan_enabled:
int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
bool *changed)
{
- struct switchdev_obj_port_vlan v = {
- .obj.orig_dev = port->dev,
- .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
- .flags = flags,
- .vid_begin = vid,
- .vid_end = vid,
- };
struct net_bridge_vlan *vlan;
int ret;
@@ -1069,7 +1082,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
vlan = br_vlan_find(nbp_vlan_group(port), vid);
if (vlan) {
/* Pass the flags to the hardware bridge */
- ret = switchdev_port_obj_add(port->dev, &v.obj);
+ ret = br_switchdev_port_vlan_add(port->dev, vid, flags);
if (ret && ret != -EOPNOTSUPP)
return ret;
*changed = __vlan_add_flags(vlan, flags);
@@ -1149,3 +1162,44 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
stats->tx_packets += txpackets;
}
}
+
+int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+{
+ struct net_bridge_vlan_group *vg;
+
+ ASSERT_RTNL();
+ if (netif_is_bridge_master(dev))
+ vg = br_vlan_group(netdev_priv(dev));
+ else
+ return -EINVAL;
+
+ *p_pvid = br_get_pvid(vg);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_pvid);
+
+int br_vlan_get_info(const struct net_device *dev, u16 vid,
+ struct bridge_vlan_info *p_vinfo)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+ p = br_port_get_check_rtnl(dev);
+ if (p)
+ vg = nbp_vlan_group(p);
+ else if (netif_is_bridge_master(dev))
+ vg = br_vlan_group(netdev_priv(dev));
+ else
+ return -EINVAL;
+
+ v = br_vlan_find(vg, vid);
+ if (!v)
+ return -ENOENT;
+
+ p_vinfo->vid = vid;
+ p_vinfo->flags = v->flags;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_info);
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index f212447794bd..9a0159aebe1a 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -8,13 +8,6 @@ menuconfig NF_TABLES_BRIDGE
bool "Ethernet Bridge nf_tables support"
if NF_TABLES_BRIDGE
-
-config NFT_BRIDGE_META
- tristate "Netfilter nf_table bridge meta support"
- depends on NFT_META
- help
- Add support for bridge dedicated meta key.
-
config NFT_BRIDGE_REJECT
tristate "Netfilter nf_tables bridge reject support"
depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 4bc758dd4a8c..9b868861f21a 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -3,7 +3,6 @@
# Makefile for the netfilter modules for Link Layer filtering on a bridge.
#
-obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o
obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o
# packet logging
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 0e27c51331fb..28f68a2ec911 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -101,7 +101,7 @@ ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb,
{
par->match = m->u.match;
par->matchinfo = m->data;
- return m->u.match->match(skb, par) ? EBT_MATCH : EBT_NOMATCH;
+ return !m->u.match->match(skb, par);
}
static inline int
@@ -177,6 +177,12 @@ struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry)
return (void *)entry + entry->next_offset;
}
+static inline const struct ebt_entry_target *
+ebt_get_target_c(const struct ebt_entry *e)
+{
+ return ebt_get_target((struct ebt_entry *)e);
+}
+
/* Do some firewalling */
unsigned int ebt_do_table(struct sk_buff *skb,
const struct nf_hook_state *state,
@@ -230,8 +236,7 @@ unsigned int ebt_do_table(struct sk_buff *skb,
*/
EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar);
- t = (struct ebt_entry_target *)
- (((char *)point) + point->target_offset);
+ t = ebt_get_target_c(point);
/* standard target */
if (!t->u.target->target)
verdict = ((struct ebt_standard_target *)t)->verdict;
@@ -343,6 +348,16 @@ find_table_lock(struct net *net, const char *name, int *error,
"ebtable_", error, mutex);
}
+static inline void ebt_free_table_info(struct ebt_table_info *info)
+{
+ int i;
+
+ if (info->chainstack) {
+ for_each_possible_cpu(i)
+ vfree(info->chainstack[i]);
+ vfree(info->chainstack);
+ }
+}
static inline int
ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
unsigned int *cnt)
@@ -627,7 +642,7 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt)
return 1;
EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL);
EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL);
- t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+ t = ebt_get_target(e);
par.net = net;
par.target = t->u.target;
@@ -706,7 +721,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j);
if (ret != 0)
goto cleanup_watchers;
- t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+ t = ebt_get_target(e);
gap = e->next_offset - e->target_offset;
target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0);
@@ -779,8 +794,7 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack
if (pos == nentries)
continue;
}
- t = (struct ebt_entry_target *)
- (((char *)e) + e->target_offset);
+ t = ebt_get_target_c(e);
if (strcmp(t->u.name, EBT_STANDARD_TARGET))
goto letscontinue;
if (e->target_offset + sizeof(struct ebt_standard_target) >
@@ -975,7 +989,7 @@ static void get_counters(const struct ebt_counter *oldcounters,
static int do_replace_finish(struct net *net, struct ebt_replace *repl,
struct ebt_table_info *newinfo)
{
- int ret, i;
+ int ret;
struct ebt_counter *counterstmp = NULL;
/* used to be able to unlock earlier */
struct ebt_table_info *table;
@@ -1051,13 +1065,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
ebt_cleanup_entry, net, NULL);
vfree(table->entries);
- if (table->chainstack) {
- for_each_possible_cpu(i)
- vfree(table->chainstack[i]);
- vfree(table->chainstack);
- }
+ ebt_free_table_info(table);
vfree(table);
-
vfree(counterstmp);
#ifdef CONFIG_AUDIT
@@ -1078,11 +1087,7 @@ free_iterate:
free_counterstmp:
vfree(counterstmp);
/* can be initialized in translate_table() */
- if (newinfo->chainstack) {
- for_each_possible_cpu(i)
- vfree(newinfo->chainstack[i]);
- vfree(newinfo->chainstack);
- }
+ ebt_free_table_info(newinfo);
return ret;
}
@@ -1147,8 +1152,6 @@ free_newinfo:
static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
{
- int i;
-
mutex_lock(&ebt_mutex);
list_del(&table->list);
mutex_unlock(&ebt_mutex);
@@ -1157,11 +1160,7 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
if (table->private->nentries)
module_put(table->me);
vfree(table->private->entries);
- if (table->private->chainstack) {
- for_each_possible_cpu(i)
- vfree(table->private->chainstack[i]);
- vfree(table->private->chainstack);
- }
+ ebt_free_table_info(table->private);
vfree(table->private);
kfree(table);
}
@@ -1263,11 +1262,7 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table,
free_unlock:
mutex_unlock(&ebt_mutex);
free_chainstack:
- if (newinfo->chainstack) {
- for_each_possible_cpu(i)
- vfree(newinfo->chainstack[i]);
- vfree(newinfo->chainstack);
- }
+ ebt_free_table_info(newinfo);
vfree(newinfo->entries);
free_newinfo:
vfree(newinfo);
@@ -1405,7 +1400,7 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
return -EFAULT;
hlp = ubase + (((char *)e + e->target_offset) - base);
- t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+ t = ebt_get_target_c(e);
ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase);
if (ret != 0)
@@ -1746,7 +1741,7 @@ static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr,
return ret;
target_offset = e->target_offset - (origsize - *size);
- t = (struct ebt_entry_target *) ((char *) e + e->target_offset);
+ t = ebt_get_target(e);
ret = compat_target_to_user(t, dstptr, size);
if (ret)
@@ -1794,7 +1789,7 @@ static int compat_calc_entry(const struct ebt_entry *e,
EBT_MATCH_ITERATE(e, compat_calc_match, &off);
EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off);
- t = (const struct ebt_entry_target *) ((char *) e + e->target_offset);
+ t = ebt_get_target_c(e);
off += xt_compat_target_offset(t->u.target);
off += ebt_compat_entry_padsize();
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
deleted file mode 100644
index bb63c9aed55d..000000000000
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2014 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nft_meta.h>
-
-#include "../br_private.h"
-
-static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- const struct nft_meta *priv = nft_expr_priv(expr);
- const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
- u32 *dest = &regs->data[priv->dreg];
- const struct net_bridge_port *p;
-
- switch (priv->key) {
- case NFT_META_BRI_IIFNAME:
- if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
- goto err;
- break;
- case NFT_META_BRI_OIFNAME:
- if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
- goto err;
- break;
- default:
- goto out;
- }
-
- strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
- return;
-out:
- return nft_meta_get_eval(expr, regs, pkt);
-err:
- regs->verdict.code = NFT_BREAK;
-}
-
-static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
-{
- struct nft_meta *priv = nft_expr_priv(expr);
- unsigned int len;
-
- priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
- switch (priv->key) {
- case NFT_META_BRI_IIFNAME:
- case NFT_META_BRI_OIFNAME:
- len = IFNAMSIZ;
- break;
- default:
- return nft_meta_get_init(ctx, expr, tb);
- }
-
- priv->dreg = nft_parse_register(tb[NFTA_META_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
-}
-
-static struct nft_expr_type nft_meta_bridge_type;
-static const struct nft_expr_ops nft_meta_bridge_get_ops = {
- .type = &nft_meta_bridge_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
- .eval = nft_meta_bridge_get_eval,
- .init = nft_meta_bridge_get_init,
- .dump = nft_meta_get_dump,
-};
-
-static const struct nft_expr_ops nft_meta_bridge_set_ops = {
- .type = &nft_meta_bridge_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
- .eval = nft_meta_set_eval,
- .init = nft_meta_set_init,
- .destroy = nft_meta_set_destroy,
- .dump = nft_meta_set_dump,
- .validate = nft_meta_set_validate,
-};
-
-static const struct nft_expr_ops *
-nft_meta_bridge_select_ops(const struct nft_ctx *ctx,
- const struct nlattr * const tb[])
-{
- if (tb[NFTA_META_KEY] == NULL)
- return ERR_PTR(-EINVAL);
-
- if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
- return ERR_PTR(-EINVAL);
-
- if (tb[NFTA_META_DREG])
- return &nft_meta_bridge_get_ops;
-
- if (tb[NFTA_META_SREG])
- return &nft_meta_bridge_set_ops;
-
- return ERR_PTR(-EINVAL);
-}
-
-static struct nft_expr_type nft_meta_bridge_type __read_mostly = {
- .family = NFPROTO_BRIDGE,
- .name = "meta",
- .select_ops = nft_meta_bridge_select_ops,
- .policy = nft_meta_policy,
- .maxattr = NFTA_META_MAX,
- .owner = THIS_MODULE,
-};
-
-static int __init nft_meta_bridge_module_init(void)
-{
- return nft_register_expr(&nft_meta_bridge_type);
-}
-
-static void __exit nft_meta_bridge_module_exit(void)
-{
- nft_unregister_expr(&nft_meta_bridge_type);
-}
-
-module_init(nft_meta_bridge_module_init);
-module_exit(nft_meta_bridge_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
-MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta");
diff --git a/net/core/Makefile b/net/core/Makefile
index 6dbbba8c57ae..80175e6a2eb8 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -14,6 +14,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
fib_notifier.o xdp.o
obj-y += net-sysfs.o
+obj-$(CONFIG_PAGE_POOL) += page_pool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
@@ -30,3 +31,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
+obj-$(CONFIG_FAILOVER) += failover.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 9c149238a4ce..6e18242a1cae 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1285,6 +1285,7 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
return len;
}
+EXPORT_SYMBOL(dev_set_alias);
/**
* dev_get_alias - get ifalias of a device
@@ -1586,7 +1587,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
- };
+ }
#undef N
return "UNKNOWN_NETDEV_EVENT";
}
@@ -1754,38 +1755,38 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
EXPORT_SYMBOL(call_netdevice_notifiers);
#ifdef CONFIG_NET_INGRESS
-static struct static_key ingress_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
void net_inc_ingress_queue(void)
{
- static_key_slow_inc(&ingress_needed);
+ static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
void net_dec_ingress_queue(void)
{
- static_key_slow_dec(&ingress_needed);
+ static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
#ifdef CONFIG_NET_EGRESS
-static struct static_key egress_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
void net_inc_egress_queue(void)
{
- static_key_slow_inc(&egress_needed);
+ static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);
void net_dec_egress_queue(void)
{
- static_key_slow_dec(&egress_needed);
+ static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
-static struct static_key netstamp_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
#ifdef HAVE_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
@@ -1796,9 +1797,9 @@ static void netstamp_clear(struct work_struct *work)
wanted = atomic_add_return(deferred, &netstamp_wanted);
if (wanted > 0)
- static_key_enable(&netstamp_needed);
+ static_branch_enable(&netstamp_needed_key);
else
- static_key_disable(&netstamp_needed);
+ static_branch_disable(&netstamp_needed_key);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
@@ -1818,7 +1819,7 @@ void net_enable_timestamp(void)
atomic_inc(&netstamp_needed_deferred);
schedule_work(&netstamp_work);
#else
- static_key_slow_inc(&netstamp_needed);
+ static_branch_inc(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);
@@ -1838,7 +1839,7 @@ void net_disable_timestamp(void)
atomic_dec(&netstamp_needed_deferred);
schedule_work(&netstamp_work);
#else
- static_key_slow_dec(&netstamp_needed);
+ static_branch_dec(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
@@ -1846,15 +1847,15 @@ EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
skb->tstamp = 0;
- if (static_key_false(&netstamp_needed))
+ if (static_branch_unlikely(&netstamp_needed_key))
__net_timestamp(skb);
}
-#define net_timestamp_check(COND, SKB) \
- if (static_key_false(&netstamp_needed)) { \
- if ((COND) && !(SKB)->tstamp) \
- __net_timestamp(SKB); \
- } \
+#define net_timestamp_check(COND, SKB) \
+ if (static_branch_unlikely(&netstamp_needed_key)) { \
+ if ((COND) && !(SKB)->tstamp) \
+ __net_timestamp(SKB); \
+ } \
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
@@ -2614,17 +2615,16 @@ EXPORT_SYMBOL(netif_device_attach);
* Returns a Tx hash based on the given packet descriptor a Tx queues' number
* to be used as a distribution range.
*/
-u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
- unsigned int num_tx_queues)
+static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
{
u32 hash;
u16 qoffset = 0;
- u16 qcount = num_tx_queues;
+ u16 qcount = dev->real_num_tx_queues;
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
- while (unlikely(hash >= num_tx_queues))
- hash -= num_tx_queues;
+ while (unlikely(hash >= qcount))
+ hash -= qcount;
return hash;
}
@@ -2637,7 +2637,6 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
-EXPORT_SYMBOL(__skb_tx_hash);
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
@@ -3095,6 +3094,10 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
if (unlikely(!skb))
goto out_null;
+ skb = sk_validate_xmit_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
+
if (netif_needs_gso(skb, features)) {
struct sk_buff *segs;
@@ -3223,7 +3226,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_DROP;
} else {
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
- __qdisc_run(q);
+ qdisc_run(q);
}
if (unlikely(to_free))
@@ -3511,7 +3514,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
#ifdef CONFIG_NET_CLS_ACT
skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
- if (static_key_false(&egress_needed)) {
+ if (static_branch_unlikely(&egress_needed_key)) {
skb = sch_handle_egress(skb, &rc, dev);
if (!skb)
goto out;
@@ -3606,6 +3609,44 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
+int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+{
+ struct net_device *dev = skb->dev;
+ struct sk_buff *orig_skb = skb;
+ struct netdev_queue *txq;
+ int ret = NETDEV_TX_BUSY;
+ bool again = false;
+
+ if (unlikely(!netif_running(dev) ||
+ !netif_carrier_ok(dev)))
+ goto drop;
+
+ skb = validate_xmit_skb_list(skb, dev, &again);
+ if (skb != orig_skb)
+ goto drop;
+
+ skb_set_queue_mapping(skb, queue_id);
+ txq = skb_get_tx_queue(dev, skb);
+
+ local_bh_disable();
+
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (!netif_xmit_frozen_or_drv_stopped(txq))
+ ret = netdev_start_xmit(skb, dev, txq, false);
+ HARD_TX_UNLOCK(dev, txq);
+
+ local_bh_enable();
+
+ if (!dev_xmit_complete(ret))
+ kfree_skb(skb);
+
+ return ret;
+drop:
+ atomic_long_inc(&dev->tx_dropped);
+ kfree_skb_list(skb);
+ return NET_XMIT_DROP;
+}
+EXPORT_SYMBOL(dev_direct_xmit);
/*************************************************************************
* Receiver routines
@@ -3975,12 +4016,12 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
}
static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+ struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct netdev_rx_queue *rxqueue;
+ void *orig_data, *orig_data_end;
u32 metalen, act = XDP_DROP;
- struct xdp_buff xdp;
- void *orig_data;
int hlen, off;
u32 mac_len;
@@ -4015,31 +4056,42 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
*/
mac_len = skb->data - skb_mac_header(skb);
hlen = skb_headlen(skb) + mac_len;
- xdp.data = skb->data - mac_len;
- xdp.data_meta = xdp.data;
- xdp.data_end = xdp.data + hlen;
- xdp.data_hard_start = skb->data - skb_headroom(skb);
- orig_data = xdp.data;
+ xdp->data = skb->data - mac_len;
+ xdp->data_meta = xdp->data;
+ xdp->data_end = xdp->data + hlen;
+ xdp->data_hard_start = skb->data - skb_headroom(skb);
+ orig_data_end = xdp->data_end;
+ orig_data = xdp->data;
rxqueue = netif_get_rxqueue(skb);
- xdp.rxq = &rxqueue->xdp_rxq;
+ xdp->rxq = &rxqueue->xdp_rxq;
- act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
- off = xdp.data - orig_data;
+ off = xdp->data - orig_data;
if (off > 0)
__skb_pull(skb, off);
else if (off < 0)
__skb_push(skb, -off);
skb->mac_header += off;
+ /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
+ * pckt.
+ */
+ off = orig_data_end - xdp->data_end;
+ if (off != 0) {
+ skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
+ skb->len -= off;
+
+ }
+
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
__skb_push(skb, mac_len);
break;
case XDP_PASS:
- metalen = xdp.data - xdp.data_meta;
+ metalen = xdp->data - xdp->data_meta;
if (metalen)
skb_metadata_set(skb, metalen);
break;
@@ -4084,22 +4136,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
}
EXPORT_SYMBOL_GPL(generic_xdp_tx);
-static struct static_key generic_xdp_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
{
if (xdp_prog) {
- u32 act = netif_receive_generic_xdp(skb, xdp_prog);
+ struct xdp_buff xdp;
+ u32 act;
int err;
+ act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
if (act != XDP_PASS) {
switch (act) {
case XDP_REDIRECT:
err = xdp_do_generic_redirect(skb->dev, skb,
- xdp_prog);
+ &xdp, xdp_prog);
if (err)
goto out_redir;
- /* fallthru to submit skb */
+ break;
case XDP_TX:
generic_xdp_tx(skb, xdp_prog);
break;
@@ -4122,7 +4176,7 @@ static int netif_rx_internal(struct sk_buff *skb)
trace_netif_rx(skb);
- if (static_key_false(&generic_xdp_needed)) {
+ if (static_branch_unlikely(&generic_xdp_needed_key)) {
int ret;
preempt_disable();
@@ -4494,7 +4548,7 @@ another_round:
skip_taps:
#ifdef CONFIG_NET_INGRESS
- if (static_key_false(&ingress_needed)) {
+ if (static_branch_unlikely(&ingress_needed_key)) {
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
@@ -4654,9 +4708,9 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
bpf_prog_put(old);
if (old && !new) {
- static_key_slow_dec(&generic_xdp_needed);
+ static_branch_dec(&generic_xdp_needed_key);
} else if (new && !old) {
- static_key_slow_inc(&generic_xdp_needed);
+ static_branch_inc(&generic_xdp_needed_key);
dev_disable_lro(dev);
dev_disable_gro_hw(dev);
}
@@ -4684,7 +4738,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
- if (static_key_false(&generic_xdp_needed)) {
+ if (static_branch_unlikely(&generic_xdp_needed_key)) {
int ret;
preempt_disable();
@@ -7852,6 +7906,8 @@ int register_netdevice(struct net_device *dev)
int ret;
struct net *net = dev_net(dev);
+ BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
+ NETDEV_FEATURE_COUNT);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
diff --git a/net/core/devlink.c b/net/core/devlink.c
index ad1317376798..22099705cc41 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -453,6 +453,27 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
+static int devlink_nl_port_attrs_put(struct sk_buff *msg,
+ struct devlink_port *devlink_port)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ if (!attrs->set)
+ return 0;
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
+ return -EMSGSIZE;
+ if (!attrs->split)
+ return 0;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
+ attrs->split_subport_number))
+ return -EMSGSIZE;
+ return 0;
+}
+
static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
struct devlink_port *devlink_port,
enum devlink_command cmd, u32 portid,
@@ -492,9 +513,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
ibdev->name))
goto nla_put_failure;
}
- if (devlink_port->split &&
- nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
- devlink_port->split_group))
+ if (devlink_nl_port_attrs_put(msg, devlink_port))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -683,12 +702,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
return 0;
}
-static int devlink_port_split(struct devlink *devlink,
- u32 port_index, u32 count)
+static int devlink_port_split(struct devlink *devlink, u32 port_index,
+ u32 count, struct netlink_ext_ack *extack)
{
if (devlink->ops && devlink->ops->port_split)
- return devlink->ops->port_split(devlink, port_index, count);
+ return devlink->ops->port_split(devlink, port_index, count,
+ extack);
return -EOPNOTSUPP;
}
@@ -705,14 +725,15 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
- return devlink_port_split(devlink, port_index, count);
+ return devlink_port_split(devlink, port_index, count, info->extack);
}
-static int devlink_port_unsplit(struct devlink *devlink, u32 port_index)
+static int devlink_port_unsplit(struct devlink *devlink, u32 port_index,
+ struct netlink_ext_ack *extack)
{
if (devlink->ops && devlink->ops->port_unsplit)
- return devlink->ops->port_unsplit(devlink, port_index);
+ return devlink->ops->port_unsplit(devlink, port_index, extack);
return -EOPNOTSUPP;
}
@@ -726,7 +747,7 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
return -EINVAL;
port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
- return devlink_port_unsplit(devlink, port_index);
+ return devlink_port_unsplit(devlink, port_index, info->extack);
}
static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
@@ -1807,7 +1828,6 @@ send_done:
nla_put_failure:
err = -EMSGSIZE;
err_table_put:
- genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
}
@@ -2013,7 +2033,6 @@ int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)
return 0;
nla_put_failure:
- genlmsg_cancel(dump_ctx->skb, dump_ctx->hdr);
nlmsg_free(dump_ctx->skb);
return -EMSGSIZE;
}
@@ -2230,7 +2249,6 @@ send_done:
nla_put_failure:
err = -EMSGSIZE;
err_table_put:
- genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
}
@@ -2532,7 +2550,6 @@ nla_put_failure:
err = -EMSGSIZE;
err_resource_put:
err_skb_send_alloc:
- genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
}
@@ -2584,7 +2601,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
return err;
}
- return devlink->ops->reload(devlink);
+ return devlink->ops->reload(devlink, info->extack);
}
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
@@ -2737,7 +2754,8 @@ static const struct genl_ops devlink_nl_ops[] = {
.doit = devlink_nl_cmd_eswitch_set_doit,
.policy = devlink_nl_policy,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NO_LOCK,
},
{
.cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
@@ -2971,19 +2989,64 @@ void devlink_port_type_clear(struct devlink_port *devlink_port)
EXPORT_SYMBOL_GPL(devlink_port_type_clear);
/**
- * devlink_port_split_set - Set port is split
+ * devlink_port_attrs_set - Set port attributes
*
* @devlink_port: devlink port
- * @split_group: split group - identifies group split port is part of
+ * @flavour: flavour of the port
+ * @port_number: number of the port that is facing user, for example
+ * the front panel port number
+ * @split: indicates if this is a split port
+ * @split_subport_number: if the port is split, this is the number
+ * of the subport.
*/
-void devlink_port_split_set(struct devlink_port *devlink_port,
- u32 split_group)
-{
- devlink_port->split = true;
- devlink_port->split_group = split_group;
+void devlink_port_attrs_set(struct devlink_port *devlink_port,
+ enum devlink_port_flavour flavour,
+ u32 port_number, bool split,
+ u32 split_subport_number)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ attrs->set = true;
+ attrs->flavour = flavour;
+ attrs->port_number = port_number;
+ attrs->split = split;
+ attrs->split_subport_number = split_subport_number;
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
}
-EXPORT_SYMBOL_GPL(devlink_port_split_set);
+EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
+
+int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
+ char *name, size_t len)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int n = 0;
+
+ if (!attrs->set)
+ return -EOPNOTSUPP;
+
+ switch (attrs->flavour) {
+ case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+ if (!attrs->split)
+ n = snprintf(name, len, "p%u", attrs->port_number);
+ else
+ n = snprintf(name, len, "p%us%u", attrs->port_number,
+ attrs->split_subport_number);
+ break;
+ case DEVLINK_PORT_FLAVOUR_CPU:
+ case DEVLINK_PORT_FLAVOUR_DSA:
+ /* As CPU and DSA ports do not have a netdevice associated,
+ * this case should not ever happen.
+ */
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (n >= len)
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name);
int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
u32 size, u16 ingress_pools_count,
diff --git a/net/core/dst.c b/net/core/dst.c
index 007aa0b08291..2d9b37f8944a 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -58,6 +58,7 @@ const struct dst_metrics dst_default_metrics = {
*/
.refcnt = REFCOUNT_INIT(1),
};
+EXPORT_SYMBOL(dst_default_metrics);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
struct net_device *dev, int initial_ref, int initial_obsolete,
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index ba02f0dfe85c..c15075dc7572 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -92,6 +92,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
[NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
[NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
+ [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
@@ -109,6 +110,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload",
[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
[NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
+ [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
};
static const char
@@ -210,23 +212,6 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
return ret;
}
-static int phy_get_sset_count(struct phy_device *phydev)
-{
- int ret;
-
- if (phydev->drv->get_sset_count &&
- phydev->drv->get_strings &&
- phydev->drv->get_stats) {
- mutex_lock(&phydev->lock);
- ret = phydev->drv->get_sset_count(phydev);
- mutex_unlock(&phydev->lock);
-
- return ret;
- }
-
- return -EOPNOTSUPP;
-}
-
static int __ethtool_get_sset_count(struct net_device *dev, int sset)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -243,12 +228,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_PHY_TUNABLES)
return ARRAY_SIZE(phy_tunable_strings);
- if (sset == ETH_SS_PHY_STATS) {
- if (dev->phydev)
- return phy_get_sset_count(dev->phydev);
- else
- return -EOPNOTSUPP;
- }
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ return phy_ethtool_get_sset_count(dev->phydev);
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
@@ -271,17 +253,10 @@ static void __ethtool_get_strings(struct net_device *dev,
memcpy(data, tunable_strings, sizeof(tunable_strings));
else if (stringset == ETH_SS_PHY_TUNABLES)
memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
- else if (stringset == ETH_SS_PHY_STATS) {
- struct phy_device *phydev = dev->phydev;
-
- if (phydev) {
- mutex_lock(&phydev->lock);
- phydev->drv->get_strings(phydev, data);
- mutex_unlock(&phydev->lock);
- } else {
- return;
- }
- } else
+ else if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ phy_ethtool_get_strings(dev->phydev, data);
+ else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
}
@@ -1998,15 +1973,19 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_stats stats;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
struct phy_device *phydev = dev->phydev;
+ struct ethtool_stats stats;
u64 *data;
int ret, n_stats;
- if (!phydev)
+ if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count))
return -EOPNOTSUPP;
- n_stats = phy_get_sset_count(phydev);
+ if (dev->phydev && !ops->get_ethtool_phy_stats)
+ n_stats = phy_ethtool_get_sset_count(dev->phydev);
+ else
+ n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
if (n_stats < 0)
return n_stats;
if (n_stats > S32_MAX / sizeof(u64))
@@ -2021,9 +2000,13 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
if (n_stats && !data)
return -ENOMEM;
- mutex_lock(&phydev->lock);
- phydev->drv->get_stats(phydev, &stats, data);
- mutex_unlock(&phydev->lock);
+ if (dev->phydev && !ops->get_ethtool_phy_stats) {
+ ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
+ if (ret < 0)
+ return ret;
+ } else {
+ ops->get_ethtool_phy_stats(dev, &stats, data);
+ }
ret = -EFAULT;
if (copy_to_user(useraddr, &stats, sizeof(stats)))
diff --git a/net/core/failover.c b/net/core/failover.c
new file mode 100644
index 000000000000..4a92a98ccce9
--- /dev/null
+++ b/net/core/failover.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* A common module to handle registrations and notifications for paravirtual
+ * drivers to enable accelerated datapath and support VF live migration.
+ *
+ * The notifier and event handling code is based on netvsc driver.
+ */
+
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <uapi/linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <net/failover.h>
+
+static LIST_HEAD(failover_list);
+static DEFINE_SPINLOCK(failover_lock);
+
+static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
+{
+ struct net_device *failover_dev;
+ struct failover *failover;
+
+ spin_lock(&failover_lock);
+ list_for_each_entry(failover, &failover_list, list) {
+ failover_dev = rtnl_dereference(failover->failover_dev);
+ if (ether_addr_equal(failover_dev->perm_addr, mac)) {
+ *ops = rtnl_dereference(failover->ops);
+ spin_unlock(&failover_lock);
+ return failover_dev;
+ }
+ }
+ spin_unlock(&failover_lock);
+ return NULL;
+}
+
+/**
+ * failover_slave_register - Register a slave netdev
+ *
+ * @slave_dev: slave netdev that is being registered
+ *
+ * Registers a slave device to a failover instance. Only ethernet devices
+ * are supported.
+ */
+static int failover_slave_register(struct net_device *slave_dev)
+{
+ struct netdev_lag_upper_info lag_upper_info;
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+ int err;
+
+ if (slave_dev->type != ARPHRD_ETHER)
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (fops && fops->slave_pre_register &&
+ fops->slave_pre_register(slave_dev, failover_dev))
+ goto done;
+
+ err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame,
+ failover_dev);
+ if (err) {
+ netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
+ err);
+ goto done;
+ }
+
+ lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
+ err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL,
+ &lag_upper_info, NULL);
+ if (err) {
+ netdev_err(slave_dev, "can not set failover device %s (err = %d)\n",
+ failover_dev->name, err);
+ goto err_upper_link;
+ }
+
+ slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
+
+ if (fops && fops->slave_register &&
+ !fops->slave_register(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+ netdev_upper_dev_unlink(slave_dev, failover_dev);
+ slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+err_upper_link:
+ netdev_rx_handler_unregister(slave_dev);
+done:
+ return NOTIFY_DONE;
+}
+
+/**
+ * failover_slave_unregister - Unregister a slave netdev
+ *
+ * @slave_dev: slave netdev that is being unregistered
+ *
+ * Unregisters a slave device from a failover instance.
+ */
+int failover_slave_unregister(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (fops && fops->slave_pre_unregister &&
+ fops->slave_pre_unregister(slave_dev, failover_dev))
+ goto done;
+
+ netdev_rx_handler_unregister(slave_dev);
+ netdev_upper_dev_unlink(slave_dev, failover_dev);
+ slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+
+ if (fops && fops->slave_unregister &&
+ !fops->slave_unregister(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_GPL(failover_slave_unregister);
+
+static int failover_slave_link_change(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (!netif_running(failover_dev))
+ goto done;
+
+ if (fops && fops->slave_link_change &&
+ !fops->slave_link_change(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+
+static int failover_slave_name_change(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (!netif_running(failover_dev))
+ goto done;
+
+ if (fops && fops->slave_name_change &&
+ !fops->slave_name_change(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+
+static int
+failover_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+ /* Skip parent events */
+ if (netif_is_failover(event_dev))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ return failover_slave_register(event_dev);
+ case NETDEV_UNREGISTER:
+ return failover_slave_unregister(event_dev);
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ case NETDEV_CHANGE:
+ return failover_slave_link_change(event_dev);
+ case NETDEV_CHANGENAME:
+ return failover_slave_name_change(event_dev);
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block failover_notifier = {
+ .notifier_call = failover_event,
+};
+
+static void
+failover_existing_slave_register(struct net_device *failover_dev)
+{
+ struct net *net = dev_net(failover_dev);
+ struct net_device *dev;
+
+ rtnl_lock();
+ for_each_netdev(net, dev) {
+ if (netif_is_failover(dev))
+ continue;
+ if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
+ failover_slave_register(dev);
+ }
+ rtnl_unlock();
+}
+
+/**
+ * failover_register - Register a failover instance
+ *
+ * @dev: failover netdev
+ * @ops: failover ops
+ *
+ * Allocate and register a failover instance for a failover netdev. ops
+ * provides handlers for slave device register/unregister/link change/
+ * name change events.
+ *
+ * Return: pointer to failover instance
+ */
+struct failover *failover_register(struct net_device *dev,
+ struct failover_ops *ops)
+{
+ struct failover *failover;
+
+ if (dev->type != ARPHRD_ETHER)
+ return ERR_PTR(-EINVAL);
+
+ failover = kzalloc(sizeof(*failover), GFP_KERNEL);
+ if (!failover)
+ return ERR_PTR(-ENOMEM);
+
+ rcu_assign_pointer(failover->ops, ops);
+ dev_hold(dev);
+ dev->priv_flags |= IFF_FAILOVER;
+ rcu_assign_pointer(failover->failover_dev, dev);
+
+ spin_lock(&failover_lock);
+ list_add_tail(&failover->list, &failover_list);
+ spin_unlock(&failover_lock);
+
+ netdev_info(dev, "failover master:%s registered\n", dev->name);
+
+ failover_existing_slave_register(dev);
+
+ return failover;
+}
+EXPORT_SYMBOL_GPL(failover_register);
+
+/**
+ * failover_unregister - Unregister a failover instance
+ *
+ * @failover: pointer to failover instance
+ *
+ * Unregisters and frees a failover instance.
+ */
+void failover_unregister(struct failover *failover)
+{
+ struct net_device *failover_dev;
+
+ failover_dev = rcu_dereference(failover->failover_dev);
+
+ netdev_info(failover_dev, "failover master:%s unregistered\n",
+ failover_dev->name);
+
+ failover_dev->priv_flags &= ~IFF_FAILOVER;
+ dev_put(failover_dev);
+
+ spin_lock(&failover_lock);
+ list_del(&failover->list);
+ spin_unlock(&failover_lock);
+
+ kfree(failover);
+}
+EXPORT_SYMBOL_GPL(failover_unregister);
+
+static __init int
+failover_init(void)
+{
+ register_netdevice_notifier(&failover_notifier);
+
+ return 0;
+}
+module_init(failover_init);
+
+static __exit
+void failover_exit(void)
+{
+ unregister_netdevice_notifier(&failover_notifier);
+}
+module_exit(failover_exit);
+
+MODULE_DESCRIPTION("Generic failover infrastructure/interface");
+MODULE_LICENSE("GPL v2");
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 33958f84c173..126ffc5bc630 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -387,247 +387,304 @@ unsigned int fib_rules_seq_read(struct net *net, int family)
}
EXPORT_SYMBOL_GPL(fib_rules_seq_read);
-static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
- struct fib_rules_ops *ops)
-{
- int err = -EINVAL;
-
- if (frh->src_len)
- if (tb[FRA_SRC] == NULL ||
- frh->src_len > (ops->addr_size * 8) ||
- nla_len(tb[FRA_SRC]) != ops->addr_size)
- goto errout;
-
- if (frh->dst_len)
- if (tb[FRA_DST] == NULL ||
- frh->dst_len > (ops->addr_size * 8) ||
- nla_len(tb[FRA_DST]) != ops->addr_size)
- goto errout;
-
- err = 0;
-errout:
- return err;
-}
-
-static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
- struct nlattr **tb, struct fib_rule *rule)
+static struct fib_rule *rule_find(struct fib_rules_ops *ops,
+ struct fib_rule_hdr *frh,
+ struct nlattr **tb,
+ struct fib_rule *rule,
+ bool user_priority)
{
struct fib_rule *r;
list_for_each_entry(r, &ops->rules_list, list) {
- if (r->action != rule->action)
+ if (rule->action && r->action != rule->action)
continue;
- if (r->table != rule->table)
+ if (rule->table && r->table != rule->table)
continue;
- if (r->pref != rule->pref)
+ if (user_priority && r->pref != rule->pref)
continue;
- if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+ if (rule->iifname[0] &&
+ memcmp(r->iifname, rule->iifname, IFNAMSIZ))
continue;
- if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+ if (rule->oifname[0] &&
+ memcmp(r->oifname, rule->oifname, IFNAMSIZ))
continue;
- if (r->mark != rule->mark)
+ if (rule->mark && r->mark != rule->mark)
continue;
- if (r->mark_mask != rule->mark_mask)
+ if (rule->mark_mask && r->mark_mask != rule->mark_mask)
continue;
- if (r->tun_id != rule->tun_id)
+ if (rule->tun_id && r->tun_id != rule->tun_id)
continue;
if (r->fr_net != rule->fr_net)
continue;
- if (r->l3mdev != rule->l3mdev)
+ if (rule->l3mdev && r->l3mdev != rule->l3mdev)
continue;
- if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
- !uid_eq(r->uid_range.end, rule->uid_range.end))
+ if (uid_range_set(&rule->uid_range) &&
+ (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end)))
continue;
- if (r->ip_proto != rule->ip_proto)
+ if (rule->ip_proto && r->ip_proto != rule->ip_proto)
continue;
- if (!fib_rule_port_range_compare(&r->sport_range,
+ if (fib_rule_port_range_set(&rule->sport_range) &&
+ !fib_rule_port_range_compare(&r->sport_range,
&rule->sport_range))
continue;
- if (!fib_rule_port_range_compare(&r->dport_range,
+ if (fib_rule_port_range_set(&rule->dport_range) &&
+ !fib_rule_port_range_compare(&r->dport_range,
&rule->dport_range))
continue;
if (!ops->compare(r, frh, tb))
continue;
- return 1;
+ return r;
+ }
+
+ return NULL;
+}
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+ struct netlink_ext_ack *extack)
+{
+ nlrule->l3mdev = nla_get_u8(nla);
+ if (nlrule->l3mdev != 1) {
+ NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute");
+ return -1;
}
+
return 0;
}
+#else
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+ struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel");
+ return -1;
+}
+#endif
-int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct fib_rules_ops *ops,
+ struct nlattr *tb[],
+ struct fib_rule **rule,
+ bool *user_priority)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
- struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule, *r, *last = NULL;
- struct nlattr *tb[FRA_MAX+1];
- int err = -EINVAL, unresolved = 0;
-
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
- goto errout;
+ struct fib_rule *nlrule = NULL;
+ int err = -EINVAL;
- ops = lookup_rules_ops(net, frh->family);
- if (ops == NULL) {
- err = -EAFNOSUPPORT;
- goto errout;
+ if (frh->src_len)
+ if (!tb[FRA_SRC] ||
+ frh->src_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_SRC]) != ops->addr_size) {
+ NL_SET_ERR_MSG(extack, "Invalid source address");
+ goto errout;
}
- err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
- if (err < 0)
- goto errout;
-
- err = validate_rulemsg(frh, tb, ops);
- if (err < 0)
- goto errout;
+ if (frh->dst_len)
+ if (!tb[FRA_DST] ||
+ frh->dst_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_DST]) != ops->addr_size) {
+ NL_SET_ERR_MSG(extack, "Invalid dst address");
+ goto errout;
+ }
- rule = kzalloc(ops->rule_size, GFP_KERNEL);
- if (rule == NULL) {
+ nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
+ if (!nlrule) {
err = -ENOMEM;
goto errout;
}
- refcount_set(&rule->refcnt, 1);
- rule->fr_net = net;
+ refcount_set(&nlrule->refcnt, 1);
+ nlrule->fr_net = net;
- rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
- : fib_default_rule_pref(ops);
+ if (tb[FRA_PRIORITY]) {
+ nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+ *user_priority = true;
+ } else {
+ nlrule->pref = fib_default_rule_pref(ops);
+ }
- rule->proto = tb[FRA_PROTOCOL] ?
+ nlrule->proto = tb[FRA_PROTOCOL] ?
nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
if (tb[FRA_IIFNAME]) {
struct net_device *dev;
- rule->iifindex = -1;
- nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, rule->iifname);
+ nlrule->iifindex = -1;
+ nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, nlrule->iifname);
if (dev)
- rule->iifindex = dev->ifindex;
+ nlrule->iifindex = dev->ifindex;
}
if (tb[FRA_OIFNAME]) {
struct net_device *dev;
- rule->oifindex = -1;
- nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, rule->oifname);
+ nlrule->oifindex = -1;
+ nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, nlrule->oifname);
if (dev)
- rule->oifindex = dev->ifindex;
+ nlrule->oifindex = dev->ifindex;
}
if (tb[FRA_FWMARK]) {
- rule->mark = nla_get_u32(tb[FRA_FWMARK]);
- if (rule->mark)
+ nlrule->mark = nla_get_u32(tb[FRA_FWMARK]);
+ if (nlrule->mark)
/* compatibility: if the mark value is non-zero all bits
* are compared unless a mask is explicitly specified.
*/
- rule->mark_mask = 0xFFFFFFFF;
+ nlrule->mark_mask = 0xFFFFFFFF;
}
if (tb[FRA_FWMASK])
- rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+ nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
if (tb[FRA_TUN_ID])
- rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+ nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
err = -EINVAL;
- if (tb[FRA_L3MDEV]) {
-#ifdef CONFIG_NET_L3_MASTER_DEV
- rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
- if (rule->l3mdev != 1)
-#endif
- goto errout_free;
- }
+ if (tb[FRA_L3MDEV] &&
+ fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0)
+ goto errout_free;
- rule->action = frh->action;
- rule->flags = frh->flags;
- rule->table = frh_get_table(frh, tb);
+ nlrule->action = frh->action;
+ nlrule->flags = frh->flags;
+ nlrule->table = frh_get_table(frh, tb);
if (tb[FRA_SUPPRESS_PREFIXLEN])
- rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+ nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
else
- rule->suppress_prefixlen = -1;
+ nlrule->suppress_prefixlen = -1;
if (tb[FRA_SUPPRESS_IFGROUP])
- rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+ nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
else
- rule->suppress_ifgroup = -1;
+ nlrule->suppress_ifgroup = -1;
if (tb[FRA_GOTO]) {
- if (rule->action != FR_ACT_GOTO)
+ if (nlrule->action != FR_ACT_GOTO) {
+ NL_SET_ERR_MSG(extack, "Unexpected goto");
goto errout_free;
+ }
- rule->target = nla_get_u32(tb[FRA_GOTO]);
+ nlrule->target = nla_get_u32(tb[FRA_GOTO]);
/* Backward jumps are prohibited to avoid endless loops */
- if (rule->target <= rule->pref)
+ if (nlrule->target <= nlrule->pref) {
+ NL_SET_ERR_MSG(extack, "Backward goto not supported");
goto errout_free;
-
- list_for_each_entry(r, &ops->rules_list, list) {
- if (r->pref == rule->target) {
- RCU_INIT_POINTER(rule->ctarget, r);
- break;
- }
}
-
- if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
- unresolved = 1;
- } else if (rule->action == FR_ACT_GOTO)
+ } else if (nlrule->action == FR_ACT_GOTO) {
+ NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
goto errout_free;
+ }
- if (rule->l3mdev && rule->table)
+ if (nlrule->l3mdev && nlrule->table) {
+ NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive");
goto errout_free;
+ }
if (tb[FRA_UID_RANGE]) {
if (current_user_ns() != net->user_ns) {
err = -EPERM;
+ NL_SET_ERR_MSG(extack, "No permission to set uid");
goto errout_free;
}
- rule->uid_range = nla_get_kuid_range(tb);
+ nlrule->uid_range = nla_get_kuid_range(tb);
- if (!uid_range_set(&rule->uid_range) ||
- !uid_lte(rule->uid_range.start, rule->uid_range.end))
+ if (!uid_range_set(&nlrule->uid_range) ||
+ !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) {
+ NL_SET_ERR_MSG(extack, "Invalid uid range");
goto errout_free;
+ }
} else {
- rule->uid_range = fib_kuid_range_unset;
+ nlrule->uid_range = fib_kuid_range_unset;
}
if (tb[FRA_IP_PROTO])
- rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
+ nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
if (tb[FRA_SPORT_RANGE]) {
err = nla_get_port_range(tb[FRA_SPORT_RANGE],
- &rule->sport_range);
- if (err)
+ &nlrule->sport_range);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Invalid sport range");
goto errout_free;
+ }
}
if (tb[FRA_DPORT_RANGE]) {
err = nla_get_port_range(tb[FRA_DPORT_RANGE],
- &rule->dport_range);
- if (err)
+ &nlrule->dport_range);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Invalid dport range");
goto errout_free;
+ }
}
+ *rule = nlrule;
+
+ return 0;
+
+errout_free:
+ kfree(nlrule);
+errout:
+ return err;
+}
+
+int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rules_ops *ops = NULL;
+ struct fib_rule *rule = NULL, *r, *last = NULL;
+ struct nlattr *tb[FRA_MAX + 1];
+ int err = -EINVAL, unresolved = 0;
+ bool user_priority = false;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid msg length");
+ goto errout;
+ }
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (!ops) {
+ err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Rule family not supported");
+ goto errout;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Error parsing msg");
+ goto errout;
+ }
+
+ err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
+ if (err)
+ goto errout;
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
- rule_exists(ops, frh, tb, rule)) {
+ rule_find(ops, frh, tb, rule, user_priority)) {
err = -EEXIST;
goto errout_free;
}
- err = ops->configure(rule, skb, frh, tb);
+ err = ops->configure(rule, skb, frh, tb, extack);
if (err < 0)
goto errout_free;
@@ -637,6 +694,16 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout_free;
list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref == rule->target) {
+ RCU_INIT_POINTER(rule->ctarget, r);
+ break;
+ }
+ }
+
+ if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
+ unresolved = 1;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
if (r->pref > rule->pref)
break;
last = r;
@@ -690,171 +757,97 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
- struct fib_rule_port_range sprange = {0, 0};
- struct fib_rule_port_range dprange = {0, 0};
struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule, *r;
+ struct fib_rule *rule = NULL, *r, *nlrule = NULL;
struct nlattr *tb[FRA_MAX+1];
- struct fib_kuid_range range;
int err = -EINVAL;
+ bool user_priority = false;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
+ }
ops = lookup_rules_ops(net, frh->family);
if (ops == NULL) {
err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Rule family not supported");
goto errout;
}
err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Error parsing msg");
goto errout;
+ }
- err = validate_rulemsg(frh, tb, ops);
- if (err < 0)
+ err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
+ if (err)
goto errout;
- if (tb[FRA_UID_RANGE]) {
- range = nla_get_kuid_range(tb);
- if (!uid_range_set(&range)) {
- err = -EINVAL;
- goto errout;
- }
- } else {
- range = fib_kuid_range_unset;
+ rule = rule_find(ops, frh, tb, nlrule, user_priority);
+ if (!rule) {
+ err = -ENOENT;
+ goto errout;
}
- if (tb[FRA_SPORT_RANGE]) {
- err = nla_get_port_range(tb[FRA_SPORT_RANGE],
- &sprange);
- if (err)
- goto errout;
+ if (rule->flags & FIB_RULE_PERMANENT) {
+ err = -EPERM;
+ goto errout;
}
- if (tb[FRA_DPORT_RANGE]) {
- err = nla_get_port_range(tb[FRA_DPORT_RANGE],
- &dprange);
+ if (ops->delete) {
+ err = ops->delete(rule);
if (err)
goto errout;
}
- list_for_each_entry(rule, &ops->rules_list, list) {
- if (tb[FRA_PROTOCOL] &&
- (rule->proto != nla_get_u8(tb[FRA_PROTOCOL])))
- continue;
-
- if (frh->action && (frh->action != rule->action))
- continue;
-
- if (frh_get_table(frh, tb) &&
- (frh_get_table(frh, tb) != rule->table))
- continue;
-
- if (tb[FRA_PRIORITY] &&
- (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
- continue;
-
- if (tb[FRA_IIFNAME] &&
- nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
- continue;
-
- if (tb[FRA_OIFNAME] &&
- nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
- continue;
-
- if (tb[FRA_FWMARK] &&
- (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
- continue;
-
- if (tb[FRA_FWMASK] &&
- (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
- continue;
-
- if (tb[FRA_TUN_ID] &&
- (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
- continue;
-
- if (tb[FRA_L3MDEV] &&
- (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
- continue;
-
- if (uid_range_set(&range) &&
- (!uid_eq(rule->uid_range.start, range.start) ||
- !uid_eq(rule->uid_range.end, range.end)))
- continue;
-
- if (tb[FRA_IP_PROTO] &&
- (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO])))
- continue;
-
- if (fib_rule_port_range_set(&sprange) &&
- !fib_rule_port_range_compare(&rule->sport_range, &sprange))
- continue;
-
- if (fib_rule_port_range_set(&dprange) &&
- !fib_rule_port_range_compare(&rule->dport_range, &dprange))
- continue;
-
- if (!ops->compare(rule, frh, tb))
- continue;
-
- if (rule->flags & FIB_RULE_PERMANENT) {
- err = -EPERM;
- goto errout;
- }
-
- if (ops->delete) {
- err = ops->delete(rule);
- if (err)
- goto errout;
- }
+ if (rule->tun_id)
+ ip_tunnel_unneed_metadata();
- if (rule->tun_id)
- ip_tunnel_unneed_metadata();
+ list_del_rcu(&rule->list);
- list_del_rcu(&rule->list);
-
- if (rule->action == FR_ACT_GOTO) {
- ops->nr_goto_rules--;
- if (rtnl_dereference(rule->ctarget) == NULL)
- ops->unresolved_rules--;
- }
+ if (rule->action == FR_ACT_GOTO) {
+ ops->nr_goto_rules--;
+ if (rtnl_dereference(rule->ctarget) == NULL)
+ ops->unresolved_rules--;
+ }
- /*
- * Check if this rule is a target to any of them. If so,
- * adjust to the next one with the same preference or
- * disable them. As this operation is eventually very
- * expensive, it is only performed if goto rules, except
- * current if it is goto rule, have actually been added.
- */
- if (ops->nr_goto_rules > 0) {
- struct fib_rule *n;
-
- n = list_next_entry(rule, list);
- if (&n->list == &ops->rules_list || n->pref != rule->pref)
- n = NULL;
- list_for_each_entry(r, &ops->rules_list, list) {
- if (rtnl_dereference(r->ctarget) != rule)
- continue;
- rcu_assign_pointer(r->ctarget, n);
- if (!n)
- ops->unresolved_rules++;
- }
+ /*
+ * Check if this rule is a target to any of them. If so,
+ * adjust to the next one with the same preference or
+ * disable them. As this operation is eventually very
+ * expensive, it is only performed if goto rules, except
+ * current if it is goto rule, have actually been added.
+ */
+ if (ops->nr_goto_rules > 0) {
+ struct fib_rule *n;
+
+ n = list_next_entry(rule, list);
+ if (&n->list == &ops->rules_list || n->pref != rule->pref)
+ n = NULL;
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (rtnl_dereference(r->ctarget) != rule)
+ continue;
+ rcu_assign_pointer(r->ctarget, n);
+ if (!n)
+ ops->unresolved_rules++;
}
-
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
- NULL);
- notify_rule_change(RTM_DELRULE, rule, ops, nlh,
- NETLINK_CB(skb).portid);
- fib_rule_put(rule);
- flush_route_cache(ops);
- rules_ops_put(ops);
- return 0;
}
- err = -ENOENT;
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
+ NULL);
+ notify_rule_change(RTM_DELRULE, rule, ops, nlh,
+ NETLINK_CB(skb).portid);
+ fib_rule_put(rule);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ kfree(nlrule);
+ return 0;
+
errout:
+ if (nlrule)
+ kfree(nlrule);
rules_ops_put(ops);
return err;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 201ff36b17a8..3d9ba7e5965a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -57,7 +57,17 @@
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
+#include <net/xfrm.h>
#include <linux/bpf_trace.h>
+#include <net/xdp_sock.h>
+#include <linux/inetdevice.h>
+#include <net/ip_fib.h>
+#include <net/flow.h>
+#include <net/arp.h>
+#include <net/ipv6.h>
+#include <linux/seg6_local.h>
+#include <net/seg6.h>
+#include <net/seg6_local.h>
/**
* sk_filter_trim_cap - run a packet through a socket filter
@@ -111,12 +121,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
}
EXPORT_SYMBOL(sk_filter_trim_cap);
-BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
+BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
{
return skb_get_poff(skb);
}
-BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
+BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
struct nlattr *nla;
@@ -136,7 +146,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
return 0;
}
-BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
+BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
struct nlattr *nla;
@@ -160,13 +170,94 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
return 0;
}
-BPF_CALL_0(__get_raw_cpu_id)
+/*
+ * Byte load used by converted classic BPF LD_ABS/LD_IND instructions.
+ *
+ * @data and @headlen describe the linear part of @skb as cached by the
+ * caller. A non-negative @offset is read straight from that linear area
+ * when the byte lies inside it and through skb_copy_bits() otherwise; a
+ * negative @offset is resolved by bpf_internal_load_pointer_neg_helper().
+ *
+ * Returns the loaded byte, or -EFAULT when the offset cannot be resolved.
+ */
+BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
+	   data, int, headlen, int, offset)
+{
+	const int width = sizeof(u8);
+	u8 val, *neg;
+
+	if (offset < 0) {
+		neg = bpf_internal_load_pointer_neg_helper(skb, offset, width);
+		if (unlikely(!neg))
+			return -EFAULT;
+		return *neg;
+	}
+
+	/* Entirely inside the linear area: read it in place. */
+	if (headlen - offset >= width)
+		return *(u8 *)(data + offset);
+
+	if (skb_copy_bits(skb, offset, &val, width))
+		return -EFAULT;
+
+	return val;
+}
+
+/*
+ * Variant of bpf_skb_load_helper_8() for callers that have not cached
+ * skb->data and the linear length: both are derived from @skb here.
+ */
+BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
+	   int, offset)
+{
+	const int headlen = skb->len - skb->data_len;
+
+	return ____bpf_skb_load_helper_8(skb, skb->data, headlen, offset);
+}
+
+/*
+ * Half-word load used by converted classic BPF LD_ABS/LD_IND instructions.
+ *
+ * @data and @headlen describe the linear part of @skb as cached by the
+ * caller. A non-negative @offset is read straight from that linear area
+ * when both bytes lie inside it and through skb_copy_bits() otherwise; a
+ * negative @offset is resolved by bpf_internal_load_pointer_neg_helper().
+ *
+ * Returns the value converted from big endian to host order, or -EFAULT
+ * when the offset cannot be resolved.
+ */
+BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
+	   data, int, headlen, int, offset)
+{
+	const int width = sizeof(u16);
+	u16 val, *neg;
+
+	if (offset < 0) {
+		neg = bpf_internal_load_pointer_neg_helper(skb, offset, width);
+		if (unlikely(!neg))
+			return -EFAULT;
+		return get_unaligned_be16(neg);
+	}
+
+	/* Entirely inside the linear area: read it in place. */
+	if (headlen - offset >= width)
+		return get_unaligned_be16(data + offset);
+
+	if (skb_copy_bits(skb, offset, &val, width))
+		return -EFAULT;
+
+	return be16_to_cpu(val);
+}
+
+/*
+ * Variant of bpf_skb_load_helper_16() for callers that have not cached
+ * skb->data and the linear length: both are derived from @skb here.
+ */
+BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
+	   int, offset)
+{
+	const int headlen = skb->len - skb->data_len;
+
+	return ____bpf_skb_load_helper_16(skb, skb->data, headlen, offset);
+}
+
+/*
+ * Word load used by converted classic BPF LD_ABS/LD_IND instructions.
+ *
+ * @data and @headlen describe the linear part of @skb as cached by the
+ * caller. A non-negative @offset is read straight from that linear area
+ * when all four bytes lie inside it and through skb_copy_bits() otherwise;
+ * a negative @offset is resolved by bpf_internal_load_pointer_neg_helper().
+ *
+ * Returns the value converted from big endian to host order, or -EFAULT
+ * when the offset cannot be resolved.
+ */
+BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
+	   data, int, headlen, int, offset)
+{
+	const int width = sizeof(u32);
+	u32 val, *neg;
+
+	if (unlikely(offset < 0)) {
+		neg = bpf_internal_load_pointer_neg_helper(skb, offset, width);
+		if (unlikely(!neg))
+			return -EFAULT;
+		return get_unaligned_be32(neg);
+	}
+
+	/* Entirely inside the linear area: read it in place. */
+	if (headlen - offset >= width)
+		return get_unaligned_be32(data + offset);
+
+	if (skb_copy_bits(skb, offset, &val, width))
+		return -EFAULT;
+
+	return be32_to_cpu(val);
+}
+
+/*
+ * Variant of bpf_skb_load_helper_32() for callers that have not cached
+ * skb->data and the linear length: both are derived from @skb here.
+ */
+BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
+	   int, offset)
+{
+	const int headlen = skb->len - skb->data_len;
+
+	return ____bpf_skb_load_helper_32(skb, skb->data, headlen, offset);
+}
+
+BPF_CALL_0(bpf_get_raw_cpu_id)
{
return raw_smp_processor_id();
}
static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
- .func = __get_raw_cpu_id,
+ .func = bpf_get_raw_cpu_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
};
@@ -316,16 +407,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
/* Emit call(arg1=CTX, arg2=A, arg3=X) */
switch (fp->k) {
case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
- *insn = BPF_EMIT_CALL(__skb_get_pay_offset);
+ *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
break;
case SKF_AD_OFF + SKF_AD_NLATTR:
- *insn = BPF_EMIT_CALL(__skb_get_nlattr);
+ *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
break;
case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
- *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
+ *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
break;
case SKF_AD_OFF + SKF_AD_CPU:
- *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
+ *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
break;
case SKF_AD_OFF + SKF_AD_RANDOM:
*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
@@ -352,26 +443,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
return true;
}
+/*
+ * Expand one classic BPF packet load (LD_ABS, or LD_IND when the mode is
+ * BPF_IND) into eBPF instructions written at *insnp.
+ *
+ * The emitted sequence relies on BPF_REG_D and BPF_REG_H holding the
+ * cached packet data pointer and linear length, and has two parts:
+ *  - an optional inline fast path for non-indirect loads that reads
+ *    directly from BPF_REG_D + offset after a bounds check against
+ *    BPF_REG_H, then jumps over the slow path;
+ *  - a slow path that calls bpf_skb_load_helper_{8,16,32}() and, when the
+ *    helper's result is negative, clears A and exits the program.
+ *
+ * On success *insnp is left pointing at the last emitted instruction
+ * (not one past it) and true is returned. For an unsupported load size
+ * false is returned and *insnp is not updated.
+ */
+static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
+{
+ const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
+ int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
+ bool endian = BPF_SIZE(fp->code) == BPF_H ||
+ BPF_SIZE(fp->code) == BPF_W;
+ bool indirect = BPF_MODE(fp->code) == BPF_IND;
+ const int ip_align = NET_IP_ALIGN;
+ struct bpf_insn *insn = *insnp;
+ int offset = fp->k;
+
+ /* NOTE(review): '%' binds tighter than '+', so the last condition is
+ * evaluated as offset + (ip_align % size) == 0. If the intent was an
+ * alignment test of (offset + ip_align), the fast path is taken far
+ * less often than intended when unaligned access is not efficient.
+ * Confirm before changing: loosening it enables direct loads.
+ */
+ if (!indirect &&
+ ((unaligned_ok && offset >= 0) ||
+ (!unaligned_ok && offset >= 0 &&
+ offset + ip_align >= 0 &&
+ offset + ip_align % size == 0))) {
+ /* tmp = headlen - offset; fall to the slow path if < size */
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
+ *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
+ /* skips the load, the optional endian swap and the JMP_A below */
+ *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian);
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D,
+ offset);
+ if (endian)
+ *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
+ /* jump over the 8 slow-path insns emitted for the direct case */
+ *insn++ = BPF_JMP_A(8);
+ }
+
+ /* Slow path: helper(ctx, data, headlen, offset) */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
+ if (!indirect) {
+ *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
+ } else {
+ /* indirect load: effective offset is X + k */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
+ if (fp->k)
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
+ }
+
+ switch (BPF_SIZE(fp->code)) {
+ case BPF_B:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
+ break;
+ case BPF_H:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
+ break;
+ case BPF_W:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
+ break;
+ default:
+ return false;
+ }
+
+ /* A >= 0: continue after the exit below; otherwise A = 0 and exit */
+ *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
+ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+ *insn = BPF_EXIT_INSN();
+
+ *insnp = insn;
+ return true;
+}
+
/**
* bpf_convert_filter - convert filter program
* @prog: the user passed filter program
* @len: the length of the user passed filter program
* @new_prog: allocated 'struct bpf_prog' or NULL
* @new_len: pointer to store length of converted program
+ * @seen_ld_abs: bool whether we've seen ld_abs/ind
*
* Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
* style extended BPF (eBPF).
* Conversion workflow:
*
* 1) First pass for calculating the new program length:
- * bpf_convert_filter(old_prog, old_len, NULL, &new_len)
+ * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
*
* 2) 2nd pass to remap in two passes: 1st pass finds new
* jump offsets, 2nd pass remapping:
- * bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
+ * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
*/
static int bpf_convert_filter(struct sock_filter *prog, int len,
- struct bpf_prog *new_prog, int *new_len)
+ struct bpf_prog *new_prog, int *new_len,
+ bool *seen_ld_abs)
{
int new_flen = 0, pass = 0, target, i, stack_off;
struct bpf_insn *new_insn, *first_insn = NULL;
@@ -410,12 +562,27 @@ do_pass:
* do this ourself. Initial CTX is present in BPF_REG_ARG1.
*/
*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+ if (*seen_ld_abs) {
+ /* For packet access in classic BPF, cache skb->data
+ * in callee-saved BPF R8 and skb->len - skb->data_len
+ * (headlen) in BPF R9. Since classic BPF is read-only
+ * on CTX, we only need to cache it once.
+ */
+ *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ BPF_REG_D, BPF_REG_CTX,
+ offsetof(struct sk_buff, data));
+ *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
+ offsetof(struct sk_buff, len));
+ *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
+ offsetof(struct sk_buff, data_len));
+ *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
+ }
} else {
new_insn += 3;
}
for (i = 0; i < len; fp++, i++) {
- struct bpf_insn tmp_insns[6] = { };
+ struct bpf_insn tmp_insns[32] = { };
struct bpf_insn *insn = tmp_insns;
if (addrs)
@@ -458,6 +625,11 @@ do_pass:
BPF_MODE(fp->code) == BPF_ABS &&
convert_bpf_extensions(fp, &insn))
break;
+ if (BPF_CLASS(fp->code) == BPF_LD &&
+ convert_bpf_ld_abs(fp, &insn)) {
+ *seen_ld_abs = true;
+ break;
+ }
if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
@@ -567,21 +739,31 @@ jmp_rest:
break;
/* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
- case BPF_LDX | BPF_MSH | BPF_B:
- /* tmp = A */
- *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
+ case BPF_LDX | BPF_MSH | BPF_B: {
+ struct sock_filter tmp = {
+ .code = BPF_LD | BPF_ABS | BPF_B,
+ .k = fp->k,
+ };
+
+ *seen_ld_abs = true;
+
+ /* X = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
/* A = BPF_R0 = *(u8 *) (skb->data + K) */
- *insn++ = BPF_LD_ABS(BPF_B, fp->k);
+ convert_bpf_ld_abs(&tmp, &insn);
+ insn++;
/* A &= 0xf */
*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
/* A <<= 2 */
*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
+ /* tmp = X */
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
/* X = A */
*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
/* A = tmp */
*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
break;
-
+ }
/* RET_K is remaped into 2 insns. RET_A case doesn't need an
* extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
*/
@@ -663,6 +845,8 @@ jmp_rest:
if (!new_prog) {
/* Only calculating new length. */
*new_len = new_insn - first_insn;
+ if (*seen_ld_abs)
+ *new_len += 4; /* Prologue bits. */
return 0;
}
@@ -1024,6 +1208,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
struct sock_filter *old_prog;
struct bpf_prog *old_fp;
int err, new_len, old_len = fp->len;
+ bool seen_ld_abs = false;
/* We are free to overwrite insns et al right here as it
* won't be used at this point in time anymore internally
@@ -1045,7 +1230,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
}
/* 1st pass: calculate the new program length. */
- err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
+ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
+ &seen_ld_abs);
if (err)
goto out_err_free;
@@ -1064,7 +1250,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
fp->len = new_len;
/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
- err = bpf_convert_filter(old_prog, old_len, fp, &new_len);
+ err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
+ &seen_ld_abs);
if (err)
/* 2nd bpf_convert_filter() can fail only if it fails
* to allocate memory, remapping must succeed. Note,
@@ -1512,6 +1699,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.arg4_type = ARG_CONST_SIZE,
};
+/*
+ * Copy @len bytes into @to, starting @offset bytes past the header chosen
+ * by @start_header (BPF_HDR_START_MAC or BPF_HDR_START_NET).
+ *
+ * The source range must lie between the MAC header and the skb tail
+ * pointer. On any failure @to is zero-filled for @len bytes and -EFAULT
+ * is returned; on success 0 is returned.
+ */
+BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
+	   u32, offset, void *, to, u32, len, u32, start_header)
+{
+	u8 *base, *src;
+
+	if (unlikely(offset > 0xffff || len > skb_headlen(skb)))
+		goto err_clear;
+
+	if (start_header == BPF_HDR_START_MAC)
+		base = skb_mac_header(skb);
+	else if (start_header == BPF_HDR_START_NET)
+		base = skb_network_header(skb);
+	else
+		goto err_clear;
+
+	src = base + offset;
+	if (unlikely(src < skb_mac_header(skb) ||
+		     src + len > skb_tail_pointer(skb)))
+		goto err_clear;
+
+	memcpy(to, src, len);
+	return 0;
+
+err_clear:
+	memset(to, 0, len);
+	return -EFAULT;
+}
+
+/* Helper prototype for bpf_skb_load_bytes_relative(): declares the kind of
+ * each of its five arguments (ctx, offset, destination buffer, buffer
+ * size, start header selector) and an integer return.
+ */
+static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
+ .func = bpf_skb_load_bytes_relative,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
{
/* Idea is the following: should the needed direct read/write
@@ -1857,6 +2085,33 @@ static const struct bpf_func_proto bpf_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
+/*
+ * Select a redirect target for @skb by looking @key up in @map via
+ * __sock_hash_lookup_elem() and recording the result, together with
+ * @flags, in the skb's TCP control block.
+ *
+ * Returns SK_DROP when @flags has bits other than BPF_F_INGRESS set or
+ * when the lookup finds nothing; SK_PASS otherwise.
+ */
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	struct tcp_skb_cb *cb = TCP_SKB_CB(skb);
+
+	/* If user passes invalid input drop the packet. */
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+
+	cb->bpf.flags = flags;
+	cb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+
+	return cb->bpf.sk_redir ? SK_PASS : SK_DROP;
+}
+
+/* Helper prototype for bpf_sk_redirect_hash(): ctx, map pointer, pointer
+ * to a map key, and a flags scalar; returns an integer.
+ */
+static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+ .func = bpf_sk_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct bpf_map *, map, u32, key, u64, flags)
{
@@ -1866,9 +2121,10 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- tcb->bpf.key = key;
tcb->bpf.flags = flags;
- tcb->bpf.map = map;
+ tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
+ if (!tcb->bpf.sk_redir)
+ return SK_DROP;
return SK_PASS;
}
@@ -1876,16 +2132,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct sock *do_sk_redirect_map(struct sk_buff *skb)
{
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- struct sock *sk = NULL;
-
- if (tcb->bpf.map) {
- sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key);
- tcb->bpf.key = 0;
- tcb->bpf.map = NULL;
- }
-
- return sk;
+ return tcb->bpf.sk_redir;
}
static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
@@ -1898,32 +2146,49 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
.arg4_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
- struct bpf_map *, map, u32, key, u64, flags)
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
+ struct bpf_map *, map, void *, key, u64, flags)
{
/* If user passes invalid input drop the packet. */
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- msg->key = key;
msg->flags = flags;
- msg->map = map;
+ msg->sk_redir = __sock_hash_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
return SK_PASS;
}
-struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
+/* Helper prototype for bpf_msg_redirect_hash(): ctx, map pointer, pointer
+ * to a map key, and a flags scalar; returns an integer.
+ */
+static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+ .func = bpf_msg_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
+ struct bpf_map *, map, u32, key, u64, flags)
{
- struct sock *sk = NULL;
+ /* If user passes invalid input drop the packet. */
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
- if (msg->map) {
- sk = __sock_map_lookup_elem(msg->map, msg->key);
+ msg->flags = flags;
+ msg->sk_redir = __sock_map_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
- msg->key = 0;
- msg->map = NULL;
- }
+ return SK_PASS;
+}
- return sk;
+struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
+{
+ return msg->sk_redir;
}
static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
@@ -2186,7 +2451,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
return ret;
}
-const struct bpf_func_proto bpf_skb_vlan_push_proto = {
+static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
.func = bpf_skb_vlan_push,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -2194,7 +2459,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = {
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
};
-EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
@@ -2208,13 +2472,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
return ret;
}
-const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
+static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
.func = bpf_skb_vlan_pop,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
};
-EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
@@ -2699,8 +2962,9 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
+ void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
unsigned long metalen = xdp_get_metalen(xdp);
- void *data_start = xdp->data_hard_start + metalen;
+ void *data_start = xdp_frame_end + metalen;
void *data = xdp->data + offset;
if (unlikely(data < data_start ||
@@ -2724,14 +2988,39 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.arg2_type = ARG_ANYTHING,
};
+/*
+ * Move xdp->data_end by @offset bytes.
+ *
+ * Only shrinking is accepted: @offset must be negative, and the resulting
+ * frame must keep at least ETH_HLEN bytes between xdp->data and the new
+ * end. Returns 0 on success and -EINVAL otherwise, in which case the
+ * buffer is left untouched.
+ */
+BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
+{
+	void *new_end;
+
+	/* only shrinking is allowed for now. */
+	if (unlikely(offset >= 0))
+		return -EINVAL;
+
+	new_end = xdp->data_end + offset;
+	if (unlikely(new_end < xdp->data + ETH_HLEN))
+		return -EINVAL;
+
+	xdp->data_end = new_end;
+	return 0;
+}
+
+/* Helper prototype for bpf_xdp_adjust_tail(): ctx plus one scalar (the
+ * tail delta); returns an integer.
+ */
+static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
+ .func = bpf_xdp_adjust_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
+ void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
void *meta = xdp->data_meta + offset;
unsigned long metalen = xdp->data - meta;
if (xdp_data_meta_unsupported(xdp))
return -ENOTSUPP;
- if (unlikely(meta < xdp->data_hard_start ||
+ if (unlikely(meta < xdp_frame_end ||
meta > xdp->data))
return -EINVAL;
if (unlikely((metalen & (sizeof(__u32) - 1)) ||
@@ -2756,16 +3045,20 @@ static int __bpf_tx_xdp(struct net_device *dev,
struct xdp_buff *xdp,
u32 index)
{
- int err;
+ struct xdp_frame *xdpf;
+ int sent;
if (!dev->netdev_ops->ndo_xdp_xmit) {
return -EOPNOTSUPP;
}
- err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
- if (err)
- return err;
- dev->netdev_ops->ndo_xdp_flush(dev);
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
+ return -EOVERFLOW;
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
+ if (sent <= 0)
+ return sent;
return 0;
}
@@ -2776,24 +3069,33 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
{
int err;
- if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
- struct net_device *dev = fwd;
-
- if (!dev->netdev_ops->ndo_xdp_xmit)
- return -EOPNOTSUPP;
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_DEVMAP: {
+ struct bpf_dtab_netdev *dst = fwd;
- err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+ err = dev_map_enqueue(dst, xdp, dev_rx);
if (err)
return err;
__dev_map_insert_ctx(map, index);
-
- } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+ break;
+ }
+ case BPF_MAP_TYPE_CPUMAP: {
struct bpf_cpu_map_entry *rcpu = fwd;
err = cpu_map_enqueue(rcpu, xdp, dev_rx);
if (err)
return err;
__cpu_map_insert_ctx(map, index);
+ break;
+ }
+ case BPF_MAP_TYPE_XSKMAP: {
+ struct xdp_sock *xs = fwd;
+
+ err = __xsk_map_redirect(map, xdp, xs);
+ return err;
+ }
+ default:
+ break;
}
return 0;
}
@@ -2812,6 +3114,9 @@ void xdp_do_flush_map(void)
case BPF_MAP_TYPE_CPUMAP:
__cpu_map_flush(map);
break;
+ case BPF_MAP_TYPE_XSKMAP:
+ __xsk_map_flush(map);
+ break;
default:
break;
}
@@ -2826,6 +3131,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
return __dev_map_lookup_elem(map, index);
case BPF_MAP_TYPE_CPUMAP:
return __cpu_map_lookup_elem(map, index);
+ case BPF_MAP_TYPE_XSKMAP:
+ return __xsk_map_lookup_elem(map, index);
default:
return NULL;
}
@@ -2923,13 +3230,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
+ struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
unsigned long map_owner = ri->map_owner;
struct bpf_map *map = ri->map;
- struct net_device *fwd = NULL;
u32 index = ri->ifindex;
+ void *fwd = NULL;
int err = 0;
ri->ifindex = 0;
@@ -2951,6 +3259,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
goto err;
skb->dev = fwd;
+ generic_xdp_tx(skb, xdp_prog);
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
+ struct xdp_sock *xs = fwd;
+
+ err = xsk_generic_rcv(xs, xdp);
+ if (err)
+ goto err;
+ consume_skb(skb);
} else {
/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
err = -EBADRQC;
@@ -2965,7 +3281,7 @@ err:
}
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
- struct bpf_prog *xdp_prog)
+ struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
u32 index = ri->ifindex;
@@ -2973,7 +3289,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
int err = 0;
if (ri->map)
- return xdp_do_generic_redirect_map(dev, skb, xdp_prog);
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog);
ri->ifindex = 0;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
@@ -2987,6 +3303,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
skb->dev = fwd;
_trace_xdp_redirect(dev, xdp_prog, index);
+ generic_xdp_tx(skb, xdp_prog);
return 0;
err:
_trace_xdp_redirect_err(dev, xdp_prog, index, err);
@@ -3045,27 +3362,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
.arg3_type = ARG_ANYTHING,
};
-bool bpf_helper_changes_pkt_data(void *func)
-{
- if (func == bpf_skb_vlan_push ||
- func == bpf_skb_vlan_pop ||
- func == bpf_skb_store_bytes ||
- func == bpf_skb_change_proto ||
- func == bpf_skb_change_head ||
- func == bpf_skb_change_tail ||
- func == bpf_skb_adjust_room ||
- func == bpf_skb_pull_data ||
- func == bpf_clone_redirect ||
- func == bpf_l3_csum_replace ||
- func == bpf_l4_csum_replace ||
- func == bpf_xdp_adjust_head ||
- func == bpf_xdp_adjust_meta ||
- func == bpf_msg_pull_data)
- return true;
-
- return false;
-}
-
static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
unsigned long off, unsigned long len)
{
@@ -3148,6 +3444,7 @@ set_compat:
to->tunnel_id = be64_to_cpu(info->key.tun_id);
to->tunnel_tos = info->key.tos;
to->tunnel_ttl = info->key.ttl;
+ to->tunnel_ext = 0;
if (flags & BPF_F_TUNINFO_IPV6) {
memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
@@ -3155,6 +3452,8 @@ set_compat:
to->tunnel_label = be32_to_cpu(info->key.label);
} else {
to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+ memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+ to->tunnel_label = 0;
}
if (unlikely(size != sizeof(struct bpf_tunnel_key)))
@@ -3364,6 +3663,27 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
.arg3_type = ARG_ANYTHING,
};
+#ifdef CONFIG_SOCK_CGROUP_DATA
+/*
+ * Return the id of the cgroup attached to the full socket that owns @skb,
+ * or 0 when the skb has no socket or the socket is not a full socket.
+ */
+BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
+{
+	struct sock *sk = skb_to_full_sk(skb);
+
+	if (sk && sk_fullsock(sk))
+		return sock_cgroup_ptr(&sk->sk_cgrp_data)->kn->id.id;
+
+	return 0;
+}
+
+/* Helper prototype for bpf_skb_cgroup_id(): takes only the program
+ * context and returns an integer.
+ */
+static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
+ .func = bpf_skb_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+#endif
+
static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
unsigned long off, unsigned long len)
{
@@ -3711,6 +4031,603 @@ static const struct bpf_func_proto bpf_bind_proto = {
.arg3_type = ARG_CONST_SIZE,
};
+#ifdef CONFIG_XFRM
+/*
+ * Copy a summary of the @index-th xfrm state on @skb's security path into
+ * @to: reqid, spi, family and the state's source address.
+ *
+ * Fails with -EINVAL (after zeroing @size bytes of @to) when the skb has
+ * no security path, @index is out of range, @flags is non-zero, or @size
+ * is not sizeof(struct bpf_xfrm_state). Returns 0 on success.
+ */
+BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
+	   struct bpf_xfrm_state *, to, u32, size, u64, flags)
+{
+	const struct sec_path *sp = skb_sec_path(skb);
+	const struct xfrm_state *x;
+
+	if (!sp || unlikely(index >= sp->len || flags))
+		goto err_clear;
+	if (unlikely(size != sizeof(struct bpf_xfrm_state)))
+		goto err_clear;
+
+	x = sp->xvec[index];
+
+	to->reqid = x->props.reqid;
+	to->spi = x->id.spi;
+	to->family = x->props.family;
+	to->ext = 0;
+
+	if (to->family != AF_INET6) {
+		to->remote_ipv4 = x->props.saddr.a4;
+		/* Clear the remaining three words of the address area;
+		 * presumably remote_ipv4 overlays remote_ipv6[0].
+		 */
+		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+	} else {
+		memcpy(to->remote_ipv6, x->props.saddr.a6,
+		       sizeof(to->remote_ipv6));
+	}
+
+	return 0;
+
+err_clear:
+	memset(to, 0, size);
+	return -EINVAL;
+}
+
+/* Helper prototype for bpf_skb_get_xfrm_state(): ctx, index, output
+ * buffer, buffer size and a flags scalar; returns an integer.
+ */
+static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
+ .func = bpf_skb_get_xfrm_state,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+#endif
+
+#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
+/*
+ * Fill the link-layer part of a successful FIB lookup result: the
+ * destination MAC comes from @neigh, the source MAC from @dev, and the
+ * VLAN fields are cleared. Returns the ifindex of @dev.
+ */
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
+				  const struct neighbour *neigh,
+				  const struct net_device *dev)
+{
+	params->h_vlan_proto = 0;
+	params->h_vlan_TCI = 0;
+	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+	memcpy(params->dmac, neigh->ha, ETH_ALEN);
+
+	return dev->ifindex;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_INET)
+/*
+ * IPv4 route lookup shared by the xdp and skb fib_lookup helpers.
+ *
+ * Builds a flowi4 from @params, looks it up (in a single table when
+ * BPF_FIB_LOOKUP_DIRECT is set, through fib_lookup() otherwise) and, when
+ * a unicast route with an existing neighbour entry is found, fills the
+ * forwarding parameters via bpf_fib_set_fwd_params().
+ *
+ * Returns -ENODEV when params->ifindex names no device, the egress
+ * ifindex on success, and 0 in every other case: forwarding disabled on
+ * the device, no table, lookup failure or non-unicast result, packet
+ * larger than the route MTU (only when @check_mtu), lwt encap on the
+ * nexthop, no nexthop device, or no neighbour entry.
+ */
+static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+ u32 flags, bool check_mtu)
+{
+ struct in_device *in_dev;
+ struct neighbour *neigh;
+ struct net_device *dev;
+ struct fib_result res;
+ struct fib_nh *nh;
+ /* NOTE(review): fl4 has no initializer; only the fields assigned
+ * below are set, and mark/secid/tun_id/uid are set on the
+ * non-DIRECT path only. Confirm no lookup path reads other fields.
+ */
+ struct flowi4 fl4;
+ int err;
+ u32 mtu;
+
+ dev = dev_get_by_index_rcu(net, params->ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ /* verify forwarding is enabled on this interface */
+ in_dev = __in_dev_get_rcu(dev);
+ if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+ return 0;
+
+ /* params->ifindex is the egress device for OUTPUT lookups and the
+ * ingress device otherwise. NOTE(review): iif 1 is presumably the
+ * loopback ifindex - confirm.
+ */
+ if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+ fl4.flowi4_iif = 1;
+ fl4.flowi4_oif = params->ifindex;
+ } else {
+ fl4.flowi4_iif = params->ifindex;
+ fl4.flowi4_oif = 0;
+ }
+ fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = 0;
+
+ fl4.flowi4_proto = params->l4_protocol;
+ fl4.daddr = params->ipv4_dst;
+ fl4.saddr = params->ipv4_src;
+ fl4.fl4_sport = params->sport;
+ fl4.fl4_dport = params->dport;
+
+ if (flags & BPF_FIB_LOOKUP_DIRECT) {
+ /* use the device's l3mdev table if it has one, else main */
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+ struct fib_table *tb;
+
+ tb = fib_get_table(net, tbid);
+ if (unlikely(!tb))
+ return 0;
+
+ err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+ } else {
+ fl4.flowi4_mark = 0;
+ fl4.flowi4_secid = 0;
+ fl4.flowi4_tun_key.tun_id = 0;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
+
+ err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
+ }
+
+ if (err || res.type != RTN_UNICAST)
+ return 0;
+
+ /* more than one nexthop: let fib_select_path() pick res.nh_sel */
+ if (res.fi->fib_nhs > 1)
+ fib_select_path(net, &res, &fl4, NULL);
+
+ if (check_mtu) {
+ mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
+ if (params->tot_len > mtu)
+ return 0;
+ }
+
+ nh = &res.fi->fib_nh[res.nh_sel];
+
+ /* do not handle lwt encaps right now */
+ if (nh->nh_lwtstate)
+ return 0;
+
+ dev = nh->nh_dev;
+ if (unlikely(!dev))
+ return 0;
+
+ /* via a gateway: the neighbour to resolve is the gateway itself,
+ * and the caller sees it in params->ipv4_dst
+ */
+ if (nh->nh_gw)
+ params->ipv4_dst = nh->nh_gw;
+
+ params->rt_metric = res.fi->fib_priority;
+
+ /* xdp and cls_bpf programs are run in RCU-bh so
+ * rcu_read_lock_bh is not needed here
+ */
+ neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+ if (neigh)
+ return bpf_fib_set_fwd_params(params, neigh, dev);
+
+ return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * IPv6 route lookup shared by the xdp and skb fib_lookup helpers.
+ *
+ * Builds a flowi6 from @params, looks it up through the ipv6_stub
+ * callbacks (in a single table when BPF_FIB_LOOKUP_DIRECT is set) and,
+ * when a unicast route with an existing neighbour entry is found, fills
+ * the forwarding parameters via bpf_fib_set_fwd_params().
+ *
+ * Returns -ENODEV when params->ifindex names no device, the egress
+ * ifindex on success, and 0 in every other case: source or destination
+ * needing a strict (link-scoped) lookup, forwarding disabled, no table,
+ * no usable or non-unicast route, packet larger than the route MTU (only
+ * when @check_mtu), lwt encap on the nexthop, or no neighbour entry.
+ */
+static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+ u32 flags, bool check_mtu)
+{
+ struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
+ struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
+ struct neighbour *neigh;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ struct fib6_info *f6i;
+ /* NOTE(review): fl6 has no initializer; only the fields assigned
+ * below are set, and mark/secid/tun_id/uid are set on the
+ * non-DIRECT path only. Confirm no lookup path reads other fields.
+ */
+ struct flowi6 fl6;
+ int strict = 0;
+ int oif;
+ u32 mtu;
+
+ /* link local addresses are never forwarded */
+ if (rt6_need_strict(dst) || rt6_need_strict(src))
+ return 0;
+
+ dev = dev_get_by_index_rcu(net, params->ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ /* NOTE(review): this tests the namespace-wide devconf_all forwarding
+ * setting, not idev's own; the IPv4 path checks the device. Confirm
+ * whether a per-device check was intended.
+ */
+ idev = __in6_dev_get_safely(dev);
+ if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
+ return 0;
+
+ /* params->ifindex is the egress device for OUTPUT lookups and the
+ * ingress device otherwise; either way it is passed on as 'oif'.
+ */
+ if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+ fl6.flowi6_iif = 1;
+ oif = fl6.flowi6_oif = params->ifindex;
+ } else {
+ oif = fl6.flowi6_iif = params->ifindex;
+ fl6.flowi6_oif = 0;
+ strict = RT6_LOOKUP_F_HAS_SADDR;
+ }
+ fl6.flowlabel = params->flowinfo;
+ fl6.flowi6_scope = 0;
+ fl6.flowi6_flags = 0;
+ fl6.mp_hash = 0;
+
+ fl6.flowi6_proto = params->l4_protocol;
+ fl6.daddr = *dst;
+ fl6.saddr = *src;
+ fl6.fl6_sport = params->sport;
+ fl6.fl6_dport = params->dport;
+
+ if (flags & BPF_FIB_LOOKUP_DIRECT) {
+ /* use the device's l3mdev table if it has one, else main */
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+ struct fib6_table *tb;
+
+ tb = ipv6_stub->fib6_get_table(net, tbid);
+ if (unlikely(!tb))
+ return 0;
+
+ f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+ } else {
+ fl6.flowi6_mark = 0;
+ fl6.flowi6_secid = 0;
+ fl6.flowi6_tun_key.tun_id = 0;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
+
+ f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+ }
+
+ if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+ return 0;
+
+ if (unlikely(f6i->fib6_flags & RTF_REJECT ||
+ f6i->fib6_type != RTN_UNICAST))
+ return 0;
+
+ /* multipath route and no egress device pinned: select a sibling */
+ if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
+ f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
+ fl6.flowi6_oif, NULL,
+ strict);
+
+ if (check_mtu) {
+ mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
+ if (params->tot_len > mtu)
+ return 0;
+ }
+
+ /* lwt encap on the nexthop is not handled */
+ if (f6i->fib6_nh.nh_lwtstate)
+ return 0;
+
+ /* via a gateway: dst aliases params->ipv6_dst, so the caller sees
+ * the gateway address and the neighbour lookup below resolves it
+ */
+ if (f6i->fib6_flags & RTF_GATEWAY)
+ *dst = f6i->fib6_nh.nh_gw;
+
+ dev = f6i->fib6_nh.nh_dev;
+ params->rt_metric = f6i->fib6_metric;
+
+ /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
+ * not needed here. Can not use __ipv6_neigh_lookup_noref here
+ * because we need to get nd_tbl via the stub
+ */
+ neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+ ndisc_hashfn, dst, dev);
+ if (neigh)
+ return bpf_fib_set_fwd_params(params, neigh, dev);
+
+ return 0;
+}
+#endif
+
+/*
+ * FIB lookup helper for XDP programs.
+ *
+ * Validates @plen and @flags, then dispatches on params->family to the
+ * IPv4 or IPv6 lookup in the network namespace of the receiving device,
+ * with the MTU check enabled.
+ *
+ * Returns -EINVAL for a too-small @plen or unknown flag bits,
+ * -EAFNOSUPPORT for a family that is not handled (or not built in), and
+ * otherwise whatever the per-family lookup returns.
+ */
+BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params) ||
+	    (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags, true);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags, true);
+#endif
+	default:
+		break;
+	}
+
+	return -EAFNOSUPPORT;
+}
+
+/* Helper prototype for bpf_xdp_fib_lookup(): ctx, pointer to the lookup
+ * parameters, their size, and a flags scalar; returns an integer. Marked
+ * gpl_only.
+ */
+static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
+ .func = bpf_xdp_fib_lookup,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+/*
+ * FIB lookup helper for skb-based programs.
+ *
+ * Validates @plen and @flags, then dispatches on params->family to the
+ * IPv4 or IPv6 lookup in the network namespace of skb->dev. The
+ * per-family MTU check is disabled; instead, when a positive egress
+ * ifindex comes back, the skb itself is checked against that device with
+ * is_skb_forwardable().
+ *
+ * Returns -EINVAL for a too-small @plen or unknown flag bits,
+ * -EAFNOSUPPORT for a family that is not handled (or not built in), 0
+ * when the packet cannot be forwarded, and the egress ifindex otherwise.
+ */
+BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	struct net *net = dev_net(skb->dev);
+	int index = -EAFNOSUPPORT;
+
+	if (plen < sizeof(*params))
+		return -EINVAL;
+
+	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		index = bpf_ipv4_fib_lookup(net, params, flags, false);
+		break;
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		index = bpf_ipv6_fib_lookup(net, params, flags, false);
+		break;
+#endif
+	}
+
+	if (index > 0) {
+		struct net_device *dev;
+
+		/* The ifindex is translated back to a device here, and that
+		 * lookup returns NULL when no device matches (see the
+		 * -ENODEV handling in the per-family lookups). Treat a
+		 * missing device as "cannot forward" instead of handing
+		 * NULL to is_skb_forwardable().
+		 */
+		dev = dev_get_by_index_rcu(net, index);
+		if (!dev || !is_skb_forwardable(dev, skb))
+			index = 0;
+	}
+
+	return index;
+}
+
+/* Helper prototype for bpf_skb_fib_lookup(): ctx, pointer to the lookup
+ * parameters, their size, and a flags scalar; returns an integer. Marked
+ * gpl_only.
+ */
+static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
+ .func = bpf_skb_fib_lookup,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+/*
+ * Push the segment routing header in @hdr (@len bytes) onto @skb, either
+ * inline (BPF_LWT_ENCAP_SEG6_INLINE, IPv6 packets only) or as an outer
+ * encapsulation (BPF_LWT_ENCAP_SEG6), then redo the nexthop lookup.
+ *
+ * Returns -EINVAL when the SRH fails seg6_validate_srh() or @type is
+ * unknown, -EBADMSG for inline mode on a non-IPv6 skb, the error from the
+ * seg6_do_srh_*() call if it fails, and otherwise the result of
+ * seg6_lookup_nexthop().
+ */
+static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+{
+ int err;
+ struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
+
+ if (!seg6_validate_srh(srh, len))
+ return -EINVAL;
+
+ switch (type) {
+ case BPF_LWT_ENCAP_SEG6_INLINE:
+ if (skb->protocol != htons(ETH_P_IPV6))
+ return -EBADMSG;
+
+ err = seg6_do_srh_inline(skb, srh);
+ break;
+ case BPF_LWT_ENCAP_SEG6:
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Recomputed before the error check, i.e. on failure as well;
+ * presumably because the skb may already have been modified.
+ */
+ bpf_compute_data_pointers(skb);
+ if (err)
+ return err;
+
+ /* fix up the IPv6 payload length for the grown packet */
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ return seg6_lookup_nexthop(skb, NULL, 0);
+}
+#endif /* CONFIG_IPV6_SEG6_BPF */
+
+/* Push an encapsulation header of the given @type onto @skb.
+ *
+ * Only the two seg6 types are handled, and only when
+ * CONFIG_IPV6_SEG6_BPF is enabled; they are forwarded to
+ * bpf_push_seg6_encap().  Everything else returns -EINVAL.
+ */
+BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+	   u32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	if (type == BPF_LWT_ENCAP_SEG6 || type == BPF_LWT_ENCAP_SEG6_INLINE)
+		return bpf_push_seg6_encap(skb, type, hdr, len);
+#endif
+	return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_lwt_push_encap_proto = { /* descriptor for bpf_lwt_push_encap; handed out by lwt_in_func_proto() */
+ .func = bpf_lwt_push_encap,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_ANYTHING, /* type (BPF_LWT_ENCAP_*) */
+ .arg3_type = ARG_PTR_TO_MEM, /* hdr */
+ .arg4_type = ARG_CONST_SIZE /* len of hdr */
+};
+
+/* Copy @len bytes from @from into the packet at @offset, restricted to
+ * parts of the segment routing header.
+ *
+ * Two destination windows are accepted:
+ *  - a range lying inside [tlv_start, srh_stop]; this also clears the
+ *    per-cpu "valid" flag so the SRH gets re-validated later, and
+ *  - otherwise a range lying between the SRH "flags" field and the start
+ *    of the "segments" array.
+ * Anything else returns -EFAULT, as does a failure to make the skb
+ * writable.  -EINVAL is returned when no routing header is found and
+ * -EOPNOTSUPP when CONFIG_IPV6_SEG6_BPF is disabled.
+ */
+BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
+	   const void *, from, u32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	struct seg6_bpf_srh_state *state = this_cpu_ptr(&seg6_bpf_srh_states);
+	unsigned char *tlv_start, *srh_stop, *wr_start, *wr_stop;
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+	tlv_start = (unsigned char *)srh + ((srh->first_segment + 1) << 4);
+	srh_stop = (unsigned char *)srh + sizeof(*srh) + state->hdrlen;
+
+	wr_start = skb->data + offset;
+	wr_stop = wr_start + len;
+
+	if (wr_start >= tlv_start && wr_stop <= srh_stop) {
+		/* TLV area touched: header must be validated again. */
+		state->valid = 0;
+	} else if (wr_start < (unsigned char *)&srh->flags ||
+		   wr_stop > (unsigned char *)&srh->segments) {
+		return -EFAULT;
+	}
+
+	if (unlikely(bpf_try_make_writable(skb, offset + len)))
+		return -EFAULT;
+
+	/* Re-derive the destination from skb->data after the call above. */
+	memcpy(skb->data + offset, from, len);
+	return 0;
+#else /* CONFIG_IPV6_SEG6_BPF */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { /* descriptor for bpf_lwt_seg6_store_bytes; handed out by lwt_seg6local_func_proto() */
+ .func = bpf_lwt_seg6_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER, /* 0, -EINVAL, -EFAULT or -EOPNOTSUPP */
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_ANYTHING, /* offset, added to skb->data by the helper */
+ .arg3_type = ARG_PTR_TO_MEM, /* from */
+ .arg4_type = ARG_CONST_SIZE /* len of from */
+};
+
+/* Apply a seg6local style @action to @skb on behalf of a BPF program.
+ *
+ * If the per-cpu SRH state is marked invalid, the SRH length field is
+ * first rewritten from the tracked byte length (which must be a multiple
+ * of 8) and the header is re-validated; failure gives -EBADMSG.
+ *
+ * Supported actions and their @param:
+ *  - SEG6_LOCAL_ACTION_END_X:  a struct in6_addr passed to the nexthop lookup
+ *  - SEG6_LOCAL_ACTION_END_T:  an int passed to the nexthop lookup
+ *  - SEG6_LOCAL_ACTION_END_B6 / END_B6_ENCAP: an SRH pushed inline /
+ *    as an outer encapsulation via bpf_push_seg6_encap(); on success the
+ *    tracked header length is taken from the pushed SRH.
+ * A wrong @param_len or unknown @action returns -EINVAL, a missing
+ * routing header returns -EINVAL, and a kernel without
+ * CONFIG_IPV6_SEG6_BPF returns -EOPNOTSUPP.
+ */
+BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
+	   u32, action, void *, param, u32, param_len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	struct seg6_bpf_srh_state *state = this_cpu_ptr(&seg6_bpf_srh_states);
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+	int rc;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+	if (!state->valid) {
+		if (unlikely(state->hdrlen & 7))
+			return -EBADMSG;
+
+		srh->hdrlen = (u8)(state->hdrlen >> 3);
+		if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
+			return -EBADMSG;
+
+		state->valid = 1;
+	}
+
+	switch (action) {
+	case SEG6_LOCAL_ACTION_END_X:
+		if (param_len != sizeof(struct in6_addr))
+			return -EINVAL;
+		return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
+	case SEG6_LOCAL_ACTION_END_T:
+		if (param_len != sizeof(int))
+			return -EINVAL;
+		return seg6_lookup_nexthop(skb, NULL, *(int *)param);
+	case SEG6_LOCAL_ACTION_END_B6:
+	case SEG6_LOCAL_ACTION_END_B6_ENCAP:
+		rc = bpf_push_seg6_encap(skb,
+					 action == SEG6_LOCAL_ACTION_END_B6 ?
+					 BPF_LWT_ENCAP_SEG6_INLINE :
+					 BPF_LWT_ENCAP_SEG6,
+					 param, param_len);
+		if (rc)
+			return rc;
+
+		/* Track the byte length of the SRH that is now in front. */
+		state->hdrlen = ((struct ipv6_sr_hdr *)param)->hdrlen << 3;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+#else /* CONFIG_IPV6_SEG6_BPF */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { /* descriptor for bpf_lwt_seg6_action; handed out by lwt_seg6local_func_proto() */
+ .func = bpf_lwt_seg6_action,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_ANYTHING, /* action (SEG6_LOCAL_ACTION_*) */
+ .arg3_type = ARG_PTR_TO_MEM, /* param */
+ .arg4_type = ARG_CONST_SIZE /* param_len */
+};
+
+/* Grow (@len > 0) or shrink (@len <= 0) the segment routing header of
+ * @skb by |@len| bytes at @offset.
+ *
+ * @offset must point between the computed TLV start and the tracked end
+ * of the SRH, and a shrink must not reach past that end; otherwise
+ * -EFAULT is returned.  On success the IPv6 payload length is refreshed,
+ * the tracked SRH length is adjusted by @len and the per-cpu state is
+ * marked invalid so the header is re-validated later.
+ *
+ * Returns 0 on success, -EINVAL when no routing header is found, the
+ * error of skb_cow_head()/the push or pop helper on failure, and
+ * -EOPNOTSUPP without CONFIG_IPV6_SEG6_BPF.
+ */
+BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
+	   s32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	struct seg6_bpf_srh_state *state = this_cpu_ptr(&seg6_bpf_srh_states);
+	unsigned char *srh_base, *tlv_start, *srh_stop, *at;
+	struct ipv6_sr_hdr *srh;
+	struct ipv6hdr *ip6;
+	int srhoff = 0;
+	int rc;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+	srh_base = (unsigned char *)srh;
+	tlv_start = srh_base + sizeof(*srh) + ((srh->first_segment + 1) << 4);
+	srh_stop = srh_base + sizeof(*srh) + state->hdrlen;
+	at = skb->data + offset;
+
+	if (unlikely(at < tlv_start || at > srh_stop))
+		return -EFAULT;
+	/* For a shrink, at - len is the end of the removed range. */
+	if (unlikely(len < 0 && at - len > srh_stop))
+		return -EFAULT;
+
+	if (len > 0) {
+		rc = skb_cow_head(skb, len);
+		if (unlikely(rc < 0))
+			return rc;
+
+		rc = bpf_skb_net_hdr_push(skb, offset, len);
+	} else {
+		rc = bpf_skb_net_hdr_pop(skb, offset, -len);
+	}
+
+	/* Data pointers are recomputed before the push/pop result is checked. */
+	bpf_compute_data_pointers(skb);
+	if (unlikely(rc < 0))
+		return rc;
+
+	ip6 = (struct ipv6hdr *)skb->data;
+	ip6->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+	state->hdrlen += len;
+	state->valid = 0;
+	return 0;
+#else /* CONFIG_IPV6_SEG6_BPF */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { /* descriptor for bpf_lwt_seg6_adjust_srh; handed out by lwt_seg6local_func_proto() */
+ .func = bpf_lwt_seg6_adjust_srh,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_ANYTHING, /* offset, added to skb->data by the helper */
+ .arg3_type = ARG_ANYTHING, /* len: signed, > 0 grows, otherwise shrinks */
+};
+
+/* Tell whether @func is one of the helpers listed below, i.e. (going by
+ * this function's name) a helper that may change packet data.
+ *
+ * @func is compared by address against each listed helper; returns true
+ * on a match and false otherwise.
+ */
+bool bpf_helper_changes_pkt_data(void *func)
+{
+	return func == bpf_skb_vlan_push ||
+	       func == bpf_skb_vlan_pop ||
+	       func == bpf_skb_store_bytes ||
+	       func == bpf_skb_change_proto ||
+	       func == bpf_skb_change_head ||
+	       func == bpf_skb_change_tail ||
+	       func == bpf_skb_adjust_room ||
+	       func == bpf_skb_pull_data ||
+	       func == bpf_clone_redirect ||
+	       func == bpf_l3_csum_replace ||
+	       func == bpf_l4_csum_replace ||
+	       func == bpf_xdp_adjust_head ||
+	       func == bpf_xdp_adjust_meta ||
+	       func == bpf_msg_pull_data ||
+	       func == bpf_xdp_adjust_tail ||
+	       func == bpf_lwt_push_encap ||
+	       func == bpf_lwt_seg6_store_bytes ||
+	       func == bpf_lwt_seg6_adjust_srh ||
+	       func == bpf_lwt_seg6_action;
+}
+
static const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
{
@@ -3781,6 +4698,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
switch (func_id) {
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &bpf_skb_load_bytes_relative_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:
@@ -3798,6 +4717,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &bpf_skb_load_bytes_relative_proto;
case BPF_FUNC_skb_pull_data:
return &bpf_skb_pull_data_proto;
case BPF_FUNC_csum_diff:
@@ -3852,6 +4773,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:
return &bpf_get_socket_uid_proto;
+ case BPF_FUNC_fib_lookup:
+ return &bpf_skb_fib_lookup_proto;
+#ifdef CONFIG_XFRM
+ case BPF_FUNC_skb_get_xfrm_state:
+ return &bpf_skb_get_xfrm_state_proto;
+#endif
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case BPF_FUNC_skb_cgroup_id:
+ return &bpf_skb_cgroup_id_proto;
+#endif
default:
return bpf_base_func_proto(func_id);
}
@@ -3875,33 +4806,10 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_redirect_proto;
case BPF_FUNC_redirect_map:
return &bpf_xdp_redirect_map_proto;
- default:
- return bpf_base_func_proto(func_id);
- }
-}
-
-static const struct bpf_func_proto *
-lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- switch (func_id) {
- case BPF_FUNC_skb_load_bytes:
- return &bpf_skb_load_bytes_proto;
- case BPF_FUNC_skb_pull_data:
- return &bpf_skb_pull_data_proto;
- case BPF_FUNC_csum_diff:
- return &bpf_csum_diff_proto;
- case BPF_FUNC_get_cgroup_classid:
- return &bpf_get_cgroup_classid_proto;
- case BPF_FUNC_get_route_realm:
- return &bpf_get_route_realm_proto;
- case BPF_FUNC_get_hash_recalc:
- return &bpf_get_hash_recalc_proto;
- case BPF_FUNC_perf_event_output:
- return &bpf_skb_event_output_proto;
- case BPF_FUNC_get_smp_processor_id:
- return &bpf_get_smp_processor_id_proto;
- case BPF_FUNC_skb_under_cgroup:
- return &bpf_skb_under_cgroup_proto;
+ case BPF_FUNC_xdp_adjust_tail:
+ return &bpf_xdp_adjust_tail_proto;
+ case BPF_FUNC_fib_lookup:
+ return &bpf_xdp_fib_lookup_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -3919,6 +4827,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_ops_cb_flags_set_proto;
case BPF_FUNC_sock_map_update:
return &bpf_sock_map_update_proto;
+ case BPF_FUNC_sock_hash_update:
+ return &bpf_sock_hash_update_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -3930,6 +4840,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
switch (func_id) {
case BPF_FUNC_msg_redirect_map:
return &bpf_msg_redirect_map_proto;
+ case BPF_FUNC_msg_redirect_hash:
+ return &bpf_msg_redirect_hash_proto;
case BPF_FUNC_msg_apply_bytes:
return &bpf_msg_apply_bytes_proto;
case BPF_FUNC_msg_cork_bytes:
@@ -3961,12 +4873,52 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_uid_proto;
case BPF_FUNC_sk_redirect_map:
return &bpf_sk_redirect_map_proto;
+ case BPF_FUNC_sk_redirect_hash:
+ return &bpf_sk_redirect_hash_proto;
default:
return bpf_base_func_proto(func_id);
}
}
static const struct bpf_func_proto *
+lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* maps a helper id to its descriptor; also the fallback of lwt_in/lwt_xmit/lwt_seg6local_func_proto() */
+{
+ switch (func_id) { /* prog is not consulted here */
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+ default:
+ return bpf_base_func_proto(func_id); /* every other id is resolved by the base table */
+ }
+}
+
+/* Helper lookup for the lwt_in verifier ops: bpf_lwt_push_encap is
+ * offered on top of everything lwt_out_func_proto() resolves.
+ */
+static const struct bpf_func_proto *
+lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	if (func_id == BPF_FUNC_lwt_push_encap)
+		return &bpf_lwt_push_encap_proto;
+
+	return lwt_out_func_proto(func_id, prog);
+}
+
+static const struct bpf_func_proto *
lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
@@ -3997,7 +4949,22 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_set_hash_invalid:
return &bpf_set_hash_invalid_proto;
default:
- return lwt_inout_func_proto(func_id, prog);
+ return lwt_out_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* helper lookup used by lwt_seg6local_verifier_ops */
+{
+ switch (func_id) { /* the three seg6 helpers are only reachable through this table */
+ case BPF_FUNC_lwt_seg6_store_bytes:
+ return &bpf_lwt_seg6_store_bytes_proto;
+ case BPF_FUNC_lwt_seg6_action:
+ return &bpf_lwt_seg6_action_proto;
+ case BPF_FUNC_lwt_seg6_adjust_srh:
+ return &bpf_lwt_seg6_adjust_srh_proto;
+ default:
+ return lwt_out_func_proto(func_id, prog); /* everything else: same set as lwt_out */
}
}
@@ -4105,7 +5072,6 @@ static bool lwt_is_valid_access(int off, int size,
return bpf_skb_is_valid_access(off, size, type, prog, info);
}
-
/* Attach type specific accesses */
static bool __sock_filter_check_attach_type(int off,
enum bpf_access_type access_type,
@@ -4221,6 +5187,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
return insn - insn_buf;
}
+/* Expand one LD_ABS/LD_IND instruction (@orig) into a helper call
+ * sequence written to @insn_buf.
+ *
+ * Emitted sequence:
+ *   R1 = ctx
+ *   R2 = imm                      (absolute form)
+ *   R2 = src_reg [+ imm]          (indirect form, add only if imm != 0)
+ *   call load helper matching the access size (byte/half/word)
+ *   if R0 >= 0 (signed) skip the next two instructions
+ *   R0 = 0; exit
+ *
+ * Returns the number of instructions written.
+ */
+static int bpf_gen_ld_abs(const struct bpf_insn *orig,
+			  struct bpf_insn *insn_buf)
+{
+	int cnt = 0;
+
+	/* We're guaranteed here that CTX is in R6. */
+	insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
+
+	if (BPF_MODE(orig->code) == BPF_IND) {
+		insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
+		if (orig->imm)
+			insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2,
+							orig->imm);
+	} else {
+		insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
+	}
+
+	/* No call is emitted for a size other than B/H/W. */
+	switch (BPF_SIZE(orig->code)) {
+	case BPF_B:
+		insn_buf[cnt++] = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
+		break;
+	case BPF_H:
+		insn_buf[cnt++] = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
+		break;
+	case BPF_W:
+		insn_buf[cnt++] = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
+		break;
+	}
+
+	/* Negative helper result: leave the program with R0 == 0. */
+	insn_buf[cnt++] = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
+	insn_buf[cnt++] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+	insn_buf[cnt++] = BPF_EXIT_INSN();
+
+	return cnt;
+}
+
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
const struct bpf_prog *prog)
{
@@ -4279,8 +5280,15 @@ static bool xdp_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- if (type == BPF_WRITE)
+ if (type == BPF_WRITE) {
+ if (bpf_prog_is_dev_bound(prog->aux)) {
+ switch (off) {
+ case offsetof(struct xdp_md, rx_queue_index):
+ return __is_valid_xdp_access(off, size);
+ }
+ }
return false;
+ }
switch (off) {
case offsetof(struct xdp_md, data):
@@ -4327,6 +5335,7 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_UDP4_SENDMSG:
break;
default:
return false;
@@ -4336,6 +5345,24 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_UDP4_SENDMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_UDP6_SENDMSG:
break;
default:
return false;
@@ -4346,6 +5373,9 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (off) {
case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+ case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
/* Only narrow read access allowed for now. */
if (type == BPF_READ) {
bpf_ctx_record_field_size(info, size_default);
@@ -4465,18 +5495,23 @@ static bool sk_msg_is_valid_access(int off, int size,
switch (off) {
case offsetof(struct sk_msg_md, data):
info->reg_type = PTR_TO_PACKET;
+ if (size != sizeof(__u64))
+ return false;
break;
case offsetof(struct sk_msg_md, data_end):
info->reg_type = PTR_TO_PACKET_END;
+ if (size != sizeof(__u64))
+ return false;
break;
+ default:
+ if (size != sizeof(__u32))
+ return false;
}
if (off < 0 || off >= sizeof(struct sk_msg_md))
return false;
if (off % size != 0)
return false;
- if (size != sizeof(__u64))
- return false;
return true;
}
@@ -5095,6 +6130,23 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
SK_FL_PROTO_SHIFT);
break;
+
+ case offsetof(struct bpf_sock_addr, msg_src_ip4):
+ /* Treat t_ctx as struct in_addr for msg_src_ip4. */
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct in_addr, t_ctx,
+ s_addr, BPF_SIZE(si->code), 0, tmp_reg);
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
+ off = si->off;
+ off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
+ /* Treat t_ctx as struct in6_addr for msg_src_ip6. */
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
+ s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
+ break;
}
return insn - insn_buf;
@@ -5152,7 +6204,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, local_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4);
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -5469,6 +6522,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
+#if IS_ENABLED(CONFIG_IPV6)
+ int off;
+#endif
switch (si->off) {
case offsetof(struct sk_msg_md, data):
@@ -5481,6 +6537,107 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
si->dst_reg, si->src_reg,
offsetof(struct sk_msg_buff, data_end));
break;
+ case offsetof(struct sk_msg_md, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_family));
+ break;
+
+ case offsetof(struct sk_msg_md, remote_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_daddr));
+ break;
+
+ case offsetof(struct sk_msg_md, local_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_rcv_saddr));
+ break;
+
+ case offsetof(struct sk_msg_md, remote_ip6[0]) ...
+ offsetof(struct sk_msg_md, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct sk_msg_md, remote_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, local_ip6[0]) ...
+ offsetof(struct sk_msg_md, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct sk_msg_md, local_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, remote_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_dport));
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, local_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_num));
+ break;
}
return insn - insn_buf;
@@ -5490,6 +6647,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access,
+ .gen_ld_abs = bpf_gen_ld_abs,
};
const struct bpf_prog_ops sk_filter_prog_ops = {
@@ -5501,6 +6659,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
.is_valid_access = tc_cls_act_is_valid_access,
.convert_ctx_access = tc_cls_act_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue,
+ .gen_ld_abs = bpf_gen_ld_abs,
};
const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -5527,13 +6686,23 @@ const struct bpf_prog_ops cg_skb_prog_ops = {
.test_run = bpf_prog_test_run_skb,
};
-const struct bpf_verifier_ops lwt_inout_verifier_ops = {
- .get_func_proto = lwt_inout_func_proto,
+const struct bpf_verifier_ops lwt_in_verifier_ops = {
+ .get_func_proto = lwt_in_func_proto,
.is_valid_access = lwt_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access,
};
-const struct bpf_prog_ops lwt_inout_prog_ops = {
+const struct bpf_prog_ops lwt_in_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_out_verifier_ops = { /* verifier callbacks paired with lwt_out_func_proto() */
+ .get_func_proto = lwt_out_func_proto, /* helper lookup without bpf_lwt_push_encap */
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_out_prog_ops = {
.test_run = bpf_prog_test_run_skb,
};
@@ -5548,6 +6717,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = {
.test_run = bpf_prog_test_run_skb,
};
+const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { /* verifier callbacks paired with lwt_seg6local_func_proto() */
+ .get_func_proto = lwt_seg6local_func_proto, /* adds the bpf_lwt_seg6_* helpers to the lwt_out set */
+ .is_valid_access = lwt_is_valid_access, /* same context access rules as the other lwt ops above */
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_seg6local_prog_ops = { /* program ops for the seg6local ops set */
+ .test_run = bpf_prog_test_run_skb, /* same test_run hook as the other lwt prog ops */
+};
+
const struct bpf_verifier_ops cg_sock_verifier_ops = {
.get_func_proto = sock_filter_func_proto,
.is_valid_access = sock_filter_is_valid_access,
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d29f09bc5ff9..53f96e4f7bf5 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1253,7 +1253,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
EXPORT_SYMBOL(skb_get_hash_perturb);
u32 __skb_get_poff(const struct sk_buff *skb, void *data,
- const struct flow_keys *keys, int hlen)
+ const struct flow_keys_basic *keys, int hlen)
{
u32 poff = keys->control.thoff;
@@ -1314,9 +1314,9 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
*/
u32 skb_get_poff(const struct sk_buff *skb)
{
- struct flow_keys keys;
+ struct flow_keys_basic keys;
- if (!skb_flow_dissect_flow_keys(skb, &keys, 0))
+ if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0))
return 0;
return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
@@ -1334,7 +1334,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
keys->ports.src = fl6->fl6_sport;
keys->ports.dst = fl6->fl6_dport;
keys->keyid.keyid = fl6->fl6_gre_key;
- keys->tags.flow_label = (__force u32)fl6->flowlabel;
+ keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
keys->basic.ip_proto = fl6->flowi6_proto;
return flow_hash_from_keys(keys);
@@ -1403,7 +1403,7 @@ static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
},
};
-static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
+static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {
{
.key_id = FLOW_DISSECTOR_KEY_CONTROL,
.offset = offsetof(struct flow_keys, control),
@@ -1417,7 +1417,8 @@ static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
struct flow_dissector flow_keys_dissector __read_mostly;
EXPORT_SYMBOL(flow_keys_dissector);
-struct flow_dissector flow_keys_buf_dissector __read_mostly;
+struct flow_dissector flow_keys_basic_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_basic_dissector);
static int __init init_default_flow_dissectors(void)
{
@@ -1427,9 +1428,9 @@ static int __init init_default_flow_dissectors(void)
skb_flow_dissector_init(&flow_keys_dissector_symmetric,
flow_keys_dissector_symmetric_keys,
ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
- skb_flow_dissector_init(&flow_keys_buf_dissector,
- flow_keys_buf_dissector_keys,
- ARRAY_SIZE(flow_keys_buf_dissector_keys));
+ skb_flow_dissector_init(&flow_keys_basic_dissector,
+ flow_keys_basic_dissector_keys,
+ ARRAY_SIZE(flow_keys_basic_dissector_keys));
return 0;
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 1fb43bff417d..a7a9c3d738ba 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -820,7 +820,8 @@ static void neigh_periodic_work(struct work_struct *work)
write_lock(&n->lock);
state = n->nud_state;
- if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
+ if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
+ (n->flags & NTF_EXT_LEARNED)) {
write_unlock(&n->lock);
goto next_elt;
}
@@ -1136,6 +1137,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
if (neigh->dead)
goto out;
+ neigh_update_ext_learned(neigh, flags, &notify);
+
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
if (old & NUD_CONNECTED)
@@ -1781,6 +1784,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
+ if (ndm->ndm_flags & NTF_EXT_LEARNED)
+ flags |= NEIGH_UPDATE_F_EXT_LEARNED;
+
if (ndm->ndm_flags & NTF_USE) {
neigh_event_send(neigh, NULL);
err = 0;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 380934580fa1..419af6dfe29f 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -35,10 +35,6 @@
#include <trace/events/tcp.h>
#include <trace/events/fib.h>
#include <trace/events/qdisc.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <trace/events/fib6.h>
-EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
-#endif
#if IS_ENABLED(CONFIG_BRIDGE)
#include <trace/events/bridge.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
new file mode 100644
index 000000000000..68bf07206744
--- /dev/null
+++ b/net/core/page_pool.c
@@ -0,0 +1,317 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * page_pool.c
+ * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
+ * Copyright (C) 2016 Red Hat, Inc.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include <net/page_pool.h>
+#include <linux/dma-direction.h>
+#include <linux/dma-mapping.h>
+#include <linux/page-flags.h>
+#include <linux/mm.h> /* for __put_page() */
+
+/* Copy @params into @pool and set up its recycle ring.
+ *
+ * Returns 0 on success, -EINVAL for unknown flags or an unsupported DMA
+ * direction, -E2BIG when the requested ring is larger than 32768
+ * entries, and -ENOMEM when the ring cannot be allocated.
+ */
+static int page_pool_init(struct page_pool *pool,
+			  const struct page_pool_params *params)
+{
+	unsigned int nr_slots;
+
+	memcpy(&pool->p, params, sizeof(pool->p));
+
+	/* Only flags this implementation knows about are accepted */
+	if (pool->p.flags & ~(PP_FLAG_ALL))
+		return -EINVAL;
+
+	/* A pool_size of 0 selects the default ring size of 1024 */
+	nr_slots = pool->p.pool_size ? pool->p.pool_size : 1024;
+
+	/* Sanity limit on the memory that can be pinned down */
+	if (nr_slots > 32768)
+		return -E2BIG;
+
+	/* DMA_FROM_DEVICE is the plain receive case; DMA_BIDIRECTIONAL
+	 * additionally allows the page to be used for DMA sending,
+	 * which is the XDP_TX use-case.
+	 */
+	switch (pool->p.dma_dir) {
+	case DMA_FROM_DEVICE:
+	case DMA_BIDIRECTIONAL:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (ptr_ring_init(&pool->ring, nr_slots, GFP_KERNEL) < 0)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Allocate a page_pool on node params->nid and initialise it from
+ * @params.
+ *
+ * Returns the new pool, or an ERR_PTR() carrying -ENOMEM or the error
+ * reported by page_pool_init().
+ */
+struct page_pool *page_pool_create(const struct page_pool_params *params)
+{
+	struct page_pool *pool;
+	int rc;
+
+	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+
+	rc = page_pool_init(pool, params);
+	if (rc >= 0)
+		return pool;
+
+	pr_warn("%s() gave up with errno %d\n", __func__, rc);
+	kfree(pool);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(page_pool_create);
+
+/* fast path
+ *
+ * Try to hand out a recycled page without going to the page allocator.
+ *
+ * When serving softirq, the pool's alloc.cache array is tried first; if
+ * it is empty it is refilled in bulk from the ptr_ring and one of the
+ * consumed pages is returned.  In any other context a single page is
+ * taken from the ring through the locked consumer.
+ *
+ * Returns a page, or NULL when neither the array nor the ring holds one.
+ */
+static struct page *__page_pool_get_cached(struct page_pool *pool)
+{
+	struct ptr_ring *r = &pool->ring;
+	struct page *page;
+
+	/* Test for safe-context, caller should provide this guarantee */
+	if (likely(in_serving_softirq())) {
+		/* Fast-path: look at the alloc array before the ring, so
+		 * pages put there by __page_pool_recycle_direct() are
+		 * reused even while the ring itself is empty.
+		 */
+		if (likely(pool->alloc.count))
+			return pool->alloc.cache[--pool->alloc.count];
+
+		/* Quicker fallback, avoid locks when ring is empty */
+		if (__ptr_ring_empty(r))
+			return NULL;
+
+		/* Slower-path: Alloc array empty, time to refill
+		 *
+		 * Open-coded bulk ptr_ring consumer.  The first page
+		 * consumed is the one handed to the caller; up to
+		 * PP_ALLOC_CACHE_REFILL further pages go into the array.
+		 * This way a ring holding fewer pages than the refill
+		 * target still yields a page instead of NULL.
+		 *
+		 * Discussion: the ring consumer lock is not really
+		 * needed due to the softirq/NAPI protection, but
+		 * later need the ability to reclaim pages on the
+		 * ring. Thus, keeping the locks.
+		 */
+		spin_lock(&r->consumer_lock);
+		page = __ptr_ring_consume(r);
+		while (page && pool->alloc.count < PP_ALLOC_CACHE_REFILL) {
+			struct page *spare = __ptr_ring_consume(r);
+
+			if (!spare)
+				break;
+			pool->alloc.cache[pool->alloc.count++] = spare;
+		}
+		spin_unlock(&r->consumer_lock);
+		return page;
+	}
+
+	/* Quicker fallback, avoid locks when ring is empty */
+	if (__ptr_ring_empty(r))
+		return NULL;
+
+	/* Slow-path: Get page from locked ring queue */
+	return ptr_ring_consume(r);
+}
+
+/* slow path
+ *
+ * Allocate a fresh page of order pool->p.order from node pool->p.nid
+ * and, when the pool was created with PP_FLAG_DMA_MAP, DMA-map it and
+ * remember the mapping in page->private.
+ *
+ * Returns the page, or NULL when the allocation or the mapping fails.
+ */
+noinline
+static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
+						 gfp_t _gfp)
+{
+	/* We could always set __GFP_COMP, and avoid this branch, as
+	 * prep_new_page() can handle order-0 with __GFP_COMP.
+	 */
+	gfp_t flags = pool->p.order ? (_gfp | __GFP_COMP) : _gfp;
+	struct page *page;
+
+	/* FUTURE development:
+	 *
+	 * This path allocates one page at a time, which does not
+	 * improve performance.  It needs bulk allocation support from
+	 * the page allocator.
+	 */
+	page = alloc_pages_node(pool->p.nid, flags, pool->p.order);
+	if (!page)
+		return NULL;
+
+	if (pool->p.flags & PP_FLAG_DMA_MAP) {
+		dma_addr_t dma;
+
+		/* The mapping is stored in page->private and kept for the
+		 * lifetime of the page, until it leaves the pool.
+		 */
+		dma = dma_map_page(pool->p.dev, page, 0,
+				   (PAGE_SIZE << pool->p.order),
+				   pool->p.dma_dir);
+		if (dma_mapping_error(pool->p.dev, dma)) {
+			put_page(page);
+			return NULL;
+		}
+		set_page_private(page, dma);
+	}
+
+	/* A page that was just allocated should have refcnt 1. */
+	return page;
+}
+
+/* Replacement for alloc_pages() style calls when using a page_pool; the
+ * pool provides the synchronization guarantee for the allocation side.
+ *
+ * A recycled page is preferred; only when none is cached does the
+ * request go to the page allocator with @gfp.  Returns NULL on failure.
+ */
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+{
+	struct page *recycled = __page_pool_get_cached(pool);
+
+	return recycled ? recycled : __page_pool_alloc_pages_slow(pool, gfp);
+}
+EXPORT_SYMBOL(page_pool_alloc_pages);
+
+/* Remove the page_pool state from @page: for pools created with
+ * PP_FLAG_DMA_MAP the mapping stored in page->private is released and
+ * the field is cleared.  Pools without that flag have nothing to undo.
+ */
+static void __page_pool_clean_page(struct page_pool *pool,
+				   struct page *page)
+{
+	if (pool->p.flags & PP_FLAG_DMA_MAP) {
+		dma_unmap_page(pool->p.dev, page_private(page),
+			       PAGE_SIZE << pool->p.order, pool->p.dma_dir);
+		set_page_private(page, 0);
+	}
+}
+
+/* Return a page to the page allocator, cleaning up our state */
+static void __page_pool_return_page(struct page_pool *pool, struct page *page)
+{
+ __page_pool_clean_page(pool, page); /* drops the DMA mapping when the pool owns one */
+ put_page(page); /* gives up the reference held on the page */
+ /* An optimization would be to call __free_pages(page, pool->p.order)
+ * knowing page is not part of page-cache (thus avoiding a
+ * __page_cache_release() call).
+ */
+}
+
+/* Queue @page on the pool's recycle ring.
+ *
+ * Returns true when the page was queued and false when the producer
+ * call reported a non-zero result.
+ */
+static bool __page_pool_recycle_into_ring(struct page_pool *pool,
+					  struct page *page)
+{
+	int err;
+
+	/* BH protection not needed if current is serving softirq */
+	err = in_serving_softirq() ? ptr_ring_produce(&pool->ring, page)
+				   : ptr_ring_produce_bh(&pool->ring, page);
+
+	return !err;
+}
+
+/* Put @page straight into the allocation-side array of @pool.
+ *
+ * Direct recycling is only allowed in special circumstances, e.g.
+ * during RX-NAPI processing for the XDP_DROP use-case; the caller must
+ * provide an appropriately safe context and MUST know that
+ * page_ref_count(page) == 1.
+ *
+ * Returns false, leaving the page untouched, when the array already
+ * holds PP_ALLOC_CACHE_SIZE pages; true otherwise.
+ */
+static bool __page_pool_recycle_direct(struct page *page,
+				       struct page_pool *pool)
+{
+	if (likely(pool->alloc.count != PP_ALLOC_CACHE_SIZE)) {
+		pool->alloc.cache[pool->alloc.count++] = page;
+		return true;
+	}
+
+	return false;
+}
+
+/* Give @page back to @pool.
+ *
+ * The allocator is optimized for the XDP mode that uses one frame per
+ * page, with fallbacks that behave like the regular page allocator
+ * APIs.  @allow_direct permits recycling into the allocation-side
+ * array when called while serving softirq.
+ */
+void __page_pool_put_page(struct page_pool *pool,
+			  struct page *page, bool allow_direct)
+{
+	/* Fallback/non-XDP mode: the API user holds extra references.
+	 *
+	 * Many drivers split a page into fragments and recycle based on
+	 * the refcnt; this is supported to ease switching between XDP and
+	 * non-XDP operation.  When the pool maintains the DMA mapping the
+	 * API user must call page_pool_put_page once; the mapping is
+	 * released here because somebody else is expected to do the final
+	 * put_page.
+	 */
+	if (unlikely(page_ref_count(page) != 1)) {
+		__page_pool_clean_page(pool, page);
+		put_page(page);
+		return;
+	}
+
+	/* refcnt == 1 means page_pool owns page, and can recycle it.
+	 * Read barrier done in page_ref_count / READ_ONCE.
+	 */
+	if (allow_direct && in_serving_softirq() &&
+	    __page_pool_recycle_direct(page, pool))
+		return;
+
+	/* Ring full: fall back to freeing the page */
+	if (!__page_pool_recycle_into_ring(pool, page))
+		__page_pool_return_page(pool, page);
+}
+EXPORT_SYMBOL(__page_pool_put_page);
+
+/* Drain the recycle ring, returning every page to the page allocator.
+ * A page whose refcnt is not 1 is reported but still returned.
+ */
+static void __page_pool_empty_ring(struct page_pool *pool)
+{
+	for (;;) {
+		struct page *page = ptr_ring_consume(&pool->ring);
+
+		if (!page)
+			break;
+
+		/* Verify the refcnt invariant of cached pages */
+		if (page_ref_count(page) != 1)
+			pr_crit("%s() page_pool refcnt %d violation\n",
+				__func__, page_ref_count(page));
+
+		__page_pool_return_page(pool, page);
+	}
+}
+
+/* RCU callback queued by page_pool_destroy(): warns if the allocation
+ * array still holds pages, drains the ring once more, releases the
+ * ring and frees the pool itself.
+ */
+static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+{
+	struct page_pool *pool = container_of(rcu, struct page_pool, rcu);
+
+	WARN(pool->alloc.count, "API usage violation");
+
+	__page_pool_empty_ring(pool);
+	ptr_ring_cleanup(&pool->ring, NULL);
+	kfree(pool);
+}
+
+/* Cleanup and release resources.
+ *
+ * The pool structure itself is freed from an RCU callback, because an
+ * xdp_mem_allocator can still hold a reference to the page_pool
+ * pointer.
+ */
+void page_pool_destroy(struct page_pool *pool)
+{
+	/* Flush the allocation array.  The caller is assumed to have
+	 * made sure the pool is no longer in use and that
+	 * page_pool_alloc_pages() cannot be called concurrently.
+	 */
+	while (pool->alloc.count)
+		__page_pool_return_page(pool,
+					pool->alloc.cache[--pool->alloc.count]);
+
+	/* No more consumers should exist, but producers could still
+	 * be in-flight.
+	 */
+	__page_pool_empty_ring(pool);
+
+	call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+}
+EXPORT_SYMBOL(page_pool_destroy);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 45936922d7e2..5ef61222fdef 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -59,6 +59,9 @@
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
+#define RTNL_MAX_TYPE 48
+#define RTNL_SLAVE_MAX_TYPE 36
+
struct rtnl_link {
rtnl_doit_func doit;
rtnl_dumpit_func dumpit;
@@ -389,6 +392,11 @@ int rtnl_link_register(struct rtnl_link_ops *ops)
{
int err;
+ /* Sanity-check max sizes to avoid stack buffer overflow. */
+ if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE ||
+ ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
+ return -EINVAL;
+
rtnl_lock();
err = __rtnl_link_register(ops);
rtnl_unlock();
@@ -785,13 +793,15 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
long expires, u32 error)
{
struct rta_cacheinfo ci = {
- .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse),
- .rta_used = dst->__use,
- .rta_clntref = atomic_read(&(dst->__refcnt)),
.rta_error = error,
.rta_id = id,
};
+ if (dst) {
+ ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
+ ci.rta_used = dst->__use;
+ ci.rta_clntref = atomic_read(&dst->__refcnt);
+ }
if (expires) {
unsigned long clock;
@@ -2256,6 +2266,10 @@ static int do_setlink(const struct sk_buff *skb,
const struct net_device_ops *ops = dev->netdev_ops;
int err;
+ err = validate_linkmsg(dev, tb);
+ if (err < 0)
+ return err;
+
if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {
struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
tb, CAP_NET_ADMIN);
@@ -2619,10 +2633,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- err = validate_linkmsg(dev, tb);
- if (err < 0)
- goto errout;
-
err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0);
errout:
return err;
@@ -2900,13 +2910,16 @@ replay:
}
if (1) {
- struct nlattr *attr[ops ? ops->maxtype + 1 : 1];
- struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 1];
+ struct nlattr *attr[RTNL_MAX_TYPE + 1];
+ struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
struct nlattr **data = NULL;
struct nlattr **slave_data = NULL;
struct net *dest_net, *link_net = NULL;
if (ops) {
+ if (ops->maxtype > RTNL_MAX_TYPE)
+ return -EINVAL;
+
if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
err = nla_parse_nested(attr, ops->maxtype,
linkinfo[IFLA_INFO_DATA],
@@ -2923,6 +2936,9 @@ replay:
}
if (m_ops) {
+ if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
+ return -EINVAL;
+
if (m_ops->slave_maxtype &&
linkinfo[IFLA_INFO_SLAVE_DATA]) {
err = nla_parse_nested(slave_attr,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 345b51837ca8..c642304f178c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1305,7 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
skb->inner_mac_header += off;
}
-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
__copy_skb_header(new, old);
@@ -1313,6 +1313,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
+EXPORT_SYMBOL(skb_copy_header);
static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
@@ -1355,7 +1356,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
return n;
}
EXPORT_SYMBOL(skb_copy);
@@ -1419,7 +1420,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
skb_clone_fraglist(n);
}
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
out:
return n;
}
@@ -1599,7 +1600,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
skb->len + head_copy_len));
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
skb_headers_offset_update(n, newheadroom - oldheadroom);
@@ -1839,6 +1840,20 @@ done:
}
EXPORT_SYMBOL(___pskb_trim);
+/* Note : use pskb_trim_rcsum() instead of calling this directly
+ */
+int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ int delta = skb->len - len;
+
+ skb->csum = csum_sub(skb->csum,
+ skb_checksum(skb, len, delta, 0));
+ }
+ return __pskb_trim(skb, len);
+}
+EXPORT_SYMBOL(pskb_trim_rcsum_slow);
+
/**
* __pskb_pull_tail - advance tail of skb header
* @skb: buffer to reallocate
@@ -4926,6 +4941,8 @@ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
thlen = tcp_hdrlen(skb);
} else if (unlikely(skb_is_gso_sctp(skb))) {
thlen = sizeof(struct sctphdr);
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
+ thlen = sizeof(struct udphdr);
}
/* UFO sets gso_size to the size of the fragmentation
* payload, i.e. the size of the L4 (UDP) header is already
diff --git a/net/core/sock.c b/net/core/sock.c
index 2aed99a541d5..f333d75ef1a9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -226,7 +226,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
- x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX"
+ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
+ x "AF_MAX"
static const char *const af_family_key_strings[AF_MAX+1] = {
_sock_locks("sk_lock-")
@@ -262,7 +263,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
"rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" ,
"rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" ,
"rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" ,
- "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX"
+ "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" ,
+ "rlock-AF_MAX"
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
"wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" ,
@@ -279,7 +281,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
"wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" ,
"wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" ,
"wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" ,
- "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX"
+ "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" ,
+ "wlock-AF_MAX"
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
"elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" ,
@@ -296,7 +299,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = {
"elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" ,
"elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" ,
"elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" ,
- "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX"
+ "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" ,
+ "elock-AF_MAX"
};
/*
@@ -323,8 +327,8 @@ EXPORT_SYMBOL(sysctl_optmem_max);
int sysctl_tstamp_allow_data __read_mostly = 1;
-struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
-EXPORT_SYMBOL_GPL(memalloc_socks);
+DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
+EXPORT_SYMBOL_GPL(memalloc_socks_key);
/**
* sk_set_memalloc - sets %SOCK_MEMALLOC
@@ -338,7 +342,7 @@ void sk_set_memalloc(struct sock *sk)
{
sock_set_flag(sk, SOCK_MEMALLOC);
sk->sk_allocation |= __GFP_MEMALLOC;
- static_key_slow_inc(&memalloc_socks);
+ static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
@@ -346,7 +350,7 @@ void sk_clear_memalloc(struct sock *sk)
{
sock_reset_flag(sk, SOCK_MEMALLOC);
sk->sk_allocation &= ~__GFP_MEMALLOC;
- static_key_slow_dec(&memalloc_socks);
+ static_branch_dec(&memalloc_socks_key);
/*
* SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
@@ -724,9 +728,22 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sock_valbool_flag(sk, SOCK_DBG, valbool);
break;
case SO_REUSEADDR:
- sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+ val = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+ if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
+ inet_sk(sk)->inet_num &&
+ (sk->sk_reuse != val)) {
+ ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
+ break;
+ }
+ sk->sk_reuse = val;
break;
case SO_REUSEPORT:
+ if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
+ inet_sk(sk)->inet_num &&
+ (sk->sk_reuseport != valbool)) {
+ ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
+ break;
+ }
sk->sk_reuseport = valbool;
break;
case SO_TYPE:
@@ -905,7 +922,10 @@ set_rcvbuf:
case SO_RCVLOWAT:
if (val < 0)
val = INT_MAX;
- sk->sk_rcvlowat = val ? : 1;
+ if (sock->ops->set_rcvlowat)
+ ret = sock->ops->set_rcvlowat(sk, val);
+ else
+ sk->sk_rcvlowat = val ? : 1;
break;
case SO_RCVTIMEO:
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 097a0f74e004..9d1f22072d5d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -5,6 +5,10 @@
*/
#include <linux/types.h>
#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/rhashtable.h>
+#include <net/page_pool.h>
#include <net/xdp.h>
@@ -13,6 +17,105 @@
#define REG_STATE_UNREGISTERED 0x2
#define REG_STATE_UNUSED 0x3
+static DEFINE_IDA(mem_id_pool);
+static DEFINE_MUTEX(mem_id_lock);
+#define MEM_ID_MAX 0xFFFE
+#define MEM_ID_MIN 1
+static int mem_id_next = MEM_ID_MIN;
+
+static bool mem_id_init; /* false */
+static struct rhashtable *mem_id_ht;
+
+struct xdp_mem_allocator {
+ struct xdp_mem_info mem;
+ union {
+ void *allocator;
+ struct page_pool *page_pool;
+ struct zero_copy_allocator *zc_alloc;
+ };
+ struct rhash_head node;
+ struct rcu_head rcu;
+};
+
+static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
+{
+ const u32 *k = data;
+ const u32 key = *k;
+
+ BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
+ != sizeof(u32));
+
+ /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */
+ return key << RHT_HASH_RESERVED_SPACE;
+}
+
+static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct xdp_mem_allocator *xa = ptr;
+ u32 mem_id = *(u32 *)arg->key;
+
+ return xa->mem.id != mem_id;
+}
+
+static const struct rhashtable_params mem_id_rht_params = {
+ .nelem_hint = 64,
+ .head_offset = offsetof(struct xdp_mem_allocator, node),
+ .key_offset = offsetof(struct xdp_mem_allocator, mem.id),
+ .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id),
+ .max_size = MEM_ID_MAX,
+ .min_size = 8,
+ .automatic_shrinking = true,
+ .hashfn = xdp_mem_id_hashfn,
+ .obj_cmpfn = xdp_mem_id_cmp,
+};
+
+static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
+{
+ struct xdp_mem_allocator *xa;
+
+ xa = container_of(rcu, struct xdp_mem_allocator, rcu);
+
+ /* Allow this ID to be reused */
+ ida_simple_remove(&mem_id_pool, xa->mem.id);
+
+ /* Notice, driver is expected to free the *allocator,
+ * e.g. page_pool, and MUST also use RCU free.
+ */
+
+ /* Poison memory */
+ xa->mem.id = 0xFFFF;
+ xa->mem.type = 0xF0F0;
+ xa->allocator = (void *)0xDEAD9001;
+
+ kfree(xa);
+}
+
+static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+{
+ struct xdp_mem_allocator *xa;
+ int id = xdp_rxq->mem.id;
+ int err;
+
+ if (id == 0)
+ return;
+
+ mutex_lock(&mem_id_lock);
+
+ xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+ return;
+ }
+
+ err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params);
+ WARN_ON(err);
+
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+ mutex_unlock(&mem_id_lock);
+}
+
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -21,8 +124,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
+ __xdp_rxq_info_unreg_mem_model(xdp_rxq);
+
xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
xdp_rxq->dev = NULL;
+
+ /* Reset mem info to defaults */
+ xdp_rxq->mem.id = 0;
+ xdp_rxq->mem.type = 0;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
@@ -71,3 +180,193 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
+
+static int __mem_id_init_hash_table(void)
+{
+ struct rhashtable *rht;
+ int ret;
+
+ if (unlikely(mem_id_init))
+ return 0;
+
+ rht = kzalloc(sizeof(*rht), GFP_KERNEL);
+ if (!rht)
+ return -ENOMEM;
+
+ ret = rhashtable_init(rht, &mem_id_rht_params);
+ if (ret < 0) {
+ kfree(rht);
+ return ret;
+ }
+ mem_id_ht = rht;
+ smp_mb(); /* mutex lock should provide enough pairing */
+ mem_id_init = true;
+
+ return 0;
+}
+
+/* Allocate a cyclic ID that maps to allocator pointer.
+ * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
+ *
+ * Caller must lock mem_id_lock.
+ */
+static int __mem_id_cyclic_get(gfp_t gfp)
+{
+ int retries = 1;
+ int id;
+
+again:
+ id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp);
+ if (id < 0) {
+ if (id == -ENOSPC) {
+ /* Cyclic allocator, reset next id */
+ if (retries--) {
+ mem_id_next = MEM_ID_MIN;
+ goto again;
+ }
+ }
+ return id; /* errno */
+ }
+ mem_id_next = id + 1;
+
+ return id;
+}
+
+static bool __is_supported_mem_type(enum xdp_mem_type type)
+{
+ if (type == MEM_TYPE_PAGE_POOL)
+ return is_page_pool_compiled_in();
+
+ if (type >= MEM_TYPE_MAX)
+ return false;
+
+ return true;
+}
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+ enum xdp_mem_type type, void *allocator)
+{
+ struct xdp_mem_allocator *xdp_alloc;
+ gfp_t gfp = GFP_KERNEL;
+ int id, errno, ret;
+ void *ptr;
+
+ if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+ WARN(1, "Missing register, driver bug");
+ return -EFAULT;
+ }
+
+ if (!__is_supported_mem_type(type))
+ return -EOPNOTSUPP;
+
+ xdp_rxq->mem.type = type;
+
+ if (!allocator) {
+ if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
+ return -EINVAL; /* Setup time check page_pool req */
+ return 0;
+ }
+
+ /* Delay init of rhashtable to save memory if feature isn't used */
+ if (!mem_id_init) {
+ mutex_lock(&mem_id_lock);
+ ret = __mem_id_init_hash_table();
+ mutex_unlock(&mem_id_lock);
+ if (ret < 0) {
+ WARN_ON(1);
+ return ret;
+ }
+ }
+
+ xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
+ if (!xdp_alloc)
+ return -ENOMEM;
+
+ mutex_lock(&mem_id_lock);
+ id = __mem_id_cyclic_get(gfp);
+ if (id < 0) {
+ errno = id;
+ goto err;
+ }
+ xdp_rxq->mem.id = id;
+ xdp_alloc->mem = xdp_rxq->mem;
+ xdp_alloc->allocator = allocator;
+
+ /* Insert allocator into ID lookup table */
+ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
+ if (IS_ERR(ptr)) {
+ errno = PTR_ERR(ptr);
+ goto err;
+ }
+
+ mutex_unlock(&mem_id_lock);
+
+ return 0;
+err:
+ mutex_unlock(&mem_id_lock);
+ kfree(xdp_alloc);
+ return errno;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
+
+/* XDP RX runs under NAPI protection, and in different delivery error
+ * scenarios (e.g. queue full), it is possible to return the xdp_frame
+ * while still leveraging this protection. The @napi_direct boolean
+ * is used for those call sites, thus allowing for faster recycling
+ * of xdp_frames/pages in those cases.
+ */
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+ unsigned long handle)
+{
+ struct xdp_mem_allocator *xa;
+ struct page *page;
+
+ switch (mem->type) {
+ case MEM_TYPE_PAGE_POOL:
+ rcu_read_lock();
+ /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ page = virt_to_head_page(data);
+ if (xa)
+ page_pool_put_page(xa->page_pool, page, napi_direct);
+ else
+ put_page(page);
+ rcu_read_unlock();
+ break;
+ case MEM_TYPE_PAGE_SHARED:
+ page_frag_free(data);
+ break;
+ case MEM_TYPE_PAGE_ORDER0:
+ page = virt_to_page(data); /* Assumes order0 page*/
+ put_page(page);
+ break;
+ case MEM_TYPE_ZERO_COPY:
+ /* NB! Only valid from an xdp_buff! */
+ rcu_read_lock();
+ /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ xa->zc_alloc->free(xa->zc_alloc, handle);
+ rcu_read_unlock();
+ default:
+ /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+ break;
+ }
+}
+
+void xdp_return_frame(struct xdp_frame *xdpf)
+{
+ __xdp_return(xdpf->data, &xdpf->mem, false, 0);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame);
+
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
+{
+ __xdp_return(xdpf->data, &xdpf->mem, true, 0);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+
+void xdp_return_buff(struct xdp_buff *xdp)
+{
+ __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
+}
+EXPORT_SYMBOL_GPL(xdp_return_buff);
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index bae7d78aa068..d2f4e0c1faaf 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -176,6 +176,7 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = {
[DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)},
[DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)},
[DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)},
+ [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)},
};
/* DCB number of traffic classes nested attributes. */
@@ -1094,6 +1095,16 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
return -EMSGSIZE;
}
+ if (ops->dcbnl_getbuffer) {
+ struct dcbnl_buffer buffer;
+
+ memset(&buffer, 0, sizeof(buffer));
+ err = ops->dcbnl_getbuffer(netdev, &buffer);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer))
+ return -EMSGSIZE;
+ }
+
app = nla_nest_start(skb, DCB_ATTR_IEEE_APP_TABLE);
if (!app)
return -EMSGSIZE;
@@ -1453,6 +1464,15 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
goto err;
}
+ if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) {
+ struct dcbnl_buffer *buffer =
+ nla_data(ieee[DCB_ATTR_DCB_BUFFER]);
+
+ err = ops->dcbnl_setbuffer(netdev, buffer);
+ if (err)
+ goto err;
+ }
+
if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
struct nlattr *attr;
int rem;
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 37ccbe62eb1a..ba6fc3c1186b 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -53,7 +53,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
if (timeo < rto)
timeo = rto;
- tw->tw_timeout = DCCP_TIMEWAIT_LEN;
if (state == DCCP_TIME_WAIT)
timeo = DCCP_TIMEWAIT_LEN;
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index c795c3f509c9..72236695db3d 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -121,13 +121,16 @@ static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
- struct nlattr **tb)
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
int err = -EINVAL;
struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
- if (frh->tos)
+ if (frh->tos) {
+ NL_SET_ERR_MSG(extack, "Invalid tos value");
goto errout;
+ }
if (rule->table == RT_TABLE_UNSPEC) {
if (rule->action == FR_ACT_TO_TBL) {
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index bbf2c82cf7b2..4183e4ba27a5 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -9,7 +9,7 @@ config NET_DSA
depends on HAVE_NET_DSA && MAY_USE_DEVLINK
depends on BRIDGE || BRIDGE=n
select NET_SWITCHDEV
- select PHYLIB
+ select PHYLINK
---help---
Say Y if you want to enable support for the hardware switches supported
by the Distributed Switch Architecture.
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 47725250b4ca..dc5d9af3dc80 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -272,7 +272,28 @@ static int dsa_port_setup(struct dsa_port *dp)
case DSA_PORT_TYPE_UNUSED:
break;
case DSA_PORT_TYPE_CPU:
+ /* dp->index is used now as port_number. However
+ * CPU ports should have separate numbering
+ * independent from front panel port numbers.
+ */
+ devlink_port_attrs_set(&dp->devlink_port,
+ DEVLINK_PORT_FLAVOUR_CPU,
+ dp->index, false, 0);
+ err = dsa_port_link_register_of(dp);
+ if (err) {
+ dev_err(ds->dev, "failed to setup link for port %d.%d\n",
+ ds->index, dp->index);
+ return err;
+ }
+ break;
case DSA_PORT_TYPE_DSA:
+ /* dp->index is used now as port_number. However
+ * DSA ports should have separate numbering
+ * independent from front panel port numbers.
+ */
+ devlink_port_attrs_set(&dp->devlink_port,
+ DEVLINK_PORT_FLAVOUR_DSA,
+ dp->index, false, 0);
err = dsa_port_link_register_of(dp);
if (err) {
dev_err(ds->dev, "failed to setup link for port %d.%d\n",
@@ -281,6 +302,9 @@ static int dsa_port_setup(struct dsa_port *dp)
}
break;
case DSA_PORT_TYPE_USER:
+ devlink_port_attrs_set(&dp->devlink_port,
+ DEVLINK_PORT_FLAVOUR_PHYSICAL,
+ dp->index, false, 0);
err = dsa_slave_create(dp);
if (err)
dev_err(ds->dev, "failed to create slave for port %d.%d\n",
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 053731473c99..3964c6f7a7c0 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -75,15 +75,6 @@ struct dsa_slave_priv {
/* DSA port data, such as switch, port index, etc. */
struct dsa_port *dp;
- /*
- * The phylib phy_device pointer for the PHY connected
- * to this port.
- */
- phy_interface_t phy_interface;
- int old_link;
- int old_pause;
- int old_duplex;
-
#ifdef CONFIG_NET_POLL_CONTROLLER
struct netpoll *netpoll;
#endif
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 90e6df0351eb..c90ee3227dea 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -22,7 +22,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev,
int port = cpu_dp->index;
int count = 0;
- if (ops && ops->get_sset_count && ops->get_ethtool_stats) {
+ if (ops->get_sset_count && ops->get_ethtool_stats) {
count = ops->get_sset_count(dev, ETH_SS_STATS);
ops->get_ethtool_stats(dev, stats, data);
}
@@ -31,6 +31,32 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev,
ds->ops->get_ethtool_stats(ds, port, data + count);
}
+static void dsa_master_get_ethtool_phy_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ uint64_t *data)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ int port = cpu_dp->index;
+ int count = 0;
+
+ if (dev->phydev && !ops->get_ethtool_phy_stats) {
+ count = phy_ethtool_get_sset_count(dev->phydev);
+ if (count >= 0)
+ phy_ethtool_get_stats(dev->phydev, stats, data);
+ } else if (ops->get_sset_count && ops->get_ethtool_phy_stats) {
+ count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
+ ops->get_ethtool_phy_stats(dev, stats, data);
+ }
+
+ if (count < 0)
+ count = 0;
+
+ if (ds->ops->get_ethtool_phy_stats)
+ ds->ops->get_ethtool_phy_stats(ds, port, data + count);
+}
+
static int dsa_master_get_sset_count(struct net_device *dev, int sset)
{
struct dsa_port *cpu_dp = dev->dsa_ptr;
@@ -38,11 +64,17 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset)
struct dsa_switch *ds = cpu_dp->ds;
int count = 0;
- if (ops && ops->get_sset_count)
- count += ops->get_sset_count(dev, sset);
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ count = phy_ethtool_get_sset_count(dev->phydev);
+ else if (ops->get_sset_count)
+ count = ops->get_sset_count(dev, sset);
+
+ if (count < 0)
+ count = 0;
- if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
- count += ds->ops->get_sset_count(ds, cpu_dp->index);
+ if (ds->ops->get_sset_count)
+ count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
return count;
}
@@ -64,19 +96,28 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
/* We do not want to be NULL-terminated, since this is a prefix */
pfx[sizeof(pfx) - 1] = '_';
- if (ops && ops->get_sset_count && ops->get_strings) {
- mcount = ops->get_sset_count(dev, ETH_SS_STATS);
+ if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats) {
+ mcount = phy_ethtool_get_sset_count(dev->phydev);
+ if (mcount < 0)
+ mcount = 0;
+ else
+ phy_ethtool_get_strings(dev->phydev, data);
+ } else if (ops->get_sset_count && ops->get_strings) {
+ mcount = ops->get_sset_count(dev, stringset);
+ if (mcount < 0)
+ mcount = 0;
ops->get_strings(dev, stringset, data);
}
- if (stringset == ETH_SS_STATS && ds->ops->get_strings) {
+ if (ds->ops->get_strings) {
ndata = data + mcount * len;
/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
* the output after to prepend our CPU port prefix we
* constructed earlier
*/
- ds->ops->get_strings(ds, port, ndata);
- count = ds->ops->get_sset_count(ds, port);
+ ds->ops->get_strings(ds, port, stringset, ndata);
+ count = ds->ops->get_sset_count(ds, port, stringset);
for (i = 0; i < count; i++) {
memmove(ndata + (i * len + sizeof(pfx)),
ndata + i * len, len - sizeof(pfx));
@@ -102,6 +143,7 @@ static int dsa_master_ethtool_setup(struct net_device *dev)
ops->get_sset_count = dsa_master_get_sset_count;
ops->get_ethtool_stats = dsa_master_get_ethtool_stats;
ops->get_strings = dsa_master_get_strings;
+ ops->get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats;
dev->ethtool_ops = ops;
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 7acc1169d75e..ed0595459df1 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -252,6 +252,9 @@ int dsa_port_vlan_add(struct dsa_port *dp,
.vlan = vlan,
};
+ if (netif_is_bridge_master(vlan->obj.orig_dev))
+ return -EOPNOTSUPP;
+
if (br_vlan_enabled(dp->bridge_dev))
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
@@ -267,31 +270,47 @@ int dsa_port_vlan_del(struct dsa_port *dp,
.vlan = vlan,
};
+ if (netif_is_bridge_master(vlan->obj.orig_dev))
+ return -EOPNOTSUPP;
+
if (br_vlan_enabled(dp->bridge_dev))
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
return 0;
}
-static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
+static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
{
- struct device_node *port_dn = dp->dn;
struct device_node *phy_dn;
- struct dsa_switch *ds = dp->ds;
struct phy_device *phydev;
- int port = dp->index;
- int err = 0;
- phy_dn = of_parse_phandle(port_dn, "phy-handle", 0);
+ phy_dn = of_parse_phandle(dp->dn, "phy-handle", 0);
if (!phy_dn)
- return 0;
+ return NULL;
phydev = of_phy_find_device(phy_dn);
if (!phydev) {
- err = -EPROBE_DEFER;
- goto err_put_of;
+ of_node_put(phy_dn);
+ return ERR_PTR(-EPROBE_DEFER);
}
+ return phydev;
+}
+
+static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct phy_device *phydev;
+ int port = dp->index;
+ int err = 0;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (!phydev)
+ return 0;
+
+ if (IS_ERR(phydev))
+ return PTR_ERR(phydev);
+
if (enable) {
err = genphy_config_init(phydev);
if (err < 0)
@@ -317,8 +336,6 @@ static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
err_put_dev:
put_device(&phydev->mdio.dev);
-err_put_of:
- of_node_put(phy_dn);
return err;
}
@@ -372,3 +389,60 @@ void dsa_port_link_unregister_of(struct dsa_port *dp)
else
dsa_port_setup_phy_of(dp, false);
}
+
+int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data)
+{
+ struct phy_device *phydev;
+ int ret = -EOPNOTSUPP;
+
+ if (of_phy_is_fixed_link(dp->dn))
+ return ret;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (IS_ERR_OR_NULL(phydev))
+ return ret;
+
+ ret = phy_ethtool_get_strings(phydev, data);
+ put_device(&phydev->mdio.dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_phy_strings);
+
+int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data)
+{
+ struct phy_device *phydev;
+ int ret = -EOPNOTSUPP;
+
+ if (of_phy_is_fixed_link(dp->dn))
+ return ret;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (IS_ERR_OR_NULL(phydev))
+ return ret;
+
+ ret = phy_ethtool_get_stats(phydev, NULL, data);
+ put_device(&phydev->mdio.dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_ethtool_phy_stats);
+
+int dsa_port_get_phy_sset_count(struct dsa_port *dp)
+{
+ struct phy_device *phydev;
+ int ret = -EOPNOTSUPP;
+
+ if (of_phy_is_fixed_link(dp->dn))
+ return ret;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (IS_ERR_OR_NULL(phydev))
+ return ret;
+
+ ret = phy_ethtool_get_sset_count(phydev);
+ put_device(&phydev->mdio.dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_phy_sset_count);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 18561af7a8f1..1e3b6a6d8a40 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -13,6 +13,7 @@
#include <linux/netdevice.h>
#include <linux/phy.h>
#include <linux/phy_fixed.h>
+#include <linux/phylink.h>
#include <linux/of_net.h>
#include <linux/of_mdio.h>
#include <linux/mdio.h>
@@ -97,8 +98,7 @@ static int dsa_slave_open(struct net_device *dev)
if (err)
goto clear_promisc;
- if (dev->phydev)
- phy_start(dev->phydev);
+ phylink_start(dp->pl);
return 0;
@@ -120,8 +120,7 @@ static int dsa_slave_close(struct net_device *dev)
struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
- if (dev->phydev)
- phy_stop(dev->phydev);
+ phylink_stop(dp->pl);
dsa_port_disable(dp, dev->phydev);
@@ -272,10 +271,7 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
break;
}
- if (!dev->phydev)
- return -ENODEV;
-
- return phy_mii_ioctl(dev->phydev, ifr, cmd);
+ return phylink_mii_ioctl(p->dp->pl, ifr, cmd);
}
static int dsa_slave_port_attr_set(struct net_device *dev,
@@ -498,14 +494,11 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
ds->ops->get_regs(ds, dp->index, regs, _p);
}
-static u32 dsa_slave_get_link(struct net_device *dev)
+static int dsa_slave_nway_reset(struct net_device *dev)
{
- if (!dev->phydev)
- return -ENODEV;
-
- genphy_update_link(dev->phydev);
+ struct dsa_port *dp = dsa_slave_to_port(dev);
- return dev->phydev->link;
+ return phylink_ethtool_nway_reset(dp->pl);
}
static int dsa_slave_get_eeprom_len(struct net_device *dev)
@@ -560,7 +553,8 @@ static void dsa_slave_get_strings(struct net_device *dev,
strncpy(data + 2 * len, "rx_packets", len);
strncpy(data + 3 * len, "rx_bytes", len);
if (ds->ops->get_strings)
- ds->ops->get_strings(ds, dp->index, data + 4 * len);
+ ds->ops->get_strings(ds, dp->index, stringset,
+ data + 4 * len);
}
}
@@ -605,7 +599,7 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
count = 4;
if (ds->ops->get_sset_count)
- count += ds->ops->get_sset_count(ds, dp->index);
+ count += ds->ops->get_sset_count(ds, dp->index, sset);
return count;
}
@@ -618,6 +612,8 @@ static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
struct dsa_port *dp = dsa_slave_to_port(dev);
struct dsa_switch *ds = dp->ds;
+ phylink_ethtool_get_wol(dp->pl, w);
+
if (ds->ops->get_wol)
ds->ops->get_wol(ds, dp->index, w);
}
@@ -628,6 +624,8 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
struct dsa_switch *ds = dp->ds;
int ret = -EOPNOTSUPP;
+ phylink_ethtool_set_wol(dp->pl, w);
+
if (ds->ops->set_wol)
ret = ds->ops->set_wol(ds, dp->index, w);
@@ -651,13 +649,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
if (ret)
return ret;
- if (e->eee_enabled) {
- ret = phy_init_eee(dev->phydev, 0);
- if (ret)
- return ret;
- }
-
- return phy_ethtool_set_eee(dev->phydev, e);
+ return phylink_ethtool_set_eee(dp->pl, e);
}
static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
@@ -677,7 +669,23 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
if (ret)
return ret;
- return phy_ethtool_get_eee(dev->phydev, e);
+ return phylink_ethtool_get_eee(dp->pl, e);
+}
+
+static int dsa_slave_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *cmd)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+
+ return phylink_ethtool_ksettings_get(dp->pl, cmd);
+}
+
+static int dsa_slave_set_link_ksettings(struct net_device *dev,
+ const struct ethtool_link_ksettings *cmd)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+
+ return phylink_ethtool_ksettings_set(dp->pl, cmd);
}
#ifdef CONFIG_NET_POLL_CONTROLLER
@@ -980,8 +988,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_drvinfo = dsa_slave_get_drvinfo,
.get_regs_len = dsa_slave_get_regs_len,
.get_regs = dsa_slave_get_regs,
- .nway_reset = phy_ethtool_nway_reset,
- .get_link = dsa_slave_get_link,
+ .nway_reset = dsa_slave_nway_reset,
+ .get_link = ethtool_op_get_link,
.get_eeprom_len = dsa_slave_get_eeprom_len,
.get_eeprom = dsa_slave_get_eeprom,
.set_eeprom = dsa_slave_set_eeprom,
@@ -992,8 +1000,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_wol = dsa_slave_get_wol,
.set_eee = dsa_slave_set_eee,
.get_eee = dsa_slave_get_eee,
- .get_link_ksettings = phy_ethtool_get_link_ksettings,
- .set_link_ksettings = phy_ethtool_set_link_ksettings,
+ .get_link_ksettings = dsa_slave_get_link_ksettings,
+ .set_link_ksettings = dsa_slave_set_link_ksettings,
.get_rxnfc = dsa_slave_get_rxnfc,
.set_rxnfc = dsa_slave_set_rxnfc,
.get_ts_info = dsa_slave_get_ts_info,
@@ -1052,56 +1060,122 @@ static struct device_type dsa_type = {
.name = "dsa",
};
-static void dsa_slave_adjust_link(struct net_device *dev)
+static void dsa_slave_phylink_validate(struct net_device *dev,
+ unsigned long *supported,
+ struct phylink_link_state *state)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = dp->ds;
- unsigned int status_changed = 0;
- if (p->old_link != dev->phydev->link) {
- status_changed = 1;
- p->old_link = dev->phydev->link;
- }
+ if (!ds->ops->phylink_validate)
+ return;
- if (p->old_duplex != dev->phydev->duplex) {
- status_changed = 1;
- p->old_duplex = dev->phydev->duplex;
- }
+ ds->ops->phylink_validate(ds, dp->index, supported, state);
+}
- if (p->old_pause != dev->phydev->pause) {
- status_changed = 1;
- p->old_pause = dev->phydev->pause;
- }
+static int dsa_slave_phylink_mac_link_state(struct net_device *dev,
+ struct phylink_link_state *state)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ /* Only called for SGMII and 802.3z */
+ if (!ds->ops->phylink_mac_link_state)
+ return -EOPNOTSUPP;
+
+ return ds->ops->phylink_mac_link_state(ds, dp->index, state);
+}
+
+static void dsa_slave_phylink_mac_config(struct net_device *dev,
+ unsigned int mode,
+ const struct phylink_link_state *state)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_mac_config)
+ return;
+
+ ds->ops->phylink_mac_config(ds, dp->index, mode, state);
+}
- if (ds->ops->adjust_link && status_changed)
- ds->ops->adjust_link(ds, dp->index, dev->phydev);
+static void dsa_slave_phylink_mac_an_restart(struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
- if (status_changed)
- phy_print_status(dev->phydev);
+ if (!ds->ops->phylink_mac_an_restart)
+ return;
+
+ ds->ops->phylink_mac_an_restart(ds, dp->index);
}
-static int dsa_slave_fixed_link_update(struct net_device *dev,
- struct fixed_phy_status *status)
+static void dsa_slave_phylink_mac_link_down(struct net_device *dev,
+ unsigned int mode,
+ phy_interface_t interface)
{
- struct dsa_switch *ds;
- struct dsa_port *dp;
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
- if (dev) {
- dp = dsa_slave_to_port(dev);
- ds = dp->ds;
- if (ds->ops->fixed_link_update)
- ds->ops->fixed_link_update(ds, dp->index, status);
+ if (!ds->ops->phylink_mac_link_down) {
+ if (ds->ops->adjust_link && dev->phydev)
+ ds->ops->adjust_link(ds, dp->index, dev->phydev);
+ return;
}
- return 0;
+ ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
+}
+
+static void dsa_slave_phylink_mac_link_up(struct net_device *dev,
+ unsigned int mode,
+ phy_interface_t interface,
+ struct phy_device *phydev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_mac_link_up) {
+ if (ds->ops->adjust_link && dev->phydev)
+ ds->ops->adjust_link(ds, dp->index, dev->phydev);
+ return;
+ }
+
+ ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev);
+}
+
+static const struct phylink_mac_ops dsa_slave_phylink_mac_ops = {
+ .validate = dsa_slave_phylink_validate,
+ .mac_link_state = dsa_slave_phylink_mac_link_state,
+ .mac_config = dsa_slave_phylink_mac_config,
+ .mac_an_restart = dsa_slave_phylink_mac_an_restart,
+ .mac_link_down = dsa_slave_phylink_mac_link_down,
+ .mac_link_up = dsa_slave_phylink_mac_link_up,
+};
+
+void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
+{
+ const struct dsa_port *dp = dsa_to_port(ds, port);
+
+ phylink_mac_change(dp->pl, up);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
+
+static void dsa_slave_phylink_fixed_state(struct net_device *dev,
+ struct phylink_link_state *state)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ /* No need to check that this operation is valid, the callback would
+ * not be called if it was not.
+ */
+ ds->ops->phylink_fixed_state(ds, dp->index, state);
}
/* slave device setup *******************************************************/
static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr)
{
struct dsa_port *dp = dsa_slave_to_port(slave_dev);
- struct dsa_slave_priv *p = netdev_priv(slave_dev);
struct dsa_switch *ds = dp->ds;
slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr);
@@ -1110,75 +1184,54 @@ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr)
return -ENODEV;
}
- /* Use already configured phy mode */
- if (p->phy_interface == PHY_INTERFACE_MODE_NA)
- p->phy_interface = slave_dev->phydev->interface;
-
- return phy_connect_direct(slave_dev, slave_dev->phydev,
- dsa_slave_adjust_link, p->phy_interface);
+ return phylink_connect_phy(dp->pl, slave_dev->phydev);
}
static int dsa_slave_phy_setup(struct net_device *slave_dev)
{
struct dsa_port *dp = dsa_slave_to_port(slave_dev);
- struct dsa_slave_priv *p = netdev_priv(slave_dev);
struct device_node *port_dn = dp->dn;
struct dsa_switch *ds = dp->ds;
- struct device_node *phy_dn;
- bool phy_is_fixed = false;
u32 phy_flags = 0;
int mode, ret;
mode = of_get_phy_mode(port_dn);
if (mode < 0)
mode = PHY_INTERFACE_MODE_NA;
- p->phy_interface = mode;
- phy_dn = of_parse_phandle(port_dn, "phy-handle", 0);
- if (!phy_dn && of_phy_is_fixed_link(port_dn)) {
- /* In the case of a fixed PHY, the DT node associated
- * to the fixed PHY is the Port DT node
- */
- ret = of_phy_register_fixed_link(port_dn);
- if (ret) {
- netdev_err(slave_dev, "failed to register fixed PHY: %d\n", ret);
- return ret;
- }
- phy_is_fixed = true;
- phy_dn = of_node_get(port_dn);
+ dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode,
+ &dsa_slave_phylink_mac_ops);
+ if (IS_ERR(dp->pl)) {
+ netdev_err(slave_dev,
+ "error creating PHYLINK: %ld\n", PTR_ERR(dp->pl));
+ return PTR_ERR(dp->pl);
}
+ /* Register only if the switch provides such a callback, since this
+ * callback takes precedence over polling the link GPIO in PHYLINK
+ * (see phylink_get_fixed_state).
+ */
+ if (ds->ops->phylink_fixed_state)
+ phylink_fixed_state_cb(dp->pl, dsa_slave_phylink_fixed_state);
+
if (ds->ops->get_phy_flags)
phy_flags = ds->ops->get_phy_flags(ds, dp->index);
- if (phy_dn) {
- slave_dev->phydev = of_phy_connect(slave_dev, phy_dn,
- dsa_slave_adjust_link,
- phy_flags,
- p->phy_interface);
- of_node_put(phy_dn);
- }
-
- if (slave_dev->phydev && phy_is_fixed)
- fixed_phy_set_link_update(slave_dev->phydev,
- dsa_slave_fixed_link_update);
-
- /* We could not connect to a designated PHY, so use the switch internal
- * MDIO bus instead
- */
- if (!slave_dev->phydev) {
+ ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags);
+ if (ret == -ENODEV) {
+ /* We could not connect to a designated PHY or SFP, so use the
+ * switch internal MDIO bus instead
+ */
ret = dsa_slave_phy_connect(slave_dev, dp->index);
if (ret) {
- netdev_err(slave_dev, "failed to connect to port %d: %d\n",
+ netdev_err(slave_dev,
+ "failed to connect to port %d: %d\n",
dp->index, ret);
- if (phy_is_fixed)
- of_phy_deregister_fixed_link(port_dn);
+ phylink_destroy(dp->pl);
return ret;
}
}
- phy_attached_info(slave_dev->phydev);
-
return 0;
}
@@ -1193,29 +1246,26 @@ static void dsa_slave_set_lockdep_class_one(struct net_device *dev,
int dsa_slave_suspend(struct net_device *slave_dev)
{
- struct dsa_slave_priv *p = netdev_priv(slave_dev);
+ struct dsa_port *dp = dsa_slave_to_port(slave_dev);
netif_device_detach(slave_dev);
- if (slave_dev->phydev) {
- phy_stop(slave_dev->phydev);
- p->old_pause = -1;
- p->old_link = -1;
- p->old_duplex = -1;
- phy_suspend(slave_dev->phydev);
- }
+ rtnl_lock();
+ phylink_stop(dp->pl);
+ rtnl_unlock();
return 0;
}
int dsa_slave_resume(struct net_device *slave_dev)
{
+ struct dsa_port *dp = dsa_slave_to_port(slave_dev);
+
netif_device_attach(slave_dev);
- if (slave_dev->phydev) {
- phy_resume(slave_dev->phydev);
- phy_start(slave_dev->phydev);
- }
+ rtnl_lock();
+ phylink_start(dp->pl);
+ rtnl_unlock();
return 0;
}
@@ -1280,11 +1330,6 @@ int dsa_slave_create(struct dsa_port *port)
p->dp = port;
INIT_LIST_HEAD(&p->mall_tc_list);
p->xmit = cpu_dp->tag_ops->xmit;
-
- p->old_pause = -1;
- p->old_link = -1;
- p->old_duplex = -1;
-
port->slave = slave_dev;
netif_carrier_off(slave_dev);
@@ -1307,9 +1352,10 @@ int dsa_slave_create(struct dsa_port *port)
return 0;
out_phy:
- phy_disconnect(slave_dev->phydev);
- if (of_phy_is_fixed_link(port->dn))
- of_phy_deregister_fixed_link(port->dn);
+ rtnl_lock();
+ phylink_disconnect_phy(p->dp->pl);
+ rtnl_unlock();
+ phylink_destroy(p->dp->pl);
out_free:
free_percpu(p->stats64);
free_netdev(slave_dev);
@@ -1321,17 +1367,15 @@ void dsa_slave_destroy(struct net_device *slave_dev)
{
struct dsa_port *dp = dsa_slave_to_port(slave_dev);
struct dsa_slave_priv *p = netdev_priv(slave_dev);
- struct device_node *port_dn = dp->dn;
netif_carrier_off(slave_dev);
- if (slave_dev->phydev) {
- phy_disconnect(slave_dev->phydev);
+ rtnl_lock();
+ phylink_disconnect_phy(dp->pl);
+ rtnl_unlock();
- if (of_phy_is_fixed_link(port_dn))
- of_phy_deregister_fixed_link(port_dn);
- }
dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER);
unregister_netdev(slave_dev);
+ phylink_destroy(dp->pl);
free_percpu(p->stats64);
free_netdev(slave_dev);
}
@@ -1394,6 +1438,9 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work)
switch (switchdev_work->event) {
case SWITCHDEV_FDB_ADD_TO_DEVICE:
fdb_info = &switchdev_work->fdb_info;
+ if (!fdb_info->added_by_user)
+ break;
+
err = dsa_port_fdb_add(dp, fdb_info->addr, fdb_info->vid);
if (err) {
netdev_dbg(dev, "fdb add failed err=%d\n", err);
@@ -1405,6 +1452,9 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work)
case SWITCHDEV_FDB_DEL_TO_DEVICE:
fdb_info = &switchdev_work->fdb_info;
+ if (!fdb_info->added_by_user)
+ break;
+
err = dsa_port_fdb_del(dp, fdb_info->addr, fdb_info->vid);
if (err) {
netdev_dbg(dev, "fdb del failed err=%d\n", err);
@@ -1457,8 +1507,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
switch (event) {
case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */
case SWITCHDEV_FDB_DEL_TO_DEVICE:
- if (dsa_slave_switchdev_fdb_work_init(switchdev_work,
- ptr))
+ if (dsa_slave_switchdev_fdb_work_init(switchdev_work, ptr))
goto err_fdb_work_init;
dev_hold(dev);
break;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index eaeba9b99a73..ee28440f57c5 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -128,15 +128,15 @@ u32 eth_get_headlen(void *data, unsigned int len)
{
const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
const struct ethhdr *eth = (const struct ethhdr *)data;
- struct flow_keys keys;
+ struct flow_keys_basic keys;
/* this should never happen, but better safe than sorry */
if (unlikely(len < sizeof(*eth)))
return len;
/* parse any remaining L2/L3 headers, check for L4 */
- if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
- sizeof(*eth), len, flags))
+ if (!skb_flow_dissect_flow_keys_basic(NULL, &keys, data, eth->h_proto,
+ sizeof(*eth), len, flags))
return max_t(u32, keys.control.thoff, sizeof(*eth));
/* parse for any L4 headers */
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index a07b7dd06def..eec9569ffa5c 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -13,7 +13,10 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
- inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
+ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
+ metrics.o netlink.o
+
+obj-$(CONFIG_BPFILTER) += bpfilter/
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8a59428e63ab..15e125558c76 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = {
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
- .mmap = sock_no_mmap,
+#ifdef CONFIG_MMU
+ .mmap = tcp_mmap,
+#endif
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
@@ -1006,6 +1008,7 @@ const struct proto_ops inet_stream_ops = {
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
+ .set_rcvlowat = tcp_set_rcvlowat,
};
EXPORT_SYMBOL(inet_stream_ops);
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
new file mode 100644
index 000000000000..ce262d76cc48
--- /dev/null
+++ b/net/ipv4/bpfilter/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_BPFILTER) += sockopt.o
+
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
new file mode 100644
index 000000000000..5e04ed25bc0e
--- /dev/null
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/uaccess.h>
+#include <linux/bpfilter.h>
+#include <uapi/linux/bpf.h>
+#include <linux/wait.h>
+#include <linux/kmod.h>
+
+int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
+ char __user *optval,
+ unsigned int optlen, bool is_set);
+EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
+
+static int bpfilter_mbox_request(struct sock *sk, int optname,
+ char __user *optval,
+ unsigned int optlen, bool is_set)
+{
+ if (!bpfilter_process_sockopt) {
+ int err = request_module("bpfilter");
+
+ if (err)
+ return err;
+ if (!bpfilter_process_sockopt)
+ return -ECHILD;
+ }
+ return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
+}
+
+int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
+ unsigned int optlen)
+{
+ return bpfilter_mbox_request(sk, optname, optval, optlen, true);
+}
+
+int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
+ int __user *optlen)
+{
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ return bpfilter_mbox_request(sk, optname, optval, len, false);
+}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 40f001782c1b..d7585ab1a77a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -99,6 +99,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .type = NLA_U32 },
+ [IFA_RT_PRIORITY] = { .type = NLA_U32 },
};
#define IN4_ADDR_HSIZE_SHIFT 8
@@ -835,6 +836,9 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (tb[IFA_RT_PRIORITY])
+ ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
if (tb[IFA_CACHEINFO]) {
struct ifa_cacheinfo *ci;
@@ -906,12 +910,20 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid,
extack);
} else {
+ u32 new_metric = ifa->ifa_rt_priority;
+
inet_free_ifa(ifa);
if (nlh->nlmsg_flags & NLM_F_EXCL ||
!(nlh->nlmsg_flags & NLM_F_REPLACE))
return -EEXIST;
ifa = ifa_existing;
+
+ if (ifa->ifa_rt_priority != new_metric) {
+ fib_modify_prefix_metric(ifa, new_metric);
+ ifa->ifa_rt_priority = new_metric;
+ }
+
set_ifa_lifetime(ifa, valid_lft, prefered_lft);
cancel_delayed_work(&check_lifetime_work);
queue_delayed_work(system_power_efficient_wq,
@@ -1549,6 +1561,7 @@ static size_t inet_nlmsg_size(void)
+ nla_total_size(4) /* IFA_BROADCAST */
+ nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(4) /* IFA_RT_PRIORITY */
+ nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
}
@@ -1618,6 +1631,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
(ifa->ifa_label[0] &&
nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
+ (ifa->ifa_rt_priority &&
+ nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
preferred, valid))
goto nla_put_failure;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e66172aaf241..63aa39b3af03 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -354,8 +354,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.fl4_dport = 0;
}
- trace_fib_validate_source(dev, &fl4);
-
if (fib_lookup(net, &fl4, &res, 0))
goto last_resort;
if (res.type != RTN_UNICAST &&
@@ -650,6 +648,9 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_UID] = { .type = NLA_U32 },
[RTA_MARK] = { .type = NLA_U32 },
[RTA_TABLE] = { .type = NLA_U32 },
+ [RTA_IP_PROTO] = { .type = NLA_U8 },
+ [RTA_SPORT] = { .type = NLA_U16 },
+ [RTA_DPORT] = { .type = NLA_U16 },
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -846,7 +847,8 @@ out_err:
* to fib engine. It is legal, because all events occur
* only when netlink is already locked.
*/
-static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
+static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
+ struct in_ifaddr *ifa, u32 rt_priority)
{
struct net *net = dev_net(ifa->ifa_dev->dev);
u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
@@ -856,6 +858,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
.fc_type = type,
.fc_dst = dst,
.fc_dst_len = dst_len,
+ .fc_priority = rt_priority,
.fc_prefsrc = ifa->ifa_local,
.fc_oif = ifa->ifa_dev->dev->ifindex,
.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
@@ -901,31 +904,57 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
}
}
- fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
if (!(dev->flags & IFF_UP))
return;
/* Add broadcast address, if it is explicitly assigned. */
if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+ prim, 0);
if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
fib_magic(RTM_NEWROUTE,
dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- prefix, ifa->ifa_prefixlen, prim);
+ prefix, ifa->ifa_prefixlen, prim,
+ ifa->ifa_rt_priority);
/* Add network specific broadcasts, when it takes a sense */
if (ifa->ifa_prefixlen < 31) {
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32,
+ prim, 0);
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
- 32, prim);
+ 32, prim, 0);
}
}
}
+void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
+{
+ __be32 prefix = ifa->ifa_address & ifa->ifa_mask;
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+
+ if (!(dev->flags & IFF_UP) ||
+ ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
+ ipv4_is_zeronet(prefix) ||
+ prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32)
+ return;
+
+ /* add the new */
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, ifa, new_metric);
+
+ /* delete the old */
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
+}
+
/* Delete primary or secondary address.
* Optionally, on secondary address promotion consider the addresses
* from subnet iprim as deleted, even if they are in device list.
@@ -967,7 +996,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
fib_magic(RTM_DELROUTE,
dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- any, ifa->ifa_prefixlen, prim);
+ any, ifa->ifa_prefixlen, prim, 0);
subnet = 1;
}
@@ -1051,17 +1080,20 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
no_promotions:
if (!(ok & BRD_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+ prim, 0);
if (subnet && ifa->ifa_prefixlen < 31) {
if (!(ok & BRD1_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
+ prim, 0);
if (!(ok & BRD0_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
+ prim, 0);
}
if (!(ok & LOCAL_OK)) {
unsigned int addr_type;
- fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
/* Check, that this local address finally disappeared. */
addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 737d11bc8838..f8eb78d042a4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -213,14 +213,17 @@ static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
- struct nlattr **tb)
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
int err = -EINVAL;
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
- if (frh->tos & ~IPTOS_TOS_MASK)
+ if (frh->tos & ~IPTOS_TOS_MASK) {
+ NL_SET_ERR_MSG(extack, "Invalid tos");
goto errout;
+ }
/* split local/main if they are not already split */
err = fib_unmerge(net);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c27122f01b87..f3c89ccf14c5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -717,6 +717,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
} else {
+ if (nla_len(nla) != sizeof(u32))
+ return false;
val = nla_get_u32(nla);
}
@@ -1019,47 +1021,8 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
static int
fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
{
- bool ecn_ca = false;
- struct nlattr *nla;
- int remaining;
-
- if (!cfg->fc_mx)
- return 0;
-
- nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
- int type = nla_type(nla);
- u32 val;
-
- if (!type)
- continue;
- if (type > RTAX_MAX)
- return -EINVAL;
-
- if (type == RTAX_CC_ALGO) {
- char tmp[TCP_CA_NAME_MAX];
-
- nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
- if (val == TCP_CA_UNSPEC)
- return -EINVAL;
- } else {
- val = nla_get_u32(nla);
- }
- if (type == RTAX_ADVMSS && val > 65535 - 40)
- val = 65535 - 40;
- if (type == RTAX_MTU && val > 65535 - 15)
- val = 65535 - 15;
- if (type == RTAX_HOPLIMIT && val > 255)
- val = 255;
- if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
- return -EINVAL;
- fi->fib_metrics->metrics[type - 1] = val;
- }
-
- if (ecn_ca)
- fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
-
- return 0;
+ return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len,
+ fi->fib_metrics->metrics);
}
struct fib_info *fib_create_info(struct fib_config *cfg,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 99c23a0cb8ca..5bc0c89e81e4 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1326,14 +1326,14 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
unsigned long index;
t_key cindex;
- trace_fib_table_lookup(tb->tb_id, flp);
-
pn = t->kv;
cindex = 0;
n = get_child_rcu(pn, cindex);
- if (!n)
+ if (!n) {
+ trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
return -EAGAIN;
+ }
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->gets);
@@ -1416,8 +1416,11 @@ backtrace:
* nothing for us to do as we do not have any
* further nodes to parse.
*/
- if (IS_TRIE(pn))
+ if (IS_TRIE(pn)) {
+ trace_fib_table_lookup(tb->tb_id, flp,
+ NULL, -EAGAIN);
return -EAGAIN;
+ }
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->backtrack);
#endif
@@ -1459,6 +1462,7 @@ found:
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
+ trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
return err;
}
if (fi->fib_flags & RTNH_F_DEAD)
@@ -1494,7 +1498,7 @@ found:
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
- trace_fib_table_lookup_nh(nh);
+ trace_fib_table_lookup(tb->tb_id, flp, nh, err);
return err;
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 881ac6d046f2..33a88e045efd 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -27,11 +27,6 @@
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
-#ifdef INET_CSK_DEBUG
-const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
-EXPORT_SYMBOL(inet_csk_timer_bug_msg);
-#endif
-
#if IS_ENABLED(CONFIG_IPV6)
/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
* only, and any IPv4 addresses if not IPv6 only
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f200b304f76c..2d8efeecf619 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -578,6 +578,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
int tunnel_hlen;
int version;
__be16 df;
+ int nhoff;
+ int thoff;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -605,6 +607,16 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
truncate = true;
}
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ if (skb->protocol == htons(ETH_P_IP) &&
+ (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
+ truncate = true;
+
+ thoff = skb_transport_header(skb) - skb_mac_header(skb);
+ if (skb->protocol == htons(ETH_P_IPV6) &&
+ (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
+ truncate = true;
+
if (version == 1) {
erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
ntohl(md->u.index), truncate, true);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d54abc097800..af5a830ff6ad 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -878,11 +878,14 @@ static int __ip_append_data(struct sock *sk,
struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0;
u32 tskey = 0;
+ bool paged;
skb = skb_peek_tail(queue);
exthdrlen = !skb ? rt->dst.header_len : 0;
- mtu = cork->fragsize;
+ mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+ paged = !!cork->gso_size;
+
if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
tskey = sk->sk_tskey++;
@@ -906,8 +909,8 @@ static int __ip_append_data(struct sock *sk,
if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
- !(flags & MSG_MORE) &&
- !exthdrlen)
+ (!(flags & MSG_MORE) || cork->gso_size) &&
+ (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
csummode = CHECKSUM_PARTIAL;
cork->length += length;
@@ -933,6 +936,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
+ unsigned int pagedlen = 0;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
@@ -953,8 +957,12 @@ alloc_new_skb:
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
- else
+ else if (!paged)
alloclen = fraglen;
+ else {
+ alloclen = min_t(int, fraglen, MAX_HEADER);
+ pagedlen = fraglen - alloclen;
+ }
alloclen += exthdrlen;
@@ -998,7 +1006,7 @@ alloc_new_skb:
/*
* Find where to start putting bytes.
*/
- data = skb_put(skb, fraglen + exthdrlen);
+ data = skb_put(skb, fraglen + exthdrlen - pagedlen);
skb_set_network_header(skb, exthdrlen);
skb->transport_header = (skb->network_header +
fragheaderlen);
@@ -1014,7 +1022,7 @@ alloc_new_skb:
pskb_trim_unique(skb_prev, maxfraglen);
}
- copy = datalen - transhdrlen - fraggap;
+ copy = datalen - transhdrlen - fraggap - pagedlen;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
@@ -1022,7 +1030,7 @@ alloc_new_skb:
}
offset += copy;
- length -= datalen - fraggap;
+ length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
csummode = CHECKSUM_NONE;
@@ -1136,6 +1144,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
*rtp = NULL;
cork->fragsize = ip_sk_use_pmtu(sk) ?
dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+
+ cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
cork->dst = &rt->dst;
cork->length = 0;
cork->ttl = ipc->ttl;
@@ -1215,7 +1225,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
return -EOPNOTSUPP;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
- mtu = cork->fragsize;
+ mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
@@ -1471,9 +1481,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable **rtp,
- unsigned int flags)
+ struct inet_cork *cork, unsigned int flags)
{
- struct inet_cork cork;
struct sk_buff_head queue;
int err;
@@ -1482,22 +1491,22 @@ struct sk_buff *ip_make_skb(struct sock *sk,
__skb_queue_head_init(&queue);
- cork.flags = 0;
- cork.addr = 0;
- cork.opt = NULL;
- err = ip_setup_cork(sk, &cork, ipc, rtp);
+ cork->flags = 0;
+ cork->addr = 0;
+ cork->opt = NULL;
+ err = ip_setup_cork(sk, cork, ipc, rtp);
if (err)
return ERR_PTR(err);
- err = __ip_append_data(sk, fl4, &queue, &cork,
+ err = __ip_append_data(sk, fl4, &queue, cork,
&current->task_frag, getfrag,
from, length, transhdrlen, flags);
if (err) {
- __ip_flush_pending_frames(sk, &queue, &cork);
+ __ip_flush_pending_frames(sk, &queue, cork);
return ERR_PTR(err);
}
- return __ip_make_skb(sk, fl4, &queue, &cork);
+ return __ip_make_skb(sk, fl4, &queue, cork);
}
/*
@@ -1553,7 +1562,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
oif = skb->skb_iif;
flowi4_init_output(&fl4, oif,
- IP4_REPLY_MARK(net, skb->mark),
+ IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
RT_TOS(arg->tos),
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 57bbb060faaf..fc32fdbeefa6 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -47,6 +47,8 @@
#include <linux/errqueue.h>
#include <linux/uaccess.h>
+#include <linux/bpfilter.h>
+
/*
* SOL_IP control messages.
*/
@@ -1242,6 +1244,11 @@ int ip_setsockopt(struct sock *sk, int level,
return -ENOPROTOOPT;
err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_BPFILTER
+ if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
+ optname < BPFILTER_IPT_SET_MAX)
+ err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
+#endif
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
@@ -1550,6 +1557,11 @@ int ip_getsockopt(struct sock *sk, int level,
int err;
err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
+#ifdef CONFIG_BPFILTER
+ if (optname >= BPFILTER_IPT_SO_GET_INFO &&
+ optname < BPFILTER_IPT_GET_MAX)
+ err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
+#endif
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
@@ -1582,6 +1594,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
err = do_ip_getsockopt(sk, level, optname, optval, optlen,
MSG_CMSG_COMPAT);
+#ifdef CONFIG_BPFILTER
+ if (optname >= BPFILTER_IPT_SO_GET_INFO &&
+ optname < BPFILTER_IPT_GET_MAX)
+ err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
+#endif
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 2f39479be92f..dde671e97829 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -423,17 +423,17 @@ void __init ip_tunnel_core_init(void)
lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
}
-struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);
EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
void ip_tunnel_need_metadata(void)
{
- static_key_slow_inc(&ip_tunnel_metadata_cnt);
+ static_branch_inc(&ip_tunnel_metadata_cnt);
}
EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
void ip_tunnel_unneed_metadata(void)
{
- static_key_slow_dec(&ip_tunnel_metadata_cnt);
+ static_branch_dec(&ip_tunnel_metadata_cnt);
}
EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index bbcbcc113d19..88212615bf4c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -28,6 +28,9 @@
*
* Multiple Nameservers in /proc/net/pnp
* -- Josef Siemes <jsiemes@web.de>, Aug 2002
+ *
+ * NTP servers in /proc/net/ipconfig/ntp_servers
+ * -- Chris Novakovic <chris@chrisn.me.uk>, April 2018
*/
#include <linux/types.h>
@@ -93,6 +96,7 @@
#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */
#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
- '3' from resolv.h */
+#define CONF_NTP_SERVERS_MAX 3 /* Maximum number of NTP servers */
#define NONE cpu_to_be32(INADDR_NONE)
#define ANY cpu_to_be32(INADDR_ANY)
@@ -152,6 +156,7 @@ static int ic_proto_used; /* Protocol used, if any */
#define ic_proto_used 0
#endif
static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static __be32 ic_ntp_servers[CONF_NTP_SERVERS_MAX]; /* NTP server IP addresses */
static u8 ic_domain[64]; /* DNS (not NIS) domain name */
/*
@@ -576,6 +581,15 @@ static inline void __init ic_nameservers_predef(void)
ic_nameservers[i] = NONE;
}
+/* Predefine NTP servers */
+static inline void __init ic_ntp_servers_predef(void)
+{
+ int i;
+
+ for (i = 0; i < CONF_NTP_SERVERS_MAX; i++)
+ ic_ntp_servers[i] = NONE;
+}
+
/*
* DHCP/BOOTP support.
*/
@@ -671,6 +685,7 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d)
17, /* Boot path */
26, /* MTU */
40, /* NIS domain name */
+ 42, /* NTP servers */
};
*e++ = 55; /* Parameter request list */
@@ -721,9 +736,11 @@ static void __init ic_bootp_init_ext(u8 *e)
*e++ = 3; /* Default gateway request */
*e++ = 4;
e += 4;
- *e++ = 5; /* Name server request */
- *e++ = 8;
- e += 8;
+#if CONF_NAMESERVERS_MAX > 0
+ *e++ = 6; /* (DNS) name server request */
+ *e++ = 4 * CONF_NAMESERVERS_MAX;
+ e += 4 * CONF_NAMESERVERS_MAX;
+#endif
*e++ = 12; /* Host name request */
*e++ = 32;
e += 32;
@@ -748,7 +765,13 @@ static void __init ic_bootp_init_ext(u8 *e)
*/
static inline void __init ic_bootp_init(void)
{
+ /* Re-initialise all name servers and NTP servers to NONE, in case any
+ * were set via the "ip=" or "nfsaddrs=" kernel command line parameters:
+ * any IP addresses specified there will already have been decoded but
+ * are no longer needed
+ */
ic_nameservers_predef();
+ ic_ntp_servers_predef();
dev_add_pack(&bootp_packet_type);
}
@@ -912,6 +935,15 @@ static void __init ic_do_bootp_ext(u8 *ext)
ic_bootp_string(utsname()->domainname, ext+1, *ext,
__NEW_UTS_LEN);
break;
+ case 42: /* NTP servers */
+ servers = *ext / 4;
+ if (servers > CONF_NTP_SERVERS_MAX)
+ servers = CONF_NTP_SERVERS_MAX;
+ for (i = 0; i < servers; i++) {
+ if (ic_ntp_servers[i] == NONE)
+ memcpy(&ic_ntp_servers[i], ext+1+4*i, 4);
+ }
+ break;
}
}
@@ -1257,7 +1289,10 @@ static int __init ic_dynamic(void)
#endif /* IPCONFIG_DYNAMIC */
#ifdef CONFIG_PROC_FS
+/* proc_dir_entry for /proc/net/ipconfig */
+static struct proc_dir_entry *ipconfig_dir;
+/* Name servers: */
static int pnp_seq_show(struct seq_file *seq, void *v)
{
int i;
@@ -1282,6 +1317,62 @@ static int pnp_seq_show(struct seq_file *seq, void *v)
&ic_servaddr);
return 0;
}
+
+/* Create the /proc/net/ipconfig directory */
+static int __init ipconfig_proc_net_init(void)
+{
+ ipconfig_dir = proc_net_mkdir(&init_net, "ipconfig", init_net.proc_net);
+ if (!ipconfig_dir)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Create a new file under /proc/net/ipconfig */
+static int ipconfig_proc_net_create(const char *name,
+ const struct file_operations *fops)
+{
+ char *pname;
+ struct proc_dir_entry *p;
+
+ if (!ipconfig_dir)
+ return -ENOMEM;
+
+ pname = kasprintf(GFP_KERNEL, "%s%s", "ipconfig/", name);
+ if (!pname)
+ return -ENOMEM;
+
+ p = proc_create(pname, 0444, init_net.proc_net, fops);
+ kfree(pname);
+ if (!p)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Write NTP server IP addresses to /proc/net/ipconfig/ntp_servers */
+static int ntp_servers_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+
+ for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+ if (ic_ntp_servers[i] != NONE)
+ seq_printf(seq, "%pI4\n", &ic_ntp_servers[i]);
+ }
+ return 0;
+}
+
+static int ntp_servers_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ntp_servers_seq_show, NULL);
+}
+
+static const struct file_operations ntp_servers_seq_fops = {
+ .open = ntp_servers_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif /* CONFIG_PROC_FS */
/*
@@ -1356,8 +1447,20 @@ static int __init ip_auto_config(void)
int err;
unsigned int i;
+ /* Initialise all name servers and NTP servers to NONE (but only if the
+ * "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded,
+ * otherwise we'll overwrite the IP addresses specified there)
+ */
+ if (ic_set_manually == 0) {
+ ic_nameservers_predef();
+ ic_ntp_servers_predef();
+ }
+
#ifdef CONFIG_PROC_FS
proc_create_single("pnp", 0444, init_net.proc_net, pnp_seq_show);
+
+ if (ipconfig_proc_net_init() == 0)
+ ipconfig_proc_net_create("ntp_servers", &ntp_servers_seq_fops);
#endif /* CONFIG_PROC_FS */
if (!ic_enable)
@@ -1469,16 +1572,32 @@ static int __init ip_auto_config(void)
&ic_servaddr, &root_server_addr, root_server_path);
if (ic_dev_mtu)
pr_cont(", mtu=%d", ic_dev_mtu);
- for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+ /* Name servers (if any): */
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
if (ic_nameservers[i] != NONE) {
- pr_cont(" nameserver%u=%pI4",
- i, &ic_nameservers[i]);
- break;
+ if (i == 0)
+ pr_info(" nameserver%u=%pI4",
+ i, &ic_nameservers[i]);
+ else
+ pr_cont(", nameserver%u=%pI4",
+ i, &ic_nameservers[i]);
}
- for (i++; i < CONF_NAMESERVERS_MAX; i++)
- if (ic_nameservers[i] != NONE)
- pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]);
- pr_cont("\n");
+ if (i + 1 == CONF_NAMESERVERS_MAX)
+ pr_cont("\n");
+ }
+ /* NTP servers (if any): */
+ for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+ if (ic_ntp_servers[i] != NONE) {
+ if (i == 0)
+ pr_info(" ntpserver%u=%pI4",
+ i, &ic_ntp_servers[i]);
+ else
+ pr_cont(", ntpserver%u=%pI4",
+ i, &ic_ntp_servers[i]);
+ }
+ if (i + 1 == CONF_NTP_SERVERS_MAX)
+ pr_cont("\n");
+ }
#endif /* !SILENT */
/*
@@ -1576,7 +1695,9 @@ static int __init ip_auto_config_setup(char *addrs)
return 1;
}
+ /* Initialise all name servers and NTP servers to NONE */
ic_nameservers_predef();
+ ic_ntp_servers_predef();
/* Parse string for static IP assignment. */
ip = addrs;
@@ -1635,6 +1756,13 @@ static int __init ip_auto_config_setup(char *addrs)
ic_nameservers[1] = NONE;
}
break;
+ case 9:
+ if (CONF_NTP_SERVERS_MAX >= 1) {
+ ic_ntp_servers[0] = in_aton(ip);
+ if (ic_ntp_servers[0] == ANY)
+ ic_ntp_servers[0] = NONE;
+ }
+ break;
}
}
ip = cp;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 37c4f885ff7b..9f79b9803a16 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -201,7 +201,8 @@ static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
};
static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
- struct fib_rule_hdr *frh, struct nlattr **tb)
+ struct fib_rule_hdr *frh, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
return 0;
}
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 30221701614c..cafb0506c8c9 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -35,17 +35,19 @@ mr_table_alloc(struct net *net, u32 id,
struct net *net))
{
struct mr_table *mrt;
+ int err;
mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
if (!mrt)
- return NULL;
+ return ERR_PTR(-ENOMEM);
mrt->id = id;
write_pnet(&mrt->net, net);
mrt->ops = *ops;
- if (rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params)) {
+ err = rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params);
+ if (err) {
kfree(mrt);
- return NULL;
+ return ERR_PTR(err);
}
INIT_LIST_HEAD(&mrt->mfc_cache_list);
INIT_LIST_HEAD(&mrt->mfc_unres_queue);
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
new file mode 100644
index 000000000000..04311f7067e2
--- /dev/null
+++ b/net/ipv4/metrics.c
@@ -0,0 +1,55 @@
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/ip.h>
+#include <net/net_namespace.h>
+#include <net/tcp.h>
+
+int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
+ u32 *metrics)
+{
+ bool ecn_ca = false;
+ struct nlattr *nla;
+ int remaining;
+
+ if (!fc_mx)
+ return 0;
+
+ nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) {
+ int type = nla_type(nla);
+ u32 val;
+
+ if (!type)
+ continue;
+ if (type > RTAX_MAX)
+ return -EINVAL;
+
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
+ if (val == TCP_CA_UNSPEC)
+ return -EINVAL;
+ } else {
+ if (nla_len(nla) != sizeof(u32))
+ return -EINVAL;
+ val = nla_get_u32(nla);
+ }
+ if (type == RTAX_ADVMSS && val > 65535 - 40)
+ val = 65535 - 40;
+ if (type == RTAX_MTU && val > 65535 - 15)
+ val = 65535 - 15;
+ if (type == RTAX_HOPLIMIT && val > 255)
+ val = 255;
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+ return -EINVAL;
+ metrics[type - 1] = val;
+ }
+
+ if (ecn_ca)
+ metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_metrics_convert);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 280048e1e395..bbfc356cb1b5 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -29,7 +29,10 @@ config NF_SOCKET_IPV4
tristate "IPv4 socket lookup support"
help
This option enables the IPv4 socket lookup infrastructure. This is
- is required by the iptables socket match.
+ is required by the {ip,nf}tables socket match.
+
+config NF_TPROXY_IPV4
+ tristate "IPv4 tproxy support"
if NF_TABLES
@@ -129,10 +132,7 @@ config NFT_CHAIN_NAT_IPV4
source and destination ports.
config NF_NAT_MASQUERADE_IPV4
- tristate "IPv4 masquerade support"
- help
- This is the kernel functionality to provide NAT in the masquerade
- flavour (automatic source address selection).
+ bool
config NFT_MASQ_IPV4
tristate "IPv4 masquerading support for nf_tables"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 0e5edd0c7926..8394c17c269f 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -10,12 +10,14 @@ nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
+nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
# defrag
obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o
+obj-$(CONFIG_NF_TPROXY_IPV4) += nf_tproxy_ipv4.o
# logging
obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
@@ -32,9 +34,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
-obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
-
-
# NAT protocols (nf_nat)
obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index e85f35b89c49..38ab97b0a2ec 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -301,7 +301,7 @@ ipt_do_table(struct sk_buff *skb,
counter = xt_get_this_cpu_counter(&e->counters);
ADD_COUNTER(*counter, skb->len, 1);
- t = ipt_get_target(e);
+ t = ipt_get_target_c(e);
WARN_ON(!t->u.kernel.target);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
@@ -1783,6 +1783,8 @@ int ipt_register_table(struct net *net, const struct xt_table *table,
/* set res now, will see skbs right after nf_register_net_hooks */
WRITE_ONCE(*res, new_table);
+ if (!ops)
+ return 0;
ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
if (ret != 0) {
@@ -1800,7 +1802,8 @@ out_free:
void ipt_unregister_table(struct net *net, struct xt_table *table,
const struct nf_hook_ops *ops)
{
- nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ if (ops)
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
__ipt_unregister_table(net, table);
}
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index a03e4e7ef5f9..ce1512b02cb2 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -47,7 +47,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
static unsigned int
masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
const struct nf_nat_ipv4_multi_range_compat *mr;
mr = par->targinfo;
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 0f7255cc65ee..a317445448bf 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -33,75 +33,63 @@ static const struct xt_table nf_nat_ipv4_table = {
static unsigned int iptable_nat_do_chain(void *priv,
struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct)
-{
- return ipt_do_table(skb, state, state->net->ipv4.nat_table);
-}
-
-static unsigned int iptable_nat_ipv4_fn(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain);
-}
-
-static unsigned int iptable_nat_ipv4_in(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain);
-}
-
-static unsigned int iptable_nat_ipv4_out(void *priv,
- struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain);
-}
-
-static unsigned int iptable_nat_ipv4_local_fn(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain);
+ return ipt_do_table(skb, state, state->net->ipv4.nat_table);
}
static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
- /* Before packet filtering, change destination */
{
- .hook = iptable_nat_ipv4_in,
+ .hook = iptable_nat_do_chain,
.pf = NFPROTO_IPV4,
- .nat_hook = true,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
},
- /* After packet filtering, change source */
{
- .hook = iptable_nat_ipv4_out,
+ .hook = iptable_nat_do_chain,
.pf = NFPROTO_IPV4,
- .nat_hook = true,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
},
- /* Before packet filtering, change destination */
{
- .hook = iptable_nat_ipv4_local_fn,
+ .hook = iptable_nat_do_chain,
.pf = NFPROTO_IPV4,
- .nat_hook = true,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
},
- /* After packet filtering, change source */
{
- .hook = iptable_nat_ipv4_fn,
+ .hook = iptable_nat_do_chain,
.pf = NFPROTO_IPV4,
- .nat_hook = true,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
},
};
+static int ipt_nat_register_lookups(struct net *net)
+{
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) {
+ ret = nf_nat_l3proto_ipv4_register_fn(net, &nf_nat_ipv4_ops[i]);
+ if (ret) {
+ while (i)
+ nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[--i]);
+
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void ipt_nat_unregister_lookups(struct net *net)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++)
+ nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[i]);
+}
+
static int __net_init iptable_nat_table_init(struct net *net)
{
struct ipt_replace *repl;
@@ -114,7 +102,18 @@ static int __net_init iptable_nat_table_init(struct net *net)
if (repl == NULL)
return -ENOMEM;
ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
- nf_nat_ipv4_ops, &net->ipv4.nat_table);
+ NULL, &net->ipv4.nat_table);
+ if (ret < 0) {
+ kfree(repl);
+ return ret;
+ }
+
+ ret = ipt_nat_register_lookups(net);
+ if (ret < 0) {
+ ipt_unregister_table(net, net->ipv4.nat_table, NULL);
+ net->ipv4.nat_table = NULL;
+ }
+
kfree(repl);
return ret;
}
@@ -123,7 +122,8 @@ static void __net_exit iptable_nat_net_exit(struct net *net)
{
if (!net->ipv4.nat_table)
return;
- ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops);
+ ipt_nat_unregister_lookups(net);
+ ipt_unregister_table(net, net->ipv4.nat_table, NULL);
net->ipv4.nat_table = NULL;
}
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 0cd46bffa469..e1e56d7123d2 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -2,265 +2,12 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
-#include <linux/rhashtable.h>
-#include <linux/ip.h>
-#include <linux/netdevice.h>
-#include <net/ip.h>
-#include <net/neighbour.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables.h>
-/* For layer 4 checksum field offset. */
-#include <linux/tcp.h>
-#include <linux/udp.h>
-
-static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
-{
- struct tcphdr *tcph;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
- tcph = (void *)(skb_network_header(skb) + thoff);
- inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
-
- return 0;
-}
-
-static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
-{
- struct udphdr *udph;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
- udph = (void *)(skb_network_header(skb) + thoff);
- if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
- inet_proto_csum_replace4(&udph->check, skb, addr,
- new_addr, true);
- if (!udph->check)
- udph->check = CSUM_MANGLED_0;
- }
-
- return 0;
-}
-
-static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
- unsigned int thoff, __be32 addr,
- __be32 new_addr)
-{
- switch (iph->protocol) {
- case IPPROTO_TCP:
- if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
- break;
- case IPPROTO_UDP:
- if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
- break;
- }
-
- return 0;
-}
-
-static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
-{
- __be32 addr, new_addr;
-
- switch (dir) {
- case FLOW_OFFLOAD_DIR_ORIGINAL:
- addr = iph->saddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
- iph->saddr = new_addr;
- break;
- case FLOW_OFFLOAD_DIR_REPLY:
- addr = iph->daddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
- iph->daddr = new_addr;
- break;
- default:
- return -1;
- }
- csum_replace4(&iph->check, addr, new_addr);
-
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
-}
-
-static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
-{
- __be32 addr, new_addr;
-
- switch (dir) {
- case FLOW_OFFLOAD_DIR_ORIGINAL:
- addr = iph->daddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
- iph->daddr = new_addr;
- break;
- case FLOW_OFFLOAD_DIR_REPLY:
- addr = iph->saddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
- iph->saddr = new_addr;
- break;
- default:
- return -1;
- }
- csum_replace4(&iph->check, addr, new_addr);
-
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
-}
-
-static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- enum flow_offload_tuple_dir dir)
-{
- struct iphdr *iph = ip_hdr(skb);
- unsigned int thoff = iph->ihl * 4;
-
- if (flow->flags & FLOW_OFFLOAD_SNAT &&
- (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
- return -1;
- if (flow->flags & FLOW_OFFLOAD_DNAT &&
- (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
- return -1;
-
- return 0;
-}
-
-static bool ip_has_options(unsigned int thoff)
-{
- return thoff != sizeof(struct iphdr);
-}
-
-static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple)
-{
- struct flow_ports *ports;
- unsigned int thoff;
- struct iphdr *iph;
-
- if (!pskb_may_pull(skb, sizeof(*iph)))
- return -1;
-
- iph = ip_hdr(skb);
- thoff = iph->ihl * 4;
-
- if (ip_is_fragment(iph) ||
- unlikely(ip_has_options(thoff)))
- return -1;
-
- if (iph->protocol != IPPROTO_TCP &&
- iph->protocol != IPPROTO_UDP)
- return -1;
-
- thoff = iph->ihl * 4;
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
- return -1;
-
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
-
- tuple->src_v4.s_addr = iph->saddr;
- tuple->dst_v4.s_addr = iph->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
- tuple->l3proto = AF_INET;
- tuple->l4proto = iph->protocol;
- tuple->iifidx = dev->ifindex;
-
- return 0;
-}
-
-/* Based on ip_exceeds_mtu(). */
-static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
-{
- if (skb->len <= mtu)
- return false;
-
- if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
- return false;
-
- if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
- return false;
-
- return true;
-}
-
-static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt)
-{
- u32 mtu;
-
- mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
- if (__nf_flow_exceeds_mtu(skb, mtu))
- return true;
-
- return false;
-}
-
-unsigned int
-nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct flow_offload_tuple_rhash *tuplehash;
- struct nf_flowtable *flow_table = priv;
- struct flow_offload_tuple tuple = {};
- enum flow_offload_tuple_dir dir;
- struct flow_offload *flow;
- struct net_device *outdev;
- const struct rtable *rt;
- struct iphdr *iph;
- __be32 nexthop;
-
- if (skb->protocol != htons(ETH_P_IP))
- return NF_ACCEPT;
-
- if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
- return NF_ACCEPT;
-
- tuplehash = flow_offload_lookup(flow_table, &tuple);
- if (tuplehash == NULL)
- return NF_ACCEPT;
-
- outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
- if (!outdev)
- return NF_ACCEPT;
-
- dir = tuplehash->tuple.dir;
- flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-
- rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
- if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
- return NF_ACCEPT;
-
- if (skb_try_make_writable(skb, sizeof(*iph)))
- return NF_DROP;
-
- if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
- nf_flow_nat_ip(flow, skb, dir) < 0)
- return NF_DROP;
-
- flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
- iph = ip_hdr(skb);
- ip_decrease_ttl(iph);
-
- skb->dev = outdev;
- nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
- neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
-
- return NF_STOLEN;
-}
-EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
static struct nf_flowtable_type flowtable_ipv4 = {
.family = NFPROTO_IPV4,
- .params = &nf_flow_offload_rhash_params,
- .gc = nf_flow_offload_work_gc,
+ .init = nf_flow_table_init,
.free = nf_flow_table_free,
.hook = nf_flow_offload_ip_hook,
.owner = THIS_MODULE,
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index ac8342dcb55e..4e6b53ab6c33 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -395,7 +395,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
static void ip_nat_q931_expect(struct nf_conn *new,
struct nf_conntrack_expect *this)
{
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */
nf_nat_follow_master(new, this);
@@ -497,7 +497,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
static void ip_nat_callforwarding_expect(struct nf_conn *new,
struct nf_conntrack_expect *this)
{
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
/* This must be a fresh one. */
BUG_ON(new->status & IPS_NAT_DONE_MASK);
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index f7ff6a364d7b..6115bf1ff6f0 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -63,7 +63,7 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
#endif /* CONFIG_XFRM */
static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
- const struct nf_nat_range *range)
+ const struct nf_nat_range2 *range)
{
return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
@@ -143,7 +143,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
- struct nf_nat_range *range)
+ struct nf_nat_range2 *range)
{
if (tb[CTA_NAT_V4_MINIP]) {
range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
@@ -241,34 +241,18 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
-unsigned int
+static unsigned int
nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state,
- unsigned int (*do_chain)(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct))
+ const struct nf_hook_state *state)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
- struct nf_conn_nat *nat;
- /* maniptype == SRC for postrouting. */
- enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
ct = nf_ct_get(skb, &ctinfo);
- /* Can't track? It's not due to stress, or conntrack would
- * have dropped it. Hence it's the user's responsibilty to
- * packet filter it out, or implement conntrack/NAT for that
- * protocol. 8) --RR
- */
if (!ct)
return NF_ACCEPT;
- nat = nfct_nat(ct);
-
- switch (ctinfo) {
- case IP_CT_RELATED:
- case IP_CT_RELATED_REPLY:
+ if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
state->hook))
@@ -276,78 +260,30 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
else
return NF_ACCEPT;
}
- /* Only ICMPs can be IP_CT_IS_REPLY: */
- /* fall through */
- case IP_CT_NEW:
- /* Seen it before? This can happen for loopback, retrans,
- * or local packets.
- */
- if (!nf_nat_initialized(ct, maniptype)) {
- unsigned int ret;
-
- ret = do_chain(priv, skb, state, ct);
- if (ret != NF_ACCEPT)
- return ret;
-
- if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
- break;
-
- ret = nf_nat_alloc_null_binding(ct, state->hook);
- if (ret != NF_ACCEPT)
- return ret;
- } else {
- pr_debug("Already setup manip %s for ct %p\n",
- maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
- ct);
- if (nf_nat_oif_changed(state->hook, ctinfo, nat,
- state->out))
- goto oif_changed;
- }
- break;
-
- default:
- /* ESTABLISHED */
- WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
- ctinfo != IP_CT_ESTABLISHED_REPLY);
- if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
- goto oif_changed;
}
- return nf_nat_packet(ct, ctinfo, state->hook, skb);
-
-oif_changed:
- nf_ct_kill_acct(ct, ctinfo, skb);
- return NF_DROP;
+ return nf_nat_inet_fn(priv, skb, state);
}
EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
-unsigned int
+static unsigned int
nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state,
- unsigned int (*do_chain)(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct))
+ const struct nf_hook_state *state)
{
unsigned int ret;
__be32 daddr = ip_hdr(skb)->daddr;
- ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state);
if (ret != NF_DROP && ret != NF_STOLEN &&
daddr != ip_hdr(skb)->daddr)
skb_dst_drop(skb);
return ret;
}
-EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
-unsigned int
+static unsigned int
nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state,
- unsigned int (*do_chain)(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct))
+ const struct nf_hook_state *state)
{
#ifdef CONFIG_XFRM
const struct nf_conn *ct;
@@ -356,7 +292,7 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
#endif
unsigned int ret;
- ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state);
#ifdef CONFIG_XFRM
if (ret != NF_DROP && ret != NF_STOLEN &&
!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -376,22 +312,17 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
#endif
return ret;
}
-EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
-unsigned int
+static unsigned int
nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state,
- unsigned int (*do_chain)(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct))
+ const struct nf_hook_state *state)
{
const struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
unsigned int ret;
int err;
- ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state);
if (ret != NF_DROP && ret != NF_STOLEN &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -415,7 +346,49 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
}
return ret;
}
-EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn);
+
+static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
+ /* Before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv4_in,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = nf_nat_ipv4_out,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+ /* Before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv4_local_fn,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = nf_nat_ipv4_fn,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+};
+
+int nf_nat_l3proto_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+ return nf_nat_register_fn(net, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_register_fn);
+
+void nf_nat_l3proto_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+ nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn);
static int __init nf_nat_l3proto_ipv4_init(void)
{
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index 0c366aad89cb..ad3aeff152ed 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -7,7 +7,6 @@
*/
#include <linux/types.h>
-#include <linux/module.h>
#include <linux/atomic.h>
#include <linux/inetdevice.h>
#include <linux/ip.h>
@@ -24,13 +23,13 @@
unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
const struct net_device *out)
{
struct nf_conn *ct;
struct nf_conn_nat *nat;
enum ip_conntrack_info ctinfo;
- struct nf_nat_range newrange;
+ struct nf_nat_range2 newrange;
const struct rtable *rt;
__be32 newsrc, nh;
@@ -157,6 +156,3 @@ void nf_nat_masquerade_ipv4_unregister_notifier(void)
unregister_inetaddr_notifier(&masq_inet_notifier);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 8a69363b4884..5d259a12e25f 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -48,7 +48,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
struct nf_conntrack_tuple t = {};
const struct nf_ct_pptp_master *ct_pptp_info;
const struct nf_nat_pptp *nat_pptp_info;
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
struct nf_conn_nat *nat;
nat = nf_ct_nat_ext_add(ct);
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index edf05002d674..00fda6331ce5 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -41,7 +41,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
static void
gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 7b98baa13ede..6d7cf1d79baf 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -30,7 +30,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
static void
icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
new file mode 100644
index 000000000000..805e83ec3ad9
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <net/netfilter/nf_tproxy.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <linux/inetdevice.h>
+
+struct sock *
+nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
+ __be32 laddr, __be16 lport, struct sock *sk)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ inet_twsk_put(inet_twsk(sk));
+ return NULL;
+ }
+
+ if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+ /* SYN to a TIME_WAIT socket, we'd rather redirect it
+ * to a listener socket if there's one */
+ struct sock *sk2;
+
+ sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+ iph->saddr, laddr ? laddr : iph->daddr,
+ hp->source, lport ? lport : hp->dest,
+ skb->dev, NF_TPROXY_LOOKUP_LISTENER);
+ if (sk2) {
+ inet_twsk_deschedule_put(inet_twsk(sk));
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4);
+
+__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+ struct in_device *indev;
+ __be32 laddr;
+
+ if (user_laddr)
+ return user_laddr;
+
+ laddr = 0;
+ indev = __in_dev_get_rcu(skb->dev);
+ for_primary_ifa(indev) {
+ laddr = ifa->ifa_local;
+ break;
+ } endfor_ifa(indev);
+
+ return laddr ? laddr : daddr;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_laddr4);
+
+struct sock *
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+ const u8 protocol,
+ const __be32 saddr, const __be32 daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in,
+ const enum nf_tproxy_lookup_t lookup_type)
+{
+ struct sock *sk;
+ struct tcphdr *tcph;
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ switch (lookup_type) {
+ case NF_TPROXY_LOOKUP_LISTENER:
+ tcph = hp;
+ sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
+ ip_hdrlen(skb) +
+ __tcp_hdrlen(tcph),
+ saddr, sport,
+ daddr, dport,
+ in->ifindex, 0);
+
+ if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ break;
+ case NF_TPROXY_LOOKUP_ESTABLISHED:
+ sk = inet_lookup_established(net, &tcp_hashinfo,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case IPPROTO_UDP:
+ sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ if (sk) {
+ int connected = (sk->sk_state == TCP_ESTABLISHED);
+ int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED &&
+ (!connected || wildcard)) ||
+ (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
+ sock_put(sk);
+ sk = NULL;
+ }
+ }
+ break;
+ default:
+ WARN_ON(1);
+ sk = NULL;
+ }
+
+ pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
+ protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
+MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support");
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index b5464a3f253b..a3c4ea303e3e 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -27,9 +27,8 @@
#include <net/ip.h>
static unsigned int nft_nat_do_chain(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct)
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;
@@ -39,42 +38,14 @@ static unsigned int nft_nat_do_chain(void *priv,
return nft_do_chain(&pkt, priv);
}
-static unsigned int nft_nat_ipv4_fn(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_fn(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv4_in(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_in(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv4_out(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_out(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv4_local_fn(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain);
-}
-
-static int nft_nat_ipv4_init(struct nft_ctx *ctx)
+static int nft_nat_ipv4_reg(struct net *net, const struct nf_hook_ops *ops)
{
- return nf_ct_netns_get(ctx->net, ctx->family);
+ return nf_nat_l3proto_ipv4_register_fn(net, ops);
}
-static void nft_nat_ipv4_free(struct nft_ctx *ctx)
+static void nft_nat_ipv4_unreg(struct net *net, const struct nf_hook_ops *ops)
{
- nf_ct_netns_put(ctx->net, ctx->family);
+ nf_nat_l3proto_ipv4_unregister_fn(net, ops);
}
static const struct nft_chain_type nft_chain_nat_ipv4 = {
@@ -87,13 +58,13 @@ static const struct nft_chain_type nft_chain_nat_ipv4 = {
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_LOCAL_IN),
.hooks = {
- [NF_INET_PRE_ROUTING] = nft_nat_ipv4_in,
- [NF_INET_POST_ROUTING] = nft_nat_ipv4_out,
- [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn,
- [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn,
+ [NF_INET_PRE_ROUTING] = nft_nat_do_chain,
+ [NF_INET_POST_ROUTING] = nft_nat_do_chain,
+ [NF_INET_LOCAL_OUT] = nft_nat_do_chain,
+ [NF_INET_LOCAL_IN] = nft_nat_do_chain,
},
- .init = nft_nat_ipv4_init,
- .free = nft_nat_ipv4_free,
+ .ops_register = nft_nat_ipv4_reg,
+ .ops_unregister = nft_nat_ipv4_unreg,
};
static int __init nft_chain_nat_init(void)
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index f18677277119..f1193e1e928a 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -21,7 +21,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
struct nft_masq *priv = nft_expr_priv(expr);
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c
new file mode 100644
index 000000000000..f86bb4f06609
--- /dev/null
+++ b/net/ipv4/netlink.c
@@ -0,0 +1,23 @@
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/ip.h>
+
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
+ struct netlink_ext_ack *extack)
+{
+ *ip_proto = nla_get_u8(attr);
+
+ switch (*ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_ICMP:
+ return 0;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported ip proto");
+ return -EOPNOTSUPP;
+ }
+}
+EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 573e43c8ed87..77350c1256ce 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -284,6 +284,9 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
+ SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
+ SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
+ SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 75fb8864be67..bf4e4adc2d00 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1341,6 +1341,37 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
return NULL;
}
+/* MTU selection:
+ * 1. mtu on route is locked - use it
+ * 2. mtu from nexthop exception
+ * 3. mtu from egress device
+ */
+
+u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
+{
+ struct fib_info *fi = res->fi;
+ struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
+ struct net_device *dev = nh->nh_dev;
+ u32 mtu = 0;
+
+ if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
+ fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
+ mtu = fi->fib_mtu;
+
+ if (likely(!mtu)) {
+ struct fib_nh_exception *fnhe;
+
+ fnhe = find_exception(nh, daddr);
+ if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
+ mtu = fnhe->fnhe_pmtu;
+ }
+
+ if (likely(!mtu))
+ mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
+
+ return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
+}
+
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
__be32 daddr, const bool do_cache)
{
@@ -2563,11 +2594,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
EXPORT_SYMBOL_GPL(ip_route_output_flow);
/* called with rcu_read_lock held */
-static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
- struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
- u32 seq)
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+ struct rtable *rt, u32 table_id, struct flowi4 *fl4,
+ struct sk_buff *skb, u32 portid, u32 seq)
{
- struct rtable *rt = skb_rtable(skb);
struct rtmsg *r;
struct nlmsghdr *nlh;
unsigned long expires = 0;
@@ -2663,7 +2693,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
}
} else
#endif
- if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
+ if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
goto nla_put_failure;
}
@@ -2678,43 +2708,93 @@ nla_put_failure:
return -EMSGSIZE;
}
+static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
+ u8 ip_proto, __be16 sport,
+ __be16 dport)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ /* Reserve room for dummy headers, this skb can pass
+ * through good chunk of routing engine.
+ */
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_IP);
+ iph = skb_put(skb, sizeof(struct iphdr));
+ iph->protocol = ip_proto;
+ iph->saddr = src;
+ iph->daddr = dst;
+ iph->version = 0x4;
+ iph->frag_off = 0;
+ iph->ihl = 0x5;
+ skb_set_transport_header(skb, skb->len);
+
+ switch (iph->protocol) {
+ case IPPROTO_UDP: {
+ struct udphdr *udph;
+
+ udph = skb_put_zero(skb, sizeof(struct udphdr));
+ udph->source = sport;
+ udph->dest = dport;
+ udph->len = sizeof(struct udphdr);
+ udph->check = 0;
+ break;
+ }
+ case IPPROTO_TCP: {
+ struct tcphdr *tcph;
+
+ tcph = skb_put_zero(skb, sizeof(struct tcphdr));
+ tcph->source = sport;
+ tcph->dest = dport;
+ tcph->doff = sizeof(struct tcphdr) / 4;
+ tcph->rst = 1;
+ tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
+ src, dst, 0);
+ break;
+ }
+ case IPPROTO_ICMP: {
+ struct icmphdr *icmph;
+
+ icmph = skb_put_zero(skb, sizeof(struct icmphdr));
+ icmph->type = ICMP_ECHO;
+ icmph->code = 0;
+ }
+ }
+
+ return skb;
+}
+
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(in_skb->sk);
- struct rtmsg *rtm;
struct nlattr *tb[RTA_MAX+1];
+ u32 table_id = RT_TABLE_MAIN;
+ __be16 sport = 0, dport = 0;
struct fib_result res = {};
+ u8 ip_proto = IPPROTO_UDP;
struct rtable *rt = NULL;
+ struct sk_buff *skb;
+ struct rtmsg *rtm;
struct flowi4 fl4;
__be32 dst = 0;
__be32 src = 0;
+ kuid_t uid;
u32 iif;
int err;
int mark;
- struct sk_buff *skb;
- u32 table_id = RT_TABLE_MAIN;
- kuid_t uid;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
extack);
if (err < 0)
- goto errout;
+ return err;
rtm = nlmsg_data(nlh);
-
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb) {
- err = -ENOBUFS;
- goto errout;
- }
-
- /* Reserve room for dummy headers, this skb can pass
- through good chunk of routing engine.
- */
- skb_reset_mac_header(skb);
- skb_reset_network_header(skb);
-
src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
@@ -2724,14 +2804,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
else
uid = (iif ? INVALID_UID : current_uid());
- /* Bugfix: need to give ip_route_input enough of an IP header to
- * not gag.
- */
- ip_hdr(skb)->protocol = IPPROTO_UDP;
- ip_hdr(skb)->saddr = src;
- ip_hdr(skb)->daddr = dst;
+ if (tb[RTA_IP_PROTO]) {
+ err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+ &ip_proto, extack);
+ if (err)
+ return err;
+ }
- skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+ if (tb[RTA_SPORT])
+ sport = nla_get_be16(tb[RTA_SPORT]);
+
+ if (tb[RTA_DPORT])
+ dport = nla_get_be16(tb[RTA_DPORT]);
+
+ skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
+ if (!skb)
+ return -ENOBUFS;
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
@@ -2740,6 +2828,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
+ if (sport)
+ fl4.fl4_sport = sport;
+ if (dport)
+ fl4.fl4_dport = dport;
+ fl4.flowi4_proto = ip_proto;
rcu_read_lock();
@@ -2749,10 +2842,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
dev = dev_get_by_index_rcu(net, iif);
if (!dev) {
err = -ENODEV;
- goto errout_free;
+ goto errout_rcu;
}
- skb->protocol = htons(ETH_P_IP);
+ fl4.flowi4_iif = iif; /* for rt_fill_info */
skb->dev = dev;
skb->mark = mark;
err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
@@ -2772,7 +2865,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
}
if (err)
- goto errout_free;
+ goto errout_rcu;
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
@@ -2780,34 +2873,40 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
table_id = res.table ? res.table->tb_id : 0;
+ /* reset skb for netlink reply msg */
+ skb_trim(skb, 0);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_reset_mac_header(skb);
+
if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
if (!res.fi) {
err = fib_props[res.type].error;
if (!err)
err = -EHOSTUNREACH;
- goto errout_free;
+ goto errout_rcu;
}
err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
rt->rt_type, res.prefix, res.prefixlen,
fl4.flowi4_tos, res.fi, 0);
} else {
- err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
+ err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
}
if (err < 0)
- goto errout_free;
+ goto errout_rcu;
rcu_read_unlock();
err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
-errout:
- return err;
errout_free:
+ return err;
+errout_rcu:
rcu_read_unlock();
kfree_skb(skb);
- goto errout;
+ goto errout_free;
}
void ip_rt_multicast_event(struct in_device *in_dev)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4b195bac8ac0..d06247ba08b2 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -30,6 +30,7 @@
static int zero;
static int one = 1;
+static int two = 2;
static int four = 4;
static int thousand = 1000;
static int gso_max_segs = GSO_MAX_SEGS;
@@ -46,6 +47,7 @@ static int tcp_syn_retries_min = 1;
static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+static int comp_sack_nr_max = 255;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -844,7 +846,9 @@ static struct ctl_table ipv4_net_table[] = {
.data = &init_net.ipv4.sysctl_tcp_tw_reuse,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
},
{
.procname = "tcp_max_tw_buckets",
@@ -1152,6 +1156,22 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = &one,
},
{
+ .procname = "tcp_comp_sack_delay_ns",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "tcp_comp_sack_nr",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &comp_sack_nr_max,
+ },
+ {
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
.maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dec47e6789e7..2741953adaba 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1691,6 +1691,139 @@ int tcp_peek_len(struct socket *sock)
}
EXPORT_SYMBOL(tcp_peek_len);
+/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
+int tcp_set_rcvlowat(struct sock *sk, int val)
+{
+ sk->sk_rcvlowat = val ? : 1;
+
+ /* Check if we need to signal EPOLLIN right now */
+ tcp_data_ready(sk);
+
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
+ return 0;
+
+ /* val comes from user space and might be close to INT_MAX */
+ val <<= 1;
+ if (val < 0)
+ val = INT_MAX;
+
+ val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+ if (val > sk->sk_rcvbuf) {
+ sk->sk_rcvbuf = val;
+ tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(tcp_set_rcvlowat);
+
+#ifdef CONFIG_MMU
+static const struct vm_operations_struct tcp_vm_ops = {
+};
+
+int tcp_mmap(struct file *file, struct socket *sock,
+ struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+ return -EPERM;
+ vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+
+ /* Instruct vm_insert_page() to not down_read(mmap_sem) */
+ vma->vm_flags |= VM_MIXEDMAP;
+
+ vma->vm_ops = &tcp_vm_ops;
+ return 0;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
+static int tcp_zerocopy_receive(struct sock *sk,
+ struct tcp_zerocopy_receive *zc)
+{
+ unsigned long address = (unsigned long)zc->address;
+ const skb_frag_t *frags = NULL;
+ u32 length = 0, seq, offset;
+ struct vm_area_struct *vma;
+ struct sk_buff *skb = NULL;
+ struct tcp_sock *tp;
+ int ret;
+
+ if (address & (PAGE_SIZE - 1) || address != zc->address)
+ return -EINVAL;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+
+ sock_rps_record_flow(sk);
+
+ down_read(&current->mm->mmap_sem);
+
+ ret = -EINVAL;
+ vma = find_vma(current->mm, address);
+ if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
+ goto out;
+ zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
+
+ tp = tcp_sk(sk);
+ seq = tp->copied_seq;
+ zc->length = min_t(u32, zc->length, tcp_inq(sk));
+ zc->length &= ~(PAGE_SIZE - 1);
+
+ zap_page_range(vma, address, zc->length);
+
+ zc->recv_skip_hint = 0;
+ ret = 0;
+ while (length + PAGE_SIZE <= zc->length) {
+ if (zc->recv_skip_hint < PAGE_SIZE) {
+ if (skb) {
+ skb = skb->next;
+ offset = seq - TCP_SKB_CB(skb)->seq;
+ } else {
+ skb = tcp_recv_skb(sk, seq, &offset);
+ }
+
+ zc->recv_skip_hint = skb->len - offset;
+ offset -= skb_headlen(skb);
+ if ((int)offset < 0 || skb_has_frag_list(skb))
+ break;
+ frags = skb_shinfo(skb)->frags;
+ while (offset) {
+ if (frags->size > offset)
+ goto out;
+ offset -= frags->size;
+ frags++;
+ }
+ }
+ if (frags->size != PAGE_SIZE || frags->page_offset)
+ break;
+ ret = vm_insert_page(vma, address + length,
+ skb_frag_page(frags));
+ if (ret)
+ break;
+ length += PAGE_SIZE;
+ seq += PAGE_SIZE;
+ zc->recv_skip_hint -= PAGE_SIZE;
+ frags++;
+ }
+out:
+ up_read(&current->mm->mmap_sem);
+ if (length) {
+ tp->copied_seq = seq;
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ tcp_recv_skb(sk, seq, &offset);
+ tcp_cleanup_rbuf(sk, length);
+ ret = 0;
+ if (length == zc->length)
+ zc->recv_skip_hint = 0;
+ } else {
+ if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
+ ret = -EIO;
+ }
+ zc->length = length;
+ return ret;
+}
+#endif
+
static void tcp_update_recv_tstamps(struct sk_buff *skb,
struct scm_timestamping *tss)
{
@@ -1746,6 +1879,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
}
}
+static int tcp_inq_hint(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 copied_seq = READ_ONCE(tp->copied_seq);
+ u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
+ int inq;
+
+ inq = rcv_nxt - copied_seq;
+ if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
+ lock_sock(sk);
+ inq = tp->rcv_nxt - tp->copied_seq;
+ release_sock(sk);
+ }
+ return inq;
+}
+
/*
* This routine copies from a sock struct into the user buffer.
*
@@ -1762,13 +1911,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
u32 peek_seq;
u32 *seq;
unsigned long used;
- int err;
+ int err, inq;
int target; /* Read at least this many bytes */
long timeo;
struct sk_buff *skb, *last;
u32 urg_hole = 0;
struct scm_timestamping tss;
bool has_tss = false;
+ bool has_cmsg;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
@@ -1783,6 +1933,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
if (sk->sk_state == TCP_LISTEN)
goto out;
+ has_cmsg = tp->recvmsg_inq;
timeo = sock_rcvtimeo(sk, nonblock);
/* Urgent data needs to be handled specially. */
@@ -1969,6 +2120,7 @@ skip_copy:
if (TCP_SKB_CB(skb)->has_rxtstamp) {
tcp_update_recv_tstamps(skb, &tss);
has_tss = true;
+ has_cmsg = true;
}
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
@@ -1988,13 +2140,20 @@ skip_copy:
* on connected socket. I was just happy when found this 8) --ANK
*/
- if (has_tss)
- tcp_recv_timestamp(msg, sk, &tss);
-
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
release_sock(sk);
+
+ if (has_cmsg) {
+ if (has_tss)
+ tcp_recv_timestamp(msg, sk, &tss);
+ if (tp->recvmsg_inq) {
+ inq = tcp_inq_hint(sk);
+ put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+ }
+ }
+
return copied;
out:
@@ -2411,6 +2570,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_cnt = 0;
tp->window_clamp = 0;
+ tp->delivered_ce = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
tcp_clear_retrans(tp);
@@ -2424,6 +2584,7 @@ int tcp_disconnect(struct sock *sk, int flags)
dst_release(sk->sk_rx_dst);
sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp);
+ tp->compressed_ack = 0;
/* Clean up fastopen related fields */
tcp_free_fastopen_req(tp);
@@ -2862,6 +3023,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
tp->notsent_lowat = val;
sk->sk_write_space(sk);
break;
+ case TCP_INQ:
+ if (val > 1 || val < 0)
+ err = -EINVAL;
+ else
+ tp->recvmsg_inq = val;
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -3020,6 +3187,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
rate64 = tcp_compute_delivery_rate(tp);
if (rate64)
info->tcpi_delivery_rate = rate64;
+ info->tcpi_delivered = tp->delivered;
+ info->tcpi_delivered_ce = tp->delivered_ce;
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3033,7 +3202,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
u32 rate;
stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
- 5 * nla_total_size(sizeof(u32)) +
+ 7 * nla_total_size(sizeof(u32)) +
3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
if (!stats)
return NULL;
@@ -3064,9 +3233,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
+ nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
+ nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
+
return stats;
}
@@ -3282,6 +3454,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_NOTSENT_LOWAT:
val = tp->notsent_lowat;
break;
+ case TCP_INQ:
+ val = tp->recvmsg_inq;
+ break;
case TCP_SAVE_SYN:
val = tp->save_syn;
break;
@@ -3318,6 +3493,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
}
return 0;
}
+#ifdef CONFIG_MMU
+ case TCP_ZEROCOPY_RECEIVE: {
+ struct tcp_zerocopy_receive zc;
+ int err;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len != sizeof(zc))
+ return -EINVAL;
+ if (copy_from_user(&zc, optval, len))
+ return -EFAULT;
+ lock_sock(sk);
+ err = tcp_zerocopy_receive(sk, &zc);
+ release_sock(sk);
+ if (!err && copy_to_user(optval, &zc, len))
+ err = -EFAULT;
+ return err;
+ }
+#endif
default:
return -ENOPROTOOPT;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e51c644484dc..355d3dffd021 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -111,6 +111,25 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define REXMIT_LOST 1 /* retransmit packets marked lost */
#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);
+
+void clean_acked_data_enable(struct inet_connection_sock *icsk,
+ void (*cad)(struct sock *sk, u32 ack_seq))
+{
+ icsk->icsk_clean_acked = cad;
+ static_branch_inc(&clean_acked_data_enabled);
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_enable);
+
+void clean_acked_data_disable(struct inet_connection_sock *icsk)
+{
+ static_branch_dec(&clean_acked_data_enabled);
+ icsk->icsk_clean_acked = NULL;
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_disable);
+#endif
+
static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
unsigned int len)
{
@@ -184,21 +203,23 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
}
}
-static void tcp_incr_quickack(struct sock *sk)
+static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks == 0)
quickacks = 2;
+ quickacks = min(quickacks, max_quickacks);
if (quickacks > icsk->icsk_ack.quick)
- icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+ icsk->icsk_ack.quick = quickacks;
}
-static void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- tcp_incr_quickack(sk);
+
+ tcp_incr_quickack(sk, max_quickacks);
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
@@ -233,8 +254,10 @@ static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
-static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
case INET_ECN_NOT_ECT:
/* Funny extension: if ECT is not set on a segment,
@@ -242,31 +265,31 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
* it is probably a retransmit.
*/
if (tp->ecn_flags & TCP_ECN_SEEN)
- tcp_enter_quickack_mode((struct sock *)tp);
+ tcp_enter_quickack_mode(sk, 1);
break;
case INET_ECN_CE:
- if (tcp_ca_needs_ecn((struct sock *)tp))
- tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+ if (tcp_ca_needs_ecn(sk))
+ tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
/* Better not delay acks, sender can have a very low cwnd */
- tcp_enter_quickack_mode((struct sock *)tp);
+ tcp_enter_quickack_mode(sk, 1);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
- if (tcp_ca_needs_ecn((struct sock *)tp))
- tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
+ if (tcp_ca_needs_ecn(sk))
+ tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
tp->ecn_flags |= TCP_ECN_SEEN;
break;
}
}
-static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
- if (tp->ecn_flags & TCP_ECN_OK)
- __tcp_ecn_check_ce(tp, skb);
+ if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
+ __tcp_ecn_check_ce(sk, skb);
}
static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
@@ -582,6 +605,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
u32 copied;
int time;
+ trace_tcp_rcv_space_adjust(sk);
+
tcp_mstamp_refresh(tp);
time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
@@ -665,7 +690,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
/* The _first_ data packet received, initialize
* delayed ACK engine.
*/
- tcp_incr_quickack(sk);
+ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
icsk->icsk_ack.ato = TCP_ATO_MIN;
} else {
int m = now - icsk->icsk_ack.lrcvtime;
@@ -681,13 +706,13 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
/* Too long gap. Apparently sender failed to
* restart window, so that we send ACKs quickly.
*/
- tcp_incr_quickack(sk);
+ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
sk_mem_reclaim(sk);
}
}
icsk->icsk_ack.lrcvtime = now;
- tcp_ecn_check_ce(tp, skb);
+ tcp_ecn_check_ce(sk, skb);
if (skb->len >= 128)
tcp_grow_window(sk, skb);
@@ -1896,19 +1921,54 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
tp->undo_retrans = tp->retrans_out ? : -1;
}
-/* Enter Loss state. If we detect SACK reneging, forget all SACK information
+static bool tcp_is_rack(const struct sock *sk)
+{
+ return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
+}
+
+/* If we detect SACK reneging, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
*/
+static void tcp_timeout_mark_lost(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb, *head;
+ bool is_reneg; /* is receiver reneging on SACKs? */
+
+ head = tcp_rtx_queue_head(sk);
+ is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
+ if (is_reneg) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+ tp->sacked_out = 0;
+ /* Mark SACK reneging until we recover from this loss event. */
+ tp->is_sack_reneg = 1;
+ } else if (tcp_is_reno(tp)) {
+ tcp_reset_reno_sack(tp);
+ }
+
+ skb = head;
+ skb_rbtree_walk_from(skb) {
+ if (is_reneg)
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+ else if (tcp_is_rack(sk) && skb != head &&
+ tcp_rack_skb_timeout(tp, skb, 0) > 0)
+ continue; /* Don't mark recently sent ones lost yet */
+ tcp_mark_skb_lost(sk, skb);
+ }
+ tcp_verify_left_out(tp);
+ tcp_clear_all_retrans_hints(tp);
+}
+
+/* Enter Loss state. */
void tcp_enter_loss(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
- struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
- bool is_reneg; /* is receiver reneging on SACKs? */
- bool mark_lost;
+
+ tcp_timeout_mark_lost(sk);
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1920,40 +1980,10 @@ void tcp_enter_loss(struct sock *sk)
tcp_ca_event(sk, CA_EVENT_LOSS);
tcp_init_undo(tp);
}
- tp->snd_cwnd = 1;
+ tp->snd_cwnd = tcp_packets_in_flight(tp) + 1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_jiffies32;
- tp->retrans_out = 0;
- tp->lost_out = 0;
-
- if (tcp_is_reno(tp))
- tcp_reset_reno_sack(tp);
-
- skb = tcp_rtx_queue_head(sk);
- is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
- if (is_reneg) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
- tp->sacked_out = 0;
- /* Mark SACK reneging until we recover from this loss event. */
- tp->is_sack_reneg = 1;
- }
- tcp_clear_all_retrans_hints(tp);
-
- skb_rbtree_walk_from(skb) {
- mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
- is_reneg);
- if (mark_lost)
- tcp_sum_lost(tp, skb);
- TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
- if (mark_lost) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
- }
- }
- tcp_verify_left_out(tp);
-
/* Timeout in disordered state after receiving substantial DUPACKs
* suggests that the degree of reordering is over-estimated.
*/
@@ -2120,7 +2150,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
return true;
/* Not-A-Trick#2 : Classic rule... */
- if (tcp_dupack_heuristics(tp) > tp->reordering)
+ if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
return true;
return false;
@@ -2197,9 +2227,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_is_reno(tp)) {
- tcp_mark_head_lost(sk, 1, 1);
- } else {
+ if (tcp_is_sack(tp)) {
int sacked_upto = tp->sacked_out - tp->reordering;
if (sacked_upto >= 0)
tcp_mark_head_lost(sk, sacked_upto, 0);
@@ -2697,12 +2725,16 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
return false;
}
-static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
+static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- /* Use RACK to detect loss */
- if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+ if (tcp_rtx_queue_empty(sk))
+ return;
+
+ if (unlikely(tcp_is_reno(tp))) {
+ tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
+ } else if (tcp_is_rack(sk)) {
u32 prior_retrans = tp->retrans_out;
tcp_rack_mark_lost(sk);
@@ -2798,11 +2830,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tcp_try_keep_open(sk);
return;
}
- tcp_rack_identify_loss(sk, ack_flag);
+ tcp_identify_packet_loss(sk, ack_flag);
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack, rexmit);
- tcp_rack_identify_loss(sk, ack_flag);
+ tcp_identify_packet_loss(sk, ack_flag);
if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
return;
@@ -2819,7 +2851,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
- tcp_rack_identify_loss(sk, ack_flag);
+ tcp_identify_packet_loss(sk, ack_flag);
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
@@ -2841,7 +2873,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
fast_rexmit = 1;
}
- if (do_lost)
+ if (!tcp_is_rack(sk) && do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
*rexmit = REXMIT_LOST;
}
@@ -3496,6 +3528,22 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
tcp_xmit_retransmit_queue(sk);
}
+/* Returns the number of packets newly acked or sacked by the current ACK */
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
+{
+ const struct net *net = sock_net(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delivered;
+
+ delivered = tp->delivered - prior_delivered;
+ NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
+ if (flag & FLAG_ECE) {
+ tp->delivered_ce += delivered;
+ NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
+ }
+ return delivered;
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -3542,6 +3590,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
icsk->icsk_retransmits = 0;
+
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+ if (static_branch_unlikely(&clean_acked_data_enabled))
+ if (icsk->icsk_clean_acked)
+ icsk->icsk_clean_acked(sk, ack);
+#endif
}
prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
@@ -3619,7 +3673,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
- delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
+ delivered = tcp_newly_delivered(sk, delivered, flag);
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -3629,9 +3683,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
- if (flag & FLAG_DSACKING_ACK)
+ if (flag & FLAG_DSACKING_ACK) {
tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
&rexmit);
+ tcp_newly_delivered(sk, delivered, flag);
+ }
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3655,6 +3711,7 @@ old_ack:
&sack_state);
tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
&rexmit);
+ tcp_newly_delivered(sk, delivered, flag);
tcp_xmit_recovery(sk, rexmit);
}
@@ -4126,7 +4183,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4196,6 +4253,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
* If the sack array is full, forget about the last one.
*/
if (this_sack >= TCP_NUM_SACKS) {
+ if (tp->compressed_ack)
+ tcp_send_ack(sk);
this_sack--;
tp->rx_opt.num_sacks--;
sp--;
@@ -4377,7 +4436,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
u32 seq, end_seq;
bool fragstolen;
- tcp_ecn_check_ce(tp, skb);
+ tcp_ecn_check_ce(sk, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -4573,6 +4632,17 @@ err:
}
+void tcp_data_ready(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int avail = tp->rcv_nxt - tp->copied_seq;
+
+ if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE))
+ return;
+
+ sk->sk_data_ready(sk);
+}
+
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4630,7 +4700,7 @@ queue_and_out:
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk);
+ tcp_data_ready(sk);
return;
}
@@ -4640,7 +4710,7 @@ queue_and_out:
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
drop:
tcp_drop(sk, skb);
@@ -4651,8 +4721,6 @@ drop:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
goto out_of_window;
- tcp_enter_quickack_mode(sk);
-
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
@@ -5019,23 +5087,48 @@ static inline void tcp_data_snd_check(struct sock *sk)
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long rtt, delay;
/* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
/* ... and right edge of window advances far enough.
- * (tcp_recvmsg() will send ACK otherwise). Or...
+ * (tcp_recvmsg() will send ACK otherwise).
+ * If application uses SO_RCVLOWAT, we want send ack now if
+ * we have not received enough bytes to satisfy the condition.
*/
- __tcp_select_window(sk) >= tp->rcv_wnd) ||
+ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
+ __tcp_select_window(sk) >= tp->rcv_wnd)) ||
/* We ACK each frame or... */
- tcp_in_quickack_mode(sk) ||
- /* We have out of order data. */
- (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
- /* Then ack it now */
+ tcp_in_quickack_mode(sk)) {
+send_now:
tcp_send_ack(sk);
- } else {
- /* Else, send delayed ack. */
+ return;
+ }
+
+ if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_send_delayed_ack(sk);
+ return;
}
+
+ if (!tcp_is_sack(tp) ||
+ tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
+ goto send_now;
+ tp->compressed_ack++;
+
+ if (hrtimer_is_queued(&tp->compressed_ack_timer))
+ return;
+
+ /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
+
+ rtt = tp->rcv_rtt_est.rtt_us;
+ if (tp->srtt_us && tp->srtt_us < rtt)
+ rtt = tp->srtt_us;
+
+ delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
+ rtt * (NSEC_PER_USEC >> 3)/20);
+ sock_hold(sk);
+ hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
+ HRTIMER_MODE_REL_PINNED_SOFT);
}
static inline void tcp_ack_snd_check(struct sock *sk)
@@ -5299,11 +5392,11 @@ discard:
* the rest is checked inline. Fast processing is turned on in
* tcp_data_queue when everything is OK.
*/
-void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th)
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
- unsigned int len = skb->len;
+ const struct tcphdr *th = (const struct tcphdr *)skb->data;
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int len = skb->len;
/* TCP congestion window tracking */
trace_tcp_probe(sk, skb);
@@ -5428,7 +5521,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
no_ack:
if (eaten)
kfree_skb_partial(skb, fragstolen);
- sk->sk_data_ready(sk);
+ tcp_data_ready(sk);
return;
}
}
@@ -5550,9 +5643,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
return true;
}
tp->syn_data_acked = tp->syn_data;
- if (tp->syn_data_acked)
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVE);
+ if (tp->syn_data_acked) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ /* SYN-data is counted as two separate packets in tcp_ack() */
+ if (tp->delivered > 1)
+ --tp->delivered;
+ }
tcp_fastopen_add_skb(sk, synack);
@@ -5698,7 +5794,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* to stand against the temptation 8) --ANK
*/
inet_csk_schedule_ack(sk);
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
@@ -5884,6 +5980,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
}
switch (sk->sk_state) {
case TCP_SYN_RECV:
+ tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2c970626b398..fed3f1c66167 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -110,8 +110,38 @@ static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
+ const struct inet_timewait_sock *tw = inet_twsk(sktw);
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
struct tcp_sock *tp = tcp_sk(sk);
+ int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
+
+ if (reuse == 2) {
+ /* Still does not detect *everything* that goes through
+ * lo, since we require a loopback src or dst address
+ * or direct binding to 'lo' interface.
+ */
+ bool loopback = false;
+ if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
+ loopback = true;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tw->tw_family == AF_INET6) {
+ if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
+ (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
+ (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
+ ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
+ (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
+ (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
+ loopback = true;
+ } else
+#endif
+ {
+ if (ipv4_is_loopback(tw->tw_daddr) ||
+ ipv4_is_loopback(tw->tw_rcv_saddr))
+ loopback = true;
+ }
+ if (!loopback)
+ reuse = 0;
+ }
/* With PAWS, it is safe from the viewpoint
of data integrity. Even without PAWS it is safe provided sequence
@@ -125,8 +155,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
and use initial timestamp retrieved from peer table.
*/
if (tcptw->tw_ts_recent_stamp &&
- (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
- get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
+ (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
if (tp->write_seq == 0)
tp->write_seq = 1;
@@ -621,6 +650,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
struct sock *sk1 = NULL;
#endif
struct net *net;
+ struct sock *ctl_sk;
/* Never send a reset in response to a reset. */
if (th->rst)
@@ -723,11 +753,16 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+ ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+ if (sk)
+ ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
+ ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
local_bh_enable();
@@ -759,6 +794,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
} rep;
struct net *net = sock_net(sk);
struct ip_reply_arg arg;
+ struct sock *ctl_sk;
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -809,11 +845,16 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+ ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+ if (sk)
+ ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
+ ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
local_bh_enable();
}
@@ -1474,7 +1515,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sk->sk_rx_dst = NULL;
}
}
- tcp_rcv_established(sk, skb, tcp_hdr(skb));
+ tcp_rcv_established(sk, skb);
return 0;
}
@@ -2481,7 +2522,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_orphan_retries = 0;
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
- net->ipv4.sysctl_tcp_tw_reuse = 0;
+ net->ipv4.sysctl_tcp_tw_reuse = 2;
cnt = tcp_hashinfo.ehash_mask + 1;
net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
@@ -2524,6 +2565,8 @@ static int __net_init tcp_sk_init(struct net *net)
init_net.ipv4.sysctl_tcp_wmem,
sizeof(init_net.ipv4.sysctl_tcp_wmem));
}
+ net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
+ net->ipv4.sysctl_tcp_comp_sack_nr = 44;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 57b5468b5139..1dda1341a223 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -263,6 +263,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
struct inet_sock *inet = inet_sk(sk);
tw->tw_transparent = inet->transparent;
+ tw->tw_mark = sk->sk_mark;
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
@@ -306,7 +307,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
if (timeo < rto)
timeo = rto;
- tw->tw_timeout = TCP_TIMEWAIT_LEN;
if (state == TCP_TIME_WAIT)
timeo = TCP_TIMEWAIT_LEN;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d07e34f8e309..8e08b409c71e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -162,6 +162,15 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
/* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (unlikely(tp->compressed_ack)) {
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+ tp->compressed_ack);
+ tp->compressed_ack = 0;
+ if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
+ __sock_put(sk);
+ }
tcp_dec_quickack_mode(sk, pkts);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
@@ -229,11 +238,9 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
}
}
- if (mss > (1 << *rcv_wscale)) {
- if (!init_rcv_wnd) /* Use default unless specified otherwise */
- init_rcv_wnd = tcp_default_init_rwnd(mss);
- *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
- }
+ if (!init_rcv_wnd) /* Use default unless specified otherwise */
+ init_rcv_wnd = tcp_default_init_rwnd(mss);
+ *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
/* Set the clamp no higher than max representable value */
(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
@@ -585,14 +592,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
unsigned int remaining = MAX_TCP_OPTION_SPACE;
struct tcp_fastopen_request *fastopen = tp->fastopen_req;
+ *md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- *md5 = tp->af_specific->md5_lookup(sk, sk);
- if (*md5) {
- opts->options |= OPTION_MD5;
- remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ *md5 = tp->af_specific->md5_lookup(sk, sk);
+ if (*md5) {
+ opts->options |= OPTION_MD5;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ }
}
-#else
- *md5 = NULL;
#endif
/* We always get an MSS option. The option bytes which will be seen in
@@ -720,14 +728,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->options = 0;
+ *md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- *md5 = tp->af_specific->md5_lookup(sk, sk);
- if (unlikely(*md5)) {
- opts->options |= OPTION_MD5;
- size += TCPOLEN_MD5SIG_ALIGNED;
+ if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ *md5 = tp->af_specific->md5_lookup(sk, sk);
+ if (*md5) {
+ opts->options |= OPTION_MD5;
+ size += TCPOLEN_MD5SIG_ALIGNED;
+ }
}
-#else
- *md5 = NULL;
#endif
if (likely(tp->rx_opt.tstamp_ok)) {
@@ -772,7 +781,7 @@ struct tsq_tasklet {
};
static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
-static void tcp_tsq_handler(struct sock *sk)
+static void tcp_tsq_write(struct sock *sk)
{
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
@@ -789,6 +798,16 @@ static void tcp_tsq_handler(struct sock *sk)
0, GFP_ATOMIC);
}
}
+
+static void tcp_tsq_handler(struct sock *sk)
+{
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk))
+ tcp_tsq_write(sk);
+ else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+ sock_hold(sk);
+ bh_unlock_sock(sk);
+}
/*
* One tasklet per cpu tries to send more skbs.
* We run in tasklet context but need to disable irqs when
@@ -816,16 +835,7 @@ static void tcp_tasklet_func(unsigned long data)
smp_mb__before_atomic();
clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
- if (!sk->sk_lock.owned &&
- test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
- bh_lock_sock(sk);
- if (!sock_owned_by_user(sk)) {
- clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
- tcp_tsq_handler(sk);
- }
- bh_unlock_sock(sk);
- }
-
+ tcp_tsq_handler(sk);
sk_free(sk);
}
}
@@ -853,9 +863,10 @@ void tcp_release_cb(struct sock *sk)
nflags = flags & ~TCP_DEFERRED_ALL;
} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
- if (flags & TCPF_TSQ_DEFERRED)
- tcp_tsq_handler(sk);
-
+ if (flags & TCPF_TSQ_DEFERRED) {
+ tcp_tsq_write(sk);
+ __sock_put(sk);
+ }
/* Here begins the tricky part :
* We are called from release_sock() with :
* 1) BH disabled
@@ -929,7 +940,7 @@ void tcp_wfree(struct sk_buff *skb)
if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
goto out;
- nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
if (nval != oval)
continue;
@@ -948,37 +959,17 @@ out:
sk_free(sk);
}
-/* Note: Called under hard irq.
- * We can not call TCP stack right away.
+/* Note: Called under soft irq.
+ * We can call TCP stack right away, unless socket is owned by user.
*/
enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
{
struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
struct sock *sk = (struct sock *)tp;
- unsigned long nval, oval;
- for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
- struct tsq_tasklet *tsq;
- bool empty;
-
- if (oval & TSQF_QUEUED)
- break;
-
- nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
- nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
- if (nval != oval)
- continue;
+ tcp_tsq_handler(sk);
+ sock_put(sk);
- if (!refcount_inc_not_zero(&sk->sk_wmem_alloc))
- break;
- /* queue this socket to tasklet queue */
- tsq = this_cpu_ptr(&tsq_tasklet);
- empty = list_empty(&tsq->head);
- list_add(&tp->tsq_node, &tsq->head);
- if (empty)
- tasklet_schedule(&tsq->tasklet);
- break;
- }
return HRTIMER_NORESTART;
}
@@ -1011,7 +1002,8 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
do_div(len_ns, rate);
hrtimer_start(&tcp_sk(sk)->pacing_timer,
ktime_add_ns(ktime_get(), len_ns),
- HRTIMER_MODE_ABS_PINNED);
+ HRTIMER_MODE_ABS_PINNED_SOFT);
+ sock_hold(sk);
}
static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
@@ -1078,7 +1070,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
/* if no packet is in qdisc/device queue, then allow XPS to select
* another queue. We can be called from tcp_tsq_handler()
- * which holds one reference to sk_wmem_alloc.
+ * which holds one reference to sk.
*
* TODO: Ideally, in-flight pure ACK packets should not matter here.
* One way to get this would be to set skb->truesize = 2 on them.
@@ -2185,7 +2177,7 @@ static int tcp_mtu_probe(struct sock *sk)
static bool tcp_pacing_check(const struct sock *sk)
{
return tcp_needs_internal_pacing(sk) &&
- hrtimer_active(&tcp_sk(sk)->pacing_timer);
+ hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
}
/* TCP Small Queues :
@@ -2365,8 +2357,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
skb, limit, mss_now, gfp)))
break;
- if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
- clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 3a81720ac0c4..71593e4400ab 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -2,7 +2,7 @@
#include <linux/tcp.h>
#include <net/tcp.h>
-static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -21,6 +21,38 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}
+static u32 tcp_rack_reo_wnd(const struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->rack.reord) {
+ /* If reordering has not been observed, be aggressive during
+ * the recovery or starting the recovery by DUPACK threshold.
+ */
+ if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery)
+ return 0;
+
+ if (tp->sacked_out >= tp->reordering &&
+ !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH))
+ return 0;
+ }
+
+ /* To be more reordering resilient, allow min_rtt/4 settling delay.
+ * Use min_rtt instead of the smoothed RTT because reordering is
+ * often a path property and less related to queuing or delayed ACKs.
+ * Upon receiving DSACKs, linearly increase the window up to the
+ * smoothed RTT.
+ */
+ return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps,
+ tp->srtt_us >> 3);
+}
+
+s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
+{
+ return tp->rack.rtt_us + reo_wnd -
+ tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+}
+
/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
*
* Marks a packet lost, if some packet sent later has been (s)acked.
@@ -44,23 +76,11 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
- u32 min_rtt = tcp_min_rtt(tp);
struct sk_buff *skb, *n;
u32 reo_wnd;
*reo_timeout = 0;
- /* To be more reordering resilient, allow min_rtt/4 settling delay
- * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
- * RTT because reordering is often a path property and less related
- * to queuing or delayed ACKs.
- */
- reo_wnd = 1000;
- if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) &&
- min_rtt != ~0U) {
- reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
- reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
- }
-
+ reo_wnd = tcp_rack_reo_wnd(sk);
list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
tcp_tsorted_anchor) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
@@ -78,10 +98,9 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
/* A packet is lost if it has not been s/acked beyond
* the recent RTT plus the reordering window.
*/
- remaining = tp->rack.rtt_us + reo_wnd -
- tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+ remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd);
if (remaining <= 0) {
- tcp_rack_mark_skb_lost(sk, skb);
+ tcp_mark_skb_lost(sk, skb);
list_del_init(&skb->tcp_tsorted_anchor);
} else {
/* Record maximum wait time */
@@ -202,3 +221,30 @@ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
tp->rack.reo_wnd_steps = 1;
}
}
+
+/* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits
+ * the next unacked packet upon receiving
+ * a) three or more DUPACKs to start the fast recovery
+ * b) an ACK acknowledging new data during the fast recovery.
+ */
+void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
+{
+ const u8 state = inet_csk(sk)->icsk_ca_state;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) ||
+ (state == TCP_CA_Recovery && snd_una_advanced)) {
+ struct sk_buff *skb = tcp_rtx_queue_head(sk);
+ u32 mss;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+ return;
+
+ mss = tcp_skb_mss(skb);
+ if (tcp_skb_pcount(skb) > 1 && skb->len > mss)
+ tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+ mss, mss, GFP_ATOMIC);
+
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ }
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f7d944855f8e..3b3611729928 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -708,11 +708,36 @@ out:
sock_put(sk);
}
+static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
+{
+ struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer);
+ struct sock *sk = (struct sock *)tp;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ if (tp->compressed_ack)
+ tcp_send_ack(sk);
+ } else {
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
+ &sk->sk_tsq_flags))
+ sock_hold(sk);
+ }
+ bh_unlock_sock(sk);
+
+ sock_put(sk);
+
+ return HRTIMER_NORESTART;
+}
+
void tcp_init_xmit_timers(struct sock *sk)
{
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS_PINNED);
+ HRTIMER_MODE_ABS_PINNED_SOFT);
tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
+
+ hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_PINNED_SOFT);
+ tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick;
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 675433eb53a8..3365362cac88 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -544,9 +544,7 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
/* Must be called under rcu_read_lock().
* Does increment socket refcount.
*/
-#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
- IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
- IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
+#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
@@ -757,7 +755,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
}
EXPORT_SYMBOL(udp_set_csum);
-static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
+ struct inet_cork *cork)
{
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
@@ -777,6 +776,27 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
uh->len = htons(len);
uh->check = 0;
+ /* UDP GSO: validate the request, then mark the skb for L4
+ * segmentation and finish through the hardware-checksum path.
+ *
+ * The callers visible in this file hand the skb over and do not
+ * release it when this function fails, so every rejection below
+ * has to free it before returning.
+ */
+ if (cork->gso_size) {
+ const int hlen = skb_network_header_len(skb) +
+ sizeof(struct udphdr);
+
+ /* one segment plus headers must fit in a fragment */
+ if (hlen + cork->gso_size > cork->fragsize) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ if (sk->sk_no_check_tx) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
+ dst_xfrm(skb_dst(skb))) {
+ kfree_skb(skb);
+ return -EIO;
+ }
+
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ /* uh is a pointer: subtract the size of the UDP header it
+ * points to, not the size of the pointer itself.
+ */
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(*uh),
+ cork->gso_size);
+ goto csum_partial;
+ }
+
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
@@ -786,6 +806,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
goto send;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+csum_partial:
udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
goto send;
@@ -828,7 +849,7 @@ int udp_push_pending_frames(struct sock *sk)
if (!skb)
goto out;
- err = udp_send_skb(skb, fl4);
+ err = udp_send_skb(skb, fl4, &inet->cork.base);
out:
up->len = 0;
@@ -837,10 +858,48 @@ out:
}
EXPORT_SYMBOL(udp_push_pending_frames);
+/* Parse a single control message that the caller has already identified as
+ * SOL_UDP level.
+ *
+ * Only UDP_SEGMENT is handled: its payload must be exactly one __u16, which
+ * is copied into *gso_size.
+ *
+ * Returns 0 on success, -EINVAL if the length is wrong or cmsg_type is not
+ * UDP_SEGMENT.
+ */
+static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
+{
+ switch (cmsg->cmsg_type) {
+ case UDP_SEGMENT:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
+ return -EINVAL;
+ *gso_size = *(__u16 *)CMSG_DATA(cmsg);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+/* Walk every control message attached to @msg and process the SOL_UDP ones
+ * via __udp_cmsg_send(); messages at any other level are only noted, not
+ * parsed. @sk is not used by this function.
+ *
+ * Returns a negative errno if a header fails CMSG_OK() or a SOL_UDP message
+ * is rejected. Otherwise returns 1 if at least one non-SOL_UDP message was
+ * seen and 0 if there was none; udp_sendmsg() calls ip_cmsg_send() only when
+ * the result is > 0.
+ */
+int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
+{
+ struct cmsghdr *cmsg;
+ bool need_ip = false;
+ int err;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ /* not ours: remember that another parser still has work to do */
+ if (cmsg->cmsg_level != SOL_UDP) {
+ need_ip = true;
+ continue;
+ }
+
+ err = __udp_cmsg_send(cmsg, gso_size);
+ if (err)
+ return err;
+ }
+
+ return need_ip;
+}
+EXPORT_SYMBOL_GPL(udp_cmsg_send);
+
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
struct flowi4 fl4_stack;
struct flowi4 *fl4;
int ulen = len;
@@ -895,8 +954,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/*
* Get and verify the address.
*/
- if (msg->msg_name) {
- DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ if (usin) {
if (msg->msg_namelen < sizeof(*usin))
return -EINVAL;
if (usin->sin_family != AF_INET) {
@@ -922,10 +980,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.sockc.tsflags = sk->sk_tsflags;
ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
+ ipc.gso_size = up->gso_size;
if (msg->msg_controllen) {
- err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
- if (unlikely(err)) {
+ err = udp_cmsg_send(sk, msg, &ipc.gso_size);
+ if (err > 0)
+ err = ip_cmsg_send(sk, msg, &ipc,
+ sk->sk_family == AF_INET6);
+ if (unlikely(err < 0)) {
kfree(ipc.opt);
return err;
}
@@ -946,6 +1008,22 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_unlock();
}
+ if (cgroup_bpf_enabled && !connected) {
+ err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
+ (struct sockaddr *)usin, &ipc.addr);
+ if (err)
+ goto out_free;
+ if (usin) {
+ if (usin->sin_port == 0) {
+ /* BPF program set invalid port. Reject it. */
+ err = -EINVAL;
+ goto out_free;
+ }
+ daddr = usin->sin_addr.s_addr;
+ dport = usin->sin_port;
+ }
+ }
+
saddr = ipc.addr;
ipc.addr = faddr = daddr;
@@ -1032,12 +1110,14 @@ back_from_confirm:
/* Lockless fast path for the non-corking case. */
if (!corkreq) {
+ struct inet_cork cork;
+
skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc, &rt,
- msg->msg_flags);
+ &cork, msg->msg_flags);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
- err = udp_send_skb(skb, fl4);
+ err = udp_send_skb(skb, fl4, &cork);
goto out;
}
@@ -1813,10 +1893,10 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return 0;
}
-static struct static_key udp_encap_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
void udp_encap_enable(void)
{
- static_key_enable(&udp_encap_needed);
+ static_branch_enable(&udp_encap_needed_key);
}
EXPORT_SYMBOL(udp_encap_enable);
@@ -1840,7 +1920,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
nf_reset(skb);
- if (static_key_false(&udp_encap_needed) && up->encap_type) {
+ if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/*
@@ -2303,7 +2383,7 @@ void udp_destroy_sock(struct sock *sk)
bool slow = lock_sock_fast(sk);
udp_flush_pending_frames(sk);
unlock_sock_fast(sk, slow);
- if (static_key_false(&udp_encap_needed) && up->encap_type) {
+ if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
void (*encap_destroy)(struct sock *sk);
encap_destroy = READ_ONCE(up->encap_destroy);
if (encap_destroy)
@@ -2368,6 +2448,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
up->no_check6_rx = valbool;
break;
+ case UDP_SEGMENT:
+ if (val < 0 || val > USHRT_MAX)
+ return -EINVAL;
+ up->gso_size = val;
+ break;
+
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
@@ -2458,6 +2544,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
val = up->no_check6_rx;
break;
+ case UDP_SEGMENT:
+ val = up->gso_size;
+ break;
+
/* The following two cannot be changed on UDP sockets, the return is
* always 0 (which corresponds to the full checksum coverage of UDP). */
case UDPLITE_SEND_CSCOV:
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ea6e6e7df0ee..92dc9e5a7ff3 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -187,6 +187,102 @@ out_unlock:
}
EXPORT_SYMBOL(skb_udp_tunnel_segment);
+/* Segment a UDP GSO skb into a list of UDP datagrams.
+ *
+ * Pulls the UDP header off @gso_skb, lets skb_segment() split it, then
+ * rewrites uh->len and uh->check in every resulting segment. All segments
+ * but the last get the same length (header + mss); the last one may be
+ * shorter and is fixed up separately after the loop.
+ *
+ * When the original skb's destructor is sock_wfree, each segment is given
+ * that destructor and the owning socket, and sk->sk_wmem_alloc is adjusted
+ * by the difference between the segments' summed truesize and the original
+ * skb's truesize.
+ *
+ * Returns the head of the segment list (linked through ->next),
+ * ERR_PTR(-EINVAL) if gso_skb->len does not exceed one UDP header plus
+ * gso_size, or whatever skb_segment() returned when it failed (an error
+ * pointer or NULL).
+ */
+struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
+ netdev_features_t features)
+{
+ struct sock *sk = gso_skb->sk;
+ unsigned int sum_truesize = 0;
+ struct sk_buff *segs, *seg;
+ struct udphdr *uh;
+ unsigned int mss;
+ bool copy_dtor;
+ __sum16 check;
+ __be16 newlen;
+
+ mss = skb_shinfo(gso_skb)->gso_size;
+ if (gso_skb->len <= sizeof(*uh) + mss)
+ return ERR_PTR(-EINVAL);
+
+ skb_pull(gso_skb, sizeof(*uh));
+
+ /* clear destructor to avoid skb_segment assigning it to tail */
+ copy_dtor = gso_skb->destructor == sock_wfree;
+ if (copy_dtor)
+ gso_skb->destructor = NULL;
+
+ segs = skb_segment(gso_skb, features);
+ if (unlikely(IS_ERR_OR_NULL(segs))) {
+ /* segmentation failed: put the destructor back before bailing out */
+ if (copy_dtor)
+ gso_skb->destructor = sock_wfree;
+ return segs;
+ }
+
+ /* GSO partial and frag_list segmentation only requires splitting
+ * the frame into an MSS multiple and possibly a remainder, both
+ * cases return a GSO skb. So update the mss now.
+ */
+ if (skb_is_gso(segs))
+ mss *= skb_shinfo(segs)->gso_segs;
+
+ seg = segs;
+ uh = udp_hdr(seg);
+
+ /* compute checksum adjustment based on old length versus new */
+ newlen = htons(sizeof(*uh) + mss);
+ check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
+ /* Walk all segments; the loop exits on the last one before its
+ * header is rewritten, because that one needs its own length.
+ */
+ for (;;) {
+ if (copy_dtor) {
+ seg->destructor = sock_wfree;
+ seg->sk = sk;
+ sum_truesize += seg->truesize;
+ }
+
+ if (!seg->next)
+ break;
+
+ uh->len = newlen;
+ uh->check = check;
+
+ if (seg->ip_summed == CHECKSUM_PARTIAL)
+ gso_reset_checksum(seg, ~check);
+ else
+ uh->check = gso_make_checksum(seg, ~check) ? :
+ CSUM_MANGLED_0;
+
+ seg = seg->next;
+ uh = udp_hdr(seg);
+ }
+
+ /* last packet can be partial gso_size, account for that in checksum */
+ newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) +
+ seg->data_len);
+ check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
+ uh->len = newlen;
+ uh->check = check;
+
+ if (seg->ip_summed == CHECKSUM_PARTIAL)
+ gso_reset_checksum(seg, ~check);
+ else
+ uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
+
+ /* update refcount for the packet */
+ if (copy_dtor) {
+ int delta = sum_truesize - gso_skb->truesize;
+
+ /* In some pathological cases, delta can be negative.
+ * We need to either use refcount_add() or refcount_sub_and_test()
+ */
+ if (likely(delta >= 0))
+ refcount_add(delta, &sk->sk_wmem_alloc);
+ else
+ WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc));
+ }
+ return segs;
+}
+EXPORT_SYMBOL_GPL(__udp_gso_segment);
+
static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -203,12 +299,15 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
goto out;
}
- if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP))
+ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
goto out;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto out;
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+ return __udp_gso_segment(skb, features);
+
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 11e4e80cf7e9..0eff75525da1 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -329,4 +329,9 @@ config IPV6_SEG6_HMAC
If unsure, say N.
+config IPV6_SEG6_BPF
+ def_bool y
+ depends on IPV6_SEG6_LWTUNNEL
+ depends on IPV6 = y
+
endif # IPV6
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1b5ea3379d9b..89019bf59f46 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -170,7 +170,7 @@ static void addrconf_type_change(struct net_device *dev,
unsigned long event);
static int addrconf_ifdown(struct net_device *dev, int how);
-static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
+static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
int plen,
const struct net_device *dev,
u32 flags, u32 noflags);
@@ -916,7 +916,6 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
pr_warn("Freeing alive inet6 address %p\n", ifp);
return;
}
- ip6_rt_put(ifp->rt);
kfree_rcu(ifp, rcu);
}
@@ -987,17 +986,15 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
/* On success it returns ifp with increased reference count */
static struct inet6_ifaddr *
-ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
- const struct in6_addr *peer_addr, int pfxlen,
- int scope, u32 flags, u32 valid_lft, u32 prefered_lft,
+ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
bool can_block, struct netlink_ext_ack *extack)
{
gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC;
+ int addr_type = ipv6_addr_type(cfg->pfx);
struct net *net = dev_net(idev->dev);
struct inet6_ifaddr *ifa = NULL;
- struct rt6_info *rt = NULL;
+ struct fib6_info *f6i = NULL;
int err = 0;
- int addr_type = ipv6_addr_type(addr);
if (addr_type == IPV6_ADDR_ANY ||
addr_type & IPV6_ADDR_MULTICAST ||
@@ -1020,7 +1017,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
*/
if (can_block) {
struct in6_validator_info i6vi = {
- .i6vi_addr = *addr,
+ .i6vi_addr = *cfg->pfx,
.i6vi_dev = idev,
.extack = extack,
};
@@ -1037,38 +1034,39 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
goto out;
}
- rt = addrconf_dst_alloc(idev, addr, false);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- rt = NULL;
+ f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags);
+ if (IS_ERR(f6i)) {
+ err = PTR_ERR(f6i);
+ f6i = NULL;
goto out;
}
if (net->ipv6.devconf_all->disable_policy ||
idev->cnf.disable_policy)
- rt->dst.flags |= DST_NOPOLICY;
+ f6i->dst_nopolicy = true;
neigh_parms_data_state_setall(idev->nd_parms);
- ifa->addr = *addr;
- if (peer_addr)
- ifa->peer_addr = *peer_addr;
+ ifa->addr = *cfg->pfx;
+ if (cfg->peer_pfx)
+ ifa->peer_addr = *cfg->peer_pfx;
spin_lock_init(&ifa->lock);
INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work);
INIT_HLIST_NODE(&ifa->addr_lst);
- ifa->scope = scope;
- ifa->prefix_len = pfxlen;
- ifa->flags = flags;
+ ifa->scope = cfg->scope;
+ ifa->prefix_len = cfg->plen;
+ ifa->rt_priority = cfg->rt_priority;
+ ifa->flags = cfg->ifa_flags;
/* No need to add the TENTATIVE flag for addresses with NODAD */
- if (!(flags & IFA_F_NODAD))
+ if (!(cfg->ifa_flags & IFA_F_NODAD))
ifa->flags |= IFA_F_TENTATIVE;
- ifa->valid_lft = valid_lft;
- ifa->prefered_lft = prefered_lft;
+ ifa->valid_lft = cfg->valid_lft;
+ ifa->prefered_lft = cfg->preferred_lft;
ifa->cstamp = ifa->tstamp = jiffies;
ifa->tokenized = false;
- ifa->rt = rt;
+ ifa->rt = f6i;
ifa->idev = idev;
in6_dev_hold(idev);
@@ -1102,8 +1100,8 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
inet6addr_notifier_call_chain(NETDEV_UP, ifa);
out:
if (unlikely(err < 0)) {
- if (rt)
- ip6_rt_put(rt);
+ fib6_info_release(f6i);
+
if (ifa) {
if (ifa->idev)
in6_dev_put(ifa->idev);
@@ -1179,19 +1177,19 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
static void
cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt)
{
- struct rt6_info *rt;
+ struct fib6_info *f6i;
- rt = addrconf_get_prefix_route(&ifp->addr,
+ f6i = addrconf_get_prefix_route(&ifp->addr,
ifp->prefix_len,
ifp->idev->dev,
0, RTF_GATEWAY | RTF_DEFAULT);
- if (rt) {
+ if (f6i) {
if (del_rt)
- ip6_del_rt(rt);
+ ip6_del_rt(dev_net(ifp->idev->dev), f6i);
else {
- if (!(rt->rt6i_flags & RTF_EXPIRES))
- rt6_set_expires(rt, expires);
- ip6_rt_put(rt);
+ if (!(f6i->fib6_flags & RTF_EXPIRES))
+ fib6_set_expires(f6i, expires);
+ fib6_info_release(f6i);
}
}
}
@@ -1261,11 +1259,10 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp,
{
struct inet6_dev *idev = ifp->idev;
struct in6_addr addr, *tmpaddr;
- unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_tstamp, age;
+ unsigned long tmp_tstamp, age;
unsigned long regen_advance;
- int tmp_plen;
+ struct ifa6_config cfg;
int ret = 0;
- u32 addr_flags;
unsigned long now = jiffies;
long max_desync_factor;
s32 cnf_temp_preferred_lft;
@@ -1327,13 +1324,12 @@ retry:
}
}
- tmp_valid_lft = min_t(__u32,
- ifp->valid_lft,
+ cfg.valid_lft = min_t(__u32, ifp->valid_lft,
idev->cnf.temp_valid_lft + age);
- tmp_prefered_lft = cnf_temp_preferred_lft + age -
- idev->desync_factor;
- tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, tmp_prefered_lft);
- tmp_plen = ifp->prefix_len;
+ cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor;
+ cfg.preferred_lft = min_t(__u32, ifp->prefered_lft, cfg.preferred_lft);
+
+ cfg.plen = ifp->prefix_len;
tmp_tstamp = ifp->tstamp;
spin_unlock_bh(&ifp->lock);
@@ -1347,21 +1343,23 @@ retry:
* temporary addresses being generated.
*/
age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
- if (tmp_prefered_lft <= regen_advance + age) {
+ if (cfg.preferred_lft <= regen_advance + age) {
in6_ifa_put(ifp);
in6_dev_put(idev);
ret = -1;
goto out;
}
- addr_flags = IFA_F_TEMPORARY;
+ cfg.ifa_flags = IFA_F_TEMPORARY;
/* set in addrconf_prefix_rcv() */
if (ifp->flags & IFA_F_OPTIMISTIC)
- addr_flags |= IFA_F_OPTIMISTIC;
+ cfg.ifa_flags |= IFA_F_OPTIMISTIC;
- ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen,
- ipv6_addr_scope(&addr), addr_flags,
- tmp_valid_lft, tmp_prefered_lft, block, NULL);
+ cfg.pfx = &addr;
+ /* cfg is a stack variable declared without an initializer, and
+ * ipv6_add_addr() dereferences cfg->peer_pfx whenever it is
+ * non-NULL. Temporary addresses have no peer (the previous call
+ * passed NULL here), so clear the field explicitly instead of
+ * leaving stack garbage in it.
+ */
+ cfg.peer_pfx = NULL;
+ cfg.scope = ipv6_addr_scope(cfg.pfx);
+ cfg.rt_priority = 0;
+
+ ift = ipv6_add_addr(idev, &cfg, block, NULL);
if (IS_ERR(ift)) {
in6_ifa_put(ifp);
in6_dev_put(idev);
@@ -2032,13 +2030,17 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
spin_lock_bh(&ifp->lock);
if (ifp->flags & IFA_F_STABLE_PRIVACY) {
- int scope = ifp->scope;
- u32 flags = ifp->flags;
struct in6_addr new_addr;
struct inet6_ifaddr *ifp2;
- u32 valid_lft, preferred_lft;
- int pfxlen = ifp->prefix_len;
int retries = ifp->stable_privacy_retry + 1;
+ struct ifa6_config cfg = {
+ .pfx = &new_addr,
+ .plen = ifp->prefix_len,
+ .ifa_flags = ifp->flags,
+ .valid_lft = ifp->valid_lft,
+ .preferred_lft = ifp->prefered_lft,
+ .scope = ifp->scope,
+ };
if (retries > net->ipv6.sysctl.idgen_retries) {
net_info_ratelimited("%s: privacy stable address generation failed because of DAD conflicts!\n",
@@ -2051,9 +2053,6 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
idev))
goto errdad;
- valid_lft = ifp->valid_lft;
- preferred_lft = ifp->prefered_lft;
-
spin_unlock_bh(&ifp->lock);
if (idev->cnf.max_addresses &&
@@ -2064,9 +2063,7 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n",
ifp->idev->dev->name);
- ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen,
- scope, flags, valid_lft,
- preferred_lft, false, NULL);
+ ifp2 = ipv6_add_addr(idev, &cfg, false, NULL);
if (IS_ERR(ifp2))
goto lock_errdad;
@@ -2254,6 +2251,7 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
return addrconf_ifid_ieee1394(eui, dev);
case ARPHRD_TUNNEL6:
case ARPHRD_IP6GRE:
+ case ARPHRD_RAWIP:
return addrconf_ifid_ip6tnl(eui, dev);
}
return -1;
@@ -2319,18 +2317,20 @@ static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
*/
static void
-addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
- unsigned long expires, u32 flags)
+addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric,
+ struct net_device *dev, unsigned long expires,
+ u32 flags, gfp_t gfp_flags)
{
struct fib6_config cfg = {
.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
- .fc_metric = IP6_RT_PRIO_ADDRCONF,
+ .fc_metric = metric ? : IP6_RT_PRIO_ADDRCONF,
.fc_ifindex = dev->ifindex,
.fc_expires = expires,
.fc_dst_len = plen,
.fc_flags = RTF_UP | flags,
.fc_nlinfo.nl_net = dev_net(dev),
.fc_protocol = RTPROT_KERNEL,
+ .fc_type = RTN_UNICAST,
};
cfg.fc_dst = *pfx;
@@ -2344,17 +2344,17 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
cfg.fc_flags |= RTF_NONEXTHOP;
#endif
- ip6_route_add(&cfg, NULL);
+ ip6_route_add(&cfg, gfp_flags, NULL);
}
-static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
+static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
int plen,
const struct net_device *dev,
u32 flags, u32 noflags)
{
struct fib6_node *fn;
- struct rt6_info *rt = NULL;
+ struct fib6_info *rt = NULL;
struct fib6_table *table;
u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
@@ -2368,14 +2368,13 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
goto out;
for_each_fib6_node_rt_rcu(fn) {
- if (rt->dst.dev->ifindex != dev->ifindex)
+ if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex)
continue;
- if ((rt->rt6i_flags & flags) != flags)
+ if ((rt->fib6_flags & flags) != flags)
continue;
- if ((rt->rt6i_flags & noflags) != 0)
+ if ((rt->fib6_flags & noflags) != 0)
continue;
- if (!dst_hold_safe(&rt->dst))
- rt = NULL;
+ fib6_info_hold(rt);
break;
}
out:
@@ -2394,12 +2393,13 @@ static void addrconf_add_mroute(struct net_device *dev)
.fc_ifindex = dev->ifindex,
.fc_dst_len = 8,
.fc_flags = RTF_UP,
+ .fc_type = RTN_UNICAST,
.fc_nlinfo.nl_net = dev_net(dev),
};
ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
- ip6_route_add(&cfg, NULL);
+ ip6_route_add(&cfg, GFP_ATOMIC, NULL);
}
static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
@@ -2507,12 +2507,20 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
if (!ifp && valid_lft) {
int max_addresses = in6_dev->cnf.max_addresses;
+ struct ifa6_config cfg = {
+ .pfx = addr,
+ .plen = pinfo->prefix_len,
+ .ifa_flags = addr_flags,
+ .valid_lft = valid_lft,
+ .preferred_lft = prefered_lft,
+ .scope = addr_type & IPV6_ADDR_SCOPE_MASK,
+ };
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
if ((net->ipv6.devconf_all->optimistic_dad ||
in6_dev->cnf.optimistic_dad) &&
!net->ipv6.devconf_all->forwarding && sllao)
- addr_flags |= IFA_F_OPTIMISTIC;
+ cfg.ifa_flags |= IFA_F_OPTIMISTIC;
#endif
/* Do not allow to create too much of autoconfigured
@@ -2520,16 +2528,11 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
*/
if (!max_addresses ||
ipv6_count_addresses(in6_dev) < max_addresses)
- ifp = ipv6_add_addr(in6_dev, addr, NULL,
- pinfo->prefix_len,
- addr_type&IPV6_ADDR_SCOPE_MASK,
- addr_flags, valid_lft,
- prefered_lft, false, NULL);
+ ifp = ipv6_add_addr(in6_dev, &cfg, false, NULL);
if (IS_ERR_OR_NULL(ifp))
return -1;
- update_lft = 0;
create = 1;
spin_lock_bh(&ifp->lock);
ifp->flags |= IFA_F_MANAGETEMPADDR;
@@ -2551,7 +2554,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
else
stored_lft = 0;
- if (!update_lft && !create && stored_lft) {
+ if (!create && stored_lft) {
const u32 minimum_lft = min_t(u32,
stored_lft, MIN_VALID_LIFETIME);
valid_lft = max(valid_lft, minimum_lft);
@@ -2642,7 +2645,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
*/
if (pinfo->onlink) {
- struct rt6_info *rt;
+ struct fib6_info *rt;
unsigned long rt_expires;
/* Avoid arithmetic overflow. Really, we could
@@ -2667,13 +2670,13 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
if (rt) {
/* Autoconf prefix route */
if (valid_lft == 0) {
- ip6_del_rt(rt);
+ ip6_del_rt(net, rt);
rt = NULL;
} else if (addrconf_finite_timeout(rt_expires)) {
/* not infinity */
- rt6_set_expires(rt, jiffies + rt_expires);
+ fib6_set_expires(rt, jiffies + rt_expires);
} else {
- rt6_clean_expires(rt);
+ fib6_clean_expires(rt);
}
} else if (valid_lft) {
clock_t expires = 0;
@@ -2684,9 +2687,10 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
expires = jiffies_to_clock_t(rt_expires);
}
addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
- dev, expires, flags);
+ 0, dev, expires, flags,
+ GFP_ATOMIC);
}
- ip6_rt_put(rt);
+ fib6_info_release(rt);
}
/* Try to figure out our local address for this prefix */
@@ -2831,10 +2835,7 @@ static int ipv6_mc_config(struct sock *sk, bool join,
* Manual configuration of address on an interface
*/
static int inet6_addr_add(struct net *net, int ifindex,
- const struct in6_addr *pfx,
- const struct in6_addr *peer_pfx,
- unsigned int plen, __u32 ifa_flags,
- __u32 prefered_lft, __u32 valid_lft,
+ struct ifa6_config *cfg,
struct netlink_ext_ack *extack)
{
struct inet6_ifaddr *ifp;
@@ -2842,19 +2843,18 @@ static int inet6_addr_add(struct net *net, int ifindex,
struct net_device *dev;
unsigned long timeout;
clock_t expires;
- int scope;
u32 flags;
ASSERT_RTNL();
- if (plen > 128)
+ if (cfg->plen > 128)
return -EINVAL;
/* check the lifetime */
- if (!valid_lft || prefered_lft > valid_lft)
+ if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
return -EINVAL;
- if (ifa_flags & IFA_F_MANAGETEMPADDR && plen != 64)
+ if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64)
return -EINVAL;
dev = __dev_get_by_index(net, ifindex);
@@ -2865,58 +2865,62 @@ static int inet6_addr_add(struct net *net, int ifindex,
if (IS_ERR(idev))
return PTR_ERR(idev);
- if (ifa_flags & IFA_F_MCAUTOJOIN) {
+ if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk,
- true, pfx, ifindex);
+ true, cfg->pfx, ifindex);
if (ret < 0)
return ret;
}
- scope = ipv6_addr_scope(pfx);
+ cfg->scope = ipv6_addr_scope(cfg->pfx);
- timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
if (addrconf_finite_timeout(timeout)) {
expires = jiffies_to_clock_t(timeout * HZ);
- valid_lft = timeout;
+ cfg->valid_lft = timeout;
flags = RTF_EXPIRES;
} else {
expires = 0;
flags = 0;
- ifa_flags |= IFA_F_PERMANENT;
+ cfg->ifa_flags |= IFA_F_PERMANENT;
}
- timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
if (addrconf_finite_timeout(timeout)) {
if (timeout == 0)
- ifa_flags |= IFA_F_DEPRECATED;
- prefered_lft = timeout;
+ cfg->ifa_flags |= IFA_F_DEPRECATED;
+ cfg->preferred_lft = timeout;
}
- ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags,
- valid_lft, prefered_lft, true, extack);
-
+ ifp = ipv6_add_addr(idev, cfg, true, extack);
if (!IS_ERR(ifp)) {
- if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
- addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
- expires, flags);
+ if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ ifp->rt_priority, dev, expires,
+ flags, GFP_KERNEL);
}
+ /* Send a netlink notification if DAD is enabled and
+ * optimistic flag is not set
+ */
+ if (!(ifp->flags & (IFA_F_OPTIMISTIC | IFA_F_NODAD)))
+ ipv6_ifa_notify(0, ifp);
/*
* Note that section 3.1 of RFC 4429 indicates
* that the Optimistic flag should not be set for
* manually configured addresses
*/
addrconf_dad_start(ifp);
- if (ifa_flags & IFA_F_MANAGETEMPADDR)
- manage_tempaddrs(idev, ifp, valid_lft, prefered_lft,
- true, jiffies);
+ if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR)
+ manage_tempaddrs(idev, ifp, cfg->valid_lft,
+ cfg->preferred_lft, true, jiffies);
in6_ifa_put(ifp);
addrconf_verify_rtnl();
return 0;
- } else if (ifa_flags & IFA_F_MCAUTOJOIN) {
- ipv6_mc_config(net->ipv6.mc_autojoin_sk,
- false, pfx, ifindex);
+ } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
+ ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
+ cfg->pfx, ifindex);
}
return PTR_ERR(ifp);
@@ -2967,6 +2971,11 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
int addrconf_add_ifaddr(struct net *net, void __user *arg)
{
+ struct ifa6_config cfg = {
+ .ifa_flags = IFA_F_PERMANENT,
+ .preferred_lft = INFINITY_LIFE_TIME,
+ .valid_lft = INFINITY_LIFE_TIME,
+ };
struct in6_ifreq ireq;
int err;
@@ -2976,10 +2985,11 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
return -EFAULT;
+ cfg.pfx = &ireq.ifr6_addr;
+ cfg.plen = ireq.ifr6_prefixlen;
+
rtnl_lock();
- err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL,
- ireq.ifr6_prefixlen, IFA_F_PERMANENT,
- INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL);
+ err = inet6_addr_add(net, ireq.ifr6_ifindex, &cfg, NULL);
rtnl_unlock();
return err;
}
@@ -3006,11 +3016,16 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
int plen, int scope)
{
struct inet6_ifaddr *ifp;
+ struct ifa6_config cfg = {
+ .pfx = addr,
+ .plen = plen,
+ .ifa_flags = IFA_F_PERMANENT,
+ .valid_lft = INFINITY_LIFE_TIME,
+ .preferred_lft = INFINITY_LIFE_TIME,
+ .scope = scope
+ };
- ifp = ipv6_add_addr(idev, addr, NULL, plen,
- scope, IFA_F_PERMANENT,
- INFINITY_LIFE_TIME, INFINITY_LIFE_TIME,
- true, NULL);
+ ifp = ipv6_add_addr(idev, &cfg, true, NULL);
if (!IS_ERR(ifp)) {
spin_lock_bh(&ifp->lock);
ifp->flags &= ~IFA_F_TENTATIVE;
@@ -3047,7 +3062,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
if (addr.s6_addr32[3]) {
add_addr(idev, &addr, plen, scope);
- addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags);
+ addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
+ GFP_ATOMIC);
return;
}
@@ -3071,8 +3087,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
}
add_addr(idev, &addr, plen, flag);
- addrconf_prefix_route(&addr, plen, idev->dev, 0,
- pflags);
+ addrconf_prefix_route(&addr, plen, 0, idev->dev,
+ 0, pflags, GFP_ATOMIC);
}
}
}
@@ -3099,20 +3115,27 @@ static void init_loopback(struct net_device *dev)
void addrconf_add_linklocal(struct inet6_dev *idev,
const struct in6_addr *addr, u32 flags)
{
+ struct ifa6_config cfg = {
+ .pfx = addr,
+ .plen = 64,
+ .ifa_flags = flags | IFA_F_PERMANENT,
+ .valid_lft = INFINITY_LIFE_TIME,
+ .preferred_lft = INFINITY_LIFE_TIME,
+ .scope = IFA_LINK
+ };
struct inet6_ifaddr *ifp;
- u32 addr_flags = flags | IFA_F_PERMANENT;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad ||
idev->cnf.optimistic_dad) &&
!dev_net(idev->dev)->ipv6.devconf_all->forwarding)
- addr_flags |= IFA_F_OPTIMISTIC;
+ cfg.ifa_flags |= IFA_F_OPTIMISTIC;
#endif
- ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags,
- INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL);
+ ifp = ipv6_add_addr(idev, &cfg, true, NULL);
if (!IS_ERR(ifp)) {
- addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0);
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len, 0, idev->dev,
+ 0, 0, GFP_ATOMIC);
addrconf_dad_start(ifp);
in6_ifa_put(ifp);
}
@@ -3227,7 +3250,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
addrconf_add_linklocal(idev, &addr,
IFA_F_STABLE_PRIVACY);
else if (prefix_route)
- addrconf_prefix_route(&addr, 64, idev->dev, 0, 0);
+ addrconf_prefix_route(&addr, 64, 0, idev->dev,
+ 0, 0, GFP_KERNEL);
break;
case IN6_ADDR_GEN_MODE_EUI64:
/* addrconf_add_linklocal also adds a prefix_route and we
@@ -3237,7 +3261,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0)
addrconf_add_linklocal(idev, &addr, 0);
else if (prefix_route)
- addrconf_prefix_route(&addr, 64, idev->dev, 0, 0);
+ addrconf_prefix_route(&addr, 64, 0, idev->dev,
+ 0, 0, GFP_KERNEL);
break;
case IN6_ADDR_GEN_MODE_NONE:
default:
@@ -3262,7 +3287,8 @@ static void addrconf_dev_config(struct net_device *dev)
(dev->type != ARPHRD_IP6GRE) &&
(dev->type != ARPHRD_IPGRE) &&
(dev->type != ARPHRD_TUNNEL) &&
- (dev->type != ARPHRD_NONE)) {
+ (dev->type != ARPHRD_NONE) &&
+ (dev->type != ARPHRD_RAWIP)) {
/* Alas, we support only Ethernet autoconfiguration. */
return;
}
@@ -3329,32 +3355,35 @@ static void addrconf_gre_config(struct net_device *dev)
}
#endif
-static int fixup_permanent_addr(struct inet6_dev *idev,
+static int fixup_permanent_addr(struct net *net,
+ struct inet6_dev *idev,
struct inet6_ifaddr *ifp)
{
- /* !rt6i_node means the host route was removed from the
+ /* !fib6_node means the host route was removed from the
* FIB, for example, if 'lo' device is taken down. In that
* case regenerate the host route.
*/
- if (!ifp->rt || !ifp->rt->rt6i_node) {
- struct rt6_info *rt, *prev;
+ if (!ifp->rt || !ifp->rt->fib6_node) {
+ struct fib6_info *f6i, *prev;
- rt = addrconf_dst_alloc(idev, &ifp->addr, false);
- if (IS_ERR(rt))
- return PTR_ERR(rt);
+ f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false,
+ GFP_ATOMIC);
+ if (IS_ERR(f6i))
+ return PTR_ERR(f6i);
/* ifp->rt can be accessed outside of rtnl */
spin_lock(&ifp->lock);
prev = ifp->rt;
- ifp->rt = rt;
+ ifp->rt = f6i;
spin_unlock(&ifp->lock);
- ip6_rt_put(prev);
+ fib6_info_release(prev);
}
if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
- idev->dev, 0, 0);
+ ifp->rt_priority, idev->dev, 0, 0,
+ GFP_ATOMIC);
}
if (ifp->state == INET6_IFADDR_STATE_PREDAD)
@@ -3363,7 +3392,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev,
return 0;
}
-static void addrconf_permanent_addr(struct net_device *dev)
+static void addrconf_permanent_addr(struct net *net, struct net_device *dev)
{
struct inet6_ifaddr *ifp, *tmp;
struct inet6_dev *idev;
@@ -3376,7 +3405,7 @@ static void addrconf_permanent_addr(struct net_device *dev)
list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
if ((ifp->flags & IFA_F_PERMANENT) &&
- fixup_permanent_addr(idev, ifp) < 0) {
+ fixup_permanent_addr(net, idev, ifp) < 0) {
write_unlock_bh(&idev->lock);
in6_ifa_hold(ifp);
ipv6_del_addr(ifp);
@@ -3445,7 +3474,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
if (event == NETDEV_UP) {
/* restore routes for permanent addresses */
- addrconf_permanent_addr(dev);
+ addrconf_permanent_addr(net, dev);
if (!addrconf_link_ready(dev)) {
/* device is not ready yet. */
@@ -3612,8 +3641,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
struct net *net = dev_net(dev);
struct inet6_dev *idev;
struct inet6_ifaddr *ifa, *tmp;
- int _keep_addr;
- bool keep_addr;
+ bool keep_addr = false;
int state, i;
ASSERT_RTNL();
@@ -3639,15 +3667,18 @@ static int addrconf_ifdown(struct net_device *dev, int how)
}
- /* aggregate the system setting and interface setting */
- _keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
- if (!_keep_addr)
- _keep_addr = idev->cnf.keep_addr_on_down;
-
/* combine the user config with event to determine if permanent
* addresses are to be removed from address hash table
*/
- keep_addr = !(how || _keep_addr <= 0 || idev->cnf.disable_ipv6);
+ if (!how && !idev->cnf.disable_ipv6) {
+ /* aggregate the system setting and interface setting */
+ int _keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
+
+ if (!_keep_addr)
+ _keep_addr = idev->cnf.keep_addr_on_down;
+
+ keep_addr = (_keep_addr > 0);
+ }
/* Step 2: clear hash table */
for (i = 0; i < IN6_ADDR_HSIZE; i++) {
@@ -3697,13 +3728,8 @@ restart:
write_lock_bh(&idev->lock);
}
- /* re-combine the user config with event to determine if permanent
- * addresses are to be removed from the interface list
- */
- keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6);
-
list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
- struct rt6_info *rt = NULL;
+ struct fib6_info *rt = NULL;
bool keep;
addrconf_del_dad_work(ifa);
@@ -3731,7 +3757,7 @@ restart:
spin_unlock_bh(&ifa->lock);
if (rt)
- ip6_del_rt(rt);
+ ip6_del_rt(net, rt);
if (state != INET6_IFADDR_STATE_DEAD) {
__ipv6_ifa_notify(RTM_DELADDR, ifa);
@@ -3849,6 +3875,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
struct inet6_dev *idev = ifp->idev;
struct net_device *dev = idev->dev;
bool bump_id, notify = false;
+ struct net *net;
addrconf_join_solict(dev, &ifp->addr);
@@ -3859,8 +3886,9 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
if (ifp->state == INET6_IFADDR_STATE_DEAD)
goto out;
+ net = dev_net(dev);
if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
- (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 &&
+ (net->ipv6.devconf_all->accept_dad < 1 &&
idev->cnf.accept_dad < 1) ||
!(ifp->flags&IFA_F_TENTATIVE) ||
ifp->flags & IFA_F_NODAD) {
@@ -3896,8 +3924,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
* Frames right away
*/
if (ifp->flags & IFA_F_OPTIMISTIC) {
- ip6_ins_rt(ifp->rt);
- if (ipv6_use_optimistic_addr(dev_net(dev), idev)) {
+ ip6_ins_rt(net, ifp->rt);
+ if (ipv6_use_optimistic_addr(net, idev)) {
/* Because optimistic nodes can use this address,
* notify listeners. If DAD fails, RTM_DELADDR is sent.
*/
@@ -4463,6 +4491,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
[IFA_LOCAL] = { .len = sizeof(struct in6_addr) },
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .len = sizeof(u32) },
+ [IFA_RT_PRIORITY] = { .len = sizeof(u32) },
};
static int
@@ -4495,8 +4524,38 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
ifm->ifa_prefixlen);
}
-static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
- u32 prefered_lft, u32 valid_lft)
+static int modify_prefix_route(struct inet6_ifaddr *ifp,
+ unsigned long expires, u32 flags)
+{
+ struct fib6_info *f6i;
+
+ f6i = addrconf_get_prefix_route(&ifp->addr,
+ ifp->prefix_len,
+ ifp->idev->dev,
+ 0, RTF_GATEWAY | RTF_DEFAULT);
+ if (!f6i)
+ return -ENOENT;
+
+ if (f6i->fib6_metric != ifp->rt_priority) {
+ /* add new one */
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ ifp->rt_priority, ifp->idev->dev,
+ expires, flags, GFP_KERNEL);
+ /* delete old one */
+ ip6_del_rt(dev_net(ifp->idev->dev), f6i);
+ } else {
+ if (!expires)
+ fib6_clean_expires(f6i);
+ else
+ fib6_set_expires(f6i, expires);
+
+ fib6_info_release(f6i);
+ }
+
+ return 0;
+}
+
+static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
{
u32 flags;
clock_t expires;
@@ -4506,32 +4565,32 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
ASSERT_RTNL();
- if (!valid_lft || (prefered_lft > valid_lft))
+ if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
return -EINVAL;
- if (ifa_flags & IFA_F_MANAGETEMPADDR &&
+ if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR &&
(ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
return -EINVAL;
if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED)
- ifa_flags &= ~IFA_F_OPTIMISTIC;
+ cfg->ifa_flags &= ~IFA_F_OPTIMISTIC;
- timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
if (addrconf_finite_timeout(timeout)) {
expires = jiffies_to_clock_t(timeout * HZ);
- valid_lft = timeout;
+ cfg->valid_lft = timeout;
flags = RTF_EXPIRES;
} else {
expires = 0;
flags = 0;
- ifa_flags |= IFA_F_PERMANENT;
+ cfg->ifa_flags |= IFA_F_PERMANENT;
}
- timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
if (addrconf_finite_timeout(timeout)) {
if (timeout == 0)
- ifa_flags |= IFA_F_DEPRECATED;
- prefered_lft = timeout;
+ cfg->ifa_flags |= IFA_F_DEPRECATED;
+ cfg->preferred_lft = timeout;
}
spin_lock_bh(&ifp->lock);
@@ -4541,18 +4600,30 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD |
IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
IFA_F_NOPREFIXROUTE);
- ifp->flags |= ifa_flags;
+ ifp->flags |= cfg->ifa_flags;
ifp->tstamp = jiffies;
- ifp->valid_lft = valid_lft;
- ifp->prefered_lft = prefered_lft;
+ ifp->valid_lft = cfg->valid_lft;
+ ifp->prefered_lft = cfg->preferred_lft;
+
+ if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority)
+ ifp->rt_priority = cfg->rt_priority;
spin_unlock_bh(&ifp->lock);
if (!(ifp->flags&IFA_F_TENTATIVE))
ipv6_ifa_notify(0, ifp);
- if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
- addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev,
- expires, flags);
+ if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
+ int rc = -ENOENT;
+
+ if (had_prefixroute)
+ rc = modify_prefix_route(ifp, expires, flags);
+
+ /* prefix route could have been deleted; if so restore it */
+ if (rc == -ENOENT) {
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ ifp->rt_priority, ifp->idev->dev,
+ expires, flags, GFP_KERNEL);
+ }
} else if (had_prefixroute) {
enum cleanup_prefix_rt_t action;
unsigned long rt_expires;
@@ -4568,10 +4639,14 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
}
if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) {
- if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR))
- valid_lft = prefered_lft = 0;
- manage_tempaddrs(ifp->idev, ifp, valid_lft, prefered_lft,
- !was_managetempaddr, jiffies);
+ if (was_managetempaddr &&
+ !(ifp->flags & IFA_F_MANAGETEMPADDR)) {
+ cfg->valid_lft = 0;
+ cfg->preferred_lft = 0;
+ }
+ manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft,
+ cfg->preferred_lft, !was_managetempaddr,
+ jiffies);
}
addrconf_verify_rtnl();
@@ -4586,12 +4661,11 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
- struct in6_addr *pfx, *peer_pfx;
+ struct in6_addr *peer_pfx;
struct inet6_ifaddr *ifa;
struct net_device *dev;
struct inet6_dev *idev;
- u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME;
- u32 ifa_flags;
+ struct ifa6_config cfg;
int err;
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy,
@@ -4599,60 +4673,70 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
+ memset(&cfg, 0, sizeof(cfg));
+
ifm = nlmsg_data(nlh);
- pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
- if (!pfx)
+ cfg.pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
+ if (!cfg.pfx)
return -EINVAL;
+ cfg.peer_pfx = peer_pfx;
+ cfg.plen = ifm->ifa_prefixlen;
+ if (tb[IFA_RT_PRIORITY])
+ cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
+ cfg.valid_lft = INFINITY_LIFE_TIME;
+ cfg.preferred_lft = INFINITY_LIFE_TIME;
+
if (tb[IFA_CACHEINFO]) {
struct ifa_cacheinfo *ci;
ci = nla_data(tb[IFA_CACHEINFO]);
- valid_lft = ci->ifa_valid;
- preferred_lft = ci->ifa_prefered;
- } else {
- preferred_lft = INFINITY_LIFE_TIME;
- valid_lft = INFINITY_LIFE_TIME;
+ cfg.valid_lft = ci->ifa_valid;
+ cfg.preferred_lft = ci->ifa_prefered;
}
dev = __dev_get_by_index(net, ifm->ifa_index);
if (!dev)
return -ENODEV;
- ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags;
+ if (tb[IFA_FLAGS])
+ cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]);
+ else
+ cfg.ifa_flags = ifm->ifa_flags;
/* We ignore other flags so far. */
- ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
- IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
+ cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS |
+ IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE |
+ IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
idev = ipv6_find_idev(dev);
if (IS_ERR(idev))
return PTR_ERR(idev);
if (!ipv6_allow_optimistic_dad(net, idev))
- ifa_flags &= ~IFA_F_OPTIMISTIC;
+ cfg.ifa_flags &= ~IFA_F_OPTIMISTIC;
- if (ifa_flags & IFA_F_NODAD && ifa_flags & IFA_F_OPTIMISTIC) {
+ if (cfg.ifa_flags & IFA_F_NODAD &&
+ cfg.ifa_flags & IFA_F_OPTIMISTIC) {
NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive");
return -EINVAL;
}
- ifa = ipv6_get_ifaddr(net, pfx, dev, 1);
+ ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1);
if (!ifa) {
/*
* It would be best to check for !NLM_F_CREATE here but
* userspace already relies on not having to provide this.
*/
- return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx,
- ifm->ifa_prefixlen, ifa_flags,
- preferred_lft, valid_lft, extack);
+ return inet6_addr_add(net, ifm->ifa_index, &cfg, extack);
}
if (nlh->nlmsg_flags & NLM_F_EXCL ||
!(nlh->nlmsg_flags & NLM_F_REPLACE))
err = -EEXIST;
else
- err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft);
+ err = inet6_addr_modify(ifa, &cfg);
in6_ifa_put(ifa);
@@ -4703,7 +4787,8 @@ static inline int inet6_ifaddr_msgsize(void)
+ nla_total_size(16) /* IFA_LOCAL */
+ nla_total_size(16) /* IFA_ADDRESS */
+ nla_total_size(sizeof(struct ifa_cacheinfo))
- + nla_total_size(4) /* IFA_FLAGS */;
+ + nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(4) /* IFA_RT_PRIORITY */;
}
static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
@@ -4749,6 +4834,10 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0)
goto error;
+ if (ifa->rt_priority &&
+ nla_put_u32(skb, IFA_RT_PRIORITY, ifa->rt_priority))
+ goto error;
+
if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0)
goto error;
@@ -4792,9 +4881,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
u32 portid, u32 seq, int event, unsigned int flags)
{
+ struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
+ int ifindex = dev ? dev->ifindex : 1;
struct nlmsghdr *nlh;
u8 scope = RT_SCOPE_UNIVERSE;
- int ifindex = ifaca->aca_idev->dev->ifindex;
if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
scope = RT_SCOPE_SITE;
@@ -5017,14 +5107,6 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
struct net *net = dev_net(ifa->idev->dev);
int err = -ENOBUFS;
- /* Don't send DELADDR notification for TENTATIVE address,
- * since NEWADDR notification is sent only after removing
- * TENTATIVE flag, if DAD has not failed.
- */
- if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) &&
- event == RTM_DELADDR)
- return;
-
skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
if (!skb)
goto errout;
@@ -5595,29 +5677,30 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
* our DAD process, so we don't need
* to do it again
*/
- if (!rcu_access_pointer(ifp->rt->rt6i_node))
- ip6_ins_rt(ifp->rt);
+ if (!rcu_access_pointer(ifp->rt->fib6_node))
+ ip6_ins_rt(net, ifp->rt);
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);
if (!ipv6_addr_any(&ifp->peer_addr))
- addrconf_prefix_route(&ifp->peer_addr, 128,
- ifp->idev->dev, 0, 0);
+ addrconf_prefix_route(&ifp->peer_addr, 128, 0,
+ ifp->idev->dev, 0, 0,
+ GFP_ATOMIC);
break;
case RTM_DELADDR:
if (ifp->idev->cnf.forwarding)
addrconf_leave_anycast(ifp);
addrconf_leave_solict(ifp->idev, &ifp->addr);
if (!ipv6_addr_any(&ifp->peer_addr)) {
- struct rt6_info *rt;
+ struct fib6_info *rt;
rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
ifp->idev->dev, 0, 0);
if (rt)
- ip6_del_rt(rt);
+ ip6_del_rt(net, rt);
}
if (ifp->rt) {
- if (dst_hold_safe(&ifp->rt->dst))
- ip6_del_rt(ifp->rt);
+ ip6_del_rt(net, ifp->rt);
+ ifp->rt = NULL;
}
rt_genid_bump_ipv6(net);
break;
@@ -5964,11 +6047,11 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
list_for_each_entry(ifa, &idev->addr_list, if_list) {
spin_lock(&ifa->lock);
if (ifa->rt) {
- struct rt6_info *rt = ifa->rt;
+ struct fib6_info *rt = ifa->rt;
int cpu;
rcu_read_lock();
- addrconf_set_nopolicy(ifa->rt, val);
+ ifa->rt->dst_nopolicy = val ? true : false;
if (rt->rt6i_pcpu) {
for_each_possible_cpu(cpu) {
struct rt6_info **rtp;
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 32b564dfd02a..5cd0029d930e 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -134,8 +134,47 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
return -EAFNOSUPPORT;
}
+static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
+{
+ return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6, int flags)
+{
+ return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags)
+{
+ return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
+ struct flowi6 *fl6, int oif,
+ const struct sk_buff *skb, int strict)
+{
+ return f6i;
+}
+
+static u32
+eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
+ struct in6_addr *saddr)
+{
+ return 0;
+}
+
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
- .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+ .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+ .fib6_get_table = eafnosupport_fib6_get_table,
+ .fib6_table_lookup = eafnosupport_fib6_table_lookup,
+ .fib6_lookup = eafnosupport_fib6_lookup,
+ .fib6_multipath_select = eafnosupport_fib6_multipath_select,
+ .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
};
EXPORT_SYMBOL_GPL(ipv6_stub);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d443c18b45fe..74f2a261e8df 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -273,33 +273,8 @@ out_rcu_unlock:
goto out;
}
-
-/* bind for INET6 API */
-int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
-{
- struct sock *sk = sock->sk;
- int err = 0;
-
- /* If the socket has its own bind function then use it. */
- if (sk->sk_prot->bind)
- return sk->sk_prot->bind(sk, uaddr, addr_len);
-
- if (addr_len < SIN6_LEN_RFC2133)
- return -EINVAL;
-
- /* BPF prog is run before any checks are done so that if the prog
- * changes context in a wrong way it will be caught.
- */
- err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
- if (err)
- return err;
-
- return __inet6_bind(sk, uaddr, addr_len, false, true);
-}
-EXPORT_SYMBOL(inet6_bind);
-
-int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
- bool force_bind_address_no_port, bool with_lock)
+static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+ bool force_bind_address_no_port, bool with_lock)
{
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
struct inet_sock *inet = inet_sk(sk);
@@ -444,6 +419,30 @@ out_unlock:
goto out;
}
+/* bind for INET6 API */
+int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ /* If the socket has its own bind function then use it. */
+ if (sk->sk_prot->bind)
+ return sk->sk_prot->bind(sk, uaddr, addr_len);
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ /* BPF prog is run before any checks are done so that if the prog
+ * changes context in a wrong way it will be caught.
+ */
+ err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
+ if (err)
+ return err;
+
+ return __inet6_bind(sk, uaddr, addr_len, false, true);
+}
+EXPORT_SYMBOL(inet6_bind);
+
int inet6_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -579,7 +578,9 @@ const struct proto_ops inet6_stream_ops = {
.getsockopt = sock_common_getsockopt, /* ok */
.sendmsg = inet_sendmsg, /* ok */
.recvmsg = inet_recvmsg, /* ok */
- .mmap = sock_no_mmap,
+#ifdef CONFIG_MMU
+ .mmap = tcp_mmap,
+#endif
.sendpage = inet_sendpage,
.sendmsg_locked = tcp_sendmsg_locked,
.sendpage_locked = tcp_sendpage_locked,
@@ -590,6 +591,7 @@ const struct proto_ops inet6_stream_ops = {
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
#endif
+ .set_rcvlowat = tcp_set_rcvlowat,
};
const struct proto_ops inet6_dgram_ops = {
@@ -887,7 +889,12 @@ static struct pernet_operations inet6_net_ops = {
static const struct ipv6_stub ipv6_stub_impl = {
.ipv6_sock_mc_join = ipv6_sock_mc_join,
.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
- .ipv6_dst_lookup = ip6_dst_lookup,
+ .ipv6_dst_lookup = ip6_dst_lookup,
+ .fib6_get_table = fib6_get_table,
+ .fib6_table_lookup = fib6_table_lookup,
+ .fib6_lookup = fib6_lookup,
+ .fib6_multipath_select = fib6_multipath_select,
+ .ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
.udpv6_encap_enable = udpv6_encap_enable,
.ndisc_send_na = ndisc_send_na,
.nd_tbl = &nd_tbl,
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index ebeaf47d5c8d..4e0ff7031edd 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -212,16 +212,14 @@ static void aca_get(struct ifacaddr6 *aca)
static void aca_put(struct ifacaddr6 *ac)
{
if (refcount_dec_and_test(&ac->aca_refcnt)) {
- in6_dev_put(ac->aca_idev);
- dst_release(&ac->aca_rt->dst);
+ fib6_info_release(ac->aca_rt);
kfree(ac);
}
}
-static struct ifacaddr6 *aca_alloc(struct rt6_info *rt,
+static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
const struct in6_addr *addr)
{
- struct inet6_dev *idev = rt->rt6i_idev;
struct ifacaddr6 *aca;
aca = kzalloc(sizeof(*aca), GFP_ATOMIC);
@@ -229,9 +227,8 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt,
return NULL;
aca->aca_addr = *addr;
- in6_dev_hold(idev);
- aca->aca_idev = idev;
- aca->aca_rt = rt;
+ fib6_info_hold(f6i);
+ aca->aca_rt = f6i;
aca->aca_users = 1;
/* aca_tstamp should be updated upon changes */
aca->aca_cstamp = aca->aca_tstamp = jiffies;
@@ -246,7 +243,8 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt,
int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
{
struct ifacaddr6 *aca;
- struct rt6_info *rt;
+ struct fib6_info *f6i;
+ struct net *net;
int err;
ASSERT_RTNL();
@@ -265,14 +263,15 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
}
}
- rt = addrconf_dst_alloc(idev, addr, true);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
+ net = dev_net(idev->dev);
+ f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC);
+ if (IS_ERR(f6i)) {
+ err = PTR_ERR(f6i);
goto out;
}
- aca = aca_alloc(rt, addr);
+ aca = aca_alloc(f6i, addr);
if (!aca) {
- ip6_rt_put(rt);
+ fib6_info_release(f6i);
err = -ENOMEM;
goto out;
}
@@ -286,7 +285,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
aca_get(aca);
write_unlock_bh(&idev->lock);
- ip6_ins_rt(rt);
+ ip6_ins_rt(net, f6i);
addrconf_join_solict(idev->dev, &aca->aca_addr);
@@ -328,8 +327,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
write_unlock_bh(&idev->lock);
addrconf_leave_solict(idev, &aca->aca_addr);
- dst_hold(&aca->aca_rt->dst);
- ip6_del_rt(aca->aca_rt);
+ ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
aca_put(aca);
return 0;
@@ -356,8 +354,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev)
addrconf_leave_solict(idev, &aca->aca_addr);
- dst_hold(&aca->aca_rt->dst);
- ip6_del_rt(aca->aca_rt);
+ ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
aca_put(aca);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index bc68eb661970..5bc2bf3733ab 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -280,6 +280,7 @@ static const struct tlvtype_proc tlvprocdestopt_lst[] = {
static int ipv6_destopt_rcv(struct sk_buff *skb)
{
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
struct inet6_skb_parm *opt = IP6CB(skb);
#if IS_ENABLED(CONFIG_IPV6_MIP6)
__u16 dstbuf;
@@ -291,7 +292,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
!pskb_may_pull(skb, (skb_transport_offset(skb) +
((skb_transport_header(skb)[1] + 1) << 3)))) {
- __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
+ __IP6_INC_STATS(dev_net(dst->dev), idev,
IPSTATS_MIB_INHDRERRORS);
fail_and_free:
kfree_skb(skb);
@@ -319,8 +320,7 @@ fail_and_free:
return 1;
}
- __IP6_INC_STATS(dev_net(dst->dev),
- ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
return -1;
}
@@ -416,8 +416,7 @@ looped_back:
}
if (hdr->segments_left >= (hdr->hdrlen >> 1)) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
((&hdr->segments_left) -
skb_network_header(skb)));
@@ -456,8 +455,7 @@ looped_back:
if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
if (ipv6_hdr(skb)->hop_limit <= 1) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_send(skb, ICMPV6_TIME_EXCEED,
ICMPV6_EXC_HOPLIMIT, 0);
kfree_skb(skb);
@@ -481,10 +479,10 @@ looped_back:
/* called with rcu_read_lock() */
static int ipv6_rthdr_rcv(struct sk_buff *skb)
{
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
struct inet6_skb_parm *opt = IP6CB(skb);
struct in6_addr *addr = NULL;
struct in6_addr daddr;
- struct inet6_dev *idev;
int n, i;
struct ipv6_rt_hdr *hdr;
struct rt0_hdr *rthdr;
@@ -498,8 +496,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
!pskb_may_pull(skb, (skb_transport_offset(skb) +
((skb_transport_header(skb)[1] + 1) << 3)))) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -508,8 +505,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) ||
skb->pkt_type != PACKET_HOST) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -527,7 +523,7 @@ looped_back:
* processed by own
*/
if (!addr) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ __IP6_INC_STATS(net, idev,
IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
@@ -553,8 +549,7 @@ looped_back:
goto unknown_rh;
/* Silently discard invalid RTH type 2 */
if (hdr->hdrlen != 2 || hdr->segments_left != 1) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -572,8 +567,7 @@ looped_back:
n = hdr->hdrlen >> 1;
if (hdr->segments_left > n) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
((&hdr->segments_left) -
skb_network_header(skb)));
@@ -609,14 +603,12 @@ looped_back:
if (xfrm6_input_addr(skb, (xfrm_address_t *)addr,
(xfrm_address_t *)&ipv6_hdr(skb)->saddr,
IPPROTO_ROUTING) < 0) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -627,8 +619,7 @@ looped_back:
}
if (ipv6_addr_is_multicast(addr)) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -647,8 +638,7 @@ looped_back:
if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) {
if (ipv6_hdr(skb)->hop_limit <= 1) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
0);
kfree_skb(skb);
@@ -663,7 +653,7 @@ looped_back:
return -1;
unknown_rh:
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
(&hdr->type) - skb_network_header(skb));
return -1;
@@ -755,34 +745,31 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
{
const unsigned char *nh = skb_network_header(skb);
+ struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
struct net *net = ipv6_skb_net(skb);
u32 pkt_len;
if (nh[optoff + 1] != 4 || (optoff & 3) != 2) {
net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
nh[optoff+1]);
- __IP6_INC_STATS(net, ipv6_skb_idev(skb),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
goto drop;
}
pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));
if (pkt_len <= IPV6_MAXPLEN) {
- __IP6_INC_STATS(net, ipv6_skb_idev(skb),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2);
return false;
}
if (ipv6_hdr(skb)->payload_len) {
- __IP6_INC_STATS(net, ipv6_skb_idev(skb),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff);
return false;
}
if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
- __IP6_INC_STATS(net, ipv6_skb_idev(skb),
- IPSTATS_MIB_INTRUNCATEDPKTS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
}
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index b643f5ce6c80..ae365df8abf7 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -161,7 +161,7 @@ EXPORT_SYMBOL_GPL(ipv6_find_tlv);
* if target < 0. "last header" is transport protocol header, ESP, or
* "No next header".
*
- * Note that *offset is used as input/output parameter. an if it is not zero,
+ * Note that *offset is used as input/output parameter, and if it is not zero,
* then it must be a valid offset to an inner IPv6 header. This can be used
* to explore inner IPv6 header, eg. ICMPv6 error messages.
*
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index df113c7b5fc8..f590446595d8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net)
return fib_rules_seq_read(net, AF_INET6);
}
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags)
+{
+ struct fib6_info *f6i;
+ int err;
+
+ if (net->ipv6.fib6_has_custom_rules) {
+ struct fib_lookup_arg arg = {
+ .lookup_ptr = fib6_table_lookup,
+ .lookup_data = &oif,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+
+ l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+ err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
+ flowi6_to_flowi(fl6), flags, &arg);
+ if (err)
+ return ERR_PTR(err);
+
+ f6i = arg.result ? : net->ipv6.fib6_null_entry;
+ } else {
+ f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
+ oif, fl6, flags);
+ if (!f6i || f6i == net->ipv6.fib6_null_entry)
+ f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+ oif, fl6, flags);
+ }
+
+ return f6i;
+}
+
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb,
int flags, pol_lookup_t lookup)
@@ -96,8 +129,73 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
return &net->ipv6.ip6_null_entry->dst;
}
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
- int flags, struct fib_lookup_arg *arg)
+static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
+ struct flowi6 *flp6, const struct net_device *dev)
+{
+ struct fib6_rule *r = (struct fib6_rule *)rule;
+
+ /* If we need to find a source address for this traffic,
+ * we check the result if it meets requirement of the rule.
+ */
+ if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+ r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+ struct in6_addr saddr;
+
+ if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
+ rt6_flags2srcprefs(flags), &saddr))
+ return -EAGAIN;
+
+ if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
+ return -EAGAIN;
+
+ flp6->saddr = saddr;
+ }
+
+ return 0;
+}
+
+static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct flowi6 *flp6 = &flp->u.ip6;
+ struct net *net = rule->fr_net;
+ struct fib6_table *table;
+ struct fib6_info *f6i;
+ int err = -EAGAIN, *oif;
+ u32 tb_id;
+
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
+
+ tb_id = fib_rule_get_table(rule, arg);
+ table = fib6_get_table(net, tb_id);
+ if (!table)
+ return -EAGAIN;
+
+ oif = (int *)arg->lookup_data;
+ f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
+ if (f6i != net->ipv6.fib6_null_entry) {
+ err = fib6_rule_saddr(net, rule, flags, flp6,
+ fib6_info_nh_dev(f6i));
+
+ if (likely(!err))
+ arg->result = f6i;
+ }
+
+ return err;
+}
+
+static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
{
struct flowi6 *flp6 = &flp->u.ip6;
struct rt6_info *rt = NULL;
@@ -134,27 +232,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
rt = lookup(net, table, flp6, arg->lookup_data, flags);
if (rt != net->ipv6.ip6_null_entry) {
- struct fib6_rule *r = (struct fib6_rule *)rule;
-
- /*
- * If we need to find a source address for this traffic,
- * we check the result if it meets requirement of the rule.
- */
- if ((rule->flags & FIB_RULE_FIND_SADDR) &&
- r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
- struct in6_addr saddr;
-
- if (ipv6_dev_get_saddr(net,
- ip6_dst_idev(&rt->dst)->dev,
- &flp6->daddr,
- rt6_flags2srcprefs(flags),
- &saddr))
- goto again;
- if (!ipv6_prefix_equal(&saddr, &r->src.addr,
- r->src.plen))
- goto again;
- flp6->saddr = saddr;
- }
+ err = fib6_rule_saddr(net, rule, flags, flp6,
+ ip6_dst_idev(&rt->dst)->dev);
+
+ if (err == -EAGAIN)
+ goto again;
+
err = rt->dst.error;
if (err != -EAGAIN)
goto out;
@@ -172,6 +255,15 @@ out:
return err;
}
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ if (arg->lookup_ptr == fib6_table_lookup)
+ return fib6_rule_action_alt(rule, flp, flags, arg);
+
+ return __fib6_rule_action(rule, flp, flags, arg);
+}
+
static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
{
struct rt6_info *rt = (struct rt6_info *) arg->result;
@@ -245,15 +337,18 @@ static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = {
static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
- struct nlattr **tb)
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
int err = -EINVAL;
struct net *net = sock_net(skb->sk);
struct fib6_rule *rule6 = (struct fib6_rule *) rule;
if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
- if (rule->table == RT6_TABLE_UNSPEC)
+ if (rule->table == RT6_TABLE_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Invalid table");
goto errout;
+ }
if (fib6_new_table(net, rule->table) == NULL) {
err = -ENOBUFS;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 01372dd74e38..7aa4c41a3bd9 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -43,7 +43,7 @@ static struct kmem_cache *fib6_node_kmem __read_mostly;
struct fib6_cleaner {
struct fib6_walker w;
struct net *net;
- int (*func)(struct rt6_info *, void *arg);
+ int (*func)(struct fib6_info *, void *arg);
int sernum;
void *arg;
};
@@ -54,7 +54,7 @@ struct fib6_cleaner {
#define FWS_INIT FWS_L
#endif
-static struct rt6_info *fib6_find_prefix(struct net *net,
+static struct fib6_info *fib6_find_prefix(struct net *net,
struct fib6_table *table,
struct fib6_node *fn);
static struct fib6_node *fib6_repair_tree(struct net *net,
@@ -105,13 +105,12 @@ enum {
FIB6_NO_SERNUM_CHANGE = 0,
};
-void fib6_update_sernum(struct rt6_info *rt)
+void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
{
- struct net *net = dev_net(rt->dst.dev);
struct fib6_node *fn;
- fn = rcu_dereference_protected(rt->rt6i_node,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ fn = rcu_dereference_protected(f6i->fib6_node,
+ lockdep_is_held(&f6i->fib6_table->tb6_lock));
if (fn)
fn->fn_sernum = fib6_new_sernum(net);
}
@@ -146,6 +145,69 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
addr[fn_bit >> 5];
}
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
+{
+ struct fib6_info *f6i;
+
+ f6i = kzalloc(sizeof(*f6i), gfp_flags);
+ if (!f6i)
+ return NULL;
+
+ f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+ if (!f6i->rt6i_pcpu) {
+ kfree(f6i);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&f6i->fib6_siblings);
+ f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
+
+ atomic_inc(&f6i->fib6_ref);
+
+ return f6i;
+}
+
+void fib6_info_destroy(struct fib6_info *f6i)
+{
+ struct rt6_exception_bucket *bucket;
+ struct dst_metrics *m;
+
+ WARN_ON(f6i->fib6_node);
+
+ bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
+ if (bucket) {
+ f6i->rt6i_exception_bucket = NULL;
+ kfree(bucket);
+ }
+
+ if (f6i->rt6i_pcpu) {
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **ppcpu_rt;
+ struct rt6_info *pcpu_rt;
+
+ ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+ pcpu_rt = *ppcpu_rt;
+ if (pcpu_rt) {
+ dst_dev_put(&pcpu_rt->dst);
+ dst_release(&pcpu_rt->dst);
+ *ppcpu_rt = NULL;
+ }
+ }
+ }
+
+ if (f6i->fib6_nh.nh_dev)
+ dev_put(f6i->fib6_nh.nh_dev);
+
+ m = f6i->fib6_metrics;
+ if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
+ kfree(m);
+
+ kfree(f6i);
+}
+EXPORT_SYMBOL_GPL(fib6_info_destroy);
+
static struct fib6_node *node_alloc(struct net *net)
{
struct fib6_node *fn;
@@ -176,28 +238,6 @@ static void node_free(struct net *net, struct fib6_node *fn)
net->ipv6.rt6_stats->fib_nodes--;
}
-void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
-{
- int cpu;
-
- if (!non_pcpu_rt->rt6i_pcpu)
- return;
-
- for_each_possible_cpu(cpu) {
- struct rt6_info **ppcpu_rt;
- struct rt6_info *pcpu_rt;
-
- ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);
- pcpu_rt = *ppcpu_rt;
- if (pcpu_rt) {
- dst_dev_put(&pcpu_rt->dst);
- dst_release(&pcpu_rt->dst);
- *ppcpu_rt = NULL;
- }
- }
-}
-EXPORT_SYMBOL_GPL(rt6_free_pcpu);
-
static void fib6_free_table(struct fib6_table *table)
{
inetpeer_invalidate_tree(&table->tb6_peers);
@@ -232,7 +272,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
if (table) {
table->tb6_id = id;
rcu_assign_pointer(table->tb6_root.leaf,
- net->ipv6.ip6_null_entry);
+ net->ipv6.fib6_null_entry);
table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&table->tb6_peers);
}
@@ -314,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
return &rt->dst;
}
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags)
+{
+ return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
+}
+
static void __net_init fib6_tables_init(struct net *net)
{
fib6_link_table(net, net->ipv6.fib6_main_tbl);
@@ -340,7 +387,7 @@ unsigned int fib6_tables_seq_read(struct net *net)
static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
enum fib_event_type event_type,
- struct rt6_info *rt)
+ struct fib6_info *rt)
{
struct fib6_entry_notifier_info info = {
.rt = rt,
@@ -351,7 +398,7 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
static int call_fib6_entry_notifiers(struct net *net,
enum fib_event_type event_type,
- struct rt6_info *rt,
+ struct fib6_info *rt,
struct netlink_ext_ack *extack)
{
struct fib6_entry_notifier_info info = {
@@ -359,7 +406,7 @@ static int call_fib6_entry_notifiers(struct net *net,
.rt = rt,
};
- rt->rt6i_table->fib_seq++;
+ rt->fib6_table->fib_seq++;
return call_fib6_notifiers(net, event_type, &info.info);
}
@@ -368,16 +415,16 @@ struct fib6_dump_arg {
struct notifier_block *nb;
};
-static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg)
+static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
{
- if (rt == arg->net->ipv6.ip6_null_entry)
+ if (rt == arg->net->ipv6.fib6_null_entry)
return;
call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
}
static int fib6_node_dump(struct fib6_walker *w)
{
- struct rt6_info *rt;
+ struct fib6_info *rt;
for_each_fib6_walker_rt(w)
fib6_rt_dump(rt, w->args);
@@ -426,7 +473,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb)
static int fib6_dump_node(struct fib6_walker *w)
{
int res;
- struct rt6_info *rt;
+ struct fib6_info *rt;
for_each_fib6_walker_rt(w) {
res = rt6_dump_route(rt, w->args);
@@ -441,10 +488,10 @@ static int fib6_dump_node(struct fib6_walker *w)
* last sibling of this route (no need to dump the
* sibling routes again)
*/
- if (rt->rt6i_nsiblings)
- rt = list_last_entry(&rt->rt6i_siblings,
- struct rt6_info,
- rt6i_siblings);
+ if (rt->fib6_nsiblings)
+ rt = list_last_entry(&rt->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
}
w->leaf = NULL;
return 0;
@@ -579,6 +626,24 @@ out:
return res;
}
+void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
+{
+ if (!f6i)
+ return;
+
+ if (f6i->fib6_metrics == &dst_default_metrics) {
+ struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC);
+
+ if (!p)
+ return;
+
+ refcount_set(&p->refcnt, 1);
+ f6i->fib6_metrics = p;
+ }
+
+ f6i->fib6_metrics->metrics[metric - 1] = val;
+}
+
/*
* Routing Table
*
@@ -608,7 +673,7 @@ static struct fib6_node *fib6_add_1(struct net *net,
fn = root;
do {
- struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+ struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&table->tb6_lock));
key = (struct rt6key *)((u8 *)leaf + offset);
@@ -637,11 +702,11 @@ static struct fib6_node *fib6_add_1(struct net *net,
/* clean up an intermediate node */
if (!(fn->fn_flags & RTN_RTINFO)) {
RCU_INIT_POINTER(fn->leaf, NULL);
- rt6_release(leaf);
+ fib6_info_release(leaf);
/* remove null_entry in the root node */
} else if (fn->fn_flags & RTN_TL_ROOT &&
rcu_access_pointer(fn->leaf) ==
- net->ipv6.ip6_null_entry) {
+ net->ipv6.fib6_null_entry) {
RCU_INIT_POINTER(fn->leaf, NULL);
}
@@ -750,7 +815,7 @@ insert_above:
RCU_INIT_POINTER(in->parent, pn);
in->leaf = fn->leaf;
atomic_inc(&rcu_dereference_protected(in->leaf,
- lockdep_is_held(&table->tb6_lock))->rt6i_ref);
+ lockdep_is_held(&table->tb6_lock))->fib6_ref);
/* update parent pointer */
if (dir)
@@ -802,44 +867,37 @@ insert_above:
return ln;
}
-static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
-{
- int i;
-
- for (i = 0; i < RTAX_MAX; i++) {
- if (test_bit(i, mxc->mx_valid))
- mp[i] = mxc->mx[i];
- }
-}
-
-static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc)
+static void fib6_drop_pcpu_from(struct fib6_info *f6i,
+ const struct fib6_table *table)
{
- if (!mxc->mx)
- return 0;
-
- if (dst->flags & DST_HOST) {
- u32 *mp = dst_metrics_write_ptr(dst);
+ int cpu;
- if (unlikely(!mp))
- return -ENOMEM;
+ /* release the reference to this fib entry from
+ * all of its cached pcpu routes
+ */
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **ppcpu_rt;
+ struct rt6_info *pcpu_rt;
- fib6_copy_metrics(mp, mxc);
- } else {
- dst_init_metrics(dst, mxc->mx, false);
+ ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+ pcpu_rt = *ppcpu_rt;
+ if (pcpu_rt) {
+ struct fib6_info *from;
- /* We've stolen mx now. */
- mxc->mx = NULL;
+ from = rcu_dereference_protected(pcpu_rt->from,
+ lockdep_is_held(&table->tb6_lock));
+ rcu_assign_pointer(pcpu_rt->from, NULL);
+ fib6_info_release(from);
+ }
}
-
- return 0;
}
-static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
+static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
struct net *net)
{
- struct fib6_table *table = rt->rt6i_table;
+ struct fib6_table *table = rt->fib6_table;
- if (atomic_read(&rt->rt6i_ref) != 1) {
+ if (atomic_read(&rt->fib6_ref) != 1) {
/* This route is used as dummy address holder in some split
* nodes. It is not leaked, but it still holds other resources,
* which must be released in time. So, scan ascendant nodes
@@ -847,18 +905,22 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
* to still alive ones.
*/
while (fn) {
- struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+ struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&table->tb6_lock));
- struct rt6_info *new_leaf;
+ struct fib6_info *new_leaf;
if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
new_leaf = fib6_find_prefix(net, table, fn);
- atomic_inc(&new_leaf->rt6i_ref);
+ atomic_inc(&new_leaf->fib6_ref);
+
rcu_assign_pointer(fn->leaf, new_leaf);
- rt6_release(rt);
+ fib6_info_release(rt);
}
fn = rcu_dereference_protected(fn->parent,
lockdep_is_held(&table->tb6_lock));
}
+
+ if (rt->rt6i_pcpu)
+ fib6_drop_pcpu_from(rt, table);
}
}
@@ -866,37 +928,37 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
* Insert routing information in a node.
*/
-static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
- struct nl_info *info, struct mx6_config *mxc,
+static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
+ struct nl_info *info,
struct netlink_ext_ack *extack)
{
- struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
- struct rt6_info *iter = NULL;
- struct rt6_info __rcu **ins;
- struct rt6_info __rcu **fallback_ins = NULL;
+ struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ struct fib6_info *iter = NULL, *match = NULL;
+ struct fib6_info __rcu **ins;
int replace = (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_REPLACE));
+ int append = (info->nlh &&
+ (info->nlh->nlmsg_flags & NLM_F_APPEND));
int add = (!info->nlh ||
(info->nlh->nlmsg_flags & NLM_F_CREATE));
int found = 0;
- bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
u16 nlflags = NLM_F_EXCL;
int err;
- if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
+ if (append)
nlflags |= NLM_F_APPEND;
ins = &fn->leaf;
for (iter = leaf; iter;
- iter = rcu_dereference_protected(iter->rt6_next,
- lockdep_is_held(&rt->rt6i_table->tb6_lock))) {
+ iter = rcu_dereference_protected(iter->fib6_next,
+ lockdep_is_held(&rt->fib6_table->tb6_lock))) {
/*
* Search for duplicates
*/
- if (iter->rt6i_metric == rt->rt6i_metric) {
+ if (iter->fib6_metric == rt->fib6_metric) {
/*
* Same priority level
*/
@@ -906,56 +968,32 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
nlflags &= ~NLM_F_EXCL;
if (replace) {
- if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
- found++;
- break;
- }
- if (rt_can_ecmp)
- fallback_ins = fallback_ins ?: ins;
- goto next_iter;
+ found++;
+ break;
}
if (rt6_duplicate_nexthop(iter, rt)) {
- if (rt->rt6i_nsiblings)
- rt->rt6i_nsiblings = 0;
- if (!(iter->rt6i_flags & RTF_EXPIRES))
+ if (rt->fib6_nsiblings)
+ rt->fib6_nsiblings = 0;
+ if (!(iter->fib6_flags & RTF_EXPIRES))
return -EEXIST;
- if (!(rt->rt6i_flags & RTF_EXPIRES))
- rt6_clean_expires(iter);
+ if (!(rt->fib6_flags & RTF_EXPIRES))
+ fib6_clean_expires(iter);
else
- rt6_set_expires(iter, rt->dst.expires);
- iter->rt6i_pmtu = rt->rt6i_pmtu;
+ fib6_set_expires(iter, rt->expires);
+ fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
return -EEXIST;
}
- /* If we have the same destination and the same metric,
- * but not the same gateway, then the route we try to
- * add is sibling to this route, increment our counter
- * of siblings, and later we will add our route to the
- * list.
- * Only static routes (which don't have flag
- * RTF_EXPIRES) are used for ECMPv6.
- *
- * To avoid long list, we only had siblings if the
- * route have a gateway.
- */
- if (rt_can_ecmp &&
- rt6_qualify_for_ecmp(iter))
- rt->rt6i_nsiblings++;
+
+ /* first route that matches */
+ if (!match)
+ match = iter;
}
- if (iter->rt6i_metric > rt->rt6i_metric)
+ if (iter->fib6_metric > rt->fib6_metric)
break;
-next_iter:
- ins = &iter->rt6_next;
- }
-
- if (fallback_ins && !found) {
- /* No ECMP-able route found, replace first non-ECMP one */
- ins = fallback_ins;
- iter = rcu_dereference_protected(*ins,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
- found++;
+ ins = &iter->fib6_next;
}
/* Reset round-robin state, if necessary */
@@ -963,59 +1001,56 @@ next_iter:
fn->rr_ptr = NULL;
/* Link this route to others same route. */
- if (rt->rt6i_nsiblings) {
- unsigned int rt6i_nsiblings;
- struct rt6_info *sibling, *temp_sibling;
-
- /* Find the first route that have the same metric */
- sibling = leaf;
- while (sibling) {
- if (sibling->rt6i_metric == rt->rt6i_metric &&
- rt6_qualify_for_ecmp(sibling)) {
- list_add_tail(&rt->rt6i_siblings,
- &sibling->rt6i_siblings);
- break;
- }
- sibling = rcu_dereference_protected(sibling->rt6_next,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ if (append && match) {
+ struct fib6_info *sibling, *temp_sibling;
+
+ if (rt->fib6_flags & RTF_REJECT) {
+ NL_SET_ERR_MSG(extack,
+ "Can not append a REJECT route");
+ return -EINVAL;
+ } else if (match->fib6_flags & RTF_REJECT) {
+ NL_SET_ERR_MSG(extack,
+ "Can not append to a REJECT route");
+ return -EINVAL;
}
+ rt->fib6_nsiblings = match->fib6_nsiblings;
+ list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
+ match->fib6_nsiblings++;
+
/* For each sibling in the list, increment the counter of
* siblings. BUG() if counters does not match, list of siblings
* is broken!
*/
- rt6i_nsiblings = 0;
list_for_each_entry_safe(sibling, temp_sibling,
- &rt->rt6i_siblings, rt6i_siblings) {
- sibling->rt6i_nsiblings++;
- BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings);
- rt6i_nsiblings++;
+ &match->fib6_siblings, fib6_siblings) {
+ sibling->fib6_nsiblings++;
+ BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings);
}
- BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
- rt6_multipath_rebalance(temp_sibling);
+
+ rt6_multipath_rebalance(match);
}
/*
* insert node
*/
if (!replace) {
+ enum fib_event_type event;
+
if (!add)
pr_warn("NLM_F_CREATE should be set when creating new route\n");
add:
nlflags |= NLM_F_CREATE;
- err = fib6_commit_metrics(&rt->dst, mxc);
- if (err)
- return err;
- err = call_fib6_entry_notifiers(info->nl_net,
- FIB_EVENT_ENTRY_ADD,
- rt, extack);
+ event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD;
+ err = call_fib6_entry_notifiers(info->nl_net, event, rt,
+ extack);
if (err)
return err;
- rcu_assign_pointer(rt->rt6_next, iter);
- atomic_inc(&rt->rt6i_ref);
- rcu_assign_pointer(rt->rt6i_node, fn);
+ rcu_assign_pointer(rt->fib6_next, iter);
+ atomic_inc(&rt->fib6_ref);
+ rcu_assign_pointer(rt->fib6_node, fn);
rcu_assign_pointer(*ins, rt);
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
@@ -1027,7 +1062,7 @@ add:
}
} else {
- int nsiblings;
+ struct fib6_info *tmp;
if (!found) {
if (add)
@@ -1036,67 +1071,72 @@ add:
return -ENOENT;
}
- err = fib6_commit_metrics(&rt->dst, mxc);
- if (err)
- return err;
-
err = call_fib6_entry_notifiers(info->nl_net,
FIB_EVENT_ENTRY_REPLACE,
rt, extack);
if (err)
return err;
- atomic_inc(&rt->rt6i_ref);
- rcu_assign_pointer(rt->rt6i_node, fn);
- rt->rt6_next = iter->rt6_next;
+ /* if route being replaced has siblings, set tmp to
+ * last one, otherwise tmp is current route. this is
+ * used to set fib6_next for new route
+ */
+ if (iter->fib6_nsiblings)
+ tmp = list_last_entry(&iter->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
+ else
+ tmp = iter;
+
+ /* insert new route */
+ atomic_inc(&rt->fib6_ref);
+ rcu_assign_pointer(rt->fib6_node, fn);
+ rt->fib6_next = tmp->fib6_next;
rcu_assign_pointer(*ins, rt);
+
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
if (!(fn->fn_flags & RTN_RTINFO)) {
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
fn->fn_flags |= RTN_RTINFO;
}
- nsiblings = iter->rt6i_nsiblings;
- iter->rt6i_node = NULL;
- fib6_purge_rt(iter, fn, info->nl_net);
- if (rcu_access_pointer(fn->rr_ptr) == iter)
- fn->rr_ptr = NULL;
- rt6_release(iter);
- if (nsiblings) {
+ /* delete old route */
+ rt = iter;
+
+ if (rt->fib6_nsiblings) {
+ struct fib6_info *tmp;
+
/* Replacing an ECMP route, remove all siblings */
- ins = &rt->rt6_next;
- iter = rcu_dereference_protected(*ins,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
- while (iter) {
- if (iter->rt6i_metric > rt->rt6i_metric)
- break;
- if (rt6_qualify_for_ecmp(iter)) {
- *ins = iter->rt6_next;
- iter->rt6i_node = NULL;
- fib6_purge_rt(iter, fn, info->nl_net);
- if (rcu_access_pointer(fn->rr_ptr) == iter)
- fn->rr_ptr = NULL;
- rt6_release(iter);
- nsiblings--;
- info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
- } else {
- ins = &iter->rt6_next;
- }
- iter = rcu_dereference_protected(*ins,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings,
+ fib6_siblings) {
+ iter->fib6_node = NULL;
+ fib6_purge_rt(iter, fn, info->nl_net);
+ if (rcu_access_pointer(fn->rr_ptr) == iter)
+ fn->rr_ptr = NULL;
+ fib6_info_release(iter);
+
+ rt->fib6_nsiblings--;
+ info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
}
- WARN_ON(nsiblings != 0);
}
+
+ WARN_ON(rt->fib6_nsiblings != 0);
+
+ rt->fib6_node = NULL;
+ fib6_purge_rt(rt, fn, info->nl_net);
+ if (rcu_access_pointer(fn->rr_ptr) == rt)
+ fn->rr_ptr = NULL;
+ fib6_info_release(rt);
}
return 0;
}
-static void fib6_start_gc(struct net *net, struct rt6_info *rt)
+static void fib6_start_gc(struct net *net, struct fib6_info *rt)
{
if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
- (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE)))
+ (rt->fib6_flags & RTF_EXPIRES))
mod_timer(&net->ipv6.ip6_fib_timer,
jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}
@@ -1108,22 +1148,22 @@ void fib6_force_start_gc(struct net *net)
jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}
-static void __fib6_update_sernum_upto_root(struct rt6_info *rt,
+static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
int sernum)
{
- struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
/* paired with smp_rmb() in rt6_get_cookie_safe() */
smp_wmb();
while (fn) {
fn->fn_sernum = sernum;
fn = rcu_dereference_protected(fn->parent,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
}
}
-void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt)
+void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
{
__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
}
@@ -1135,22 +1175,16 @@ void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt)
* Need to own table->tb6_lock
*/
-int fib6_add(struct fib6_node *root, struct rt6_info *rt,
- struct nl_info *info, struct mx6_config *mxc,
- struct netlink_ext_ack *extack)
+int fib6_add(struct fib6_node *root, struct fib6_info *rt,
+ struct nl_info *info, struct netlink_ext_ack *extack)
{
- struct fib6_table *table = rt->rt6i_table;
+ struct fib6_table *table = rt->fib6_table;
struct fib6_node *fn, *pn = NULL;
int err = -ENOMEM;
int allow_create = 1;
int replace_required = 0;
int sernum = fib6_new_sernum(info->nl_net);
- if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
- return -EINVAL;
- if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE))
- return -EINVAL;
-
if (info->nlh) {
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
allow_create = 0;
@@ -1161,8 +1195,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
fn = fib6_add_1(info->nl_net, table, root,
- &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
- offsetof(struct rt6_info, rt6i_dst), allow_create,
+ &rt->fib6_dst.addr, rt->fib6_dst.plen,
+ offsetof(struct fib6_info, fib6_dst), allow_create,
replace_required, extack);
if (IS_ERR(fn)) {
err = PTR_ERR(fn);
@@ -1173,7 +1207,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
pn = fn;
#ifdef CONFIG_IPV6_SUBTREES
- if (rt->rt6i_src.plen) {
+ if (rt->fib6_src.plen) {
struct fib6_node *sn;
if (!rcu_access_pointer(fn->subtree)) {
@@ -1194,16 +1228,16 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
if (!sfn)
goto failure;
- atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
+ atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref);
rcu_assign_pointer(sfn->leaf,
- info->nl_net->ipv6.ip6_null_entry);
+ info->nl_net->ipv6.fib6_null_entry);
sfn->fn_flags = RTN_ROOT;
/* Now add the first leaf node to new subtree */
sn = fib6_add_1(info->nl_net, table, sfn,
- &rt->rt6i_src.addr, rt->rt6i_src.plen,
- offsetof(struct rt6_info, rt6i_src),
+ &rt->fib6_src.addr, rt->fib6_src.plen,
+ offsetof(struct fib6_info, fib6_src),
allow_create, replace_required, extack);
if (IS_ERR(sn)) {
@@ -1221,8 +1255,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
rcu_assign_pointer(fn->subtree, sfn);
} else {
sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
- &rt->rt6i_src.addr, rt->rt6i_src.plen,
- offsetof(struct rt6_info, rt6i_src),
+ &rt->fib6_src.addr, rt->fib6_src.plen,
+ offsetof(struct fib6_info, fib6_src),
allow_create, replace_required, extack);
if (IS_ERR(sn)) {
@@ -1235,9 +1269,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
if (fn->fn_flags & RTN_TL_ROOT) {
/* put back null_entry for root node */
rcu_assign_pointer(fn->leaf,
- info->nl_net->ipv6.ip6_null_entry);
+ info->nl_net->ipv6.fib6_null_entry);
} else {
- atomic_inc(&rt->rt6i_ref);
+ atomic_inc(&rt->fib6_ref);
rcu_assign_pointer(fn->leaf, rt);
}
}
@@ -1245,7 +1279,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
}
#endif
- err = fib6_add_rt2node(fn, rt, info, mxc, extack);
+ err = fib6_add_rt2node(fn, rt, info, extack);
if (!err) {
__fib6_update_sernum_upto_root(rt, sernum);
fib6_start_gc(info->nl_net, rt);
@@ -1259,13 +1293,13 @@ out:
* super-tree leaf node we have to find a new one for it.
*/
if (pn != fn) {
- struct rt6_info *pn_leaf =
+ struct fib6_info *pn_leaf =
rcu_dereference_protected(pn->leaf,
lockdep_is_held(&table->tb6_lock));
if (pn_leaf == rt) {
pn_leaf = NULL;
RCU_INIT_POINTER(pn->leaf, NULL);
- atomic_dec(&rt->rt6i_ref);
+ fib6_info_release(rt);
}
if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
pn_leaf = fib6_find_prefix(info->nl_net, table,
@@ -1274,10 +1308,10 @@ out:
if (!pn_leaf) {
WARN_ON(!pn_leaf);
pn_leaf =
- info->nl_net->ipv6.ip6_null_entry;
+ info->nl_net->ipv6.fib6_null_entry;
}
#endif
- atomic_inc(&pn_leaf->rt6i_ref);
+ fib6_info_hold(pn_leaf);
rcu_assign_pointer(pn->leaf, pn_leaf);
}
}
@@ -1299,10 +1333,6 @@ failure:
(fn->fn_flags & RTN_TL_ROOT &&
!rcu_access_pointer(fn->leaf))))
fib6_repair_tree(info->nl_net, table, fn);
- /* Always release dst as dst->__refcnt is guaranteed
- * to be taken before entering this function
- */
- dst_release_immediate(&rt->dst);
return err;
}
@@ -1312,12 +1342,12 @@ failure:
*/
struct lookup_args {
- int offset; /* key offset on rt6_info */
+ int offset; /* key offset on fib6_info */
const struct in6_addr *addr; /* search key */
};
-static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
- struct lookup_args *args)
+static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
+ struct lookup_args *args)
{
struct fib6_node *fn;
__be32 dir;
@@ -1350,7 +1380,7 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
struct fib6_node *subtree = FIB6_SUBTREE(fn);
if (subtree || fn->fn_flags & RTN_RTINFO) {
- struct rt6_info *leaf = rcu_dereference(fn->leaf);
+ struct fib6_info *leaf = rcu_dereference(fn->leaf);
struct rt6key *key;
if (!leaf)
@@ -1362,7 +1392,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
#ifdef CONFIG_IPV6_SUBTREES
if (subtree) {
struct fib6_node *sfn;
- sfn = fib6_lookup_1(subtree, args + 1);
+ sfn = fib6_node_lookup_1(subtree,
+ args + 1);
if (!sfn)
goto backtrack;
fn = sfn;
@@ -1384,18 +1415,19 @@ backtrack:
/* called with rcu_read_lock() held
*/
-struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
- const struct in6_addr *saddr)
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
struct fib6_node *fn;
struct lookup_args args[] = {
{
- .offset = offsetof(struct rt6_info, rt6i_dst),
+ .offset = offsetof(struct fib6_info, fib6_dst),
.addr = daddr,
},
#ifdef CONFIG_IPV6_SUBTREES
{
- .offset = offsetof(struct rt6_info, rt6i_src),
+ .offset = offsetof(struct fib6_info, fib6_src),
.addr = saddr,
},
#endif
@@ -1404,7 +1436,7 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
}
};
- fn = fib6_lookup_1(root, daddr ? args : args + 1);
+ fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
if (!fn || fn->fn_flags & RTN_TL_ROOT)
fn = root;
@@ -1431,7 +1463,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root,
struct fib6_node *fn, *prev = NULL;
for (fn = root; fn ; ) {
- struct rt6_info *leaf = rcu_dereference(fn->leaf);
+ struct fib6_info *leaf = rcu_dereference(fn->leaf);
struct rt6key *key;
/* This node is being deleted */
@@ -1480,7 +1512,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
struct fib6_node *fn;
fn = fib6_locate_1(root, daddr, dst_len,
- offsetof(struct rt6_info, rt6i_dst),
+ offsetof(struct fib6_info, fib6_dst),
exact_match);
#ifdef CONFIG_IPV6_SUBTREES
@@ -1491,7 +1523,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
if (subtree) {
fn = fib6_locate_1(subtree, saddr, src_len,
- offsetof(struct rt6_info, rt6i_src),
+ offsetof(struct fib6_info, fib6_src),
exact_match);
}
}
@@ -1510,14 +1542,14 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
*
*/
-static struct rt6_info *fib6_find_prefix(struct net *net,
+static struct fib6_info *fib6_find_prefix(struct net *net,
struct fib6_table *table,
struct fib6_node *fn)
{
struct fib6_node *child_left, *child_right;
if (fn->fn_flags & RTN_ROOT)
- return net->ipv6.ip6_null_entry;
+ return net->ipv6.fib6_null_entry;
while (fn) {
child_left = rcu_dereference_protected(fn->left,
@@ -1554,7 +1586,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
/* Set fn->leaf to null_entry for root node. */
if (fn->fn_flags & RTN_TL_ROOT) {
- rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry);
+ rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
return fn;
}
@@ -1569,11 +1601,11 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
lockdep_is_held(&table->tb6_lock));
struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
lockdep_is_held(&table->tb6_lock));
- struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
+ struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&table->tb6_lock));
- struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
+ struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
lockdep_is_held(&table->tb6_lock));
- struct rt6_info *new_fn_leaf;
+ struct fib6_info *new_fn_leaf;
RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
iter++;
@@ -1599,10 +1631,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
#if RT6_DEBUG >= 2
if (!new_fn_leaf) {
WARN_ON(!new_fn_leaf);
- new_fn_leaf = net->ipv6.ip6_null_entry;
+ new_fn_leaf = net->ipv6.fib6_null_entry;
}
#endif
- atomic_inc(&new_fn_leaf->rt6i_ref);
+ fib6_info_hold(new_fn_leaf);
rcu_assign_pointer(fn->leaf, new_fn_leaf);
return pn;
}
@@ -1658,26 +1690,24 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
return pn;
RCU_INIT_POINTER(pn->leaf, NULL);
- rt6_release(pn_leaf);
+ fib6_info_release(pn_leaf);
fn = pn;
}
}
static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
- struct rt6_info __rcu **rtp, struct nl_info *info)
+ struct fib6_info __rcu **rtp, struct nl_info *info)
{
struct fib6_walker *w;
- struct rt6_info *rt = rcu_dereference_protected(*rtp,
+ struct fib6_info *rt = rcu_dereference_protected(*rtp,
lockdep_is_held(&table->tb6_lock));
struct net *net = info->nl_net;
RT6_TRACE("fib6_del_route\n");
- WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
-
/* Unlink it */
- *rtp = rt->rt6_next;
- rt->rt6i_node = NULL;
+ *rtp = rt->fib6_next;
+ rt->fib6_node = NULL;
net->ipv6.rt6_stats->fib_rt_entries--;
net->ipv6.rt6_stats->fib_discarded_routes++;
@@ -1689,14 +1719,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
fn->rr_ptr = NULL;
/* Remove this entry from other siblings */
- if (rt->rt6i_nsiblings) {
- struct rt6_info *sibling, *next_sibling;
+ if (rt->fib6_nsiblings) {
+ struct fib6_info *sibling, *next_sibling;
list_for_each_entry_safe(sibling, next_sibling,
- &rt->rt6i_siblings, rt6i_siblings)
- sibling->rt6i_nsiblings--;
- rt->rt6i_nsiblings = 0;
- list_del_init(&rt->rt6i_siblings);
+ &rt->fib6_siblings, fib6_siblings)
+ sibling->fib6_nsiblings--;
+ rt->fib6_nsiblings = 0;
+ list_del_init(&rt->fib6_siblings);
rt6_multipath_rebalance(next_sibling);
}
@@ -1705,7 +1735,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
FOR_WALKERS(net, w) {
if (w->state == FWS_C && w->leaf == rt) {
RT6_TRACE("walker %p adjusted by delroute\n", w);
- w->leaf = rcu_dereference_protected(rt->rt6_next,
+ w->leaf = rcu_dereference_protected(rt->fib6_next,
lockdep_is_held(&table->tb6_lock));
if (!w->leaf)
w->state = FWS_U;
@@ -1730,46 +1760,36 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
if (!info->skip_notify)
inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
- rt6_release(rt);
+ fib6_info_release(rt);
}
/* Need to own table->tb6_lock */
-int fib6_del(struct rt6_info *rt, struct nl_info *info)
+int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
- struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
- struct fib6_table *table = rt->rt6i_table;
+ struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ struct fib6_table *table = rt->fib6_table;
struct net *net = info->nl_net;
- struct rt6_info __rcu **rtp;
- struct rt6_info __rcu **rtp_next;
+ struct fib6_info __rcu **rtp;
+ struct fib6_info __rcu **rtp_next;
-#if RT6_DEBUG >= 2
- if (rt->dst.obsolete > 0) {
- WARN_ON(fn);
- return -ENOENT;
- }
-#endif
- if (!fn || rt == net->ipv6.ip6_null_entry)
+ if (!fn || rt == net->ipv6.fib6_null_entry)
return -ENOENT;
WARN_ON(!(fn->fn_flags & RTN_RTINFO));
- /* remove cached dst from exception table */
- if (rt->rt6i_flags & RTF_CACHE)
- return rt6_remove_exception_rt(rt);
-
/*
* Walk the leaf entries looking for ourself
*/
for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
- struct rt6_info *cur = rcu_dereference_protected(*rtp,
+ struct fib6_info *cur = rcu_dereference_protected(*rtp,
lockdep_is_held(&table->tb6_lock));
if (rt == cur) {
fib6_del_route(table, fn, rtp, info);
return 0;
}
- rtp_next = &cur->rt6_next;
+ rtp_next = &cur->fib6_next;
}
return -ENOENT;
}
@@ -1907,7 +1927,7 @@ static int fib6_walk(struct net *net, struct fib6_walker *w)
static int fib6_clean_node(struct fib6_walker *w)
{
int res;
- struct rt6_info *rt;
+ struct fib6_info *rt;
struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
struct nl_info info = {
.nl_net = c->net,
@@ -1932,17 +1952,17 @@ static int fib6_clean_node(struct fib6_walker *w)
#if RT6_DEBUG >= 2
pr_debug("%s: del failed: rt=%p@%p err=%d\n",
__func__, rt,
- rcu_access_pointer(rt->rt6i_node),
+ rcu_access_pointer(rt->fib6_node),
res);
#endif
continue;
}
return 0;
} else if (res == -2) {
- if (WARN_ON(!rt->rt6i_nsiblings))
+ if (WARN_ON(!rt->fib6_nsiblings))
continue;
- rt = list_last_entry(&rt->rt6i_siblings,
- struct rt6_info, rt6i_siblings);
+ rt = list_last_entry(&rt->fib6_siblings,
+ struct fib6_info, fib6_siblings);
continue;
}
WARN_ON(res != 0);
@@ -1961,7 +1981,7 @@ static int fib6_clean_node(struct fib6_walker *w)
*/
static void fib6_clean_tree(struct net *net, struct fib6_node *root,
- int (*func)(struct rt6_info *, void *arg),
+ int (*func)(struct fib6_info *, void *arg),
int sernum, void *arg)
{
struct fib6_cleaner c;
@@ -1979,7 +1999,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
}
static void __fib6_clean_all(struct net *net,
- int (*func)(struct rt6_info *, void *),
+ int (*func)(struct fib6_info *, void *),
int sernum, void *arg)
{
struct fib6_table *table;
@@ -1999,7 +2019,7 @@ static void __fib6_clean_all(struct net *net,
rcu_read_unlock();
}
-void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *),
+void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
void *arg)
{
__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
@@ -2016,7 +2036,7 @@ static void fib6_flush_trees(struct net *net)
* Garbage collection
*/
-static int fib6_age(struct rt6_info *rt, void *arg)
+static int fib6_age(struct fib6_info *rt, void *arg)
{
struct fib6_gc_args *gc_args = arg;
unsigned long now = jiffies;
@@ -2026,8 +2046,8 @@ static int fib6_age(struct rt6_info *rt, void *arg)
* Routes are expired even if they are in use.
*/
- if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
- if (time_after(now, rt->dst.expires)) {
+ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
+ if (time_after(now, rt->expires)) {
RT6_TRACE("expiring %p\n", rt);
return -1;
}
@@ -2110,7 +2130,7 @@ static int __net_init fib6_net_init(struct net *net)
net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
- net->ipv6.ip6_null_entry);
+ net->ipv6.fib6_null_entry);
net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
@@ -2122,7 +2142,7 @@ static int __net_init fib6_net_init(struct net *net)
goto out_fib6_main_tbl;
net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
- net->ipv6.ip6_null_entry);
+ net->ipv6.fib6_null_entry);
net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
@@ -2211,25 +2231,26 @@ void fib6_gc_cleanup(void)
#ifdef CONFIG_PROC_FS
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
- struct rt6_info *rt = v;
+ struct fib6_info *rt = v;
struct ipv6_route_iter *iter = seq->private;
+ const struct net_device *dev;
- seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
+ seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
#ifdef CONFIG_IPV6_SUBTREES
- seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
+ seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
#else
seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
- if (rt->rt6i_flags & RTF_GATEWAY)
- seq_printf(seq, "%pi6", &rt->rt6i_gateway);
+ if (rt->fib6_flags & RTF_GATEWAY)
+ seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw);
else
seq_puts(seq, "00000000000000000000000000000000");
+ dev = rt->fib6_nh.nh_dev;
seq_printf(seq, " %08x %08x %08x %08x %8s\n",
- rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
- rt->dst.__use, rt->rt6i_flags,
- rt->dst.dev ? rt->dst.dev->name : "");
+ rt->fib6_metric, atomic_read(&rt->fib6_ref), 0,
+ rt->fib6_flags, dev ? dev->name : "");
iter->w.leaf = NULL;
return 0;
}
@@ -2243,7 +2264,7 @@ static int ipv6_route_yield(struct fib6_walker *w)
do {
iter->w.leaf = rcu_dereference_protected(
- iter->w.leaf->rt6_next,
+ iter->w.leaf->fib6_next,
lockdep_is_held(&iter->tbl->tb6_lock));
iter->skip--;
if (!iter->skip && iter->w.leaf)
@@ -2302,14 +2323,14 @@ static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
int r;
- struct rt6_info *n;
+ struct fib6_info *n;
struct net *net = seq_file_net(seq);
struct ipv6_route_iter *iter = seq->private;
if (!v)
goto iter_table;
- n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next);
+ n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
if (n) {
++*pos;
return n;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 458de353f5d9..c8cf2fdbb13b 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -848,7 +848,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
}
/**
- * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
+ * ip6gre_tnl_addr_conflict - compare packet addresses to tunnel's own
* @t: the outgoing tunnel device
* @hdr: IPv6 header from the incoming packet
*
@@ -937,6 +937,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
struct flowi6 fl6;
int err = -EINVAL;
__u32 mtu;
+ int nhoff;
+ int thoff;
if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
goto tx_err;
@@ -949,6 +951,16 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
truncate = true;
}
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ if (skb->protocol == htons(ETH_P_IP) &&
+ (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
+ truncate = true;
+
+ thoff = skb_transport_header(skb) - skb_mac_header(skb);
+ if (skb->protocol == htons(ETH_P_IPV6) &&
+ (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
+ truncate = true;
+
if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen))
goto tx_err;
@@ -1376,6 +1388,7 @@ static void ip6gre_dev_free(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
+ gro_cells_destroy(&t->gro_cells);
dst_cache_destroy(&t->dst_cache);
free_percpu(dev->tstats);
}
@@ -1443,11 +1456,12 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
return -ENOMEM;
ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
- if (ret) {
- free_percpu(dev->tstats);
- dev->tstats = NULL;
- return ret;
- }
+ if (ret)
+ goto cleanup_alloc_pcpu_stats;
+
+ ret = gro_cells_init(&tunnel->gro_cells, dev);
+ if (ret)
+ goto cleanup_dst_cache_init;
t_hlen = ip6gre_calc_hlen(tunnel);
dev->mtu = ETH_DATA_LEN - t_hlen;
@@ -1463,6 +1477,13 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
ip6gre_tnl_init_features(dev);
return 0;
+
+cleanup_dst_cache_init:
+ dst_cache_destroy(&tunnel->dst_cache);
+cleanup_alloc_pcpu_stats:
+ free_percpu(dev->tstats);
+ dev->tstats = NULL;
+ return ret;
}
static int ip6gre_tunnel_init(struct net_device *dev)
@@ -1822,11 +1843,12 @@ static int ip6erspan_tap_init(struct net_device *dev)
return -ENOMEM;
ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
- if (ret) {
- free_percpu(dev->tstats);
- dev->tstats = NULL;
- return ret;
- }
+ if (ret)
+ goto cleanup_alloc_pcpu_stats;
+
+ ret = gro_cells_init(&tunnel->gro_cells, dev);
+ if (ret)
+ goto cleanup_dst_cache_init;
t_hlen = ip6erspan_calc_hlen(tunnel);
dev->mtu = ETH_DATA_LEN - t_hlen;
@@ -1839,6 +1861,13 @@ static int ip6erspan_tap_init(struct net_device *dev)
ip6erspan_tnl_link_config(tunnel, 1);
return 0;
+
+cleanup_dst_cache_init:
+ dst_cache_destroy(&tunnel->dst_cache);
+cleanup_alloc_pcpu_stats:
+ free_percpu(dev->tstats);
+ dev->tstats = NULL;
+ return ret;
}
static const struct net_device_ops ip6erspan_netdev_ops = {
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 9ee208a348f5..f08d34491ece 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -336,7 +336,7 @@ int ip6_mc_input(struct sk_buff *skb)
bool deliver;
__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
- ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST,
+ __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
skb->len);
hdr = ipv6_hdr(skb);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 4a87f9428ca5..5b3f2f89ef41 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -88,9 +88,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
if (skb->encapsulation &&
skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
- udpfrag = proto == IPPROTO_UDP && encap;
+ udpfrag = proto == IPPROTO_UDP && encap &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
else
- udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+ udpfrag = proto == IPPROTO_UDP && !skb->encapsulation &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
ops = rcu_dereference(inet6_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment)) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7b6d1689087b..021e5aef6ba3 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -383,28 +383,6 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk,
return dst_output(net, sk, skb);
}
-unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
-{
- unsigned int mtu;
- struct inet6_dev *idev;
-
- if (dst_metric_locked(dst, RTAX_MTU)) {
- mtu = dst_metric_raw(dst, RTAX_MTU);
- if (mtu)
- return mtu;
- }
-
- mtu = IPV6_MIN_MTU;
- rcu_read_lock();
- idev = __in6_dev_get(dst->dev);
- if (idev)
- mtu = idev->cnf.mtu6;
- rcu_read_unlock();
-
- return mtu;
-}
-EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
-
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
if (skb->len <= mtu)
@@ -425,6 +403,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
int ip6_forward(struct sk_buff *skb)
{
+ struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
struct dst_entry *dst = skb_dst(skb);
struct ipv6hdr *hdr = ipv6_hdr(skb);
struct inet6_skb_parm *opt = IP6CB(skb);
@@ -444,8 +423,7 @@ int ip6_forward(struct sk_buff *skb)
goto drop;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -476,8 +454,7 @@ int ip6_forward(struct sk_buff *skb)
/* Force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return -ETIMEDOUT;
@@ -490,15 +467,13 @@ int ip6_forward(struct sk_buff *skb)
if (proxied > 0)
return ip6_input(skb);
else if (proxied < 0) {
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
}
if (!xfrm6_route_forward(skb)) {
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
dst = skb_dst(skb);
@@ -507,7 +482,8 @@ int ip6_forward(struct sk_buff *skb)
send redirects to source routed frames.
We don't send redirects to frames decapsulated from IPsec.
*/
- if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
+ if (IP6CB(skb)->iif == dst->dev->ifindex &&
+ opt->srcrt == 0 && !skb_sec_path(skb)) {
struct in6_addr *target = NULL;
struct inet_peer *peer;
struct rt6_info *rt;
@@ -554,8 +530,7 @@ int ip6_forward(struct sk_buff *skb)
/* Again, force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INTOOBIGERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
__IP6_INC_STATS(net, ip6_dst_idev(dst),
IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
@@ -579,7 +554,7 @@ int ip6_forward(struct sk_buff *skb)
ip6_forward_finish);
error:
- __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
kfree_skb(skb);
return -EINVAL;
@@ -966,15 +941,21 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
* that's why we try it again later.
*/
if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
+ struct fib6_info *from;
struct rt6_info *rt;
bool had_dst = *dst != NULL;
if (!had_dst)
*dst = ip6_route_output(net, sk, fl6);
rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
- err = ip6_route_get_saddr(net, rt, &fl6->daddr,
+
+ rcu_read_lock();
+ from = rt ? rcu_dereference(rt->from) : NULL;
+ err = ip6_route_get_saddr(net, from, &fl6->daddr,
sk ? inet6_sk(sk)->srcprefs : 0,
&fl6->saddr);
+ rcu_read_unlock();
+
if (err)
goto out_err_release;
@@ -1238,6 +1219,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
if (mtu < IPV6_MIN_MTU)
return -EINVAL;
cork->base.fragsize = mtu;
+ cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
+
if (dst_allfrag(xfrm_dst_path(&rt->dst)))
cork->base.flags |= IPCORK_ALLFRAG;
cork->base.length = 0;
@@ -1272,6 +1255,7 @@ static int __ip6_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE;
unsigned int maxnonfragsize, headersize;
unsigned int wmem_alloc_delta = 0;
+ bool paged;
skb = skb_peek_tail(queue);
if (!skb) {
@@ -1279,7 +1263,8 @@ static int __ip6_append_data(struct sock *sk,
dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
}
- mtu = cork->fragsize;
+ paged = !!cork->gso_size;
+ mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
orig_mtu = mtu;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -1327,7 +1312,7 @@ emsgsize:
if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
headersize == sizeof(struct ipv6hdr) &&
length <= mtu - headersize &&
- !(flags & MSG_MORE) &&
+ (!(flags & MSG_MORE) || cork->gso_size) &&
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
@@ -1370,6 +1355,7 @@ emsgsize:
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
+ unsigned int pagedlen = 0;
alloc_new_skb:
/* There's no room in the current skb */
if (skb)
@@ -1392,11 +1378,17 @@ alloc_new_skb:
if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
+ fraglen = datalen + fragheaderlen;
+
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
- else
- alloclen = datalen + fragheaderlen;
+ else if (!paged)
+ alloclen = fraglen;
+ else {
+ alloclen = min_t(int, fraglen, MAX_HEADER);
+ pagedlen = fraglen - alloclen;
+ }
alloclen += dst_exthdrlen;
@@ -1418,7 +1410,7 @@ alloc_new_skb:
*/
alloclen += sizeof(struct frag_hdr);
- copy = datalen - transhdrlen - fraggap;
+ copy = datalen - transhdrlen - fraggap - pagedlen;
if (copy < 0) {
err = -EINVAL;
goto error;
@@ -1457,7 +1449,7 @@ alloc_new_skb:
/*
* Find where to start putting bytes
*/
- data = skb_put(skb, fraglen);
+ data = skb_put(skb, fraglen - pagedlen);
skb_set_network_header(skb, exthdrlen);
data += fragheaderlen;
skb->transport_header = (skb->network_header +
@@ -1480,7 +1472,7 @@ alloc_new_skb:
}
offset += copy;
- length -= datalen - fraggap;
+ length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
dst_exthdrlen = 0;
@@ -1754,9 +1746,9 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
void *from, int length, int transhdrlen,
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
struct rt6_info *rt, unsigned int flags,
+ struct inet_cork_full *cork,
const struct sockcm_cookie *sockc)
{
- struct inet_cork_full cork;
struct inet6_cork v6_cork;
struct sk_buff_head queue;
int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
@@ -1767,27 +1759,27 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
__skb_queue_head_init(&queue);
- cork.base.flags = 0;
- cork.base.addr = 0;
- cork.base.opt = NULL;
- cork.base.dst = NULL;
+ cork->base.flags = 0;
+ cork->base.addr = 0;
+ cork->base.opt = NULL;
+ cork->base.dst = NULL;
v6_cork.opt = NULL;
- err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
+ err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
if (err) {
- ip6_cork_release(&cork, &v6_cork);
+ ip6_cork_release(cork, &v6_cork);
return ERR_PTR(err);
}
if (ipc6->dontfrag < 0)
ipc6->dontfrag = inet6_sk(sk)->dontfrag;
- err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
+ err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
flags, ipc6, sockc);
if (err) {
- __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
+ __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
return ERR_PTR(err);
}
- return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
+ return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index ca957dd93a29..b7f28deddaea 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -743,7 +743,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
}
/**
- * vti6_tnl_ioctl - configure vti6 tunnels from userspace
+ * vti6_ioctl - configure vti6 tunnels from userspace
* @dev: virtual device associated with tunnel
* @ifr: parameters passed from userspace
* @cmd: command to be performed
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 4a15529d33eb..0d0f0053bb11 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -180,7 +180,8 @@ static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = {
};
static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
- struct fib_rule_hdr *frh, struct nlattr **tb)
+ struct fib_rule_hdr *frh, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
return 0;
}
@@ -227,8 +228,8 @@ static int __net_init ip6mr_rules_init(struct net *net)
INIT_LIST_HEAD(&net->ipv6.mr6_tables);
mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
- if (!mrt) {
- err = -ENOMEM;
+ if (IS_ERR(mrt)) {
+ err = PTR_ERR(mrt);
goto err1;
}
@@ -301,8 +302,13 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
static int __net_init ip6mr_rules_init(struct net *net)
{
- net->ipv6.mrt6 = ip6mr_new_table(net, RT6_TABLE_DFLT);
- return net->ipv6.mrt6 ? 0 : -ENOMEM;
+ struct mr_table *mrt;
+
+ mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
+ if (IS_ERR(mrt))
+ return PTR_ERR(mrt);
+ net->ipv6.mrt6 = mrt;
+ return 0;
}
static void __net_exit ip6mr_rules_exit(struct net *net)
@@ -1733,9 +1739,11 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
rtnl_lock();
ret = 0;
- if (!ip6mr_new_table(net, v))
- ret = -ENOMEM;
- raw6_sk(sk)->ip6mr_table = v;
+ mrt = ip6mr_new_table(net, v);
+ if (IS_ERR(mrt))
+ ret = PTR_ERR(mrt);
+ else
+ raw6_sk(sk)->ip6mr_table = v;
rtnl_unlock();
return ret;
}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 9de4dfb126ba..e640d2f3c55c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1155,7 +1155,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
struct neighbour *neigh = NULL;
struct inet6_dev *in6_dev;
- struct rt6_info *rt = NULL;
+ struct fib6_info *rt = NULL;
+ struct net *net;
int lifetime;
struct ndisc_options ndopts;
int optlen;
@@ -1253,9 +1254,9 @@ static void ndisc_router_discovery(struct sk_buff *skb)
/* Do not accept RA with source-addr found on local machine unless
* accept_ra_from_local is set to true.
*/
+ net = dev_net(in6_dev->dev);
if (!in6_dev->cnf.accept_ra_from_local &&
- ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
- in6_dev->dev, 0)) {
+ ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) {
ND_PRINTK(2, info,
"RA from local address detected on dev: %s: default router ignored\n",
skb->dev->name);
@@ -1272,20 +1273,22 @@ static void ndisc_router_discovery(struct sk_buff *skb)
pref = ICMPV6_ROUTER_PREF_MEDIUM;
#endif
- rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev);
+ rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
if (rt) {
- neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr);
+ neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw,
+ rt->fib6_nh.nh_dev, NULL,
+ &ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
"RA: %s got default router without neighbour\n",
__func__);
- ip6_rt_put(rt);
+ fib6_info_release(rt);
return;
}
}
if (rt && lifetime == 0) {
- ip6_del_rt(rt);
+ ip6_del_rt(net, rt);
rt = NULL;
}
@@ -1294,7 +1297,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
if (!rt && lifetime) {
ND_PRINTK(3, info, "RA: adding default router\n");
- rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref);
+ rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
+ skb->dev, pref);
if (!rt) {
ND_PRINTK(0, err,
"RA: %s failed to add default route\n",
@@ -1302,28 +1306,29 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
- neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr);
+ neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw,
+ rt->fib6_nh.nh_dev, NULL,
+ &ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
"RA: %s got default router without neighbour\n",
__func__);
- ip6_rt_put(rt);
+ fib6_info_release(rt);
return;
}
neigh->flags |= NTF_ROUTER;
} else if (rt) {
- rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+ rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
}
if (rt)
- rt6_set_expires(rt, jiffies + (HZ * lifetime));
+ fib6_set_expires(rt, jiffies + (HZ * lifetime));
if (in6_dev->cnf.accept_ra_min_hop_limit < 256 &&
ra_msg->icmph.icmp6_hop_limit) {
if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
- if (rt)
- dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
- ra_msg->icmph.icmp6_hop_limit);
+ fib6_metric_set(rt, RTAX_HOPLIMIT,
+ ra_msg->icmph.icmp6_hop_limit);
} else {
ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
}
@@ -1475,10 +1480,7 @@ skip_routeinfo:
ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
} else if (in6_dev->cnf.mtu6 != mtu) {
in6_dev->cnf.mtu6 = mtu;
-
- if (rt)
- dst_metric_set(&rt->dst, RTAX_MTU, mtu);
-
+ fib6_metric_set(rt, RTAX_MTU, mtu);
rt6_mtu_change(skb->dev, mtu);
}
}
@@ -1497,7 +1499,7 @@ skip_routeinfo:
ND_PRINTK(2, warn, "RA: invalid RA options\n");
}
out:
- ip6_rt_put(rt);
+ fib6_info_release(rt);
if (neigh)
neigh_release(neigh);
}
@@ -1576,6 +1578,12 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
bool ret;
+ if (netif_is_l3_master(skb->dev)) {
+ dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
+ if (!dev)
+ return;
+ }
+
if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
dev->name);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index ce77bcc2490c..37b14dc9d863 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -29,7 +29,10 @@ config NF_SOCKET_IPV6
tristate "IPv6 socket lookup support"
help
This option enables the IPv6 socket lookup infrastructure. This
- is used by the ip6tables socket match.
+ is used by the {ip6,nf}tables socket match.
+
+config NF_TPROXY_IPV6
+ tristate "IPv6 tproxy support"
if NF_TABLES
@@ -136,10 +139,7 @@ config NF_NAT_IPV6
if NF_NAT_IPV6
config NF_NAT_MASQUERADE_IPV6
- tristate "IPv6 masquerade support"
- help
- This is the kernel functionality to provide NAT in the masquerade
- flavour (automatic source address selection) for IPv6.
+ bool
endif # NF_NAT_IPV6
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 44273d6f03a5..10a5a1c87320 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -18,14 +18,15 @@ nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
+nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
-obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
# defrag
nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o
+obj-$(CONFIG_NF_TPROXY_IPV6) += nf_tproxy_ipv6.o
# logging
obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 97f79dc943d7..0758b5bcfb29 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -529,7 +529,6 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
.family = NFPROTO_IPV6,
};
- t = ip6t_get_target(e);
return xt_check_target(&par, t->u.target_size - sizeof(*t),
e->ipv6.proto,
e->ipv6.invflags & IP6T_INV_PROTO);
@@ -1794,6 +1793,8 @@ int ip6t_register_table(struct net *net, const struct xt_table *table,
/* set res now, will see skbs right after nf_register_net_hooks */
WRITE_ONCE(*res, new_table);
+ if (!ops)
+ return 0;
ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
if (ret != 0) {
@@ -1811,7 +1812,8 @@ out_free:
void ip6t_unregister_table(struct net *net, struct xt_table *table,
const struct nf_hook_ops *ops)
{
- nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ if (ops)
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
__ip6t_unregister_table(net, table);
}
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
index 92c0047e7e33..491f808e356a 100644
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -29,7 +29,7 @@ masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
{
- const struct nf_nat_range *range = par->targinfo;
+ const struct nf_nat_range2 *range = par->targinfo;
if (range->flags & NF_NAT_RANGE_MAP_IPS)
return -EINVAL;
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index d12f511929f5..0fe61ede77c6 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -48,6 +48,8 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
}
fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
+ if ((flags & XT_RPFILTER_LOOSE) == 0)
+ fl6.flowi6_oif = dev->ifindex;
rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags);
if (rt->dst.error)
diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c
index 33719d5560c8..1059894a6f4c 100644
--- a/net/ipv6/netfilter/ip6t_srh.c
+++ b/net/ipv6/netfilter/ip6t_srh.c
@@ -117,6 +117,130 @@ static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
return true;
}
+static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0;
+ const struct ip6t_srh1 *srhinfo = par->matchinfo;
+ struct in6_addr *psid, *nsid, *lsid;
+ struct in6_addr _psid, _nsid, _lsid;
+ struct ipv6_sr_hdr *srh;
+ struct ipv6_sr_hdr _srh;
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ return false;
+ srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh);
+ if (!srh)
+ return false;
+
+ hdrlen = ipv6_optlen(srh);
+ if (skb->len - srhoff < hdrlen)
+ return false;
+
+ if (srh->type != IPV6_SRCRT_TYPE_4)
+ return false;
+
+ if (srh->segments_left > srh->first_segment)
+ return false;
+
+ /* Next Header matching */
+ if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR,
+ !(srh->nexthdr == srhinfo->next_hdr)))
+ return false;
+
+ /* Header Extension Length matching */
+ if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ,
+ !(srh->hdrlen == srhinfo->hdr_len)))
+ return false;
+ if (srhinfo->mt_flags & IP6T_SRH_LEN_GT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT,
+ !(srh->hdrlen > srhinfo->hdr_len)))
+ return false;
+ if (srhinfo->mt_flags & IP6T_SRH_LEN_LT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT,
+ !(srh->hdrlen < srhinfo->hdr_len)))
+ return false;
+
+ /* Segments Left matching */
+ if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ,
+ !(srh->segments_left == srhinfo->segs_left)))
+ return false;
+ if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT,
+ !(srh->segments_left > srhinfo->segs_left)))
+ return false;
+ if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT,
+ !(srh->segments_left < srhinfo->segs_left)))
+ return false;
+
+ /**
+ * Last Entry matching
+ * Last_Entry field was introduced in revision 6 of the SRH draft.
+ * It was called First_Segment in the previous revision
+ */
+ if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ,
+ !(srh->first_segment == srhinfo->last_entry)))
+ return false;
+ if (srhinfo->mt_flags & IP6T_SRH_LAST_GT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT,
+ !(srh->first_segment > srhinfo->last_entry)))
+ return false;
+ if (srhinfo->mt_flags & IP6T_SRH_LAST_LT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT,
+ !(srh->first_segment < srhinfo->last_entry)))
+ return false;
+
+ /**
+ * Tag matching
+ * Tag field was introduced in revision 6 of the SRH draft
+ */
+ if (srhinfo->mt_flags & IP6T_SRH_TAG)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG,
+ !(srh->tag == srhinfo->tag)))
+ return false;
+
+ /* Previous SID matching */
+ if (srhinfo->mt_flags & IP6T_SRH_PSID) {
+ if (srh->segments_left == srh->first_segment)
+ return false;
+ psidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
+ ((srh->segments_left + 1) * sizeof(struct in6_addr));
+ psid = skb_header_pointer(skb, psidoff, sizeof(_psid), &_psid);
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_PSID,
+ ipv6_masked_addr_cmp(psid, &srhinfo->psid_msk,
+ &srhinfo->psid_addr)))
+ return false;
+ }
+
+ /* Next SID matching */
+ if (srhinfo->mt_flags & IP6T_SRH_NSID) {
+ if (srh->segments_left == 0)
+ return false;
+ nsidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
+ ((srh->segments_left - 1) * sizeof(struct in6_addr));
+ nsid = skb_header_pointer(skb, nsidoff, sizeof(_nsid), &_nsid);
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NSID,
+ ipv6_masked_addr_cmp(nsid, &srhinfo->nsid_msk,
+ &srhinfo->nsid_addr)))
+ return false;
+ }
+
+ /* Last SID matching */
+ if (srhinfo->mt_flags & IP6T_SRH_LSID) {
+ lsidoff = srhoff + sizeof(struct ipv6_sr_hdr);
+ lsid = skb_header_pointer(skb, lsidoff, sizeof(_lsid), &_lsid);
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LSID,
+ ipv6_masked_addr_cmp(lsid, &srhinfo->lsid_msk,
+ &srhinfo->lsid_addr)))
+ return false;</