aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorXin LI <delphij@FreeBSD.org>2018-08-15 02:30:11 +0000
committerXin LI <delphij@FreeBSD.org>2018-08-15 02:30:11 +0000
commitf69d7a5a946f625c5a0b449c398181d8c4c75f40 (patch)
tree0ff1cfb3529c929fd7ded0ae9682a813cd91f0b6
parent301fa9c0d6ddc3eaec69cf8b71ccbb65b457d3ac (diff)
downloadsrc-f69d7a5a946f625c5a0b449c398181d8c4c75f40.tar.gz
src-f69d7a5a946f625c5a0b449c398181d8c4c75f40.zip
Revise manual pages. [SA-18:08.tcp]
Fix L1 Terminal Fault (L1TF) kernel information disclosure. [SA-18:09.l1tf] Fix resource exhaustion in IP fragment reassembly. [SA-18:10.ip] Fix unauthenticated EAPOL-Key decryption vulnerability. [SA-18:11.hostapd] Approved by: so
Notes
Notes: svn path=/releng/11.1/; revision=337828
-rw-r--r--UPDATING15
-rw-r--r--contrib/wpa/src/rsn_supp/wpa.c11
-rw-r--r--share/man/man4/inet.437
-rw-r--r--share/man/man4/inet6.439
-rw-r--r--share/man/man4/tcp.42
-rw-r--r--sys/amd64/amd64/pmap.c3
-rw-r--r--sys/amd64/vmm/intel/vmx.c12
-rw-r--r--sys/amd64/vmm/intel/vmx_genassym.c4
-rw-r--r--sys/amd64/vmm/intel/vmx_support.S43
-rw-r--r--sys/conf/newvers.sh2
-rw-r--r--sys/netinet/ip_reass.c193
-rw-r--r--sys/netinet6/frag6.c327
-rw-r--r--sys/netinet6/in6.h4
-rw-r--r--sys/netinet6/in6_proto.c35
-rw-r--r--sys/netinet6/ip6_var.h8
-rw-r--r--sys/vm/vm_page.c36
-rw-r--r--sys/vm/vm_page.h1
-rw-r--r--sys/x86/include/specialreg.h5
18 files changed, 598 insertions, 179 deletions
diff --git a/UPDATING b/UPDATING
index 8a23d1462603..a068eed8d3c6 100644
--- a/UPDATING
+++ b/UPDATING
@@ -16,6 +16,21 @@ from older versions of FreeBSD, try WITHOUT_CLANG and WITH_GCC to bootstrap to
the tip of head, and then rebuild without this option. The bootstrap process
from older version of current across the gcc/clang cutover is a bit fragile.
+20180814 p13 FreeBSD-SA-18:08.tcp [revised]
+ FreeBSD-SA-18:09.l1tf
+ FreeBSD-SA-18:10.ip
+ FreeBSD-SA-18:11.hostapd
+
+ Revise manual pages. [SA-18:08.tcp]
+
+ Fix L1 Terminal Fault (L1TF) kernel information disclosure.
+ [SA-18:09.l1tf]
+
+ Fix resource exhaustion in IP fragment reassembly. [SA-18:10.ip]
+
+ Fix unauthenticated EAPOL-Key decryption vulnerability.
+ [SA-18:11.hostapd]
+
20180806 p12 FreeBSD-SA-18:08.tcp
Fix resource exhaustion in TCP reassembly.
diff --git a/contrib/wpa/src/rsn_supp/wpa.c b/contrib/wpa/src/rsn_supp/wpa.c
index bc50c97141b6..206cda98cc96 100644
--- a/contrib/wpa/src/rsn_supp/wpa.c
+++ b/contrib/wpa/src/rsn_supp/wpa.c
@@ -2027,6 +2027,17 @@ int wpa_sm_rx_eapol(struct wpa_sm *sm, const u8 *src_addr,
if ((sm->proto == WPA_PROTO_RSN || sm->proto == WPA_PROTO_OSEN) &&
(key_info & WPA_KEY_INFO_ENCR_KEY_DATA)) {
+ /*
+ * Only decrypt the Key Data field if the frame's authenticity
+ * was verified. When using AES-SIV (FILS), the MIC flag is not
+ * set, so this check should only be performed if mic_len != 0
+ * which is the case in this code branch.
+ */
+ if (!(key_info & WPA_KEY_INFO_MIC)) {
+ wpa_msg(sm->ctx->msg_ctx, MSG_WARNING,
+ "WPA: Ignore EAPOL-Key with encrypted but unauthenticated data");
+ goto out;
+ }
if (wpa_supplicant_decrypt_key_data(sm, key, ver, key_data,
&key_data_len))
goto out;
diff --git a/share/man/man4/inet.4 b/share/man/man4/inet.4
index 49c050d3de79..b0ccb2565ecf 100644
--- a/share/man/man4/inet.4
+++ b/share/man/man4/inet.4
@@ -28,7 +28,7 @@
.\" From: @(#)inet.4 8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
-.Dd Feb 4, 2016
+.Dd August 14, 2018
.Dt INET 4
.Os
.Sh NAME
@@ -229,15 +229,38 @@ At the same time, on high-speed links, it can decrease the ID reuse
cycle greatly.
Default is 0 (sequential IP IDs).
IPv6 flow IDs and fragment IDs are always random.
+.It Va ip.maxfrags
+Integer: maximum number of fragments the host will accept and simultaneously
+hold across all reassembly queues in all VNETs.
+If set to 0, reassembly is disabled.
+If set to -1, this limit is not applied.
+This limit is recalculated when the number of mbuf clusters is changed.
+This is a global limit.
.It Va ip.maxfragpackets
-Integer: maximum number of fragmented packets the host will accept and hold
-in the reassembling queue simultaneously.
-0 means that the host will not accept any fragmented packets.
-\-1 means that the host will accept as many fragmented packets as it receives.
+Integer: maximum number of fragmented packets the host will accept and
+simultaneously hold in the reassembly queue for a particular VNET.
+0 means that the host will not accept any fragmented packets for that VNET.
+\-1 means that the host will not apply this limit for that VNET.
+This limit is recalculated when the number of mbuf clusters is changed.
+This is a per-VNET limit.
+.It Va ip.maxfragbucketsize
+Integer: maximum number of reassembly queues per bucket.
+Fragmented packets are hashed to buckets.
+Each bucket has a list of reassembly queues.
+The system must compare the incoming packets to the existing reassembly queues
+in the bucket to find a matching reassembly queue.
+To preserve system resources, the system limits the number of reassembly
+queues allowed in each bucket.
+This limit is recalculated when the number of mbuf clusters is changed or
+when the value of
+.Va ip.maxfragpackets
+changes.
+This is a per-VNET limit.
.It Va ip.maxfragsperpacket
Integer: maximum number of fragments the host will accept and hold
-in the reassembling queue for a packet.
-0 means that the host will not accept any fragmented packets.
+in the reassembly queue for a packet.
+0 means that the host will not accept any fragmented packets for the VNET.
+This is a per-VNET limit.
.El
.Sh SEE ALSO
.Xr ioctl 2 ,
diff --git a/share/man/man4/inet6.4 b/share/man/man4/inet6.4
index 815dee7c0385..40b5a175e0a2 100644
--- a/share/man/man4/inet6.4
+++ b/share/man/man4/inet6.4
@@ -29,7 +29,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd September 2, 2009
+.Dd August 14, 2018
.Dt INET6 4
.Os
.Sh NAME
@@ -219,12 +219,41 @@ packets.
This value applies to all the transport protocols on top of
.Tn IPv6 .
There are APIs to override the value.
+.It Dv IPV6CTL_MAXFRAGS
+.Pq ip6.maxfrags
+Integer: maximum number of fragments the host will accept and simultaneously
+hold across all reassembly queues in all VNETs.
+If set to 0, fragment reassembly is disabled.
+If set to -1, this limit is not applied.
+This limit is recalculated when the number of mbuf clusters is changed.
+This is a global limit.
.It Dv IPV6CTL_MAXFRAGPACKETS
.Pq ip6.maxfragpackets
-Integer: default maximum number of fragmented packets the node will accept.
-0 means that the node will not accept any fragmented packets.
--1 means that the node will accept as many fragmented packets as it receives.
-The flag is provided basically for avoiding possible DoS attacks.
+Integer: maximum number of fragmented packets the node will accept and
+simultaneously hold in the reassembly queue for a particular VNET.
+0 means that the node will not accept any fragmented packets for that VNET.
+-1 means that the node will not apply this limit for that VNET.
+This limit is recalculated when the number of mbuf clusters is changed.
+This is a per-VNET limit.
+.It Dv IPV6CTL_MAXFRAGBUCKETSIZE
+.Pq ip6.maxfragbucketsize
+Integer: maximum number of reassembly queues per bucket.
+Fragmented packets are hashed to buckets.
+Each bucket has a list of reassembly queues.
+The system must compare the incoming packets to the existing reassembly queues
+in the bucket to find a matching reassembly queue.
+To preserve system resources, the system limits the number of reassembly
+queues allowed in each bucket.
+This limit is recalculated when the number of mbuf clusters is changed or
+when the value of
+.Va ip6.maxfragpackets
+changes.
+This is a per-VNET limit.
+.It Dv IPV6CTL_MAXFRAGSPERPACKET
+.Pq ip6.maxfragsperpacket
+Integer: maximum number of fragments the host will accept and hold in the
+reassembly queue for a packet.
+This is a per-VNET limit.
.It Dv IPV6CTL_ACCEPT_RTADV
.Pq ip6.accept_rtadv
Boolean: the default value of a per-interface flag to
diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
index 05a352c88f52..e4ef694945f6 100644
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -34,7 +34,7 @@
.\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
-.Dd February 6, 2017
+.Dd August 6, 2018
.Dt TCP 4
.Os
.Sh NAME
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 8e2f04868924..a9d2d1fd9d0d 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -1206,6 +1206,9 @@ pmap_init(void)
vm_size_t s;
int error, i, pv_npg;
+ /* L1TF, reserve page @0 unconditionally */
+ vm_page_blacklist_add(0, bootverbose);
+
/*
* Initialize the vm page array entries for the kernel pmap's
* page table pages.
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 0edfe51f7bd2..ca98767ef435 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -183,6 +183,12 @@ static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
&vpid_alloc_failed, 0, NULL);
+static int guest_l1d_flush;
+SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD,
+ &guest_l1d_flush, 0, NULL);
+
+uint64_t vmx_msr_flush_cmd;
+
/*
* Use the last page below 4GB as the APIC access address. This address is
* occupied by the boot firmware so it is guaranteed that it will not conflict
@@ -718,6 +724,12 @@ vmx_init(int ipinum)
return (error);
}
+ guest_l1d_flush = (cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) == 0;
+ TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);
+ if (guest_l1d_flush &&
+ (cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) != 0)
+ vmx_msr_flush_cmd = IA32_FLUSH_CMD_L1D;
+
/*
* Stash the cr0 and cr4 bits that must be fixed to 0 or 1
*/
diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c
index e1b98d63f1d5..02779ea28b02 100644
--- a/sys/amd64/vmm/intel/vmx_genassym.c
+++ b/sys/amd64/vmm/intel/vmx_genassym.c
@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/pmap.h>
+#include <vm/vm_param.h>
#include <machine/vmm.h>
#include "vmx_cpufunc.h"
@@ -86,3 +87,6 @@ ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen));
ASSYM(KERNEL_SS, GSEL(GDATA_SEL, SEL_KPL));
ASSYM(KERNEL_CS, GSEL(GCODE_SEL, SEL_KPL));
+
+ASSYM(PAGE_SIZE, PAGE_SIZE);
+ASSYM(KERNBASE, KERNBASE);
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
index 84fb5b051cd2..b06dcf5d2527 100644
--- a/sys/amd64/vmm/intel/vmx_support.S
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -28,6 +28,7 @@
*/
#include <machine/asmacros.h>
+#include <machine/specialreg.h>
#include "vmx_assym.h"
@@ -136,9 +137,47 @@ ENTRY(vmx_enter_guest)
jbe invept_error /* Check invept instruction error */
guest_restore:
- cmpl $0, %edx
- je do_launch
+ /*
+ * Flush L1D cache if requested. Use IA32_FLUSH_CMD MSR if available,
+ * otherwise load enough of the data from the zero_region to flush
+ * existing L1D content.
+ */
+#define L1D_FLUSH_SIZE (64 * 1024)
+ movl %edx, %r8d
+ cmpb $0, guest_l1d_flush(%rip)
+ je after_l1d
+ movq vmx_msr_flush_cmd(%rip), %rax
+ testq %rax, %rax
+ jz 1f
+ movq %rax, %rdx
+ shrq $32, %rdx
+ movl $MSR_IA32_FLUSH_CMD, %ecx
+ wrmsr
+ jmp after_l1d
+1: movq $KERNBASE, %r9
+ movq $-L1D_FLUSH_SIZE, %rcx
+ /*
+ * pass 1: Preload TLB.
+ * Kernel text is mapped using superpages. TLB preload is
+ * done for the benefit of older CPUs which split 2M page
+ * into 4k TLB entries.
+ */
+2: movb L1D_FLUSH_SIZE(%r9, %rcx), %al
+ addq $PAGE_SIZE, %rcx
+ jne 2b
+ xorl %eax, %eax
+ cpuid
+ movq $-L1D_FLUSH_SIZE, %rcx
+ /* pass 2: Read each cache line */
+3: movb L1D_FLUSH_SIZE(%r9, %rcx), %al
+ addq $64, %rcx
+ jne 3b
+ lfence
+#undef L1D_FLUSH_SIZE
+after_l1d:
+ cmpl $0, %r8d
+ je do_launch
VMX_GUEST_RESTORE
vmresume
/*
diff --git a/sys/conf/newvers.sh b/sys/conf/newvers.sh
index 0f509fc76f76..51b22e1a8f72 100644
--- a/sys/conf/newvers.sh
+++ b/sys/conf/newvers.sh
@@ -44,7 +44,7 @@
TYPE="FreeBSD"
REVISION="11.1"
-BRANCH="RELEASE-p12"
+BRANCH="RELEASE-p13"
if [ -n "${BRANCH_OVERRIDE}" ]; then
BRANCH=${BRANCH_OVERRIDE}
fi
diff --git a/sys/netinet/ip_reass.c b/sys/netinet/ip_reass.c
index dc1ac1476db9..ce4d4f402f83 100644
--- a/sys/netinet/ip_reass.c
+++ b/sys/netinet/ip_reass.c
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
#include <sys/hash.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
+#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
@@ -63,13 +64,14 @@ SYSCTL_DECL(_net_inet_ip);
/*
* Reassembly headers are stored in hash buckets.
*/
-#define IPREASS_NHASH_LOG2 6
+#define IPREASS_NHASH_LOG2 10
#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
#define IPREASS_HMASK (IPREASS_NHASH - 1)
struct ipqbucket {
TAILQ_HEAD(ipqhead, ipq) head;
struct mtx lock;
+ int count;
};
static VNET_DEFINE(struct ipqbucket, ipq[IPREASS_NHASH]);
@@ -82,6 +84,9 @@ static VNET_DEFINE(uint32_t, ipq_hashseed);
#define IPQ_UNLOCK(i) mtx_unlock(&V_ipq[i].lock)
#define IPQ_LOCK_ASSERT(i) mtx_assert(&V_ipq[i].lock, MA_OWNED)
+static VNET_DEFINE(int, ipreass_maxbucketsize);
+#define V_ipreass_maxbucketsize VNET(ipreass_maxbucketsize)
+
void ipreass_init(void);
void ipreass_drain(void);
void ipreass_slowtimo(void);
@@ -89,27 +94,53 @@ void ipreass_slowtimo(void);
void ipreass_destroy(void);
#endif
static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
+static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
static void ipreass_zone_change(void *);
static void ipreass_drain_tomax(void);
-static void ipq_free(struct ipqhead *, struct ipq *);
+static void ipq_free(struct ipqbucket *, struct ipq *);
static struct ipq * ipq_reuse(int);
static inline void
-ipq_timeout(struct ipqhead *head, struct ipq *fp)
+ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
{
IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
- ipq_free(head, fp);
+ ipq_free(bucket, fp);
}
static inline void
-ipq_drop(struct ipqhead *head, struct ipq *fp)
+ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
{
IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
- ipq_free(head, fp);
+ ipq_free(bucket, fp);
}
+/*
+ * By default, limit the number of IP fragments across all reassembly
+ * queues to 1/32 of the total number of mbuf clusters.
+ *
+ * Limit the total number of reassembly queues per VNET to the
+ * IP fragment limit, but ensure the limit will not allow any bucket
+ * to grow above 100 items. (The bucket limit is
+ * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct
+ * multiplier to reach a 100-item limit.)
+ * The 100-item limit was chosen as brief testing seems to show that
+ * this produces "reasonable" performance on some subset of systems
+ * under DoS attack.
+ */
+#define IP_MAXFRAGS (nmbclusters / 32)
+#define IP_MAXFRAGPACKETS (imin(IP_MAXFRAGS, IPREASS_NHASH * 50))
+
+static int maxfrags;
+static volatile u_int nfrags;
+SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
+ &maxfrags, 0,
+ "Maximum number of IPv4 fragments allowed across all reassembly queues");
+SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
+ __DEVOLATILE(u_int *, &nfrags), 0,
+ "Current number of IPv4 fragments across all reassembly queues");
+
static VNET_DEFINE(uma_zone_t, ipq_zone);
#define V_ipq_zone VNET(ipq_zone)
SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET |
@@ -127,6 +158,10 @@ static VNET_DEFINE(int, maxfragsperpacket);
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(maxfragsperpacket), 0,
"Maximum number of IPv4 fragments allowed per packet");
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
+ sysctl_maxfragbucketsize, "I",
+ "Maximum number of IPv4 fragment reassembly queue entries per bucket");
/*
* Take incoming datagram fragment and try to reassemble it into
@@ -146,9 +181,9 @@ ip_reass(struct mbuf *m)
struct mbuf *p, *q, *nq, *t;
struct ipq *fp;
struct ipqhead *head;
- int i, hlen, next;
+ int i, hlen, next, tmpmax;
u_int8_t ecn, ecn0;
- uint32_t hash;
+ uint32_t hash, hashkey[3];
#ifdef RSS
uint32_t rss_hash, rss_type;
#endif
@@ -156,8 +191,12 @@ ip_reass(struct mbuf *m)
/*
* If no reassembling or maxfragsperpacket are 0,
* never accept fragments.
+ * Also, drop packet if it would exceed the maximum
+ * number of fragments.
*/
- if (V_noreass == 1 || V_maxfragsperpacket == 0) {
+ tmpmax = maxfrags;
+ if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
+ (tmpmax >= 0 && nfrags >= (u_int)tmpmax)) {
IPSTAT_INC(ips_fragments);
IPSTAT_INC(ips_fragdropped);
m_freem(m);
@@ -202,8 +241,12 @@ ip_reass(struct mbuf *m)
m->m_data += hlen;
m->m_len -= hlen;
- hash = ip->ip_src.s_addr ^ ip->ip_id;
- hash = jenkins_hash32(&hash, 1, V_ipq_hashseed) & IPREASS_HMASK;
+ hashkey[0] = ip->ip_src.s_addr;
+ hashkey[1] = ip->ip_dst.s_addr;
+ hashkey[2] = (uint32_t)ip->ip_p << 16;
+ hashkey[2] += ip->ip_id;
+ hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
+ hash &= IPREASS_HMASK;
head = &V_ipq[hash].head;
IPQ_LOCK(hash);
@@ -224,9 +267,12 @@ ip_reass(struct mbuf *m)
* If first fragment to arrive, create a reassembly queue.
*/
if (fp == NULL) {
- fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
+ if (V_ipq[hash].count < V_ipreass_maxbucketsize)
+ fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
if (fp == NULL)
fp = ipq_reuse(hash);
+ if (fp == NULL)
+ goto dropfrag;
#ifdef MAC
if (mac_ipq_init(fp, M_NOWAIT) != 0) {
uma_zfree(V_ipq_zone, fp);
@@ -236,7 +282,9 @@ ip_reass(struct mbuf *m)
mac_ipq_create(m, fp);
#endif
TAILQ_INSERT_HEAD(head, fp, ipq_list);
+ V_ipq[hash].count++;
fp->ipq_nfrags = 1;
+ atomic_add_int(&nfrags, 1);
fp->ipq_ttl = IPFRAGTTL;
fp->ipq_p = ip->ip_p;
fp->ipq_id = ip->ip_id;
@@ -247,6 +295,7 @@ ip_reass(struct mbuf *m)
goto done;
} else {
fp->ipq_nfrags++;
+ atomic_add_int(&nfrags, 1);
#ifdef MAC
mac_ipq_update(m, fp);
#endif
@@ -323,6 +372,7 @@ ip_reass(struct mbuf *m)
m->m_nextpkt = nq;
IPSTAT_INC(ips_fragdropped);
fp->ipq_nfrags--;
+ atomic_subtract_int(&nfrags, 1);
m_freem(q);
}
@@ -340,7 +390,7 @@ ip_reass(struct mbuf *m)
for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
if (ntohs(GETIP(q)->ip_off) != next) {
if (fp->ipq_nfrags > V_maxfragsperpacket)
- ipq_drop(head, fp);
+ ipq_drop(&V_ipq[hash], fp);
goto done;
}
next += ntohs(GETIP(q)->ip_len);
@@ -348,7 +398,7 @@ ip_reass(struct mbuf *m)
/* Make sure the last packet didn't have the IP_MF flag */
if (p->m_flags & M_IP_FRAG) {
if (fp->ipq_nfrags > V_maxfragsperpacket)
- ipq_drop(head, fp);
+ ipq_drop(&V_ipq[hash], fp);
goto done;
}
@@ -359,7 +409,7 @@ ip_reass(struct mbuf *m)
ip = GETIP(q);
if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
IPSTAT_INC(ips_toolong);
- ipq_drop(head, fp);
+ ipq_drop(&V_ipq[hash], fp);
goto done;
}
@@ -387,6 +437,7 @@ ip_reass(struct mbuf *m)
while (m->m_pkthdr.csum_data & 0xffff0000)
m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
(m->m_pkthdr.csum_data >> 16);
+ atomic_subtract_int(&nfrags, fp->ipq_nfrags);
#ifdef MAC
mac_ipq_reassemble(fp, m);
mac_ipq_destroy(fp);
@@ -401,6 +452,7 @@ ip_reass(struct mbuf *m)
ip->ip_src = fp->ipq_src;
ip->ip_dst = fp->ipq_dst;
TAILQ_REMOVE(head, fp, ipq_list);
+ V_ipq[hash].count--;
uma_zfree(V_ipq_zone, fp);
m->m_len += (ip->ip_hl << 2);
m->m_data -= (ip->ip_hl << 2);
@@ -446,8 +498,10 @@ ip_reass(struct mbuf *m)
dropfrag:
IPSTAT_INC(ips_fragdropped);
- if (fp != NULL)
+ if (fp != NULL) {
fp->ipq_nfrags--;
+ atomic_subtract_int(&nfrags, 1);
+ }
m_freem(m);
done:
IPQ_UNLOCK(hash);
@@ -462,21 +516,27 @@ done:
void
ipreass_init(void)
{
+ int max;
for (int i = 0; i < IPREASS_NHASH; i++) {
TAILQ_INIT(&V_ipq[i].head);
mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
MTX_DEF | MTX_DUPOK);
+ V_ipq[i].count = 0;
}
V_ipq_hashseed = arc4random();
V_maxfragsperpacket = 16;
V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
NULL, UMA_ALIGN_PTR, 0);
- uma_zone_set_max(V_ipq_zone, nmbclusters / 32);
+ max = IP_MAXFRAGPACKETS;
+ max = uma_zone_set_max(V_ipq_zone, max);
+ V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
- if (IS_DEFAULT_VNET(curvnet))
+ if (IS_DEFAULT_VNET(curvnet)) {
+ maxfrags = IP_MAXFRAGS;
EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
NULL, EVENTHANDLER_PRI_ANY);
+ }
}
/*
@@ -491,7 +551,7 @@ ipreass_slowtimo(void)
IPQ_LOCK(i);
TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp)
if (--fp->ipq_ttl == 0)
- ipq_timeout(&V_ipq[i].head, fp);
+ ipq_timeout(&V_ipq[i], fp);
IPQ_UNLOCK(i);
}
}
@@ -506,7 +566,10 @@ ipreass_drain(void)
for (int i = 0; i < IPREASS_NHASH; i++) {
IPQ_LOCK(i);
while(!TAILQ_EMPTY(&V_ipq[i].head))
- ipq_drop(&V_ipq[i].head, TAILQ_FIRST(&V_ipq[i].head));
+ ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head));
+ KASSERT(V_ipq[i].count == 0,
+ ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
+ V_ipq[i].count, V_ipq));
IPQ_UNLOCK(i);
}
}
@@ -534,9 +597,23 @@ ipreass_destroy(void)
static void
ipreass_drain_tomax(void)
{
+ struct ipq *fp;
int target;
/*
+ * Make sure each bucket is under the new limit. If
+ * necessary, drop enough of the oldest elements from
+ * each bucket to get under the new limit.
+ */
+ for (int i = 0; i < IPREASS_NHASH; i++) {
+ IPQ_LOCK(i);
+ while (V_ipq[i].count > V_ipreass_maxbucketsize &&
+ (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
+ ipq_timeout(&V_ipq[i], fp);
+ IPQ_UNLOCK(i);
+ }
+
+ /*
* If we are over the maximum number of fragments,
* drain off enough to get down to the new limit,
* stripping off last elements on queues. Every
@@ -544,13 +621,11 @@ ipreass_drain_tomax(void)
*/
target = uma_zone_get_max(V_ipq_zone);
while (uma_zone_get_cur(V_ipq_zone) > target) {
- struct ipq *fp;
-
for (int i = 0; i < IPREASS_NHASH; i++) {
IPQ_LOCK(i);
fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
if (fp != NULL)
- ipq_timeout(&V_ipq[i].head, fp);
+ ipq_timeout(&V_ipq[i], fp);
IPQ_UNLOCK(i);
}
}
@@ -559,9 +634,20 @@ ipreass_drain_tomax(void)
static void
ipreass_zone_change(void *tag)
{
-
- uma_zone_set_max(V_ipq_zone, nmbclusters / 32);
- ipreass_drain_tomax();
+ VNET_ITERATOR_DECL(vnet_iter);
+ int max;
+
+ maxfrags = IP_MAXFRAGS;
+ max = IP_MAXFRAGPACKETS;
+ VNET_LIST_RLOCK_NOSLEEP();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ max = uma_zone_set_max(V_ipq_zone, max);
+ V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
+ ipreass_drain_tomax();
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK_NOSLEEP();
}
/*
@@ -589,6 +675,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
* and place an extreme upper bound.
*/
max = uma_zone_set_max(V_ipq_zone, max);
+ V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
ipreass_drain_tomax();
V_noreass = 0;
} else if (max == 0) {
@@ -597,6 +684,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
} else if (max == -1) {
V_noreass = 0;
uma_zone_set_max(V_ipq_zone, 0);
+ V_ipreass_maxbucketsize = INT_MAX;
} else
return (EINVAL);
return (0);
@@ -610,49 +698,72 @@ static struct ipq *
ipq_reuse(int start)
{
struct ipq *fp;
- int i;
+ int bucket, i;
IPQ_LOCK_ASSERT(start);
- for (i = start;; i++) {
- if (i == IPREASS_NHASH)
- i = 0;
- if (i != start && IPQ_TRYLOCK(i) == 0)
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ bucket = (start + i) % IPREASS_NHASH;
+ if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
continue;
- fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
+ fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
if (fp) {
struct mbuf *m;
IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
+ atomic_subtract_int(&nfrags, fp->ipq_nfrags);
while (fp->ipq_frags) {
m = fp->ipq_frags;
fp->ipq_frags = m->m_nextpkt;
m_freem(m);
}
- TAILQ_REMOVE(&V_ipq[i].head, fp, ipq_list);
- if (i != start)
- IPQ_UNLOCK(i);
- IPQ_LOCK_ASSERT(start);
- return (fp);
+ TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
+ V_ipq[bucket].count--;
+ if (bucket != start)
+ IPQ_UNLOCK(bucket);
+ break;
}
- if (i != start)
- IPQ_UNLOCK(i);
+ if (bucket != start)
+ IPQ_UNLOCK(bucket);
}
+ IPQ_LOCK_ASSERT(start);
+ return (fp);
}
/*
* Free a fragment reassembly header and all associated datagrams.
*/
static void
-ipq_free(struct ipqhead *fhp, struct ipq *fp)
+ipq_free(struct ipqbucket *bucket, struct ipq *fp)
{
struct mbuf *q;
+ atomic_subtract_int(&nfrags, fp->ipq_nfrags);
while (fp->ipq_frags) {
q = fp->ipq_frags;
fp->ipq_frags = q->m_nextpkt;
m_freem(q);
}
- TAILQ_REMOVE(fhp, fp, ipq_list);
+ TAILQ_REMOVE(&bucket->head, fp, ipq_list);
+ bucket->count--;
uma_zfree(V_ipq_zone, fp);
}
+
+/*
+ * Get or set the maximum number of reassembly queues per bucket.
+ */
+static int
+sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
+{
+ int error, max;
+
+ max = V_ipreass_maxbucketsize;
+ error = sysctl_handle_int(oidp, &max, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (max <= 0)
+ return (EINVAL);
+ V_ipreass_maxbucketsize = max;
+ ipreass_drain_tomax();
+ return (0);
+}
diff --git a/sys/netinet6/frag6.c b/sys/netinet6/frag6.c
index df1f72285faf..1e9192d149c5 100644
--- a/sys/netinet6/frag6.c
+++ b/sys/netinet6/frag6.c
@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/hash.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
@@ -47,6 +48,8 @@ __FBSDID("$FreeBSD$");
#include <sys/kernel.h>
#include <sys/syslog.h>
+#include <machine/atomic.h>
+
#include <net/if.h>
#include <net/if_var.h>
#include <net/netisr.h>
@@ -63,58 +66,110 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
-static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *);
-static void frag6_deq(struct ip6asfrag *);
-static void frag6_insque(struct ip6q *, struct ip6q *);
-static void frag6_remque(struct ip6q *);
-static void frag6_freef(struct ip6q *);
-
-static struct mtx ip6qlock;
/*
- * These fields all protected by ip6qlock.
+ * Reassembly headers are stored in hash buckets.
*/
-static VNET_DEFINE(u_int, frag6_nfragpackets);
-static VNET_DEFINE(u_int, frag6_nfrags);
-static VNET_DEFINE(struct ip6q, ip6q); /* ip6 reassemble queue */
+#define IP6REASS_NHASH_LOG2 10
+#define IP6REASS_NHASH (1 << IP6REASS_NHASH_LOG2)
+#define IP6REASS_HMASK (IP6REASS_NHASH - 1)
+
+static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *,
+ uint32_t bucket __unused);
+static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused);
+static void frag6_insque_head(struct ip6q *, struct ip6q *,
+ uint32_t bucket);
+static void frag6_remque(struct ip6q *, uint32_t bucket);
+static void frag6_freef(struct ip6q *, uint32_t bucket);
+
+struct ip6qbucket {
+ struct ip6q ip6q;
+ struct mtx lock;
+ int count;
+};
+
+static VNET_DEFINE(volatile u_int, frag6_nfragpackets);
+volatile u_int frag6_nfrags = 0;
+static VNET_DEFINE(struct ip6qbucket, ip6q[IP6REASS_NHASH]);
+static VNET_DEFINE(uint32_t, ip6q_hashseed);
#define V_frag6_nfragpackets VNET(frag6_nfragpackets)
-#define V_frag6_nfrags VNET(frag6_nfrags)
#define V_ip6q VNET(ip6q)
+#define V_ip6q_hashseed VNET(ip6q_hashseed)
-#define IP6Q_LOCK_INIT() mtx_init(&ip6qlock, "ip6qlock", NULL, MTX_DEF);
-#define IP6Q_LOCK() mtx_lock(&ip6qlock)
-#define IP6Q_TRYLOCK() mtx_trylock(&ip6qlock)
-#define IP6Q_LOCK_ASSERT() mtx_assert(&ip6qlock, MA_OWNED)
-#define IP6Q_UNLOCK() mtx_unlock(&ip6qlock)
+#define IP6Q_LOCK(i) mtx_lock(&V_ip6q[(i)].lock)
+#define IP6Q_TRYLOCK(i) mtx_trylock(&V_ip6q[(i)].lock)
+#define IP6Q_LOCK_ASSERT(i) mtx_assert(&V_ip6q[(i)].lock, MA_OWNED)
+#define IP6Q_UNLOCK(i) mtx_unlock(&V_ip6q[(i)].lock)
+#define IP6Q_HEAD(i) (&V_ip6q[(i)].ip6q)
static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header");
/*
+ * By default, limit the number of IP6 fragments across all reassembly
+ * queues to 1/32 of the total number of mbuf clusters.
+ *
+ * Limit the total number of reassembly queues per VNET to the
+ * IP6 fragment limit, but ensure the limit will not allow any bucket
+ * to grow above 100 items. (The bucket limit is
+ * IP6_MAXFRAGPACKETS / (IP6REASS_NHASH / 2), so the 50 is the correct
+ * multiplier to reach a 100-item limit.)
+ * The 100-item limit was chosen as brief testing seems to show that
+ * this produces "reasonable" performance on some subset of systems
+ * under DoS attack.
+ */
+#define IP6_MAXFRAGS (nmbclusters / 32)
+#define IP6_MAXFRAGPACKETS (imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50))
+
+/*
* Initialise reassembly queue and fragment identifier.
*/
+void
+frag6_set_bucketsize()
+{
+ int i;
+
+ if ((i = V_ip6_maxfragpackets) > 0)
+ V_ip6_maxfragbucketsize = imax(i / (IP6REASS_NHASH / 2), 1);
+}
+
static void
frag6_change(void *tag)
{
+ VNET_ITERATOR_DECL(vnet_iter);
- V_ip6_maxfragpackets = nmbclusters / 4;
- V_ip6_maxfrags = nmbclusters / 4;
+ ip6_maxfrags = IP6_MAXFRAGS;
+ VNET_LIST_RLOCK_NOSLEEP();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
+ frag6_set_bucketsize();
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK_NOSLEEP();
}
void
frag6_init(void)
{
-
- V_ip6_maxfragpackets = nmbclusters / 4;
- V_ip6_maxfrags = nmbclusters / 4;
- V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q;
-
+ struct ip6q *q6;
+ int i;
+
+ V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
+ frag6_set_bucketsize();
+ for (i = 0; i < IP6REASS_NHASH; i++) {
+ q6 = IP6Q_HEAD(i);
+ q6->ip6q_next = q6->ip6q_prev = q6;
+ mtx_init(&V_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF);
+ V_ip6q[i].count = 0;
+ }
+ V_ip6q_hashseed = arc4random();
+ V_ip6_maxfragsperpacket = 64;
if (!IS_DEFAULT_VNET(curvnet))
return;
+ ip6_maxfrags = IP6_MAXFRAGS;
EVENTHANDLER_REGISTER(nmbclusters_change,
frag6_change, NULL, EVENTHANDLER_PRI_ANY);
-
- IP6Q_LOCK_INIT();
}
/*
@@ -155,12 +210,13 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
struct mbuf *m = *mp, *t;
struct ip6_hdr *ip6;
struct ip6_frag *ip6f;
- struct ip6q *q6;
+ struct ip6q *head, *q6;
struct ip6asfrag *af6, *ip6af, *af6dwn;
struct in6_ifaddr *ia;
int offset = *offp, nxt, i, next;
int first_frag = 0;
int fragoff, frgpartlen; /* must be larger than u_int16_t */
+ uint32_t hash, hashkey[sizeof(struct in6_addr) * 2 + 1], *hashkeyp;
struct ifnet *dstifp;
u_int8_t ecn, ecn0;
#ifdef RSS
@@ -228,19 +284,38 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
return (ip6f->ip6f_nxt);
}
- IP6Q_LOCK();
+ /* Get fragment length and discard 0-byte fragments. */
+ frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset;
+ if (frgpartlen == 0) {
+ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
+ offsetof(struct ip6_hdr, ip6_plen));
+ in6_ifstat_inc(dstifp, ifs6_reass_fail);
+ IP6STAT_INC(ip6s_fragdropped);
+ return IPPROTO_DONE;
+ }
+
+ hashkeyp = hashkey;
+ memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr));
+ hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
+ memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr));
+ hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
+ *hashkeyp = ip6f->ip6f_ident;
+ hash = jenkins_hash32(hashkey, nitems(hashkey), V_ip6q_hashseed);
+ hash &= IP6REASS_HMASK;
+ head = IP6Q_HEAD(hash);
+ IP6Q_LOCK(hash);
/*
* Enforce upper bound on number of fragments.
* If maxfrag is 0, never accept fragments.
* If maxfrag is -1, accept all fragments without limitation.
*/
- if (V_ip6_maxfrags < 0)
+ if (ip6_maxfrags < 0)
;
- else if (V_frag6_nfrags >= (u_int)V_ip6_maxfrags)
+ else if (frag6_nfrags >= (u_int)ip6_maxfrags)
goto dropfrag;
- for (q6 = V_ip6q.ip6q_next; q6 != &V_ip6q; q6 = q6->ip6q_next)
+ for (q6 = head->ip6q_next; q6 != head; q6 = q6->ip6q_next)
if (ip6f->ip6f_ident == q6->ip6q_ident &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst)
@@ -250,7 +325,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
)
break;
- if (q6 == &V_ip6q) {
+ if (q6 == head) {
/*
* the first fragment to arrive, create a reassembly queue.
*/
@@ -265,9 +340,10 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
*/
if (V_ip6_maxfragpackets < 0)
;
- else if (V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets)
+ else if (V_ip6q[hash].count >= V_ip6_maxfragbucketsize ||
+ V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets)
goto dropfrag;
- V_frag6_nfragpackets++;
+ atomic_add_int(&V_frag6_nfragpackets, 1);
q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE,
M_NOWAIT);
if (q6 == NULL)
@@ -280,7 +356,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
}
mac_ip6q_create(m, q6);
#endif
- frag6_insque(q6, &V_ip6q);
+ frag6_insque_head(q6, head, hash);
/* ip6q_nxt will be filled afterwards, from 1st fragment */
q6->ip6q_down = q6->ip6q_up = (struct ip6asfrag *)q6;
@@ -314,21 +390,20 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
* in size.
* If it would exceed, discard the fragment and return an ICMP error.
*/
- frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset;
if (q6->ip6q_unfrglen >= 0) {
/* The 1st fragment has already arrived. */
if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) {
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
offset - sizeof(struct ip6_frag) +
offsetof(struct ip6_frag, ip6f_offlg));
- IP6Q_UNLOCK();
+ IP6Q_UNLOCK(hash);
return (IPPROTO_DONE);
}
} else if (fragoff + frgpartlen > IPV6_MAXPACKET) {
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
offset - sizeof(struct ip6_frag) +
offsetof(struct ip6_frag, ip6f_offlg));
- IP6Q_UNLOCK();
+ IP6Q_UNLOCK(hash);
return (IPPROTO_DONE);
}
/*
@@ -347,7 +422,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
int erroff = af6->ip6af_offset;
/* dequeue the fragment. */
- frag6_deq(af6);
+ frag6_deq(af6, hash);
free(af6, M_FTABLE);
/* adjust pointer. */
@@ -445,7 +520,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
}
af6 = af6->ip6af_down;
m_freem(IP6_REASS_MBUF(af6->ip6af_up));
- frag6_deq(af6->ip6af_up);
+ frag6_deq(af6->ip6af_up, hash);
}
#else
/*
@@ -494,29 +569,38 @@ insert:
/*
* Stick new segment in its place;
* check for complete reassembly.
+ * If not complete, check fragment limit.
* Move to front of packet queue, as we are
* the most recently active fragmented packet.
*/
- frag6_enq(ip6af, af6->ip6af_up);
- V_frag6_nfrags++;
+ frag6_enq(ip6af, af6->ip6af_up, hash);
+ atomic_add_int(&frag6_nfrags, 1);
q6->ip6q_nfrag++;
#if 0 /* xxx */
- if (q6 != V_ip6q.ip6q_next) {
- frag6_remque(q6);
- frag6_insque(q6, &V_ip6q);
+ if (q6 != head->ip6q_next) {
+ frag6_remque(q6, hash);
+ frag6_insque_head(q6, head, hash);
}
#endif
next = 0;
for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
af6 = af6->ip6af_down) {
if (af6->ip6af_off != next) {
- IP6Q_UNLOCK();
+ if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
+ IP6STAT_INC(ip6s_fragdropped);
+ frag6_freef(q6, hash);
+ }
+ IP6Q_UNLOCK(hash);
return IPPROTO_DONE;
}
next += af6->ip6af_frglen;
}
if (af6->ip6af_up->ip6af_mff) {
- IP6Q_UNLOCK();
+ if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
+ IP6STAT_INC(ip6s_fragdropped);
+ frag6_freef(q6, hash);
+ }
+ IP6Q_UNLOCK(hash);
return IPPROTO_DONE;
}
@@ -526,7 +610,7 @@ insert:
ip6af = q6->ip6q_down;
t = m = IP6_REASS_MBUF(ip6af);
af6 = ip6af->ip6af_down;
- frag6_deq(ip6af);
+ frag6_deq(ip6af, hash);
while (af6 != (struct ip6asfrag *)q6) {
m->m_pkthdr.csum_flags &=
IP6_REASS_MBUF(af6)->m_pkthdr.csum_flags;
@@ -534,7 +618,7 @@ insert:
IP6_REASS_MBUF(af6)->m_pkthdr.csum_data;
af6dwn = af6->ip6af_down;
- frag6_deq(af6);
+ frag6_deq(af6, hash);
while (t->m_next)
t = t->m_next;
m_adj(IP6_REASS_MBUF(af6), af6->ip6af_offset);
@@ -560,13 +644,13 @@ insert:
#endif
if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) {
- frag6_remque(q6);
- V_frag6_nfrags -= q6->ip6q_nfrag;
+ frag6_remque(q6, hash);
+ atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
#ifdef MAC
mac_ip6q_destroy(q6);
#endif
free(q6, M_FTABLE);
- V_frag6_nfragpackets--;
+ atomic_subtract_int(&V_frag6_nfragpackets, 1);
goto dropfrag;
}
@@ -579,14 +663,14 @@ insert:
*prvnxtp = nxt;
}
- frag6_remque(q6);
- V_frag6_nfrags -= q6->ip6q_nfrag;
+ frag6_remque(q6, hash);
+ atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
#ifdef MAC
mac_ip6q_reassemble(q6, m);
mac_ip6q_destroy(q6);
#endif
free(q6, M_FTABLE);
- V_frag6_nfragpackets--;
+ atomic_subtract_int(&V_frag6_nfragpackets, 1);
if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */
int plen = 0;
@@ -608,7 +692,7 @@ insert:
m_tag_prepend(m, mtag);
#endif
- IP6Q_UNLOCK();
+ IP6Q_UNLOCK(hash);
IP6STAT_INC(ip6s_reassembled);
in6_ifstat_inc(dstifp, ifs6_reass_ok);
@@ -630,7 +714,7 @@ insert:
return nxt;
dropfrag:
- IP6Q_UNLOCK();
+ IP6Q_UNLOCK(hash);
in6_ifstat_inc(dstifp, ifs6_reass_fail);
IP6STAT_INC(ip6s_fragdropped);
m_freem(m);
@@ -641,19 +725,19 @@ insert:
* Free a fragment reassembly header and all
* associated datagrams.
*/
-void
-frag6_freef(struct ip6q *q6)
+static void
+frag6_freef(struct ip6q *q6, uint32_t bucket)
{
struct ip6asfrag *af6, *down6;
- IP6Q_LOCK_ASSERT();
+ IP6Q_LOCK_ASSERT(bucket);
for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
af6 = down6) {
struct mbuf *m = IP6_REASS_MBUF(af6);
down6 = af6->ip6af_down;
- frag6_deq(af6);
+ frag6_deq(af6, bucket);
/*
* Return ICMP time exceeded error for the 1st fragment.
@@ -675,24 +759,25 @@ frag6_freef(struct ip6q *q6)
m_freem(m);
free(af6, M_FTABLE);
}
- frag6_remque(q6);
- V_frag6_nfrags -= q6->ip6q_nfrag;
+ frag6_remque(q6, bucket);
+ atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
#ifdef MAC
mac_ip6q_destroy(q6);
#endif
free(q6, M_FTABLE);
- V_frag6_nfragpackets--;
+ atomic_subtract_int(&V_frag6_nfragpackets, 1);
}
/*
* Put an ip fragment on a reassembly chain.
* Like insque, but pointers in middle of structure.
*/
-void
-frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6)
+static void
+frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6,
+ uint32_t bucket __unused)
{
- IP6Q_LOCK_ASSERT();
+ IP6Q_LOCK_ASSERT(bucket);
af6->ip6af_up = up6;
af6->ip6af_down = up6->ip6af_down;
@@ -703,36 +788,41 @@ frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6)
/*
* To frag6_enq as remque is to insque.
*/
-void
-frag6_deq(struct ip6asfrag *af6)
+static void
+frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unused)
{
- IP6Q_LOCK_ASSERT();
+ IP6Q_LOCK_ASSERT(bucket);
af6->ip6af_up->ip6af_down = af6->ip6af_down;
af6->ip6af_down->ip6af_up = af6->ip6af_up;
}
-void
-frag6_insque(struct ip6q *new, struct ip6q *old)
+static void
+frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket)
{
- IP6Q_LOCK_ASSERT();
+ IP6Q_LOCK_ASSERT(bucket);
+ KASSERT(IP6Q_HEAD(bucket) == old,
+ ("%s: attempt to insert at head of wrong bucket"
+ " (bucket=%u, old=%p)", __func__, bucket, old));
new->ip6q_prev = old;
new->ip6q_next = old->ip6q_next;
old->ip6q_next->ip6q_prev= new;
old->ip6q_next = new;
+ V_ip6q[bucket].count++;
}
-void
-frag6_remque(struct ip6q *p6)
+static void
+frag6_remque(struct ip6q *p6, uint32_t bucket)
{
- IP6Q_LOCK_ASSERT();
+ IP6Q_LOCK_ASSERT(bucket);
p6->ip6q_prev->ip6q_next = p6->ip6q_next;
p6->ip6q_next->ip6q_prev = p6->ip6q_prev;
+ V_ip6q[bucket].count--;
}
/*
@@ -744,37 +834,71 @@ void
frag6_slowtimo(void)
{
VNET_ITERATOR_DECL(vnet_iter);
- struct ip6q *q6;
+ struct ip6q *head, *q6;
+ int i;
VNET_LIST_RLOCK_NOSLEEP();
- IP6Q_LOCK();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- q6 = V_ip6q.ip6q_next;
- if (q6)
- while (q6 != &V_ip6q) {
+ for (i = 0; i < IP6REASS_NHASH; i++) {
+ IP6Q_LOCK(i);
+ head = IP6Q_HEAD(i);
+ q6 = head->ip6q_next;
+ if (q6 == NULL) {
+ /*
+ * XXXJTL: This should never happen. This
+ * should turn into an assertion.
+ */
+ IP6Q_UNLOCK(i);
+ continue;
+ }
+ while (q6 != head) {
--q6->ip6q_ttl;
q6 = q6->ip6q_next;
if (q6->ip6q_prev->ip6q_ttl == 0) {
IP6STAT_INC(ip6s_fragtimeout);
/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
- frag6_freef(q6->ip6q_prev);
+ frag6_freef(q6->ip6q_prev, i);
}
}
+ /*
+ * If this bucket holds more reassembly queues than
+ * maxfragbucketsize (e.g. the limit was lowered),
+ * drain queues until it is back within the limit.
+ * Note that we drain all reassembly queues if
+ * maxfragpackets is 0 (reassembly is disabled),
+ * and don't enforce a limit when maxfragpackets
+ * is negative.
+ */
+ while ((V_ip6_maxfragpackets == 0 ||
+ (V_ip6_maxfragpackets > 0 &&
+ V_ip6q[i].count > V_ip6_maxfragbucketsize)) &&
+ head->ip6q_prev != head) {
+ IP6STAT_INC(ip6s_fragoverflow);
+ /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+ frag6_freef(head->ip6q_prev, i);
+ }
+ IP6Q_UNLOCK(i);
+ }
/*
- * If we are over the maximum number of fragments
- * (due to the limit being lowered), drain off
- * enough to get down to the new limit.
+ * If we are still over the maximum number of fragmented
+ * packets, drain off enough to get down to the new limit.
*/
- while (V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets &&
- V_ip6q.ip6q_prev) {
- IP6STAT_INC(ip6s_fragoverflow);
- /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
- frag6_freef(V_ip6q.ip6q_prev);
+ i = 0;
+ while (V_ip6_maxfragpackets >= 0 &&
+ V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets) {
+ IP6Q_LOCK(i);
+ head = IP6Q_HEAD(i);
+ if (head->ip6q_prev != head) {
+ IP6STAT_INC(ip6s_fragoverflow);
+ /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+ frag6_freef(head->ip6q_prev, i);
+ }
+ IP6Q_UNLOCK(i);
+ i = (i + 1) % IP6REASS_NHASH;
}
CURVNET_RESTORE();
}
- IP6Q_UNLOCK();
VNET_LIST_RUNLOCK_NOSLEEP();
}
@@ -785,22 +909,25 @@ void
frag6_drain(void)
{
VNET_ITERATOR_DECL(vnet_iter);
+ struct ip6q *head;
+ int i;
VNET_LIST_RLOCK_NOSLEEP();
- if (IP6Q_TRYLOCK() == 0) {
- VNET_LIST_RUNLOCK_NOSLEEP();
- return;
- }
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- while (V_ip6q.ip6q_next != &V_ip6q) {
- IP6STAT_INC(ip6s_fragdropped);
- /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
- frag6_freef(V_ip6q.ip6q_next);
+ for (i = 0; i < IP6REASS_NHASH; i++) {
+ if (IP6Q_TRYLOCK(i) == 0)
+ continue;
+ head = IP6Q_HEAD(i);
+ while (head->ip6q_next != head) {
+ IP6STAT_INC(ip6s_fragdropped);
+ /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+ frag6_freef(head->ip6q_next, i);
+ }
+ IP6Q_UNLOCK(i);
}
CURVNET_RESTORE();
}
- IP6Q_UNLOCK();
VNET_LIST_RUNLOCK_NOSLEEP();
}
diff --git a/sys/netinet6/in6.h b/sys/netinet6/in6.h
index e913c0ab977a..9fe3a4677025 100644
--- a/sys/netinet6/in6.h
+++ b/sys/netinet6/in6.h
@@ -637,7 +637,9 @@ struct ip6_mtuinfo {
#define IPV6CTL_INTRQMAXLEN 51 /* max length of IPv6 netisr queue */
#define IPV6CTL_INTRDQMAXLEN 52 /* max length of direct IPv6 netisr
* queue */
-#define IPV6CTL_MAXID 53
+#define IPV6CTL_MAXFRAGSPERPACKET 53 /* Max fragments per packet */
+#define IPV6CTL_MAXFRAGBUCKETSIZE 54 /* Max reassembly queues per bucket */
+#define IPV6CTL_MAXID 55
#endif /* __BSD_VISIBLE */
/*
diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c
index 44ea1af01c2f..5c65fe9e0552 100644
--- a/sys/netinet6/in6_proto.c
+++ b/sys/netinet6/in6_proto.c
@@ -386,7 +386,9 @@ VNET_DEFINE(int, ip6_no_radr) = 0;
VNET_DEFINE(int, ip6_norbit_raif) = 0;
VNET_DEFINE(int, ip6_rfc6204w3) = 0;
VNET_DEFINE(int, ip6_maxfragpackets); /* initialized in frag6.c:frag6_init() */
-VNET_DEFINE(int, ip6_maxfrags); /* initialized in frag6.c:frag6_init() */
+int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */
+VNET_DEFINE(int, ip6_maxfragbucketsize);/* initialized in frag6.c:frag6_init() */
+VNET_DEFINE(int, ip6_maxfragsperpacket); /* initialized in frag6.c:frag6_init() */
VNET_DEFINE(int, ip6_log_interval) = 5;
VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we
* process? */
@@ -473,6 +475,20 @@ sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS)
return (0);
}
+static int
+sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = V_ip6_maxfragpackets;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || !req->newptr)
+ return (error);
+ V_ip6_maxfragpackets = val;
+ frag6_set_bucketsize();
+ return (0);
+}
+
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0,
"Enable forwarding of IPv6 packets between interfaces");
@@ -485,8 +501,9 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim,
SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat,
ip6stat,
"IP6 statistics (struct ip6stat, netinet6/ip6_var.h)");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
- CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragpackets), 0,
+SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
+ sysctl_ip6_maxfragpackets, "I",
"Default maximum number of outstanding fragmented IPv6 packets. "
"A value of 0 means no fragmented packets will be accepted, while a "
"a value of -1 means no limit");
@@ -560,8 +577,16 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0,
"Use the default scope zone when none is specified");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags,
- CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfrags), 0,
- "Maximum allowed number of outstanding IPv6 packet fragments");
+ CTLFLAG_RW, &ip6_maxfrags, 0,
+ "Maximum allowed number of outstanding IPv6 packet fragments. "
+ "A value of 0 means no fragmented packets will be accepted, while a "
+ "a value of -1 means no limit");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0,
+ "Maximum number of reassembly queues per hash bucket");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0,
+ "Maximum allowed number of fragments per packet");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0,
"Enable path MTU discovery for multicast packets");
diff --git a/sys/netinet6/ip6_var.h b/sys/netinet6/ip6_var.h
index e52a32068560..be2d5fd0ee11 100644
--- a/sys/netinet6/ip6_var.h
+++ b/sys/netinet6/ip6_var.h
@@ -296,8 +296,10 @@ VNET_DECLARE(struct socket *, ip6_mrouter); /* multicast routing daemon */
VNET_DECLARE(int, ip6_sendredirects); /* send IP redirects when forwarding? */
VNET_DECLARE(int, ip6_maxfragpackets); /* Maximum packets in reassembly
* queue */
-VNET_DECLARE(int, ip6_maxfrags); /* Maximum fragments in reassembly
+extern int ip6_maxfrags; /* Maximum fragments in reassembly
* queue */
+VNET_DECLARE(int, ip6_maxfragbucketsize); /* Maximum reassembly queues per bucket */
+VNET_DECLARE(int, ip6_maxfragsperpacket); /* Maximum fragments per packet */
VNET_DECLARE(int, ip6_accept_rtadv); /* Acts as a host not a router */
VNET_DECLARE(int, ip6_no_radr); /* No defroute from RA */
VNET_DECLARE(int, ip6_norbit_raif); /* Disable R-bit in NA on RA
@@ -312,7 +314,8 @@ VNET_DECLARE(int, ip6_dad_count); /* DupAddrDetectionTransmits */
#define V_ip6_mrouter VNET(ip6_mrouter)
#define V_ip6_sendredirects VNET(ip6_sendredirects)
#define V_ip6_maxfragpackets VNET(ip6_maxfragpackets)
-#define V_ip6_maxfrags VNET(ip6_maxfrags)
+#define V_ip6_maxfragbucketsize VNET(ip6_maxfragbucketsize)
+#define V_ip6_maxfragsperpacket VNET(ip6_maxfragsperpacket)
#define V_ip6_accept_rtadv VNET(ip6_accept_rtadv)
#define V_ip6_no_radr VNET(ip6_no_radr)
#define V_ip6_norbit_raif VNET(ip6_norbit_raif)
@@ -399,6 +402,7 @@ int ip6_fragment(struct ifnet *, struct mbuf *, int, u_char, int,
int route6_input(struct mbuf **, int *, int);
+void frag6_set_bucketsize(void);
void frag6_init(void);
int frag6_input(struct mbuf **, int *, int);
void frag6_slowtimo(void);
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 9f4fe44f362c..7e6a02620510 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -290,6 +290,27 @@ vm_page_blacklist_next(char **list, char *end)
return (0);
}
+bool
+vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
+{
+ vm_page_t m;
+ int ret;
+
+ m = vm_phys_paddr_to_vm_page(pa);
+ if (m == NULL)
+ return (true); /* page does not exist, no failure */
+
+ mtx_lock(&vm_page_queue_free_mtx);
+ ret = vm_phys_unfree_page(m);
+ mtx_unlock(&vm_page_queue_free_mtx);
+ if (ret) {
+ TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
+ if (verbose)
+ printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
+ }
+ return (ret);
+}
+
/*
* vm_page_blacklist_check:
*
@@ -301,26 +322,13 @@ static void
vm_page_blacklist_check(char *list, char *end)
{
vm_paddr_t pa;
- vm_page_t m;
char *next;
- int ret;
next = list;
while (next != NULL) {
if ((pa = vm_page_blacklist_next(&next, end)) == 0)
continue;
- m = vm_phys_paddr_to_vm_page(pa);
- if (m == NULL)
- continue;
- mtx_lock(&vm_page_queue_free_mtx);
- ret = vm_phys_unfree_page(m);
- mtx_unlock(&vm_page_queue_free_mtx);
- if (ret == TRUE) {
- TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
- if (bootverbose)
- printf("Skipping page with pa 0x%jx\n",
- (uintmax_t)pa);
- }
+ vm_page_blacklist_add(pa, bootverbose);
}
}
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 1ee8dde3a6a0..f34eedebb1bd 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -448,6 +448,7 @@ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
vm_paddr_t boundary, vm_memattr_t memattr);
vm_page_t vm_page_alloc_freelist(int, int);
+bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose);
vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
int vm_page_try_to_free (vm_page_t);
void vm_page_deactivate (vm_page_t);
diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h
index 04b24897934d..d04a9f525581 100644
--- a/sys/x86/include/specialreg.h
+++ b/sys/x86/include/specialreg.h
@@ -378,6 +378,7 @@
*/
#define CPUID_STDEXT3_IBPB 0x04000000
#define CPUID_STDEXT3_STIBP 0x08000000
+#define CPUID_STDEXT3_L1D_FLUSH 0x10000000
#define CPUID_STDEXT3_ARCH_CAP 0x20000000
/* MSR IA32_ARCH_CAP(ABILITIES) bits */
@@ -427,6 +428,7 @@
#define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */
#define MSR_MTRRcap 0x0fe
#define MSR_IA32_ARCH_CAP 0x10a
+#define MSR_IA32_FLUSH_CMD 0x10b
#define MSR_BBL_CR_ADDR 0x116
#define MSR_BBL_CR_DECC 0x118
#define MSR_BBL_CR_CTL 0x119
@@ -580,6 +582,9 @@
/* MSR IA32_PRED_CMD */
#define IA32_PRED_CMD_IBPB_BARRIER 0x0000000000000001ULL
+/* MSR IA32_FLUSH_CMD */
+#define IA32_FLUSH_CMD_L1D 0x00000001
+
/*
* PAT modes.
*/