From d362c24162cea6fe61865fc6b67e07c405855a37 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld"
Date: Mon, 3 Jul 2023 03:27:04 +0200
Subject: wireguard: queueing: use saner cpu selection wrapping

Using `% nr_cpumask_bits` is slow and complicated, and not totally
robust toward dynamic changes to CPU topologies. Rather than storing
the next CPU in the round-robin, just store the last one, and also
return that value. This simplifies the loop drastically into a much
more common pattern.

Fixes: a8f1bc7bdea3 ("net: WireGuard secure network tunnel")
Cc: stable@vger.kernel.org
Reported-by: Linus Torvalds
Tested-by: Manuel Leiner
Signed-off-by: Jason A. Donenfeld
Signed-off-by: David S. Miller
---
 drivers/net/wireguard/queueing.h | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

(limited to 'drivers/net/wireguard/queueing.h')

diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h
index 125284b..1ea4f87 100644
--- a/drivers/net/wireguard/queueing.h
+++ b/drivers/net/wireguard/queueing.h
@@ -117,20 +117,17 @@ static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id)
 	return cpu;
 }
 
-/* This function is racy, in the sense that next is unlocked, so it could return
- * the same CPU twice. A race-free version of this would be to instead store an
- * atomic sequence number, do an increment-and-return, and then iterate through
- * every possible CPU until we get to that index -- choose_cpu. However that's
- * a bit slower, and it doesn't seem like this potential race actually
- * introduces any performance loss, so we live with it.
+/* This function is racy, in the sense that it's called while last_cpu is
+ * unlocked, so it could return the same CPU twice. Adding locking or using
+ * atomic sequence numbers is slower though, and the consequences of racing are
+ * harmless, so live with it.
  */
-static inline int wg_cpumask_next_online(int *next)
+static inline int wg_cpumask_next_online(int *last_cpu)
 {
-	int cpu = *next;
-
-	while (unlikely(!cpumask_test_cpu(cpu, cpu_online_mask)))
-		cpu = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
-	*next = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
+	int cpu = cpumask_next(*last_cpu, cpu_online_mask);
+	if (cpu >= nr_cpu_ids)
+		cpu = cpumask_first(cpu_online_mask);
+	*last_cpu = cpu;
 	return cpu;
 }
 
@@ -159,7 +156,7 @@ static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue)
 
 static inline int wg_queue_enqueue_per_device_and_peer(
 	struct crypt_queue *device_queue, struct prev_queue *peer_queue,
-	struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu)
+	struct sk_buff *skb, struct workqueue_struct *wq)
 {
 	int cpu;
 
@@ -173,7 +170,7 @@ static inline int wg_queue_enqueue_per_device_and_peer(
 	/* Then we queue it up in the device queue, which consumes the
 	 * packet as soon as it can.
 	 */
-	cpu = wg_cpumask_next_online(next_cpu);
+	cpu = wg_cpumask_next_online(&device_queue->last_cpu);
 	if (unlikely(ptr_ring_produce_bh(&device_queue->ring, skb)))
 		return -EPIPE;
 	queue_work_on(cpu, wq, &per_cpu_ptr(device_queue->worker, cpu)->work);
-- 
cgit v1.2.3
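
For readers following along outside the kernel tree, below is a minimal userspace sketch of the wrap-around round-robin pattern the patch switches to. The NR_CPUS value, online_mask bitmask, and the mask_first()/mask_next()/next_online() helpers are hypothetical stand-ins for nr_cpu_ids, cpu_online_mask, cpumask_first(), cpumask_next(), and wg_cpumask_next_online(); only the control flow mirrors the patched function.

/*
 * Illustrative userspace sketch only -- not kernel code. A plain bitmask
 * stands in for cpu_online_mask, and small helpers stand in for
 * cpumask_first()/cpumask_next(). All names here are hypothetical.
 */
#include <stdio.h>

#define NR_CPUS 8

/* Pretend CPUs 1, 2, 5 and 6 are online. */
static const unsigned int online_mask = (1u << 1) | (1u << 2) | (1u << 5) | (1u << 6);

/* Stand-in for cpumask_first(): lowest set bit, or NR_CPUS if none. */
static int mask_first(unsigned int mask)
{
	for (int cpu = 0; cpu < NR_CPUS; ++cpu)
		if (mask & (1u << cpu))
			return cpu;
	return NR_CPUS;
}

/* Stand-in for cpumask_next(): lowest set bit strictly after @cpu. */
static int mask_next(int cpu, unsigned int mask)
{
	for (int next = cpu + 1; next < NR_CPUS; ++next)
		if (mask & (1u << next))
			return next;
	return NR_CPUS;
}

/* Same shape as the patched wg_cpumask_next_online(). */
static int next_online(int *last_cpu)
{
	int cpu = mask_next(*last_cpu, online_mask);
	if (cpu >= NR_CPUS)
		cpu = mask_first(online_mask);
	*last_cpu = cpu;
	return cpu;
}

int main(void)
{
	int last_cpu = -1;

	/* Prints 1 2 5 6 1 2 5 6: advances past the last result and wraps. */
	for (int i = 0; i < 8; ++i)
		printf("%d ", next_online(&last_cpu));
	printf("\n");
	return 0;
}

In this sketch, starting last_cpu at -1 makes the first call land on the first online CPU; because the stored value is simply the previous result, the `% nr_cpumask_bits` arithmetic and the retry loop of the old code are no longer needed, and the caller gets the chosen CPU directly from the return value.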