Skip to content

Commit e58e284

Browse files
mrpre
authored and
Kernel Patches Daemon
committed
bpf, sockmap: Fix concurrency issues between memory charge and uncharge
Triggering WARN_ON_ONCE(sk->sk_forward_alloc) by running the following command, followed by pressing Ctrl-C after 2 seconds: ./bench sockmap -c 2 -p 1 -a --rx-verdict-ingress ''' ------------[ cut here ]------------ WARNING: CPU: 2 PID: 40 at net/ipv4/af_inet.c inet_sock_destruct Call Trace: <TASK> __sk_destruct+0x46/0x222 sk_psock_destroy+0x22f/0x242 process_one_work+0x504/0x8a8 ? process_one_work+0x39d/0x8a8 ? __pfx_process_one_work+0x10/0x10 ? worker_thread+0x44/0x2ae ? __list_add_valid_or_report+0x83/0xea ? srso_return_thunk+0x5/0x5f ? __list_add+0x45/0x52 process_scheduled_works+0x73/0x82 worker_thread+0x1ce/0x2ae ''' Reason: When we are in the backlog process, we allocate sk_msg and then perform the charge process. Meanwhile, in the user process context, the recvmsg() operation performs the uncharge process, leading to concurrency issues between them. The charge process (2 functions): 1. sk_rmem_schedule(size) -> sk_forward_alloc increases by PAGE_SIZE multiples 2. sk_mem_charge(size) -> sk_forward_alloc -= size The uncharge process (sk_mem_uncharge()): 3. sk_forward_alloc += size 4. check if sk_forward_alloc > PAGE_SIZE 5. reclaim -> sk_forward_alloc decreases, possibly becoming 0 Because the sk performing charge and uncharge is not locked (mainly because the backlog process does not lock the socket), therefore, steps 1 to 5 will execute concurrently as follows: cpu0 cpu1 1 3 4 --> sk_forward_alloc >= PAGE_SIZE 5 --> reclaim sk_forward_alloc 2 --> sk_forward_alloc may become negative Solution: 1. Add locking to the kfree_sk_msg() process, which is only called in the user process context. 2. Integrate the charge process into sk_psock_create_ingress_msg() in the backlog process and add locking. 3. Reuse the existing psock->ingress_lock. Fixes: 799aa7f ("skmsg: Avoid lock_sock() in sk_psock_backlog()") Signed-off-by: Jiayuan Chen <[email protected]>
1 parent 5ad6ff6 commit e58e284

File tree

2 files changed

+30
-14
lines changed

2 files changed

+30
-14
lines changed

include/linux/skmsg.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -378,10 +378,13 @@ static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
378378
return psock ? list_empty(&psock->ingress_msg) : true;
379379
}
380380

381-
static inline void kfree_sk_msg(struct sk_msg *msg)
381+
static inline void kfree_sk_msg(struct sk_psock *psock, struct sk_msg *msg)
382382
{
383-
if (msg->skb)
383+
if (msg->skb) {
384+
spin_lock_bh(&psock->ingress_lock);
384385
consume_skb(msg->skb);
386+
spin_unlock_bh(&psock->ingress_lock);
387+
}
385388
kfree(msg);
386389
}
387390

net/core/skmsg.c

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
480480
msg_rx->sg.start = i;
481481
if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) {
482482
msg_rx = sk_psock_dequeue_msg(psock);
483-
kfree_sk_msg(msg_rx);
483+
kfree_sk_msg(psock, msg_rx);
484484
}
485485
msg_rx = sk_psock_peek_msg(psock);
486486
}
@@ -514,16 +514,36 @@ static struct sk_msg *alloc_sk_msg(gfp_t gfp)
514514
return msg;
515515
}
516516

517-
static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
517+
static struct sk_msg *sk_psock_create_ingress_msg(struct sk_psock *psock,
518+
struct sock *sk,
518519
struct sk_buff *skb)
519520
{
521+
spin_lock_bh(&psock->ingress_lock);
520522
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
521-
return NULL;
523+
goto unlock_err;
522524

523525
if (!sk_rmem_schedule(sk, skb, skb->truesize))
524-
return NULL;
526+
goto unlock_err;
527+
528+
/* This will transition ownership of the data from the socket where
529+
* the BPF program was run initiating the redirect to the socket
530+
* we will eventually receive this data on. The data will be released
531+
* from consume_skb whether in sk_msg_recvmsg() after its been copied
532+
* into user buffers or in other skb release processes.
533+
*/
534+
skb_set_owner_r(skb, sk);
535+
spin_unlock_bh(&psock->ingress_lock);
525536

537+
/* sk_msg itself is not under sk memory limitations and we only concern
538+
* sk_msg->skb, hence no lock protection is needed here. Furthermore,
539+
* adding a ingress_lock would trigger a warning from lockdep about
540+
* 'softirq-safe to softirq-unsafe'.
541+
*/
526542
return alloc_sk_msg(GFP_KERNEL);
543+
544+
unlock_err:
545+
spin_unlock_bh(&psock->ingress_lock);
546+
return NULL;
527547
}
528548

529549
static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
@@ -585,17 +605,10 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
585605
*/
586606
if (unlikely(skb->sk == sk))
587607
return sk_psock_skb_ingress_self(psock, skb, off, len, true);
588-
msg = sk_psock_create_ingress_msg(sk, skb);
608+
msg = sk_psock_create_ingress_msg(psock, sk, skb);
589609
if (!msg)
590610
return -EAGAIN;
591611

592-
/* This will transition ownership of the data from the socket where
593-
* the BPF program was run initiating the redirect to the socket
594-
* we will eventually receive this data on. The data will be released
595-
* from skb_consume found in __tcp_bpf_recvmsg() after its been copied
596-
* into user buffers.
597-
*/
598-
skb_set_owner_r(skb, sk);
599612
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true);
600613
if (err < 0)
601614
kfree(msg);

0 commit comments

Comments
 (0)