Commit fc6b6a8
um: ubd: Submit all data segments atomically
Internally, UBD treats each physical IO segment as a separate command to
be submitted in the execution pipe. If the pipe returns a transient
error after a few segments have already been written, UBD will tell the
block layer to requeue the request, but there is no way to reclaim the
segments already submitted. When a new attempt to dispatch the request
is made, the segments already submitted get duplicated, causing the
WARN_ON below in the best case, and potentially data corruption.

On my system, running a UML instance with 2GB of RAM and a 50M UBD
disk, I can reproduce the WARN_ON by simply running mkfs.vfat against
the disk on a freshly booted system.

There are a few ways to work around this: reducing the pressure on the
pipe by reducing the queue depth, which almost eliminates the
occurrence of the problem; increasing the pipe buffer size on the host
system; or limiting the request to one physical segment, which causes
the block layer to submit far more requests to resolve a single
operation.

Instead, this patch modifies the format of a UBD command, such that all
segments are sent through a single element in the communication pipe,
making the command submission atomic from the point of view of the
block layer. The new format has a variable size, depending on the
number of segments, and looks like this:

  +------------+-----------+-----------+------------
  | cmd_header | segment 0 | segment 1 | segment ...
  +------------+-----------+-----------+------------

With this format, we push a pointer to cmd_header in the submission
pipe. This has the advantage of reducing the memory footprint of
executing a single request, since it allows us to merge some fields in
the header. It is possible to reduce each segment's memory footprint
even further, by merging bitmap_words and cow_offset, for instance, but
this is not the focus of this patch and is left as future work.
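The layout can be illustrated with a small standalone sketch. This is a
minimal userspace approximation of the idea, not the kernel code:
demo_header and demo_segment are illustrative stand-ins for cmd_header
and the segment descriptors (io_thread_req and io_desc in the diff
below), and an ordinary pipe stands in for the submission pipe. The
point it demonstrates is that one allocation carries the header plus
every segment via a flexible array member, and only a pointer-sized
value crosses the pipe:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct demo_segment {
	char *buffer;
	unsigned long length;
};

struct demo_header {
	int desc_cnt;
	/* flexible array member: must be last, segments follow in memory */
	struct demo_segment segs[];
};

int main(void)
{
	int fds[2];
	int nsegs = 3, i;
	struct demo_header *cmd, *rcvd;

	if (pipe(fds))
		return 1;

	/* one allocation covers the header and every segment */
	cmd = malloc(sizeof(*cmd) + nsegs * sizeof(struct demo_segment));
	if (!cmd)
		return 1;
	cmd->desc_cnt = nsegs;
	for (i = 0; i < nsegs; i++) {
		cmd->segs[i].buffer = NULL;
		cmd->segs[i].length = 4096;
	}

	/* only the pointer crosses the pipe: a single write submits everything */
	if (write(fds[1], &cmd, sizeof(cmd)) != sizeof(cmd))
		return 1;

	if (read(fds[0], &rcvd, sizeof(rcvd)) != sizeof(rcvd))
		return 1;
	printf("got command with %d segments\n", rcvd->desc_cnt);

	free(cmd);
	return 0;
}

Since a pointer is far smaller than PIPE_BUF, the single write either
enters the pipe in full or fails with no side effects, which is the
all-or-nothing property the block layer needs for safe requeueing.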
One issue with the patch is that, for a big number of segments, we now
perform one big memory allocation instead of multiple small ones, but I
wasn't able to trigger any real issue or -ENOMEM failure with this
change that couldn't also be reproduced without it.

This was tested using fio with the verify-crc32 option, and by running
an ext4 filesystem over this UBD device. The original WARN_ON was:

------------[ cut here ]------------
WARNING: CPU: 0 PID: 0 at lib/refcount.c:28 refcount_warn_saturate+0x13f/0x141
refcount_t: underflow; use-after-free.
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted 5.5.0-rc6-00002-g2a5bb2cf75c8 #346
Stack:
 6084eed0 6063dc77 00000009 6084ef60
 00000000 604b8d9f 6084eee0 6063dcbc
 6084ef40 6006ab8d e013d780 1c00000000
Call Trace:
 [<600a0c1c>] ? printk+0x0/0x94
 [<6004a888>] show_stack+0x13b/0x155
 [<6063dc77>] ? dump_stack_print_info+0xdf/0xe8
 [<604b8d9f>] ? refcount_warn_saturate+0x13f/0x141
 [<6063dcbc>] dump_stack+0x2a/0x2c
 [<6006ab8d>] __warn+0x107/0x134
 [<6008da6c>] ? wake_up_process+0x17/0x19
 [<60487628>] ? blk_queue_max_discard_sectors+0x0/0xd
 [<6006b05f>] warn_slowpath_fmt+0xd1/0xdf
 [<6006af8e>] ? warn_slowpath_fmt+0x0/0xdf
 [<600acc14>] ? raw_read_seqcount_begin.constprop.0+0x0/0x15
 [<600619ae>] ? os_nsecs+0x1d/0x2b
 [<604b8d9f>] refcount_warn_saturate+0x13f/0x141
 [<6048bc8f>] refcount_sub_and_test.constprop.0+0x2f/0x37
 [<6048c8de>] blk_mq_free_request+0xf1/0x10d
 [<6048ca06>] __blk_mq_end_request+0x10c/0x114
 [<6005ac0f>] ubd_intr+0xb5/0x169
 [<600a1a37>] __handle_irq_event_percpu+0x6b/0x17e
 [<600a1b70>] handle_irq_event_percpu+0x26/0x69
 [<600a1bd9>] handle_irq_event+0x26/0x34
 [<600a1bb3>] ? handle_irq_event+0x0/0x34
 [<600a5186>] ? unmask_irq+0x0/0x37
 [<600a57e6>] handle_edge_irq+0xbc/0xd6
 [<600a131a>] generic_handle_irq+0x21/0x29
 [<60048f6e>] do_IRQ+0x39/0x54
 [...]
---[ end trace c6e7444e55386c0f ]---

Cc: Christopher Obbard <[email protected]>
Reported-by: Martyn Welch <[email protected]>
Signed-off-by: Gabriel Krisman Bertazi <[email protected]>
Tested-by: Christopher Obbard <[email protected]>
Acked-by: Anton Ivanov <[email protected]>
Signed-off-by: Richard Weinberger <[email protected]>

1 file changed: +115 -76 lines changed
arch/um/drivers/ubd_kern.c

@@ -47,18 +47,25 @@
 /* Max request size is determined by sector mask - 32K */
 #define UBD_MAX_REQUEST (8 * sizeof(long))
 
+struct io_desc {
+	char *buffer;
+	unsigned long length;
+	unsigned long sector_mask;
+	unsigned long long cow_offset;
+	unsigned long bitmap_words[2];
+};
+
 struct io_thread_req {
 	struct request *req;
 	int fds[2];
 	unsigned long offsets[2];
 	unsigned long long offset;
-	unsigned long length;
-	char *buffer;
 	int sectorsize;
-	unsigned long sector_mask;
-	unsigned long long cow_offset;
-	unsigned long bitmap_words[2];
 	int error;
+
+	int desc_cnt;
+	/* io_desc has to be the last element of the struct */
+	struct io_desc io_desc[];
 };
 
 
@@ -525,12 +532,7 @@ static void ubd_handler(void)
 				blk_queue_max_write_zeroes_sectors(io_req->req->q, 0);
 				blk_queue_flag_clear(QUEUE_FLAG_DISCARD, io_req->req->q);
 			}
-			if ((io_req->error) || (io_req->buffer == NULL))
-				blk_mq_end_request(io_req->req, io_req->error);
-			else {
-				if (!blk_update_request(io_req->req, io_req->error, io_req->length))
-					__blk_mq_end_request(io_req->req, io_req->error);
-			}
+			blk_mq_end_request(io_req->req, io_req->error);
 			kfree(io_req);
 		}
 	}
@@ -946,6 +948,7 @@ static int ubd_add(int n, char **error_out)
 	blk_queue_write_cache(ubd_dev->queue, true, false);
 
 	blk_queue_max_segments(ubd_dev->queue, MAX_SG);
+	blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1);
 	err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]);
 	if(err){
 		*error_out = "Failed to register device";
@@ -1289,64 +1292,116 @@ static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask,
 	*cow_offset += bitmap_offset;
 }
 
-static void cowify_req(struct io_thread_req *req, unsigned long *bitmap,
+static void cowify_req(struct io_thread_req *req, struct io_desc *segment,
+		       unsigned long offset, unsigned long *bitmap,
 		       __u64 bitmap_offset, __u64 bitmap_len)
 {
-	__u64 sector = req->offset >> SECTOR_SHIFT;
+	__u64 sector = offset >> SECTOR_SHIFT;
 	int i;
 
-	if (req->length > (sizeof(req->sector_mask) * 8) << SECTOR_SHIFT)
+	if (segment->length > (sizeof(segment->sector_mask) * 8) << SECTOR_SHIFT)
 		panic("Operation too long");
 
 	if (req_op(req->req) == REQ_OP_READ) {
-		for (i = 0; i < req->length >> SECTOR_SHIFT; i++) {
+		for (i = 0; i < segment->length >> SECTOR_SHIFT; i++) {
 			if(ubd_test_bit(sector + i, (unsigned char *) bitmap))
 				ubd_set_bit(i, (unsigned char *)
-					    &req->sector_mask);
+					    &segment->sector_mask);
+		}
+	} else {
+		cowify_bitmap(offset, segment->length, &segment->sector_mask,
+			      &segment->cow_offset, bitmap, bitmap_offset,
+			      segment->bitmap_words, bitmap_len);
+	}
+}
+
+static void ubd_map_req(struct ubd *dev, struct io_thread_req *io_req,
+			struct request *req)
+{
+	struct bio_vec bvec;
+	struct req_iterator iter;
+	int i = 0;
+	unsigned long byte_offset = io_req->offset;
+	int op = req_op(req);
+
+	if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD) {
+		io_req->io_desc[0].buffer = NULL;
+		io_req->io_desc[0].length = blk_rq_bytes(req);
+	} else {
+		rq_for_each_segment(bvec, req, iter) {
+			BUG_ON(i >= io_req->desc_cnt);
+
+			io_req->io_desc[i].buffer =
+				page_address(bvec.bv_page) + bvec.bv_offset;
+			io_req->io_desc[i].length = bvec.bv_len;
+			i++;
+		}
+	}
+
+	if (dev->cow.file) {
+		for (i = 0; i < io_req->desc_cnt; i++) {
+			cowify_req(io_req, &io_req->io_desc[i], byte_offset,
+				   dev->cow.bitmap, dev->cow.bitmap_offset,
+				   dev->cow.bitmap_len);
+			byte_offset += io_req->io_desc[i].length;
 		}
+
 	}
-	else cowify_bitmap(req->offset, req->length, &req->sector_mask,
-			   &req->cow_offset, bitmap, bitmap_offset,
-			   req->bitmap_words, bitmap_len);
 }
 
-static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req,
-			     u64 off, struct bio_vec *bvec)
+static struct io_thread_req *ubd_alloc_req(struct ubd *dev, struct request *req,
+					   int desc_cnt)
 {
-	struct ubd *dev = hctx->queue->queuedata;
 	struct io_thread_req *io_req;
-	int ret;
+	int i;
 
-	io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC);
+	io_req = kmalloc(sizeof(*io_req) +
+			 (desc_cnt * sizeof(struct io_desc)),
			 GFP_ATOMIC);
 	if (!io_req)
-		return -ENOMEM;
+		return NULL;
 
 	io_req->req = req;
 	if (dev->cow.file)
 		io_req->fds[0] = dev->cow.fd;
 	else
 		io_req->fds[0] = dev->fd;
 	io_req->error = 0;
-
-	if (bvec != NULL) {
-		io_req->buffer = page_address(bvec->bv_page) + bvec->bv_offset;
-		io_req->length = bvec->bv_len;
-	} else {
-		io_req->buffer = NULL;
-		io_req->length = blk_rq_bytes(req);
-	}
-
 	io_req->sectorsize = SECTOR_SIZE;
 	io_req->fds[1] = dev->fd;
-	io_req->cow_offset = -1;
-	io_req->offset = off;
-	io_req->sector_mask = 0;
+	io_req->offset = (u64) blk_rq_pos(req) << SECTOR_SHIFT;
 	io_req->offsets[0] = 0;
 	io_req->offsets[1] = dev->cow.data_offset;
 
-	if (dev->cow.file)
-		cowify_req(io_req, dev->cow.bitmap,
-			   dev->cow.bitmap_offset, dev->cow.bitmap_len);
+	for (i = 0 ; i < desc_cnt; i++) {
+		io_req->io_desc[i].sector_mask = 0;
+		io_req->io_desc[i].cow_offset = -1;
+	}
+
+	return io_req;
+}
+
+static int ubd_submit_request(struct ubd *dev, struct request *req)
+{
+	int segs = 0;
+	struct io_thread_req *io_req;
+	int ret;
+	int op = req_op(req);
+
+	if (op == REQ_OP_FLUSH)
+		segs = 0;
+	else if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD)
+		segs = 1;
+	else
+		segs = blk_rq_nr_phys_segments(req);
+
+	io_req = ubd_alloc_req(dev, req, segs);
+	if (!io_req)
+		return -ENOMEM;
+
+	io_req->desc_cnt = segs;
+	if (segs)
+		ubd_map_req(dev, io_req, req);
 
 	ret = os_write_file(thread_fd, &io_req, sizeof(io_req));
 	if (ret != sizeof(io_req)) {
@@ -1357,22 +1412,6 @@ static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req,
 	return ret;
 }
 
-static int queue_rw_req(struct blk_mq_hw_ctx *hctx, struct request *req)
-{
-	struct req_iterator iter;
-	struct bio_vec bvec;
-	int ret;
-	u64 off = (u64)blk_rq_pos(req) << SECTOR_SHIFT;
-
-	rq_for_each_segment(bvec, req, iter) {
-		ret = ubd_queue_one_vec(hctx, req, off, &bvec);
-		if (ret < 0)
-			return ret;
-		off += bvec.bv_len;
-	}
-	return 0;
-}
-
 static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
 				 const struct blk_mq_queue_data *bd)
 {
@@ -1385,17 +1424,12 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
 	spin_lock_irq(&ubd_dev->lock);
 
 	switch (req_op(req)) {
-	/* operations with no lentgth/offset arguments */
 	case REQ_OP_FLUSH:
-		ret = ubd_queue_one_vec(hctx, req, 0, NULL);
-		break;
 	case REQ_OP_READ:
 	case REQ_OP_WRITE:
-		ret = queue_rw_req(hctx, req);
-		break;
 	case REQ_OP_DISCARD:
 	case REQ_OP_WRITE_ZEROES:
-		ret = ubd_queue_one_vec(hctx, req, (u64)blk_rq_pos(req) << 9, NULL);
+		ret = ubd_submit_request(ubd_dev, req);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1483,22 +1517,22 @@ static int map_error(int error_code)
  * will result in unpredictable behaviour and/or crashes.
  */
 
-static int update_bitmap(struct io_thread_req *req)
+static int update_bitmap(struct io_thread_req *req, struct io_desc *segment)
 {
 	int n;
 
-	if(req->cow_offset == -1)
+	if (segment->cow_offset == -1)
 		return map_error(0);
 
-	n = os_pwrite_file(req->fds[1], &req->bitmap_words,
-			   sizeof(req->bitmap_words), req->cow_offset);
-	if (n != sizeof(req->bitmap_words))
+	n = os_pwrite_file(req->fds[1], &segment->bitmap_words,
+			   sizeof(segment->bitmap_words), segment->cow_offset);
+	if (n != sizeof(segment->bitmap_words))
 		return map_error(-n);
 
 	return map_error(0);
 }
 
-static void do_io(struct io_thread_req *req)
+static void do_io(struct io_thread_req *req, struct io_desc *desc)
 {
 	char *buf = NULL;
 	unsigned long len;
@@ -1513,21 +1547,20 @@ static void do_io(struct io_thread_req *req)
 		return;
 	}
 
-	nsectors = req->length / req->sectorsize;
+	nsectors = desc->length / req->sectorsize;
 	start = 0;
 	do {
-		bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask);
+		bit = ubd_test_bit(start, (unsigned char *) &desc->sector_mask);
 		end = start;
 		while((end < nsectors) &&
-		      (ubd_test_bit(end, (unsigned char *)
-				    &req->sector_mask) == bit))
+		      (ubd_test_bit(end, (unsigned char *) &desc->sector_mask) == bit))
 			end++;
 
 		off = req->offset + req->offsets[bit] +
 			start * req->sectorsize;
 		len = (end - start) * req->sectorsize;
-		if (req->buffer != NULL)
-			buf = &req->buffer[start * req->sectorsize];
+		if (desc->buffer != NULL)
+			buf = &desc->buffer[start * req->sectorsize];
 
 		switch (req_op(req->req)) {
 		case REQ_OP_READ:
@@ -1567,7 +1600,8 @@ static void do_io(struct io_thread_req *req)
 		start = end;
 	} while(start < nsectors);
 
-	req->error = update_bitmap(req);
+	req->offset += len;
+	req->error = update_bitmap(req, desc);
 }
 
 /* Changed in start_io_thread, which is serialized by being called only
@@ -1600,8 +1634,13 @@ int io_thread(void *arg)
 	}
 
 	for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
+		struct io_thread_req *req = (*io_req_buffer)[count];
+		int i;
+
 		io_count++;
-		do_io((*io_req_buffer)[count]);
+		for (i = 0; !req->error && i < req->desc_cnt; i++)
+			do_io(req, &(req->io_desc[i]));
+
 	}
 
 	written = 0;
