Skip to content

Commit 3db1de0

Browse files
Daeho JeongJaegeuk Kim
Daeho Jeong
authored and
Jaegeuk Kim
committed
f2fs: change the current atomic write way
The current atomic write implementation has three major issues, listed below. - It keeps the updates in non-reclaimable memory, and those pages are also hard to migrate, which is bad for contiguous memory allocation. - Disk space used for atomic files cannot be garbage collected, which makes it difficult for the filesystem to be defragmented. - If atomic write operations hit the threshold of either memory usage or garbage collection failure count, all the atomic write operations will fail immediately. To resolve these issues, I will keep a COW inode internally for all the updates, so that they can be flushed out of memory when needed, for example under high memory pressure. These COW inodes will be tagged as orphan inodes so that they are reclaimed in case of a sudden power-cut or system failure during atomic writes. Signed-off-by: Daeho Jeong <[email protected]> Signed-off-by: Jaegeuk Kim <[email protected]>
1 parent 6213f5d commit 3db1de0

File tree

13 files changed

+303
-446
lines changed

13 files changed

+303
-446
lines changed

fs/f2fs/data.c

Lines changed: 115 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,7 @@ static bool __is_cp_guaranteed(struct page *page)
6969

7070
if (f2fs_is_compressed_page(page))
7171
return false;
72-
if ((S_ISREG(inode->i_mode) &&
73-
(f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) ||
72+
if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) ||
7473
page_private_gcing(page))
7574
return true;
7675
return false;
@@ -2563,7 +2562,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
25632562
bool ipu_force = false;
25642563
int err = 0;
25652564

2566-
set_new_dnode(&dn, inode, NULL, NULL, 0);
2565+
/* Use COW inode to make dnode_of_data for atomic write */
2566+
if (f2fs_is_atomic_file(inode))
2567+
set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0);
2568+
else
2569+
set_new_dnode(&dn, inode, NULL, NULL, 0);
2570+
25672571
if (need_inplace_update(fio) &&
25682572
f2fs_lookup_extent_cache(inode, page->index, &ei)) {
25692573
fio->old_blkaddr = ei.blk + page->index - ei.fofs;
@@ -2600,6 +2604,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
26002604
err = -EFSCORRUPTED;
26012605
goto out_writepage;
26022606
}
2607+
26032608
/*
26042609
* If current allocation needs SSR,
26052610
* it had better in-place writes for updated data.
@@ -3313,6 +3318,100 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
33133318
return err;
33143319
}
33153320

3321+
static int __find_data_block(struct inode *inode, pgoff_t index,
3322+
block_t *blk_addr)
3323+
{
3324+
struct dnode_of_data dn;
3325+
struct page *ipage;
3326+
struct extent_info ei = {0, };
3327+
int err = 0;
3328+
3329+
ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
3330+
if (IS_ERR(ipage))
3331+
return PTR_ERR(ipage);
3332+
3333+
set_new_dnode(&dn, inode, ipage, ipage, 0);
3334+
3335+
if (f2fs_lookup_extent_cache(inode, index, &ei)) {
3336+
dn.data_blkaddr = ei.blk + index - ei.fofs;
3337+
} else {
3338+
/* hole case */
3339+
err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
3340+
if (err) {
3341+
dn.data_blkaddr = NULL_ADDR;
3342+
err = 0;
3343+
}
3344+
}
3345+
*blk_addr = dn.data_blkaddr;
3346+
f2fs_put_dnode(&dn);
3347+
return err;
3348+
}
3349+
3350+
static int __reserve_data_block(struct inode *inode, pgoff_t index,
3351+
block_t *blk_addr, bool *node_changed)
3352+
{
3353+
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
3354+
struct dnode_of_data dn;
3355+
struct page *ipage;
3356+
int err = 0;
3357+
3358+
f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
3359+
3360+
ipage = f2fs_get_node_page(sbi, inode->i_ino);
3361+
if (IS_ERR(ipage)) {
3362+
err = PTR_ERR(ipage);
3363+
goto unlock_out;
3364+
}
3365+
set_new_dnode(&dn, inode, ipage, ipage, 0);
3366+
3367+
err = f2fs_get_block(&dn, index);
3368+
3369+
*blk_addr = dn.data_blkaddr;
3370+
*node_changed = dn.node_changed;
3371+
f2fs_put_dnode(&dn);
3372+
3373+
unlock_out:
3374+
f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
3375+
return err;
3376+
}
3377+
3378+
static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi,
3379+
struct page *page, loff_t pos, unsigned int len,
3380+
block_t *blk_addr, bool *node_changed)
3381+
{
3382+
struct inode *inode = page->mapping->host;
3383+
struct inode *cow_inode = F2FS_I(inode)->cow_inode;
3384+
pgoff_t index = page->index;
3385+
int err = 0;
3386+
block_t ori_blk_addr;
3387+
3388+
/* If pos is beyond the end of file, reserve a new block in COW inode */
3389+
if ((pos & PAGE_MASK) >= i_size_read(inode))
3390+
return __reserve_data_block(cow_inode, index, blk_addr,
3391+
node_changed);
3392+
3393+
/* Look for the block in COW inode first */
3394+
err = __find_data_block(cow_inode, index, blk_addr);
3395+
if (err)
3396+
return err;
3397+
else if (*blk_addr != NULL_ADDR)
3398+
return 0;
3399+
3400+
/* Look for the block in the original inode */
3401+
err = __find_data_block(inode, index, &ori_blk_addr);
3402+
if (err)
3403+
return err;
3404+
3405+
/* Finally, we should reserve a new block in COW inode for the update */
3406+
err = __reserve_data_block(cow_inode, index, blk_addr, node_changed);
3407+
if (err)
3408+
return err;
3409+
3410+
if (ori_blk_addr != NULL_ADDR)
3411+
*blk_addr = ori_blk_addr;
3412+
return 0;
3413+
}
3414+
33163415
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
33173416
loff_t pos, unsigned len, unsigned flags,
33183417
struct page **pagep, void **fsdata)
@@ -3321,7 +3420,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
33213420
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
33223421
struct page *page = NULL;
33233422
pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT;
3324-
bool need_balance = false, drop_atomic = false;
3423+
bool need_balance = false;
33253424
block_t blkaddr = NULL_ADDR;
33263425
int err = 0;
33273426

@@ -3332,14 +3431,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
33323431
goto fail;
33333432
}
33343433

3335-
if ((f2fs_is_atomic_file(inode) &&
3336-
!f2fs_available_free_memory(sbi, INMEM_PAGES)) ||
3337-
is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) {
3338-
err = -ENOMEM;
3339-
drop_atomic = true;
3340-
goto fail;
3341-
}
3342-
33433434
/*
33443435
* We should check this at this moment to avoid deadlock on inode page
33453436
* and #0 page. The locking rule for inline_data conversion should be:
@@ -3387,7 +3478,11 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
33873478

33883479
*pagep = page;
33893480

3390-
err = prepare_write_begin(sbi, page, pos, len,
3481+
if (f2fs_is_atomic_file(inode))
3482+
err = prepare_atomic_write_begin(sbi, page, pos, len,
3483+
&blkaddr, &need_balance);
3484+
else
3485+
err = prepare_write_begin(sbi, page, pos, len,
33913486
&blkaddr, &need_balance);
33923487
if (err)
33933488
goto fail;
@@ -3443,8 +3538,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
34433538
fail:
34443539
f2fs_put_page(page, 1);
34453540
f2fs_write_failed(inode, pos + len);
3446-
if (drop_atomic)
3447-
f2fs_drop_inmem_pages_all(sbi, false);
34483541
return err;
34493542
}
34503543

@@ -3488,8 +3581,12 @@ static int f2fs_write_end(struct file *file,
34883581
set_page_dirty(page);
34893582

34903583
if (pos + copied > i_size_read(inode) &&
3491-
!f2fs_verity_in_progress(inode))
3584+
!f2fs_verity_in_progress(inode)) {
34923585
f2fs_i_size_write(inode, pos + copied);
3586+
if (f2fs_is_atomic_file(inode))
3587+
f2fs_i_size_write(F2FS_I(inode)->cow_inode,
3588+
pos + copied);
3589+
}
34933590
unlock_out:
34943591
f2fs_put_page(page, 1);
34953592
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
@@ -3522,9 +3619,6 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
35223619
inode->i_ino == F2FS_COMPRESS_INO(sbi))
35233620
clear_page_private_data(&folio->page);
35243621

3525-
if (page_private_atomic(&folio->page))
3526-
return f2fs_drop_inmem_page(inode, &folio->page);
3527-
35283622
folio_detach_private(folio);
35293623
}
35303624

@@ -3534,10 +3628,6 @@ int f2fs_release_page(struct page *page, gfp_t wait)
35343628
if (PageDirty(page))
35353629
return 0;
35363630

3537-
/* This is atomic written page, keep Private */
3538-
if (page_private_atomic(page))
3539-
return 0;
3540-
35413631
if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) {
35423632
struct inode *inode = page->mapping->host;
35433633

@@ -3563,18 +3653,6 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping,
35633653
folio_mark_uptodate(folio);
35643654
BUG_ON(folio_test_swapcache(folio));
35653655

3566-
if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
3567-
if (!page_private_atomic(&folio->page)) {
3568-
f2fs_register_inmem_page(inode, &folio->page);
3569-
return true;
3570-
}
3571-
/*
3572-
* Previously, this page has been registered, we just
3573-
* return here.
3574-
*/
3575-
return false;
3576-
}
3577-
35783656
if (!folio_test_dirty(folio)) {
35793657
filemap_dirty_folio(mapping, folio);
35803658
f2fs_update_dirty_folio(inode, folio);
@@ -3654,42 +3732,14 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
36543732
int f2fs_migrate_page(struct address_space *mapping,
36553733
struct page *newpage, struct page *page, enum migrate_mode mode)
36563734
{
3657-
int rc, extra_count;
3658-
struct f2fs_inode_info *fi = F2FS_I(mapping->host);
3659-
bool atomic_written = page_private_atomic(page);
3735+
int rc, extra_count = 0;
36603736

36613737
BUG_ON(PageWriteback(page));
36623738

3663-
/* migrating an atomic written page is safe with the inmem_lock hold */
3664-
if (atomic_written) {
3665-
if (mode != MIGRATE_SYNC)
3666-
return -EBUSY;
3667-
if (!mutex_trylock(&fi->inmem_lock))
3668-
return -EAGAIN;
3669-
}
3670-
3671-
/* one extra reference was held for atomic_write page */
3672-
extra_count = atomic_written ? 1 : 0;
36733739
rc = migrate_page_move_mapping(mapping, newpage,
36743740
page, extra_count);
3675-
if (rc != MIGRATEPAGE_SUCCESS) {
3676-
if (atomic_written)
3677-
mutex_unlock(&fi->inmem_lock);
3741+
if (rc != MIGRATEPAGE_SUCCESS)
36783742
return rc;
3679-
}
3680-
3681-
if (atomic_written) {
3682-
struct inmem_pages *cur;
3683-
3684-
list_for_each_entry(cur, &fi->inmem_pages, list)
3685-
if (cur->page == page) {
3686-
cur->page = newpage;
3687-
break;
3688-
}
3689-
mutex_unlock(&fi->inmem_lock);
3690-
put_page(page);
3691-
get_page(newpage);
3692-
}
36933743

36943744
/* guarantee to start from no stale private field */
36953745
set_page_private(newpage, 0);

fs/f2fs/debug.c

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,6 @@ static void update_general_status(struct f2fs_sb_info *sbi)
9191
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
9292
si->nquota_files = sbi->nquota_files;
9393
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
94-
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
9594
si->aw_cnt = sbi->atomic_files;
9695
si->vw_cnt = atomic_read(&sbi->vw_cnt);
9796
si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
@@ -167,8 +166,6 @@ static void update_general_status(struct f2fs_sb_info *sbi)
167166
si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
168167
si->io_skip_bggc = sbi->io_skip_bggc;
169168
si->other_skip_bggc = sbi->other_skip_bggc;
170-
si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC];
171-
si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC];
172169
si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
173170
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
174171
/ 2;
@@ -296,7 +293,6 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
296293
sizeof(struct nat_entry);
297294
si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] *
298295
sizeof(struct nat_entry_set);
299-
si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
300296
for (i = 0; i < MAX_INO_ENTRY; i++)
301297
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
302298
si->cache_mem += atomic_read(&sbi->total_ext_tree) *
@@ -491,10 +487,6 @@ static int stat_show(struct seq_file *s, void *v)
491487
si->bg_data_blks);
492488
seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks,
493489
si->bg_node_blks);
494-
seq_printf(s, "Skipped : atomic write %llu (%llu)\n",
495-
si->skipped_atomic_files[BG_GC] +
496-
si->skipped_atomic_files[FG_GC],
497-
si->skipped_atomic_files[BG_GC]);
498490
seq_printf(s, "BG skip : IO: %u, Other: %u\n",
499491
si->io_skip_bggc, si->other_skip_bggc);
500492
seq_puts(s, "\nExtent Cache:\n");
@@ -519,9 +511,9 @@ static int stat_show(struct seq_file *s, void *v)
519511
si->flush_list_empty,
520512
si->nr_discarding, si->nr_discarded,
521513
si->nr_discard_cmd, si->undiscard_blks);
522-
seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), "
514+
seq_printf(s, " - atomic IO: %4d (Max. %4d), "
523515
"volatile IO: %4d (Max. %4d)\n",
524-
si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
516+
si->aw_cnt, si->max_aw_cnt,
525517
si->vw_cnt, si->max_vw_cnt);
526518
seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit);
527519
seq_printf(s, " - nodes: %4d in %4d\n",

0 commit comments

Comments
 (0)