From ab85d2bec4d63e554974f16d960b522709c34860 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 21 Jul 2025 18:24:53 +0800 Subject: [PATCH 1/9] anolis: erofs: add missing map->m_flags = 0 in erofs_map_blocks() ANBZ: #11854 This was missing in the original backport commit. Fixes: 34bc2f0eb2de ("erofs: get rid of erofs_map_blocks_flatmode()") Signed-off-by: Gao Xiang --- fs/erofs/data.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index a9a411fd5958..1cc8ddc057ae 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -137,6 +137,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid = 0; + map->m_flags = 0; if (map->m_la >= inode->i_size) goto out; -- Gitee From c35964a4200ed62e0eceaf77d37486561ccc446a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 7 Apr 2023 22:17:04 +0800 Subject: [PATCH 2/9] erofs: keep meta inode into erofs_buf ANBZ: #11854 commit eb2c5e41be1495cf7a20ff49df473b1c45b82e77 upstream. So that erofs_read_metadata() can read metadata from other inodes (e.g. packed inode) as well. Signed-off-by: Jingbo Xu Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/data.c | 35 +++++++++++++++++++++++------------ fs/erofs/dir.c | 3 ++- fs/erofs/internal.h | 8 ++++++-- fs/erofs/namei.c | 3 ++- fs/erofs/zdata.c | 4 ++-- 5 files changed, 35 insertions(+), 18 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 1cc8ddc057ae..97b846c2c98d 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -34,12 +34,14 @@ void erofs_put_metabuf(struct erofs_buf *buf) put_page(buf->page); buf->page = NULL; +#ifdef CONFIG_EROFS_FS_RAFS_V6 if (buf->mapping) { buf->mapping->a_ops->endpfn(buf->mapping, index, &buf->iomap, 0); buf->mapping = NULL; memset(&buf->iomap, 0, sizeof(buf->iomap)); } +#endif } /* @@ -47,9 +49,9 @@ void erofs_put_metabuf(struct erofs_buf *buf) * anonymous inode in fscache mode. 
*/ void *__erofs_bread(struct super_block *sb, struct erofs_buf *buf, - struct inode *inode, erofs_blk_t blkaddr, - enum erofs_kmap_type type) + erofs_blk_t blkaddr, enum erofs_kmap_type type) { + struct inode *inode = buf->inode; erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits; struct address_space *const mapping = inode->i_mapping; pgoff_t index = offset >> PAGE_SHIFT; @@ -100,25 +102,34 @@ void *__erofs_bread(struct super_block *sb, struct erofs_buf *buf, return buf->base + (offset & ~PAGE_MASK); } -void *erofs_bread(struct erofs_buf *buf, struct inode *inode, - erofs_blk_t blkaddr, enum erofs_kmap_type type) +void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, + enum erofs_kmap_type type) +{ + return __erofs_bread(NULL, buf, blkaddr, type); +} + +void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { - return __erofs_bread(NULL, buf, inode, blkaddr, type); + if (erofs_is_fscache_mode(sb)) + buf->inode = EROFS_SB(sb)->s_fscache->inode; +#ifdef CONFIG_EROFS_FS_RAFS_V6 + else if (erofs_is_rafsv6_mode(sb)) + buf->inode = EROFS_SB(sb)->bootstrap->f_inode; +#endif + else + buf->inode = sb->s_bdev->bd_inode; + } void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, erofs_blk_t blkaddr, enum erofs_kmap_type type) { + erofs_init_metabuf(buf, sb); #ifdef CONFIG_EROFS_FS_RAFS_V6 if (erofs_is_rafsv6_mode(sb)) - return __erofs_bread(sb, buf, EROFS_SB(sb)->bootstrap->f_inode, - blkaddr, type); + return __erofs_bread(sb, buf, blkaddr, type); #endif - if (erofs_is_fscache_mode(sb)) - return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode, - blkaddr, type); - - return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type); + return erofs_bread(buf, blkaddr, type); } int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 88b3ddaa7351..468abf993709 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -58,11 +58,12 @@ static int erofs_readdir(struct file 
*f, struct dir_context *ctx) int err = 0; bool initial = true; + buf.inode = dir; while (ctx->pos < dirsize) { struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, dir, i, EROFS_KMAP); + de = erofs_bread(&buf, i, EROFS_KMAP); if (IS_ERR(de)) { erofs_err(sb, "fail to readdir of logical block %u of nid %llu", i, EROFS_I(dir)->nid); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 62311d7028c7..a7850692bc70 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -217,8 +217,11 @@ enum erofs_kmap_type { }; struct erofs_buf { +#ifdef CONFIG_EROFS_FS_RAFS_V6 struct iomap iomap; struct address_space *mapping; +#endif + struct inode *inode; struct page *page; void *base; enum erofs_kmap_type kmap_type; @@ -411,8 +414,9 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp); void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); -void *erofs_bread(struct erofs_buf *buf, struct inode *inode, - erofs_blk_t blkaddr, enum erofs_kmap_type type); +void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, + enum erofs_kmap_type type); +void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, erofs_blk_t blkaddr, enum erofs_kmap_type type); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index fdd6a78b74ec..facb595fac8f 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -99,7 +99,8 @@ static void *erofs_find_target_block(struct erofs_buf *target, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; - de = erofs_bread(&buf, dir, mid, EROFS_KMAP); + buf.inode = dir; + de = erofs_bread(&buf, mid, EROFS_KMAP); if (!IS_ERR(de)) { const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); diff --git a/fs/erofs/zdata.c 
b/fs/erofs/zdata.c index 0735018d6d55..94bc5e326587 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -775,11 +775,11 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page, if (!packed_inode) return -EFSCORRUPTED; + buf.inode = packed_inode; for (; cur < end; cur += cnt, pos += cnt) { cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); - src = erofs_bread(&buf, packed_inode, - erofs_blknr(sb, pos), EROFS_KMAP); + src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); -- Gitee From bda8fd835cd257f7c9252d220593756db4109203 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 19 Apr 2024 20:36:10 +0800 Subject: [PATCH 3/9] erofs: get rid of erofs_fs_context ANBZ: #11854 commit 07abe43a28b2c660f726d66f5470f7f114f9643a upstream. Instead of allocating the erofs_sb_info in fill_super() allocate it during erofs_init_fs_context() and ensure that erofs can always have the info available during erofs_kill_sb(). After this erofs_fs_context is no longer needed, replace ctx with sbi, no functional changes. 
Suggested-by: Jingbo Xu Signed-off-by: Baokun Li Reviewed-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20240419123611.947084-2-libaokun1@huawei.com Conflicts: fs/erofs/internal.h fs/erofs/super.c Signed-off-by: Gao Xiang --- fs/erofs/internal.h | 11 ---- fs/erofs/super.c | 146 ++++++++++++++++++++------------------------ 2 files changed, 65 insertions(+), 92 deletions(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index a7850692bc70..200357afa44b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -73,17 +73,6 @@ struct erofs_dev_context { bool flatdev; }; -struct erofs_fs_context { - struct erofs_mount_opts opt; - struct erofs_dev_context *devs; - char *fsid; - char *domain_id; -#ifdef CONFIG_EROFS_FS_RAFS_V6 - char *bootstrap_path; - char *blob_dir_path; -#endif -}; - /* all filesystem-wide lz4 configurations */ struct erofs_sb_lz4_info { /* # of pages needed for EROFS lz4 rolling decompression */ diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 92a3c6615853..ab49cee8fa7c 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -375,17 +375,17 @@ static int erofs_read_superblock(struct super_block *sb) return ret; } -static void erofs_default_options(struct erofs_fs_context *ctx) +static void erofs_default_options(struct erofs_sb_info *sbi) { #ifdef CONFIG_EROFS_FS_ZIP - ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; - ctx->opt.max_sync_decompress_pages = 3; + sbi->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; + sbi->opt.max_sync_decompress_pages = 3; #endif #ifdef CONFIG_EROFS_FS_XATTR - set_opt(&ctx->opt, XATTR_USER); + set_opt(&sbi->opt, XATTR_USER); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - set_opt(&ctx->opt, POSIX_ACL); + set_opt(&sbi->opt, POSIX_ACL); #endif } @@ -440,17 +440,17 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) { #ifdef CONFIG_FS_DAX - struct erofs_fs_context *ctx = 
fc->fs_private; + struct erofs_sb_info *sbi = fc->s_fs_info; switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - set_opt(&ctx->opt, DAX_ALWAYS); - clear_opt(&ctx->opt, DAX_NEVER); + set_opt(&sbi->opt, DAX_ALWAYS); + clear_opt(&sbi->opt, DAX_NEVER); return true; case EROFS_MOUNT_DAX_NEVER: - set_opt(&ctx->opt, DAX_NEVER); - clear_opt(&ctx->opt, DAX_ALWAYS); + set_opt(&sbi->opt, DAX_NEVER); + clear_opt(&sbi->opt, DAX_ALWAYS); return true; default: DBG_BUGON(1); @@ -465,7 +465,7 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; struct erofs_device_info *dif; int opt, ret; @@ -478,9 +478,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_user_xattr: #ifdef CONFIG_EROFS_FS_XATTR if (result.boolean) - set_opt(&ctx->opt, XATTR_USER); + set_opt(&sbi->opt, XATTR_USER); else - clear_opt(&ctx->opt, XATTR_USER); + clear_opt(&sbi->opt, XATTR_USER); #else errorfc(fc, "{,no}user_xattr options not supported"); #endif @@ -488,16 +488,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_acl: #ifdef CONFIG_EROFS_FS_POSIX_ACL if (result.boolean) - set_opt(&ctx->opt, POSIX_ACL); + set_opt(&sbi->opt, POSIX_ACL); else - clear_opt(&ctx->opt, POSIX_ACL); + clear_opt(&sbi->opt, POSIX_ACL); #else errorfc(fc, "{,no}acl options not supported"); #endif break; case Opt_cache_strategy: #ifdef CONFIG_EROFS_FS_ZIP - ctx->opt.cache_strategy = result.uint_32; + sbi->opt.cache_strategy = result.uint_32; #else errorfc(fc, "compression not supported, cache_strategy ignored"); #endif @@ -519,21 +519,21 @@ static int erofs_fc_parse_param(struct fs_context *fc, kfree(dif); return -ENOMEM; } - down_write(&ctx->devs->rwsem); - ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, 
GFP_KERNEL); - up_write(&ctx->devs->rwsem); + down_write(&sbi->devs->rwsem); + ret = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL); + up_write(&sbi->devs->rwsem); if (ret < 0) { kfree(dif->path); kfree(dif); return ret; } - ++ctx->devs->extra_devices; + ++sbi->devs->extra_devices; break; case Opt_fsid: #ifdef CONFIG_EROFS_FS_ONDEMAND - kfree(ctx->fsid); - ctx->fsid = kstrdup(param->string, GFP_KERNEL); - if (!ctx->fsid) + kfree(sbi->fsid); + sbi->fsid = kstrdup(param->string, GFP_KERNEL); + if (!sbi->fsid) return -ENOMEM; #else errorfc(fc, "fsid option not supported"); @@ -542,9 +542,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, break; case Opt_domain_id: #ifdef CONFIG_EROFS_FS_ONDEMAND - kfree(ctx->domain_id); - ctx->domain_id = kstrdup(param->string, GFP_KERNEL); - if (!ctx->domain_id) + kfree(sbi->domain_id); + sbi->domain_id = kstrdup(param->string, GFP_KERNEL); + if (!sbi->domain_id) return -ENOMEM; #else errorfc(fc, "domain_id option not supported"); @@ -552,22 +552,22 @@ static int erofs_fc_parse_param(struct fs_context *fc, break; #ifdef CONFIG_EROFS_FS_RAFS_V6 case Opt_bootstrap_path: - kfree(ctx->bootstrap_path); - ctx->bootstrap_path = kstrdup(param->string, GFP_KERNEL); - if (!ctx->bootstrap_path) + kfree(sbi->bootstrap_path); + sbi->bootstrap_path = kstrdup(param->string, GFP_KERNEL); + if (!sbi->bootstrap_path) return -ENOMEM; break; case Opt_blob_dir_path: - kfree(ctx->blob_dir_path); - ctx->blob_dir_path = kstrdup(param->string, GFP_KERNEL); - if (!ctx->blob_dir_path) + kfree(sbi->blob_dir_path); + sbi->blob_dir_path = kstrdup(param->string, GFP_KERNEL); + if (!sbi->blob_dir_path) return -ENOMEM; break; case Opt_blob_mmap_pin: if (result.boolean) - set_opt(&ctx->opt, BLOB_MMAP_PIN); + set_opt(&sbi->opt, BLOB_MMAP_PIN); else - clear_opt(&ctx->opt, BLOB_MMAP_PIN); + clear_opt(&sbi->opt, BLOB_MMAP_PIN); break; #endif default: @@ -715,8 +715,7 @@ static int rafs_v6_fill_super(struct super_block *sb) static int 
erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; - struct erofs_sb_info *sbi; - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = EROFS_SB(sb); int err; sb->s_magic = EROFS_SUPER_MAGIC; @@ -724,25 +723,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &erofs_sops; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - - sb->s_fs_info = sbi; - sbi->opt = ctx->opt; - sbi->devs = ctx->devs; - ctx->devs = NULL; - sbi->fsid = ctx->fsid; - ctx->fsid = NULL; - sbi->domain_id = ctx->domain_id; - ctx->domain_id = NULL; -#ifdef CONFIG_EROFS_FS_RAFS_V6 - sbi->bootstrap_path = ctx->bootstrap_path; - ctx->bootstrap_path = NULL; - sbi->blob_dir_path = ctx->blob_dir_path; - ctx->blob_dir_path = NULL; -#endif - sbi->blkszbits = PAGE_SHIFT; if (!sb->s_bdev) { /* fscache or rafsv6 mode */ @@ -843,7 +823,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) static int erofs_fc_get_tree(struct fs_context *fc) { - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = fc->s_fs_info; #ifdef CONFIG_EROFS_FS_RAFS_V6 if (ctx->blob_dir_path && !ctx->bootstrap_path) { @@ -857,7 +837,7 @@ static int erofs_fc_get_tree(struct fs_context *fc) } #endif - if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid) + if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); #ifdef CONFIG_EROFS_FS_RAFS_V6 @@ -871,32 +851,32 @@ static int erofs_fc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *new_sbi = fc->s_fs_info; DBG_BUGON(!sb_rdonly(sb)); - if (ctx->fsid || ctx->domain_id) + if (new_sbi->fsid || new_sbi->domain_id) erofs_info(sb, "ignoring reconfiguration for fsid|domain_id."); #ifdef 
CONFIG_EROFS_FS_RAFS_V6 - if (test_opt(&ctx->opt, BLOB_MMAP_PIN) != - test_opt(&sbi->opt, BLOB_MMAP_PIN)) { + if (test_opt(&sbi->opt, BLOB_MMAP_PIN) != + test_opt(&new_sbi->opt, BLOB_MMAP_PIN)) { erofs_info(sb, "ignoring reconfiguration for rafsv6's blob_mmap_pin."); - if (test_opt(&sbi->opt, BLOB_MMAP_PIN)) - set_opt(&ctx->opt, BLOB_MMAP_PIN); + if (test_opt(&sbi->opt, BLOB_MMAP_PIN)) + set_opt(&new_sbi->opt, BLOB_MMAP_PIN); else - clear_opt(&ctx->opt, BLOB_MMAP_PIN); + clear_opt(&new_sbi->opt, BLOB_MMAP_PIN); } #endif - if (test_opt(&ctx->opt, POSIX_ACL)) + if (test_opt(&new_sbi->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else fc->sb_flags &= ~SB_POSIXACL; - sbi->opt = ctx->opt; + sbi->opt = new_sbi->opt; fc->sb_flags |= SB_RDONLY; return 0; @@ -931,12 +911,15 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs) static void erofs_fc_free(struct fs_context *fc) { - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = fc->s_fs_info; - erofs_free_dev_context(ctx->devs); - kfree(ctx->fsid); - kfree(ctx->domain_id); - kfree(ctx); + if (!sbi) + return; + + erofs_free_dev_context(sbi->devs); + kfree(sbi->fsid); + kfree(sbi->domain_id); + kfree(sbi); } static const struct fs_context_operations erofs_context_ops = { @@ -948,21 +931,22 @@ static const struct fs_context_operations erofs_context_ops = { static int erofs_init_fs_context(struct fs_context *fc) { - struct erofs_fs_context *ctx; + struct erofs_sb_info *sbi; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) return -ENOMEM; - ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); - if (!ctx->devs) { - kfree(ctx); + + sbi->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); + if (!sbi->devs) { + kfree(sbi); return -ENOMEM; } - fc->fs_private = ctx; + fc->s_fs_info = sbi; - idr_init(&ctx->devs->tree); - init_rwsem(&ctx->devs->rwsem); - erofs_default_options(ctx); + idr_init(&sbi->devs->tree); + 
init_rwsem(&sbi->devs->rwsem); + erofs_default_options(sbi); fc->ops = &erofs_context_ops; return 0; } -- Gitee From 666cbb72a862197bc37296759c0d67473c4836ec Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 30 Aug 2024 11:28:37 +0800 Subject: [PATCH 4/9] erofs: add file-backed mount support ANBZ: #11854 commit fb176750266a3d7f42ebdcf28e8ba40350b27847 upstream. It actually has been around for years: For containers and other sandbox use cases, there will be thousands (and even more) of authenticated (sub)images running on the same host, unlike OS images. Of course, all scenarios can use the same EROFS on-disk format, but bdev-backed mounts just work well for OS images since golden data is dumped into real block devices. However, it's somewhat hard for container runtimes to manage and isolate so many unnecessary virtual block devices safely and efficiently [1]: they just look like a burden to orchestrators and file-backed mounts are preferred indeed. There were already enough attempts such as Incremental FS, the original ComposeFS and PuzzleFS acting in the same way for immutable fses. As for current EROFS users, ComposeFS, containerd and Android APEXs will be directly benefited from it. On the other hand, previous experimental feature "erofs over fscache" was once also intended to provide a similar solution (inspired by Incremental FS discussion [2]), but the following facts show file-backed mounts will be a better approach: - Fscache infrastructure has recently been moved into new Netfslib which is an unexpected dependency to EROFS really, although it originally claims "it could be used for caching other things such as ISO9660 filesystems too." [3] - It takes an unexpectedly long time to upstream Fscache/Cachefiles enhancements. 
For example, the failover feature took more than one year, and the daemonless feature is still far behind now; - Ongoing HSM "fanotify pre-content hooks" [4] together with this will perfectly supersede "erofs over fscache" in a simpler way since developers (mainly containerd folks) could leverage their existing caching mechanism entirely in userspace instead of strictly following the predefined in-kernel caching tree hierarchy. After "fanotify pre-content hooks" lands upstream to provide the same functionality, "erofs over fscache" will be removed then (as an EROFS internal improvement and EROFS will not have to bother with on-demand fetching and/or caching improvements anymore.) [1] https://github.com/containers/storage/pull/2039 [2] https://lore.kernel.org/r/CAOQ4uxjbVxnubaPjVaGYiSwoGDTdpWbB=w_AeM6YM=zVixsUfQ@mail.gmail.com [3] https://docs.kernel.org/filesystems/caching/fscache.html [4] https://lore.kernel.org/r/cover.1723670362.git.josef@toxicpanda.com Closes: https://github.com/containers/composefs/issues/144 Reviewed-by: Sandeep Dhavale Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20240830032840.3783206-1-hsiangkao@linux.alibaba.com Conflicts: fs/erofs/Kconfig fs/erofs/data.c fs/erofs/inode.c fs/erofs/internal.h fs/erofs/super.c Signed-off-by: Gao Xiang --- fs/erofs/Kconfig | 17 ++++++++++++ fs/erofs/data.c | 40 +++++++++++++++------------- fs/erofs/inode.c | 5 +++- fs/erofs/internal.h | 7 +++++ fs/erofs/super.c | 64 +++++++++++++++++++++++++++++++-------------- 5 files changed, 94 insertions(+), 39 deletions(-) diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 0bfaa7e2940a..a6714bed3e73 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -73,6 +73,23 @@ config EROFS_FS_SECURITY If you are not using a security module, say N. 
+config EROFS_FS_BACKED_BY_FILE + bool "File-backed EROFS filesystem support" + depends on EROFS_FS + default y + help + This allows EROFS to use filesystem image files directly, without + the intercession of loopback block devices or likewise. It is + particularly useful for container images with numerous blobs and + other sandboxes, where loop devices behave intricately. It can also + be used to simplify error-prone lifetime management of unnecessary + virtual block devices. + + Note that this feature, along with ongoing fanotify pre-content + hooks, will eventually replace "EROFS over fscache." + + If you don't want to enable this feature, say N. + config EROFS_FS_RAFS_V6 bool "EROFS RAFSv6 kangaroo support (EROFS over virtiofs)" depends on VIRTIO_FS diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 97b846c2c98d..f47438e355ca 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -110,11 +110,15 @@ void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { - if (erofs_is_fscache_mode(sb)) - buf->inode = EROFS_SB(sb)->s_fscache->inode; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + if (erofs_is_fileio_mode(sbi)) + buf->inode = file_inode(sbi->fdev); + else if (erofs_is_fscache_mode(sb)) + buf->inode = sbi->s_fscache->inode; #ifdef CONFIG_EROFS_FS_RAFS_V6 else if (erofs_is_rafsv6_mode(sb)) - buf->inode = EROFS_SB(sb)->bootstrap->f_inode; + buf->inode = sbi->bootstrap->f_inode; #endif else buf->inode = sb->s_bdev->bd_inode; @@ -224,10 +228,23 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) return err; } +static void erofs_fill_from_devinfo(struct erofs_map_dev *map, + struct erofs_device_info *dif) +{ + map->m_bdev = NULL; + map->m_bdev = dif->bdev; +#ifdef CONFIG_EROFS_FS_RAFS_V6 + map->m_fp = dif->blobfile; +#endif + map->m_daxdev = dif->dax_dev; + map->m_fscache = dif->fscache; +} + int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { 
struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_device_info *dif; + erofs_off_t startoff, length; int id; map->m_bdev = sb->s_bdev; @@ -249,32 +266,19 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - map->m_bdev = dif->bdev; - map->m_daxdev = dif->dax_dev; -#ifdef CONFIG_EROFS_FS_RAFS_V6 - map->m_fp = dif->blobfile; -#endif - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { - erofs_off_t startoff, length; - if (!dif->uniaddr) continue; startoff = erofs_pos(sb, dif->uniaddr); length = erofs_pos(sb, dif->blocks); - if (map->m_pa >= startoff && map->m_pa < startoff + erofs_pos(sb, dif->blocks)) { map->m_pa -= startoff; - map->m_bdev = dif->bdev; - map->m_daxdev = dif->dax_dev; -#ifdef CONFIG_EROFS_FS_RAFS_V6 - map->m_fp = dif->blobfile; -#endif - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, dif); break; } } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index c11fd6fb35de..be9aceefc57b 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -236,7 +236,10 @@ static int erofs_fill_inode(struct inode *inode) return 0; } - if (erofs_inode_is_data_compressed(vi->datalayout)) { + if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) { + /* XXX: data I/Os will be implemented in the following patches */ + err = -EOPNOTSUPP; + } else if (erofs_inode_is_data_compressed(vi->datalayout)) { err = -EOPNOTSUPP; #ifdef CONFIG_EROFS_FS_ZIP if (!erofs_is_fscache_mode(inode->i_sb)) { diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 200357afa44b..350952ccb56c 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -49,6 +49,7 @@ struct erofs_device_info { char *path; struct erofs_fscache *fscache; struct block_device *bdev; + struct file *file; struct dax_device *dax_dev; #ifdef CONFIG_EROFS_FS_RAFS_V6 struct file 
*blobfile; @@ -115,6 +116,7 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; struct inode *packed_inode; #endif /* CONFIG_EROFS_FS_ZIP */ + struct file *fdev; #ifdef CONFIG_EROFS_FS_RAFS_V6 struct path blob_dir; struct file *bootstrap; @@ -180,6 +182,11 @@ static inline bool erofs_is_rafsv6_mode(struct super_block *sb) #endif } +static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) +{ + return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev; +} + static inline bool erofs_is_fscache_mode(struct super_block *sb) { /* to distinguish from rafsv6 which also works in nodev mode */ diff --git a/fs/erofs/super.c b/fs/erofs/super.c index ab49cee8fa7c..4739b70128e2 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "xattr.h" #define CREATE_TRACE_POINTS @@ -132,6 +133,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_deviceslot *dis; struct block_device *bdev; void *ptr; + struct file *file; ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP); if (IS_ERR(ptr)) @@ -166,11 +168,19 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, dif->blobfile = f; #endif } else if (!sbi->devs->flatdev) { - bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, - sb->s_type); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - dif->bdev = bdev; + if (erofs_is_fileio_mode(sbi)) { + file = filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + dif->file = file; + } else { + bdev = blkdev_get_by_path(dif->path, + FMODE_READ | FMODE_EXCL, + sb->s_type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + dif->bdev = bdev; + } } dif->blocks = le32_to_cpu(dis->blocks_lo); @@ -725,9 +735,18 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sbi->blkszbits = PAGE_SHIFT; if (!sb->s_bdev) { - /* fscache or rafsv6 mode */ sb->s_blocksize = PAGE_SIZE; 
sb->s_blocksize_bits = PAGE_SHIFT; + + if (erofs_is_fscache_mode(sb)) { + err = erofs_fscache_register_fs(sb); + if (err) + return err; + } + + err = super_setup_bdi(sb); + if (err) + return err; } else { if (!sb_set_blocksize(sb, PAGE_SIZE)) { errorfc(fc, "failed to set initial blksize"); @@ -736,16 +755,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, NULL, NULL); } - if (erofs_is_fscache_mode(sb)) { - err = erofs_fscache_register_fs(sb); - if (err) - return err; - - err = super_setup_bdi(sb); - if (err) - return err; - } - err = rafs_v6_fill_super(sb); if (err) return err; @@ -824,6 +833,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; + int ret; #ifdef CONFIG_EROFS_FS_RAFS_V6 if (ctx->blob_dir_path && !ctx->bootstrap_path) { @@ -844,7 +854,19 @@ static int erofs_fc_get_tree(struct fs_context *fc) if (ctx->bootstrap_path && ctx->blob_dir_path) return get_tree_nodev(fc, erofs_fc_fill_super); #endif - return get_tree_bdev(fc, erofs_fc_fill_super); + ret = get_tree_bdev(fc, erofs_fc_fill_super); +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (ret == -ENOTBLK) { + if (!fc->source) + return invalf(fc, "No source specified"); + sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(sbi->fdev)) + return PTR_ERR(sbi->fdev); + + return get_tree_nodev(fc, erofs_fc_fill_super); + } +#endif + return ret; } static int erofs_fc_reconfigure(struct fs_context *fc) @@ -893,6 +915,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) if (dif->blobfile) filp_close(dif->blobfile, NULL); #endif + if (dif->file) + fput(dif->file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); @@ -955,8 +979,6 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi; - WARN_ON(sb->s_magic != 
EROFS_SUPER_MAGIC); - /* pseudo mount for anon inodes */ if (sb->s_flags & SB_KERNMOUNT) { kill_anon_super(sb); @@ -984,6 +1006,8 @@ static void erofs_kill_sb(struct super_block *sb) erofs_fscache_unregister_fs(sb); kfree(sbi->fsid); kfree(sbi->domain_id); + if (sbi->fdev) + fput(sbi->fdev); kfree(sbi); sb->s_fs_info = NULL; } @@ -1099,7 +1123,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EROFS_NAME_LEN; if (uuid_is_null(&sb->s_uuid)) - buf->f_fsid = u64_to_fsid(erofs_is_fscache_mode(sb) ? 0 : + buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 : huge_encode_dev(sb->s_bdev->bd_dev)); else buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); -- Gitee From 4dfda5ba03206ad2e3e9fd9681a232d8c39ba8ec Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 5 Sep 2024 17:30:31 +0800 Subject: [PATCH 5/9] erofs: support unencoded inodes for fileio ANBZ: #11854 commit ce63cb62d794c98c7631c2296fa845f2a8d0a4a1 upstream. Since EROFS only needs to handle read requests in simple contexts, Just directly use vfs_iocb_iter_read() for data I/Os. 
Reviewed-by: Sandeep Dhavale Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20240905093031.2745929-1-hsiangkao@linux.alibaba.com Conflicts: fs/erofs/Makefile fs/erofs/data.c fs/erofs/inode.c fs/erofs/internal.h fs/erofs/zdata.c Signed-off-by: Gao Xiang --- fs/erofs/Makefile | 1 + fs/erofs/data.c | 54 +++++++++++++- fs/erofs/fileio.c | 178 ++++++++++++++++++++++++++++++++++++++++++++ fs/erofs/inode.c | 20 ++--- fs/erofs/internal.h | 8 +- fs/erofs/super.c | 6 +- fs/erofs/zdata.c | 54 ++------------ 7 files changed, 253 insertions(+), 68 deletions(-) create mode 100644 fs/erofs/fileio.c diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 617a4c0e3389..95ac00571a00 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -7,4 +7,5 @@ erofs-$(CONFIG_EROFS_FS_RAFS_V6) += rafsv6.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o +erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f47438e355ca..3f1b85d74231 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -231,10 +231,11 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) static void erofs_fill_from_devinfo(struct erofs_map_dev *map, struct erofs_device_info *dif) { - map->m_bdev = NULL; map->m_bdev = dif->bdev; + map->m_fp = dif->file; #ifdef CONFIG_EROFS_FS_RAFS_V6 - map->m_fp = dif->blobfile; + if (dif->blobfile) + map->m_fp = dif->blobfile; #endif map->m_daxdev = dif->dax_dev; map->m_fscache = dif->fscache; @@ -249,8 +250,10 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) map->m_bdev = sb->s_bdev; map->m_daxdev = EROFS_SB(sb)->dax_dev; + map->m_fp = EROFS_SB(sb)->fdev; #ifdef CONFIG_EROFS_FS_RAFS_V6 - map->m_fp = EROFS_SB(sb)->bootstrap; + if (EROFS_SB(sb)->bootstrap) + map->m_fp = EROFS_SB(sb)->bootstrap; #endif 
map->m_fscache = EROFS_SB(sb)->s_fscache; @@ -287,6 +290,49 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) return 0; } +/* + * bit 30: I/O error occurred on this page + * bit 0 - 29: remaining parts to complete this page + */ +#define Z_EROFS_PAGE_EIO (1 << 30) + +void erofs_onlinepage_init(struct page *page) +{ + union { + atomic_t o; + unsigned long v; + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +void erofs_onlinepage_split(struct page *page) +{ + atomic_inc((atomic_t *)&page->private); +} + +void erofs_onlinepage_end(struct page *page, int err) +{ + int orig, v; + + DBG_BUGON(!PagePrivate(page)); + + do { + orig = atomic_read((atomic_t *)&page->private); + v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); + + if (!(v & ~Z_EROFS_PAGE_EIO)) { + set_page_private(page, 0); + ClearPagePrivate(page); + if (!(v & Z_EROFS_PAGE_EIO)) + SetPageUptodate(page); + unlock_page(page); + } +} + static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { @@ -428,7 +474,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /* for uncompressed (aligned) files and raw access for other files */ -const struct address_space_operations erofs_raw_access_aops = { +const struct address_space_operations erofs_aops = { .readpage = erofs_readpage, .readahead = erofs_readahead, .bmap = erofs_bmap, diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c new file mode 100644 index 000000000000..2b6a3aa3355a --- /dev/null +++ b/fs/erofs/fileio.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024, Alibaba Cloud + */ +#include "internal.h" +#include +#include + +struct erofs_fileio_rq { + struct bio_vec bvecs[BIO_MAX_PAGES]; + struct bio bio; + struct kiocb iocb; +}; + +struct erofs_fileio { + 
struct erofs_map_blocks map; + struct erofs_map_dev dev; + struct erofs_fileio_rq *rq; +}; + +static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret, long res2) +{ + struct erofs_fileio_rq *rq = + container_of(iocb, struct erofs_fileio_rq, iocb); + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + DBG_BUGON(rq->bio.bi_end_io); + if (ret > 0) { + if (ret != rq->bio.bi_iter.bi_size) { + bio_advance(&rq->bio, ret); + zero_fill_bio(&rq->bio); + } + ret = 0; + } + bio_for_each_segment_all(bvec, &rq->bio, iter_all) { + struct page *page = bvec->bv_page; + + DBG_BUGON(PageUptodate(page)); + erofs_onlinepage_end(page, ret); + } + bio_uninit(&rq->bio); + kfree(rq); +} + +static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) +{ + struct iov_iter iter; + int ret; + + if (!rq) + return; + rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; + rq->iocb.ki_ioprio = get_current_ioprio(); + rq->iocb.ki_complete = erofs_fileio_ki_complete; + rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mapping->a_ops->direct_IO) ? 
+ IOCB_DIRECT : 0; + iov_iter_bvec(&iter, READ, rq->bvecs, rq->bio.bi_vcnt, + rq->bio.bi_iter.bi_size); + ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); + if (ret != -EIOCBQUEUED) + erofs_fileio_ki_complete(&rq->iocb, ret, 0); +} + +static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) +{ + struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq), + GFP_KERNEL | __GFP_NOFAIL); + + bio_init(&rq->bio, rq->bvecs, BIO_MAX_PAGES); + rq->bio.bi_opf = REQ_OP_READ; + rq->iocb.ki_filp = mdev->m_fp; + return rq; +} + +static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct erofs_map_blocks *map = &io->map; + unsigned int cur = 0, end = PAGE_SIZE, len, attached = 0; + loff_t pos = page->index << PAGE_SHIFT, ofs; + int err = 0; + + erofs_onlinepage_init(page); + while (cur < end) { + if (pos + cur < map->m_la || + pos + cur >= map->m_la + map->m_llen) { + map->m_la = pos + cur; + map->m_llen = end - cur; + err = erofs_map_blocks(inode, map); + if (err) + break; + } + + ofs = (page->index << PAGE_SHIFT) + cur - map->m_la; + len = min_t(loff_t, map->m_llen - ofs, end - cur); + if (map->m_flags & EROFS_MAP_META) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *src; + + src = erofs_read_metabuf(&buf, inode->i_sb, + erofs_blknr(inode->i_sb, map->m_pa + ofs), + EROFS_KMAP); + if (IS_ERR(src)) { + err = PTR_ERR(src); + break; + } + memcpy_to_page(page, cur, + src + erofs_blkoff(inode->i_sb, map->m_pa + ofs), len); + erofs_put_metabuf(&buf); + } else if (!(map->m_flags & EROFS_MAP_MAPPED)) { + zero_user_segment(page, cur, cur + len); + attached = 0; + } else { + if (io->rq && (map->m_pa + ofs != io->dev.m_pa || + map->m_deviceid != io->dev.m_deviceid)) { +io_retry: + erofs_fileio_rq_submit(io->rq); + io->rq = NULL; + } + + if (!io->rq) { + io->dev = (struct erofs_map_dev) { + .m_pa = io->map.m_pa + ofs, + .m_deviceid = io->map.m_deviceid, + }; + err = 
erofs_map_dev(inode->i_sb, &io->dev); + if (err) + break; + io->rq = erofs_fileio_rq_alloc(&io->dev); + io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9; + attached = 0; + } + if (!attached++) + erofs_onlinepage_split(page); + if (!bio_add_page(&io->rq->bio, page, len, cur)) + goto io_retry; + io->dev.m_pa += len; + } + cur += len; + } + erofs_onlinepage_end(page, err); + return err; +} + +static int erofs_fileio_readpage(struct file *file, struct page *page) +{ + struct erofs_fileio io = {}; + int err; + + trace_erofs_readpage(page, true); + err = erofs_fileio_scan_folio(&io, page); + erofs_fileio_rq_submit(io.rq); + return err; +} + +static void erofs_fileio_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct erofs_fileio io = {}; + struct page *page; + int err; + + trace_erofs_readpages(inode, readahead_index(rac), + readahead_count(rac), true); + while ((page = readahead_page(rac))) { + err = erofs_fileio_scan_folio(&io, page); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", + page->index, EROFS_I(inode)->nid); + } + erofs_fileio_rq_submit(io.rq); +} + +const struct address_space_operations erofs_fileio_aops = { + .readpage = erofs_fileio_readpage, + .readahead = erofs_fileio_readahead, +}; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index be9aceefc57b..5dba847cf0b7 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -236,25 +236,27 @@ static int erofs_fill_inode(struct inode *inode) return 0; } - if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) { - /* XXX: data I/Os will be implemented in the following patches */ - err = -EOPNOTSUPP; - } else if (erofs_inode_is_data_compressed(vi->datalayout)) { + if (erofs_inode_is_data_compressed(vi->datalayout)) { err = -EOPNOTSUPP; #ifdef CONFIG_EROFS_FS_ZIP - if (!erofs_is_fscache_mode(inode->i_sb)) { + if (!erofs_is_fscache_mode(inode->i_sb) && + !erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) { inode->i_mapping->a_ops = 
&z_erofs_aops; err = 0; } #endif } else if (erofs_is_rafsv6_mode(inode->i_sb)) { erofs_rafsv6_set_aops(inode); + } else { + inode->i_mapping->a_ops = &erofs_aops; #ifdef CONFIG_EROFS_FS_ONDEMAND - } else if (erofs_is_fscache_mode(inode->i_sb)) { - inode->i_mapping->a_ops = &erofs_fscache_access_aops; + if (erofs_is_fscache_mode(inode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; +#endif +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) + inode->i_mapping->a_ops = &erofs_fileio_aops; #endif - } else { - inode->i_mapping->a_ops = &erofs_raw_access_aops; } return err; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 350952ccb56c..32c7ee5a0423 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -330,7 +330,8 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, extern const struct super_operations erofs_sops; -extern const struct address_space_operations erofs_raw_access_aops; +extern const struct address_space_operations erofs_aops; +extern const struct address_space_operations erofs_fileio_aops; extern const struct address_space_operations z_erofs_aops; enum { @@ -398,9 +399,7 @@ struct erofs_map_dev { struct erofs_fscache *m_fscache; struct block_device *m_bdev; struct dax_device *m_daxdev; -#ifdef CONFIG_EROFS_FS_RAFS_V6 struct file *m_fp; -#endif erofs_off_t m_pa; unsigned int m_deviceid; }; @@ -433,6 +432,9 @@ extern const struct inode_operations erofs_generic_iops; extern const struct inode_operations erofs_symlink_iops; extern const struct inode_operations erofs_fast_symlink_iops; +void erofs_onlinepage_init(struct page *page); +void erofs_onlinepage_split(struct page *page); +void erofs_onlinepage_end(struct page *page, int err); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 
4739b70128e2..14e47d3129a9 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -836,12 +836,12 @@ static int erofs_fc_get_tree(struct fs_context *fc) int ret; #ifdef CONFIG_EROFS_FS_RAFS_V6 - if (ctx->blob_dir_path && !ctx->bootstrap_path) { + if (sbi->blob_dir_path && !sbi->bootstrap_path) { errorfc(fc, "bootstrap_path required in RAFS mode"); return -EINVAL; } - if (ctx->bootstrap_path && ctx->fsid) { + if (sbi->bootstrap_path && sbi->fsid) { errorfc(fc, "fscache/RAFS modes are mutually exclusive"); return -EINVAL; } @@ -851,7 +851,7 @@ static int erofs_fc_get_tree(struct fs_context *fc) return get_tree_nodev(fc, erofs_fc_fill_super); #ifdef CONFIG_EROFS_FS_RAFS_V6 - if (ctx->bootstrap_path && ctx->blob_dir_path) + if (sbi->bootstrap_path && sbi->blob_dir_path) return get_tree_nodev(fc, erofs_fc_fill_super); #endif ret = get_tree_bdev(fc, erofs_fc_fill_super); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 94bc5e326587..07d9f00319ad 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -109,49 +109,6 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; } -/* - * bit 30: I/O error occurred on this page - * bit 0 - 29: remaining parts to complete this page - */ -#define Z_EROFS_PAGE_EIO (1 << 30) - -static inline void z_erofs_onlinepage_init(struct page *page) -{ - union { - atomic_t o; - unsigned long v; - } u = { .o = ATOMIC_INIT(1) }; - - set_page_private(page, u.v); - smp_wmb(); - SetPagePrivate(page); -} - -static inline void z_erofs_onlinepage_split(struct page *page) -{ - atomic_inc((atomic_t *)&page->private); -} - -static void z_erofs_onlinepage_endio(struct page *page, int err) -{ - int orig, v; - - DBG_BUGON(!PagePrivate(page)); - - do { - orig = atomic_read((atomic_t *)&page->private); - v = (orig - 1) | (err ? 
Z_EROFS_PAGE_EIO : 0); - } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); - - if (!(v & ~Z_EROFS_PAGE_EIO)) { - set_page_private(page, 0); - ClearPagePrivate(page); - if (!(v & Z_EROFS_PAGE_EIO)) - SetPageUptodate(page); - unlock_page(page); - } -} - #define Z_EROFS_ONSTACK_PAGES 32 /* @@ -801,7 +758,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, unsigned int cur, end, len, split; int err = 0; - z_erofs_onlinepage_init(page); + erofs_onlinepage_init(page); split = 0; end = PAGE_SIZE; repeat: @@ -862,7 +819,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, if (err) goto out; - z_erofs_onlinepage_split(page); + erofs_onlinepage_split(page); if (fe->pcl->length < offset + end - map->m_la) { fe->pcl->length = offset + end - map->m_la; fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; @@ -881,7 +838,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, goto repeat; out: - z_erofs_onlinepage_endio(page, err); + erofs_onlinepage_end(page, err); return err; } @@ -978,7 +935,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, cur += len; } kunmap_atomic(dst); - z_erofs_onlinepage_endio(bvi->bvec.page, err); + erofs_onlinepage_end(bvi->bvec.page, err); list_del(p); kfree(bvi); } @@ -1128,11 +1085,10 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, continue; DBG_BUGON(z_erofs_page_is_invalidated(page)); - /* recycle all individual short-lived pages */ if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - z_erofs_onlinepage_endio(page, err); + erofs_onlinepage_end(page, err); } if (be->decompressed_pages != be->onstack_pages) -- Gitee From 5b44c4ba8750b225c4b61b8e5b51600cfa634c0f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Dec 2024 21:43:36 +0800 Subject: [PATCH 6/9] erofs: use buffered I/O for file-backed mounts by default ANBZ: #11854 commit 6422cde1b0d5a31b206b263417c1c2b3c80fe82c upstream. 
For many use cases (e.g. container images are just fetched from remote), performance will be impacted if underlay page cache is up-to-date but direct i/o flushes dirty pages first. Instead, let's use buffered I/O by default to keep in sync with loop devices and add a (re)mount option to explicitly give a try to use direct I/O if supported by the underlying files. The container startup time is improved as below: [workload] docker.io/library/workpress:latest unpack 1st run non-1st runs EROFS snapshotter buffered I/O file 4.586404265s 0.308s 0.198s EROFS snapshotter direct I/O file 4.581742849s 2.238s 0.222s EROFS snapshotter loop 4.596023152s 0.346s 0.201s Overlayfs snapshotter 5.382851037s 0.206s 0.214s Fixes: fb176750266a ("erofs: add file-backed mount support") Cc: Derek McGowan Reviewed-by: Chao Yu Conflicts: fs/erofs/fileio.c fs/erofs/super.c Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212134336.2059899-1-hsiangkao@linux.alibaba.com --- fs/erofs/fileio.c | 12 ++++++++---- fs/erofs/internal.h | 1 + fs/erofs/super.c | 29 ++++++++++++++++++----------- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 2b6a3aa3355a..e22fd0912df2 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -10,6 +10,7 @@ struct erofs_fileio_rq { struct bio_vec bvecs[BIO_MAX_PAGES]; struct bio bio; struct kiocb iocb; + struct super_block *sb; }; struct erofs_fileio { @@ -53,8 +54,9 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; rq->iocb.ki_ioprio = get_current_ioprio(); rq->iocb.ki_complete = erofs_fileio_ki_complete; - rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mapping->a_ops->direct_IO) ? 
- IOCB_DIRECT : 0; + if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) && + rq->iocb.ki_filp->f_mapping->a_ops->direct_IO) + rq->iocb.ki_flags = IOCB_DIRECT; iov_iter_bvec(&iter, READ, rq->bvecs, rq->bio.bi_vcnt, rq->bio.bi_iter.bi_size); ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); @@ -62,7 +64,8 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) erofs_fileio_ki_complete(&rq->iocb, ret, 0); } -static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) +static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct super_block *sb, + struct erofs_map_dev *mdev) { struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq), GFP_KERNEL | __GFP_NOFAIL); @@ -70,6 +73,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) bio_init(&rq->bio, rq->bvecs, BIO_MAX_PAGES); rq->bio.bi_opf = REQ_OP_READ; rq->iocb.ki_filp = mdev->m_fp; + rq->sb = sb; return rq; } @@ -127,7 +131,7 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct page *page) err = erofs_map_dev(inode->i_sb, &io->dev); if (err) break; - io->rq = erofs_fileio_rq_alloc(&io->dev); + io->rq = erofs_fileio_rq_alloc(inode->i_sb, &io->dev); io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9; attached = 0; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 32c7ee5a0423..dcfe0ba2cea0 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -166,6 +166,7 @@ struct erofs_sb_info { #define EROFS_MOUNT_POSIX_ACL 0x00000020 #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 +#define EROFS_MOUNT_DIRECT_IO 0x00000100 #define EROFS_MOUNT_BLOB_MMAP_PIN 0x80000000 diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 14e47d3129a9..8a7f87ed17e4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -400,14 +400,8 @@ static void erofs_default_options(struct erofs_sb_info *sbi) } enum { - Opt_user_xattr, - Opt_acl, - Opt_cache_strategy, - Opt_dax, - Opt_dax_enum, - Opt_device, - Opt_fsid, - 
Opt_domain_id, + Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, #ifdef CONFIG_EROFS_FS_RAFS_V6 Opt_bootstrap_path, Opt_blob_dir_path, @@ -436,9 +430,10 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { erofs_param_cache_strategy), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), - fsparam_string("device", Opt_device), - fsparam_string("fsid", Opt_fsid), - fsparam_string("domain_id", Opt_domain_id), + fsparam_string("device", Opt_device), + fsparam_string("fsid", Opt_fsid), + fsparam_string("domain_id", Opt_domain_id), + fsparam_flag_no("directio", Opt_directio), #ifdef CONFIG_EROFS_FS_RAFS_V6 fsparam_string("bootstrap_path", Opt_bootstrap_path), fsparam_string("blob_dir_path", Opt_blob_dir_path), @@ -580,6 +575,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, clear_opt(&sbi->opt, BLOB_MMAP_PIN); break; #endif + case Opt_directio: +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (result.boolean) + set_opt(&sbi->opt, DIRECT_IO); + else + clear_opt(&sbi->opt, DIRECT_IO); +#else + errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); +#endif + break; default: return -ENOPARAM; } @@ -1147,6 +1152,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); + if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO)) + seq_puts(seq, ",directio"); #ifdef CONFIG_EROFS_FS_ONDEMAND if (sbi->fsid) seq_printf(seq, ",fsid=%s", sbi->fsid); -- Gitee From ecddd40c5165873e368f2f861f7984eb5659968e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 7 Jan 2025 16:28:25 +0800 Subject: [PATCH 7/9] erofs: shorten bvecs[] for file-backed mounts ANBZ: #11854 commit 9f74ae8c9ac97a79f9d45c92bd8ac8598e17f21f upstream. BIO_MAX_VECS is too large for __GFP_NOFAIL allocation. 
We could use a mempool (since BIOs can always proceed), but it seems overly complicated for now. Reviewed-by: Chao Yu Conflicts: fs/erofs/fileio.c Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250107082825.74242-1-hsiangkao@linux.alibaba.com --- fs/erofs/fileio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index e22fd0912df2..d27deb2dbc32 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -7,7 +7,7 @@ #include struct erofs_fileio_rq { - struct bio_vec bvecs[BIO_MAX_PAGES]; + struct bio_vec bvecs[16]; struct bio bio; struct kiocb iocb; struct super_block *sb; @@ -70,7 +70,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct super_block *sb, struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq), GFP_KERNEL | __GFP_NOFAIL); - bio_init(&rq->bio, rq->bvecs, BIO_MAX_PAGES); + bio_init(&rq->bio, rq->bvecs, ARRAY_SIZE(rq->bvecs)); rq->bio.bi_opf = REQ_OP_READ; rq->iocb.ki_filp = mdev->m_fp; rq->sb = sb; -- Gitee From 6633717200025c1c1c398d2f5a5d2f9f1ee7b412 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 29 Apr 2025 01:09:33 +0200 Subject: [PATCH 8/9] fs/erofs/fileio: call erofs_onlinefolio_split() after bio_add_folio() ANBZ: #11854 commit bbfe756dc3062c1e934f06e5ba39c239aa953b92 upstream. If bio_add_folio() fails (because it is full), erofs_fileio_scan_folio() needs to submit the I/O request via erofs_fileio_rq_submit() and allocate a new I/O request with an empty `struct bio`. Then it retries the bio_add_folio() call. However, at this point, erofs_onlinefolio_split() has already been called which increments `folio->private`; the retry will call erofs_onlinefolio_split() again, but there will never be a matching erofs_onlinefolio_end() call. This leaves the folio locked forever and all waiters will be stuck in folio_wait_bit_common(). 
This bug has been added by commit ce63cb62d794 ("erofs: support unencoded inodes for fileio"), but was practically unreachable because there was room for 256 folios in the `struct bio` - until commit 9f74ae8c9ac9 ("erofs: shorten bvecs[] for file-backed mounts") which reduced the array capacity to 16 folios. It was now trivial to trigger the bug by manually invoking readahead from userspace, e.g.: posix_fadvise(fd, 0, st.st_size, POSIX_FADV_WILLNEED); This should be fixed by invoking erofs_onlinefolio_split() only after bio_add_folio() has succeeded. This is safe: asynchronous completions invoking erofs_onlinefolio_end() will not unlock the folio because erofs_fileio_scan_folio() is still holding a reference to be released by erofs_onlinefolio_end() at the end. Fixes: ce63cb62d794 ("erofs: support unencoded inodes for fileio") Fixes: 9f74ae8c9ac9 ("erofs: shorten bvecs[] for file-backed mounts") Cc: stable@vger.kernel.org Signed-off-by: Max Kellermann Reviewed-by: Gao Xiang Tested-by: Hongbo Li Link: https://lore.kernel.org/r/20250428230933.3422273-1-max.kellermann@ionos.com Signed-off-by: Gao Xiang Conflicts: fs/erofs/fileio.c Signed-off-by: Gao Xiang --- fs/erofs/fileio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index d27deb2dbc32..328b7d63a62a 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -135,10 +135,10 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct page *page) io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9; attached = 0; } - if (!attached++) - erofs_onlinepage_split(page); if (!bio_add_page(&io->rq->bio, page, len, cur)) goto io_retry; + if (!attached++) + erofs_onlinepage_split(page); io->dev.m_pa += len; } cur += len; -- Gitee From 101333fb42a7374fa5647b666292113ae0df2561 Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Thu, 12 Jun 2025 19:18:25 +0900 Subject: [PATCH 9/9] erofs: impersonate the opener's credentials when accessing backing file ANBZ: 
#11854 commit 905eeb2b7c33adda23a966aeb811ab4cb9e62031 upstream. Previously, file operations on a file-backed mount used the current process' credentials to access the backing FD. Attempting to do so on Android led to SELinux denials, as ACL rules on the backing file (e.g. /system/apex/foo.apex) are restricted to a small set of processes. Arguably, this error is redundant and leaks implementation details, as access to files on a mount is already ACL'ed by path. Instead, override to use the opener's cred when accessing the backing file. This makes the behavior similar to a loop-backed mount, which uses kworker cred when accessing the backing file and does not cause SELinux denials. Signed-off-by: Tatsuyuki Ishi Reviewed-by: Gao Xiang Reviewed-by: Hongbo Li Link: https://lore.kernel.org/r/20250612-b4-erofs-impersonate-v1-1-8ea7d6f65171@google.com Signed-off-by: Gao Xiang --- fs/erofs/fileio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 328b7d63a62a..7144d629510d 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -46,6 +46,7 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret, long res2) static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) { + const struct cred *old_cred; struct iov_iter iter; int ret; @@ -59,7 +60,9 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) rq->iocb.ki_flags = IOCB_DIRECT; iov_iter_bvec(&iter, READ, rq->bvecs, rq->bio.bi_vcnt, rq->bio.bi_iter.bi_size); + old_cred = override_creds(rq->iocb.ki_filp->f_cred); ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); + revert_creds(old_cred); if (ret != -EIOCBQUEUED) erofs_fileio_ki_complete(&rq->iocb, ret, 0); } -- Gitee