diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 0bfaa7e2940a29ba25912f69119f23c9f8d656ed..a6714bed3e73a52e8330046205a2ff5318d89a7b 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -73,6 +73,23 @@ config EROFS_FS_SECURITY If you are not using a security module, say N. +config EROFS_FS_BACKED_BY_FILE + bool "File-backed EROFS filesystem support" + depends on EROFS_FS + default y + help + This allows EROFS to use filesystem image files directly, without + the intercession of loopback block devices or likewise. It is + particularly useful for container images with numerous blobs and + other sandboxes, where loop devices behave intricately. It can also + be used to simplify error-prone lifetime management of unnecessary + virtual block devices. + + Note that this feature, along with ongoing fanotify pre-content + hooks, will eventually replace "EROFS over fscache." + + If you don't want to enable this feature, say N. + config EROFS_FS_RAFS_V6 bool "EROFS RAFSv6 kangaroo support (EROFS over virtiofs)" depends on VIRTIO_FS diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 617a4c0e3389b5714ebc65c2a633998a71fb4c80..95ac00571a0077383b02d3d390e3fba9660faed9 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -7,4 +7,5 @@ erofs-$(CONFIG_EROFS_FS_RAFS_V6) += rafsv6.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o +erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/data.c b/fs/erofs/data.c index a9a411fd59583f478f23ccbcd6cc7b72f955b622..3f1b85d7423174f405e53acdc5077100bb26e193 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -34,12 +34,14 @@ void erofs_put_metabuf(struct erofs_buf *buf) put_page(buf->page); buf->page = NULL; +#ifdef CONFIG_EROFS_FS_RAFS_V6 if (buf->mapping) { buf->mapping->a_ops->endpfn(buf->mapping, index, &buf->iomap, 0); buf->mapping = NULL; memset(&buf->iomap, 0, sizeof(buf->iomap)); } +#endif } /* @@ -47,9 +49,9 @@ void erofs_put_metabuf(struct erofs_buf *buf) * anonymous inode in fscache mode. */ void *__erofs_bread(struct super_block *sb, struct erofs_buf *buf, - struct inode *inode, erofs_blk_t blkaddr, - enum erofs_kmap_type type) + erofs_blk_t blkaddr, enum erofs_kmap_type type) { + struct inode *inode = buf->inode; erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits; struct address_space *const mapping = inode->i_mapping; pgoff_t index = offset >> PAGE_SHIFT; @@ -100,25 +102,38 @@ void *__erofs_bread(struct super_block *sb, struct erofs_buf *buf, return buf->base + (offset & ~PAGE_MASK); } -void *erofs_bread(struct erofs_buf *buf, struct inode *inode, - erofs_blk_t blkaddr, enum erofs_kmap_type type) +void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, + enum erofs_kmap_type type) +{ + return __erofs_bread(NULL, buf, blkaddr, type); +} + +void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { - return __erofs_bread(NULL, buf, inode, blkaddr, type); + struct erofs_sb_info *sbi = EROFS_SB(sb); + + if (erofs_is_fileio_mode(sbi)) + buf->inode = file_inode(sbi->fdev); + else if (erofs_is_fscache_mode(sb)) + buf->inode = sbi->s_fscache->inode; +#ifdef CONFIG_EROFS_FS_RAFS_V6 + else if (erofs_is_rafsv6_mode(sb)) + buf->inode = sbi->bootstrap->f_inode; +#endif + else + buf->inode = sb->s_bdev->bd_inode; + } void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, erofs_blk_t blkaddr, enum erofs_kmap_type type) { + erofs_init_metabuf(buf, sb); #ifdef CONFIG_EROFS_FS_RAFS_V6 if (erofs_is_rafsv6_mode(sb)) - return __erofs_bread(sb, buf, EROFS_SB(sb)->bootstrap->f_inode, - blkaddr, type); + return __erofs_bread(sb, buf, blkaddr, type); #endif - if (erofs_is_fscache_mode(sb)) - return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode, - blkaddr, type); - - return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type); + return erofs_bread(buf, blkaddr, type); } int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) @@ -137,6 +152,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid = 0; + map->m_flags = 0; if (map->m_la >= inode->i_size) goto out; @@ -212,16 +228,32 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) return err; } +static void erofs_fill_from_devinfo(struct erofs_map_dev *map, + struct erofs_device_info *dif) +{ + map->m_bdev = dif->bdev; + map->m_fp = dif->file; +#ifdef CONFIG_EROFS_FS_RAFS_V6 + if (dif->blobfile) + map->m_fp = dif->blobfile; +#endif + map->m_daxdev = dif->dax_dev; + map->m_fscache = dif->fscache; +} + int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_device_info *dif; + erofs_off_t startoff, length; int id; map->m_bdev = sb->s_bdev; map->m_daxdev = EROFS_SB(sb)->dax_dev; + map->m_fp = EROFS_SB(sb)->fdev; #ifdef CONFIG_EROFS_FS_RAFS_V6 - map->m_fp = EROFS_SB(sb)->bootstrap; + if (EROFS_SB(sb)->bootstrap) + map->m_fp = EROFS_SB(sb)->bootstrap; #endif map->m_fscache = EROFS_SB(sb)->s_fscache; @@ -237,32 +269,19 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - map->m_bdev = dif->bdev; - map->m_daxdev = dif->dax_dev; -#ifdef CONFIG_EROFS_FS_RAFS_V6 - map->m_fp = dif->blobfile; -#endif - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { - erofs_off_t startoff, length; - if (!dif->uniaddr) continue; startoff = erofs_pos(sb, dif->uniaddr); length = erofs_pos(sb, dif->blocks); - if (map->m_pa >= startoff && map->m_pa < startoff + erofs_pos(sb, dif->blocks)) { map->m_pa -= startoff; - map->m_bdev = dif->bdev; - map->m_daxdev = dif->dax_dev; -#ifdef CONFIG_EROFS_FS_RAFS_V6 - map->m_fp = dif->blobfile; -#endif - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, dif); break; } } @@ -271,6 +290,49 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) return 0; } +/* + * bit 30: I/O error occurred on this page + * bit 0 - 29: remaining parts to complete this page + */ +#define Z_EROFS_PAGE_EIO (1 << 30) + +void erofs_onlinepage_init(struct page *page) +{ + union { + atomic_t o; + unsigned long v; + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +void erofs_onlinepage_split(struct page *page) +{ + atomic_inc((atomic_t *)&page->private); +} + +void erofs_onlinepage_end(struct page *page, int err) +{ + int orig, v; + + DBG_BUGON(!PagePrivate(page)); + + do { + orig = atomic_read((atomic_t *)&page->private); + v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); + + if (!(v & ~Z_EROFS_PAGE_EIO)) { + set_page_private(page, 0); + ClearPagePrivate(page); + if (!(v & Z_EROFS_PAGE_EIO)) + SetPageUptodate(page); + unlock_page(page); + } +} + static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { @@ -412,7 +474,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /* for uncompressed (aligned) files and raw access for other files */ -const struct address_space_operations erofs_raw_access_aops = { +const struct address_space_operations erofs_aops = { .readpage = erofs_readpage, .readahead = erofs_readahead, .bmap = erofs_bmap, diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 88b3ddaa7351b67542362c1cb4c60fe77c78aa48..468abf993709d222298f3e0b54c4509ae0a15282 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -58,11 +58,12 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) int err = 0; bool initial = true; + buf.inode = dir; while (ctx->pos < dirsize) { struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, dir, i, EROFS_KMAP); + de = erofs_bread(&buf, i, EROFS_KMAP); if (IS_ERR(de)) { erofs_err(sb, "fail to readdir of logical block %u of nid %llu", i, EROFS_I(dir)->nid); diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c new file mode 100644 index 0000000000000000000000000000000000000000..7144d629510dc55e1bb3a1f602476fa524fd23e3 --- /dev/null +++ b/fs/erofs/fileio.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024, Alibaba Cloud + */ +#include "internal.h" +#include +#include + +struct erofs_fileio_rq { + struct bio_vec bvecs[16]; + struct bio bio; + struct kiocb iocb; + struct super_block *sb; +}; + +struct erofs_fileio { + struct erofs_map_blocks map; + struct erofs_map_dev dev; + struct erofs_fileio_rq *rq; +}; + +static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret, long res2) +{ + struct erofs_fileio_rq *rq = + container_of(iocb, struct erofs_fileio_rq, iocb); + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + DBG_BUGON(rq->bio.bi_end_io); + if (ret > 0) { + if (ret != rq->bio.bi_iter.bi_size) { + bio_advance(&rq->bio, ret); + zero_fill_bio(&rq->bio); + } + ret = 0; + } + bio_for_each_segment_all(bvec, &rq->bio, iter_all) { + struct page *page = bvec->bv_page; + + DBG_BUGON(PageUptodate(page)); + erofs_onlinepage_end(page, ret); + } + bio_uninit(&rq->bio); + kfree(rq); +} + +static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) +{ + const struct cred *old_cred; + struct iov_iter iter; + int ret; + + if (!rq) + return; + rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; + rq->iocb.ki_ioprio = get_current_ioprio(); + rq->iocb.ki_complete = erofs_fileio_ki_complete; + if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) && + rq->iocb.ki_filp->f_mapping->a_ops->direct_IO) + rq->iocb.ki_flags = IOCB_DIRECT; + iov_iter_bvec(&iter, READ, rq->bvecs, rq->bio.bi_vcnt, + rq->bio.bi_iter.bi_size); + old_cred = override_creds(rq->iocb.ki_filp->f_cred); + ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); + revert_creds(old_cred); + if (ret != -EIOCBQUEUED) + erofs_fileio_ki_complete(&rq->iocb, ret, 0); +} + +static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct super_block *sb, + struct erofs_map_dev *mdev) +{ + struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq), + GFP_KERNEL | __GFP_NOFAIL); + + bio_init(&rq->bio, rq->bvecs, ARRAY_SIZE(rq->bvecs)); + rq->bio.bi_opf = REQ_OP_READ; + rq->iocb.ki_filp = mdev->m_fp; + rq->sb = sb; + return rq; +} + +static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct erofs_map_blocks *map = &io->map; + unsigned int cur = 0, end = PAGE_SIZE, len, attached = 0; + loff_t pos = page->index << PAGE_SHIFT, ofs; + int err = 0; + + erofs_onlinepage_init(page); + while (cur < end) { + if (pos + cur < map->m_la || + pos + cur >= map->m_la + map->m_llen) { + map->m_la = pos + cur; + map->m_llen = end - cur; + err = erofs_map_blocks(inode, map); + if (err) + break; + } + + ofs = (page->index << PAGE_SHIFT) + cur - map->m_la; + len = min_t(loff_t, map->m_llen - ofs, end - cur); + if (map->m_flags & EROFS_MAP_META) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *src; + + src = erofs_read_metabuf(&buf, inode->i_sb, + erofs_blknr(inode->i_sb, map->m_pa + ofs), + EROFS_KMAP); + if (IS_ERR(src)) { + err = PTR_ERR(src); + break; + } + memcpy_to_page(page, cur, + src + erofs_blkoff(inode->i_sb, map->m_pa + ofs), len); + erofs_put_metabuf(&buf); + } else if (!(map->m_flags & EROFS_MAP_MAPPED)) { + zero_user_segment(page, cur, cur + len); + attached = 0; + } else { + if (io->rq && (map->m_pa + ofs != io->dev.m_pa || + map->m_deviceid != io->dev.m_deviceid)) { +io_retry: + erofs_fileio_rq_submit(io->rq); + io->rq = NULL; + } + + if (!io->rq) { + io->dev = (struct erofs_map_dev) { + .m_pa = io->map.m_pa + ofs, + .m_deviceid = io->map.m_deviceid, + }; + err = erofs_map_dev(inode->i_sb, &io->dev); + if (err) + break; + io->rq = erofs_fileio_rq_alloc(inode->i_sb, &io->dev); + io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9; + attached = 0; + } + if (!bio_add_page(&io->rq->bio, page, len, cur)) + goto io_retry; + if (!attached++) + erofs_onlinepage_split(page); + io->dev.m_pa += len; + } + cur += len; + } + erofs_onlinepage_end(page, err); + return err; +} + +static int erofs_fileio_readpage(struct file *file, struct page *page) +{ + struct erofs_fileio io = {}; + int err; + + trace_erofs_readpage(page, true); + err = erofs_fileio_scan_folio(&io, page); + erofs_fileio_rq_submit(io.rq); + return err; +} + +static void erofs_fileio_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct erofs_fileio io = {}; + struct page *page; + int err; + + trace_erofs_readpages(inode, readahead_index(rac), + readahead_count(rac), true); + while ((page = readahead_page(rac))) { + err = erofs_fileio_scan_folio(&io, page); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", + page->index, EROFS_I(inode)->nid); + } + erofs_fileio_rq_submit(io.rq); +} + +const struct address_space_operations erofs_fileio_aops = { + .readpage = erofs_fileio_readpage, + .readahead = erofs_fileio_readahead, +}; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index c11fd6fb35dec77f2e5a7034e70e1c694f452672..5dba847cf0b708c2e2aafbb5997e15c8a57abe00 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -239,19 +239,24 @@ static int erofs_fill_inode(struct inode *inode) if (erofs_inode_is_data_compressed(vi->datalayout)) { err = -EOPNOTSUPP; #ifdef CONFIG_EROFS_FS_ZIP - if (!erofs_is_fscache_mode(inode->i_sb)) { + if (!erofs_is_fscache_mode(inode->i_sb) && + !erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) { inode->i_mapping->a_ops = &z_erofs_aops; err = 0; } #endif } else if (erofs_is_rafsv6_mode(inode->i_sb)) { erofs_rafsv6_set_aops(inode); + } else { + inode->i_mapping->a_ops = &erofs_aops; #ifdef CONFIG_EROFS_FS_ONDEMAND - } else if (erofs_is_fscache_mode(inode->i_sb)) { - inode->i_mapping->a_ops = &erofs_fscache_access_aops; + if (erofs_is_fscache_mode(inode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; +#endif +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) + inode->i_mapping->a_ops = &erofs_fileio_aops; #endif - } else { - inode->i_mapping->a_ops = &erofs_raw_access_aops; } return err; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 62311d7028c7595a5a883360d50dcd09263c5bfc..dcfe0ba2cea0688b4dc41e446ee7ef57649db0f0 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -49,6 +49,7 @@ struct erofs_device_info { char *path; struct erofs_fscache *fscache; struct block_device *bdev; + struct file *file; struct dax_device *dax_dev; #ifdef CONFIG_EROFS_FS_RAFS_V6 struct file *blobfile; @@ -73,17 +74,6 @@ struct erofs_dev_context { bool flatdev; }; -struct erofs_fs_context { - struct erofs_mount_opts opt; - struct erofs_dev_context *devs; - char *fsid; - char *domain_id; -#ifdef CONFIG_EROFS_FS_RAFS_V6 - char *bootstrap_path; - char *blob_dir_path; -#endif -}; - /* all filesystem-wide lz4 configurations */ struct erofs_sb_lz4_info { /* # of pages needed for EROFS lz4 rolling decompression */ @@ -126,6 +116,7 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; struct inode *packed_inode; #endif /* CONFIG_EROFS_FS_ZIP */ + struct file *fdev; #ifdef CONFIG_EROFS_FS_RAFS_V6 struct path blob_dir; struct file *bootstrap; @@ -175,6 +166,7 @@ struct erofs_sb_info { #define EROFS_MOUNT_POSIX_ACL 0x00000020 #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 +#define EROFS_MOUNT_DIRECT_IO 0x00000100 #define EROFS_MOUNT_BLOB_MMAP_PIN 0x80000000 @@ -191,6 +183,11 @@ static inline bool erofs_is_rafsv6_mode(struct super_block *sb) #endif } +static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) +{ + return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev; +} + static inline bool erofs_is_fscache_mode(struct super_block *sb) { /* to distinguish from rafsv6 which also works in nodev mode */ @@ -217,8 +214,11 @@ enum erofs_kmap_type { }; struct erofs_buf { +#ifdef CONFIG_EROFS_FS_RAFS_V6 struct iomap iomap; struct address_space *mapping; +#endif + struct inode *inode; struct page *page; void *base; enum erofs_kmap_type kmap_type; @@ -331,7 +331,8 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, extern const struct super_operations erofs_sops; -extern const struct address_space_operations erofs_raw_access_aops; +extern const struct address_space_operations erofs_aops; +extern const struct address_space_operations erofs_fileio_aops; extern const struct address_space_operations z_erofs_aops; enum { @@ -399,9 +400,7 @@ struct erofs_map_dev { struct erofs_fscache *m_fscache; struct block_device *m_bdev; struct dax_device *m_daxdev; -#ifdef CONFIG_EROFS_FS_RAFS_V6 struct file *m_fp; -#endif erofs_off_t m_pa; unsigned int m_deviceid; }; @@ -411,8 +410,9 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp); void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); -void *erofs_bread(struct erofs_buf *buf, struct inode *inode, - erofs_blk_t blkaddr, enum erofs_kmap_type type); +void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, + enum erofs_kmap_type type); +void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, erofs_blk_t blkaddr, enum erofs_kmap_type type); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); @@ -433,6 +433,9 @@ extern const struct inode_operations erofs_generic_iops; extern const struct inode_operations erofs_symlink_iops; extern const struct inode_operations erofs_fast_symlink_iops; +void erofs_onlinepage_init(struct page *page); +void erofs_onlinepage_split(struct page *page); +void erofs_onlinepage_end(struct page *page, int err); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index fdd6a78b74ec9385f1023d455dc714ad01daed2f..facb595fac8fbb478ca5f94155bdee1982386b20 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -99,7 +99,8 @@ static void *erofs_find_target_block(struct erofs_buf *target, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; - de = erofs_bread(&buf, dir, mid, EROFS_KMAP); + buf.inode = dir; + de = erofs_bread(&buf, mid, EROFS_KMAP); if (!IS_ERR(de)) { const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 92a3c6615853c9cc03cb0243d04dc677637812a9..8a7f87ed17e40ba01b048ae8acdf0f540d13e800 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "xattr.h" #define CREATE_TRACE_POINTS @@ -132,6 +133,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_deviceslot *dis; struct block_device *bdev; void *ptr; + struct file *file; ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP); if (IS_ERR(ptr)) @@ -166,11 +168,19 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, dif->blobfile = f; #endif } else if (!sbi->devs->flatdev) { - bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, - sb->s_type); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - dif->bdev = bdev; + if (erofs_is_fileio_mode(sbi)) { + file = filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + dif->file = file; + } else { + bdev = blkdev_get_by_path(dif->path, + FMODE_READ | FMODE_EXCL, + sb->s_type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + dif->bdev = bdev; + } } dif->blocks = le32_to_cpu(dis->blocks_lo); @@ -375,29 +385,23 @@ static int erofs_read_superblock(struct super_block *sb) return ret; } -static void erofs_default_options(struct erofs_fs_context *ctx) +static void erofs_default_options(struct erofs_sb_info *sbi) { #ifdef CONFIG_EROFS_FS_ZIP - ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; - ctx->opt.max_sync_decompress_pages = 3; + sbi->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; + sbi->opt.max_sync_decompress_pages = 3; #endif #ifdef CONFIG_EROFS_FS_XATTR - set_opt(&ctx->opt, XATTR_USER); + set_opt(&sbi->opt, XATTR_USER); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - set_opt(&ctx->opt, POSIX_ACL); + set_opt(&sbi->opt, POSIX_ACL); #endif } enum { - Opt_user_xattr, - Opt_acl, - Opt_cache_strategy, - Opt_dax, - Opt_dax_enum, - Opt_device, - Opt_fsid, - Opt_domain_id, + Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, #ifdef CONFIG_EROFS_FS_RAFS_V6 Opt_bootstrap_path, Opt_blob_dir_path, @@ -426,9 +430,10 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { erofs_param_cache_strategy), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), - fsparam_string("device", Opt_device), - fsparam_string("fsid", Opt_fsid), - fsparam_string("domain_id", Opt_domain_id), + fsparam_string("device", Opt_device), + fsparam_string("fsid", Opt_fsid), + fsparam_string("domain_id", Opt_domain_id), + fsparam_flag_no("directio", Opt_directio), #ifdef CONFIG_EROFS_FS_RAFS_V6 fsparam_string("bootstrap_path", Opt_bootstrap_path), fsparam_string("blob_dir_path", Opt_blob_dir_path), @@ -440,17 +445,17 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) { #ifdef CONFIG_FS_DAX - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = fc->s_fs_info; switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - set_opt(&ctx->opt, DAX_ALWAYS); - clear_opt(&ctx->opt, DAX_NEVER); + set_opt(&sbi->opt, DAX_ALWAYS); + clear_opt(&sbi->opt, DAX_NEVER); return true; case EROFS_MOUNT_DAX_NEVER: - set_opt(&ctx->opt, DAX_NEVER); - clear_opt(&ctx->opt, DAX_ALWAYS); + set_opt(&sbi->opt, DAX_NEVER); + clear_opt(&sbi->opt, DAX_ALWAYS); return true; default: DBG_BUGON(1); @@ -465,7 +470,7 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; struct erofs_device_info *dif; int opt, ret; @@ -478,9 +483,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_user_xattr: #ifdef CONFIG_EROFS_FS_XATTR if (result.boolean) - set_opt(&ctx->opt, XATTR_USER); + set_opt(&sbi->opt, XATTR_USER); else - clear_opt(&ctx->opt, XATTR_USER); + clear_opt(&sbi->opt, XATTR_USER); #else errorfc(fc, "{,no}user_xattr options not supported"); #endif @@ -488,16 +493,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_acl: #ifdef CONFIG_EROFS_FS_POSIX_ACL if (result.boolean) - set_opt(&ctx->opt, POSIX_ACL); + set_opt(&sbi->opt, POSIX_ACL); else - clear_opt(&ctx->opt, POSIX_ACL); + clear_opt(&sbi->opt, POSIX_ACL); #else errorfc(fc, "{,no}acl options not supported"); #endif break; case Opt_cache_strategy: #ifdef CONFIG_EROFS_FS_ZIP - ctx->opt.cache_strategy = result.uint_32; + sbi->opt.cache_strategy = result.uint_32; #else errorfc(fc, "compression not supported, cache_strategy ignored"); #endif @@ -519,21 +524,21 @@ static int erofs_fc_parse_param(struct fs_context *fc, kfree(dif); return -ENOMEM; } - down_write(&ctx->devs->rwsem); - ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL); - up_write(&ctx->devs->rwsem); + down_write(&sbi->devs->rwsem); + ret = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL); + up_write(&sbi->devs->rwsem); if (ret < 0) { kfree(dif->path); kfree(dif); return ret; } - ++ctx->devs->extra_devices; + ++sbi->devs->extra_devices; break; case Opt_fsid: #ifdef CONFIG_EROFS_FS_ONDEMAND - kfree(ctx->fsid); - ctx->fsid = kstrdup(param->string, GFP_KERNEL); - if (!ctx->fsid) + kfree(sbi->fsid); + sbi->fsid = kstrdup(param->string, GFP_KERNEL); + if (!sbi->fsid) return -ENOMEM; #else errorfc(fc, "fsid option not supported"); @@ -542,9 +547,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, break; case Opt_domain_id: #ifdef CONFIG_EROFS_FS_ONDEMAND - kfree(ctx->domain_id); - ctx->domain_id = kstrdup(param->string, GFP_KERNEL); - if (!ctx->domain_id) + kfree(sbi->domain_id); + sbi->domain_id = kstrdup(param->string, GFP_KERNEL); + if (!sbi->domain_id) return -ENOMEM; #else errorfc(fc, "domain_id option not supported"); @@ -552,24 +557,34 @@ static int erofs_fc_parse_param(struct fs_context *fc, break; #ifdef CONFIG_EROFS_FS_RAFS_V6 case Opt_bootstrap_path: - kfree(ctx->bootstrap_path); - ctx->bootstrap_path = kstrdup(param->string, GFP_KERNEL); - if (!ctx->bootstrap_path) + kfree(sbi->bootstrap_path); + sbi->bootstrap_path = kstrdup(param->string, GFP_KERNEL); + if (!sbi->bootstrap_path) return -ENOMEM; break; case Opt_blob_dir_path: - kfree(ctx->blob_dir_path); - ctx->blob_dir_path = kstrdup(param->string, GFP_KERNEL); - if (!ctx->blob_dir_path) + kfree(sbi->blob_dir_path); + sbi->blob_dir_path = kstrdup(param->string, GFP_KERNEL); + if (!sbi->blob_dir_path) return -ENOMEM; break; case Opt_blob_mmap_pin: if (result.boolean) - set_opt(&ctx->opt, BLOB_MMAP_PIN); + set_opt(&sbi->opt, BLOB_MMAP_PIN); else - clear_opt(&ctx->opt, BLOB_MMAP_PIN); + clear_opt(&sbi->opt, BLOB_MMAP_PIN); break; #endif + case Opt_directio: +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (result.boolean) + set_opt(&sbi->opt, DIRECT_IO); + else + clear_opt(&sbi->opt, DIRECT_IO); +#else + errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); +#endif + break; default: return -ENOPARAM; } @@ -715,8 +730,7 @@ static int rafs_v6_fill_super(struct super_block *sb) static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; - struct erofs_sb_info *sbi; - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = EROFS_SB(sb); int err; sb->s_magic = EROFS_SUPER_MAGIC; @@ -724,30 +738,20 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &erofs_sops; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - - sb->s_fs_info = sbi; - sbi->opt = ctx->opt; - sbi->devs = ctx->devs; - ctx->devs = NULL; - sbi->fsid = ctx->fsid; - ctx->fsid = NULL; - sbi->domain_id = ctx->domain_id; - ctx->domain_id = NULL; -#ifdef CONFIG_EROFS_FS_RAFS_V6 - sbi->bootstrap_path = ctx->bootstrap_path; - ctx->bootstrap_path = NULL; - sbi->blob_dir_path = ctx->blob_dir_path; - ctx->blob_dir_path = NULL; -#endif - sbi->blkszbits = PAGE_SHIFT; if (!sb->s_bdev) { - /* fscache or rafsv6 mode */ sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; + + if (erofs_is_fscache_mode(sb)) { + err = erofs_fscache_register_fs(sb); + if (err) + return err; + } + + err = super_setup_bdi(sb); + if (err) + return err; } else { if (!sb_set_blocksize(sb, PAGE_SIZE)) { errorfc(fc, "failed to set initial blksize"); @@ -756,16 +760,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, NULL, NULL); } - if (erofs_is_fscache_mode(sb)) { - err = erofs_fscache_register_fs(sb); - if (err) - return err; - - err = super_setup_bdi(sb); - if (err) - return err; - } - err = rafs_v6_fill_super(sb); if (err) return err; @@ -843,60 +837,73 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) static int erofs_fc_get_tree(struct fs_context *fc) { - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = fc->s_fs_info; + int ret; #ifdef CONFIG_EROFS_FS_RAFS_V6 - if (ctx->blob_dir_path && !ctx->bootstrap_path) { + if (sbi->blob_dir_path && !sbi->bootstrap_path) { errorfc(fc, "bootstrap_path required in RAFS mode"); return -EINVAL; } - if (ctx->bootstrap_path && ctx->fsid) { + if (sbi->bootstrap_path && sbi->fsid) { errorfc(fc, "fscache/RAFS modes are mutually exclusive"); return -EINVAL; } #endif - if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid) + if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); #ifdef CONFIG_EROFS_FS_RAFS_V6 - if (ctx->bootstrap_path && ctx->blob_dir_path) + if (sbi->bootstrap_path && sbi->blob_dir_path) + return get_tree_nodev(fc, erofs_fc_fill_super); +#endif + ret = get_tree_bdev(fc, erofs_fc_fill_super); +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (ret == -ENOTBLK) { + if (!fc->source) + return invalf(fc, "No source specified"); + sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(sbi->fdev)) + return PTR_ERR(sbi->fdev); + return get_tree_nodev(fc, erofs_fc_fill_super); + } #endif - return get_tree_bdev(fc, erofs_fc_fill_super); + return ret; } static int erofs_fc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *new_sbi = fc->s_fs_info; DBG_BUGON(!sb_rdonly(sb)); - if (ctx->fsid || ctx->domain_id) + if (new_sbi->fsid || new_sbi->domain_id) erofs_info(sb, "ignoring reconfiguration for fsid|domain_id."); #ifdef CONFIG_EROFS_FS_RAFS_V6 - if (test_opt(&ctx->opt, BLOB_MMAP_PIN) != - test_opt(&sbi->opt, BLOB_MMAP_PIN)) { + if (test_opt(&sbi->opt, BLOB_MMAP_PIN) != + test_opt(&new_sbi->opt, BLOB_MMAP_PIN)) { erofs_info(sb, "ignoring reconfiguration for rafsv6's blob_mmap_pin."); - if (test_opt(&sbi->opt, BLOB_MMAP_PIN)) - set_opt(&ctx->opt, BLOB_MMAP_PIN); + if (test_opt(&new_sbi->opt, BLOB_MMAP_PIN)) + set_opt(&sbi->opt, BLOB_MMAP_PIN); else - clear_opt(&ctx->opt, BLOB_MMAP_PIN); + clear_opt(&sbi->opt, BLOB_MMAP_PIN); } #endif - if (test_opt(&ctx->opt, POSIX_ACL)) + if (test_opt(&new_sbi->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else fc->sb_flags &= ~SB_POSIXACL; - sbi->opt = ctx->opt; + sbi->opt = new_sbi->opt; fc->sb_flags |= SB_RDONLY; return 0; @@ -913,6 +920,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) if (dif->blobfile) filp_close(dif->blobfile, NULL); #endif + if (dif->file) + fput(dif->file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); @@ -931,12 +940,15 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs) static void erofs_fc_free(struct fs_context *fc) { - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = fc->s_fs_info; + + if (!sbi) + return; - erofs_free_dev_context(ctx->devs); - kfree(ctx->fsid); - kfree(ctx->domain_id); - kfree(ctx); + erofs_free_dev_context(sbi->devs); + kfree(sbi->fsid); + kfree(sbi->domain_id); + kfree(sbi); } static const struct fs_context_operations erofs_context_ops = { @@ -948,21 +960,22 @@ static const struct fs_context_operations erofs_context_ops = { static int erofs_init_fs_context(struct fs_context *fc) { - struct erofs_fs_context *ctx; + struct erofs_sb_info *sbi; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) return -ENOMEM; - ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); - if (!ctx->devs) { - kfree(ctx); + + sbi->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); + if (!sbi->devs) { + kfree(sbi); return -ENOMEM; } - fc->fs_private = ctx; + fc->s_fs_info = sbi; - idr_init(&ctx->devs->tree); - init_rwsem(&ctx->devs->rwsem); - erofs_default_options(ctx); + idr_init(&sbi->devs->tree); + init_rwsem(&sbi->devs->rwsem); + erofs_default_options(sbi); fc->ops = &erofs_context_ops; return 0; } @@ -971,8 +984,6 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi; - WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); - /* pseudo mount for anon inodes */ if (sb->s_flags & SB_KERNMOUNT) { kill_anon_super(sb); @@ -1000,6 +1011,8 @@ static void erofs_kill_sb(struct super_block *sb) erofs_fscache_unregister_fs(sb); kfree(sbi->fsid); kfree(sbi->domain_id); + if (sbi->fdev) + fput(sbi->fdev); kfree(sbi); sb->s_fs_info = NULL; } @@ -1115,7 +1128,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EROFS_NAME_LEN; if (uuid_is_null(&sb->s_uuid)) - buf->f_fsid = u64_to_fsid(erofs_is_fscache_mode(sb) ? 0 : + buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 : huge_encode_dev(sb->s_bdev->bd_dev)); else buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); @@ -1139,6 +1152,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); + if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO)) + seq_puts(seq, ",directio"); #ifdef CONFIG_EROFS_FS_ONDEMAND if (sbi->fsid) seq_printf(seq, ",fsid=%s", sbi->fsid); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 0735018d6d551833ab0462a4be413e0119dfa448..07d9f00319ade2b7e7b8725287eadd8a950679eb 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -109,49 +109,6 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; } -/* - * bit 30: I/O error occurred on this page - * bit 0 - 29: remaining parts to complete this page - */ -#define Z_EROFS_PAGE_EIO (1 << 30) - -static inline void z_erofs_onlinepage_init(struct page *page) -{ - union { - atomic_t o; - unsigned long v; - } u = { .o = ATOMIC_INIT(1) }; - - set_page_private(page, u.v); - smp_wmb(); - SetPagePrivate(page); -} - -static inline void z_erofs_onlinepage_split(struct page *page) -{ - atomic_inc((atomic_t *)&page->private); -} - -static void z_erofs_onlinepage_endio(struct page *page, int err) -{ - int orig, v; - - DBG_BUGON(!PagePrivate(page)); - - do { - orig = atomic_read((atomic_t *)&page->private); - v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0); - } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); - - if (!(v & ~Z_EROFS_PAGE_EIO)) { - set_page_private(page, 0); - ClearPagePrivate(page); - if (!(v & Z_EROFS_PAGE_EIO)) - SetPageUptodate(page); - unlock_page(page); - } -} - #define Z_EROFS_ONSTACK_PAGES 32 /* @@ -775,11 +732,11 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page, if (!packed_inode) return -EFSCORRUPTED; + buf.inode = packed_inode; for (; cur < end; cur += cnt, pos += cnt) { cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); - src = erofs_bread(&buf, packed_inode, - erofs_blknr(sb, pos), EROFS_KMAP); + src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); @@ -801,7 +758,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, unsigned int cur, end, len, split; int err = 0; - z_erofs_onlinepage_init(page); + erofs_onlinepage_init(page); split = 0; end = PAGE_SIZE; repeat: @@ -862,7 +819,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, if (err) goto out; - z_erofs_onlinepage_split(page); + erofs_onlinepage_split(page); if (fe->pcl->length < offset + end - map->m_la) { fe->pcl->length = offset + end - map->m_la; fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; @@ -881,7 +838,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, goto repeat; out: - z_erofs_onlinepage_endio(page, err); + erofs_onlinepage_end(page, err); return err; } @@ -978,7 +935,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, cur += len; } kunmap_atomic(dst); - z_erofs_onlinepage_endio(bvi->bvec.page, err); + erofs_onlinepage_end(bvi->bvec.page, err); list_del(p); kfree(bvi); } @@ -1128,11 +1085,10 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, continue; DBG_BUGON(z_erofs_page_is_invalidated(page)); - /* recycle all individual short-lived pages */ if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - z_erofs_onlinepage_endio(page, err); + erofs_onlinepage_end(page, err); } if (be->decompressed_pages != be->onstack_pages)