new file mode 100644
@@ -0,0 +1,672 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright IBM Corporation, 2007
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ */
+
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+
+/*
+ * The contiguous blocks details which can be
+ * represented by a single extent
+ */
+struct migrate_struct {
+ ext4_lblk_t first_block, last_block, curr_block;
+ ext4_fsblk_t first_pblock, last_pblock;
+};
+
+static int finish_range(handle_t *handle, struct inode *inode,
+ struct migrate_struct *lb)
+
+{
+ int retval = 0, needed;
+ struct ext4_extent newext;
+ struct ext4_ext_path *path;
+ if (lb->first_pblock == 0)
+ return 0;
+
+ /* Add the extent to temp inode*/
+ newext.ee_block = cpu_to_le32(lb->first_block);
+ newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
+ ext4_ext_store_pblock(&newext, lb->first_pblock);
+ /* Locking only for convenience since we are operating on temp inode */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ path = ext4_find_extent(inode, lb->first_block, NULL, 0);
+ if (IS_ERR(path)) {
+ retval = PTR_ERR(path);
+ goto err_out;
+ }
+
+ /*
+ * Calculate the credit needed to inserting this extent
+ * Since we are doing this in loop we may accumulate extra
+ * credit. But below we try to not accumulate too much
+ * of them by restarting the journal.
+ */
+ needed = ext4_ext_calc_credits_for_single_extent(inode,
+ lb->last_block - lb->first_block + 1, path);
+
+ retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0);
+ if (retval < 0)
+ goto err_out;
+ path = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
+ if (IS_ERR(path))
+ retval = PTR_ERR(path);
+err_out:
+ up_write((&EXT4_I(inode)->i_data_sem));
+ ext4_free_ext_path(path);
+ lb->first_pblock = 0;
+ return retval;
+}
+
+static int update_extent_range(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t pblock, struct migrate_struct *lb)
+{
+ int retval;
+ /*
+ * See if we can add on to the existing range (if it exists)
+ */
+ if (lb->first_pblock &&
+ (lb->last_pblock+1 == pblock) &&
+ (lb->last_block+1 == lb->curr_block)) {
+ lb->last_pblock = pblock;
+ lb->last_block = lb->curr_block;
+ lb->curr_block++;
+ return 0;
+ }
+ /*
+ * Start a new range.
+ */
+ retval = finish_range(handle, inode, lb);
+ lb->first_pblock = lb->last_pblock = pblock;
+ lb->first_block = lb->last_block = lb->curr_block;
+ lb->curr_block++;
+ return retval;
+}
+
+static int update_ind_extent_range(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t pblock,
+ struct migrate_struct *lb)
+{
+ struct buffer_head *bh;
+ __le32 *i_data;
+ int i, retval = 0;
+ unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+
+ bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ i_data = (__le32 *)bh->b_data;
+ for (i = 0; i < max_entries; i++) {
+ if (i_data[i]) {
+ retval = update_extent_range(handle, inode,
+ le32_to_cpu(i_data[i]), lb);
+ if (retval)
+ break;
+ } else {
+ lb->curr_block++;
+ }
+ }
+ put_bh(bh);
+ return retval;
+
+}
+
+static int update_dind_extent_range(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t pblock,
+ struct migrate_struct *lb)
+{
+ struct buffer_head *bh;
+ __le32 *i_data;
+ int i, retval = 0;
+ unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+
+ bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ i_data = (__le32 *)bh->b_data;
+ for (i = 0; i < max_entries; i++) {
+ if (i_data[i]) {
+ retval = update_ind_extent_range(handle, inode,
+ le32_to_cpu(i_data[i]), lb);
+ if (retval)
+ break;
+ } else {
+ /* Only update the file block number */
+ lb->curr_block += max_entries;
+ }
+ }
+ put_bh(bh);
+ return retval;
+
+}
+
+static int update_tind_extent_range(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t pblock,
+ struct migrate_struct *lb)
+{
+ struct buffer_head *bh;
+ __le32 *i_data;
+ int i, retval = 0;
+ unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+
+ bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ i_data = (__le32 *)bh->b_data;
+ for (i = 0; i < max_entries; i++) {
+ if (i_data[i]) {
+ retval = update_dind_extent_range(handle, inode,
+ le32_to_cpu(i_data[i]), lb);
+ if (retval)
+ break;
+ } else {
+ /* Only update the file block number */
+ lb->curr_block += max_entries * max_entries;
+ }
+ }
+ put_bh(bh);
+ return retval;
+
+}
+
+static int free_dind_blocks(handle_t *handle,
+ struct inode *inode, __le32 i_data)
+{
+ int i;
+ __le32 *tmp_idata;
+ struct buffer_head *bh;
+ struct super_block *sb = inode->i_sb;
+ unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+ int err;
+
+ bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ tmp_idata = (__le32 *)bh->b_data;
+ for (i = 0; i < max_entries; i++) {
+ if (tmp_idata[i]) {
+ err = ext4_journal_ensure_credits(handle,
+ EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(sb, 1));
+ if (err < 0) {
+ put_bh(bh);
+ return err;
+ }
+ ext4_free_blocks(handle, inode, NULL,
+ le32_to_cpu(tmp_idata[i]), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
+ }
+ }
+ put_bh(bh);
+ err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(sb, 1));
+ if (err < 0)
+ return err;
+ ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
+ return 0;
+}
+
+static int free_tind_blocks(handle_t *handle,
+ struct inode *inode, __le32 i_data)
+{
+ int i, retval = 0;
+ __le32 *tmp_idata;
+ struct buffer_head *bh;
+ unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+
+ bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ tmp_idata = (__le32 *)bh->b_data;
+ for (i = 0; i < max_entries; i++) {
+ if (tmp_idata[i]) {
+ retval = free_dind_blocks(handle,
+ inode, tmp_idata[i]);
+ if (retval) {
+ put_bh(bh);
+ return retval;
+ }
+ }
+ }
+ put_bh(bh);
+ retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
+ ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
+ return 0;
+}
+
+static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
+{
+ int retval;
+
+ /* ei->i_data[EXT4_IND_BLOCK] */
+ if (i_data[0]) {
+ retval = ext4_journal_ensure_credits(handle,
+ EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
+ ext4_free_blocks(handle, inode, NULL,
+ le32_to_cpu(i_data[0]), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
+ }
+
+ /* ei->i_data[EXT4_DIND_BLOCK] */
+ if (i_data[1]) {
+ retval = free_dind_blocks(handle, inode, i_data[1]);
+ if (retval)
+ return retval;
+ }
+
+ /* ei->i_data[EXT4_TIND_BLOCK] */
+ if (i_data[2]) {
+ retval = free_tind_blocks(handle, inode, i_data[2]);
+ if (retval)
+ return retval;
+ }
+ return 0;
+}
+
+static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
+ struct inode *tmp_inode)
+{
+ int retval, retval2 = 0;
+ __le32 i_data[3];
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode);
+
+ /*
+ * One credit accounted for writing the
+ * i_data field of the original inode
+ */
+ retval = ext4_journal_ensure_credits(handle, 1, 0);
+ if (retval < 0)
+ goto err_out;
+
+ i_data[0] = ei->i_data[EXT4_IND_BLOCK];
+ i_data[1] = ei->i_data[EXT4_DIND_BLOCK];
+ i_data[2] = ei->i_data[EXT4_TIND_BLOCK];
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ /*
+ * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
+ * happened after we started the migrate. We need to
+ * fail the migrate
+ */
+ if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
+ retval = -EAGAIN;
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto err_out;
+ } else
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ /*
+ * We have the extent map build with the tmp inode.
+ * Now copy the i_data across
+ */
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+ memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
+
+ /*
+ * Update i_blocks with the new blocks that got
+ * allocated while adding extents for extent index
+ * blocks.
+ *
+ * While converting to extents we need not
+ * update the original inode i_blocks for extent blocks
+ * via quota APIs. The quota update happened via tmp_inode already.
+ */
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += tmp_inode->i_blocks;
+ spin_unlock(&inode->i_lock);
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+ /*
+ * We mark the inode dirty after, because we decrement the
+ * i_blocks when freeing the indirect meta-data blocks
+ */
+ retval = free_ind_block(handle, inode, i_data);
+ retval2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(retval2 && !retval))
+ retval = retval2;
+
+err_out:
+ return retval;
+}
+
+static int free_ext_idx(handle_t *handle, struct inode *inode,
+ struct ext4_extent_idx *ix)
+{
+ int i, retval = 0;
+ ext4_fsblk_t block;
+ struct buffer_head *bh;
+ struct ext4_extent_header *eh;
+
+ block = ext4_idx_pblock(ix);
+ bh = ext4_sb_bread(inode->i_sb, block, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ eh = (struct ext4_extent_header *)bh->b_data;
+ if (eh->eh_depth != 0) {
+ ix = EXT_FIRST_INDEX(eh);
+ for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
+ retval = free_ext_idx(handle, inode, ix);
+ if (retval) {
+ put_bh(bh);
+ return retval;
+ }
+ }
+ }
+ put_bh(bh);
+ retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
+ ext4_free_blocks(handle, inode, NULL, block, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+ return 0;
+}
+
+/*
+ * Free the extent meta data blocks only
+ */
+static int free_ext_block(handle_t *handle, struct inode *inode)
+{
+ int i, retval = 0;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data;
+ struct ext4_extent_idx *ix;
+ if (eh->eh_depth == 0)
+ /*
+ * No extra blocks allocated for extent meta data
+ */
+ return 0;
+ ix = EXT_FIRST_INDEX(eh);
+ for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
+ retval = free_ext_idx(handle, inode, ix);
+ if (retval)
+ return retval;
+ }
+ return retval;
+}
+
+int ext4_ext_migrate(struct inode *inode)
+{
+ handle_t *handle;
+ int retval = 0, i;
+ __le32 *i_data;
+ struct ext4_inode_info *ei;
+ struct inode *tmp_inode = NULL;
+ struct migrate_struct lb;
+ unsigned long max_entries;
+ __u32 goal, tmp_csum_seed;
+ uid_t owner[2];
+ int alloc_ctx;
+
+ /*
+ * If the filesystem does not support extents, or the inode
+ * already is extent-based, error out.
+ */
+ if (!ext4_has_feature_extents(inode->i_sb) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+ ext4_has_inline_data(inode))
+ return -EINVAL;
+
+ if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
+ /*
+ * don't migrate fast symlink
+ */
+ return retval;
+
+ alloc_ctx = ext4_writepages_down_write(inode->i_sb);
+
+ /*
+ * Worst case we can touch the allocation bitmaps and a block
+ * group descriptor block. We do need to worry about
+ * credits for modifying the quota inode.
+ */
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
+ 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ goto out_unlock;
+ }
+ goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
+ EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
+ owner[0] = i_uid_read(inode);
+ owner[1] = i_gid_read(inode);
+ tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root),
+ S_IFREG, NULL, goal, owner, 0);
+ if (IS_ERR(tmp_inode)) {
+ retval = PTR_ERR(tmp_inode);
+ ext4_journal_stop(handle);
+ goto out_unlock;
+ }
+ /*
+ * Use the correct seed for checksum (i.e. the seed from 'inode'). This
+ * is so that the metadata blocks will have the correct checksum after
+ * the migration.
+ */
+ ei = EXT4_I(inode);
+ tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed;
+ EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed;
+ i_size_write(tmp_inode, i_size_read(inode));
+ /*
+ * Set the i_nlink to zero so it will be deleted later
+ * when we drop inode reference.
+ */
+ clear_nlink(tmp_inode);
+
+ ext4_ext_tree_init(handle, tmp_inode);
+ ext4_journal_stop(handle);
+
+ /*
+ * start with one credit accounted for
+ * superblock modification.
+ *
+ * For the tmp_inode we already have committed the
+ * transaction that created the inode. Later as and
+ * when we add extents we extent the journal
+ */
+ /*
+ * Even though we take i_rwsem we can still cause block
+ * allocation via mmap write to holes. If we have allocated
+ * new blocks we fail migrate. New block allocation will
+ * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
+ * with i_data_sem held to prevent racing with block
+ * allocation.
+ */
+ down_read(&EXT4_I(inode)->i_data_sem);
+ ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ goto out_tmp_inode;
+ }
+
+ i_data = ei->i_data;
+ memset(&lb, 0, sizeof(lb));
+
+ /* 32 bit block address 4 bytes */
+ max_entries = inode->i_sb->s_blocksize >> 2;
+ for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
+ if (i_data[i]) {
+ retval = update_extent_range(handle, tmp_inode,
+ le32_to_cpu(i_data[i]), &lb);
+ if (retval)
+ goto err_out;
+ } else
+ lb.curr_block++;
+ }
+ if (i_data[EXT4_IND_BLOCK]) {
+ retval = update_ind_extent_range(handle, tmp_inode,
+ le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
+ if (retval)
+ goto err_out;
+ } else
+ lb.curr_block += max_entries;
+ if (i_data[EXT4_DIND_BLOCK]) {
+ retval = update_dind_extent_range(handle, tmp_inode,
+ le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
+ if (retval)
+ goto err_out;
+ } else
+ lb.curr_block += max_entries * max_entries;
+ if (i_data[EXT4_TIND_BLOCK]) {
+ retval = update_tind_extent_range(handle, tmp_inode,
+ le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
+ if (retval)
+ goto err_out;
+ }
+ /*
+ * Build the last extent
+ */
+ retval = finish_range(handle, tmp_inode, &lb);
+err_out:
+ if (retval)
+ /*
+ * Failure case delete the extent information with the
+ * tmp_inode
+ */
+ free_ext_block(handle, tmp_inode);
+ else {
+ retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode);
+ if (retval)
+ /*
+ * if we fail to swap inode data free the extent
+ * details of the tmp inode
+ */
+ free_ext_block(handle, tmp_inode);
+ }
+
+ /* We mark the tmp_inode dirty via ext4_ext_tree_init. */
+ retval = ext4_journal_ensure_credits(handle, 1, 0);
+ if (retval < 0)
+ goto out_stop;
+ /*
+ * Mark the tmp_inode as of size zero
+ */
+ i_size_write(tmp_inode, 0);
+
+ /*
+ * set the i_blocks count to zero
+ * so that the ext4_evict_inode() does the
+ * right job
+ *
+ * We don't need to take the i_lock because
+ * the inode is not visible to user space.
+ */
+ tmp_inode->i_blocks = 0;
+ EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed;
+
+ /* Reset the extent details */
+ ext4_ext_tree_init(handle, tmp_inode);
+out_stop:
+ ext4_journal_stop(handle);
+out_tmp_inode:
+ unlock_new_inode(tmp_inode);
+ iput(tmp_inode);
+out_unlock:
+ ext4_writepages_up_write(inode->i_sb, alloc_ctx);
+ return retval;
+}
+
+/*
+ * Migrate a simple extent-based inode to use the i_blocks[] array
+ */
+int ext4_ind_migrate(struct inode *inode)
+{
+ struct ext4_extent_header *eh;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_super_block *es = sbi->s_es;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_extent *ex;
+ unsigned int i, len;
+ ext4_lblk_t start, end;
+ ext4_fsblk_t blk;
+ handle_t *handle;
+ int ret, ret2 = 0;
+ int alloc_ctx;
+
+ if (!ext4_has_feature_extents(inode->i_sb) ||
+ (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EINVAL;
+
+ if (ext4_has_feature_bigalloc(inode->i_sb))
+ return -EOPNOTSUPP;
+
+ /*
+ * In order to get correct extent info, force all delayed allocation
+ * blocks to be allocated, otherwise delayed allocation blocks may not
+ * be reflected and bypass the checks on extent header.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ ext4_alloc_da_blocks(inode);
+
+ alloc_ctx = ext4_writepages_down_write(inode->i_sb);
+
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_unlock;
+ }
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_check_inode(inode);
+ if (ret)
+ goto errout;
+
+ eh = ext_inode_hdr(inode);
+ ex = EXT_FIRST_EXTENT(eh);
+ if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
+ eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
+ ret = -EOPNOTSUPP;
+ goto errout;
+ }
+ if (eh->eh_entries == 0)
+ blk = len = start = end = 0;
+ else {
+ len = le16_to_cpu(ex->ee_len);
+ blk = ext4_ext_pblock(ex);
+ start = le32_to_cpu(ex->ee_block);
+ end = start + len - 1;
+ if (end >= EXT4_NDIR_BLOCKS) {
+ ret = -EOPNOTSUPP;
+ goto errout;
+ }
+ }
+
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+ memset(ei->i_data, 0, sizeof(ei->i_data));
+ for (i = start; i <= end; i++)
+ ei->i_data[i] = cpu_to_le32(blk++);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret2 && !ret))
+ ret = ret2;
+errout:
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_journal_stop(handle);
+out_unlock:
+ ext4_writepages_up_write(inode->i_sb, alloc_ctx);
+ return ret;
+}
new file mode 100644
@@ -0,0 +1,408 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/buffer_head.h>
+#include <linux/utsname.h>
+#include <linux/kthread.h>
+
+#include "ext4.h"
+
+/* Checksumming functions */
+static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int offset = offsetof(struct mmp_struct, mmp_checksum);
+ __u32 csum;
+
+ csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset);
+
+ return cpu_to_le32(csum);
+}
+
+static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
+{
+ if (!ext4_has_feature_metadata_csum(sb))
+ return 1;
+
+ return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
+}
+
+static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
+{
+ if (!ext4_has_feature_metadata_csum(sb))
+ return;
+
+ mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
+}
+
+/*
+ * Write the MMP block using REQ_SYNC to try to get the block on-disk
+ * faster.
+ */
+static int write_mmp_block_thawed(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
+
+ ext4_mmp_csum_set(sb, mmp);
+ lock_buffer(bh);
+ bh->b_end_io = end_buffer_write_sync;
+ get_bh(bh);
+ submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh);
+ wait_on_buffer(bh);
+ if (unlikely(!buffer_uptodate(bh)))
+ return -EIO;
+ return 0;
+}
+
+static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
+{
+ int err;
+
+ /*
+ * We protect against freezing so that we don't create dirty buffers
+ * on frozen filesystem.
+ */
+ sb_start_write(sb);
+ err = write_mmp_block_thawed(sb, bh);
+ sb_end_write(sb);
+ return err;
+}
+
+/*
+ * Read the MMP block. It _must_ be read from disk and hence we clear the
+ * uptodate flag on the buffer.
+ */
+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
+ ext4_fsblk_t mmp_block)
+{
+ struct mmp_struct *mmp;
+ int ret;
+
+ if (*bh)
+ clear_buffer_uptodate(*bh);
+
+ /* This would be sb_bread(sb, mmp_block), except we need to be sure
+ * that the MD RAID device cache has been bypassed, and that the read
+ * is not blocked in the elevator. */
+ if (!*bh) {
+ *bh = sb_getblk(sb, mmp_block);
+ if (!*bh) {
+ ret = -ENOMEM;
+ goto warn_exit;
+ }
+ }
+
+ lock_buffer(*bh);
+ ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL, false);
+ if (ret)
+ goto warn_exit;
+
+ mmp = (struct mmp_struct *)((*bh)->b_data);
+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
+ ret = -EFSCORRUPTED;
+ goto warn_exit;
+ }
+ if (!ext4_mmp_csum_verify(sb, mmp)) {
+ ret = -EFSBADCRC;
+ goto warn_exit;
+ }
+ return 0;
+warn_exit:
+ brelse(*bh);
+ *bh = NULL;
+ ext4_warning(sb, "Error %d while reading MMP block %llu",
+ ret, mmp_block);
+ return ret;
+}
+
+/*
+ * Dump as much information as possible to help the admin.
+ */
+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
+ const char *function, unsigned int line, const char *msg)
+{
+ __ext4_warning(sb, function, line, "%s", msg);
+ __ext4_warning(sb, function, line,
+ "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s",
+ (unsigned long long)le64_to_cpu(mmp->mmp_time),
+ (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename,
+ (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname);
+}
+
+/*
+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
+ */
+static int kmmpd(void *data)
+{
+ struct super_block *sb = data;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh;
+ struct mmp_struct *mmp;
+ ext4_fsblk_t mmp_block;
+ u32 seq = 0;
+ unsigned long failed_writes = 0;
+ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
+ unsigned mmp_check_interval;
+ unsigned long last_update_time;
+ unsigned long diff;
+ int retval = 0;
+
+ mmp_block = le64_to_cpu(es->s_mmp_block);
+ mmp = (struct mmp_struct *)(bh->b_data);
+ mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
+ /*
+ * Start with the higher mmp_check_interval and reduce it if
+ * the MMP block is being updated on time.
+ */
+ mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
+ EXT4_MMP_MIN_CHECK_INTERVAL);
+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+
+ memcpy(mmp->mmp_nodename, init_utsname()->nodename,
+ sizeof(mmp->mmp_nodename));
+
+ while (!kthread_should_stop() && !ext4_emergency_state(sb)) {
+ if (!ext4_has_feature_mmp(sb)) {
+ ext4_warning(sb, "kmmpd being stopped since MMP feature"
+ " has been disabled.");
+ goto wait_to_exit;
+ }
+ if (++seq > EXT4_MMP_SEQ_MAX)
+ seq = 1;
+
+ mmp->mmp_seq = cpu_to_le32(seq);
+ mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
+ last_update_time = jiffies;
+
+ retval = write_mmp_block(sb, bh);
+ /*
+ * Don't spew too many error messages. Print one every
+ * (s_mmp_update_interval * 60) seconds.
+ */
+ if (retval) {
+ if ((failed_writes % 60) == 0) {
+ ext4_error_err(sb, -retval,
+ "Error writing to MMP block");
+ }
+ failed_writes++;
+ }
+
+ diff = jiffies - last_update_time;
+ if (diff < mmp_update_interval * HZ)
+ schedule_timeout_interruptible(mmp_update_interval *
+ HZ - diff);
+
+ /*
+ * We need to make sure that more than mmp_check_interval
+ * seconds have not passed since writing. If that has happened
+ * we need to check if the MMP block is as we left it.
+ */
+ diff = jiffies - last_update_time;
+ if (diff > mmp_check_interval * HZ) {
+ struct buffer_head *bh_check = NULL;
+ struct mmp_struct *mmp_check;
+
+ retval = read_mmp_block(sb, &bh_check, mmp_block);
+ if (retval) {
+ ext4_error_err(sb, -retval,
+ "error reading MMP data: %d",
+ retval);
+ goto wait_to_exit;
+ }
+
+ mmp_check = (struct mmp_struct *)(bh_check->b_data);
+ if (mmp->mmp_seq != mmp_check->mmp_seq ||
+ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
+ sizeof(mmp->mmp_nodename))) {
+ dump_mmp_msg(sb, mmp_check,
+ "Error while updating MMP info. "
+ "The filesystem seems to have been"
+ " multiply mounted.");
+ ext4_error_err(sb, EBUSY, "abort");
+ put_bh(bh_check);
+ retval = -EBUSY;
+ goto wait_to_exit;
+ }
+ put_bh(bh_check);
+ }
+
+ /*
+ * Adjust the mmp_check_interval depending on how much time
+ * it took for the MMP block to be written.
+ */
+ mmp_check_interval = clamp(EXT4_MMP_CHECK_MULT * diff / HZ,
+ EXT4_MMP_MIN_CHECK_INTERVAL,
+ EXT4_MMP_MAX_CHECK_INTERVAL);
+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+ }
+
+ /*
+ * Unmount seems to be clean.
+ */
+ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
+ mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
+
+ retval = write_mmp_block(sb, bh);
+
+wait_to_exit:
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!kthread_should_stop())
+ schedule();
+ }
+ set_current_state(TASK_RUNNING);
+ return retval;
+}
+
+void ext4_stop_mmpd(struct ext4_sb_info *sbi)
+{
+ if (sbi->s_mmp_tsk) {
+ kthread_stop(sbi->s_mmp_tsk);
+ brelse(sbi->s_mmp_bh);
+ sbi->s_mmp_tsk = NULL;
+ }
+}
+
+/*
+ * Get a random new sequence number but make sure it is not greater than
+ * EXT4_MMP_SEQ_MAX.
+ */
+static unsigned int mmp_new_seq(void)
+{
+ return get_random_u32_below(EXT4_MMP_SEQ_MAX + 1);
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ */
+int ext4_multi_mount_protect(struct super_block *sb,
+ ext4_fsblk_t mmp_block)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct buffer_head *bh = NULL;
+ struct mmp_struct *mmp = NULL;
+ u32 seq;
+ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
+ unsigned int wait_time = 0;
+ int retval;
+
+ if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
+ mmp_block >= ext4_blocks_count(es)) {
+ ext4_warning(sb, "Invalid MMP block in superblock");
+ retval = -EINVAL;
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+
+ mmp = (struct mmp_struct *)(bh->b_data);
+
+ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
+ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
+
+ /*
+ * If check_interval in MMP block is larger, use that instead of
+ * update_interval from the superblock.
+ */
+ if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
+ mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
+
+ seq = le32_to_cpu(mmp->mmp_seq);
+ if (seq == EXT4_MMP_SEQ_CLEAN)
+ goto skip;
+
+ if (seq == EXT4_MMP_SEQ_FSCK) {
+ dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+ retval = -EBUSY;
+ goto failed;
+ }
+
+ wait_time = min(mmp_check_interval * 2 + 1,
+ mmp_check_interval + 60);
+
+ /* Print MMP interval if more than 20 secs. */
+ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
+ ext4_warning(sb, "MMP interval %u higher than expected, please"
+ " wait.\n", wait_time * 2);
+
+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+ ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ retval = -ETIMEDOUT;
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+ mmp = (struct mmp_struct *)(bh->b_data);
+ if (seq != le32_to_cpu(mmp->mmp_seq)) {
+ dump_mmp_msg(sb, mmp,
+ "Device is already active on another node.");
+ retval = -EBUSY;
+ goto failed;
+ }
+
+skip:
+ /*
+ * write a new random sequence number.
+ */
+ seq = mmp_new_seq();
+ mmp->mmp_seq = cpu_to_le32(seq);
+
+ /*
+ * On mount / remount we are protected against fs freezing (by s_umount
+ * semaphore) and grabbing freeze protection upsets lockdep
+ */
+ retval = write_mmp_block_thawed(sb, bh);
+ if (retval)
+ goto failed;
+
+ /*
+ * wait for MMP interval and check mmp_seq.
+ */
+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+ ext4_warning(sb, "MMP startup interrupted, failing mount");
+ retval = -ETIMEDOUT;
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+ mmp = (struct mmp_struct *)(bh->b_data);
+ if (seq != le32_to_cpu(mmp->mmp_seq)) {
+ dump_mmp_msg(sb, mmp,
+ "Device is already active on another node.");
+ retval = -EBUSY;
+ goto failed;
+ }
+
+ EXT4_SB(sb)->s_mmp_bh = bh;
+
+ BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
+ snprintf(mmp->mmp_bdevname, sizeof(mmp->mmp_bdevname),
+ "%pg", bh->b_bdev);
+
+ /*
+ * Start a kernel thread to update the MMP block periodically.
+ */
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s",
+ (int)sizeof(mmp->mmp_bdevname),
+ mmp->mmp_bdevname);
+ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ ext4_warning(sb, "Unable to create kmmpd thread for %s.",
+ sb->s_id);
+ retval = -ENOMEM;
+ goto failed;
+ }
+
+ return 0;
+
+failed:
+ brelse(bh);
+ return retval;
+}
new file mode 100644
@@ -0,0 +1,706 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <t-sato@yk.jp.nec.com>
+ * Akira Fujita <a-fujita@rs.jp.nec.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/sched/mm.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "ext4_extents.h"
+
+/**
+ * get_ext_path() - Find an extent path for designated logical block number.
+ * @inode: inode to be searched
+ * @lblock: logical block number to find an extent path
+ * @path: pointer to an extent path
+ *
+ * ext4_find_extent wrapper. Return an extent path pointer on success,
+ * or an error pointer on failure.
+ */
+static inline struct ext4_ext_path *
+get_ext_path(struct inode *inode, ext4_lblk_t lblock,
+ struct ext4_ext_path *path)
+{
+ path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ return path;
+ if (path[ext_depth(inode)].p_ext == NULL) {
+ ext4_free_ext_path(path);
+ return ERR_PTR(-ENODATA);
+ }
+ return path;
+}
+
+/**
+ * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem
+ * @first: inode to be locked
+ * @second: inode to be locked
+ *
+ * Acquire write lock of i_data_sem of the two inodes
+ */
+void
+ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
+{
+ if (first < second) {
+ down_write(&EXT4_I(first)->i_data_sem);
+ down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER);
+ } else {
+ down_write(&EXT4_I(second)->i_data_sem);
+ down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER);
+
+ }
+}
+
+/**
+ * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
+ *
+ * @orig_inode: original inode structure to be released its lock first
+ * @donor_inode: donor inode structure to be released its lock second
+ * Release write lock of i_data_sem of two inodes (orig and donor).
+ */
+void
+ext4_double_up_write_data_sem(struct inode *orig_inode,
+ struct inode *donor_inode)
+{
+ up_write(&EXT4_I(orig_inode)->i_data_sem);
+ up_write(&EXT4_I(donor_inode)->i_data_sem);
+}
+
+/**
+ * mext_check_coverage - Check that all extents in range has the same type
+ *
+ * @inode: inode in question
+ * @from: block offset of inode
+ * @count: block count to be checked
+ * @unwritten: extents expected to be unwritten
+ * @err: pointer to save error value
+ *
+ * Return 1 if all extents in range has expected type, and zero otherwise.
+ */
+static int
+mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
+ int unwritten, int *err)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ext;
+ int ret = 0;
+ ext4_lblk_t last = from + count;
+ while (from < last) {
+ path = get_ext_path(inode, from, path);
+ if (IS_ERR(path)) {
+ *err = PTR_ERR(path);
+ return ret;
+ }
+ ext = path[ext_depth(inode)].p_ext;
+ if (unwritten != ext4_ext_is_unwritten(ext))
+ goto out;
+ from += ext4_ext_get_actual_len(ext);
+ }
+ ret = 1;
+out:
+ ext4_free_ext_path(path);
+ return ret;
+}
+
+/**
+ * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2
+ *
+ * @inode1: the inode structure
+ * @inode2: the inode structure
+ * @index1: folio index
+ * @index2: folio index
+ * @folio: result folio vector
+ *
+ * Grab two locked folio for inode's by inode order
+ */
+static int
+mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
+ pgoff_t index1, pgoff_t index2, struct folio *folio[2])
+{
+ struct address_space *mapping[2];
+ unsigned int flags;
+
+ BUG_ON(!inode1 || !inode2);
+ if (inode1 < inode2) {
+ mapping[0] = inode1->i_mapping;
+ mapping[1] = inode2->i_mapping;
+ } else {
+ swap(index1, index2);
+ mapping[0] = inode2->i_mapping;
+ mapping[1] = inode1->i_mapping;
+ }
+
+ flags = memalloc_nofs_save();
+ folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping[0]));
+ if (IS_ERR(folio[0])) {
+ memalloc_nofs_restore(flags);
+ return PTR_ERR(folio[0]);
+ }
+
+ folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping[1]));
+ memalloc_nofs_restore(flags);
+ if (IS_ERR(folio[1])) {
+ folio_unlock(folio[0]);
+ folio_put(folio[0]);
+ return PTR_ERR(folio[1]);
+ }
+ /*
+ * __filemap_get_folio() may not wait on folio's writeback if
+ * BDI not demand that. But it is reasonable to be very conservative
+ * here and explicitly wait on folio's writeback
+ */
+ folio_wait_writeback(folio[0]);
+ folio_wait_writeback(folio[1]);
+ if (inode1 > inode2)
+ swap(folio[0], folio[1]);
+
+ return 0;
+}
+
+/* Force folio buffers uptodate w/o dropping folio's lock */
+static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to)
+{
+ struct inode *inode = folio->mapping->host;
+ sector_t block;
+ struct buffer_head *bh, *head;
+ unsigned int blocksize, block_start, block_end;
+ int nr = 0;
+ bool partial = false;
+
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(folio_test_writeback(folio));
+
+ if (folio_test_uptodate(folio))
+ return 0;
+
+ blocksize = i_blocksize(inode);
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
+
+ block = folio_pos(folio) >> inode->i_blkbits;
+ block_end = 0;
+ bh = head;
+ do {
+ block_start = block_end;
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (!buffer_uptodate(bh))
+ partial = true;
+ continue;
+ }
+ if (buffer_uptodate(bh))
+ continue;
+ if (!buffer_mapped(bh)) {
+ int err = ext4_get_block(inode, block, bh, 0);
+ if (err)
+ return err;
+ if (!buffer_mapped(bh)) {
+ folio_zero_range(folio, block_start, blocksize);
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ }
+ lock_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ continue;
+ }
+ ext4_read_bh_nowait(bh, 0, NULL, false);
+ nr++;
+ } while (block++, (bh = bh->b_this_page) != head);
+
+ /* No io required */
+ if (!nr)
+ goto out;
+
+ bh = head;
+ do {
+ if (bh_offset(bh) + blocksize <= from)
+ continue;
+ if (bh_offset(bh) >= to)
+ break;
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ continue;
+ return -EIO;
+ } while ((bh = bh->b_this_page) != head);
+out:
+ if (!partial)
+ folio_mark_uptodate(folio);
+ return 0;
+}
+
+/**
+ * move_extent_per_page - Move extent data per page
+ *
+ * @o_filp: file structure of original file
+ * @donor_inode: donor inode
+ * @orig_page_offset: page index on original file
+ * @donor_page_offset: page index on donor file
+ * @data_offset_in_page: block index where data swapping starts
+ * @block_len_in_page: the number of blocks to be swapped
+ * @unwritten: orig extent is unwritten or not
+ * @err: pointer to save return value
+ *
+ * Save the data in original inode blocks and replace original inode extents
+ * with donor inode extents by calling ext4_swap_extents().
+ * Finally, write out the saved data in new original inode blocks. Return
+ * replaced block count.
+ */
+static int
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ pgoff_t orig_page_offset, pgoff_t donor_page_offset,
+ int data_offset_in_page,
+ int block_len_in_page, int unwritten, int *err)
+{
+ struct inode *orig_inode = file_inode(o_filp);
+ struct folio *folio[2] = {NULL, NULL};
+ handle_t *handle;
+ ext4_lblk_t orig_blk_offset, donor_blk_offset;
+ unsigned long blocksize = orig_inode->i_sb->s_blocksize;
+ unsigned int tmp_data_size, data_size, replaced_size;
+ int i, err2, jblocks, retries = 0;
+ int replaced_count = 0;
+ int from;
+ int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
+ struct super_block *sb = orig_inode->i_sb;
+ struct buffer_head *bh = NULL;
+
+ /*
+ * It needs twice the amount of ordinary journal buffers because
+ * inode and donor_inode may change each different metadata blocks.
+ */
+again:
+ *err = 0;
+ jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page,
+ block_len_in_page) * 2;
+ handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
+ if (IS_ERR(handle)) {
+ *err = PTR_ERR(handle);
+ return 0;
+ }
+
+ orig_blk_offset = orig_page_offset * blocks_per_page +
+ data_offset_in_page;
+
+ donor_blk_offset = donor_page_offset * blocks_per_page +
+ data_offset_in_page;
+
+ /* Calculate data_size */
+ if ((orig_blk_offset + block_len_in_page - 1) ==
+ ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
+ /* Replace the last block */
+ tmp_data_size = orig_inode->i_size & (blocksize - 1);
+ /*
+ * If data_size equal zero, it shows data_size is multiples of
+ * blocksize. So we set appropriate value.
+ */
+ if (tmp_data_size == 0)
+ tmp_data_size = blocksize;
+
+ data_size = tmp_data_size +
+ ((block_len_in_page - 1) << orig_inode->i_blkbits);
+ } else
+ data_size = block_len_in_page << orig_inode->i_blkbits;
+
+ replaced_size = data_size;
+
+ *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset,
+ donor_page_offset, folio);
+ if (unlikely(*err < 0))
+ goto stop_journal;
+ /*
+ * If orig extent was unwritten it can become initialized
+ * at any time after i_data_sem was dropped, in order to
+ * serialize with delalloc we have recheck extent while we
+ * hold page's lock, if it is still the case data copy is not
+ * necessary, just swap data blocks between orig and donor.
+ */
+ if (unwritten) {
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ /* If any of extents in range became initialized we have to
+ * fallback to data copying */
+ unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
+ if (*err)
+ goto drop_data_sem;
+
+ unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
+ block_len_in_page, 1, err);
+ if (*err)
+ goto drop_data_sem;
+
+ if (!unwritten) {
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ goto data_copy;
+ }
+ if (!filemap_release_folio(folio[0], 0) ||
+ !filemap_release_folio(folio[1], 0)) {
+ *err = -EBUSY;
+ goto drop_data_sem;
+ }
+ replaced_count = ext4_swap_extents(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ donor_blk_offset,
+ block_len_in_page, 1, err);
+ drop_data_sem:
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ goto unlock_folios;
+ }
+data_copy:
+ from = offset_in_folio(folio[0],
+ orig_blk_offset << orig_inode->i_blkbits);
+ *err = mext_page_mkuptodate(folio[0], from, from + replaced_size);
+ if (*err)
+ goto unlock_folios;
+
+ /* At this point all buffers in range are uptodate, old mapping layout
+ * is no longer required, try to drop it now. */
+ if (!filemap_release_folio(folio[0], 0) ||
+ !filemap_release_folio(folio[1], 0)) {
+ *err = -EBUSY;
+ goto unlock_folios;
+ }
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
+ orig_blk_offset, donor_blk_offset,
+ block_len_in_page, 1, err);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ if (*err) {
+ if (replaced_count) {
+ block_len_in_page = replaced_count;
+ replaced_size =
+ block_len_in_page << orig_inode->i_blkbits;
+ } else
+ goto unlock_folios;
+ }
+ /* Perform all necessary steps similar write_begin()/write_end()
+ * but keeping in mind that i_size will not change */
+ bh = folio_buffers(folio[0]);
+ if (!bh)
+ bh = create_empty_buffers(folio[0],
+ 1 << orig_inode->i_blkbits, 0);
+ for (i = 0; i < from >> orig_inode->i_blkbits; i++)
+ bh = bh->b_this_page;
+ for (i = 0; i < block_len_in_page; i++) {
+ *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
+ if (*err < 0)
+ goto repair_branches;
+ bh = bh->b_this_page;
+ }
+
+ block_commit_write(folio[0], from, from + replaced_size);
+
+ /* Even in case of data=writeback it is reasonable to pin
+ * inode to transaction, to prevent unexpected data loss */
+ *err = ext4_jbd2_inode_add_write(handle, orig_inode,
+ (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size);
+
+unlock_folios:
+ folio_unlock(folio[0]);
+ folio_put(folio[0]);
+ folio_unlock(folio[1]);
+ folio_put(folio[1]);
+stop_journal:
+ ext4_journal_stop(handle);
+ if (*err == -ENOSPC &&
+ ext4_should_retry_alloc(sb, &retries))
+ goto again;
+ /* Buffer was busy because probably is pinned to journal transaction,
+ * force transaction commit may help to free it. */
+ if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
+ jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
+ goto again;
+ return replaced_count;
+
+repair_branches:
+ /*
+ * This should never ever happen!
+ * Extents are swapped already, but we are not able to copy data.
+ * Try to swap extents to it's original places
+ */
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
+ orig_blk_offset, donor_blk_offset,
+ block_len_in_page, 0, &err2);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ if (replaced_count != block_len_in_page) {
+ ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset),
+ EIO, "Unable to copy data block,"
+ " data will be lost.");
+ *err = -EIO;
+ }
+ replaced_count = 0;
+ goto unlock_folios;
+}
+
+/**
+ * mext_check_arguments - Check whether move extent can be done
+ *
+ * @orig_inode: original inode
+ * @donor_inode: donor inode
+ * @orig_start: logical start offset in block for orig
+ * @donor_start: logical start offset in block for donor
+ * @len: the number of blocks to be moved
+ *
+ * Check the arguments of ext4_move_extents() whether the files can be
+ * exchanged with each other.
+ * Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_check_arguments(struct inode *orig_inode,
+ struct inode *donor_inode, __u64 orig_start,
+ __u64 donor_start, __u64 *len)
+{
+ __u64 orig_eof, donor_eof;
+ unsigned int blkbits = orig_inode->i_blkbits;
+ unsigned int blocksize = 1 << blkbits;
+
+ orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
+ donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;
+
+
+ if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+ ext4_debug("ext4 move extent: suid or sgid is set"
+ " to donor file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+ return -EPERM;
+
+ /* Ext4 move extent does not support swap files */
+ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+ ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -ETXTBSY;
+ }
+
+ if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) {
+ ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EOPNOTSUPP;
+ }
+
+ /* Ext4 move extent supports only extent based file */
+ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
+ ext4_debug("ext4 move extent: orig file is not extents "
+ "based file [ino:orig %lu]\n", orig_inode->i_ino);
+ return -EOPNOTSUPP;
+ } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
+ ext4_debug("ext4 move extent: donor file is not extents "
+ "based file [ino:donor %lu]\n", donor_inode->i_ino);
+ return -EOPNOTSUPP;
+ }
+
+ if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
+ ext4_debug("ext4 move extent: File size is 0 byte\n");
+ return -EINVAL;
+ }
+
+ /* Start offset should be same */
+ if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
+ (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
+ ext4_debug("ext4 move extent: orig and donor's start "
+ "offsets are not aligned [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if ((orig_start >= EXT_MAX_BLOCKS) ||
+ (donor_start >= EXT_MAX_BLOCKS) ||
+ (*len > EXT_MAX_BLOCKS) ||
+ (donor_start + *len >= EXT_MAX_BLOCKS) ||
+ (orig_start + *len >= EXT_MAX_BLOCKS)) {
+ ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+ "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+ if (orig_eof <= orig_start)
+ *len = 0;
+ else if (orig_eof < orig_start + *len - 1)
+ *len = orig_eof - orig_start;
+ if (donor_eof <= donor_start)
+ *len = 0;
+ else if (donor_eof < donor_start + *len - 1)
+ *len = donor_eof - donor_start;
+ if (!*len) {
+ ext4_debug("ext4 move extent: len should not be 0 "
+ "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+ donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * ext4_move_extents - Exchange the specified range of a file
+ *
+ * @o_filp: file structure of the original file
+ * @d_filp: file structure of the donor file
+ * @orig_blk: start offset in block for orig
+ * @donor_blk: start offset in block for donor
+ * @len: the number of blocks to be moved
+ * @moved_len: moved block length
+ *
+ * This function returns 0 and moved block length is set in moved_len
+ * if succeed, otherwise returns error value.
+ *
+ */
+int
+ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+ __u64 donor_blk, __u64 len, __u64 *moved_len)
+{
+ struct inode *orig_inode = file_inode(o_filp);
+ struct inode *donor_inode = file_inode(d_filp);
+ struct ext4_ext_path *path = NULL;
+ int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
+ ext4_lblk_t o_end, o_start = orig_blk;
+ ext4_lblk_t d_start = donor_blk;
+ int ret;
+
+ if (orig_inode->i_sb != donor_inode->i_sb) {
+ ext4_debug("ext4 move extent: The argument files "
+ "should be in same FS [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* orig and donor should be different inodes */
+ if (orig_inode == donor_inode) {
+ ext4_debug("ext4 move extent: The argument files should not "
+ "be same inode [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Regular file check */
+ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+ ext4_debug("ext4 move extent: The argument files should be "
+ "regular file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* TODO: it's not obvious how to swap blocks for inodes with full
+ journaling enabled */
+ if (ext4_should_journal_data(orig_inode) ||
+ ext4_should_journal_data(donor_inode)) {
+ ext4_msg(orig_inode->i_sb, KERN_ERR,
+ "Online defrag not supported with data journaling");
+ return -EOPNOTSUPP;
+ }
+
+ if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
+ ext4_msg(orig_inode->i_sb, KERN_ERR,
+ "Online defrag not supported for encrypted files");
+ return -EOPNOTSUPP;
+ }
+
+ /* Protect orig and donor inodes against a truncate */
+ lock_two_nondirectories(orig_inode, donor_inode);
+
+ /* Wait for all existing dio workers */
+ inode_dio_wait(orig_inode);
+ inode_dio_wait(donor_inode);
+
+ /* Protect extent tree against block allocations via delalloc */
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ /* Check the filesystem environment whether move_extent can be done */
+ ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
+ donor_blk, &len);
+ if (ret)
+ goto out;
+ o_end = o_start + len;
+
+ *moved_len = 0;
+ while (o_start < o_end) {
+ struct ext4_extent *ex;
+ ext4_lblk_t cur_blk, next_blk;
+ pgoff_t orig_page_index, donor_page_index;
+ int offset_in_page;
+ int unwritten, cur_len;
+
+ path = get_ext_path(orig_inode, o_start, path);
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
+ goto out;
+ }
+ ex = path[path->p_depth].p_ext;
+ cur_blk = le32_to_cpu(ex->ee_block);
+ cur_len = ext4_ext_get_actual_len(ex);
+ /* Check hole before the start pos */
+ if (cur_blk + cur_len - 1 < o_start) {
+ next_blk = ext4_ext_next_allocated_block(path);
+ if (next_blk == EXT_MAX_BLOCKS) {
+ ret = -ENODATA;
+ goto out;
+ }
+ d_start += next_blk - o_start;
+ o_start = next_blk;
+ continue;
+ /* Check hole after the start pos */
+ } else if (cur_blk > o_start) {
+ /* Skip hole */
+ d_start += cur_blk - o_start;
+ o_start = cur_blk;
+ /* Extent inside requested range ?*/
+ if (cur_blk >= o_end)
+ goto out;
+ } else { /* in_range(o_start, o_blk, o_len) */
+ cur_len += cur_blk - o_start;
+ }
+ unwritten = ext4_ext_is_unwritten(ex);
+ if (o_end - o_start < cur_len)
+ cur_len = o_end - o_start;
+
+ orig_page_index = o_start >> (PAGE_SHIFT -
+ orig_inode->i_blkbits);
+ donor_page_index = d_start >> (PAGE_SHIFT -
+ donor_inode->i_blkbits);
+ offset_in_page = o_start % blocks_per_page;
+ if (cur_len > blocks_per_page - offset_in_page)
+ cur_len = blocks_per_page - offset_in_page;
+ /*
+ * Up semaphore to avoid following problems:
+ * a. transaction deadlock among ext4_journal_start,
+ * ->write_begin via pagefault, and jbd2_journal_commit
+ * b. racing with ->read_folio, ->write_begin, and
+ * ext4_get_block in move_extent_per_page
+ */
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ /* Swap original branches with new branches */
+ *moved_len += move_extent_per_page(o_filp, donor_inode,
+ orig_page_index, donor_page_index,
+ offset_in_page, cur_len,
+ unwritten, &ret);
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ if (ret < 0)
+ break;
+ o_start += cur_len;
+ d_start += cur_len;
+ }
+
+out:
+ if (*moved_len) {
+ ext4_discard_preallocations(orig_inode);
+ ext4_discard_preallocations(donor_inode);
+ }
+
+ ext4_free_ext_path(path);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ unlock_two_nondirectories(orig_inode, donor_inode);
+
+ return ret;
+}
new file mode 100644
@@ -0,0 +1,659 @@
+/*
+ * Ext4 orphan inode handling
+ */
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
+{
+ int i, j, start;
+ struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
+ int ret = 0;
+ bool found = false;
+ __le32 *bdata;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
+ int looped = 0;
+
+ /*
+ * Find block with free orphan entry. Use CPU number for a naive hash
+ * for a search start in the orphan file
+ */
+ start = raw_smp_processor_id()*13 % oi->of_blocks;
+ i = start;
+ do {
+ if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries)
+ >= 0) {
+ found = true;
+ break;
+ }
+ if (++i >= oi->of_blocks)
+ i = 0;
+ } while (i != start);
+
+ if (!found) {
+ /*
+ * For now we don't grow or shrink orphan file. We just use
+ * whatever was allocated at mke2fs time. The additional
+ * credits we would have to reserve for each orphan inode
+ * operation just don't seem worth it.
+ */
+ return -ENOSPC;
+ }
+
+ ret = ext4_journal_get_write_access(handle, inode->i_sb,
+ oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE);
+ if (ret) {
+ atomic_inc(&oi->of_binfo[i].ob_free_entries);
+ return ret;
+ }
+
+ bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
+ /* Find empty slot in a block */
+ j = 0;
+ do {
+ if (looped) {
+ /*
+ * Did we walk through the block several times without
+ * finding free entry? It is theoretically possible
+ * if entries get constantly allocated and freed or
+ * if the block is corrupted. Avoid indefinite looping
+ * and bail. We'll use orphan list instead.
+ */
+ if (looped > 3) {
+ atomic_inc(&oi->of_binfo[i].ob_free_entries);
+ return -ENOSPC;
+ }
+ cond_resched();
+ }
+ while (bdata[j]) {
+ if (++j >= inodes_per_ob) {
+ j = 0;
+ looped++;
+ }
+ }
+ } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) !=
+ (__le32)0);
+
+ EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
+ ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
+
+ return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh);
+}
+
+/*
+ * ext4_orphan_add() links an unlinked or truncated inode into a list of
+ * such inodes, starting at the superblock, in case we crash before the
+ * file is closed/deleted, or in case the inode truncate spans multiple
+ * transactions and the last transaction is not recovered after a crash.
+ *
+ * At filesystem recovery time, we walk this list deleting unlinked
+ * inodes and truncating linked inodes in ext4_orphan_cleanup().
+ *
+ * Orphan list manipulation functions must be called under i_rwsem unless
+ * we are just creating the inode or deleting it.
+ */
+int ext4_orphan_add(handle_t *handle, struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_iloc iloc;
+ int err = 0, rc;
+ bool dirty = false;
+
+ if (!sbi->s_journal || is_bad_inode(inode))
+ return 0;
+
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !inode_is_locked(inode));
+ if (ext4_inode_orphan_tracked(inode))
+ return 0;
+
+ /*
+ * Orphan handling is only valid for files with data blocks
+ * being truncated, or files being unlinked. Note that we either
+ * hold i_rwsem, or the inode can not be referenced from outside,
+ * so i_nlink should not be bumped due to race
+ */
+ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+
+ if (sbi->s_orphan_info.of_blocks) {
+ err = ext4_orphan_file_add(handle, inode);
+ /*
+ * Fallback to normal orphan list of orphan file is
+ * out of space
+ */
+ if (err != -ENOSPC)
+ return err;
+ }
+
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out;
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out;
+
+ mutex_lock(&sbi->s_orphan_lock);
+ /*
+ * Due to previous errors inode may be already a part of on-disk
+ * orphan list. If so skip on-disk list modification.
+ */
+ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
+ (le32_to_cpu(sbi->s_es->s_inodes_count))) {
+ /* Insert this inode at the head of the on-disk orphan list */
+ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
+ lock_buffer(sbi->s_sbh);
+ sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
+ dirty = true;
+ }
+ list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+
+ if (dirty) {
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+ rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (!err)
+ err = rc;
+ if (err) {
+ /*
+ * We have to remove inode from in-memory list if
+ * addition to on disk orphan list failed. Stray orphan
+ * list entries can cause panics at unmount time.
+ */
+ mutex_lock(&sbi->s_orphan_lock);
+ list_del_init(&EXT4_I(inode)->i_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+ }
+ } else
+ brelse(iloc.bh);
+
+ ext4_debug("superblock will point to %lu\n", inode->i_ino);
+ ext4_debug("orphan inode %lu will point to %d\n",
+ inode->i_ino, NEXT_ORPHAN(inode));
+out:
+ ext4_std_error(sb, err);
+ return err;
+}
+
+static int ext4_orphan_file_del(handle_t *handle, struct inode *inode)
+{
+ struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
+ __le32 *bdata;
+ int blk, off;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
+ int ret = 0;
+
+ if (!handle)
+ goto out;
+ blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob;
+ off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob;
+ if (WARN_ON_ONCE(blk >= oi->of_blocks))
+ goto out;
+
+ ret = ext4_journal_get_write_access(handle, inode->i_sb,
+ oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE);
+ if (ret)
+ goto out;
+
+ bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data);
+ bdata[off] = 0;
+ atomic_inc(&oi->of_binfo[blk].ob_free_entries);
+ ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh);
+out:
+ ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
+ INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan);
+
+ return ret;
+}
+
+/*
+ * ext4_orphan_del() removes an unlinked or truncated inode from the list
+ * of such inodes stored on disk, because it is finally being cleaned up.
+ */
+int ext4_orphan_del(handle_t *handle, struct inode *inode)
+{
+ struct list_head *prev;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 ino_next;
+ struct ext4_iloc iloc;
+ int err = 0;
+
+ if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
+ return 0;
+
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !inode_is_locked(inode));
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE))
+ return ext4_orphan_file_del(handle, inode);
+
+ /* Do this quick check before taking global s_orphan_lock. */
+ if (list_empty(&ei->i_orphan))
+ return 0;
+
+ if (handle) {
+ /* Grab inode buffer early before taking global s_orphan_lock */
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ }
+
+ mutex_lock(&sbi->s_orphan_lock);
+ ext4_debug("remove inode %lu from orphan list\n", inode->i_ino);
+
+ prev = ei->i_orphan.prev;
+ list_del_init(&ei->i_orphan);
+
+ /* If we're on an error path, we may not have a valid
+ * transaction handle with which to update the orphan list on
+ * disk, but we still need to remove the inode from the linked
+ * list in memory. */
+ if (!handle || err) {
+ mutex_unlock(&sbi->s_orphan_lock);
+ goto out_err;
+ }
+
+ ino_next = NEXT_ORPHAN(inode);
+ if (prev == &sbi->s_orphan) {
+ ext4_debug("superblock will point to %u\n", ino_next);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ sbi->s_sbh, EXT4_JTR_NONE);
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
+ goto out_brelse;
+ }
+ lock_buffer(sbi->s_sbh);
+ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+ ext4_superblock_csum_set(inode->i_sb);
+ unlock_buffer(sbi->s_sbh);
+ mutex_unlock(&sbi->s_orphan_lock);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+ } else {
+ struct ext4_iloc iloc2;
+ struct inode *i_prev =
+ &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
+
+ ext4_debug("orphan inode %lu will point to %u\n",
+ i_prev->i_ino, ino_next);
+ err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
+ goto out_brelse;
+ }
+ NEXT_ORPHAN(i_prev) = ino_next;
+ err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+ mutex_unlock(&sbi->s_orphan_lock);
+ }
+ if (err)
+ goto out_brelse;
+ NEXT_ORPHAN(inode) = 0;
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+out_err:
+ ext4_std_error(inode->i_sb, err);
+ return err;
+
+out_brelse:
+ brelse(iloc.bh);
+ goto out_err;
+}
+
+#ifdef CONFIG_QUOTA
+static int ext4_quota_on_mount(struct super_block *sb, int type)
+{
+ return dquot_quota_on_mount(sb,
+ rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type],
+ lockdep_is_held(&sb->s_umount)),
+ EXT4_SB(sb)->s_jquota_fmt, type);
+}
+#endif
+
+static void ext4_process_orphan(struct inode *inode,
+ int *nr_truncates, int *nr_orphans)
+{
+ struct super_block *sb = inode->i_sb;
+ int ret;
+
+ dquot_initialize(inode);
+ if (inode->i_nlink) {
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: truncating inode %lu to %lld bytes",
+ __func__, inode->i_ino, inode->i_size);
+ ext4_debug("truncating inode %lu to %lld bytes\n",
+ inode->i_ino, inode->i_size);
+ inode_lock(inode);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ret = ext4_truncate(inode);
+ if (ret) {
+ /*
+ * We need to clean up the in-core orphan list
+ * manually if ext4_truncate() failed to get a
+ * transaction handle.
+ */
+ ext4_orphan_del(NULL, inode);
+ ext4_std_error(inode->i_sb, ret);
+ }
+ inode_unlock(inode);
+ (*nr_truncates)++;
+ } else {
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: deleting unreferenced inode %lu",
+ __func__, inode->i_ino);
+ ext4_debug("deleting unreferenced inode %lu\n",
+ inode->i_ino);
+ (*nr_orphans)++;
+ }
+ iput(inode); /* The delete magic happens here! */
+}
+
+/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
+ * the superblock) which were deleted from all directories, but held open by
+ * a process at the time of a crash. We walk the list and try to delete these
+ * inodes at recovery time (only with a read-write filesystem).
+ *
+ * In order to keep the orphan inode chain consistent during traversal (in
+ * case of crash during recovery), we link each inode into the superblock
+ * orphan list_head and handle it the same way as an inode deletion during
+ * normal operation (which journals the operations for us).
+ *
+ * We only do an iget() and an iput() on each inode, which is very safe if we
+ * accidentally point at an in-use or already deleted inode. The worst that
+ * can happen in this case is that we get a "bit already cleared" message from
+ * ext4_free_inode(). The only reason we would point at a wrong inode is if
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ */
+void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
+{
+ unsigned int s_flags = sb->s_flags;
+ int nr_orphans = 0, nr_truncates = 0;
+ struct inode *inode;
+ int i, j;
+#ifdef CONFIG_QUOTA
+ int quota_update = 0;
+#endif
+ __le32 *bdata;
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+
+ if (!es->s_last_orphan && !oi->of_blocks) {
+ ext4_debug("no orphan inodes to clean up\n");
+ return;
+ }
+
+ if (bdev_read_only(sb->s_bdev)) {
+ ext4_msg(sb, KERN_ERR, "write access "
+ "unavailable, skipping orphan cleanup");
+ return;
+ }
+
+ /* Check if feature set would not allow a r/w mount */
+ if (!ext4_feature_set_ok(sb, 0)) {
+ ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+ "unknown ROCOMPAT features");
+ return;
+ }
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ /* don't clear list on RO mount w/ errors */
+ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
+ ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
+ "clearing orphan list.");
+ es->s_last_orphan = 0;
+ }
+ ext4_debug("Skipping orphan recovery on fs with errors.\n");
+ return;
+ }
+
+ if (s_flags & SB_RDONLY) {
+ ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
+ sb->s_flags &= ~SB_RDONLY;
+ }
+#ifdef CONFIG_QUOTA
+ /*
+ * Turn on quotas which were not enabled for read-only mounts if
+ * filesystem has quota feature, so that they are updated correctly.
+ */
+ if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
+ int ret = ext4_enable_quotas(sb);
+
+ if (!ret)
+ quota_update = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "Cannot turn on quotas: error %d", ret);
+ }
+
+ /* Turn on journaled quotas used for old sytle */
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (EXT4_SB(sb)->s_qf_names[i]) {
+ int ret = ext4_quota_on_mount(sb, i);
+
+ if (!ret)
+ quota_update = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "Cannot turn on journaled "
+ "quota: type %d: error %d", i, ret);
+ }
+ }
+#endif
+
+ while (es->s_last_orphan) {
+ /*
+ * We may have encountered an error during cleanup; if
+ * so, skip the rest.
+ */
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ ext4_debug("Skipping orphan recovery on fs with errors.\n");
+ es->s_last_orphan = 0;
+ break;
+ }
+
+ inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
+ if (IS_ERR(inode)) {
+ es->s_last_orphan = 0;
+ break;
+ }
+
+ list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+ ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
+ }
+
+ for (i = 0; i < oi->of_blocks; i++) {
+ bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
+ for (j = 0; j < inodes_per_ob; j++) {
+ if (!bdata[j])
+ continue;
+ inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j]));
+ if (IS_ERR(inode))
+ continue;
+ ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
+ EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
+ ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
+ }
+ }
+
+#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
+
+ if (nr_orphans)
+ ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+ PLURAL(nr_orphans));
+ if (nr_truncates)
+ ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+ PLURAL(nr_truncates));
+#ifdef CONFIG_QUOTA
+ /* Turn off quotas if they were enabled for orphan cleanup */
+ if (quota_update) {
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (sb_dqopt(sb)->files[i])
+ dquot_quota_off(sb, i);
+ }
+ }
+#endif
+ sb->s_flags = s_flags; /* Restore SB_RDONLY status */
+}
+
+void ext4_release_orphan_info(struct super_block *sb)
+{
+ int i;
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+
+ if (!oi->of_blocks)
+ return;
+ for (i = 0; i < oi->of_blocks; i++)
+ brelse(oi->of_binfo[i].ob_bh);
+ kvfree(oi->of_binfo);
+}
+
+static struct ext4_orphan_block_tail *ext4_orphan_block_tail(
+ struct super_block *sb,
+ struct buffer_head *bh)
+{
+ return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize -
+ sizeof(struct ext4_orphan_block_tail));
+}
+
+static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ __u32 calculated;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ struct ext4_orphan_block_tail *ot;
+ __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
+
+ if (!ext4_has_feature_metadata_csum(sb))
+ return 1;
+
+ ot = ext4_orphan_block_tail(sb, bh);
+ calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ calculated = ext4_chksum(calculated, (__u8 *)bh->b_data,
+ inodes_per_ob * sizeof(__u32));
+ return le32_to_cpu(ot->ob_checksum) == calculated;
+}
+
+/* This gets called only when checksumming is enabled */
+void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct super_block *sb = EXT4_TRIGGER(triggers)->sb;
+ __u32 csum;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ struct ext4_orphan_block_tail *ot;
+ __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
+
+ csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32));
+ ot = ext4_orphan_block_tail(sb, bh);
+ ot->ob_checksum = cpu_to_le32(csum);
+}
+
+int ext4_init_orphan_info(struct super_block *sb)
+{
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ struct inode *inode;
+ int i, j;
+ int ret;
+ int free;
+ __le32 *bdata;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+ struct ext4_orphan_block_tail *ot;
+ ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum);
+
+ if (!ext4_has_feature_orphan_file(sb))
+ return 0;
+
+ inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL);
+ if (IS_ERR(inode)) {
+ ext4_msg(sb, KERN_ERR, "get orphan inode failed");
+ return PTR_ERR(inode);
+ }
+ /*
+ * This is just an artificial limit to prevent corrupted fs from
+ * consuming absurd amounts of memory when pinning blocks of orphan
+ * file in memory.
+ */
+ if (inode->i_size > 8 << 20) {
+ ext4_msg(sb, KERN_ERR, "orphan file too big: %llu",
+ (unsigned long long)inode->i_size);
+ ret = -EFSCORRUPTED;
+ goto out_put;
+ }
+ oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
+ oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
+ oi->of_binfo = kvmalloc_array(oi->of_blocks,
+ sizeof(struct ext4_orphan_block),
+ GFP_KERNEL);
+ if (!oi->of_binfo) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+ for (i = 0; i < oi->of_blocks; i++) {
+ oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0);
+ if (IS_ERR(oi->of_binfo[i].ob_bh)) {
+ ret = PTR_ERR(oi->of_binfo[i].ob_bh);
+ goto out_free;
+ }
+ if (!oi->of_binfo[i].ob_bh) {
+ ret = -EIO;
+ goto out_free;
+ }
+ ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh);
+ if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) {
+ ext4_error(sb, "orphan file block %d: bad magic", i);
+ ret = -EIO;
+ goto out_free;
+ }
+ if (!ext4_orphan_file_block_csum_verify(sb,
+ oi->of_binfo[i].ob_bh)) {
+ ext4_error(sb, "orphan file block %d: bad checksum", i);
+ ret = -EIO;
+ goto out_free;
+ }
+ bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
+ free = 0;
+ for (j = 0; j < inodes_per_ob; j++)
+ if (bdata[j] == 0)
+ free++;
+ atomic_set(&oi->of_binfo[i].ob_free_entries, free);
+ }
+ iput(inode);
+ return 0;
+out_free:
+ for (i--; i >= 0; i--)
+ brelse(oi->of_binfo[i].ob_bh);
+ kvfree(oi->of_binfo);
+out_put:
+ iput(inode);
+ return ret;
+}
+
+int ext4_orphan_file_empty(struct super_block *sb)
+{
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ int i;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+
+ if (!ext4_has_feature_orphan_file(sb))
+ return 1;
+ for (i = 0; i < oi->of_blocks; i++)
+ if (atomic_read(&oi->of_binfo[i].ob_free_entries) !=
+ inodes_per_ob)
+ return 0;
+ return 1;
+}
new file mode 100644
@@ -0,0 +1,593 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/ext4/page-io.c
+ *
+ * This contains the new page_io functions for ext4
+ *
+ * Written by Theodore Ts'o, 2010.
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+
+static struct kmem_cache *io_end_cachep;
+static struct kmem_cache *io_end_vec_cachep;
+
+int __init ext4_init_pageio(void)
+{
+ io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
+ if (io_end_cachep == NULL)
+ return -ENOMEM;
+
+ io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
+ if (io_end_vec_cachep == NULL) {
+ kmem_cache_destroy(io_end_cachep);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void ext4_exit_pageio(void)
+{
+ kmem_cache_destroy(io_end_cachep);
+ kmem_cache_destroy(io_end_vec_cachep);
+}
+
+struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec;
+
+ io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
+ if (!io_end_vec)
+ return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&io_end_vec->list);
+ list_add_tail(&io_end_vec->list, &io_end->list_vec);
+ return io_end_vec;
+}
+
+static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec, *tmp;
+
+ if (list_empty(&io_end->list_vec))
+ return;
+ list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
+ list_del(&io_end_vec->list);
+ kmem_cache_free(io_end_vec_cachep, io_end_vec);
+ }
+}
+
+struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
+{
+ BUG_ON(list_empty(&io_end->list_vec));
+ return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
+}
+
+/*
+ * Print an buffer I/O error compatible with the fs/buffer.c. This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message. We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+ printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n",
+ bh->b_bdev,
+ (unsigned long long)bh->b_blocknr);
+}
+
+static void ext4_finish_bio(struct bio *bio)
+{
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
+ struct folio *io_folio = NULL;
+ struct buffer_head *bh, *head;
+ size_t bio_start = fi.offset;
+ size_t bio_end = bio_start + fi.length;
+ unsigned under_io = 0;
+ unsigned long flags;
+
+ if (fscrypt_is_bounce_folio(folio)) {
+ io_folio = folio;
+ folio = fscrypt_pagecache_folio(folio);
+ }
+
+ if (bio->bi_status) {
+ int err = blk_status_to_errno(bio->bi_status);
+ mapping_set_error(folio->mapping, err);
+ }
+ bh = head = folio_buffers(folio);
+ /*
+ * We check all buffers in the folio under b_uptodate_lock
+ * to avoid races with other end io clearing async_write flags
+ */
+ spin_lock_irqsave(&head->b_uptodate_lock, flags);
+ do {
+ if (bh_offset(bh) < bio_start ||
+ bh_offset(bh) + bh->b_size > bio_end) {
+ if (buffer_async_write(bh))
+ under_io++;
+ continue;
+ }
+ clear_buffer_async_write(bh);
+ if (bio->bi_status) {
+ set_buffer_write_io_error(bh);
+ buffer_io_error(bh);
+ }
+ } while ((bh = bh->b_this_page) != head);
+ spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
+ if (!under_io) {
+ fscrypt_free_bounce_page(&io_folio->page);
+ folio_end_writeback(folio);
+ }
+ }
+}
+
+static void ext4_release_io_end(ext4_io_end_t *io_end)
+{
+ struct bio *bio, *next_bio;
+
+ BUG_ON(!list_empty(&io_end->list));
+ BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ WARN_ON(io_end->handle);
+
+ for (bio = io_end->bio; bio; bio = next_bio) {
+ next_bio = bio->bi_private;
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ }
+ ext4_free_io_end_vec(io_end);
+ kmem_cache_free(io_end_cachep, io_end);
+}
+
+/*
+ * On successful IO, check a range of space and convert unwritten extents to
+ * written. On IO failure, check if journal abort is needed. Note that
+ * we are protected from truncate touching same part of extent tree by the
+ * fact that truncate code waits for all DIO to finish (thus exclusion from
+ * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
+ * completed (happens from ext4_free_ioend()).
+ */
+static int ext4_end_io_end(ext4_io_end_t *io_end)
+{
+ struct inode *inode = io_end->inode;
+ handle_t *handle = io_end->handle;
+ struct super_block *sb = inode->i_sb;
+ int ret = 0;
+
+ ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
+ "list->prev 0x%p\n",
+ io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
+
+ /*
+ * Do not convert the unwritten extents if data writeback fails,
+ * or stale data may be exposed.
+ */
+ io_end->handle = NULL; /* Following call will use up the handle */
+ if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) {
+ ret = -EIO;
+ if (handle)
+ jbd2_journal_free_reserved(handle);
+
+ if (test_opt(sb, DATA_ERR_ABORT))
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret);
+ } else {
+ ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
+ }
+ if (ret < 0 && !ext4_emergency_state(sb) &&
+ io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ ext4_msg(sb, KERN_EMERG,
+ "failed to convert unwritten extents to written "
+ "extents -- potential data loss! "
+ "(inode %lu, error %d)", inode->i_ino, ret);
+ }
+
+ ext4_clear_io_unwritten_flag(io_end);
+ ext4_release_io_end(io_end);
+ return ret;
+}
+
+static void dump_completed_IO(struct inode *inode, struct list_head *head)
+{
+#ifdef EXT4FS_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io_end, *io_end0, *io_end1;
+
+ if (list_empty(head))
+ return;
+
+ ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
+ list_for_each_entry(io_end, head, list) {
+ cur = &io_end->list;
+ before = cur->prev;
+ io_end0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io_end1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io_end, inode->i_ino, io_end0, io_end1);
+ }
+#endif
+}
+
+static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end)
+{
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN &&
+ !list_empty(&io_end->list_vec))
+ return true;
+ if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) &&
+ io_end->flag & EXT4_IO_END_FAILED &&
+ !ext4_emergency_state(io_end->inode->i_sb))
+ return true;
+ return false;
+}
+
+/* Add the io_end to per-inode completed end_io list. */
+static void ext4_add_complete_io(ext4_io_end_t *io_end)
+{
+ struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+ struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);
+ struct workqueue_struct *wq;
+ unsigned long flags;
+
+ /* Only reserved conversions or pending IO errors will enter here. */
+ WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN &&
+ !io_end->handle && sbi->s_journal);
+ WARN_ON(!io_end->bio);
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ wq = sbi->rsv_conversion_wq;
+ if (list_empty(&ei->i_rsv_conversion_list))
+ queue_work(wq, &ei->i_rsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
+
+static int ext4_do_flush_completed_IO(struct inode *inode,
+ struct list_head *head)
+{
+ ext4_io_end_t *io_end;
+ struct list_head unwritten;
+ unsigned long flags;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int err, ret = 0;
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ dump_completed_IO(inode, head);
+ list_replace_init(head, &unwritten);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ while (!list_empty(&unwritten)) {
+ io_end = list_entry(unwritten.next, ext4_io_end_t, list);
+ BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
+ list_del_init(&io_end->list);
+
+ err = ext4_end_io_end(io_end);
+ if (unlikely(!ret && err))
+ ret = err;
+ }
+ return ret;
+}
+
+/*
+ * Used to convert unwritten extents to written extents upon IO completion,
+ * or used to abort the journal upon IO errors.
+ */
+void ext4_end_io_rsv_work(struct work_struct *work)
+{
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_rsv_conversion_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
+}
+
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+{
+ ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags);
+
+ if (io_end) {
+ io_end->inode = inode;
+ INIT_LIST_HEAD(&io_end->list);
+ INIT_LIST_HEAD(&io_end->list_vec);
+ refcount_set(&io_end->count, 1);
+ }
+ return io_end;
+}
+
+void ext4_put_io_end_defer(ext4_io_end_t *io_end)
+{
+ if (refcount_dec_and_test(&io_end->count)) {
+ if (ext4_io_end_defer_completion(io_end))
+ return ext4_add_complete_io(io_end);
+
+ ext4_release_io_end(io_end);
+ }
+}
+
+int ext4_put_io_end(ext4_io_end_t *io_end)
+{
+ if (refcount_dec_and_test(&io_end->count)) {
+ if (ext4_io_end_defer_completion(io_end))
+ return ext4_end_io_end(io_end);
+
+ ext4_release_io_end(io_end);
+ }
+ return 0;
+}
+
+ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
+{
+ refcount_inc(&io_end->count);
+ return io_end;
+}
+
+/* BIO completion function for page writeback */
+static void ext4_end_bio(struct bio *bio)
+{
+ ext4_io_end_t *io_end = bio->bi_private;
+ sector_t bi_sector = bio->bi_iter.bi_sector;
+
+ if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n",
+ bio->bi_bdev,
+ (long long) bio->bi_iter.bi_sector,
+ (unsigned) bio_sectors(bio),
+ bio->bi_status)) {
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ return;
+ }
+ bio->bi_end_io = NULL;
+
+ if (bio->bi_status) {
+ struct inode *inode = io_end->inode;
+
+ ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
+ "starting block %llu)",
+ bio->bi_status, inode->i_ino,
+ (unsigned long long)
+ bi_sector >> (inode->i_blkbits - 9));
+ io_end->flag |= EXT4_IO_END_FAILED;
+ mapping_set_error(inode->i_mapping,
+ blk_status_to_errno(bio->bi_status));
+ }
+
+ if (ext4_io_end_defer_completion(io_end)) {
+ /*
+ * Link bio into list hanging from io_end. We have to do it
+ * atomically as bio completions can be racing against each
+ * other.
+ */
+ bio->bi_private = xchg(&io_end->bio, bio);
+ ext4_put_io_end_defer(io_end);
+ } else {
+ /*
+ * Drop io_end reference early. Inode can get freed once
+ * we finish the bio.
+ */
+ ext4_put_io_end_defer(io_end);
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ }
+}
+
+void ext4_io_submit(struct ext4_io_submit *io)
+{
+ struct bio *bio = io->io_bio;
+
+ if (bio) {
+ if (io->io_wbc->sync_mode == WB_SYNC_ALL)
+ io->io_bio->bi_opf |= REQ_SYNC;
+ submit_bio(io->io_bio);
+ }
+ io->io_bio = NULL;
+}
+
+void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc)
+{
+ io->io_wbc = wbc;
+ io->io_bio = NULL;
+ io->io_end = NULL;
+}
+
+static void io_submit_init_bio(struct ext4_io_submit *io,
+ struct buffer_head *bh)
+{
+ struct bio *bio;
+
+ /*
+ * bio_alloc will _always_ be able to allocate a bio if
+ * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
+ */
+ bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO);
+ fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_end_io = ext4_end_bio;
+ bio->bi_private = ext4_get_io_end(io->io_end);
+ io->io_bio = bio;
+ io->io_next_block = bh->b_blocknr;
+ wbc_init_bio(io->io_wbc, bio);
+}
+
+static void io_submit_add_bh(struct ext4_io_submit *io,
+ struct inode *inode,
+ struct folio *folio,
+ struct folio *io_folio,
+ struct buffer_head *bh)
+{
+ if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
+ !fscrypt_mergeable_bio_bh(io->io_bio, bh))) {
+submit_and_retry:
+ ext4_io_submit(io);
+ }
+ if (io->io_bio == NULL) {
+ io_submit_init_bio(io, bh);
+ io->io_bio->bi_write_hint = inode->i_write_hint;
+ }
+ if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
+ goto submit_and_retry;
+ wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size);
+ io->io_next_block++;
+}
+
+int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
+ size_t len)
+{
+ struct folio *io_folio = folio;
+ struct inode *inode = folio->mapping->host;
+ unsigned block_start;
+ struct buffer_head *bh, *head;
+ int ret = 0;
+ int nr_to_submit = 0;
+ struct writeback_control *wbc = io->io_wbc;
+ bool keep_towrite = false;
+
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(folio_test_writeback(folio));
+
+ /*
+ * Comments copied from block_write_full_folio:
+ *
+ * The folio straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ if (len < folio_size(folio))
+ folio_zero_segment(folio, len, folio_size(folio));
+ /*
+ * In the first loop we prepare and mark buffers to submit. We have to
+ * mark all buffers in the folio before submitting so that
+ * folio_end_writeback() cannot be called from ext4_end_bio() when IO
+ * on the first buffer finishes and we are still working on submitting
+ * the second buffer.
+ */
+ bh = head = folio_buffers(folio);
+ do {
+ block_start = bh_offset(bh);
+ if (block_start >= len) {
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ if (!buffer_dirty(bh) || buffer_delay(bh) ||
+ !buffer_mapped(bh) || buffer_unwritten(bh)) {
+ /* A hole? We can safely clear the dirty bit */
+ if (!buffer_mapped(bh))
+ clear_buffer_dirty(bh);
+ /*
+ * Keeping dirty some buffer we cannot write? Make sure
+ * to redirty the folio and keep TOWRITE tag so that
+ * racing WB_SYNC_ALL writeback does not skip the folio.
+ * This happens e.g. when doing writeout for
+ * transaction commit or when journalled data is not
+ * yet committed.
+ */
+ if (buffer_dirty(bh) ||
+ (buffer_jbd(bh) && buffer_jbddirty(bh))) {
+ if (!folio_test_dirty(folio))
+ folio_redirty_for_writepage(wbc, folio);
+ keep_towrite = true;
+ }
+ continue;
+ }
+ if (buffer_new(bh))
+ clear_buffer_new(bh);
+ set_buffer_async_write(bh);
+ clear_buffer_dirty(bh);
+ nr_to_submit++;
+ } while ((bh = bh->b_this_page) != head);
+
+ /* Nothing to submit? Just unlock the folio... */
+ if (!nr_to_submit)
+ return 0;
+
+ bh = head = folio_buffers(folio);
+
+ /*
+ * If any blocks are being written to an encrypted file, encrypt them
+ * into a bounce page. For simplicity, just encrypt until the last
+ * block which might be needed. This may cause some unneeded blocks
+ * (e.g. holes) to be unnecessarily encrypted, but this is rare and
+ * can't happen in the common case of blocksize == PAGE_SIZE.
+ */
+ if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
+ gfp_t gfp_flags = GFP_NOFS;
+ unsigned int enc_bytes = round_up(len, i_blocksize(inode));
+ struct page *bounce_page;
+
+ /*
+ * Since bounce page allocation uses a mempool, we can only use
+ * a waiting mask (i.e. request guaranteed allocation) on the
+ * first page of the bio. Otherwise it can deadlock.
+ */
+ if (io->io_bio)
+ gfp_flags = GFP_NOWAIT;
+ retry_encrypt:
+ bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
+ enc_bytes, 0, gfp_flags);
+ if (IS_ERR(bounce_page)) {
+ ret = PTR_ERR(bounce_page);
+ if (ret == -ENOMEM &&
+ (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
+ gfp_t new_gfp_flags = GFP_NOFS;
+ if (io->io_bio)
+ ext4_io_submit(io);
+ else
+ new_gfp_flags |= __GFP_NOFAIL;
+ memalloc_retry_wait(gfp_flags);
+ gfp_flags = new_gfp_flags;
+ goto retry_encrypt;
+ }
+
+ printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
+ folio_redirty_for_writepage(wbc, folio);
+ do {
+ if (buffer_async_write(bh)) {
+ clear_buffer_async_write(bh);
+ set_buffer_dirty(bh);
+ }
+ bh = bh->b_this_page;
+ } while (bh != head);
+
+ return ret;
+ }
+ io_folio = page_folio(bounce_page);
+ }
+
+ __folio_start_writeback(folio, keep_towrite);
+
+ /* Now submit buffers to write */
+ do {
+ if (!buffer_async_write(bh))
+ continue;
+ io_submit_add_bh(io, inode, folio, io_folio, bh);
+ } while ((bh = bh->b_this_page) != head);
+
+ return 0;
+}