new file mode 100644
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test of ext4 inode that verify the seconds part of [a/c/m]
+ * timestamps in ext4 inode structs are decoded correctly.
+ */
+
+#include <kunit/test.h>
+#include <linux/kernel.h>
+#include <linux/time64.h>
+
+#include "ext4.h"
+
+/*
+ * For constructing the nonnegative timestamp lower bound value.
+ * binary: 00000000 00000000 00000000 00000000
+ */
+#define LOWER_MSB_0 0L
+/*
+ * For constructing the nonnegative timestamp upper bound value.
+ * binary: 01111111 11111111 11111111 11111111
+ *
+ */
+#define UPPER_MSB_0 0x7fffffffL
+/*
+ * For constructing the negative timestamp lower bound value.
+ * binary: 10000000 00000000 00000000 00000000
+ */
+#define LOWER_MSB_1 (-(UPPER_MSB_0) - 1L) /* avoid overflow */
+/*
+ * For constructing the negative timestamp upper bound value.
+ * binary: 11111111 11111111 11111111 11111111
+ */
+#define UPPER_MSB_1 (-1L)
+/*
+ * Upper bound for nanoseconds value supported by the encoding.
+ * binary: 00111111 11111111 11111111 11111111
+ */
+#define MAX_NANOSECONDS ((1L << 30) - 1)
+
+#define CASE_NAME_FORMAT "%s: msb:%x lower_bound:%x extra_bits: %x"
+
+#define LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE\
+ "1901-12-13 Lower bound of 32bit < 0 timestamp, no extra bits"
+#define UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE\
+ "1969-12-31 Upper bound of 32bit < 0 timestamp, no extra bits"
+#define LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\
+ "1970-01-01 Lower bound of 32bit >=0 timestamp, no extra bits"
+#define UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\
+ "2038-01-19 Upper bound of 32bit >=0 timestamp, no extra bits"
+#define LOWER_BOUND_NEG_LO_1_CASE\
+ "2038-01-19 Lower bound of 32bit <0 timestamp, lo extra sec bit on"
+#define UPPER_BOUND_NEG_LO_1_CASE\
+ "2106-02-07 Upper bound of 32bit <0 timestamp, lo extra sec bit on"
+#define LOWER_BOUND_NONNEG_LO_1_CASE\
+ "2106-02-07 Lower bound of 32bit >=0 timestamp, lo extra sec bit on"
+#define UPPER_BOUND_NONNEG_LO_1_CASE\
+ "2174-02-25 Upper bound of 32bit >=0 timestamp, lo extra sec bit on"
+#define LOWER_BOUND_NEG_HI_1_CASE\
+ "2174-02-25 Lower bound of 32bit <0 timestamp, hi extra sec bit on"
+#define UPPER_BOUND_NEG_HI_1_CASE\
+ "2242-03-16 Upper bound of 32bit <0 timestamp, hi extra sec bit on"
+#define LOWER_BOUND_NONNEG_HI_1_CASE\
+ "2242-03-16 Lower bound of 32bit >=0 timestamp, hi extra sec bit on"
+#define UPPER_BOUND_NONNEG_HI_1_CASE\
+ "2310-04-04 Upper bound of 32bit >=0 timestamp, hi extra sec bit on"
+#define UPPER_BOUND_NONNEG_HI_1_NS_1_CASE\
+ "2310-04-04 Upper bound of 32bit>=0 timestamp, hi extra sec bit 1. 1 ns"
+#define LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE\
+ "2378-04-22 Lower bound of 32bit>= timestamp. Extra sec bits 1. Max ns"
+#define LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE\
+ "2378-04-22 Lower bound of 32bit >=0 timestamp. All extra sec bits on"
+#define UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE\
+ "2446-05-10 Upper bound of 32bit >=0 timestamp. All extra sec bits on"
+
+struct timestamp_expectation {
+ const char *test_case_name;
+ struct timespec64 expected;
+ u32 extra_bits;
+ bool msb_set;
+ bool lower_bound;
+};
+
+static const struct timestamp_expectation test_data[] = {
+ {
+ .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -1LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {0LL, 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 6,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0xFFFFFFFF,
+ .expected = {.tv_sec = 0x300000000LL,
+ .tv_nsec = MAX_NANOSECONDS},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L},
+ }
+};
+
+static void timestamp_expectation_to_desc(const struct timestamp_expectation *t,
+ char *desc)
+{
+ strscpy(desc, t->test_case_name, KUNIT_PARAM_DESC_SIZE);
+}
+
+KUNIT_ARRAY_PARAM(ext4_inode, test_data, timestamp_expectation_to_desc);
+
+static time64_t get_32bit_time(const struct timestamp_expectation * const test)
+{
+ if (test->msb_set) {
+ if (test->lower_bound)
+ return LOWER_MSB_1;
+
+ return UPPER_MSB_1;
+ }
+
+ if (test->lower_bound)
+ return LOWER_MSB_0;
+ return UPPER_MSB_0;
+}
+
+
+/*
+ * Test data is derived from the table in the Inode Timestamps section of
+ * Documentation/filesystems/ext4/inodes.rst.
+ */
+static void inode_test_xtimestamp_decoding(struct kunit *test)
+{
+ struct timespec64 timestamp;
+
+ struct timestamp_expectation *test_param =
+ (struct timestamp_expectation *)(test->param_value);
+
+ timestamp = ext4_decode_extra_time(
+ cpu_to_le32(get_32bit_time(test_param)),
+ cpu_to_le32(test_param->extra_bits));
+
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_param->expected.tv_sec,
+ timestamp.tv_sec,
+ CASE_NAME_FORMAT,
+ test_param->test_case_name,
+ test_param->msb_set,
+ test_param->lower_bound,
+ test_param->extra_bits);
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_param->expected.tv_nsec,
+ timestamp.tv_nsec,
+ CASE_NAME_FORMAT,
+ test_param->test_case_name,
+ test_param->msb_set,
+ test_param->lower_bound,
+ test_param->extra_bits);
+}
+
+static struct kunit_case ext4_inode_test_cases[] = {
+ KUNIT_CASE_PARAM(inode_test_xtimestamp_decoding, ext4_inode_gen_params),
+ {}
+};
+
+static struct kunit_suite ext4_inode_test_suite = {
+ .name = "ext4_inode_test",
+ .test_cases = ext4_inode_test_cases,
+};
+
+kunit_test_suites(&ext4_inode_test_suite);
+
+MODULE_DESCRIPTION("KUnit test of ext4 inode timestamp decoding");
+MODULE_LICENSE("GPL v2");
new file mode 100644
@@ -0,0 +1,2020 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/ext4/ioctl.c
+ *
+ * Copyright (C) 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include <linux/fs.h>
+#include <linux/capability.h>
+#include <linux/time.h>
+#include <linux/compat.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/quotaops.h>
+#include <linux/random.h>
+#include <linux/uaccess.h>
+#include <linux/delay.h>
+#include <linux/iversion.h>
+#include <linux/fileattr.h>
+#include <linux/uuid.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include <linux/fsmap.h>
+#include "fsmap.h"
+#include <trace/events/ext4.h>
+
+typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es,
+ const void *arg);
+
+/*
+ * Superblock modification callback function for changing file system
+ * label
+ */
+static void ext4_sb_setlabel(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
+{
+ /* Sanity check, this should never happen */
+ BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX);
+
+ memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX);
+}
+
+/*
+ * Superblock modification callback function for changing file system
+ * UUID.
+ */
+static void ext4_sb_setuuid(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
+{
+ memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE);
+}
+
+static
+int ext4_update_primary_sb(struct super_block *sb, handle_t *handle,
+ ext4_update_sb_callback func,
+ const void *arg)
+{
+ int err = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bh = sbi->s_sbh;
+ struct ext4_super_block *es = sbi->s_es;
+
+ trace_ext4_update_sb(sb, bh->b_blocknr, 1);
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb,
+ bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+
+ lock_buffer(bh);
+ func(sbi, es, arg);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(bh);
+
+ if (buffer_write_io_error(bh) || !buffer_uptodate(bh)) {
+ ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
+ "superblock detected");
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ }
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out_err;
+ err = sync_dirty_buffer(bh);
+out_err:
+ ext4_std_error(sb, err);
+ return err;
+}
+
+/*
+ * Update one backup superblock in the group 'grp' using the callback
+ * function 'func' and argument 'arg'. If the handle is NULL the
+ * modification is not journalled.
+ *
+ * Returns: 0 when no modification was done (no superblock in the group)
+ * 1 when the modification was successful
+ * <0 on error
+ */
+static int ext4_update_backup_sb(struct super_block *sb,
+ handle_t *handle, ext4_group_t grp,
+ ext4_update_sb_callback func, const void *arg)
+{
+ int err = 0;
+ ext4_fsblk_t sb_block;
+ struct buffer_head *bh;
+ unsigned long offset = 0;
+ struct ext4_super_block *es;
+
+ if (!ext4_bg_has_super(sb, grp))
+ return 0;
+
+ /*
+ * For the group 0 there is always 1k padding, so we have
+ * either adjust offset, or sb_block depending on blocksize
+ */
+ if (grp == 0) {
+ sb_block = 1 * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(sb_block, sb->s_blocksize);
+ } else {
+ sb_block = ext4_group_first_block_no(sb, grp);
+ offset = 0;
+ }
+
+ trace_ext4_update_sb(sb, sb_block, handle ? 1 : 0);
+
+ bh = ext4_sb_bread(sb, sb_block, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ if (handle) {
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb,
+ bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_bh;
+ }
+
+ es = (struct ext4_super_block *) (bh->b_data + offset);
+ lock_buffer(bh);
+ if (ext4_has_feature_metadata_csum(sb) &&
+ es->s_checksum != ext4_superblock_csum(es)) {
+ ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
+ "superblock %llu", sb_block);
+ unlock_buffer(bh);
+ goto out_bh;
+ }
+ func(EXT4_SB(sb), es, arg);
+ if (ext4_has_feature_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(es);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
+ if (handle) {
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out_bh;
+ } else {
+ BUFFER_TRACE(bh, "marking dirty");
+ mark_buffer_dirty(bh);
+ }
+ err = sync_dirty_buffer(bh);
+
+out_bh:
+ brelse(bh);
+ ext4_std_error(sb, err);
+ return (err) ? err : 1;
+}
+
+/*
+ * Update primary and backup superblocks using the provided function
+ * func and argument arg.
+ *
+ * Only the primary superblock and at most two backup superblock
+ * modifications are journalled; the rest is modified without journal.
+ * This is safe because e2fsck will re-write them if there is a problem,
+ * and we're very unlikely to ever need more than two backups.
+ */
+static
+int ext4_update_superblocks_fn(struct super_block *sb,
+ ext4_update_sb_callback func,
+ const void *arg)
+{
+ handle_t *handle;
+ ext4_group_t ngroups;
+ unsigned int three = 1;
+ unsigned int five = 5;
+ unsigned int seven = 7;
+ int err = 0, ret, i;
+ ext4_group_t grp, primary_grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /*
+ * We can't update superblocks while the online resize is running
+ */
+ if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
+ &sbi->s_ext4_flags)) {
+ ext4_msg(sb, KERN_ERR, "Can't modify superblock while"
+ "performing online resize");
+ return -EBUSY;
+ }
+
+ /*
+ * We're only going to update primary superblock and two
+ * backup superblocks in this transaction.
+ */
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 3);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out;
+ }
+
+ /* Update primary superblock */
+ err = ext4_update_primary_sb(sb, handle, func, arg);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "Failed to update primary "
+ "superblock");
+ goto out_journal;
+ }
+
+ primary_grp = ext4_get_group_number(sb, sbi->s_sbh->b_blocknr);
+ ngroups = ext4_get_groups_count(sb);
+
+ /*
+ * Update backup superblocks. We have to start from group 0
+ * because it might not be where the primary superblock is
+ * if the fs is mounted with -o sb=<backup_sb_block>
+ */
+ i = 0;
+ grp = 0;
+ while (grp < ngroups) {
+ /* Skip primary superblock */
+ if (grp == primary_grp)
+ goto next_grp;
+
+ ret = ext4_update_backup_sb(sb, handle, grp, func, arg);
+ if (ret < 0) {
+ /* Ignore bad checksum; try to update next sb */
+ if (ret == -EFSBADCRC)
+ goto next_grp;
+ err = ret;
+ goto out_journal;
+ }
+
+ i += ret;
+ if (handle && i > 1) {
+ /*
+ * We're only journalling primary superblock and
+ * two backup superblocks; the rest is not
+ * journalled.
+ */
+ err = ext4_journal_stop(handle);
+ if (err)
+ goto out;
+ handle = NULL;
+ }
+next_grp:
+ grp = ext4_list_backups(sb, &three, &five, &seven);
+ }
+
+out_journal:
+ if (handle) {
+ ret = ext4_journal_stop(handle);
+ if (ret && !err)
+ err = ret;
+ }
+out:
+ clear_bit_unlock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags);
+ smp_mb__after_atomic();
+ return err ? err : 0;
+}
+
+/*
+ * Swap memory between @a and @b for @len bytes.
+ *
+ * @a: pointer to first memory area
+ * @b: pointer to second memory area
+ * @len: number of bytes to swap
+ *
+ */
+static void memswap(void *a, void *b, size_t len)
+{
+ unsigned char *ap, *bp;
+
+ ap = (unsigned char *)a;
+ bp = (unsigned char *)b;
+ while (len-- > 0) {
+ swap(*ap, *bp);
+ ap++;
+ bp++;
+ }
+}
+
+/*
+ * Swap i_data and associated attributes between @inode1 and @inode2.
+ * This function is used for the primary swap between inode1 and inode2
+ * and also to revert this primary swap in case of errors.
+ *
+ * Therefore you have to make sure, that calling this method twice
+ * will revert all changes.
+ *
+ * @inode1: pointer to first inode
+ * @inode2: pointer to second inode
+ */
+static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+{
+ loff_t isize;
+ struct ext4_inode_info *ei1;
+ struct ext4_inode_info *ei2;
+ unsigned long tmp;
+ struct timespec64 ts1, ts2;
+
+ ei1 = EXT4_I(inode1);
+ ei2 = EXT4_I(inode2);
+
+ swap(inode1->i_version, inode2->i_version);
+
+ ts1 = inode_get_atime(inode1);
+ ts2 = inode_get_atime(inode2);
+ inode_set_atime_to_ts(inode1, ts2);
+ inode_set_atime_to_ts(inode2, ts1);
+
+ ts1 = inode_get_mtime(inode1);
+ ts2 = inode_get_mtime(inode2);
+ inode_set_mtime_to_ts(inode1, ts2);
+ inode_set_mtime_to_ts(inode2, ts1);
+
+ memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
+ tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
+ ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) |
+ (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP);
+ ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP);
+ swap(ei1->i_disksize, ei2->i_disksize);
+ ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
+ ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
+
+ isize = i_size_read(inode1);
+ i_size_write(inode1, i_size_read(inode2));
+ i_size_write(inode2, isize);
+}
+
+void ext4_reset_inode_seed(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __le32 inum = cpu_to_le32(inode->i_ino);
+ __le32 gen = cpu_to_le32(inode->i_generation);
+ __u32 csum;
+
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
+ return;
+
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
+}
+
+/*
+ * Swap the information from the given @inode and the inode
+ * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
+ * important fields of the inodes.
+ *
+ * @sb: the super block of the filesystem
+ * @idmap: idmap of the mount the inode was found from
+ * @inode: the inode to swap with EXT4_BOOT_LOADER_INO
+ *
+ */
+static long swap_inode_boot_loader(struct super_block *sb,
+ struct mnt_idmap *idmap,
+ struct inode *inode)
+{
+ handle_t *handle;
+ int err;
+ struct inode *inode_bl;
+ struct ext4_inode_info *ei_bl;
+ qsize_t size, size_bl, diff;
+ blkcnt_t blocks;
+ unsigned short bytes;
+
+ inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO,
+ EXT4_IGET_SPECIAL | EXT4_IGET_BAD);
+ if (IS_ERR(inode_bl))
+ return PTR_ERR(inode_bl);
+ ei_bl = EXT4_I(inode_bl);
+
+ /* Protect orig inodes against a truncate and make sure,
+ * that only 1 swap_inode_boot_loader is running. */
+ lock_two_nondirectories(inode, inode_bl);
+
+ if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
+ IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
+ (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) ||
+ ext4_has_inline_data(inode)) {
+ err = -EINVAL;
+ goto journal_err_out;
+ }
+
+ if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
+ !inode_owner_or_capable(idmap, inode) ||
+ !capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto journal_err_out;
+ }
+
+ filemap_invalidate_lock(inode->i_mapping);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto err_out;
+
+ err = filemap_write_and_wait(inode_bl->i_mapping);
+ if (err)
+ goto err_out;
+
+ /* Wait for all existing dio workers */
+ inode_dio_wait(inode);
+ inode_dio_wait(inode_bl);
+
+ truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages(&inode_bl->i_data, 0);
+
+ handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
+ if (IS_ERR(handle)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle);
+
+ /* Protect extent tree against block allocations via delalloc */
+ ext4_double_down_write_data_sem(inode, inode_bl);
+
+ if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) {
+ /* this inode has never been used as a BOOT_LOADER */
+ set_nlink(inode_bl, 1);
+ i_uid_write(inode_bl, 0);
+ i_gid_write(inode_bl, 0);
+ inode_bl->i_flags = 0;
+ ei_bl->i_flags = 0;
+ inode_set_iversion(inode_bl, 1);
+ i_size_write(inode_bl, 0);
+ EXT4_I(inode_bl)->i_disksize = inode_bl->i_size;
+ inode_bl->i_mode = S_IFREG;
+ if (ext4_has_feature_extents(sb)) {
+ ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
+ ext4_ext_tree_init(handle, inode_bl);
+ } else
+ memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
+ }
+
+ err = dquot_initialize(inode);
+ if (err)
+ goto err_out1;
+
+ size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes;
+ size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes;
+ diff = size - size_bl;
+ swap_inode_data(inode, inode_bl);
+
+ inode_set_ctime_current(inode);
+ inode_set_ctime_current(inode_bl);
+ inode_inc_iversion(inode);
+
+ inode->i_generation = get_random_u32();
+ inode_bl->i_generation = get_random_u32();
+ ext4_reset_inode_seed(inode);
+ ext4_reset_inode_seed(inode_bl);
+
+ ext4_discard_preallocations(inode);
+
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err < 0) {
+ /* No need to update quota information. */
+ ext4_warning(inode->i_sb,
+ "couldn't mark inode #%lu dirty (err %d)",
+ inode->i_ino, err);
+ /* Revert all changes: */
+ swap_inode_data(inode, inode_bl);
+ ext4_mark_inode_dirty(handle, inode);
+ goto err_out1;
+ }
+
+ blocks = inode_bl->i_blocks;
+ bytes = inode_bl->i_bytes;
+ inode_bl->i_blocks = inode->i_blocks;
+ inode_bl->i_bytes = inode->i_bytes;
+ err = ext4_mark_inode_dirty(handle, inode_bl);
+ if (err < 0) {
+ /* No need to update quota information. */
+ ext4_warning(inode_bl->i_sb,
+ "couldn't mark inode #%lu dirty (err %d)",
+ inode_bl->i_ino, err);
+ goto revert;
+ }
+
+ /* Bootloader inode should not be counted into quota information. */
+ if (diff > 0)
+ dquot_free_space(inode, diff);
+ else
+ err = dquot_alloc_space(inode, -1 * diff);
+
+ if (err < 0) {
+revert:
+ /* Revert all changes: */
+ inode_bl->i_blocks = blocks;
+ inode_bl->i_bytes = bytes;
+ swap_inode_data(inode, inode_bl);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_mark_inode_dirty(handle, inode_bl);
+ }
+
+err_out1:
+ ext4_journal_stop(handle);
+ ext4_double_up_write_data_sem(inode, inode_bl);
+
+err_out:
+ filemap_invalidate_unlock(inode->i_mapping);
+journal_err_out:
+ unlock_two_nondirectories(inode, inode_bl);
+ iput(inode_bl);
+ return err;
+}
+
+/*
+ * If immutable is set and we are not clearing it, we're not allowed to change
+ * anything else in the inode. Don't error out if we're only trying to set
+ * immutable on an immutable file.
+ */
+static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid,
+ unsigned int flags)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int oldflags = ei->i_flags;
+
+ if (!(oldflags & EXT4_IMMUTABLE_FL) || !(flags & EXT4_IMMUTABLE_FL))
+ return 0;
+
+ if ((oldflags & ~EXT4_IMMUTABLE_FL) != (flags & ~EXT4_IMMUTABLE_FL))
+ return -EPERM;
+ if (ext4_has_feature_project(inode->i_sb) &&
+ __kprojid_val(ei->i_projid) != new_projid)
+ return -EPERM;
+
+ return 0;
+}
+
+static void ext4_dax_dontcache(struct inode *inode, unsigned int flags)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (test_opt2(inode->i_sb, DAX_NEVER) ||
+ test_opt(inode->i_sb, DAX_ALWAYS))
+ return;
+
+ if ((ei->i_flags ^ flags) & EXT4_DAX_FL)
+ d_mark_dontcache(inode);
+}
+
+static bool dax_compatible(struct inode *inode, unsigned int oldflags,
+ unsigned int flags)
+{
+ /* Allow the DAX flag to be changed on inline directories */
+ if (S_ISDIR(inode->i_mode)) {
+ flags &= ~EXT4_INLINE_DATA_FL;
+ oldflags &= ~EXT4_INLINE_DATA_FL;
+ }
+
+ if (flags & EXT4_DAX_FL) {
+ if ((oldflags & EXT4_DAX_MUT_EXCL) ||
+ ext4_test_inode_state(inode,
+ EXT4_STATE_VERITY_IN_PROGRESS)) {
+ return false;
+ }
+ }
+
+ if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL))
+ return false;
+
+ return true;
+}
+
+static int ext4_ioctl_setflags(struct inode *inode,
+ unsigned int flags)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ handle_t *handle = NULL;
+ int err = -EPERM, migrate = 0;
+ struct ext4_iloc iloc;
+ unsigned int oldflags, mask, i;
+ struct super_block *sb = inode->i_sb;
+
+ /* Is it quota file? Do not allow user to mess with it */
+ if (ext4_is_quota_file(inode))
+ goto flags_out;
+
+ oldflags = ei->i_flags;
+ /*
+ * The JOURNAL_DATA flag can only be changed by
+ * the relevant capability.
+ */
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if (!capable(CAP_SYS_RESOURCE))
+ goto flags_out;
+ }
+
+ if (!dax_compatible(inode, oldflags, flags)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+
+ if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
+ migrate = 1;
+
+ if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) {
+ if (!ext4_has_feature_casefold(sb)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+
+ if (!S_ISDIR(inode->i_mode)) {
+ err = -ENOTDIR;
+ goto flags_out;
+ }
+
+ if (!ext4_empty_dir(inode)) {
+ err = -ENOTEMPTY;
+ goto flags_out;
+ }
+ }
+
+ /*
+ * Wait for all pending directio and then flush all the dirty pages
+ * for this file. The flush marks all the pages readonly, so any
+ * subsequent attempt to write to the file (particularly mmap pages)
+ * will come through the filesystem and fail.
+ */
+ if (S_ISREG(inode->i_mode) && !IS_IMMUTABLE(inode) &&
+ (flags & EXT4_IMMUTABLE_FL)) {
+ inode_dio_wait(inode);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto flags_out;
+ }
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto flags_out;
+ }
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto flags_err;
+
+ ext4_dax_dontcache(inode, flags);
+
+ for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+ if (!(mask & EXT4_FL_USER_MODIFIABLE))
+ continue;
+ /* These flags get special treatment later */
+ if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL)
+ continue;
+ if (mask & flags)
+ ext4_set_inode_flag(inode, i);
+ else
+ ext4_clear_inode_flag(inode, i);
+ }
+
+ ext4_set_inode_flags(inode, false);
+
+ inode_set_ctime_current(inode);
+ inode_inc_iversion(inode);
+
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+flags_err:
+ ext4_journal_stop(handle);
+ if (err)
+ goto flags_out;
+
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ /*
+ * Changes to the journaling mode can cause unsafe changes to
+ * S_DAX if the inode is DAX
+ */
+ if (IS_DAX(inode)) {
+ err = -EBUSY;
+ goto flags_out;
+ }
+
+ err = ext4_change_inode_journal_flag(inode,
+ flags & EXT4_JOURNAL_DATA_FL);
+ if (err)
+ goto flags_out;
+ }
+ if (migrate) {
+ if (flags & EXT4_EXTENTS_FL)
+ err = ext4_ext_migrate(inode);
+ else
+ err = ext4_ind_migrate(inode);
+ }
+
+flags_out:
+ return err;
+}
+
+#ifdef CONFIG_QUOTA
+static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int err, rc;
+ handle_t *handle;
+ kprojid_t kprojid;
+ struct ext4_iloc iloc;
+ struct ext4_inode *raw_inode;
+ struct dquot *transfer_to[MAXQUOTAS] = { };
+
+ if (!ext4_has_feature_project(sb)) {
+ if (projid != EXT4_DEF_PROJID)
+ return -EOPNOTSUPP;
+ else
+ return 0;
+ }
+
+ if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE)
+ return -EOPNOTSUPP;
+
+ kprojid = make_kprojid(&init_user_ns, (projid_t)projid);
+
+ if (projid_eq(kprojid, EXT4_I(inode)->i_projid))
+ return 0;
+
+ err = -EPERM;
+ /* Is it quota file? Do not allow user to mess with it */
+ if (ext4_is_quota_file(inode))
+ return err;
+
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
+ err = ext4_get_inode_loc(inode, &iloc);
+ if (err)
+ return err;
+
+ raw_inode = ext4_raw_inode(&iloc);
+ if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) {
+ err = ext4_expand_extra_isize(inode,
+ EXT4_SB(sb)->s_want_extra_isize,
+ &iloc);
+ if (err)
+ return err;
+ } else {
+ brelse(iloc.bh);
+ }
+
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+ EXT4_QUOTA_INIT_BLOCKS(sb) +
+ EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out_stop;
+
+ transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
+ if (!IS_ERR(transfer_to[PRJQUOTA])) {
+
+ /* __dquot_transfer() calls back ext4_get_inode_usage() which
+ * counts xattr inode references.
+ */
+ down_read(&EXT4_I(inode)->xattr_sem);
+ err = __dquot_transfer(inode, transfer_to);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ dqput(transfer_to[PRJQUOTA]);
+ if (err)
+ goto out_dirty;
+ }
+
+ EXT4_I(inode)->i_projid = kprojid;
+ inode_set_ctime_current(inode);
+ inode_inc_iversion(inode);
+out_dirty:
+ rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (!err)
+ err = rc;
+out_stop:
+ ext4_journal_stop(handle);
+ return err;
+}
+#else
+static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
+{
+ if (projid != EXT4_DEF_PROJID)
+ return -EOPNOTSUPP;
+ return 0;
+}
+#endif
+
+int ext4_force_shutdown(struct super_block *sb, u32 flags)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int ret;
+
+ if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH)
+ return -EINVAL;
+
+ if (ext4_forced_shutdown(sb))
+ return 0;
+
+ ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags);
+ trace_ext4_shutdown(sb, flags);
+
+ switch (flags) {
+ case EXT4_GOING_FLAGS_DEFAULT:
+ ret = bdev_freeze(sb->s_bdev);
+ if (ret)
+ return ret;
+ set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+ bdev_thaw(sb->s_bdev);
+ break;
+ case EXT4_GOING_FLAGS_LOGFLUSH:
+ set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+ if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) {
+ (void) ext4_force_commit(sb);
+ jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN);
+ }
+ break;
+ case EXT4_GOING_FLAGS_NOLOGFLUSH:
+ set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+ if (sbi->s_journal && !is_journal_aborted(sbi->s_journal))
+ jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN);
+ break;
+ default:
+ return -EINVAL;
+ }
+ clear_opt(sb, DISCARD);
+ return 0;
+}
+
+static int ext4_ioctl_shutdown(struct super_block *sb, unsigned long arg)
+{
+ u32 flags;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(flags, (__u32 __user *)arg))
+ return -EFAULT;
+
+ return ext4_force_shutdown(sb, flags);
+}
+
+struct getfsmap_info {
+ struct super_block *gi_sb;
+ struct fsmap_head __user *gi_data;
+ unsigned int gi_idx;
+ __u32 gi_last_flags;
+};
+
+static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv)
+{
+ struct getfsmap_info *info = priv;
+ struct fsmap fm;
+
+ trace_ext4_getfsmap_mapping(info->gi_sb, xfm);
+
+ info->gi_last_flags = xfm->fmr_flags;
+ ext4_fsmap_from_internal(info->gi_sb, &fm, xfm);
+ if (copy_to_user(&info->gi_data->fmh_recs[info->gi_idx++], &fm,
+ sizeof(struct fsmap)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ext4_ioc_getfsmap(struct super_block *sb,
+ struct fsmap_head __user *arg)
+{
+ struct getfsmap_info info = { NULL };
+ struct ext4_fsmap_head xhead = {0};
+ struct fsmap_head head;
+ bool aborted = false;
+ int error;
+
+ if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
+ return -EFAULT;
+ if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) ||
+ memchr_inv(head.fmh_keys[0].fmr_reserved, 0,
+ sizeof(head.fmh_keys[0].fmr_reserved)) ||
+ memchr_inv(head.fmh_keys[1].fmr_reserved, 0,
+ sizeof(head.fmh_keys[1].fmr_reserved)))
+ return -EINVAL;
+ /*
+ * ext4 doesn't report file extents at all, so the only valid
+ * file offsets are the magic ones (all zeroes or all ones).
+ */
+ if (head.fmh_keys[0].fmr_offset ||
+ (head.fmh_keys[1].fmr_offset != 0 &&
+ head.fmh_keys[1].fmr_offset != -1ULL))
+ return -EINVAL;
+
+ xhead.fmh_iflags = head.fmh_iflags;
+ xhead.fmh_count = head.fmh_count;
+ ext4_fsmap_to_internal(sb, &xhead.fmh_keys[0], &head.fmh_keys[0]);
+ ext4_fsmap_to_internal(sb, &xhead.fmh_keys[1], &head.fmh_keys[1]);
+
+ trace_ext4_getfsmap_low_key(sb, &xhead.fmh_keys[0]);
+ trace_ext4_getfsmap_high_key(sb, &xhead.fmh_keys[1]);
+
+ info.gi_sb = sb;
+ info.gi_data = arg;
+ error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info);
+ if (error == EXT4_QUERY_RANGE_ABORT)
+ aborted = true;
+ else if (error)
+ return error;
+
+ /* If we didn't abort, set the "last" flag in the last fmx */
+ if (!aborted && info.gi_idx) {
+ info.gi_last_flags |= FMR_OF_LAST;
+ if (copy_to_user(&info.gi_data->fmh_recs[info.gi_idx - 1].fmr_flags,
+ &info.gi_last_flags,
+ sizeof(info.gi_last_flags)))
+ return -EFAULT;
+ }
+
+ /* copy back header */
+ head.fmh_entries = xhead.fmh_entries;
+ head.fmh_oflags = xhead.fmh_oflags;
+ if (copy_to_user(arg, &head, sizeof(struct fsmap_head)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long ext4_ioctl_group_add(struct file *file,
+ struct ext4_new_group_data *input)
+{
+ struct super_block *sb = file_inode(file)->i_sb;
+ int err, err2=0;
+
+ err = ext4_resize_begin(sb);
+ if (err)
+ return err;
+
+ if (ext4_has_feature_bigalloc(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not supported with bigalloc");
+ err = -EOPNOTSUPP;
+ goto group_add_out;
+ }
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto group_add_out;
+
+ err = ext4_group_add(sb, input);
+ if (EXT4_SB(sb)->s_journal) {
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ }
+ if (err == 0)
+ err = err2;
+ mnt_drop_write_file(file);
+ if (!err && ext4_has_group_desc_csum(sb) &&
+ test_opt(sb, INIT_INODE_TABLE))
+ err = ext4_register_li_request(sb, input->group);
+group_add_out:
+ err2 = ext4_resize_end(sb, false);
+ if (err == 0)
+ err = err2;
+ return err;
+}
+
+int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ u32 flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
+
+ if (S_ISREG(inode->i_mode))
+ flags &= ~FS_PROJINHERIT_FL;
+
+ fileattr_fill_flags(fa, flags);
+ if (ext4_has_feature_project(inode->i_sb))
+ fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid);
+
+ return 0;
+}
+
+int ext4_fileattr_set(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct file_kattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ u32 flags = fa->flags;
+ int err = -EOPNOTSUPP;
+
+ if (flags & ~EXT4_FL_USER_VISIBLE)
+ goto out;
+
+ /*
+ * chattr(1) grabs flags via GETFLAGS, modifies the result and
+ * passes that to SETFLAGS. So we cannot easily make SETFLAGS
+ * more restrictive than just silently masking off visible but
+ * not settable flags as we always did.
+ */
+ flags &= EXT4_FL_USER_MODIFIABLE;
+ if (ext4_mask_flags(inode->i_mode, flags) != flags)
+ goto out;
+ err = ext4_ioctl_check_immutable(inode, fa->fsx_projid, flags);
+ if (err)
+ goto out;
+ err = ext4_ioctl_setflags(inode, flags);
+ if (err)
+ goto out;
+ err = ext4_ioctl_setproject(inode, fa->fsx_projid);
+out:
+ return err;
+}
+
+/* So that the fiemap access checks can't overflow on 32 bit machines. */
+#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
+
+static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
+{
+ struct fiemap fiemap;
+ struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
+ struct fiemap_extent_info fieinfo = { 0, };
+ struct inode *inode = file_inode(filp);
+ int error;
+
+ if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
+ return -EFAULT;
+
+ if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
+ return -EINVAL;
+
+ fieinfo.fi_flags = fiemap.fm_flags;
+ fieinfo.fi_extents_max = fiemap.fm_extent_count;
+ fieinfo.fi_extents_start = ufiemap->fm_extents;
+
+ error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start,
+ fiemap.fm_length);
+ fiemap.fm_flags = fieinfo.fi_flags;
+ fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
+ if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
+ error = -EFAULT;
+
+ return error;
+}
+
+static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
+{
+ int err = 0;
+ __u32 flags = 0;
+ unsigned int flush_flags = 0;
+ struct super_block *sb = file_inode(filp)->i_sb;
+
+ if (copy_from_user(&flags, (__u32 __user *)arg,
+ sizeof(__u32)))
+ return -EFAULT;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* check for invalid bits set */
+ if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) ||
+ ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ (flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
+ return -EINVAL;
+
+ if (!EXT4_SB(sb)->s_journal)
+ return -ENODEV;
+
+ if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev))
+ return -EOPNOTSUPP;
+
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
+ return 0;
+
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD)
+ flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD;
+
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) {
+ flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT;
+ pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow");
+ }
+
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+
+ return err;
+}
+
+static int ext4_ioctl_setlabel(struct file *filp, const char __user *user_label)
+{
+ size_t len;
+ int ret = 0;
+ char new_label[EXT4_LABEL_MAX + 1];
+ struct super_block *sb = file_inode(filp)->i_sb;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * Copy the maximum length allowed for ext4 label with one more to
+ * find the required terminating null byte in order to test the
+ * label length. The on disk label doesn't need to be null terminated.
+ */
+ if (copy_from_user(new_label, user_label, EXT4_LABEL_MAX + 1))
+ return -EFAULT;
+
+ len = strnlen(new_label, EXT4_LABEL_MAX + 1);
+ if (len > EXT4_LABEL_MAX)
+ return -EINVAL;
+
+ /*
+ * Clear the buffer after the new label
+ */
+ memset(new_label + len, 0, EXT4_LABEL_MAX - len);
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setlabel, new_label);
+
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label)
+{
+ char label[EXT4_LABEL_MAX + 1];
+
+ /*
+ * EXT4_LABEL_MAX must always be smaller than FSLABEL_MAX because
+ * FSLABEL_MAX must include terminating null byte, while s_volume_name
+ * does not have to.
+ */
+ BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX);
+
+ lock_buffer(sbi->s_sbh);
+ memtostr_pad(label, sbi->s_es->s_volume_name);
+ unlock_buffer(sbi->s_sbh);
+
+ if (copy_to_user(user_label, label, sizeof(label)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi,
+ struct fsuuid __user *ufsuuid)
+{
+ struct fsuuid fsuuid;
+ __u8 uuid[UUID_SIZE];
+
+ if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid)))
+ return -EFAULT;
+
+ if (fsuuid.fsu_len == 0) {
+ fsuuid.fsu_len = UUID_SIZE;
+ if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len,
+ sizeof(fsuuid.fsu_len)))
+ return -EFAULT;
+ return 0;
+ }
+
+ if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0)
+ return -EINVAL;
+
+ lock_buffer(sbi->s_sbh);
+ memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE);
+ unlock_buffer(sbi->s_sbh);
+
+ fsuuid.fsu_len = UUID_SIZE;
+ if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) ||
+ copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE))
+ return -EFAULT;
+ return 0;
+}
+
+static int ext4_ioctl_setuuid(struct file *filp,
+ const struct fsuuid __user *ufsuuid)
+{
+ int ret = 0;
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct fsuuid fsuuid;
+ __u8 uuid[UUID_SIZE];
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * If any checksums (group descriptors or metadata) are being used
+ * then the checksum seed feature is required to change the UUID.
+ */
+ if (((ext4_has_feature_gdt_csum(sb) ||
+ ext4_has_feature_metadata_csum(sb))
+ && !ext4_has_feature_csum_seed(sb))
+ || ext4_has_feature_stable_inodes(sb))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid)))
+ return -EFAULT;
+
+ if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+ return -EINVAL;
+
+ if (copy_from_user(uuid, &ufsuuid->fsu_uuid[0], UUID_SIZE))
+ return -EFAULT;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setuuid, &uuid);
+ mnt_drop_write_file(filp);
+
+ return ret;
+}
+
+
+#define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \
+ EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \
+ EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \
+ EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \
+ EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \
+ EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \
+ EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \
+ EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \
+ EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \
+ EXT4_TUNE_FL_ENCODING_FLAGS)
+
+#define EXT4_TUNE_SET_COMPAT_SUPP \
+ (EXT4_FEATURE_COMPAT_DIR_INDEX | \
+ EXT4_FEATURE_COMPAT_STABLE_INODES)
+#define EXT4_TUNE_SET_INCOMPAT_SUPP \
+ (EXT4_FEATURE_INCOMPAT_EXTENTS | \
+ EXT4_FEATURE_INCOMPAT_EA_INODE | \
+ EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
+ EXT4_FEATURE_INCOMPAT_LARGEDIR | \
+ EXT4_FEATURE_INCOMPAT_CASEFOLD)
+#define EXT4_TUNE_SET_RO_COMPAT_SUPP \
+ (EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \
+ EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
+ EXT4_FEATURE_RO_COMPAT_PROJECT | \
+ EXT4_FEATURE_RO_COMPAT_VERITY)
+
+#define EXT4_TUNE_CLEAR_COMPAT_SUPP (0)
+#define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0)
+#define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0)
+
+#define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \
+ SB_ENC_NO_COMPAT_FALLBACK_FL)
+
+static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi,
+ struct ext4_tune_sb_params __user *params)
+{
+ struct ext4_tune_sb_params ret;
+ struct ext4_super_block *es = sbi->s_es;
+
+ memset(&ret, 0, sizeof(ret));
+ ret.set_flags = TUNE_OPS_SUPPORTED;
+ ret.errors_behavior = le16_to_cpu(es->s_errors);
+ ret.mnt_count = le16_to_cpu(es->s_mnt_count);
+ ret.max_mnt_count = le16_to_cpu(es->s_max_mnt_count);
+ ret.checkinterval = le32_to_cpu(es->s_checkinterval);
+ ret.last_check_time = le32_to_cpu(es->s_lastcheck);
+ ret.reserved_blocks = ext4_r_blocks_count(es);
+ ret.blocks_count = ext4_blocks_count(es);
+ ret.reserved_uid = ext4_get_resuid(es);
+ ret.reserved_gid = ext4_get_resgid(es);
+ ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts);
+ ret.def_hash_alg = es->s_def_hash_version;
+ ret.raid_stride = le16_to_cpu(es->s_raid_stride);
+ ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width);
+ ret.encoding = le16_to_cpu(es->s_encoding);
+ ret.encoding_flags = le16_to_cpu(es->s_encoding_flags);
+ strscpy_pad(ret.mount_opts, es->s_mount_opts);
+ ret.feature_compat = le32_to_cpu(es->s_feature_compat);
+ ret.feature_incompat = le32_to_cpu(es->s_feature_incompat);
+ ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat);
+ ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP;
+ ret.set_feature_incompat_mask = EXT4_TUNE_SET_INCOMPAT_SUPP;
+ ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP;
+ ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP;
+ ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP;
+ ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP;
+ if (copy_to_user(params, &ret, sizeof(ret)))
+ return -EFAULT;
+ return 0;
+}
+
+static void ext4_sb_setparams(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
+{
+ const struct ext4_tune_sb_params *params = arg;
+
+ if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR)
+ es->s_errors = cpu_to_le16(params->errors_behavior);
+ if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT)
+ es->s_mnt_count = cpu_to_le16(params->mnt_count);
+ if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT)
+ es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count);
+ if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL)
+ es->s_checkinterval = cpu_to_le32(params->checkinterval);
+ if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME)
+ es->s_lastcheck = cpu_to_le32(params->last_check_time);
+ if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) {
+ ext4_fsblk_t blk = params->reserved_blocks;
+
+ es->s_r_blocks_count_lo = cpu_to_le32((u32)blk);
+ es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) {
+ int uid = params->reserved_uid;
+
+ es->s_def_resuid = cpu_to_le16(uid & 0xFFFF);
+ es->s_def_resuid_hi = cpu_to_le16(uid >> 16);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) {
+ int gid = params->reserved_gid;
+
+ es->s_def_resgid = cpu_to_le16(gid & 0xFFFF);
+ es->s_def_resgid_hi = cpu_to_le16(gid >> 16);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS)
+ es->s_default_mount_opts = cpu_to_le32(params->default_mnt_opts);
+ if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG)
+ es->s_def_hash_version = params->def_hash_alg;
+ if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE)
+ es->s_raid_stride = cpu_to_le16(params->raid_stride);
+ if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH)
+ es->s_raid_stripe_width =
+ cpu_to_le32(params->raid_stripe_width);
+ if (params->set_flags & EXT4_TUNE_FL_ENCODING)
+ es->s_encoding = cpu_to_le16(params->encoding);
+ if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)
+ es->s_encoding_flags = cpu_to_le16(params->encoding_flags);
+ strscpy_pad(es->s_mount_opts, params->mount_opts);
+ if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) {
+ es->s_feature_compat |=
+ cpu_to_le32(params->set_feature_compat_mask);
+ es->s_feature_incompat |=
+ cpu_to_le32(params->set_feature_incompat_mask);
+ es->s_feature_ro_compat |=
+ cpu_to_le32(params->set_feature_ro_compat_mask);
+ es->s_feature_compat &=
+ ~cpu_to_le32(params->clear_feature_compat_mask);
+ es->s_feature_incompat &=
+ ~cpu_to_le32(params->clear_feature_incompat_mask);
+ es->s_feature_ro_compat &=
+ ~cpu_to_le32(params->clear_feature_ro_compat_mask);
+ if (params->set_feature_compat_mask &
+ EXT4_FEATURE_COMPAT_DIR_INDEX)
+ es->s_def_hash_version = sbi->s_def_hash_version;
+ if (params->set_feature_incompat_mask &
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+ es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK)
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+}
+
+static int ext4_ioctl_set_tune_sb(struct file *filp,
+ struct ext4_tune_sb_params __user *in)
+{
+ struct ext4_tune_sb_params params;
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int enabling_casefold = 0;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(¶ms, in, sizeof(params)))
+ return -EFAULT;
+
+ if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0)
+ return -EOPNOTSUPP;
+
+ if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) &&
+ (params.errors_behavior > EXT4_ERRORS_PANIC))
+ return -EINVAL;
+
+ if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) &&
+ (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2))
+ return -EINVAL;
+ if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) &&
+ ((params.def_hash_alg > DX_HASH_LAST) ||
+ (params.def_hash_alg == DX_HASH_SIPHASH)))
+ return -EINVAL;
+ if ((params.set_flags & EXT4_TUNE_FL_FEATURES) &&
+ (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES))
+ return -EINVAL;
+
+ if (params.set_flags & EXT4_TUNE_FL_FEATURES) {
+ params.set_feature_compat_mask =
+ params.feature_compat &
+ ~le32_to_cpu(es->s_feature_compat);
+ params.set_feature_incompat_mask =
+ params.feature_incompat &
+ ~le32_to_cpu(es->s_feature_incompat);
+ params.set_feature_ro_compat_mask =
+ params.feature_ro_compat &
+ ~le32_to_cpu(es->s_feature_ro_compat);
+ params.clear_feature_compat_mask =
+ ~params.feature_compat &
+ le32_to_cpu(es->s_feature_compat);
+ params.clear_feature_incompat_mask =
+ ~params.feature_incompat &
+ le32_to_cpu(es->s_feature_incompat);
+ params.clear_feature_ro_compat_mask =
+ ~params.feature_ro_compat &
+ le32_to_cpu(es->s_feature_ro_compat);
+ params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES;
+ }
+ if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) {
+ if ((params.set_feature_compat_mask &
+ ~EXT4_TUNE_SET_COMPAT_SUPP) ||
+ (params.set_feature_incompat_mask &
+ ~EXT4_TUNE_SET_INCOMPAT_SUPP) ||
+ (params.set_feature_ro_compat_mask &
+ ~EXT4_TUNE_SET_RO_COMPAT_SUPP) ||
+ (params.clear_feature_compat_mask &
+ ~EXT4_TUNE_CLEAR_COMPAT_SUPP) ||
+ (params.clear_feature_incompat_mask &
+ ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) ||
+ (params.clear_feature_ro_compat_mask &
+ ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP))
+ return -EOPNOTSUPP;
+
+ /*
+ * Filter out the features that are already set from
+ * the set_mask.
+ */
+ params.set_feature_compat_mask &=
+ ~le32_to_cpu(es->s_feature_compat);
+ params.set_feature_incompat_mask &=
+ ~le32_to_cpu(es->s_feature_incompat);
+ params.set_feature_ro_compat_mask &=
+ ~le32_to_cpu(es->s_feature_ro_compat);
+ if ((params.set_feature_incompat_mask &
+ EXT4_FEATURE_INCOMPAT_CASEFOLD)) {
+ enabling_casefold = 1;
+ if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) {
+ params.encoding = EXT4_ENC_UTF8_12_1;
+ params.set_flags |= EXT4_TUNE_FL_ENCODING;
+ }
+ if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) {
+ params.encoding_flags = 0;
+ params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS;
+ }
+ }
+ if ((params.set_feature_compat_mask &
+ EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ uuid_t uu;
+
+ memcpy(&uu, sbi->s_hash_seed, UUID_SIZE);
+ if (uuid_is_null(&uu))
+ generate_random_uuid((char *)
+ &sbi->s_hash_seed);
+ if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG)
+ sbi->s_def_hash_version = params.def_hash_alg;
+ else if (sbi->s_def_hash_version == 0)
+ sbi->s_def_hash_version = DX_HASH_HALF_MD4;
+ if (!(es->s_flags &
+ cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) &&
+ !(es->s_flags &
+ cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) {
+#ifdef __CHAR_UNSIGNED__
+ sbi->s_hash_unsigned = 3;
+#else
+ sbi->s_hash_unsigned = 0;
+#endif
+ }
+ }
+ }
+ if (params.set_flags & EXT4_TUNE_FL_ENCODING) {
+ if (!enabling_casefold)
+ return -EINVAL;
+ if (params.encoding == 0)
+ params.encoding = EXT4_ENC_UTF8_12_1;
+ else if (params.encoding != EXT4_ENC_UTF8_12_1)
+ return -EINVAL;
+ }
+ if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) {
+ if (!enabling_casefold)
+ return -EINVAL;
+ if (params.encoding_flags & ~SB_ENC_SUPP_MASK)
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, ¶ms);
+ mnt_drop_write_file(filp);
+
+ if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG)
+ sbi->s_def_hash_version = params.def_hash_alg;
+
+ return ret;
+}
+
+static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
+ struct mnt_idmap *idmap = file_mnt_idmap(filp);
+
+ ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
+
+ switch (cmd) {
+ case FS_IOC_GETFSMAP:
+ return ext4_ioc_getfsmap(sb, (void __user *)arg);
+ case EXT4_IOC_GETVERSION:
+ case EXT4_IOC_GETVERSION_OLD:
+ return put_user(inode->i_generation, (int __user *) arg);
+ case EXT4_IOC_SETVERSION:
+ case EXT4_IOC_SETVERSION_OLD: {
+ handle_t *handle;
+ struct ext4_iloc iloc;
+ __u32 generation;
+ int err;
+
+ if (!inode_owner_or_capable(idmap, inode))
+ return -EPERM;
+
+ if (ext4_has_feature_metadata_csum(inode->i_sb)) {
+ ext4_warning(sb, "Setting inode version is not "
+ "supported with metadata_csum enabled.");
+ return -ENOTTY;
+ }
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ if (get_user(generation, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+ }
+
+ inode_lock(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto unlock_out;
+ }
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err == 0) {
+ inode_set_ctime_current(inode);
+ inode_inc_iversion(inode);
+ inode->i_generation = generation;
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ }
+ ext4_journal_stop(handle);
+
+unlock_out:
+ inode_unlock(inode);
+setversion_out:
+ mnt_drop_write_file(filp);
+ return err;
+ }
+ case EXT4_IOC_GROUP_EXTEND: {
+ ext4_fsblk_t n_blocks_count;
+ int err, err2=0;
+
+ err = ext4_resize_begin(sb);
+ if (err)
+ return err;
+
+ if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+ err = -EFAULT;
+ goto group_extend_out;
+ }
+
+ if (ext4_has_feature_bigalloc(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not supported with bigalloc");
+ err = -EOPNOTSUPP;
+ goto group_extend_out;
+ }
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ goto group_extend_out;
+
+ err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
+ if (EXT4_SB(sb)->s_journal) {
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ }
+ if (err == 0)
+ err = err2;
+ mnt_drop_write_file(filp);
+group_extend_out:
+ err2 = ext4_resize_end(sb, false);
+ if (err == 0)
+ err = err2;
+ return err;
+ }
+
+ case EXT4_IOC_MOVE_EXT: {
+ struct move_extent me;
+ int err;
+
+ if (!(filp->f_mode & FMODE_READ) ||
+ !(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (copy_from_user(&me,
+ (struct move_extent __user *)arg, sizeof(me)))
+ return -EFAULT;
+ me.moved_len = 0;
+
+ CLASS(fd, donor)(me.donor_fd);
+ if (fd_empty(donor))
+ return -EBADF;
+
+ if (!(fd_file(donor)->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (ext4_has_feature_bigalloc(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with bigalloc");
+ return -EOPNOTSUPP;
+ } else if (IS_DAX(inode)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with DAX");
+ return -EOPNOTSUPP;
+ }
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ err = ext4_move_extents(filp, fd_file(donor), me.orig_start,
+ me.donor_start, me.len, &me.moved_len);
+ mnt_drop_write_file(filp);
+
+ if (copy_to_user((struct move_extent __user *)arg,
+ &me, sizeof(me)))
+ err = -EFAULT;
+ return err;
+ }
+
+ case EXT4_IOC_GROUP_ADD: {
+ struct ext4_new_group_data input;
+
+ if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
+ sizeof(input)))
+ return -EFAULT;
+
+ return ext4_ioctl_group_add(filp, &input);
+ }
+
+ case EXT4_IOC_MIGRATE:
+ {
+ int err;
+ if (!inode_owner_or_capable(idmap, inode))
+ return -EACCES;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ /*
+ * inode_mutex prevent write and truncate on the file.
+ * Read still goes through. We take i_data_sem in
+ * ext4_ext_swap_inode_data before we switch the
+ * inode format to prevent read.
+ */
+ inode_lock((inode));
+ err = ext4_ext_migrate(inode);
+ inode_unlock((inode));
+ mnt_drop_write_file(filp);
+ return err;
+ }
+
+ case EXT4_IOC_ALLOC_DA_BLKS:
+ {
+ int err;
+ if (!inode_owner_or_capable(idmap, inode))
+ return -EACCES;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ err = ext4_alloc_da_blocks(inode);
+ mnt_drop_write_file(filp);
+ return err;
+ }
+
+ case EXT4_IOC_SWAP_BOOT:
+ {
+ int err;
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ err = swap_inode_boot_loader(sb, idmap, inode);
+ mnt_drop_write_file(filp);
+ return err;
+ }
+
+ case EXT4_IOC_RESIZE_FS: {
+ ext4_fsblk_t n_blocks_count;
+ int err = 0, err2 = 0;
+ ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
+
+ if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
+ sizeof(__u64))) {
+ return -EFAULT;
+ }
+
+ err = ext4_resize_begin(sb);
+ if (err)
+ return err;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ goto resizefs_out;
+
+ err = ext4_resize_fs(sb, n_blocks_count);
+ if (EXT4_SB(sb)->s_journal) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL);
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ }
+ if (err == 0)
+ err = err2;
+ mnt_drop_write_file(filp);
+ if (!err && (o_group < EXT4_SB(sb)->s_groups_count) &&
+ ext4_has_group_desc_csum(sb) &&
+ test_opt(sb, INIT_INODE_TABLE))
+ err = ext4_register_li_request(sb, o_group);
+
+resizefs_out:
+ err2 = ext4_resize_end(sb, true);
+ if (err == 0)
+ err = err2;
+ return err;
+ }
+
+ case FITRIM:
+ {
+ struct fstrim_range range;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!bdev_max_discard_sectors(sb->s_bdev))
+ return -EOPNOTSUPP;
+
+ /*
+ * We haven't replayed the journal, so we cannot use our
+ * block-bitmap-guided storage zapping commands.
+ */
+ if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb))
+ return -EROFS;
+
+ if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ ret = ext4_trim_fs(sb, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((struct fstrim_range __user *)arg, &range,
+ sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+ }
+ case EXT4_IOC_PRECACHE_EXTENTS:
+ {
+ int ret;
+
+ inode_lock_shared(inode);
+ ret = ext4_ext_precache(inode);
+ inode_unlock_shared(inode);
+ return ret;
+ }
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
+
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
+ return ext4_ioctl_get_encryption_pwsalt(filp, (void __user *)arg);
+
+ case FS_IOC_GET_ENCRYPTION_POLICY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
+
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg);
+
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_add_key(filp, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_remove_key(filp, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_remove_key_all_users(filp,
+ (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
+
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_nonce(filp, (void __user *)arg);
+
+ case EXT4_IOC_CLEAR_ES_CACHE:
+ {
+ if (!inode_owner_or_capable(idmap, inode))
+ return -EACCES;
+ ext4_clear_inode_es(inode);
+ return 0;
+ }
+
+ case EXT4_IOC_GETSTATE:
+ {
+ __u32 state = 0;
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED))
+ state |= EXT4_STATE_FLAG_EXT_PRECACHED;
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
+ state |= EXT4_STATE_FLAG_NEW;
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
+ state |= EXT4_STATE_FLAG_NEWENTRY;
+ if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE))
+ state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE;
+
+ return put_user(state, (__u32 __user *) arg);
+ }
+
+ case EXT4_IOC_GET_ES_CACHE:
+ return ext4_ioctl_get_es_cache(filp, arg);
+
+ case EXT4_IOC_SHUTDOWN:
+ return ext4_ioctl_shutdown(sb, arg);
+
+ case FS_IOC_ENABLE_VERITY:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_enable(filp, (const void __user *)arg);
+
+ case FS_IOC_MEASURE_VERITY:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_measure(filp, (void __user *)arg);
+
+ case FS_IOC_READ_VERITY_METADATA:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_read_metadata(filp,
+ (const void __user *)arg);
+
+ case EXT4_IOC_CHECKPOINT:
+ return ext4_ioctl_checkpoint(filp, arg);
+
+ case FS_IOC_GETFSLABEL:
+ return ext4_ioctl_getlabel(EXT4_SB(sb), (void __user *)arg);
+
+ case FS_IOC_SETFSLABEL:
+ return ext4_ioctl_setlabel(filp,
+ (const void __user *)arg);
+
+ case EXT4_IOC_GETFSUUID:
+ return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg);
+ case EXT4_IOC_SETFSUUID:
+ return ext4_ioctl_setuuid(filp, (const void __user *)arg);
+ case EXT4_IOC_GET_TUNE_SB_PARAM:
+ return ext4_ioctl_get_tune_sb(EXT4_SB(sb),
+ (void __user *)arg);
+ case EXT4_IOC_SET_TUNE_SB_PARAM:
+ return ext4_ioctl_set_tune_sb(filp, (void __user *)arg);
+ default:
+ return -ENOTTY;
+ }
+}
+
+long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ return __ext4_ioctl(filp, cmd, arg);
+}
+
+#ifdef CONFIG_COMPAT
+long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ /* These are just misnamed, they actually get/put from/to user an int */
+ switch (cmd) {
+ case EXT4_IOC32_GETVERSION:
+ cmd = EXT4_IOC_GETVERSION;
+ break;
+ case EXT4_IOC32_SETVERSION:
+ cmd = EXT4_IOC_SETVERSION;
+ break;
+ case EXT4_IOC32_GROUP_EXTEND:
+ cmd = EXT4_IOC_GROUP_EXTEND;
+ break;
+ case EXT4_IOC32_GETVERSION_OLD:
+ cmd = EXT4_IOC_GETVERSION_OLD;
+ break;
+ case EXT4_IOC32_SETVERSION_OLD:
+ cmd = EXT4_IOC_SETVERSION_OLD;
+ break;
+ case EXT4_IOC32_GETRSVSZ:
+ cmd = EXT4_IOC_GETRSVSZ;
+ break;
+ case EXT4_IOC32_SETRSVSZ:
+ cmd = EXT4_IOC_SETRSVSZ;
+ break;
+ case EXT4_IOC32_GROUP_ADD: {
+ struct compat_ext4_new_group_input __user *uinput;
+ struct ext4_new_group_data input;
+ int err;
+
+ uinput = compat_ptr(arg);
+ err = get_user(input.group, &uinput->group);
+ err |= get_user(input.block_bitmap, &uinput->block_bitmap);
+ err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
+ err |= get_user(input.inode_table, &uinput->inode_table);
+ err |= get_user(input.blocks_count, &uinput->blocks_count);
+ err |= get_user(input.reserved_blocks,
+ &uinput->reserved_blocks);
+ if (err)
+ return -EFAULT;
+ return ext4_ioctl_group_add(file, &input);
+ }
+ case EXT4_IOC_MOVE_EXT:
+ case EXT4_IOC_RESIZE_FS:
+ case FITRIM:
+ case EXT4_IOC_PRECACHE_EXTENTS:
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ case EXT4_IOC_SHUTDOWN:
+ case FS_IOC_GETFSMAP:
+ case FS_IOC_ENABLE_VERITY:
+ case FS_IOC_MEASURE_VERITY:
+ case FS_IOC_READ_VERITY_METADATA:
+ case EXT4_IOC_CLEAR_ES_CACHE:
+ case EXT4_IOC_GETSTATE:
+ case EXT4_IOC_GET_ES_CACHE:
+ case EXT4_IOC_CHECKPOINT:
+ case FS_IOC_GETFSLABEL:
+ case FS_IOC_SETFSLABEL:
+ case EXT4_IOC_GETFSUUID:
+ case EXT4_IOC_SETFSUUID:
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+static void set_overhead(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
+{
+ es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg));
+}
+
+int ext4_update_overhead(struct super_block *sb, bool force)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (ext4_emergency_state(sb) || sb_rdonly(sb))
+ return 0;
+ if (!force &&
+ (sbi->s_overhead == 0 ||
+ sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters)))
+ return 0;
+ return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead);
+}
new file mode 100644
@@ -0,0 +1,999 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test of ext4 multiblocks allocation.
+ */
+
+#include <kunit/test.h>
+#include <kunit/static_stub.h>
+#include <linux/random.h>
+
+#include "ext4.h"
+
+struct mbt_grp_ctx {
+ struct buffer_head bitmap_bh;
+ /* desc and gd_bh are just the place holders for now */
+ struct ext4_group_desc desc;
+ struct buffer_head gd_bh;
+};
+
+struct mbt_ctx {
+ struct mbt_grp_ctx *grp_ctx;
+};
+
+struct mbt_ext4_super_block {
+ struct ext4_super_block es;
+ struct ext4_sb_info sbi;
+ struct mbt_ctx mbt_ctx;
+};
+
+#define MBT_SB(_sb) (container_of((_sb)->s_fs_info, struct mbt_ext4_super_block, sbi))
+#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx)
+#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group])
+
+static struct inode *mbt_alloc_inode(struct super_block *sb)
+{
+ struct ext4_inode_info *ei;
+
+ ei = kmalloc(sizeof(struct ext4_inode_info), GFP_KERNEL);
+ if (!ei)
+ return NULL;
+
+ INIT_LIST_HEAD(&ei->i_orphan);
+ init_rwsem(&ei->xattr_sem);
+ init_rwsem(&ei->i_data_sem);
+ inode_init_once(&ei->vfs_inode);
+ ext4_fc_init_inode(&ei->vfs_inode);
+
+ return &ei->vfs_inode;
+}
+
+static void mbt_free_inode(struct inode *inode)
+{
+ kfree(EXT4_I(inode));
+}
+
+static const struct super_operations mbt_sops = {
+ .alloc_inode = mbt_alloc_inode,
+ .free_inode = mbt_free_inode,
+};
+
+static void mbt_kill_sb(struct super_block *sb)
+{
+ generic_shutdown_super(sb);
+}
+
+static struct file_system_type mbt_fs_type = {
+ .name = "mballoc test",
+ .kill_sb = mbt_kill_sb,
+};
+
+static int mbt_mb_init(struct super_block *sb)
+{
+ ext4_fsblk_t block;
+ int ret;
+
+ /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */
+ sb->s_bdev = kzalloc(sizeof(*sb->s_bdev), GFP_KERNEL);
+ if (sb->s_bdev == NULL)
+ return -ENOMEM;
+
+ sb->s_bdev->bd_queue = kzalloc(sizeof(struct request_queue), GFP_KERNEL);
+ if (sb->s_bdev->bd_queue == NULL) {
+ kfree(sb->s_bdev);
+ return -ENOMEM;
+ }
+
+ /*
+ * needed by ext4_mb_init->ext4_mb_init_backend-> sbi->s_buddy_cache =
+ * new_inode(sb);
+ */
+ INIT_LIST_HEAD(&sb->s_inodes);
+ sb->s_op = &mbt_sops;
+
+ ret = ext4_mb_init(sb);
+ if (ret != 0)
+ goto err_out;
+
+ block = ext4_count_free_clusters(sb);
+ ret = percpu_counter_init(&EXT4_SB(sb)->s_freeclusters_counter, block,
+ GFP_KERNEL);
+ if (ret != 0)
+ goto err_mb_release;
+
+ ret = percpu_counter_init(&EXT4_SB(sb)->s_dirtyclusters_counter, 0,
+ GFP_KERNEL);
+ if (ret != 0)
+ goto err_freeclusters;
+
+ return 0;
+
+err_freeclusters:
+ percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter);
+err_mb_release:
+ ext4_mb_release(sb);
+err_out:
+ kfree(sb->s_bdev->bd_queue);
+ kfree(sb->s_bdev);
+ return ret;
+}
+
+static void mbt_mb_release(struct super_block *sb)
+{
+ percpu_counter_destroy(&EXT4_SB(sb)->s_dirtyclusters_counter);
+ percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter);
+ ext4_mb_release(sb);
+ kfree(sb->s_bdev->bd_queue);
+ kfree(sb->s_bdev);
+}
+
+static int mbt_set(struct super_block *sb, void *data)
+{
+ return 0;
+}
+
+static struct super_block *mbt_ext4_alloc_super_block(void)
+{
+ struct mbt_ext4_super_block *fsb;
+ struct super_block *sb;
+ struct ext4_sb_info *sbi;
+
+ fsb = kzalloc(sizeof(*fsb), GFP_KERNEL);
+ if (fsb == NULL)
+ return NULL;
+
+ sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL);
+ if (IS_ERR(sb))
+ goto out;
+
+ sbi = &fsb->sbi;
+
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+ if (!sbi->s_blockgroup_lock)
+ goto out_deactivate;
+
+ bgl_lock_init(sbi->s_blockgroup_lock);
+
+ sbi->s_es = &fsb->es;
+ sbi->s_sb = sb;
+ sb->s_fs_info = sbi;
+
+ up_write(&sb->s_umount);
+ return sb;
+
+out_deactivate:
+ deactivate_locked_super(sb);
+out:
+ kfree(fsb);
+ return NULL;
+}
+
+static void mbt_ext4_free_super_block(struct super_block *sb)
+{
+ struct mbt_ext4_super_block *fsb = MBT_SB(sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ kfree(sbi->s_blockgroup_lock);
+ deactivate_super(sb);
+ kfree(fsb);
+}
+
+struct mbt_ext4_block_layout {
+ unsigned char blocksize_bits;
+ unsigned int cluster_bits;
+ uint32_t blocks_per_group;
+ ext4_group_t group_count;
+ uint16_t desc_size;
+};
+
+static void mbt_init_sb_layout(struct super_block *sb,
+ struct mbt_ext4_block_layout *layout)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+
+ sb->s_blocksize = 1UL << layout->blocksize_bits;
+ sb->s_blocksize_bits = layout->blocksize_bits;
+
+ sbi->s_groups_count = layout->group_count;
+ sbi->s_blocks_per_group = layout->blocks_per_group;
+ sbi->s_cluster_bits = layout->cluster_bits;
+ sbi->s_cluster_ratio = 1U << layout->cluster_bits;
+ sbi->s_clusters_per_group = layout->blocks_per_group >>
+ layout->cluster_bits;
+ sbi->s_desc_size = layout->desc_size;
+ sbi->s_desc_per_block_bits =
+ sb->s_blocksize_bits - (fls(layout->desc_size) - 1);
+ sbi->s_desc_per_block = 1 << sbi->s_desc_per_block_bits;
+
+ es->s_first_data_block = cpu_to_le32(0);
+ es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group *
+ layout->group_count);
+}
+
+static int mbt_grp_ctx_init(struct super_block *sb,
+ struct mbt_grp_ctx *grp_ctx)
+{
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+
+ grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL);
+ if (grp_ctx->bitmap_bh.b_data == NULL)
+ return -ENOMEM;
+ mb_set_bits(grp_ctx->bitmap_bh.b_data, max, sb->s_blocksize * 8 - max);
+ ext4_free_group_clusters_set(sb, &grp_ctx->desc, max);
+
+ return 0;
+}
+
+static void mbt_grp_ctx_release(struct mbt_grp_ctx *grp_ctx)
+{
+ kfree(grp_ctx->bitmap_bh.b_data);
+ grp_ctx->bitmap_bh.b_data = NULL;
+}
+
+static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group,
+ unsigned int start, unsigned int len)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+
+ mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len);
+}
+
+static void *mbt_ctx_bitmap(struct super_block *sb, ext4_group_t group)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+
+ return grp_ctx->bitmap_bh.b_data;
+}
+
+/* called after mbt_init_sb_layout */
+static int mbt_ctx_init(struct super_block *sb)
+{
+ struct mbt_ctx *ctx = MBT_CTX(sb);
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+ ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx),
+ GFP_KERNEL);
+ if (ctx->grp_ctx == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < ngroups; i++)
+ if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i]))
+ goto out;
+
+ /*
+ * first data block(first cluster in first group) is used by
+ * metadata, mark it used to avoid to alloc data block at first
+ * block which will fail ext4_sb_block_valid check.
+ */
+ mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1);
+ ext4_free_group_clusters_set(sb, &ctx->grp_ctx[0].desc,
+ EXT4_CLUSTERS_PER_GROUP(sb) - 1);
+
+ return 0;
+out:
+ while (i-- > 0)
+ mbt_grp_ctx_release(&ctx->grp_ctx[i]);
+ kfree(ctx->grp_ctx);
+ return -ENOMEM;
+}
+
+static void mbt_ctx_release(struct super_block *sb)
+{
+ struct mbt_ctx *ctx = MBT_CTX(sb);
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+ for (i = 0; i < ngroups; i++)
+ mbt_grp_ctx_release(&ctx->grp_ctx[i]);
+ kfree(ctx->grp_ctx);
+}
+
+static struct buffer_head *
+ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group);
+
+ /* paired with brelse from caller of ext4_read_block_bitmap_nowait */
+ get_bh(&grp_ctx->bitmap_bh);
+ return &grp_ctx->bitmap_bh;
+}
+
+static int ext4_wait_block_bitmap_stub(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ /*
+ * real ext4_wait_block_bitmap will set these flags and
+ * functions like ext4_mb_init_cache will verify the flags.
+ */
+ set_buffer_uptodate(bh);
+ set_bitmap_uptodate(bh);
+ set_buffer_verified(bh);
+ return 0;
+}
+
+static struct ext4_group_desc *
+ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group,
+ struct buffer_head **bh)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group);
+
+ if (bh != NULL)
+ *bh = &grp_ctx->gd_bh;
+
+ return &grp_ctx->desc;
+}
+
+static int
+ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state,
+ ext4_group_t group, ext4_grpblk_t blkoff,
+ ext4_grpblk_t len, int flags,
+ ext4_grpblk_t *ret_changed)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+ struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh;
+
+ if (state)
+ mb_set_bits(bitmap_bh->b_data, blkoff, len);
+ else
+ mb_clear_bits(bitmap_bh->b_data, blkoff, len);
+
+ return 0;
+}
+
+#define TEST_GOAL_GROUP 1
+static int mbt_kunit_init(struct kunit *test)
+{
+ struct mbt_ext4_block_layout *layout =
+ (struct mbt_ext4_block_layout *)(test->param_value);
+ struct super_block *sb;
+ int ret;
+
+ sb = mbt_ext4_alloc_super_block();
+ if (sb == NULL)
+ return -ENOMEM;
+
+ mbt_init_sb_layout(sb, layout);
+
+ ret = mbt_ctx_init(sb);
+ if (ret != 0) {
+ mbt_ext4_free_super_block(sb);
+ return ret;
+ }
+
+ test->priv = sb;
+ kunit_activate_static_stub(test,
+ ext4_read_block_bitmap_nowait,
+ ext4_read_block_bitmap_nowait_stub);
+ kunit_activate_static_stub(test,
+ ext4_wait_block_bitmap,
+ ext4_wait_block_bitmap_stub);
+ kunit_activate_static_stub(test,
+ ext4_get_group_desc,
+ ext4_get_group_desc_stub);
+ kunit_activate_static_stub(test,
+ ext4_mb_mark_context,
+ ext4_mb_mark_context_stub);
+
+ /* stub function will be called in mbt_mb_init->ext4_mb_init */
+ if (mbt_mb_init(sb) != 0) {
+ mbt_ctx_release(sb);
+ mbt_ext4_free_super_block(sb);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void mbt_kunit_exit(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+
+ mbt_mb_release(sb);
+ mbt_ctx_release(sb);
+ mbt_ext4_free_super_block(sb);
+}
+
+static void test_new_blocks_simple(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct inode *inode;
+ struct ext4_allocation_request ar;
+ ext4_group_t i, goal_group = TEST_GOAL_GROUP;
+ int err = 0;
+ ext4_fsblk_t found;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+
+ inode->i_sb = sb;
+ ar.inode = inode;
+
+ /* get block at goal */
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test, ar.goal, found,
+ "failed to alloc block at goal, expected %llu found %llu",
+ ar.goal, found);
+
+ /* get block after goal in goal group */
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found,
+ "failed to alloc block after goal in goal group, expected %llu found %llu",
+ ar.goal + 1, found);
+
+ /* get block after goal group */
+ mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test,
+ ext4_group_first_block_no(sb, goal_group + 1), found,
+ "failed to alloc block after goal group, expected %llu found %llu",
+ ext4_group_first_block_no(sb, goal_group + 1), found);
+
+ /* get block before goal group */
+ for (i = goal_group; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test,
+ ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found,
+ "failed to alloc block before goal group, expected %llu found %llu",
+ ext4_group_first_block_no(sb, 0 + EXT4_C2B(sbi, 1)), found);
+
+ /* no block available, fail to allocate block */
+ for (i = 0; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_NE_MSG(test, err, 0,
+ "unexpectedly get block when no block is available");
+}
+
+#define TEST_RANGE_COUNT 8
+
+struct test_range {
+ ext4_grpblk_t start;
+ ext4_grpblk_t len;
+};
+
+static void
+mbt_generate_test_ranges(struct super_block *sb, struct test_range *ranges,
+ int count)
+{
+ ext4_grpblk_t start, len, max;
+ int i;
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb) / count;
+ for (i = 0; i < count; i++) {
+ start = get_random_u32() % max;
+ len = get_random_u32() % max;
+ len = min(len, max - start);
+
+ ranges[i].start = start + i * max;
+ ranges[i].len = len;
+ }
+}
+
+static void
+validate_free_blocks_simple(struct kunit *test, struct super_block *sb,
+ ext4_group_t goal_group, ext4_grpblk_t start,
+ ext4_grpblk_t len)
+{
+ void *bitmap;
+ ext4_grpblk_t bit, max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_group_t i;
+
+ for (i = 0; i < ext4_get_groups_count(sb); i++) {
+ if (i == goal_group)
+ continue;
+
+ bitmap = mbt_ctx_bitmap(sb, i);
+ bit = mb_find_next_zero_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ_MSG(test, bit, max,
+ "free block on unexpected group %d", i);
+ }
+
+ bitmap = mbt_ctx_bitmap(sb, goal_group);
+ bit = mb_find_next_zero_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ(test, bit, start);
+
+ bit = mb_find_next_bit(bitmap, max, bit + 1);
+ KUNIT_ASSERT_EQ(test, bit, start + len);
+}
+
+static void
+test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group,
+ ext4_grpblk_t start, ext4_grpblk_t len)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode;
+ ext4_fsblk_t block;
+
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+ inode->i_sb = sb;
+
+ if (len == 0)
+ return;
+
+ block = ext4_group_first_block_no(sb, goal_group) +
+ EXT4_C2B(sbi, start);
+ ext4_free_blocks_simple(inode, block, len);
+ validate_free_blocks_simple(test, sb, goal_group, start, len);
+ mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+}
+
+static void test_free_blocks_simple(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_group_t i;
+ struct test_range ranges[TEST_RANGE_COUNT];
+
+ for (i = 0; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, max);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_free_blocks_simple_range(test, TEST_GOAL_GROUP,
+ ranges[i].start, ranges[i].len);
+}
+
+static void
+test_mark_diskspace_used_range(struct kunit *test,
+ struct ext4_allocation_context *ac,
+ ext4_grpblk_t start,
+ ext4_grpblk_t len)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int ret;
+ void *bitmap;
+ ext4_grpblk_t i, max;
+
+ /* ext4_mb_mark_diskspace_used will BUG if len is 0 */
+ if (len == 0)
+ return;
+
+ ac->ac_b_ex.fe_group = TEST_GOAL_GROUP;
+ ac->ac_b_ex.fe_start = start;
+ ac->ac_b_ex.fe_len = len;
+
+ bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP);
+ memset(bitmap, 0, sb->s_blocksize);
+ ret = ext4_mb_mark_diskspace_used(ac, NULL, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ i = mb_find_next_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ(test, i, start);
+ i = mb_find_next_zero_bit(bitmap, max, i + 1);
+ KUNIT_ASSERT_EQ(test, i, start + len);
+ i = mb_find_next_bit(bitmap, max, i + 1);
+ KUNIT_ASSERT_EQ(test, max, i);
+}
+
+static void test_mark_diskspace_used(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct inode *inode;
+ struct ext4_allocation_context ac;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+ inode->i_sb = sb;
+
+ ac.ac_status = AC_STATUS_FOUND;
+ ac.ac_sb = sb;
+ ac.ac_inode = inode;
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_mark_diskspace_used_range(test, &ac, ranges[i].start,
+ ranges[i].len);
+}
+
+static void mbt_generate_buddy(struct super_block *sb, void *buddy,
+ void *bitmap, struct ext4_group_info *grp)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ uint32_t order, off;
+ void *bb, *bb_h;
+ int max;
+
+ memset(buddy, 0xff, sb->s_blocksize);
+ memset(grp, 0, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]));
+
+ bb = bitmap;
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ bb_h = buddy + sbi->s_mb_offsets[1];
+
+ off = mb_find_next_zero_bit(bb, max, 0);
+ grp->bb_first_free = off;
+ while (off < max) {
+ grp->bb_counters[0]++;
+ grp->bb_free++;
+
+ if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+ grp->bb_free++;
+ grp->bb_counters[0]--;
+ mb_clear_bit(off >> 1, bb_h);
+ grp->bb_counters[1]++;
+ grp->bb_largest_free_order = 1;
+ off++;
+ }
+
+ off = mb_find_next_zero_bit(bb, max, off + 1);
+ }
+
+ for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) {
+ bb = buddy + sbi->s_mb_offsets[order];
+ bb_h = buddy + sbi->s_mb_offsets[order + 1];
+ max = max >> 1;
+ off = mb_find_next_zero_bit(bb, max, 0);
+
+ while (off < max) {
+ if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+ mb_set_bits(bb, off, 2);
+ grp->bb_counters[order] -= 2;
+ mb_clear_bit(off >> 1, bb_h);
+ grp->bb_counters[order + 1]++;
+ grp->bb_largest_free_order = order + 1;
+ off++;
+ }
+
+ off = mb_find_next_zero_bit(bb, max, off + 1);
+ }
+ }
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ off = mb_find_next_zero_bit(bitmap, max, 0);
+ while (off < max) {
+ grp->bb_fragments++;
+
+ off = mb_find_next_bit(bitmap, max, off + 1);
+ if (off + 1 >= max)
+ break;
+
+ off = mb_find_next_zero_bit(bitmap, max, off + 1);
+ }
+}
+
+static void
+mbt_validate_group_info(struct kunit *test, struct ext4_group_info *grp1,
+ struct ext4_group_info *grp2)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int i;
+
+ KUNIT_ASSERT_EQ(test, grp1->bb_first_free,
+ grp2->bb_first_free);
+ KUNIT_ASSERT_EQ(test, grp1->bb_fragments,
+ grp2->bb_fragments);
+ KUNIT_ASSERT_EQ(test, grp1->bb_free, grp2->bb_free);
+ KUNIT_ASSERT_EQ(test, grp1->bb_largest_free_order,
+ grp2->bb_largest_free_order);
+
+ for (i = 1; i < MB_NUM_ORDERS(sb); i++) {
+ KUNIT_ASSERT_EQ_MSG(test, grp1->bb_counters[i],
+ grp2->bb_counters[i],
+ "bb_counters[%d] diffs, expected %d, generated %d",
+ i, grp1->bb_counters[i],
+ grp2->bb_counters[i]);
+ }
+}
+
+static void
+do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap,
+ void *mbt_buddy, struct ext4_group_info *mbt_grp,
+ void *ext4_buddy, struct ext4_group_info *ext4_grp)
+{
+ int i;
+
+ mbt_generate_buddy(sb, mbt_buddy, bitmap, mbt_grp);
+
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ ext4_grp->bb_counters[i] = 0;
+ /* needed by validation in ext4_mb_generate_buddy */
+ ext4_grp->bb_free = mbt_grp->bb_free;
+ memset(ext4_buddy, 0xff, sb->s_blocksize);
+ ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP,
+ ext4_grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, mbt_grp, ext4_grp);
+}
+
+static void test_mb_generate_buddy(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *expected_bb, *generate_bb;
+ struct ext4_group_info *expected_grp, *generate_grp;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ expected_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_bb);
+ generate_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, generate_bb);
+ expected_grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_grp);
+ generate_grp = ext4_get_group_info(sb, TEST_GOAL_GROUP);
+ KUNIT_ASSERT_NOT_NULL(test, generate_grp);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ mb_set_bits(bitmap, ranges[i].start, ranges[i].len);
+ do_test_generate_buddy(test, sb, bitmap, expected_bb,
+ expected_grp, generate_bb, generate_grp);
+ }
+}
+
+static void
+test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b,
+ ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap,
+ void *buddy, struct ext4_group_info *grp)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_free_extent ex;
+ int i;
+
+ /* mb_mark_used only accepts non-zero len */
+ if (len == 0)
+ return;
+
+ ex.fe_start = start;
+ ex.fe_len = len;
+ ex.fe_group = TEST_GOAL_GROUP;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+
+ mb_set_bits(bitmap, start, len);
+ /* bypass bb_free validatoin in ext4_mb_generate_buddy */
+ grp->bb_free -= len;
+ memset(buddy, 0xff, sb->s_blocksize);
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ grp->bb_counters[i] = 0;
+ ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, grp, e4b->bd_info);
+}
+
+static void test_mb_mark_used(struct kunit *test)
+{
+ struct ext4_buddy e4b;
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *buddy;
+ struct ext4_group_info *grp;
+ int ret;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ /* buddy cache assumes that each page contains at least one block */
+ if (sb->s_blocksize > PAGE_SIZE)
+ kunit_skip(test, "blocksize exceeds pagesize");
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
+ grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
+
+ ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb);
+ grp->bb_largest_free_order = -1;
+ grp->bb_avg_fragment_size_order = -1;
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_mb_mark_used_range(test, &e4b, ranges[i].start,
+ ranges[i].len, bitmap, buddy, grp);
+
+ ext4_mb_unload_buddy(&e4b);
+}
+
+static void
+test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b,
+ ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap,
+ void *buddy, struct ext4_group_info *grp)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int i;
+
+ /* mb_free_blocks will WARN if len is 0 */
+ if (len == 0)
+ return;
+
+ ext4_lock_group(sb, e4b->bd_group);
+ mb_free_blocks(NULL, e4b, start, len);
+ ext4_unlock_group(sb, e4b->bd_group);
+
+ mb_clear_bits(bitmap, start, len);
+ /* bypass bb_free validatoin in ext4_mb_generate_buddy */
+ grp->bb_free += len;
+ memset(buddy, 0xff, sb->s_blocksize);
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ grp->bb_counters[i] = 0;
+ ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, grp, e4b->bd_info);
+
+}
+
+static void test_mb_free_blocks(struct kunit *test)
+{
+ struct ext4_buddy e4b;
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *buddy;
+ struct ext4_group_info *grp;
+ struct ext4_free_extent ex;
+ int ret;
+ int i;
+ struct test_range ranges[TEST_RANGE_COUNT];
+
+ /* buddy cache assumes that each page contains at least one block */
+ if (sb->s_blocksize > PAGE_SIZE)
+ kunit_skip(test, "blocksize exceeds pagesize");
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
+ grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
+
+ ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ ex.fe_start = 0;
+ ex.fe_len = EXT4_CLUSTERS_PER_GROUP(sb);
+ ex.fe_group = TEST_GOAL_GROUP;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(&e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+
+ grp->bb_free = 0;
+ grp->bb_largest_free_order = -1;
+ grp->bb_avg_fragment_size_order = -1;
+ memset(bitmap, 0xff, sb->s_blocksize);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_mb_free_blocks_range(test, &e4b, ranges[i].start,
+ ranges[i].len, bitmap, buddy, grp);
+
+ ext4_mb_unload_buddy(&e4b);
+}
+
+#define COUNT_FOR_ESTIMATE 100000
+static void test_mb_mark_used_cost(struct kunit *test)
+{
+ struct ext4_buddy e4b;
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_free_extent ex;
+ int ret;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i, j;
+ unsigned long start, end, all = 0;
+
+ /* buddy cache assumes that each page contains at least one block */
+ if (sb->s_blocksize > PAGE_SIZE)
+ kunit_skip(test, "blocksize exceeds pagesize");
+
+ ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ ex.fe_group = TEST_GOAL_GROUP;
+ for (j = 0; j < COUNT_FOR_ESTIMATE; j++) {
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ start = jiffies;
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ if (ranges[i].len == 0)
+ continue;
+
+ ex.fe_start = ranges[i].start;
+ ex.fe_len = ranges[i].len;
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(&e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+ }
+ end = jiffies;
+ all += (end - start);
+
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ if (ranges[i].len == 0)
+ continue;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_free_blocks(NULL, &e4b, ranges[i].start,
+ ranges[i].len);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+ }
+ }
+
+ kunit_info(test, "costed jiffies %lu\n", all);
+ ext4_mb_unload_buddy(&e4b);
+}
+
+static const struct mbt_ext4_block_layout mbt_test_layouts[] = {
+ {
+ .blocksize_bits = 10,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+ {
+ .blocksize_bits = 12,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+ {
+ .blocksize_bits = 16,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+};
+
+static void mbt_show_layout(const struct mbt_ext4_block_layout *layout,
+ char *desc)
+{
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d "
+ "blocks_per_group=%d group_count=%d desc_size=%d\n",
+ layout->blocksize_bits, layout->cluster_bits,
+ layout->blocks_per_group, layout->group_count,
+ layout->desc_size);
+}
+KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout);
+
+static struct kunit_case mbt_test_cases[] = {
+ KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_free_blocks_simple, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_generate_buddy, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM_ATTR(test_mb_mark_used_cost, mbt_layouts_gen_params,
+ { .speed = KUNIT_SPEED_SLOW }),
+ {}
+};
+
+static struct kunit_suite mbt_test_suite = {
+ .name = "ext4_mballoc_test",
+ .init = mbt_kunit_init,
+ .exit = mbt_kunit_exit,
+ .test_cases = mbt_test_cases,
+};
+
+kunit_test_suites(&mbt_test_suite);
+
+MODULE_LICENSE("GPL");
new file mode 100644
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/ext4/mballoc.h
+ *
+ * Written by: Alex Tomas <alex@clusterfs.com>
+ *
+ */
+#ifndef _EXT4_MBALLOC_H
+#define _EXT4_MBALLOC_H
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+
+/*
+ * mb_debug() dynamic printk msgs could be used to debug mballoc code.
+ */
+#ifdef CONFIG_EXT4_DEBUG
+#define mb_debug(sb, fmt, ...) \
+ pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt, \
+ current->comm, task_pid_nr(current), sb->s_id, \
+ __FILE__, __LINE__, __func__, ##__VA_ARGS__)
+#else
+#define mb_debug(sb, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
+#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
+
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
+#define MB_DEFAULT_MAX_TO_SCAN 200
+
+/*
+ * How long mballoc must look for a best extent
+ */
+#define MB_DEFAULT_MIN_TO_SCAN 10
+
+/*
+ * with 's_mb_stats' allocator will collect stats that will be
+ * shown at umount. The collecting costs though!
+ */
+#define MB_DEFAULT_STATS 0
+
+/*
+ * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
+ * by the stream allocator, which purpose is to pack requests
+ * as close each to other as possible to produce smooth I/O traffic
+ * We use locality group prealloc space for stream request.
+ * We can tune the same via /proc/fs/ext4/<partition>/stream_req
+ */
+#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */
+
+/*
+ * for which requests use 2^N search using buddies
+ */
+#define MB_DEFAULT_ORDER2_REQS 2
+
+/*
+ * default group prealloc size 512 blocks
+ */
+#define MB_DEFAULT_GROUP_PREALLOC 512
+
+/*
+ * Number of groups to search linearly before performing group scanning
+ * optimization.
+ */
+#define MB_DEFAULT_LINEAR_LIMIT 4
+
+/*
+ * Minimum number of groups that should be present in the file system to perform
+ * group scanning optimizations.
+ */
+#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16
+
+/*
+ * The maximum order upto which CR_BEST_AVAIL_LEN can trim a particular
+ * allocation request. Example, if we have an order 7 request and max trim order
+ * of 3, we can trim this request upto order 4.
+ */
+#define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER 3
+
+/*
+ * Number of valid buddy orders
+ */
+#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2)
+
+struct ext4_free_data {
+ /* this links the free block information from sb_info */
+ struct list_head efd_list;
+
+ /* this links the free block information from group_info */
+ struct rb_node efd_node;
+
+ /* group which free block extent belongs */
+ ext4_group_t efd_group;
+
+ /* free block extent */
+ ext4_grpblk_t efd_start_cluster;
+ ext4_grpblk_t efd_count;
+
+ /* transaction which freed this extent */
+ tid_t efd_tid;
+};
+
+struct ext4_prealloc_space {
+ union {
+ struct rb_node inode_node; /* for inode PA rbtree */
+ struct list_head lg_list; /* for lg PAs */
+ } pa_node;
+ struct list_head pa_group_list;
+ union {
+ struct list_head pa_tmp_list;
+ struct rcu_head pa_rcu;
+ } u;
+ spinlock_t pa_lock;
+ atomic_t pa_count;
+ unsigned pa_deleted;
+ ext4_fsblk_t pa_pstart; /* phys. block */
+ ext4_lblk_t pa_lstart; /* log. block */
+ ext4_grpblk_t pa_len; /* len of preallocated chunk */
+ ext4_grpblk_t pa_free; /* how many blocks are free */
+ unsigned short pa_type; /* pa type. inode or group */
+ union {
+ rwlock_t *inode_lock; /* locks the rbtree holding this PA */
+ spinlock_t *lg_lock; /* locks the lg list holding this PA */
+ } pa_node_lock;
+ struct inode *pa_inode; /* used to get the inode during group discard */
+};
+
+enum {
+ MB_INODE_PA = 0,
+ MB_GROUP_PA = 1
+};
+
+struct ext4_free_extent {
+ ext4_lblk_t fe_logical;
+ ext4_grpblk_t fe_start; /* In cluster units */
+ ext4_group_t fe_group;
+ ext4_grpblk_t fe_len; /* In cluster units */
+};
+
+/*
+ * Locality group:
+ * we try to group all related changes together
+ * so that writeback can flush/allocate them together as well
+ * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
+ * (512). We store prealloc space into the hash based on the pa_free blocks
+ * order value.ie, fls(pa_free)-1;
+ */
+#define PREALLOC_TB_SIZE 10
+struct ext4_locality_group {
+ /* for allocator */
+ /* to serialize allocates */
+ struct mutex lg_mutex;
+ /* list of preallocations */
+ struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
+ spinlock_t lg_prealloc_lock;
+};
+
+struct ext4_allocation_context {
+ struct inode *ac_inode;
+ struct super_block *ac_sb;
+
+ /* original request */
+ struct ext4_free_extent ac_o_ex;
+
+ /* goal request (normalized ac_o_ex) */
+ struct ext4_free_extent ac_g_ex;
+
+ /* the best found extent */
+ struct ext4_free_extent ac_b_ex;
+
+ /* copy of the best found extent taken before preallocation efforts */
+ struct ext4_free_extent ac_f_ex;
+
+ /*
+ * goal len can change in CR_BEST_AVAIL_LEN, so save the original len.
+ * This is used while adjusting the PA window and for accounting.
+ */
+ ext4_grpblk_t ac_orig_goal_len;
+
+ ext4_group_t ac_prefetch_grp;
+ unsigned int ac_prefetch_ios;
+ unsigned int ac_prefetch_nr;
+
+ int ac_first_err;
+
+ __u32 ac_flags; /* allocation hints */
+ __u16 ac_groups_scanned;
+ __u16 ac_found;
+ __u16 ac_cX_found[EXT4_MB_NUM_CRS];
+ __u16 ac_tail;
+ __u16 ac_buddy;
+ __u8 ac_status;
+ __u8 ac_criteria;
+ __u8 ac_2order; /* if request is to allocate 2^N blocks and
+ * N > 0, the field stores N, otherwise 0 */
+ __u8 ac_op; /* operation, for history only */
+
+ struct ext4_buddy *ac_e4b;
+ struct folio *ac_bitmap_folio;
+ struct folio *ac_buddy_folio;
+ struct ext4_prealloc_space *ac_pa;
+ struct ext4_locality_group *ac_lg;
+};
+
+#define AC_STATUS_CONTINUE 1
+#define AC_STATUS_FOUND 2
+#define AC_STATUS_BREAK 3
+
+struct ext4_buddy {
+ struct folio *bd_buddy_folio;
+ void *bd_buddy;
+ struct folio *bd_bitmap_folio;
+ void *bd_bitmap;
+ struct ext4_group_info *bd_info;
+ struct super_block *bd_sb;
+ __u16 bd_blkbits;
+ ext4_group_t bd_group;
+};
+
+static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
+ struct ext4_free_extent *fex)
+{
+ return ext4_group_first_block_no(sb, fex->fe_group) +
+ (fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
+}
+
+static inline loff_t extent_logical_end(struct ext4_sb_info *sbi,
+ struct ext4_free_extent *fex)
+{
+ /* Use loff_t to avoid end exceeding ext4_lblk_t max. */
+ return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len);
+}
+
+static inline loff_t pa_logical_end(struct ext4_sb_info *sbi,
+ struct ext4_prealloc_space *pa)
+{
+ /* Use loff_t to avoid end exceeding ext4_lblk_t max. */
+ return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len);
+}
+
+typedef int (*ext4_mballoc_query_range_fn)(
+ struct super_block *sb,
+ ext4_group_t agno,
+ ext4_grpblk_t start,
+ ext4_grpblk_t len,
+ void *priv);
+
+int
+ext4_mballoc_query_range(
+ struct super_block *sb,
+ ext4_group_t agno,
+ ext4_grpblk_t start,
+ ext4_grpblk_t end,
+ ext4_mballoc_query_range_fn meta_formatter,
+ ext4_mballoc_query_range_fn formatter,
+ void *priv);
+
+#endif