From 60e07cf515e541ea3e13b888d273c9b19a2ad9dd Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sun, 18 Dec 2011 15:49:54 -0500 Subject: [PATCH 01/32] ext4: do not reference pa_inode from group_pa pa_inode in group_pa is set NULL in ext4_mb_new_group_pa, so pa_inode should be not referenced. Reported-by: Wu Fengguang Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- include/trace/events/ext4.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e2d8be8f28bf..cb990b21c698 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3671,7 +3671,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(pa); + trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 748ff7cbe555..319538bf17d2 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -573,9 +573,9 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ); TRACE_EVENT(ext4_mb_release_group_pa, - TP_PROTO(struct ext4_prealloc_space *pa), + TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa), - TP_ARGS(pa), + TP_ARGS(sb, pa), TP_STRUCT__entry( __field( dev_t, dev ) @@ -585,7 +585,7 @@ TRACE_EVENT(ext4_mb_release_group_pa, ), TP_fast_assign( - __entry->dev = pa->pa_inode->i_sb->s_dev; + __entry->dev = sb->s_dev; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), From 5635a62b83c04d05e4eb4575a1c3de51a35bacdc Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 18 Dec 2011 16:13:58 -0500 Subject: [PATCH 02/32] ext4: add missing space to ext4_msg output in ext4_fill_super() Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/fs/ext4/super.c b/fs/ext4/super.c index 3e1329e2f826..35377d57ec4c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3508,7 +3508,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * of the filesystem. */ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { - ext4_msg(sb, KERN_WARNING, "bad geometry: first data" + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); From acd6ad83517639e8f09a8c5525b1dccd81cd2a10 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 18 Dec 2011 17:37:02 -0500 Subject: [PATCH 03/32] ext4: fix error handling on inode bitmap corruption When insert_inode_locked() fails in ext4_new_inode() it most likely means inode bitmap got corrupted and we allocated again inode which is already in use. Also doing unlock_new_inode() during error recovery is wrong since the inode does not have I_NEW set. Fix the problem by jumping to fail: (instead of fail_drop:) which declares filesystem error and does not call unlock_new_inode(). Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 00beb4f9cc4f..8fb6844f9734 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -885,8 +885,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. 
+ */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; From 14d7f3efe923bc60839c65f9818793c64b4d708b Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Sun, 18 Dec 2011 17:39:02 -0500 Subject: [PATCH 04/32] ext4: remove unused local variable In get_implied_cluster_alloc(), rr_cluster_end was being defined and set, but was never used. Removed this. Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 607b1557d292..4423b11476af 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3625,7 +3625,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); ext4_lblk_t ex_cluster_start, ex_cluster_end; - ext4_lblk_t rr_cluster_start, rr_cluster_end; + ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len = ext4_ext_get_actual_len(ex); @@ -3636,7 +3636,6 @@ static int get_implied_cluster_alloc(struct super_block *sb, /* The requested region passed into ext4_map_blocks() */ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); - rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1); if ((rr_cluster_start == ex_cluster_end) || (rr_cluster_start == ex_cluster_start)) { From 8c48f7e88e293b9dd422bd8884842aea85d30b22 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sun, 18 Dec 2011 23:05:43 -0500 Subject: [PATCH 05/32] ext4: optimize ext4_find_delalloc_range() in nodelalloc mode We found performance regression when using bigalloc with "nodelalloc" (1MB cluster size): 1. mke2fs -C 1048576 -O ^has_journal,bigalloc /dev/sda 2. mount -o nodelalloc /dev/sda /test/ 3. 
time dd if=/dev/zero of=/test/io bs=1048576 count=1024 The "dd" will cost about 2 seconds to finish, but if we mke2fs without "bigalloc", "dd" will only cost less than 1 second. The reason is: when using ext4 with "nodelalloc", it will call ext4_find_delalloc_cluster() nearly every time it calls ext4_ext_map_blocks(), and ext4_find_delalloc_range() will also scan all pages in cluster because no buffer is "delayed". A cluster has 256 pages (1MB cluster), so it will scan 256 * 256k pages when creating a 1G file. That severely hurts the performance. Therefore, we return immediately from ext4_find_delalloc_range() in nodelalloc mode, since by definition there can't be any delalloc pages. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4423b11476af..5684f2510921 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3281,6 +3281,9 @@ static int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t i, pg_lblk; pgoff_t index; + if (!test_opt(inode->i_sb, DELALLOC)) + return 0; + /* reverse search wont work if fs block size is less than page size */ if (inode->i_blkbits < PAGE_CACHE_SHIFT) search_hint_reverse = 0; From 22cdfca5641817060dd724a9c30442f5c0675fcd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 21 Dec 2011 14:14:31 -0500 Subject: [PATCH 06/32] ext4: remove unneeded file_remove_suid() from ext4_ioctl() In the code to support EXT4_IOC_MOVE_EXT, ext4_ioctl calls file_remove_suid() after the call to ext4_move_extents() if any extents have been moved. There are at least three things wrong with this. First, file_remove_suid() should be called with i_mutex down, which it is not here. Second, it should be called before the donor file has been modified, to avoid a potential race condition. 
Third, and most importantly, it's pointless, because ext4_file_extents() already checks if the donor file has the setuid or setgid bit set, and will return an error in that case. So the first two objections don't really matter, since file_remove_suid() will never need to modify the inode in any case. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ioctl.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a56796814d6a..ff1aab7cd6e8 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -247,8 +247,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write(filp->f_path.mnt); - if (me.moved_len > 0) - file_remove_suid(donor_filp); if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) From 2aff57b0c052344e8401a8b4a33c2a1ecb0f627c Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 28 Dec 2011 12:02:13 -0500 Subject: [PATCH 07/32] ext4: allocate delalloc blocks before changing journal mode delalloc blocks should be allocated before changing journal mode, otherwise they cannot be allocated; moreover, a truncate on delalloc blocks could trigger a BUG by flushing delalloc buffers. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 92655fd89657..cb0ba9d77a8e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4647,6 +4647,17 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return 0; if (is_journal_aborted(journal)) return -EROFS; + /* We have to allocate physical blocks for delalloc blocks + * before flushing journal. otherwise delalloc blocks can not + * be allocated any more. even more truncate on delalloc blocks + * could trigger BUG by flushing delalloc blocks in journal. + * There is no delalloc block in non-journal data mode. 
+ */ + if (val && test_opt(inode->i_sb, DELALLOC)) { + err = ext4_alloc_da_blocks(inode); + if (err < 0) + return err; + } jbd2_journal_lock_updates(journal); jbd2_journal_flush(journal); From 5872ddaaf05bf25e3ab90580295ebc946405928c Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 28 Dec 2011 13:55:51 -0500 Subject: [PATCH 08/32] ext4: flush journal when switching from data=journal mode It's necessary to flush the journal when switching away from data=journal mode. This is because there are no revoke records when data blocks are journalled, but revoke records are required in the other journal modes. However, it is not necessary to flush the journal when switching into data=journal mode, and flushing the journal is expensive. So let's avoid it in that case. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cb0ba9d77a8e..1254934de693 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4660,7 +4660,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) } jbd2_journal_lock_updates(journal); - jbd2_journal_flush(journal); /* * OK, there are no updates running now, and all cached data is @@ -4672,8 +4671,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); - else + else { + jbd2_journal_flush(journal); ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + } ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); From 1ba37268cd19e5a2a80924bfe8618bf1ba3e8249 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 28 Dec 2011 17:46:46 -0500 Subject: [PATCH 09/32] jbd2: clear revoked flag on buffers before a new transaction started Currently, we clear the revoked flag only when a block is reused. However, this can trigger a false journal error. 
Consider a situation when a block is used as a meta block and is deleted(revoked) in ordered mode, then the block is allocated as a data block to a file. At this moment, user changes the file's journal mode from ordered to journaled and truncates the file. The block will be considered re-revoked by journal because it has revoked flag still pending from the last transaction and an assertion triggers. We fix the problem by keeping the revoked status more uptodate - we clear revoked flag when switching revoke tables to reflect there is no revoked buffers in current transaction any more. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 6 ++++++ fs/jbd2/revoke.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/jbd2.h | 1 + 3 files changed, 41 insertions(+) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 68d704db787f..5069b8475150 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -429,6 +429,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD2: commit phase 1\n"); + /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + jbd2_clear_buffer_revoked_flags(journal); + /* * Switch to a new revoke table. */ diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 69fd93588118..30b2867d6cc9 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -47,6 +47,10 @@ * overwriting the new data. We don't even need to clear the revoke * bit here. * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. 
+ * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up @@ -478,6 +482,36 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) return did_revoke; } +/* + * journal_clear_revoked_flag clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffers in the next + * transaction which is going to be started. + */ +void jbd2_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd2_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd2_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd2_revoke_record_s *)list_entry; + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 2092ea21e469..5557baefed60 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1151,6 +1151,7 @@ extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t); extern void jbd2_journal_clear_revoke(journal_t *); extern void jbd2_journal_switch_revoke_table(journal_t *journal); +extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); /* * The log thread user interface: From 88635ca277adb67db34e88281817d1ce10713553 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Wed, 28 Dec 2011 19:00:25 -0500 Subject: [PATCH 10/32] ext4: add missing spaces to debugging printk's Fix ext4_debug format in ext4_ext_handle_uninitialized_extents() and 
ext4_end_io_dio(). Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++-- fs/ext4/inode.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5684f2510921..b35bb40556dc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3456,8 +3456,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, int err = 0; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" - "block %llu, max_blocks %u, flags %d, allocated %u", + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " + "block %llu, max_blocks %u, flags %x, allocated %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1254934de693..ef9e8fdddfba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2760,7 +2760,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (!io_end || !size) goto out; - ext_debug("ext4_end_io_dio(): io_end 0x%p" + ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); From 14c83c9fddf2e75bdd0c20f1072f35260e356484 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 28 Dec 2011 20:25:13 -0500 Subject: [PATCH 11/32] ext4: avoid counting the number of free inodes twice in find_group_orlov() Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8fb6844f9734..cdafc05d79c9 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -358,7 +358,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = 
EXT4_INODES_PER_GROUP(sb); - unsigned int freei, avefreei; + unsigned int freei, avefreei, grp_free; ext4_fsblk_t freeb, avefreec; unsigned int ndirs; int max_dirs, min_inodes; @@ -477,8 +477,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_inodes_count(sb, desc) >= avefreei) { + grp_free = ext4_free_inodes_count(sb, desc); + if (desc && grp_free && grp_free >= avefreei) { *group = grp; return 0; } From ccb4d7af914e0fe9b2f1022f8ea6c300463fd5e6 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Wed, 28 Dec 2011 20:25:40 -0500 Subject: [PATCH 12/32] ext4: remove no longer used functions in inode.c The functions ext4_block_truncate_page() and ext4_block_zero_page_range() are no longer used, so remove them. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 -- fs/ext4/inode.c | 120 ------------------------------------------------ 2 files changed, 124 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5b0e26a1272d..ae2407f4502a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1880,10 +1880,6 @@ extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from); -extern int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length); extern int ext4_discard_partial_page_buffers(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length, int flags); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ef9e8fdddfba..e6cc24dfa98d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3301,126 +3301,6 @@ int 
ext4_discard_partial_page_buffers_no_lock(handle_t *handle, return err; } -/* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; - - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); - - return ext4_block_zero_page_range(handle, mapping, from, length); -} - -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length) -{ - ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; - ext4_lblk_t iblock; - struct inode *inode = mapping->host; - struct buffer_head *bh; - struct page *page; - int err = 0; - - page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, - mapping_gfp_mask(mapping) & ~__GFP_FS); - if (!page) - return -ENOMEM; - - blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; - - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - - /* Find the buffer that 
contains "offset" */ - bh = page_buffers(page); - pos = blocksize; - while (offset >= pos) { - bh = bh->b_this_page; - iblock++; - pos += blocksize; - } - - err = 0; - if (buffer_freed(bh)) { - BUFFER_TRACE(bh, "freed: skip"); - goto unlock; - } - - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "unmapped"); - ext4_get_block(inode, iblock, bh, 0); - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "still unmapped"); - goto unlock; - } - } - - /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) - goto unlock; - } - - if (ext4_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto unlock; - } - - zero_user(page, offset, length); - - BUFFER_TRACE(bh, "zeroed end of block"); - - err = 0; - if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); - } else - mark_buffer_dirty(bh); - -unlock: - unlock_page(page); - page_cache_release(page); - return err; -} - int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) From 597d508c17a6dcd17770f4dd9da873d93cc15493 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 28 Dec 2011 20:32:07 -0500 Subject: [PATCH 13/32] ext4: use proper little-endian bitops ext4_{set,clear}_bit() is defined as __test_and_{set,clear}_bit_le() for ext4. Only two ext4_{set,clear}_bit() calls check the return value. The rest of calls ignore the return value and they can be replaced with __{set,clear}_bit_le(). This changes ext4_{set,clear}_bit() from __test_and_{set,clear}_bit_le() to __{set,clear}_bit_le() and introduces ext4_test_and_{set,clear}_bit() for the two places where old bit needs to be returned. 
This ext4_{set,clear}_bit() change is considered safe, because if someone uses these macros without noticing the change, new ext4_{set,clear}_bit don't have return value and causes compiler errors where the return value is used. This also removes unused ext4_find_first_zero_bit(). Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 ++++--- fs/ext4/ialloc.c | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ae2407f4502a..0e43bba049a9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -957,12 +957,13 @@ struct ext4_inode_info { #define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ EXT4_MOUNT2_##opt) -#define ext4_set_bit __test_and_set_bit_le +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le #define ext4_set_bit_atomic ext2_set_bit_atomic -#define ext4_clear_bit __test_and_clear_bit_le +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le #define ext4_clear_bit_atomic ext2_clear_bit_atomic #define ext4_test_bit test_bit_le -#define ext4_find_first_zero_bit find_first_zero_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index cdafc05d79c9..72fc9892231f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,7 +252,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) fatal = ext4_journal_get_write_access(handle, bh2); } ext4_lock_group(sb, block_group); - cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); if (fatal || !cleared) { ext4_unlock_group(sb, block_group); goto out; @@ -618,7 +618,7 @@ static int ext4_claim_inode(struct super_block *sb, */ down_read(&grp->alloc_sem); ext4_lock_group(sb, group); - if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { + if (ext4_test_and_set_bit(ino, 
inode_bitmap_bh->b_data)) { /* not a free inode */ retval = 1; goto err_ret; From 18e3143848f1abdd07e7d9879cf67f4e147ff8b7 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:18:50 -0500 Subject: [PATCH 14/32] ext4: add a function which extends a group without checking parameters This patch added a function named ext4_group_extend_no_check() whose code is copied from ext4_group_extend(). ext4_group_extend_no_check() assumes the parameter is valid and has been checked by caller. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 996780ab4f4e..1c3c2b56049d 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -968,6 +968,57 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) return err; } /* ext4_group_add */ +/* + * extend a group without checking assuming that checking has been done. + */ +static int ext4_group_extend_no_check(struct super_block *sb, + ext4_fsblk_t o_blocks_count, ext4_grpblk_t add) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + handle_t *handle; + int err = 0, err2; + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext4_group_add_blocks(). 
+ */ + handle = ext4_journal_start_sb(sb, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_warning(sb, "error %d on journal start", err); + return err; + } + + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) { + ext4_warning(sb, "error %d on journal write access", err); + goto errout; + } + + ext4_blocks_count_set(es, o_blocks_count + add); + ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + /* We add the blocks to the bitmap and set the group need init bit */ + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); + if (err) + goto errout; + ext4_handle_dirty_super(handle, sb); + ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); +errout: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + + if (!err) { + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extended group to %llu " + "blocks\n", ext4_blocks_count(es)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); + } + return err; +} + /* * Extend the filesystem to the new number of blocks specified. This entry * point is only used to extend the current filesystem to the end of the last From bb08c1e7d8c072da338f6d905a89376b36023017 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:20:50 -0500 Subject: [PATCH 15/32] ext4: add a function which adds a new group descriptors to a fs This patch adds a function named ext4_add_new_descs() which adds one or more new group descriptors to a fs and whose code is copied from ext4_group_add(). The function will be used by new resize implementation. 
Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 1c3c2b56049d..3bb4e7b502ec 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -735,6 +735,52 @@ static void update_backups(struct super_block *sb, } } +/* + * ext4_add_new_descs() adds @count group descriptor of groups + * starting at @group + * + * @handle: journal handle + * @sb: super block + * @group: the group no. of the first group desc to be added + * @resize_inode: the resize inode + * @count: number of group descriptors to be added + */ +static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, + ext4_group_t group, struct inode *resize_inode, + ext4_group_t count) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *gdb_bh; + int i, gdb_off, gdb_num, err = 0; + + for (i = 0; i < count; i++, group++) { + int reserved_gdb = ext4_bg_has_super(sb, group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + gdb_bh = sbi->s_group_desc[gdb_num]; + err = ext4_journal_get_write_access(handle, gdb_bh); + + if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) + err = reserve_backup_gdb(handle, resize_inode, group); + } else + err = add_new_gdb(handle, resize_inode, group); + if (err) + break; + } + return err; +} + /* Add group descriptor data to an existing or new group descriptor block. 
* Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it From 28c7bac0091687e6116ebd6c179e154ae4053c90 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:22:50 -0500 Subject: [PATCH 16/32] ext4: add a structure which will be used by 64bit-resize interface This patch adds a structure which will be used by 64bit-resize interface. Two functions which allocate and destroy the structure respectively are added. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3bb4e7b502ec..6076d5e4b513 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -134,6 +134,61 @@ static int verify_group_input(struct super_block *sb, return err; } +/* + * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex + * group each time. + */ +struct ext4_new_flex_group_data { + struct ext4_new_group_data *groups; /* new_group_data for groups + in the flex group */ + __u16 *bg_flags; /* block group flags of groups + in @groups */ + ext4_group_t count; /* number of groups in @groups + */ +}; + +/* + * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of + * @flexbg_size. + * + * Returns NULL on failure otherwise address of the allocated structure. 
+ */ +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) +{ + struct ext4_new_flex_group_data *flex_gd; + + flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); + if (flex_gd == NULL) + goto out3; + + flex_gd->count = flexbg_size; + + flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * + flexbg_size, GFP_NOFS); + if (flex_gd->groups == NULL) + goto out2; + + flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS); + if (flex_gd->bg_flags == NULL) + goto out1; + + return flex_gd; + +out1: + kfree(flex_gd->groups); +out2: + kfree(flex_gd); +out3: + return NULL; +} + +static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) +{ + kfree(flex_gd->bg_flags); + kfree(flex_gd->groups); + kfree(flex_gd); +} + static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, ext4_fsblk_t blk) { From 33afdcc5402d0abf70ef2dfb96d0b901d20bcc37 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:32:52 -0500 Subject: [PATCH 17/32] ext4: add a function which sets up group blocks of a flex bg This patch adds a function named setup_new_flex_group_blocks() which sets up group blocks of a flex bg. 
Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 8 ++ fs/ext4/resize.c | 250 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 258 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0e43bba049a9..05058e2b7f4f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -511,6 +511,14 @@ struct ext4_new_group_data { __u32 free_blocks_count; }; +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + /* * Flags used by ext4_map_blocks() */ diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6076d5e4b513..e8ccb2f8f45b 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -233,6 +233,256 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh) return 0; } +/* + * set_flexbg_block_bitmap() marks @count blocks starting from @block as used. + * + * Helper function for setup_new_flex_group_blocks().
+ * + * @sb: super block + * @handle: journal handle + * @flex_gd: flex group data + */ +static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t block, ext4_group_t count) +{ + ext4_group_t count2; + + ext4_debug("mark blocks [%llu/%u] used\n", block, count); + for (count2 = count; count > 0; count -= count2, block += count2) { + ext4_fsblk_t start; + struct buffer_head *bh; + ext4_group_t group; + int err; + + ext4_get_group_no_and_offset(sb, block, &group, NULL); + start = ext4_group_first_block_no(sb, group); + group -= flex_gd->groups[0].group; + + count2 = sb->s_blocksize * 8 - (block - start); + if (count2 > count) + count2 = count; + + if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) { + BUG_ON(flex_gd->count > 1); + continue; + } + + err = extend_or_restart_transaction(handle, 1); + if (err) + return err; + + bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); + if (!bh) + return -EIO; + + err = ext4_journal_get_write_access(handle, bh); + if (err) + return err; + ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, + block - start, count2); + ext4_set_bits(bh->b_data, block - start, count2); + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + return err; + brelse(bh); + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new groups. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + * + * setup_new_flex_group_blocks handles a flex group as follow: + * 1. copy super block and GDT, and initialize group tables if necessary. + * In this step, we only set bits in blocks bitmaps for blocks taken by + * super block and GDT. + * 2. 
allocate group tables in block bitmaps, that is, set bits in block + * bitmap for blocks taken by group tables. + */ +static int setup_new_flex_group_blocks(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group}; + ext4_fsblk_t start; + ext4_fsblk_t block; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + __u16 *bg_flags = flex_gd->bg_flags; + handle_t *handle; + ext4_group_t group, count; + struct buffer_head *bh = NULL; + int reserved_gdb, i, j, err = 0, err2; + + BUG_ON(!flex_gd->count || !group_data || + group_data[0].group != sbi->s_groups_count); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + + /* This transaction may be extended/restarted along the way */ + handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + group = group_data[0].group; + for (i = 0; i < flex_gd->count; i++, group++) { + unsigned long gdblocks; + + gdblocks = ext4_bg_num_gdb(sb, group); + start = ext4_group_first_block_no(sb, group); + + /* Copy all of the GDT blocks into the backup in this group */ + for (j = 0, block = start + 1; j < gdblocks; j++, block++) { + struct buffer_head *gdb; + + ext4_debug("update backup group %#04llx\n", block); + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + gdb = sb_getblk(sb, block); + if (!gdb) { + err = -EIO; + goto out; + } + + err = ext4_journal_get_write_access(handle, gdb); + if (err) { + brelse(gdb); + goto out; + } + memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data, + gdb->b_size); + set_buffer_uptodate(gdb); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto out; + } + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor + * table blocks + */ + if (ext4_bg_has_super(sb, group)) { + err = 
sb_issue_zeroout(sb, gdblocks + start + 1, + reserved_gdb, GFP_NOFS); + if (err) + goto out; + } + + /* Initialize group tables of the grop @group */ + if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) + goto handle_bb; + + /* Zero out all of the inode table blocks */ + block = group_data[i].inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, + GFP_NOFS); + if (err) + goto out; + +handle_bb: + if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT) + goto handle_ib; + + /* Initialize block bitmap of the @group */ + block = group_data[i].block_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + if (ext4_bg_has_super(sb, group)) { + ext4_debug("mark backup superblock %#04llx (+0)\n", + start); + ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + + 1); + } + ext4_mark_bitmap_end(group_data[i].blocks_count, + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); + +handle_ib: + if (bg_flags[i] & EXT4_BG_INODE_UNINIT) + continue; + + /* Initialize inode bitmap of the @group */ + block = group_data[i].inode_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + /* Mark unused entries in inode bitmap used */ + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); + } + bh = NULL; + + /* Mark group tables in block bitmap */ + for (j = 0; j < GROUP_TABLE_COUNT; j++) { + count = group_table_count[j]; + start = (&group_data[0].block_bitmap)[j]; + block = start; + for (i = 1; i < flex_gd->count; i++) { + block += group_table_count[j]; + if (block == 
(&group_data[i].block_bitmap)[j]) { + count += group_table_count[j]; + continue; + } + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + count = group_table_count[j]; + start = group_data[i].block_bitmap; + block = start; + } + + if (count) { + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + } + } + +out: + brelse(bh); + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + + return err; +} + /* * Set up the block and inode bitmaps, and the inode table for the new group. * This doesn't need to be part of the main transaction, since we are only From 083f5b24cc55448e0602a807a5c2872e1f3796e2 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:37:31 -0500 Subject: [PATCH 18/32] ext4: add a function which sets up block group descriptors of a flex bg This patch adds a function named ext4_setup_new_descs which sets up the block group descriptors of a flex bg. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e8ccb2f8f45b..098bdb8e97cb 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1086,6 +1086,62 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, return err; } +/* + * ext4_setup_new_descs() will set up the group descriptors of a flex bg + */ +static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_group_desc *gdp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *gdb_bh; + ext4_group_t group; + __u16 *bg_flags = flex_gd->bg_flags; + int i, gdb_off, gdb_num, err = 0; + + + for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { + group = group_data->group; + + gdb_off = group
% EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * get_write_access() has been called on gdb_bh by ext4_add_new_desc(). + */ + gdb_bh = sbi->s_group_desc[gdb_num]; + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); + + memset(gdp, 0, EXT4_DESC_SIZE(sb)); + ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); + ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); + ext4_inode_table_set(sb, gdp, group_data->inode_table); + ext4_free_group_clusters_set(sb, gdp, + EXT4_B2C(sbi, group_data->free_blocks_count)); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags = cpu_to_le16(*bg_flags); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + break; + } + + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_groupinfo(sb, group, gdp); + if (err) + break; + } + return err; +} + /* Add group descriptor data to an existing or new group descriptor block. * Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it From 2e10e2f2e5a800a54ad2f16dfdd8c034e005958b Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:41:39 -0500 Subject: [PATCH 19/32] ext4: add a function which updates the super block during online resizing This patch adds a function named ext4_update_super() which updates super block so the newly created block groups are visible to the file system. This code is copied from ext4_group_add(). The function will be used by new resize implementation. 
Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 098bdb8e97cb..eb0aebcca55f 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1142,6 +1142,100 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, return err; } +/* + * ext4_update_super() updates the super block so that the newly added + * groups can be seen by the filesystem. + * + * @sb: super block + * @flex_gd: new added groups + */ +static void ext4_update_super(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + ext4_fsblk_t blocks_count = 0; + ext4_fsblk_t free_blocks = 0; + ext4_fsblk_t reserved_blocks = 0; + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + for (i = 0; i < flex_gd->count; i++) { + blocks_count += group_data[i].blocks_count; + free_blocks += group_data[i].free_blocks_count; + } + + reserved_blocks = ext4_r_blocks_count(es) * 100; + do_div(reserved_blocks, ext4_blocks_count(es)); + reserved_blocks *= blocks_count; + do_div(reserved_blocks, 100); + + ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. 
+ * + * The precise rules we use are: + * + * * Writers must perform a smp_wmb() after updating all + * dependent data and before modifying the groups count + * + * * Readers must perform an smp_rmb() after reading the groups + * count and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count += flex_gd->count; + + /* Update the reserved block counts only once the new group is + * active. */ + ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + + reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, free_blocks)); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG) && + sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, group_data[0].group); + atomic_add(EXT4_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); + atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, + &sbi->s_flex_groups[flex_group].free_inodes); + } + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: added group %u:" + "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, + blocks_count, free_blocks, reserved_blocks); +} + /* Add group descriptor data to an existing or new group descriptor block. 
* Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it From c72df9f928efd5b17e84bdb7b8ec1be3b9c1ea9d Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:43:39 -0500 Subject: [PATCH 20/32] ext4: pass verify_reserved_gdb() the number of group decriptors The 64bit resizer adds a flex group each time, so verify_reserved_gdb can not use s_groups_count directly, it should use the number of group decriptors before the added group. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index eb0aebcca55f..12eace096546 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -656,10 +656,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, * groups in current filesystem that have BACKUPS, or -ve error code. */ static int verify_reserved_gdb(struct super_block *sb, + ext4_group_t end, struct buffer_head *primary) { const ext4_fsblk_t blk = primary->b_blocknr; - const ext4_group_t end = EXT4_SB(sb)->s_groups_count; unsigned three = 1; unsigned five = 5; unsigned seven = 7; @@ -734,7 +734,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (!gdb_bh) return -EIO; - gdbackups = verify_reserved_gdb(sb, gdb_bh); + gdbackups = verify_reserved_gdb(sb, group, gdb_bh); if (gdbackups < 0) { err = gdbackups; goto exit_bh; @@ -897,7 +897,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, err = -EIO; goto exit_bh; } - if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + gdbackups = verify_reserved_gdb(sb, group, primary[res]); + if (gdbackups < 0) { brelse(primary[res]); err = gdbackups; goto exit_bh; From 3fbea4b3683a5dfa86489ef7799cbe55e8003dfa Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:44:38 -0500 Subject: [PATCH 21/32] ext4: add a 
new function which allocates bitmaps and inode tables This patch adds a new function named ext4_alloc_group_tables() which allocates block bitmaps, inode bitmaps and inode tables for a flex group and is used by resize code. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 12eace096546..a4075de73c72 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -189,6 +189,117 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) kfree(flex_gd); } +/* + * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps + * and inode tables for a flex group. + * + * This function is used by 64bit-resize. Note that this function allocates + * group tables from the 1st group of groups contained by @flex_gd, which may + * be only part of a flex group. + * + * @sb: super block of fs to which the groups belong + */ +static void ext4_alloc_group_tables(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + int flexbg_size) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + ext4_fsblk_t start_blk; + ext4_fsblk_t last_blk; + ext4_group_t src_group; + ext4_group_t bb_index = 0; + ext4_group_t ib_index = 0; + ext4_group_t it_index = 0; + ext4_group_t group; + ext4_group_t last_group; + unsigned overhead; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + + src_group = group_data[0].group; + last_group = src_group + flex_gd->count - 1; + + BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) != + (last_group & ~(flexbg_size - 1)))); +next_group: + group = group_data[0].group; + start_blk = ext4_group_first_block_no(sb, src_group); + last_blk = start_blk + group_data[src_group - group].blocks_count; + + overhead = ext4_bg_has_super(sb, src_group) ?
+ (1 + ext4_bg_num_gdb(sb, src_group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + + start_blk += overhead; + + BUG_ON(src_group >= group_data[0].group + flex_gd->count); + /* We collect contiguous blocks as much as possible. */ + src_group++; + for (; src_group <= last_group; src_group++) + if (!ext4_bg_has_super(sb, src_group)) + last_blk += group_data[src_group - group].blocks_count; + else + break; + + /* Allocate block bitmaps */ + for (; bb_index < flex_gd->count; bb_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[bb_index].block_bitmap = start_blk++; + ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + } + + /* Allocate inode bitmaps */ + for (; ib_index < flex_gd->count; ib_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[ib_index].inode_bitmap = start_blk++; + ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + } + + /* Allocate inode tables */ + for (; it_index < flex_gd->count; it_index++) { + if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) + goto next_group; + group_data[it_index].inode_table = start_blk; + ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count -= + EXT4_SB(sb)->s_itb_per_group; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + + start_blk += EXT4_SB(sb)->s_itb_per_group; + } + + if (test_opt(sb, DEBUG)) { + int i; + group = group_data[0].group; + + printk(KERN_DEBUG "EXT4-fs: adding a flex group with " + "%d groups, flexbg size is %d:\n", flex_gd->count, + flexbg_size); + + for (i = 0; i < flex_gd->count; i++) { + printk(KERN_DEBUG "adding %s group %u: %u 
" + "blocks (%d free)\n", + ext4_bg_has_super(sb, group + i) ? "normal" : + "no-super", group + i, + group_data[i].blocks_count, + group_data[i].free_blocks_count); + } + } +} + static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, ext4_fsblk_t blk) { From 4bac1f8cef7bfd2c62793f75aba66a5b8357dede Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 3 Jan 2012 23:44:38 -0500 Subject: [PATCH 22/32] ext4: add a new function which adds a flex group to a fs This patch adds a new function named ext4_flex_group_add() which adds a flex group to a fs. The function is used by 64bit-resize interface. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a4075de73c72..dac23561f3eb 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1348,6 +1348,88 @@ static void ext4_update_super(struct super_block *sb, blocks_count, free_blocks, reserved_blocks); } +/* Add a flex group to an fs. Ensure we handle all possible error conditions + * _before_ we start modifying the filesystem, because we cannot abort the + * transaction and not have it write the data to disk. 
+ */ +static int ext4_flex_group_add(struct super_block *sb, + struct inode *resize_inode, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_group_t group; + handle_t *handle; + unsigned reserved_gdb; + int err = 0, err2 = 0, credit; + + BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + o_blocks_count = ext4_blocks_count(es); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + + err = setup_new_flex_group_blocks(sb, flex_gd); + if (err) + goto exit; + /* + * We will always be modifying at least the superblock and GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. 
+ */ + credit = flex_gd->count * 4 + reserved_gdb; + handle = ext4_journal_start_sb(sb, credit); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit; + } + + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto exit_journal; + + group = flex_gd->groups[0].group; + BUG_ON(group != EXT4_SB(sb)->s_groups_count); + err = ext4_add_new_descs(handle, sb, group, + resize_inode, flex_gd->count); + if (err) + goto exit_journal; + + err = ext4_setup_new_descs(handle, sb, flex_gd); + if (err) + goto exit_journal; + + ext4_update_super(sb, flex_gd); + + err = ext4_handle_dirty_super(handle, sb); + +exit_journal: + err2 = ext4_journal_stop(handle); + if (!err) + err = err2; + + if (!err) { + int i; + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); + for (i = 0; i < flex_gd->count; i++, group++) { + struct buffer_head *gdb_bh; + int gdb_num; + gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb); + gdb_bh = sbi->s_group_desc[gdb_num]; + update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, + gdb_bh->b_size); + } + } +exit: + return err; +} + /* Add group descriptor data to an existing or new group descriptor block. * Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it From 19c5246d251640ac76daa4d34165af78c64b1454 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 4 Jan 2012 17:09:44 -0500 Subject: [PATCH 23/32] ext4: add new online resize interface This patch adds a new online resize interface, whose input argument is a 64-bit integer indicating how many blocks there are in the resized fs. In the new resize implementation, all work such as allocating group tables is done on the kernel side, so the new resize interface can support the flex_bg feature and prepares the ground for supporting resize with features like bigalloc and exclude bitmap. Besides this, user-space tools just pass in the new number of blocks.
We delay initializing the bitmaps and inode tables of added groups if possible and add multi groups (a flex groups) each time, so new resize is very fast like mkfs. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 7 ++ fs/ext4/ext4.h | 2 + fs/ext4/ioctl.c | 57 ++++++++++ fs/ext4/resize.c | 177 +++++++++++++++++++++++++++++ 4 files changed, 243 insertions(+) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 4917cf24a5e0..10ec4639f152 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -581,6 +581,13 @@ Table of Ext4 specific ioctls behaviour may change in the future as it is not necessary and has been done this way only for sake of simplicity. + + EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number + of blocks of resized filesystem is passed in via + 64 bit integer argument. The kernel allocates + bitmaps and inode table, the userspace tool thus + just passes the new number of blocks. + .............................................................................. 
References diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 05058e2b7f4f..4bc0e82a9054 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -583,6 +583,7 @@ enum { /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -1929,6 +1930,7 @@ extern int ext4_group_add(struct super_block *sb, extern int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ extern void *ext4_kvmalloc(size_t size, gfp_t flags); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ff1aab7cd6e8..c1a98804a383 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -18,6 +18,8 @@ #include "ext4_jbd2.h" #include "ext4.h" +#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; @@ -329,6 +331,60 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return err; } + case EXT4_IOC_RESIZE_FS: { + ext4_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err = 0, err2 = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_META_BG)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with meta_bg"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, + sizeof(__u64))) { + return -EFAULT; + } + + if (n_blocks_count > MAX_32_NUM && + !EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "File system only 
supports 32-bit block numbers"); + return -EOPNOTSUPP; + } + + err = ext4_resize_begin(sb); + if (err) + return err; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto resizefs_out; + + err = ext4_resize_fs(sb, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write(filp->f_path.mnt); +resizefs_out: + ext4_resize_end(sb); + return err; + } + case FITRIM: { struct request_queue *q = bdev_get_queue(sb->s_bdev); @@ -427,6 +483,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } case EXT4_IOC_MOVE_EXT: case FITRIM: + case EXT4_IOC_RESIZE_FS: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index dac23561f3eb..5fe2a013ee65 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1430,6 +1430,70 @@ static int ext4_flex_group_add(struct super_block *sb, return err; } +static int ext4_setup_next_flex_gd(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t n_blocks_count, + unsigned long flexbg_size) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + ext4_fsblk_t o_blocks_count; + ext4_group_t n_group; + ext4_group_t group; + ext4_group_t last_group; + ext4_grpblk_t last; + ext4_grpblk_t blocks_per_group; + unsigned long i; + + blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb); + + o_blocks_count = ext4_blocks_count(es); + + if (o_blocks_count == n_blocks_count) + return 0; + + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); + + last_group = group | (flexbg_size - 1); + if (last_group > n_group) + last_group = n_group; + + flex_gd->count = last_group - group + 1; + + for (i = 0; i < flex_gd->count; 
i++) { + int overhead; + + group_data[i].group = group + i; + group_data[i].blocks_count = blocks_per_group; + overhead = ext4_bg_has_super(sb, group + i) ? + (1 + ext4_bg_num_gdb(sb, group + i) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + group_data[i].free_blocks_count = blocks_per_group - overhead; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | + EXT4_BG_INODE_UNINIT; + else + flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; + } + + if (last_group == n_group && + EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + /* We need to initialize block bitmap of last group. */ + flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; + + if ((last_group == n_group) && (last != blocks_per_group - 1)) { + group_data[i - 1].blocks_count = last + 1; + group_data[i - 1].free_blocks_count -= blocks_per_group- + last - 1; + } + + return 1; +} + /* Add group descriptor data to an existing or new group descriptor block. 
* Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it @@ -1827,3 +1891,116 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, exit_put: return err; } /* ext4_group_extend */ + +/* + * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count + * + * @sb: super block of the fs to be resized + * @n_blocks_count: the number of blocks resides in the resized fs + */ +int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) +{ + struct ext4_new_flex_group_data *flex_gd = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *bh; + struct inode *resize_inode; + ext4_fsblk_t o_blocks_count; + ext4_group_t o_group; + ext4_group_t n_group; + ext4_grpblk_t offset; + unsigned long n_desc_blocks; + unsigned long o_desc_blocks; + unsigned long desc_blocks; + int err = 0, flexbg_size = 1; + + o_blocks_count = ext4_blocks_count(es); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " + "upto %llu blocks\n", o_blocks_count, n_blocks_count); + + if (n_blocks_count < o_blocks_count) { + /* On-line shrinking not supported */ + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; + } + + if (n_blocks_count == o_blocks_count) + /* Nothing need to do */ + return 0; + + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); + ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); + + n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / + EXT4_DESC_PER_BLOCK(sb); + o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + desc_blocks = n_desc_blocks - o_desc_blocks; + + if (desc_blocks && + (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || + le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) { + ext4_warning(sb, "No reserved GDT 
blocks, can't resize"); + return -EPERM; + } + + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(resize_inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(resize_inode); + } + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, n_blocks_count - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + if (offset != 0) { + /* extend the last group */ + ext4_grpblk_t add; + add = EXT4_BLOCKS_PER_GROUP(sb) - offset; + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + if (err) + goto out; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && + es->s_log_groups_per_flex) + flexbg_size = 1 << es->s_log_groups_per_flex; + + o_blocks_count = ext4_blocks_count(es); + if (o_blocks_count == n_blocks_count) + goto out; + + flex_gd = alloc_flex_gd(flexbg_size); + if (flex_gd == NULL) { + err = -ENOMEM; + goto out; + } + + /* Add flex groups. Note that a regular group is a + * flex group with 1 group. + */ + while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, + flexbg_size)) { + ext4_alloc_group_tables(sb, flex_gd, flexbg_size); + err = ext4_flex_group_add(sb, resize_inode, flex_gd); + if (unlikely(err)) + break; + } + +out: + if (flex_gd) + free_flex_gd(flex_gd); + + iput(resize_inode); + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " + "upto %llu blocks\n", o_blocks_count, n_blocks_count); + return err; +} From d89651c8e222b2d2797bf66d4eb7064459f4f4f4 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 4 Jan 2012 17:09:48 -0500 Subject: [PATCH 24/32] ext4: let ext4_group_extend() use common code ext4_group_extend_no_check() is moved out from ext4_group_extend(), this patch lets ext4_group_extend() call ext4_group_extend_no_check() instead. 
Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 41 ++--------------------------------------- 1 file changed, 2 insertions(+), 39 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 5fe2a013ee65..eba706d9276a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1795,8 +1795,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; - handle_t *handle; - int err, err2; + int err; ext4_group_t group; o_blocks_count = ext4_blocks_count(es); @@ -1852,43 +1851,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, } brelse(bh); - /* We will update the superblock, one block bitmap, and - * one group descriptor via ext4_free_blocks(). - */ - handle = ext4_journal_start_sb(sb, 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - ext4_warning(sb, "error %d on journal start", err); - goto exit_put; - } - - if ((err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh))) { - ext4_warning(sb, "error %d on journal write access", err); - ext4_journal_stop(handle); - goto exit_put; - } - ext4_blocks_count_set(es, o_blocks_count + add); - ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - /* We add the blocks to the bitmap and set the group need init bit */ - err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); - ext4_handle_dirty_super(handle, sb); - ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - err2 = ext4_journal_stop(handle); - if (!err && err2) - err = err2; - - if (err) - goto exit_put; - - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", - ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); -exit_put: + err = ext4_group_extend_no_check(sb, o_blocks_count, add); return err; } /* ext4_group_extend */ 
From 61f296cc49751f1dc992039229d12b0de7e0c2ae Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 4 Jan 2012 17:09:50 -0500 Subject: [PATCH 25/32] ext4: let ext4_group_add() use common code This patch lets ext4_group_add() call ext4_flex_group_add(). Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 309 ++--------------------------------------------- 1 file changed, 10 insertions(+), 299 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index eba706d9276a..f9d948f0eb86 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -594,137 +594,6 @@ static int setup_new_flex_group_blocks(struct super_block *sb, return err; } -/* - * Set up the block and inode bitmaps, and the inode table for the new group. - * This doesn't need to be part of the main transaction, since we are only - * changing blocks outside the actual filesystem. We still do journaling to - * ensure the recovery is correct in case of a failure just after resize. - * If any part of this fails, we simply abort the resize. - */ -static int setup_new_group_blocks(struct super_block *sb, - struct ext4_new_group_data *input) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group); - int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 
- le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; - unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group); - struct buffer_head *bh; - handle_t *handle; - ext4_fsblk_t block; - ext4_grpblk_t bit; - int i; - int err = 0, err2; - - /* This transaction may be extended/restarted along the way */ - handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); - - if (IS_ERR(handle)) - return PTR_ERR(handle); - - BUG_ON(input->group != sbi->s_groups_count); - - /* Copy all of the GDT blocks into the backup in this group */ - for (i = 0, bit = 1, block = start + 1; - i < gdblocks; i++, block++, bit++) { - struct buffer_head *gdb; - - ext4_debug("update backup group %#04llx (+%d)\n", block, bit); - err = extend_or_restart_transaction(handle, 1); - if (err) - goto exit_journal; - - gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; - goto exit_journal; - } - if ((err = ext4_journal_get_write_access(handle, gdb))) { - brelse(gdb); - goto exit_journal; - } - memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); - set_buffer_uptodate(gdb); - err = ext4_handle_dirty_metadata(handle, NULL, gdb); - if (unlikely(err)) { - brelse(gdb); - goto exit_journal; - } - brelse(gdb); - } - - /* Zero out all of the reserved backup group descriptor table blocks */ - ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", - block, sbi->s_itb_per_group); - err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, - GFP_NOFS); - if (err) - goto exit_journal; - - err = extend_or_restart_transaction(handle, 2); - if (err) - goto exit_journal; - - bh = bclean(handle, sb, input->block_bitmap); - if (IS_ERR(bh)) { - err = PTR_ERR(bh); - goto exit_journal; - } - - if (ext4_bg_has_super(sb, input->group)) { - ext4_debug("mark backup group tables %#04llx (+0)\n", start); - ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); - } - - ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, - input->block_bitmap - start); - ext4_set_bit(input->block_bitmap - 
start, bh->b_data); - ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, - input->inode_bitmap - start); - ext4_set_bit(input->inode_bitmap - start, bh->b_data); - - /* Zero out all of the inode table blocks */ - block = input->inode_table; - ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", - block, sbi->s_itb_per_group); - err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); - if (err) - goto exit_bh; - ext4_set_bits(bh->b_data, input->inode_table - start, - sbi->s_itb_per_group); - - - ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, - bh->b_data); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (unlikely(err)) { - ext4_std_error(sb, err); - goto exit_bh; - } - brelse(bh); - /* Mark unused entries in inode bitmap used */ - ext4_debug("clear inode bitmap %#04llx (+%llu)\n", - input->inode_bitmap, input->inode_bitmap - start); - if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; - } - - ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, - bh->b_data); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (unlikely(err)) - ext4_std_error(sb, err); -exit_bh: - brelse(bh); - -exit_journal: - if ((err2 = ext4_journal_stop(handle)) && !err) - err = err2; - - return err; -} - /* * Iterate through the groups which hold BACKUP superblock/GDT copies in an * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before @@ -1509,16 +1378,15 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, */ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) { + struct ext4_new_flex_group_data flex_gd; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 
le16_to_cpu(es->s_reserved_gdt_blocks) : 0; - struct buffer_head *primary = NULL; - struct ext4_group_desc *gdp; struct inode *inode = NULL; - handle_t *handle; int gdb_off, gdb_num; - int err, err2; + int err; + __u16 bg_flags = 0; gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); @@ -1557,172 +1425,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } - if ((err = verify_group_input(sb, input))) - goto exit_put; - - if ((err = setup_new_group_blocks(sb, input))) - goto exit_put; - - /* - * We will always be modifying at least the superblock and a GDT - * block. If we are adding a group past the last current GDT block, - * we will also modify the inode and the dindirect block. If we - * are adding a group with superblock/GDT backups we will also - * modify each of the reserved GDT dindirect blocks. - */ - handle = ext4_journal_start_sb(sb, - ext4_bg_has_super(sb, input->group) ? - 3 + reserved_gdb : 4); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto exit_put; - } - - if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) - goto exit_journal; - - /* - * We will only either add reserved group blocks to a backup group - * or remove reserved blocks for the first group in a new group block. - * Doing both would be mean more complex code, and sane people don't - * use non-sparse filesystems anymore. This is already checked above. - */ - if (gdb_off) { - primary = sbi->s_group_desc[gdb_num]; - if ((err = ext4_journal_get_write_access(handle, primary))) - goto exit_journal; - - if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) { - err = reserve_backup_gdb(handle, inode, input->group); - if (err) - goto exit_journal; - } - } else { - /* - * Note that we can access new group descriptor block safely - * only if add_new_gdb() succeeds. 
- */ - err = add_new_gdb(handle, inode, input->group); - if (err) - goto exit_journal; - primary = sbi->s_group_desc[gdb_num]; - } - - /* - * OK, now we've set up the new group. Time to make it active. - * - * so we have to be safe wrt. concurrent accesses the group - * data. So we need to be careful to set all of the relevant - * group descriptor data etc. *before* we enable the group. - * - * The key field here is sbi->s_groups_count: as long as - * that retains its old value, nobody is going to access the new - * group. - * - * So first we update all the descriptor metadata for the new - * group; then we update the total disk blocks count; then we - * update the groups count to enable the group; then finally we - * update the free space counts so that the system can start - * using the new disk blocks. - */ - - /* Update group descriptor block for new group */ - gdp = (struct ext4_group_desc *)((char *)primary->b_data + - gdb_off * EXT4_DESC_SIZE(sb)); - - memset(gdp, 0, EXT4_DESC_SIZE(sb)); - ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ - ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ - ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count); - ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); - gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); - gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); - - /* - * We can allocate memory for mb_alloc based on the new group - * descriptor - */ - err = ext4_mb_add_groupinfo(sb, input->group, gdp); + err = verify_group_input(sb, input); if (err) - goto exit_journal; + goto out; - /* - * Make the new blocks and inodes valid next. We do this before - * increasing the group count so that once the group is enabled, - * all of its blocks and inodes are already valid. 
- * - * We always allocate group-by-group, then block-by-block or - * inode-by-inode within a group, so enabling these - * blocks/inodes before the group is live won't actually let us - * allocate the new space yet. - */ - ext4_blocks_count_set(es, ext4_blocks_count(es) + - input->blocks_count); - le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb)); - - /* - * We need to protect s_groups_count against other CPUs seeing - * inconsistent state in the superblock. - * - * The precise rules we use are: - * - * * Writers must perform a smp_wmb() after updating all dependent - * data and before modifying the groups count - * - * * Readers must perform an smp_rmb() after reading the groups count - * and before reading any dependent data. - * - * NB. These rules can be relaxed when checking the group count - * while freeing data, as we can only allocate from a block - * group after serialising against the group count, and we can - * only then free after serialising in turn against that - * allocation. - */ - smp_wmb(); - - /* Update the global fs size fields */ - sbi->s_groups_count++; - - err = ext4_handle_dirty_metadata(handle, NULL, primary); - if (unlikely(err)) { - ext4_std_error(sb, err); - goto exit_journal; - } - - /* Update the reserved block counts only once the new group is - * active. 
*/ - ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + - input->reserved_blocks); - - /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, input->free_blocks_count)); - percpu_counter_add(&sbi->s_freeinodes_counter, - EXT4_INODES_PER_GROUP(sb)); - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && - sbi->s_log_groups_per_flex) { - ext4_group_t flex_group; - flex_group = ext4_flex_group(sbi, input->group); - atomic_add(EXT4_B2C(sbi, input->free_blocks_count), - &sbi->s_flex_groups[flex_group].free_clusters); - atomic_add(EXT4_INODES_PER_GROUP(sb), - &sbi->s_flex_groups[flex_group].free_inodes); - } - - ext4_handle_dirty_super(handle, sb); - -exit_journal: - if ((err2 = ext4_journal_stop(handle)) && !err) - err = err2; - if (!err && primary) { - update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); - update_backups(sb, primary->b_blocknr, primary->b_data, - primary->b_size); - } -exit_put: + flex_gd.count = 1; + flex_gd.groups = input; + flex_gd.bg_flags = &bg_flags; + err = ext4_flex_group_add(sb, inode, &flex_gd); +out: iput(inode); return err; } /* ext4_group_add */ From 014a1770371a028d22f364718c805f4216911ecd Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 4 Jan 2012 17:09:52 -0500 Subject: [PATCH 26/32] ext4: add missing ext4_resize_end on error paths Online resize ioctls 'EXT4_IOC_GROUP_EXTEND' and 'EXT4_IOC_GROUP_ADD' call ext4_resize_begin() to check permissions and to set the EXT4_RESIZING bit lock, they do their work and they must finish with ext4_resize_end() which calls clear_bit_unlock() to unlock and to avoid -EBUSY errors for the next resize operations. This patch adds the missing ext4_resize_end() calls on error paths. Patch tested. 
Cc: stable@vger.kernel.org Signed-off-by: Djalal Harouni Signed-off-by: "Theodore Ts'o" --- fs/ext4/ioctl.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c1a98804a383..b81a5f1b6976 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -184,19 +184,22 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) return err; - if (get_user(n_blocks_count, (__u32 __user *)arg)) - return -EFAULT; + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto group_extend_out; } err = mnt_want_write(filp->f_path.mnt); if (err) - return err; + goto group_extend_out; err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { @@ -206,9 +209,10 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } if (err == 0) err = err2; - mnt_drop_write(filp->f_path.mnt); - ext4_resize_end(sb); + mnt_drop_write(filp->f_path.mnt); +group_extend_out: + ext4_resize_end(sb); return err; } @@ -267,19 +271,22 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return err; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, - sizeof(input))) - return -EFAULT; + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto group_add_out; } err = mnt_want_write(filp->f_path.mnt); if (err) - return err; + goto group_add_out; err = ext4_group_add(sb, &input); if (EXT4_SB(sb)->s_journal) { @@ -289,9 +296,10 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned 
long arg) } if (err == 0) err = err2; - mnt_drop_write(filp->f_path.mnt); - ext4_resize_end(sb); + mnt_drop_write(filp->f_path.mnt); +group_add_out: + ext4_resize_end(sb); return err; } From 1d526fc91bea04ee35b7599bf8b82f86c0aaf46c Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 4 Jan 2012 21:22:51 -0500 Subject: [PATCH 27/32] ext4: Report max_batch_time option correctly Currently the value reported for max_batch_time is really the value of min_batch_time. Reported-by: Russell Coker Signed-off-by: Ben Hutchings --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 35377d57ec4c..36570b7af7c5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1096,7 +1096,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) } if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { seq_printf(seq, ",max_batch_time=%u", - (unsigned) sbi->s_min_batch_time); + (unsigned) sbi->s_max_batch_time); } /* From 9b90e5e02896406a6da28a376568003d14c06770 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 4 Jan 2012 22:01:53 -0500 Subject: [PATCH 28/32] ext4: reserve new feature flag codepoints Reserve the ext4 features flags EXT4_FEATURE_RO_COMPAT_METADATA_CSUM, EXT4_FEATURE_INCOMPAT_INLINEDATA, and EXT4_FEATURE_INCOMPAT_LARGEDIR. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4bc0e82a9054..13d15149c85c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1407,6 +1407,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1419,6 +1420,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ From 9837d8e982b7e87a7207f90618e45d460e196e6a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jan 2012 22:03:11 -0500 Subject: [PATCH 29/32] jbd2: fix hung processes in jbd2_journal_lock_updates() Toshiyuki Okajima found out that when running for ((i=0; i < 100000; i++)); do if ((i%2 == 0)); then chattr +j /mnt/file else chattr -j /mnt/file fi echo "0" >> /mnt/file done process sometimes hangs indefinitely in jbd2_journal_lock_updates(). Toshiyuki identified that the following race happens: jbd2_journal_lock_updates() |jbd2_journal_stop() ---------------------------------------+--------------------------------------- write_lock(&journal->j_state_lock) | . ++journal->j_barrier_count | . spin_lock(&tran->t_handle_lock) | . 
atomic_read(&tran->t_updates) //not 0 | | atomic_dec_and_test(&tran->t_updates) | // t_updates = 0 | wake_up(&journal->j_wait_updates) prepare_to_wait() | // no process is woken up. spin_unlock(&tran->t_handle_lock) | write_unlock(&journal->j_state_lock) | schedule() // never return | We fix the problem by first calling prepare_to_wait() and only after that checking t_updates in jbd2_journal_lock_updates(). Reported-and-analyzed-by: Toshiyuki Okajima Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a0e41a4c080e..35ae096bed5d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -517,12 +517,13 @@ void jbd2_journal_lock_updates(journal_t *journal) break; spin_lock(&transaction->t_handle_lock); - if (!atomic_read(&transaction->t_updates)) { - spin_unlock(&transaction->t_handle_lock); - break; - } prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&transaction->t_updates)) { + spin_unlock(&transaction->t_handle_lock); + finish_wait(&journal->j_wait_updates, &wait); + break; + } spin_unlock(&transaction->t_handle_lock); write_unlock(&journal->j_state_lock); schedule(); From 176576dbc8141528557eeeb007af2d5a2a4891ef Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 4 Jan 2012 22:32:12 -0500 Subject: [PATCH 30/32] ext4: make local symbol ext4_initxattrs static The ext4_initxattrs symbol is used only in this file, so it should be declared static. 
Signed-off-by: Djalal Harouni Signed-off-by: "Theodore Ts'o" --- fs/ext4/xattr_security.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 34e4350dd4d9..e247f8bca8e4 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -48,8 +48,9 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int +ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { const struct xattr *xattr; handle_t *handle = fs_info; From 5f163cc759a9fa8844a4efcf1f579dc5b2ca2491 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 4 Jan 2012 22:33:28 -0500 Subject: [PATCH 31/32] ext4: make more symbols static A couple more functions can reasonably be made static if desired. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 4 +++- fs/ext4/ext4.h | 5 ----- fs/ext4/inode.c | 5 ++++- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 12ccacda44e0..f9e2cd8cf711 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -23,6 +23,8 @@ #include +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -668,7 +670,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) * This function returns the number of file system metadata clusters at * the beginning of a block group, including the reserved gdt blocks. 
*/ -unsigned ext4_num_base_meta_clusters(struct super_block *sb, +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 13d15149c85c..e7dc9ad73941 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1803,8 +1803,6 @@ extern void ext4_init_block_bitmap(struct super_block *sb, extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); -extern unsigned ext4_num_base_meta_clusters(struct super_block *sb, - ext4_group_t block_group); extern unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); @@ -1896,9 +1894,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_discard_partial_page_buffers(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length, int flags); -extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e6cc24dfa98d..a526684cbe3e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -72,6 +72,9 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); +static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags); /* * Test whether an inode is a fast symlink. 
@@ -3161,7 +3164,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle, * * Returns zero on sucess or negative on failure. */ -int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, +static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, struct inode *inode, struct page *page, loff_t from, loff_t length, int flags) { From d50f2ab6f050311dbf7b8f5501b25f0bf64a439b Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Tue, 10 Jan 2012 11:51:10 -0500 Subject: [PATCH 32/32] ext4: fix undefined behavior in ext4_fill_flex_info() Commit 503358ae01b70ce6909d19dd01287093f6b6271c ("ext4: avoid divide by zero when trying to mount a corrupted file system") fixes CVE-2009-4307 by performing a sanity check on s_log_groups_per_flex, since it can be set to a bogus value by an attacker. sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; groups_per_flex = 1 << sbi->s_log_groups_per_flex; if (groups_per_flex < 2) { ... } This patch fixes two potential issues in the previous commit. 1) The sanity check might only work on architectures like PowerPC. On x86, 5 bits are used for the shifting amount. That means, given a large s_log_groups_per_flex value like 36, groups_per_flex = 1 << 36 is essentially 1 << 4 = 16, rather than 0. This will bypass the check, leaving s_log_groups_per_flex and groups_per_flex inconsistent. 2) The sanity check relies on undefined behavior, i.e., oversized shift. A standard-conforming C compiler could rewrite the check in unexpected ways. Consider the following equivalent form, assuming groups_per_flex is unsigned for simplicity. groups_per_flex = 1 << sbi->s_log_groups_per_flex; if (groups_per_flex == 0 || groups_per_flex == 1) { We compile the code snippet using Clang 3.0 and GCC 4.6. Clang will completely optimize away the check groups_per_flex == 0, leaving the patched code as vulnerable as the original. GCC keeps the check, but there is no guarantee that future versions will do the same. 
Signed-off-by: Xi Wang Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/super.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 36570b7af7c5..108c3af8617b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2006,17 +2006,16 @@ static int ext4_fill_flex_info(struct super_block *sb) struct ext4_group_desc *gdp = NULL; ext4_group_t flex_group_count; ext4_group_t flex_group; - int groups_per_flex = 0; + unsigned int groups_per_flex = 0; size_t size; int i; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - - if (groups_per_flex < 2) { + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } + groups_per_flex = 1 << sbi->s_log_groups_per_flex; /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +