linux_dsm_epyc7002/fs/ext4/extents_status.h

259 lines
8.0 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* fs/ext4/extents_status.h
*
* Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
* Modified by
* Allison Henderson <achender@linux.vnet.ibm.com>
* Zheng Liu <wenqing.lz@taobao.com>
*
*/
#ifndef _EXT4_EXTENTS_STATUS_H
#define _EXT4_EXTENTS_STATUS_H
/*
* Turn on ES_DEBUG__ to get lots of info about extent status operations.
*/
#ifdef ES_DEBUG__
#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
#else
#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
/*
* With ES_AGGRESSIVE_TEST defined, the result of es caching will be
* checked with old map_block's result.
*/
#define ES_AGGRESSIVE_TEST__
/*
* These flags live in the high bits of extent_status.es_pblk
*/
enum {
ES_WRITTEN_B,
ES_UNWRITTEN_B,
ES_DELAYED_B,
ES_HOLE_B,
ES_REFERENCED_B,
ES_FLAGS
};
#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)
#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B)
#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B)
#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B)
#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
EXTENT_STATUS_UNWRITTEN | \
EXTENT_STATUS_DELAYED | \
EXTENT_STATUS_HOLE) << ES_SHIFT)
ext4: improve extent cache shrink mechanism to avoid to burn CPU time Now we maintain an proper in-order LRU list in ext4 to reclaim entries from extent status tree when we are under heavy memory pressure. For keeping this order, a spin lock is used to protect this list. But this lock burns a lot of CPU time. We can use the following steps to trigger it. % cd /dev/shm % dd if=/dev/zero of=ext4-img bs=1M count=2k % mkfs.ext4 ext4-img % mount -t ext4 -o loop ext4-img /mnt % cd /mnt % for ((i=0;i<160;i++)); do truncate -s 64g $i; done % for ((i=0;i<160;i++)); do cp $i /dev/null &; done % perf record -a -g % perf report This commit tries to fix this problem. Now a new member called i_touch_when is added into ext4_inode_info to record the last access time for an inode. Meanwhile we never need to keep a proper in-order LRU list. So this can avoid to burns some CPU time. When we try to reclaim some entries from extent status tree, we use list_sort() to get a proper in-order list. Then we traverse this list to discard some entries. In ext4_sb_info, we use s_es_last_sorted to record the last time of sorting this list. When we traverse the list, we skip the inode that is newer than this time, and move this inode to the tail of LRU list. When the head of the list is newer than s_es_last_sorted, we will sort the LRU list again. In this commit, we break the loop if s_extent_cache_cnt == 0 because that means that all extents in extent status tree have been reclaimed. Meanwhile in this commit, ext4_es_{un}register_shrinker()'s prototype is changed to save a local variable in these functions. Reported-by: Dave Hansen <dave.hansen@intel.com> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2013-07-01 19:12:37 +07:00
struct ext4_sb_info;
struct ext4_extent;
struct extent_status {
struct rb_node rb_node;
ext4_lblk_t es_lblk; /* first logical block extent covers */
ext4_lblk_t es_len; /* length of extent in block */
ext4_fsblk_t es_pblk; /* first physical block */
};
struct ext4_es_tree {
struct rb_root root;
struct extent_status *cache_es; /* recently accessed extent */
};
ext4: track extent status tree shrinker delay statictics This commit adds some statictics in extent status tree shrinker. The purpose to add these is that we want to collect more details when we encounter a stall caused by extent status tree shrinker. Here we count the following statictics: stats: the number of all objects on all extent status trees the number of reclaimable objects on lru list cache hits/misses the last sorted interval the number of inodes on lru list average: scan time for shrinking some objects the number of shrunk objects maximum: the inode that has max nr. of objects on lru list the maximum scan time for shrinking some objects The output looks like below: $ cat /proc/fs/ext4/sda1/es_shrinker_info stats: 28228 objects 6341 reclaimable objects 5281/631 cache hits/misses 586 ms last sorted interval 250 inodes on lru list average: 153 us scan time 128 shrunk objects maximum: 255 inode (255 objects, 198 reclaimable) 125723 us max scan time If the lru list has never been sorted, the following line will not be printed: 586ms last sorted interval If there is an empty lru list, the following lines also will not be printed: 250 inodes on lru list ... maximum: 255 inode (255 objects, 198 reclaimable) 0 us max scan time Meanwhile in this commit a new trace point is defined to print some details in __ext4_es_shrink(). Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jan Kara <jack@suse.cz> Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2014-09-02 09:26:49 +07:00
struct ext4_es_stats {
unsigned long es_stats_shrunk;
struct percpu_counter es_stats_cache_hits;
struct percpu_counter es_stats_cache_misses;
ext4: track extent status tree shrinker delay statictics This commit adds some statictics in extent status tree shrinker. The purpose to add these is that we want to collect more details when we encounter a stall caused by extent status tree shrinker. Here we count the following statictics: stats: the number of all objects on all extent status trees the number of reclaimable objects on lru list cache hits/misses the last sorted interval the number of inodes on lru list average: scan time for shrinking some objects the number of shrunk objects maximum: the inode that has max nr. of objects on lru list the maximum scan time for shrinking some objects The output looks like below: $ cat /proc/fs/ext4/sda1/es_shrinker_info stats: 28228 objects 6341 reclaimable objects 5281/631 cache hits/misses 586 ms last sorted interval 250 inodes on lru list average: 153 us scan time 128 shrunk objects maximum: 255 inode (255 objects, 198 reclaimable) 125723 us max scan time If the lru list has never been sorted, the following line will not be printed: 586ms last sorted interval If there is an empty lru list, the following lines also will not be printed: 250 inodes on lru list ... maximum: 255 inode (255 objects, 198 reclaimable) 0 us max scan time Meanwhile in this commit a new trace point is defined to print some details in __ext4_es_shrink(). Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jan Kara <jack@suse.cz> Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2014-09-02 09:26:49 +07:00
u64 es_stats_scan_time;
u64 es_stats_max_scan_time;
struct percpu_counter es_stats_all_cnt;
struct percpu_counter es_stats_shk_cnt;
ext4: track extent status tree shrinker delay statictics This commit adds some statictics in extent status tree shrinker. The purpose to add these is that we want to collect more details when we encounter a stall caused by extent status tree shrinker. Here we count the following statictics: stats: the number of all objects on all extent status trees the number of reclaimable objects on lru list cache hits/misses the last sorted interval the number of inodes on lru list average: scan time for shrinking some objects the number of shrunk objects maximum: the inode that has max nr. of objects on lru list the maximum scan time for shrinking some objects The output looks like below: $ cat /proc/fs/ext4/sda1/es_shrinker_info stats: 28228 objects 6341 reclaimable objects 5281/631 cache hits/misses 586 ms last sorted interval 250 inodes on lru list average: 153 us scan time 128 shrunk objects maximum: 255 inode (255 objects, 198 reclaimable) 125723 us max scan time If the lru list has never been sorted, the following line will not be printed: 586ms last sorted interval If there is an empty lru list, the following lines also will not be printed: 250 inodes on lru list ... maximum: 255 inode (255 objects, 198 reclaimable) 0 us max scan time Meanwhile in this commit a new trace point is defined to print some details in __ext4_es_shrink(). Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jan Kara <jack@suse.cz> Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2014-09-02 09:26:49 +07:00
};
/*
* Pending cluster reservations for bigalloc file systems
*
* A cluster with a pending reservation is a logical cluster shared by at
* least one extent in the extents status tree with delayed and unwritten
* status and at least one other written or unwritten extent. The
* reservation is said to be pending because a cluster reservation would
* have to be taken in the event all blocks in the cluster shared with
* written or unwritten extents were deleted while the delayed and
* unwritten blocks remained.
*
* The set of pending cluster reservations is an auxiliary data structure
* used with the extents status tree to implement reserved cluster/block
* accounting for bigalloc file systems. The set is kept in memory and
* records all pending cluster reservations.
*
* Its primary function is to avoid the need to read extents from the
* disk when invalidating pages as a result of a truncate, punch hole, or
* collapse range operation. Page invalidation requires a decrease in the
* reserved cluster count if it results in the removal of all delayed
* and unwritten extents (blocks) from a cluster that is not shared with a
* written or unwritten extent, and no decrease otherwise. Determining
* whether the cluster is shared can be done by searching for a pending
* reservation on it.
*
* Secondarily, it provides a potentially faster method for determining
* whether the reserved cluster count should be increased when a physical
* cluster is deallocated as a result of a truncate, punch hole, or
* collapse range operation. The necessary information is also present
* in the extents status tree, but might be more rapidly accessed in
* the pending reservation set in many cases due to smaller size.
*
* The pending cluster reservation set is implemented as a red-black tree
* with the goal of minimizing per page search time overhead.
*/
struct pending_reservation {
struct rb_node rb_node;
ext4_lblk_t lclu;
};
struct ext4_pending_tree {
struct rb_root root;
};
extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_es_find_extent_range(struct inode *inode,
int (*match_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t *next_lblk,
struct extent_status *es);
extern bool ext4_es_scan_range(struct inode *inode,
int (*matching_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end);
extern bool ext4_es_scan_clu(struct inode *inode,
int (*matching_fn)(struct extent_status *es),
ext4_lblk_t lblk);
static inline unsigned int ext4_es_status(struct extent_status *es)
{
return es->es_pblk >> ES_SHIFT;
}
static inline unsigned int ext4_es_type(struct extent_status *es)
{
return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
}
static inline int ext4_es_is_written(struct extent_status *es)
{
return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0;
}
static inline int ext4_es_is_unwritten(struct extent_status *es)
{
return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0;
}
static inline int ext4_es_is_delayed(struct extent_status *es)
{
return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
}
static inline int ext4_es_is_hole(struct extent_status *es)
{
return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
}
static inline int ext4_es_is_mapped(struct extent_status *es)
{
return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
}
static inline int ext4_es_is_delonly(struct extent_status *es)
{
return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es));
}
static inline void ext4_es_set_referenced(struct extent_status *es)
{
es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
}
static inline void ext4_es_clear_referenced(struct extent_status *es)
{
es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT);
}
static inline int ext4_es_is_referenced(struct extent_status *es)
{
return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0;
}
static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
{
return es->es_pblk & ~ES_MASK;
}
static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es)
{
ext4_fsblk_t pblock = ext4_es_pblock(es);
return pblock == ~ES_MASK ? 0 : pblock;
}
static inline void ext4_es_store_pblock(struct extent_status *es,
ext4_fsblk_t pb)
{
ext4_fsblk_t block;
block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK);
es->es_pblk = block;
}
static inline void ext4_es_store_status(struct extent_status *es,
unsigned int status)
{
es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
(es->es_pblk & ~ES_MASK);
}
static inline void ext4_es_store_pblock_status(struct extent_status *es,
ext4_fsblk_t pb,
unsigned int status)
{
es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
(pb & ~ES_MASK);
}
ext4: track extent status tree shrinker delay statictics This commit adds some statictics in extent status tree shrinker. The purpose to add these is that we want to collect more details when we encounter a stall caused by extent status tree shrinker. Here we count the following statictics: stats: the number of all objects on all extent status trees the number of reclaimable objects on lru list cache hits/misses the last sorted interval the number of inodes on lru list average: scan time for shrinking some objects the number of shrunk objects maximum: the inode that has max nr. of objects on lru list the maximum scan time for shrinking some objects The output looks like below: $ cat /proc/fs/ext4/sda1/es_shrinker_info stats: 28228 objects 6341 reclaimable objects 5281/631 cache hits/misses 586 ms last sorted interval 250 inodes on lru list average: 153 us scan time 128 shrunk objects maximum: 255 inode (255 objects, 198 reclaimable) 125723 us max scan time If the lru list has never been sorted, the following line will not be printed: 586ms last sorted interval If there is an empty lru list, the following lines also will not be printed: 250 inodes on lru list ... maximum: 255 inode (255 objects, 198 reclaimable) 0 us max scan time Meanwhile in this commit a new trace point is defined to print some details in __ext4_es_shrink(). Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jan Kara <jack@suse.cz> Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2014-09-02 09:26:49 +07:00
extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
ext4: improve extent cache shrink mechanism to avoid to burn CPU time Now we maintain an proper in-order LRU list in ext4 to reclaim entries from extent status tree when we are under heavy memory pressure. For keeping this order, a spin lock is used to protect this list. But this lock burns a lot of CPU time. We can use the following steps to trigger it. % cd /dev/shm % dd if=/dev/zero of=ext4-img bs=1M count=2k % mkfs.ext4 ext4-img % mount -t ext4 -o loop ext4-img /mnt % cd /mnt % for ((i=0;i<160;i++)); do truncate -s 64g $i; done % for ((i=0;i<160;i++)); do cp $i /dev/null &; done % perf record -a -g % perf report This commit tries to fix this problem. Now a new member called i_touch_when is added into ext4_inode_info to record the last access time for an inode. Meanwhile we never need to keep a proper in-order LRU list. So this can avoid to burns some CPU time. When we try to reclaim some entries from extent status tree, we use list_sort() to get a proper in-order list. Then we traverse this list to discard some entries. In ext4_sb_info, we use s_es_last_sorted to record the last time of sorting this list. When we traverse the list, we skip the inode that is newer than this time, and move this inode to the tail of LRU list. When the head of the list is newer than s_es_last_sorted, we will sort the LRU list again. In this commit, we break the loop if s_extent_cache_cnt == 0 because that means that all extents in extent status tree have been reclaimed. Meanwhile in this commit, ext4_es_{un}register_shrinker()'s prototype is changed to save a local variable in these functions. Reported-by: Dave Hansen <dave.hansen@intel.com> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2013-07-01 19:12:37 +07:00
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
extern int __init ext4_init_pending(void);
extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
bool allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */