ocfs2: fix the application IO timeout when fstrim is running

A user reported that upper-layer application IO timed out while fstrim
was running on an ocfs2 partition.  The application monitoring resource
agent concluded that the application had stopped working, and the node
was then fenced by the cluster brain (e.g. pacemaker).

The root cause is that the fstrim thread holds the main_bm meta-file
related locks until all the cluster groups are trimmed.  This patch
makes the fstrim thread release the main_bm meta-file related locks
after each cluster group is trimmed, which gives concurrent application
IO a chance to claim clusters from the main_bm meta-file.

Link: http://lkml.kernel.org/r/20190111090014.31645-1-ghe@suse.com
Signed-off-by: Gang He <ghe@suse.com>
Reviewed-by: Changwei Ge <ge.changwei@h3c.com>
Cc: Mark Fasheh <mfasheh@versity.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Gang He 2019-03-05 15:41:45 -08:00 committed by Linus Torvalds
parent cc725ef3cb
commit 5500ab4ed3
5 changed files with 106 additions and 63 deletions

View File

@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
return count; return count;
} }
int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) static
int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
{ {
struct ocfs2_super *osb = OCFS2_SB(sb); struct ocfs2_super *osb = OCFS2_SB(sb);
u64 start, len, trimmed, first_group, last_group, group; u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
int ret, cnt; int ret, cnt;
u32 first_bit, last_bit, minlen; u32 first_bit, last_bit, minlen;
struct buffer_head *main_bm_bh = NULL; struct buffer_head *main_bm_bh = NULL;
@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
struct buffer_head *gd_bh = NULL; struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm; struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL; struct ocfs2_group_desc *gd = NULL;
struct ocfs2_trim_fs_info info, *pinfo = NULL;
start = range->start >> osb->s_clustersize_bits; start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits;
@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
return -EINVAL; return -EINVAL;
trace_ocfs2_trim_mainbm(start, len, minlen);
next_group:
main_bm_inode = ocfs2_get_system_file_inode(osb, main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE, GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT); OCFS2_INVALID_SLOT);
@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
} }
main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
/*
* Do some check before trim the first group.
*/
if (!group) {
if (start >= le32_to_cpu(main_bm->i_clusters)) { if (start >= le32_to_cpu(main_bm->i_clusters)) {
ret = -EINVAL; ret = -EINVAL;
goto out_unlock; goto out_unlock;
} }
len = range->len >> osb->s_clustersize_bits;
if (start + len > le32_to_cpu(main_bm->i_clusters)) if (start + len > le32_to_cpu(main_bm->i_clusters))
len = le32_to_cpu(main_bm->i_clusters) - start; len = le32_to_cpu(main_bm->i_clusters) - start;
trace_ocfs2_trim_fs(start, len, minlen); /*
* Determine first and last group to examine based on
ocfs2_trim_fs_lock_res_init(osb); * start and len
ret = ocfs2_trim_fs_lock(osb, NULL, 1); */
if (ret < 0) {
if (ret != -EAGAIN) {
mlog_errno(ret);
ocfs2_trim_fs_lock_res_uninit(osb);
goto out_unlock;
}
mlog(ML_NOTICE, "Wait for trim on device (%s) to "
"finish, which is running from another node.\n",
osb->dev_str);
ret = ocfs2_trim_fs_lock(osb, &info, 0);
if (ret < 0) {
mlog_errno(ret);
ocfs2_trim_fs_lock_res_uninit(osb);
goto out_unlock;
}
if (info.tf_valid && info.tf_success &&
info.tf_start == start && info.tf_len == len &&
info.tf_minlen == minlen) {
/* Avoid sending duplicated trim to a shared device */
mlog(ML_NOTICE, "The same trim on device (%s) was "
"just done from node (%u), return.\n",
osb->dev_str, info.tf_nodenum);
range->len = info.tf_trimlen;
goto out_trimunlock;
}
}
info.tf_nodenum = osb->node_num;
info.tf_start = start;
info.tf_len = len;
info.tf_minlen = minlen;
/* Determine first and last group to examine based on start and len */
first_group = ocfs2_which_cluster_group(main_bm_inode, start); first_group = ocfs2_which_cluster_group(main_bm_inode, start);
if (first_group == osb->first_cluster_group_blkno) if (first_group == osb->first_cluster_group_blkno)
first_bit = start; first_bit = start;
else else
first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); first_bit = start - ocfs2_blocks_to_clusters(sb,
last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); first_group);
last_bit = osb->bitmap_cpg; last_group = ocfs2_which_cluster_group(main_bm_inode,
start + len - 1);
group = first_group;
}
trimmed = 0; do {
for (group = first_group; group <= last_group;) {
if (first_bit + len >= osb->bitmap_cpg) if (first_bit + len >= osb->bitmap_cpg)
last_bit = osb->bitmap_cpg; last_bit = osb->bitmap_cpg;
else else
@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
else else
group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
} } while (0);
range->len = trimmed * sb->s_blocksize;
info.tf_trimlen = range->len;
info.tf_success = (ret ? 0 : 1);
pinfo = &info;
out_trimunlock:
ocfs2_trim_fs_unlock(osb, pinfo);
ocfs2_trim_fs_lock_res_uninit(osb);
out_unlock: out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0); ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh); brelse(main_bm_bh);
main_bm_bh = NULL;
out_mutex: out_mutex:
inode_unlock(main_bm_inode); inode_unlock(main_bm_inode);
iput(main_bm_inode); iput(main_bm_inode);
/*
* The main_bm related locks are released after each group is trimmed so
* that concurrent IO is not starved.  If no error occurred and there are
* still groups left to trim, go back and trim the next group.
*/
if (ret >= 0 && group <= last_group)
goto next_group;
out: out:
range->len = trimmed * sb->s_blocksize;
return ret;
}
int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(sb);
struct ocfs2_trim_fs_info info, *pinfo = NULL;
ocfs2_trim_fs_lock_res_init(osb);
trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
ret = ocfs2_trim_fs_lock(osb, NULL, 1);
if (ret < 0) {
if (ret != -EAGAIN) {
mlog_errno(ret);
ocfs2_trim_fs_lock_res_uninit(osb);
return ret;
}
mlog(ML_NOTICE, "Wait for trim on device (%s) to "
"finish, which is running from another node.\n",
osb->dev_str);
ret = ocfs2_trim_fs_lock(osb, &info, 0);
if (ret < 0) {
mlog_errno(ret);
ocfs2_trim_fs_lock_res_uninit(osb);
return ret;
}
if (info.tf_valid && info.tf_success &&
info.tf_start == range->start &&
info.tf_len == range->len &&
info.tf_minlen == range->minlen) {
/* Avoid sending duplicated trim to a shared device */
mlog(ML_NOTICE, "The same trim on device (%s) was "
"just done from node (%u), return.\n",
osb->dev_str, info.tf_nodenum);
range->len = info.tf_trimlen;
goto out;
}
}
info.tf_nodenum = osb->node_num;
info.tf_start = range->start;
info.tf_len = range->len;
info.tf_minlen = range->minlen;
ret = ocfs2_trim_mainbm(sb, range);
info.tf_trimlen = range->len;
info.tf_success = (ret < 0 ? 0 : 1);
pinfo = &info;
out:
ocfs2_trim_fs_unlock(osb, pinfo);
ocfs2_trim_fs_lock_res_uninit(osb);
return ret; return ret;
} }

View File

@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
{ {
struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
/* Only one trimfs thread is allowed to work at the same time. */
mutex_lock(&osb->obs_trim_fs_mutex);
ocfs2_lock_res_init_once(lockres); ocfs2_lock_res_init_once(lockres);
ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
ocfs2_simple_drop_lockres(osb, lockres); ocfs2_simple_drop_lockres(osb, lockres);
ocfs2_lock_res_free(lockres); ocfs2_lock_res_free(lockres);
mutex_unlock(&osb->obs_trim_fs_mutex);
} }
static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,

View File

@ -407,6 +407,7 @@ struct ocfs2_super
struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres;
struct ocfs2_lock_res osb_trim_fs_lockres; struct ocfs2_lock_res osb_trim_fs_lockres;
struct mutex obs_trim_fs_mutex;
struct ocfs2_dlm_debug *osb_dlm_debug; struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root; struct dentry *osb_debug_root;

View File

@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
/* End of trace events for fs/ocfs2/alloc.c. */ /* End of trace events for fs/ocfs2/alloc.c. */

View File

@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
if (ocfs2_is_hard_readonly(osb)) if (ocfs2_is_hard_readonly(osb))
goto leave; goto leave;
mutex_init(&osb->obs_trim_fs_mutex);
status = ocfs2_dlm_init(osb); status = ocfs2_dlm_init(osb);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);