From 15f2547b4157a1e7e4d75bec5df097c1436f6cbd Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 15 Jan 2020 12:47:46 -0600
Subject: [PATCH 01/10] gfs2: Allow ASPACE glocks to also have an lvb

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/glock.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index bf70e3b14938..86e9e621f346 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -125,12 +125,11 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
 {
 	struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
 
-	if (gl->gl_ops->go_flags & GLOF_ASPACE) {
+	kfree(gl->gl_lksb.sb_lvbptr);
+	if (gl->gl_ops->go_flags & GLOF_ASPACE)
 		kmem_cache_free(gfs2_glock_aspace_cachep, gl);
-	} else {
-		kfree(gl->gl_lksb.sb_lvbptr);
+	else
 		kmem_cache_free(gfs2_glock_cachep, gl);
-	}
 }
 
 /**

From f286d627ef026a4d04b41ae5917d58ddf243c3c5 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Mon, 13 Jan 2020 21:21:49 +0100
Subject: [PATCH 02/10] gfs2: Keep track of deleted inode generations in LVBs

When deleting an inode, keep track of the generation of the deleted inode in
the inode glock Lock Value Block (LVB).  When trying to delete an inode
remotely, check the last-known inode generation against the deleted inode
generation to skip duplicate remote deletes.  This avoids taking the resource
group glock in order to verify the block type.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/glock.c                  | 19 +++++++++++++++++++
 fs/gfs2/glock.h                  |  3 +++
 fs/gfs2/glops.c                  |  2 +-
 fs/gfs2/super.c                  |  3 +++
 include/uapi/linux/gfs2_ondisk.h |  6 ++++++
 5 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 86e9e621f346..12681616eb76 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -755,6 +755,25 @@ __acquires(&gl->gl_lockref.lock)
 	return;
 }
 
+void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation)
+{
+	struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr;
+
+	if (ri->ri_magic == 0)
+		ri->ri_magic = cpu_to_be32(GFS2_MAGIC);
+	if (ri->ri_magic == cpu_to_be32(GFS2_MAGIC))
+		ri->ri_generation_deleted = cpu_to_be64(generation);
+}
+
+bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation)
+{
+	struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr;
+
+	if (ri->ri_magic != cpu_to_be32(GFS2_MAGIC))
+		return false;
+	return generation <= be64_to_cpu(ri->ri_generation_deleted);
+}
+
 static void delete_work_func(struct work_struct *work)
 {
 	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index b8adaf80e4c5..5c1b60fdedcf 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -306,4 +306,7 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
 	spin_unlock(&gl->gl_lockref.lock);
 }
 
+extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
+extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
+
 #endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 9e9c7a4b8c66..63ae9e45ce34 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -692,7 +692,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 	.go_lock = inode_go_lock,
 	.go_dump = inode_go_dump,
 	.go_type = LM_TYPE_INODE,
-	.go_flags = GLOF_ASPACE | GLOF_LRU,
+	.go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB,
 	.go_free = inode_go_free,
 };
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 956fced0a8ec..e69efed9fb51 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1315,6 +1315,8 @@ static void gfs2_evict_inode(struct inode *inode)
 		goto out;
 	}
 
+	if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino))
+		goto out_truncate;
 	error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
 	if (error)
 		goto out_truncate;
@@ -1368,6 +1370,7 @@ static void gfs2_evict_inode(struct inode *inode)
 	   that subsequent inode creates don't see an old gl_object. */
 	glock_clear_object(ip->i_gl, ip);
 	error = gfs2_dinode_dealloc(ip);
+	gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino);
 	goto out_unlock;
 
 out_truncate:
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 2dc10a034de1..07e508e6691b 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -171,6 +171,12 @@ struct gfs2_rindex {
 #define GFS2_RGF_NOALLOC	0x00000008
 #define GFS2_RGF_TRIMMED	0x00000010
 
+struct gfs2_inode_lvb {
+	__be32 ri_magic;
+	__be32 __pad;
+	__be64 ri_generation_deleted;
+};
+
 struct gfs2_rgrp_lvb {
 	__be32 rl_magic;
 	__be32 rl_flags;

From a0e3cc65fa29f497cc97a069c318532c2a48d148 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Thu, 16 Jan 2020 20:12:26 +0100
Subject: [PATCH 03/10] gfs2: Turn gl_delete into a delayed work

This requires flushing delayed work items in gfs2_make_fs_ro (which is called
before unmounting a filesystem).

When inodes are deleted and then recreated, pending gl_delete work items would
have no effect because the inode generations will have changed, so we can
cancel any pending gl_delete works before reusing iopen glocks.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/glock.c  | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 fs/gfs2/glock.h  |  4 ++++
 fs/gfs2/glops.c  |  9 ++++++++-
 fs/gfs2/incore.h |  5 +++--
 fs/gfs2/inode.c  |  2 ++
 fs/gfs2/rgrp.c   |  2 +-
 fs/gfs2/super.c  |  4 ++--
 7 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 12681616eb76..0332086f7ab9 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -776,11 +776,16 @@ bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation)
 
 static void delete_work_func(struct work_struct *work)
 {
-	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct gfs2_glock *gl = container_of(dwork, struct gfs2_glock, gl_delete);
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 	struct inode *inode;
 	u64 no_addr = gl->gl_name.ln_number;
 
+	spin_lock(&gl->gl_lockref.lock);
+	clear_bit(GLF_PENDING_DELETE, &gl->gl_flags);
+	spin_unlock(&gl->gl_lockref.lock);
+
 	/* If someone's using this glock to create a new dinode, the block must
 	   have been freed by another node, then re-used, in which case our
 	   iopen callback is too late after the fact. Ignore it. */
@@ -949,7 +954,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_object = NULL;
 	gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
 	INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
-	INIT_WORK(&gl->gl_delete, delete_work_func);
+	INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func);
 
 	mapping = gfs2_glock2aspace(gl);
 	if (mapping) {
@@ -1772,6 +1777,44 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
 	rhashtable_walk_exit(&iter);
 }
 
+bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay)
+{
+	bool queued;
+
+	spin_lock(&gl->gl_lockref.lock);
+	queued = queue_delayed_work(gfs2_delete_workqueue,
+				    &gl->gl_delete, delay);
+	if (queued)
+		set_bit(GLF_PENDING_DELETE, &gl->gl_flags);
+	spin_unlock(&gl->gl_lockref.lock);
+	return queued;
+}
+
+void gfs2_cancel_delete_work(struct gfs2_glock *gl)
+{
+	if (cancel_delayed_work_sync(&gl->gl_delete)) {
+		clear_bit(GLF_PENDING_DELETE, &gl->gl_flags);
+		gfs2_glock_put(gl);
+	}
+}
+
+bool gfs2_delete_work_queued(const struct gfs2_glock *gl)
+{
+	return test_bit(GLF_PENDING_DELETE, &gl->gl_flags);
+}
+
+static void flush_delete_work(struct gfs2_glock *gl)
+{
+	flush_delayed_work(&gl->gl_delete);
+	gfs2_glock_queue_work(gl, 0);
+}
+
+void gfs2_flush_delete_work(struct gfs2_sbd *sdp)
+{
+	glock_hash_walk(flush_delete_work, sdp);
+	flush_workqueue(gfs2_delete_workqueue);
+}
+
 /**
  * thaw_glock - thaw out a glock which has an unprocessed reply waiting
  * @gl: The glock to thaw
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 5c1b60fdedcf..711b3005a7ea 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -235,6 +235,10 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
 
 extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
 extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
+extern bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay);
+extern void gfs2_cancel_delete_work(struct gfs2_glock *gl);
+extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl);
+extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
 extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
 extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
 extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 63ae9e45ce34..909cd722e46a 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -608,11 +608,17 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
 	if (gl->gl_demote_state == LM_ST_UNLOCKED &&
 	    gl->gl_state == LM_ST_SHARED && ip) {
 		gl->gl_lockref.count++;
-		if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+		if (!queue_delayed_work(gfs2_delete_workqueue,
+					&gl->gl_delete, 0))
 			gl->gl_lockref.count--;
 	}
 }
 
+static int iopen_go_demote_ok(const struct gfs2_glock *gl)
+{
+       return !gfs2_delete_work_queued(gl);
+}
+
 /**
  * inode_go_free - wake up anyone waiting for dlm's unlock ast to free it
  * @gl: glock being freed
@@ -716,6 +722,7 @@ const struct gfs2_glock_operations gfs2_freeze_glops = {
 const struct gfs2_glock_operations gfs2_iopen_glops = {
 	.go_type = LM_TYPE_IOPEN,
 	.go_callback = iopen_go_callback,
+	.go_demote_ok = iopen_go_demote_ok,
 	.go_flags = GLOF_LRU | GLOF_NONDISK,
 };
 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 84a824293a78..fdcf7a2f06c5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -345,6 +345,7 @@ enum {
 	GLF_OBJECT			= 14, /* Used only for tracing */
 	GLF_BLOCKING			= 15,
 	GLF_INODE_CREATING		= 16, /* Inode creation occurring */
+	GLF_PENDING_DELETE		= 17,
 	GLF_FREEING			= 18, /* Wait for glock to be freed */
 };
 
@@ -378,8 +379,8 @@ struct gfs2_glock {
 	atomic_t gl_revokes;
 	struct delayed_work gl_work;
 	union {
-		/* For inode and iopen glocks only */
-		struct work_struct gl_delete;
+		/* For iopen glocks only */
+		struct delayed_work gl_delete;
 		/* For rgrp glocks only */
 		struct {
 			loff_t start;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5acd3ce30759..a4112906abc2 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -170,6 +170,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 		error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
 		if (unlikely(error))
 			goto fail;
+		gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl);
 		glock_set_object(ip->i_iopen_gh.gh_gl, ip);
 		gfs2_glock_put(io_gl);
 		io_gl = NULL;
@@ -724,6 +725,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (error)
 		goto fail_gunlock2;
 
+	gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl);
 	glock_set_object(ip->i_iopen_gh.gh_gl, ip);
 	gfs2_set_iop(inode);
 	insert_inode_hash(inode);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index a321c34e3d6e..074f228ea839 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1835,7 +1835,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 		 */
 		ip = gl->gl_object;
 
-		if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+		if (ip || !gfs2_queue_delete_work(gl, 0))
 			gfs2_glock_put(gl);
 		else
 			found++;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e69efed9fb51..71218a6fd9b4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -626,7 +626,7 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 		}
 	}
 
-	flush_workqueue(gfs2_delete_workqueue);
+	gfs2_flush_delete_work(sdp);
 	if (!log_write_allowed && current == sdp->sd_quotad_process)
 		fs_warn(sdp, "The quotad daemon is withdrawing.\n");
 	else if (sdp->sd_quotad_process)
@@ -1054,7 +1054,7 @@ static int gfs2_drop_inode(struct inode *inode)
 		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
 
 		gfs2_glock_hold(gl);
-		if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+		if (!gfs2_queue_delete_work(gl, 0))
 			gfs2_glock_queue_put(gl);
 		return false;
 	}

From 8c7b9262a8607636ecd7250f29c7aac17f08901c Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Mon, 13 Jan 2020 22:16:17 +0100
Subject: [PATCH 04/10] gfs2: Give up the iopen glock on contention

When there's contention on the iopen glock, it means that the link count
of the corresponding inode has dropped to zero on a remote node which is
now trying to delete the inode.  In that case, try to evict the inode so
that the iopen glock will be released, which will allow the remote node
to do its job.

When the inode is still open locally, the inode's reference count won't
drop to zero and so we'll keep holding the inode and its iopen glock.
The remote node will time out its request to grab the iopen glock, and
when the inode is finally closed locally, we'll try to delete it
ourself.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/glock.c  | 51 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/gfs2/incore.h |  1 +
 fs/gfs2/super.c  |  7 +++++--
 3 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 0332086f7ab9..bf7daa35f73f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -774,6 +774,42 @@ bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation)
 	return generation <= be64_to_cpu(ri->ri_generation_deleted);
 }
 
+static bool gfs2_try_evict(struct gfs2_glock *gl)
+{
+	struct gfs2_inode *ip;
+	bool evicted = false;
+
+	/*
+	 * If there is contention on the iopen glock and we have an inode, try
+	 * to grab and release the inode so that it can be evicted.  This will
+	 * allow the remote node to go ahead and delete the inode without us
+	 * having to do it, which will avoid rgrp glock thrashing.
+	 *
+	 * The remote node is likely still holding the corresponding inode
+	 * glock, so it will run before we get to verify that the delete has
+	 * happened below.
+	 */
+	spin_lock(&gl->gl_lockref.lock);
+	ip = gl->gl_object;
+	if (ip && !igrab(&ip->i_inode))
+		ip = NULL;
+	spin_unlock(&gl->gl_lockref.lock);
+	if (ip) {
+		set_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
+		d_prune_aliases(&ip->i_inode);
+		iput(&ip->i_inode);
+
+		/* If the inode was evicted, gl->gl_object will now be NULL. */
+		spin_lock(&gl->gl_lockref.lock);
+		ip = gl->gl_object;
+		if (ip)
+			clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
+		spin_unlock(&gl->gl_lockref.lock);
+		evicted = !ip;
+	}
+	return evicted;
+}
+
 static void delete_work_func(struct work_struct *work)
 {
 	struct delayed_work *dwork = to_delayed_work(work);
@@ -792,6 +828,21 @@ static void delete_work_func(struct work_struct *work)
 	if (test_bit(GLF_INODE_CREATING, &gl->gl_flags))
 		goto out;
 
+	if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
+		/*
+		 * If we can evict the inode, give the remote node trying to
+		 * delete the inode some time before verifying that the delete
+		 * has happened.  Otherwise, if we cause contention on the inode glock
+		 * immediately, the remote node will think that we still have
+		 * the inode in use, and so it will give up waiting.
+		 */
+		if (gfs2_try_evict(gl)) {
+			if (gfs2_queue_delete_work(gl, 5 * HZ))
+				return;
+			goto out;
+		}
+	}
+
 	inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
 	if (!IS_ERR_OR_NULL(inode)) {
 		d_prune_aliases(inode);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index fdcf7a2f06c5..76ac2578e658 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -399,6 +399,7 @@ enum {
 	GIF_ORDERED		= 4,
 	GIF_FREE_VFS_INODE      = 5,
 	GIF_GLOP_PENDING	= 6,
+	GIF_DEFERRED_DELETE	= 7,
 };
 
 struct gfs2_inode {
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 71218a6fd9b4..7d8caf169efd 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1299,9 +1299,12 @@ static void gfs2_evict_inode(struct inode *inode)
 	if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
 		BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));
 		gfs2_holder_mark_uninitialized(&gh);
-		goto alloc_failed;
+		goto out_delete;
 	}
 
+	if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags))
+		goto out;
+
 	/* Deletes should never happen under memory pressure anymore.  */
 	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
 		goto out;
@@ -1333,7 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode)
 	if (inode->i_nlink)
 		goto out_truncate;
 
-alloc_failed:
+out_delete:
 	if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
 	    test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
 		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;

From 9e73330f298acf544de72436b7bb825ff3aa1a54 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 14 Jan 2020 14:59:08 +0100
Subject: [PATCH 05/10] gfs2: Try harder to delete inodes locally

When an inode's link count drops to zero and the inode is cached on
other nodes, the current behavior of gfs2 is to immediately give up and
to rely on the other node(s) to delete the inode if there is iopen glock
contention.  This leads to resource group glock bouncing and the loss of
caching.  With the previous patches in place, we can fix that by not
giving up immediately.

When the inode is still open on other nodes, those nodes won't be able
to evict the inode and give up the iopen glock.  In that case, our lock
conversion request will time out.  The unlink system call will block for
the duration of the iopen lock conversion request.  We're also holding
the inode glock in EX mode for an extended duration, so other nodes
won't be able to make progress on the inode, either.

This is worse than what we had before, but we can prevent other nodes
from getting stuck by aborting our iopen locking request if there is
contention on the inode glock.  This will the the subject of a future
patch.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/super.c | 53 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 6 deletions(-)

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7d8caf169efd..bcb56f13f50a 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1258,6 +1258,50 @@ static void gfs2_glock_put_eventually(struct gfs2_glock *gl)
 		gfs2_glock_put(gl);
 }
 
+static bool gfs2_upgrade_iopen_glock(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_holder *gh = &ip->i_iopen_gh;
+	long timeout = 5 * HZ;
+	int error;
+
+	gh->gh_flags |= GL_NOCACHE;
+	gfs2_glock_dq_wait(gh);
+
+	/*
+	 * If there are no other lock holders, we'll get the lock immediately.
+	 * Otherwise, the other nodes holding the lock will be notified about
+	 * our locking request.  If they don't have the inode open, they'll
+	 * evict the cached inode and release the lock.  As a last resort,
+	 * we'll eventually time out.
+	 *
+	 * Note that we're passing the LM_FLAG_TRY_1CB flag to the first
+	 * locking request as an optimization to notify lock holders as soon as
+	 * possible.  Without that flag, they'd be notified implicitly by the
+	 * second locking request.
+	 */
+
+	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, gh);
+	error = gfs2_glock_nq(gh);
+	if (error != GLR_TRYFAILED)
+		return !error;
+
+	gfs2_holder_reinit(LM_ST_EXCLUSIVE, GL_ASYNC | GL_NOCACHE, gh);
+	error = gfs2_glock_nq(gh);
+	if (error)
+		return false;
+
+	timeout = wait_event_interruptible_timeout(sdp->sd_async_glock_wait,
+		!test_bit(HIF_WAIT, &gh->gh_iflags),
+		timeout);
+	if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) {
+		gfs2_glock_dq(gh);
+		return false;
+	}
+	return true;
+}
+
 /**
  * gfs2_evict_inode - Remove an inode from cache
  * @inode: The inode to evict
@@ -1339,13 +1383,10 @@ static void gfs2_evict_inode(struct inode *inode)
 out_delete:
 	if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
 	    test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
-		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
-		gfs2_glock_dq_wait(&ip->i_iopen_gh);
-		gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE,
-				   &ip->i_iopen_gh);
-		error = gfs2_glock_nq(&ip->i_iopen_gh);
-		if (error)
+		if (!gfs2_upgrade_iopen_glock(inode)) {
+			gfs2_holder_uninit(&ip->i_iopen_gh);
 			goto out_truncate;
+		}
 	}
 
 	if (S_ISDIR(inode->i_mode) &&

From 6bdcadea75768bbd1cd8f6f13011978e1e19a53b Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Wed, 15 Jan 2020 05:31:38 +0100
Subject: [PATCH 06/10] gfs2: Minor gfs2_lookup_by_inum cleanup

Use a zero no_formal_ino instead of a NULL pointer to indicate that any inode
generation number will qualify: a valid inode never has a zero no_formal_ino.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/export.c |  4 +++-
 fs/gfs2/glock.c  |  2 +-
 fs/gfs2/inode.c  | 11 +++++++++--
 fs/gfs2/inode.h  |  2 +-
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 3f717285ee48..756d05779200 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -134,7 +134,9 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct inode *inode;
 
-	inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
+	if (!inum->no_formal_ino)
+		return ERR_PTR(-ESTALE);
+	inode = gfs2_lookup_by_inum(sdp, inum->no_addr, inum->no_formal_ino,
 				    GFS2_BLKST_DINODE);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index bf7daa35f73f..b6078b0e74b9 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -843,7 +843,7 @@ static void delete_work_func(struct work_struct *work)
 		}
 	}
 
-	inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
+	inode = gfs2_lookup_by_inum(sdp, no_addr, 0, GFS2_BLKST_UNLINKED);
 	if (!IS_ERR_OR_NULL(inode)) {
 		d_prune_aliases(inode);
 		iput(inode);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index a4112906abc2..812a6ae03f6c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -207,8 +207,15 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 	return ERR_PTR(error);
 }
 
+/**
+ * gfs2_lookup_by_inum - look up an inode by inode number
+ * @sdp: The super block
+ * @no_addr: The inode number
+ * @no_formal_ino: The inode generation number (0 for any)
+ * @blktype: Requested block type (see gfs2_inode_lookup)
+ */
 struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
-				  u64 *no_formal_ino, unsigned int blktype)
+				  u64 no_formal_ino, unsigned int blktype)
 {
 	struct super_block *sb = sdp->sd_vfs;
 	struct inode *inode;
@@ -221,7 +228,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
 	/* Two extra checks for NFS only */
 	if (no_formal_ino) {
 		error = -ESTALE;
-		if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
+		if (GFS2_I(inode)->i_no_formal_ino != no_formal_ino)
 			goto fail_iput;
 
 		error = -EIO;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 580adbf0b5e1..b52ecf4ffe63 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -92,7 +92,7 @@ extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
 				       u64 no_addr, u64 no_formal_ino,
 				       unsigned int blktype);
 extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
-					 u64 *no_formal_ino,
+					 u64 no_formal_ino,
 					 unsigned int blktype);
 
 extern int gfs2_inode_refresh(struct gfs2_inode *ip);

From b66648ad6dcfefd9f02b5408c1381987c090cb13 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Wed, 15 Jan 2020 06:21:42 +0100
Subject: [PATCH 07/10] gfs2: Move inode generation number check into
 gfs2_inode_lookup

Move the inode generation number check from gfs2_lookup_by_inum into
gfs2_inode_lookup: gfs2_inode_lookup may be able to decide that an inode with
the given inode generation number cannot exist without having to verify the
block type or reading the inode from disk.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/inode.c | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 812a6ae03f6c..499f861b6e09 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -114,6 +114,10 @@ static void gfs2_set_iop(struct inode *inode)
  * placeholder because it doesn't otherwise make sense), the on-disk block type
  * is verified to be @blktype.
  *
+ * When @no_formal_ino is non-zero, this function will return ERR_PTR(-ESTALE)
+ * if it detects that @no_formal_ino doesn't match the actual inode generation
+ * number.  However, it doesn't always know unless @type is DT_UNKNOWN.
+ *
  * Returns: A VFS inode, or an error
  */
 
@@ -157,6 +161,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 			if (error)
 				goto fail;
 
+			error = -ESTALE;
+			if (no_formal_ino &&
+			    gfs2_inode_already_deleted(ip->i_gl, no_formal_ino))
+				goto fail;
+
 			if (blktype != GFS2_BLKST_FREE) {
 				error = gfs2_check_blk_type(sdp, no_addr,
 							    blktype);
@@ -189,13 +198,23 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 			inode->i_mode = DT2IF(type);
 		}
 
-		gfs2_set_iop(inode);
+		if (gfs2_holder_initialized(&i_gh))
+			gfs2_glock_dq_uninit(&i_gh);
 
-		unlock_new_inode(inode);
+		gfs2_set_iop(inode);
 	}
 
-	if (gfs2_holder_initialized(&i_gh))
-		gfs2_glock_dq_uninit(&i_gh);
+	if (no_formal_ino && ip->i_no_formal_ino &&
+	    no_formal_ino != ip->i_no_formal_ino) {
+		if (inode->i_state & I_NEW)
+			goto fail;
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+
+	if (inode->i_state & I_NEW)
+		unlock_new_inode(inode);
+
 	return inode;
 
 fail:
@@ -221,16 +240,12 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
 	struct inode *inode;
 	int error;
 
-	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, blktype);
+	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, no_formal_ino,
+				  blktype);
 	if (IS_ERR(inode))
 		return inode;
 
-	/* Two extra checks for NFS only */
 	if (no_formal_ino) {
-		error = -ESTALE;
-		if (GFS2_I(inode)->i_no_formal_ino != no_formal_ino)
-			goto fail_iput;
-
 		error = -EIO;
 		if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
 			goto fail_iput;

From b0dcffd8da3339ad0300587ce7030efdf2e914a9 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Wed, 15 Jan 2020 09:54:14 +0100
Subject: [PATCH 08/10] gfs2: Check inode generation number in delete_work_func

In delete_work_func, if the iopen glock still has an inode attached,
limit the inode lookup to that specific generation number: in the likely
case that the inode was deleted on the node on which the inode's link
count dropped to zero, we can skip verifying the on-disk block type and
reading in the inode.  The same applies if another node that had the
inode open managed to delete the inode before us.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/glock.c  | 4 +++-
 fs/gfs2/incore.h | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index b6078b0e74b9..711259f68d55 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -795,6 +795,7 @@ static bool gfs2_try_evict(struct gfs2_glock *gl)
 		ip = NULL;
 	spin_unlock(&gl->gl_lockref.lock);
 	if (ip) {
+		gl->gl_no_formal_ino = ip->i_no_formal_ino;
 		set_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
 		d_prune_aliases(&ip->i_inode);
 		iput(&ip->i_inode);
@@ -843,7 +844,8 @@ static void delete_work_func(struct work_struct *work)
 		}
 	}
 
-	inode = gfs2_lookup_by_inum(sdp, no_addr, 0, GFS2_BLKST_UNLINKED);
+	inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino,
+				    GFS2_BLKST_UNLINKED);
 	if (!IS_ERR_OR_NULL(inode)) {
 		d_prune_aliases(inode);
 		iput(inode);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 76ac2578e658..03ab11fab962 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -380,7 +380,10 @@ struct gfs2_glock {
 	struct delayed_work gl_work;
 	union {
 		/* For iopen glocks only */
-		struct delayed_work gl_delete;
+		struct {
+			struct delayed_work gl_delete;
+			u64 gl_no_formal_ino;
+		};
 		/* For rgrp glocks only */
 		struct {
 			loff_t start;

From 35b6f8fbcf9b2ebdee0a0f77143a8d203a9616e1 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Fri, 17 Jan 2020 13:48:49 +0100
Subject: [PATCH 09/10] gfs2: Wake up when setting GLF_DEMOTE

Wake up the sdp->sd_async_glock_wait wait queue when setting the GLF_DEMOTE
flag.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/glock.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 711259f68d55..7ad06dd49352 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -464,6 +464,15 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
 	gl->gl_tchange = jiffies;
 }
 
+static void gfs2_set_demote(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+	set_bit(GLF_DEMOTE, &gl->gl_flags);
+	smp_mb();
+	wake_up(&sdp->sd_async_glock_wait);
+}
+
 static void gfs2_demote_wake(struct gfs2_glock *gl)
 {
 	gl->gl_demote_state = LM_ST_EXCLUSIVE;
@@ -876,7 +885,7 @@ static void glock_work_func(struct work_struct *work)
 
 		if (!delay) {
 			clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags);
-			set_bit(GLF_DEMOTE, &gl->gl_flags);
+			gfs2_set_demote(gl);
 		}
 	}
 	run_queue(gl, 0);
@@ -1221,9 +1230,10 @@ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs)
 static void handle_callback(struct gfs2_glock *gl, unsigned int state,
 			    unsigned long delay, bool remote)
 {
-	int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
-
-	set_bit(bit, &gl->gl_flags);
+	if (delay)
+		set_bit(GLF_PENDING_DEMOTE, &gl->gl_flags);
+	else
+		gfs2_set_demote(gl);
 	if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
 		gl->gl_demote_state = state;
 		gl->gl_demote_time = jiffies;

From 9e8990dea9266af68a668b1503dc6f55c56f1bb6 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Fri, 17 Jan 2020 10:53:23 +0100
Subject: [PATCH 10/10] gfs2: Smarter iopen glock waiting

When trying to upgrade the iopen glock from a shared to an exclusive lock in
gfs2_evict_inode, abort the wait if there is contention on the corresponding
inode glock: in that case, the inode must still be in active use on another
node, and we're not guaranteed to get the iopen glock anytime soon.

To make this work even better, when we notice contention on the iopen glock and
we can't evict the corresponsing inode and release the iopen glock immediately,
poke the inode glock.  The other node(s) trying to acquire the lock can then
abort instead of timing out.

Thanks to Heinz Mauelshagen for pointing out a locking bug in a previous
version of this patch.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/glock.c | 34 ++++++++++++++++++++++++++++++++--
 fs/gfs2/super.c | 11 ++++++++---
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7ad06dd49352..71091e35f83d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -783,6 +783,17 @@ bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation)
 	return generation <= be64_to_cpu(ri->ri_generation_deleted);
 }
 
+static void gfs2_glock_poke(struct gfs2_glock *gl)
+{
+	int flags = LM_FLAG_TRY_1CB | LM_FLAG_ANY | GL_SKIP;
+	struct gfs2_holder gh;
+	int error;
+
+	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, flags, &gh);
+	if (!error)
+		gfs2_glock_dq(&gh);
+}
+
 static bool gfs2_try_evict(struct gfs2_glock *gl)
 {
 	struct gfs2_inode *ip;
@@ -804,6 +815,8 @@ static bool gfs2_try_evict(struct gfs2_glock *gl)
 		ip = NULL;
 	spin_unlock(&gl->gl_lockref.lock);
 	if (ip) {
+		struct gfs2_glock *inode_gl = NULL;
+
 		gl->gl_no_formal_ino = ip->i_no_formal_ino;
 		set_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
 		d_prune_aliases(&ip->i_inode);
@@ -812,9 +825,16 @@ static bool gfs2_try_evict(struct gfs2_glock *gl)
 		/* If the inode was evicted, gl->gl_object will now be NULL. */
 		spin_lock(&gl->gl_lockref.lock);
 		ip = gl->gl_object;
-		if (ip)
+		if (ip) {
+			inode_gl = ip->i_gl;
+			lockref_get(&inode_gl->gl_lockref);
 			clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
+		}
 		spin_unlock(&gl->gl_lockref.lock);
+		if (inode_gl) {
+			gfs2_glock_poke(inode_gl);
+			gfs2_glock_put(inode_gl);
+		}
 		evicted = !ip;
 	}
 	return evicted;
@@ -845,12 +865,22 @@ static void delete_work_func(struct work_struct *work)
 		 * has happened.  Otherwise, if we cause contention on the inode glock
 		 * immediately, the remote node will think that we still have
 		 * the inode in use, and so it will give up waiting.
+		 *
+		 * If we can't evict the inode, signal to the remote node that
+		 * the inode is still in use.  We'll later try to delete the
+		 * inode locally in gfs2_evict_inode.
+		 *
+		 * FIXME: We only need to verify that the remote node has
+		 * deleted the inode because nodes before this remote delete
+		 * rework won't cooperate.  At a later time, when we no longer
+		 * care about compatibility with such nodes, we can skip this
+		 * step entirely.
 		 */
 		if (gfs2_try_evict(gl)) {
 			if (gfs2_queue_delete_work(gl, 5 * HZ))
 				return;
-			goto out;
 		}
+		goto out;
 	}
 
 	inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index bcb56f13f50a..32d8d26126a1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1273,8 +1273,12 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode)
 	 * If there are no other lock holders, we'll get the lock immediately.
 	 * Otherwise, the other nodes holding the lock will be notified about
 	 * our locking request.  If they don't have the inode open, they'll
-	 * evict the cached inode and release the lock.  As a last resort,
-	 * we'll eventually time out.
+	 * evict the cached inode and release the lock.  Otherwise, if they
+	 * poke the inode glock, we'll take this as an indication that they
+	 * still need the iopen glock and that they'll take care of deleting
+	 * the inode when they're done.  As a last resort, if another node
+	 * keeps holding the iopen glock without showing any activity on the
+	 * inode glock, we'll eventually time out.
 	 *
 	 * Note that we're passing the LM_FLAG_TRY_1CB flag to the first
 	 * locking request as an optimization to notify lock holders as soon as
@@ -1293,7 +1297,8 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode)
 		return false;
 
 	timeout = wait_event_interruptible_timeout(sdp->sd_async_glock_wait,
-		!test_bit(HIF_WAIT, &gh->gh_iflags),
+		!test_bit(HIF_WAIT, &gh->gh_iflags) ||
+		test_bit(GLF_DEMOTE, &ip->i_gl->gl_flags),
 		timeout);
 	if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) {
 		gfs2_glock_dq(gh);