raid5-cache: fix a deadlock in superblock write

There is a potential deadlock in superblock write. Discard could zero data, so before discard we must make sure superblock is updated to new log tail. Updating superblock (either directly call md_update_sb() or depend on md thread) must hold reconfig mutex. On the other hand, raid5_quiesce is called with reconfig_mutex hold. The first step of raid5_quiesce() is waitting for all IO finish, hence waitting for reclaim thread, while reclaim thread is calling this function and waitting for reconfig mutex. So there is a deadlock. We workaround this issue with a trylock. The downside of the solution is we could miss discard if we can't take reconfig mutex. But this should happen rarely (mainly in raid array stop), so miss discard shouldn't be a big problem. Cc: NeilBrown <neilb@suse.com> Signed-off-by: Shaohua Li <shli@fb.com>
2024-11-25 03:30:53 +07:00 · 2016-08-25 10:09:39 -07:00 · 2016-08-25 10:09:39 -07:00 · 8e018c21da
commit 8e018c21da
parent 86a1679860
1 changed files with 15 additions and 31 deletions
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@ -96,7 +96,6 @@ struct r5l_log {
 	spinlock_t no_space_stripes_lock;

 	bool need_cache_flush;
-	bool in_teardown;
 };

 /*
@ -704,31 +703,22 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,

 	mddev = log->rdev->mddev;
 	/*
-	 * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and
-	 * wait for this thread to finish. This thread waits for
-	 * MD_CHANGE_PENDING clear, which is supposed to be done in
-	 * md_check_recovery(). md_check_recovery() tries to get
-	 * reconfig_mutex. Since r5l_quiesce already holds the mutex,
-	 * md_check_recovery() fails, so the PENDING never get cleared. The
-	 * in_teardown check workaround this issue.
+	 * Discard could zero data, so before discard we must make sure
+	 * superblock is updated to new log tail. Updating superblock (either
+	 * directly call md_update_sb() or depend on md thread) must hold
+	 * reconfig mutex. On the other hand, raid5_quiesce is called with
+	 * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
+	 * for all IO finish, hence waitting for reclaim thread, while reclaim
+	 * thread is calling this function and waitting for reconfig mutex. So
+	 * there is a deadlock. We workaround this issue with a trylock.
+	 * FIXME: we could miss discard if we can't take reconfig mutex
 	 */
-	if (!log->in_teardown) {
-		set_mask_bits(&mddev->flags, 0,
-			      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
-		md_wakeup_thread(mddev->thread);
-		wait_event(mddev->sb_wait,
-			!test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
-			log->in_teardown);
-		/*
-		 * r5l_quiesce could run after in_teardown check and hold
-		 * mutex first. Superblock might get updated twice.
-		 */
-		if (log->in_teardown)
-			md_update_sb(mddev, 1);
-	} else {
-		WARN_ON(!mddev_is_locked(mddev));
-		md_update_sb(mddev, 1);
-	}
+	set_mask_bits(&mddev->flags, 0,
+		BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
+	if (!mddev_trylock(mddev))
+		return;
+	md_update_sb(mddev, 1);
+	mddev_unlock(mddev);

 	/* discard IO error really doesn't matter, ignore it */
 	if (log->last_checkpoint < end) {
@ -827,7 +817,6 @@ void r5l_quiesce(struct r5l_log *log, int state)
 	if (!log || state == 2)
 		return;
 	if (state == 0) {
-		log->in_teardown = 0;
 		/*
 		 * This is a special case for hotadd. In suspend, the array has
 		 * no journal. In resume, journal is initialized as well as the
@ -838,11 +827,6 @@ void r5l_quiesce(struct r5l_log *log, int state)
 		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 					log->rdev->mddev, "reclaim");
 	} else if (state == 1) {
-		/*
-		 * at this point all stripes are finished, so io_unit is at
-		 * least in STRIPE_END state
-		 */
-		log->in_teardown = 1;
 		/* make sure r5l_write_super_and_discard_space exits */
 		mddev = log->rdev->mddev;
 		wake_up(&mddev->sb_wait);