#ifndef MY_ABC_HERE
#define MY_ABC_HERE
#endif
// SPDX-License-Identifier: GPL-2.0

#ifdef MY_ABC_HERE
#include <linux/fs.h>
#endif /* MY_ABC_HERE */

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"

/*
 * HOW DOES SPACE RESERVATION WORK
 *
 * If you want to know about delalloc specifically, there is a separate comment
 * for that with the delalloc code.  This comment is about how the whole system
 * works generally.
 *
 * BASIC CONCEPTS
 *
 *   1) space_info.  This is the ultimate arbiter of how much space we can use.
 *   There's a description of the bytes_ fields with the struct declaration,
 *   refer to that for specifics on each field.  Suffice it to say that for
 *   reservations we care about total_bytes - SUM(space_info->bytes_) when
 *   determining if there is space to make an allocation.  There is a space_info
 *   for METADATA, SYSTEM, and DATA areas.
 *
 *   2) block_rsv's.  These are basically buckets for every different type of
 *   metadata reservation we have.  You can see the comment in the block_rsv
 *   code on the rules for each type, but generally block_rsv->reserved is how
 *   much space is accounted for in space_info->bytes_may_use.
 *
 *   3) btrfs_calc*_size.  These are the worst case calculations we use based
 *   on the number of items we will want to modify.  We have one for changing
 *   items, and one for inserting new items.  Generally we use these helpers to
 *   determine the size of the block reserves, and then use the actual bytes
 *   values to adjust the space_info counters.
 *
 * MAKING RESERVATIONS, THE NORMAL CASE
 *
 *   We call into either btrfs_reserve_data_bytes() or
 *   btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
 *   num_bytes we want to reserve.
 *
 *   ->reserve
 *     space_info->bytes_may_use += num_bytes
 *
 *   ->extent allocation
 *     Call btrfs_add_reserved_bytes() which does
 *     space_info->bytes_may_use -= num_bytes
 *     space_info->bytes_reserved += extent_bytes
 *
 *   ->insert reference
 *     Call btrfs_update_block_group() which does
 *     space_info->bytes_reserved -= extent_bytes
 *     space_info->bytes_used += extent_bytes
 *
 * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
 *
 *   Assume we are unable to simply make the reservation because we do not have
 *   enough space
 *
 *   -> __reserve_bytes
 *     create a reserve_ticket with ->bytes set to our reservation, add it to
 *     the tail of space_info->tickets, kick async flush thread
 *
 *   ->handle_reserve_ticket
 *     wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
 *     on the ticket.
 *
 *   -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
 *     Flushes various things attempting to free up space.
 *
 *   -> btrfs_try_granting_tickets()
 *     This is called by anything that either subtracts space from
 *     space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
 *     space_info->total_bytes.  This loops through the ->priority_tickets and
 *     then the ->tickets list checking to see if the reservation can be
 *     completed.  If it can the space is added to space_info->bytes_may_use and
 *     the ticket is woken up.
 *
 *   -> ticket wakeup
 *     Check if ->bytes == 0, if so we got our reservation and we can carry
 *     on, if not return the appropriate error (ENOSPC, but can be EINTR if we
 *     were interrupted.)
 *
 * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
 *
 *   Same as the above, except we add ourselves to the
 *   space_info->priority_tickets, and we do not use ticket->wait, we simply
 *   call flush_space() ourselves for the states that are safe for us to call
 *   without deadlocking and hope for the best.
 *
 * THE FLUSHING STATES
 *
 *   Generally speaking we will have two cases for each state, a "nice" state
 *   and an "ALL THE THINGS" state.  In btrfs we delay a lot of work in order to
 *   reduce the locking overhead on the various trees, and even to keep from
 *   doing any work at all in the case of delayed refs.  Each of these delayed
 *   things however hold reservations, and so letting them run allows us to
 *   reclaim space so we can make new reservations.
 *
 *   FLUSH_DELAYED_ITEMS
 *     Every inode has a delayed item to update the inode.  Take a simple write
 *     for example, we would update the inode item at write time to update the
 *     mtime, and then again at finish_ordered_io() time in order to update the
 *     isize or bytes.  We keep these delayed items to coalesce these operations
 *     into a single operation done on demand.  These are an easy way to reclaim
 *     metadata space.
 *
 *   FLUSH_DELALLOC
 *     Look at the delalloc comment to get an idea of how much space is reserved
 *     for delayed allocation.  We can reclaim some of this space simply by
 *     running delalloc, but usually we need to wait for ordered extents to
 *     reclaim the bulk of this space.
 *
 *   FLUSH_DELAYED_REFS
 *     We have a block reserve for the outstanding delayed refs space, and every
 *     delayed ref operation holds a reservation.  Running these is a quick way
 *     to reclaim space, but we want to hold this until the end because COW can
 *     churn a lot and we can avoid making some extent tree modifications if we
 *     are able to delay for as long as possible.
 *
 *   ALLOC_CHUNK
 *     We will skip this the first time through space reservation, because of
 *     overcommit and we don't want to have a lot of useless metadata space when
 *     our worst case reservations will likely never come true.
 *
 *   RUN_DELAYED_IPUTS
 *     If we're freeing inodes we're likely freeing checksums, file extent
 *     items, and extent tree items.  Loads of space could be freed up by these
 *     operations, however they won't be usable until the transaction commits.
 *
 *   COMMIT_TRANS
 *     may_commit_transaction() is the ultimate arbiter on whether we commit the
 *     transaction or not.  In order to avoid constantly churning we do all the
 *     above flushing first and then commit the transaction as the last resort.
 *     However we need to take into account things like pinned space that would
 *     be freed, plus any delayed work we may not have gotten rid of in the case
 *     of metadata.
 *
 * OVERCOMMIT
 *
 *   Because we hold so many reservations for metadata we will allow you to
 *   reserve more space than is currently free in the currently allocated
 *   metadata space.  This only happens with metadata, data does not allow
 *   overcommitting.
 *
 *   You can see the current logic for when we allow overcommit in
 *   btrfs_can_overcommit(), but it only applies to unallocated space.  If there
 *   is no unallocated space to be had, all reservations are kept within the
 *   free space in the allocated metadata chunks.
 *
 *   Because of overcommitting, you generally want to use the
 *   btrfs_can_overcommit() logic for metadata allocations, as it does the right
 *   thing with or without extra unallocated space.
 */
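/*
 * Return the total amount of space consumed in @s_info, optionally including
 * bytes_may_use (outstanding reservations that have not yet become allocated
 * extents).
 */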
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	list_for_each_entry(found, head, list)
		found->full = 0;
}

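/*
 * Allocate and initialize a new btrfs_space_info for the block group type
 * bits in @flags, register it in sysfs and link it into fs_info->space_info.
 * The Synology-specific syno_allocator fields (under MY_ABC_HERE) are
 * initialized here as well.
 */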
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				  GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);
#ifdef MY_ABC_HERE
	spin_lock_init(&space_info->syno_allocator.lock);
	init_rwsem(&space_info->syno_allocator.allocation_sem);
	space_info->syno_allocator.free_space_bytes = RB_ROOT_CACHED;
	space_info->syno_allocator.free_space_max_length = RB_ROOT_CACHED;
	space_info->syno_allocator.free_space_max_length_with_extent = RB_ROOT_CACHED;
	space_info->syno_allocator.preload = RB_ROOT_CACHED;
	mutex_init(&space_info->syno_allocator.syno_allocator_mutex);
	space_info->syno_allocator.force_cluster_disable = true;
	space_info->syno_allocator.cache_bg = NULL;
	space_info->syno_allocator.cache_offset = 0;
#ifdef MY_ABC_HERE
	space_info->syno_allocator.log_bg_offset = 0;
#endif /* MY_ABC_HERE */
	atomic64_set(&space_info->syno_allocator.fallback_relink_count, 0);
	atomic64_set(&space_info->syno_allocator.fallback_full_scan_count, 0);
#endif /* MY_ABC_HERE */

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	if (ret)
		return ret;

	list_add(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}

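/*
 * Create the initial space_infos at mount time: SYSTEM plus either a single
 * mixed METADATA|DATA space_info or separate METADATA and DATA ones,
 * depending on the MIXED_GROUPS incompat flag.
 */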
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}

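/*
 * Account a newly added block group in its space_info: bump the logical and
 * on-disk totals (scaled by the RAID factor), fold in the used/readonly
 * counts, clear the full flag and retry any waiting reservation tickets.
 */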
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
	btrfs_try_granting_tickets(info, found);
	spin_unlock(&found->lock);
	*space_info = found;
}

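/*
 * Look up the space_info matching the type bits (DATA/METADATA/SYSTEM) in
 * @flags, or NULL if none exists.
 */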
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	list_for_each_entry(found, head, list) {
		if (found->flags & flags)
			return found;
	}
	return NULL;
}

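/*
 * Estimate how much of the currently unallocated device space could still be
 * turned into chunks usable by @space_info, scaled down by the RAID factor
 * and by how aggressively the caller is allowed to flush.
 */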
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     enum btrfs_reserve_flush_enum flush)
{
	u64 profile;
	u64 avail;
	int factor;

	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math.
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * half of the space.  If we can flush, don't let us overcommit
	 * too much; limit it to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;
	return avail;
}

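/*
 * Return 1 if a reservation of @bytes may overcommit, i.e. if the current
 * usage plus @bytes still fits within total_bytes plus the estimated
 * unallocated space.  DATA (and mixed) space_infos never overcommit.
 */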
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
			 struct btrfs_space_info *space_info, u64 bytes,
			 enum btrfs_reserve_flush_enum flush)
{
	u64 avail;
	u64 used;

	/* Don't overcommit when in mixed mode */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	used = btrfs_space_info_used(space_info, true);
	avail = calc_available_free_space(fs_info, space_info, flush);

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}

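/*
 * Unlink a reservation ticket from its space_info list and drop its bytes
 * from the pending reclaim_size.  Safe to call on an already removed ticket.
 */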
static void remove_ticket(struct btrfs_space_info *space_info,
			  struct reserve_ticket *ticket)
{
	if (!list_empty(&ticket->list)) {
		list_del_init(&ticket->list);
		ASSERT(space_info->reclaim_size >= ticket->bytes);
		space_info->reclaim_size -= ticket->bytes;
	}
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info)
{
	struct list_head *head;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;

	lockdep_assert_held(&space_info->lock);

	head = &space_info->priority_tickets;
again:
	while (!list_empty(head)) {
		struct reserve_ticket *ticket;
		u64 used = btrfs_space_info_used(space_info, true);

		ticket = list_first_entry(head, struct reserve_ticket, list);

		/* Check and see if our ticket can be satisfied now. */
		if ((used + ticket->bytes <= space_info->total_bytes) ||
		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
					 flush)) {
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			remove_ticket(space_info, ticket);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			break;
		}
	}

	if (head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)

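/*
 * Print the high-level counters of @info and the sizes of the global block
 * reserves.  Caller must hold info->lock.
 */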
static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *info)
{
	lockdep_assert_held(&info->lock);

	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}

void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group *cache;
	int index = 0;

	spin_lock(&info->lock);
	__btrfs_dump_space_info(fs_info, info);
	spin_unlock(&info->lock);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->start, cache->length, cache->used, cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		spin_unlock(&cache->lock);
		btrfs_dump_free_space(cache, bytes);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

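/*
 * Convert a byte amount into a number of metadata items to flush, based on
 * the worst-case cost of inserting one item (at least 1).
 */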
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *space_info,
			    u64 to_reclaim, bool wait_ordered)
{
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 items;
	long time_left;
	int loops;

	/* Calc the number of pages we need to flush for the space reservation */
	if (to_reclaim == U64_MAX) {
		items = U64_MAX;
	} else {
		/*
		 * to_reclaim is set to however much metadata we need to
		 * reclaim, but reclaiming that much data doesn't really track
		 * exactly, so increase the amount to reclaim by 2x in order to
		 * make sure we're flushing enough delalloc to hopefully reclaim
		 * some metadata reservations.
		 */
		items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
		to_reclaim = items * EXTENT_SIZE_PER_ITEM;
	}

	trans = (struct btrfs_trans_handle *)current->journal_info;

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		u64 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		btrfs_start_delalloc_roots(fs_info, nr_pages, true);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}

		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}

/**
 * may_commit_transaction - possibly commit the transaction if it's ok to
 * @fs_info - the fs_info for this fs
 * @space_info - the space_info we are flushing for
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does.  Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
	struct btrfs_trans_handle *trans;
	u64 reclaim_bytes = 0;
	u64 bytes_needed = 0;
	u64 cur_free_bytes = 0;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	spin_lock(&space_info->lock);
	cur_free_bytes = btrfs_space_info_used(space_info, true);
	if (cur_free_bytes < space_info->total_bytes)
		cur_free_bytes = space_info->total_bytes - cur_free_bytes;
	else
		cur_free_bytes = 0;

	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	if (ticket)
		bytes_needed = ticket->bytes;

	if (bytes_needed > cur_free_bytes)
		bytes_needed -= cur_free_bytes;
	else
		bytes_needed = 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reserve for this
	 * reservation.  If the space_info's don't match (like for DATA or
	 * SYSTEM) then just go enospc, reclaiming this space won't recover any
	 * space to satisfy those reservations.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);

	spin_lock(&trans_rsv->lock);
	reclaim_bytes += trans_rsv->reserved;
	spin_unlock(&trans_rsv->lock);

	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state.  This is only advisory
 * and may fail for various reasons.  The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
			struct btrfs_space_info *space_info, u64 num_bytes,
			int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, space_info, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_get_alloc_profile(fs_info, space_info->flags),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
	return;
}

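/*
 * Work out how many bytes of metadata we should try to reclaim: the pending
 * ticket bytes in reclaim_size, plus any overage over what we could satisfy
 * through overcommit.  Returns 0 if a small reservation would still fit.
 */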
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info)
{
	u64 used;
	u64 avail;
	u64 expected;
	u64 to_reclaim = space_info->reclaim_size;

	lockdep_assert_held(&space_info->lock);

	avail = calc_available_free_space(fs_info, space_info,
					  BTRFS_RESERVE_FLUSH_ALL);
	used = btrfs_space_info_used(space_info, true);

	/*
	 * We may be flushing because suddenly we have less space than we had
	 * before, and now we're well over-committed based on our current free
	 * space.  If that's the case add in our overage so we make sure to put
	 * appropriate pressure on the flushing state machine.
	 */
	if (space_info->total_bytes + avail < used)
		to_reclaim += used - (space_info->total_bytes + avail);

	if (to_reclaim)
		return to_reclaim;

	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
				 BTRFS_RESERVE_FLUSH_ALL))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
				 BTRFS_RESERVE_FLUSH_ALL))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

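/*
 * Decide whether the background metadata reclaim worker should run: only
 * when usage crosses 98% of total_bytes, something is reclaimable, and the
 * fs is neither closing nor remounting.
 */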
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

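/*
 * As a last resort, satisfy a ticket directly from the global block reserve,
 * as long as that leaves the global reserve at least 10% full.
 */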
static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info,
				  struct reserve_ticket *ticket)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 min_bytes;

	if (global_rsv->space_info != space_info)
		return false;

	spin_lock(&global_rsv->lock);
	min_bytes = div_factor(global_rsv->size, 1);
	if (global_rsv->reserved < min_bytes + ticket->bytes) {
		spin_unlock(&global_rsv->lock);
		return false;
	}
	global_rsv->reserved -= ticket->bytes;
	remove_ticket(space_info, ticket);
	ticket->bytes = 0;
	wake_up(&ticket->wait);
	space_info->tickets_id++;
	if (global_rsv->reserved < global_rsv->size)
		global_rsv->full = 0;
	spin_unlock(&global_rsv->lock);

	return true;
}

/*
 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
 * @fs_info - fs_info for this fs
 * @space_info - the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets.  The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets.  This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
				   struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	u64 tickets_id = space_info->tickets_id;
	u64 first_ticket_bytes = 0;

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
		__btrfs_dump_space_info(fs_info, space_info);
	}

	while (!list_empty(&space_info->tickets) &&
	       tickets_id == space_info->tickets_id) {
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);

		if (ticket->steal &&
		    steal_from_global_rsv(fs_info, space_info, ticket))
			return true;

		/*
		 * may_commit_transaction will avoid committing the transaction
		 * if it doesn't feel like the space reclaimed by the commit
		 * would result in the ticket succeeding.  However if we have a
		 * smaller ticket in the queue it may be small enough to be
		 * satisfied by committing the transaction, so if any
		 * subsequent ticket is smaller than the first ticket go ahead
		 * and send us back for another loop through the enospc flushing
		 * code.
		 */
		if (first_ticket_bytes == 0)
			first_ticket_bytes = ticket->bytes;
		else if (first_ticket_bytes > ticket->bytes)
			return true;

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_info(fs_info, "failing ticket with %llu bytes",
				   ticket->bytes);

		remove_ticket(space_info, ticket);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);

		/*
		 * We're just throwing tickets away, so more flushing may not
		 * trip over btrfs_try_granting_tickets, so we need to call it
		 * here to see if we can make progress with the next ticket in
		 * the list.
		 */
		btrfs_try_granting_tickets(fs_info, space_info);
	}
	return (tickets_id != space_info->tickets_id);
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to.  We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create an
		 * underutilized metadata chunk.  So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
		 * commit the transaction.  If nothing has changed the next go
		 * around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (maybe_fail_all_tickets(fs_info, space_info)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

#ifdef MY_ABC_HERE
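/*
 * Synology-specific (MY_ABC_HERE): check, under the space_info lock, whether
 * the background reclaim heuristics still ask for work.
 */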
static inline bool need_do_async_syno_reclaim(struct btrfs_fs_info *fs_info,
					      struct btrfs_space_info *space_info)
{
	bool reclaim = false;
	u64 used;

	spin_lock(&space_info->lock);
	used = btrfs_space_info_used(space_info, true);
	if (need_do_async_reclaim(fs_info, space_info, used))
		reclaim = true;
	spin_unlock(&space_info->lock);
	return reclaim;
}

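/*
 * Flush states used by the Synology async metadata reclaim worker below: it
 * only runs the cheap delayed-ref and delayed-item flushing, never forced
 * chunk allocation or a transaction commit.
 */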
static const enum btrfs_flush_state syno_metadata_flush_states[] = {
	FLUSH_DELAYED_REFS,
	FLUSH_DELAYED_ITEMS,
};

static void btrfs_syno_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	int flush_state = 0;
	unsigned long next_wakeup, cur, delay;

	fs_info = container_of(work, struct btrfs_fs_info, syno_async_metadata_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
	if (!space_info)
		goto out;

	next_wakeup = jiffies + msecs_to_jiffies(1000);
	while (flush_state < ARRAY_SIZE(syno_metadata_flush_states)) {
		flush_space(fs_info, space_info, U64_MAX, syno_metadata_flush_states[flush_state]);
		spin_lock(&space_info->lock);
		if (!list_empty(&space_info->tickets) ||
		    !list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			goto loop;
		}
		spin_unlock(&space_info->lock);
		if (!need_do_async_syno_reclaim(fs_info, space_info))
			break;
loop:
		flush_state++;
		if (flush_state >= ARRAY_SIZE(syno_metadata_flush_states)) {
			flush_state = 0;
			cur = jiffies;
			if (time_after(next_wakeup, cur)) {
				delay = next_wakeup - cur;
				delay = min(delay, msecs_to_jiffies(1000));
				schedule_timeout_interruptible(delay);
			}
			next_wakeup = jiffies + msecs_to_jiffies(1000);
		}
		cond_resched();
	}

out:
	return;
}
#endif /* MY_ABC_HERE */

/*
 * FLUSH_DELALLOC_WAIT:
 *   Space is freed from flushing delalloc in one of two ways.
 *
 *   1) compression is on and we allocate less space than we reserved
 *   2) we are overwriting existing space
 *
 *   For #1 that extra space is reclaimed as soon as the delalloc pages are
 *   COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
 *   length to ->bytes_reserved, and subtracts the reserved space from
 *   ->bytes_may_use.
 *
 *   For #2 this is trickier.  Once the ordered extent runs we will drop the
 *   extent in the range we are overwriting, which creates a delayed ref for
 *   that freed extent.  This however is not reclaimed until the transaction
 *   commits, thus the next stages.
 *
 * RUN_DELAYED_IPUTS
 *   If we are freeing inodes, we want to make sure all delayed iputs have
 *   completed, because they could have been on an inode with i_nlink == 0, and
 *   thus have been truncated and freed up space.  But again this space is not
 *   immediately re-usable, it comes in the form of a delayed ref, which must be
 *   run and then the transaction must be committed.
 *
 * FLUSH_DELAYED_REFS
 *   The above two cases generate delayed refs that will affect
 *   ->total_bytes_pinned.  However this counter can be inconsistent with
 *   reality if there are outstanding delayed refs.  This is because we adjust
 *   the counter based solely on the current set of delayed refs and disregard
 *   any on-disk state which might include more refs.  So for example, if we
 *   have an extent with 2 references, but we only drop 1, we'll see that there
 *   is a negative delayed ref count for the extent and assume that the space
 *   will be freed, and thus increase ->total_bytes_pinned.
 *
 *   Running the delayed refs gives us the actual real view of what will be
 *   freed at the transaction commit time.  This stage will not actually free
 *   space for us, it just makes sure that may_commit_transaction() has all of
 *   the information it needs to make the right decision.
 *
 * COMMIT_TRANS
 *   This is where we reclaim all of the pinned space generated by the previous
 *   two stages.  We will not commit the transaction if we don't think we're
 *   likely to satisfy our request, which means if our current free space +
 *   total_bytes_pinned < reservation we will not commit.  This is why the
 *   previous states are actually important, to make sure we know for sure
 *   whether committing the transaction will allow us to make progress.
 *
 * ALLOC_CHUNK_FORCE
 *   For data we start with alloc chunk force, however we could have been full
 *   before, and then the transaction commit could have freed new block groups,
 *   so if we now have space to allocate do the force chunk allocation.
 */
static const enum btrfs_flush_state data_flush_states[] = {
	FLUSH_DELALLOC_WAIT,
	RUN_DELAYED_IPUTS,
	FLUSH_DELAYED_REFS,
	COMMIT_TRANS,
	ALLOC_CHUNK_FORCE,
};

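/*
 * Background worker for data reservations: first force chunk allocations
 * until the space_info is full, then walk data_flush_states, restarting
 * whenever a ticket gets satisfied and failing all tickets once every state
 * has been exhausted with the space_info still full.
 */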
static void btrfs_async_reclaim_data_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 last_tickets_id;
	int flush_state = 0;

	fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
	space_info = fs_info->data_sinfo;

	spin_lock(&space_info->lock);
	if (list_empty(&space_info->tickets)) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	while (!space_info->full) {
		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		last_tickets_id = space_info->tickets_id;
		spin_unlock(&space_info->lock);
	}

	while (flush_state < ARRAY_SIZE(data_flush_states)) {
		flush_space(fs_info, space_info, U64_MAX,
			    data_flush_states[flush_state]);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}

		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = 0;
		}

		if (flush_state >= ARRAY_SIZE(data_flush_states)) {
			if (space_info->full) {
				if (maybe_fail_all_tickets(fs_info, space_info))
					flush_state = 0;
				else
					space_info->flush = 0;
			} else {
				flush_state = 0;
			}
		}
		spin_unlock(&space_info->lock);
	}
}

#ifdef MY_ABC_HERE
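/*
 * Synology-specific (MY_ABC_HERE): track the rate of processed ordered
 * extents as a smoothed moving average, bw = (3 * bw_prev + bw_sample) / 4,
 * used by the async data flush worker below to throttle itself.
 */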
/*
 * Estimate write bandwidth at 200ms intervals.
 */
#define BANDWIDTH_INTERVAL	max(HZ/5, 1)
static void syno_ordered_extent_bw_update(struct btrfs_fs_info *fs_info)
{
	unsigned long now = jiffies;
	unsigned long elapsed = now - fs_info->syno_ordered_extent_processed_bw_time_stamp;
	u64 processed, bw;

	if (elapsed < BANDWIDTH_INTERVAL)
		goto out;

	processed = atomic64_read(&fs_info->syno_ordered_extent_processed_nr);
	bw = processed - min(processed, fs_info->syno_ordered_extent_processed_stamp);
	bw = bw * HZ;
	bw = div64_u64(bw, elapsed);
	bw = fs_info->syno_ordered_extent_processed_bw * 3 + bw;
	bw = bw >> 2;

	fs_info->syno_ordered_extent_processed_bw = bw;
	fs_info->syno_ordered_extent_processed_stamp = processed;
	fs_info->syno_ordered_extent_processed_bw_time_stamp = now;
out:
	return;
}
#endif /* MY_ABC_HERE */

#ifdef MY_ABC_HERE
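/*
 * Synology-specific (MY_ABC_HERE): round-robin flusher for the
 * syno_dirty_lru_inodes list.  While metadata reservation tickets are
 * pending (or reclaim pressure persists) it picks the least recently
 * flushed dirty inode, writes its pages back, and re-queues it at the tail.
 */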
static void btrfs_syno_async_data_flush(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	struct btrfs_inode *binode, *first_binode;
	struct inode *inode;
	struct blk_plug plug;
	int nr_retry;

	fs_info = container_of(work, struct btrfs_fs_info, syno_async_data_flush_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
	if (!space_info)
		goto out;

again:
	nr_retry = max(32, (int)fs_info->syno_writeback_thread_max + 1);
	first_binode = NULL;

	if (btrfs_fs_closing(fs_info) || test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto out;

	if (list_empty(&fs_info->syno_dirty_lru_inodes))
		goto out;

	spin_lock(&fs_info->syno_multiple_writeback_lock);
select_inode:
	if (list_empty(&fs_info->syno_dirty_lru_inodes)) {
		spin_unlock(&fs_info->syno_multiple_writeback_lock);
		goto out;
	}
	binode = list_first_entry(&fs_info->syno_dirty_lru_inodes, struct btrfs_inode, syno_dirty_lru_inode);
	if (first_binode == binode)
		nr_retry = 0;
	if (!first_binode)
		first_binode = binode;
	inode = igrab(&binode->vfs_inode);
	if (!inode) {
		list_del_init(&binode->syno_dirty_lru_inode);
		clear_bit(BTRFS_INODE_SYNO_WRITEBACK_LRU_LIST, &binode->runtime_flags);
		if (first_binode == binode)
			first_binode = NULL;
		goto select_inode;
	}
	spin_lock(&inode->i_lock);
	if (inode->i_state & I_SYNC) {
		spin_unlock(&inode->i_lock);
		iput(inode);
		if (nr_retry-- > 0) {
			list_move_tail(&binode->syno_dirty_lru_inode, &fs_info->syno_dirty_lru_inodes);
			goto select_inode;
		}
		spin_unlock(&fs_info->syno_multiple_writeback_lock);
		goto sleep;
	}
	inode->i_state |= I_SYNC;
	spin_unlock(&inode->i_lock);

	list_move_tail(&BTRFS_I(inode)->syno_dirty_lru_inode, &fs_info->syno_dirty_lru_inodes);
	spin_unlock(&fs_info->syno_multiple_writeback_lock);

	blk_start_plug(&plug);
	filemap_flush(inode->i_mapping);
	blk_finish_plug(&plug);

	spin_lock(&inode->i_lock);
	inode_sync_complete(inode);
	spin_unlock(&inode->i_lock);
	iput(inode);

	cond_resched();
	spin_lock(&space_info->lock);
	if (!list_empty(&space_info->tickets) ||
	    !list_empty(&space_info->priority_tickets)) {
		spin_unlock(&space_info->lock);
#ifdef MY_ABC_HERE
		syno_ordered_extent_bw_update(fs_info);
		if (atomic64_read(&fs_info->syno_ordered_extent_nr) < (fs_info->syno_ordered_extent_processed_bw << 1))
			goto again;
#endif /* MY_ABC_HERE */
		goto sleep;
	}
	spin_unlock(&space_info->lock);

	if (need_do_async_syno_reclaim(fs_info, space_info)) {
#ifdef MY_ABC_HERE
		syno_ordered_extent_bw_update(fs_info);
		if (atomic64_read(&fs_info->syno_ordered_extent_nr) < (fs_info->syno_ordered_extent_processed_bw << 1))
			goto again;
#endif /* MY_ABC_HERE */
		goto sleep;
	}

out:
	return;

sleep:
	schedule_timeout_interruptible(msecs_to_jiffies(1000));
	goto again;
}
#endif /* MY_ABC_HERE */

#if defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
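/*
 * Refresh the cached global dirty-page thresholds at most once per second,
 * guarded by a test_and_set_bit so concurrent callers don't race.
 */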
void syno_perf_indicator_dirty_limit_update(struct btrfs_fs_info *fs_info)
{
	unsigned long now;
	unsigned long elapsed;
	unsigned long background_thresh, dirty_thresh;

	if (!fs_info)
		goto out;

	if (test_and_set_bit(SYNO_PERF_INDICATROT_FLAG_DIRTY_LIMIT_UPDATE, &fs_info->syno_perf_indicator.flags))
		goto out;

	now = jiffies;
	elapsed = now - fs_info->syno_perf_indicator.dirty_limit_stamp;

	/* Update at most once per second. */
	if (elapsed < HZ)
		goto out_unlock;

	global_dirty_limits(&background_thresh, &dirty_thresh);
	fs_info->syno_perf_indicator.dirty_thresh = dirty_thresh;
	fs_info->syno_perf_indicator.dirty_background_thresh = background_thresh;
	fs_info->syno_perf_indicator.dirty_limit_stamp = now;

out_unlock:
	clear_bit(SYNO_PERF_INDICATROT_FLAG_DIRTY_LIMIT_UPDATE, &fs_info->syno_perf_indicator.flags);
out:
	return;
}
#endif /* defined(MY_ABC_HERE) || defined(MY_ABC_HERE) */

#ifdef MY_ABC_HERE
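/*
 * Synology-specific (MY_ABC_HERE): decide whether the async btree metadata
 * flush should run.  It is skipped while the fs is closing or remounting,
 * while a transaction commit is in its critical section, when dirty
 * metadata is below BTRFS_DIRTY_METADATA_THRESH, or when overall dirty
 * pages are still under the background writeback threshold.
 */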
static bool btrfs_check_need_async_meta_flush(struct btrfs_fs_info *fs_info, bool worker)
{
	bool ret = false;

	if (btrfs_fs_closing(fs_info) || test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto out;

	if (!worker && work_busy(&fs_info->syno_async_metadata_flush_work))
		goto out;

	if (!writeback_in_progress(&fs_info->sb->s_bdi->wb))
		goto out;

	if (__percpu_counter_compare(&fs_info->dirty_metadata_bytes,
			BTRFS_DIRTY_METADATA_THRESH, fs_info->dirty_metadata_batch) <= 0)
		goto out;

	spin_lock(&fs_info->trans_lock);
	if (fs_info->running_transaction &&
	    fs_info->running_transaction->state == TRANS_STATE_COMMIT_DOING) {
		spin_unlock(&fs_info->trans_lock);
		goto out;
	}
	spin_unlock(&fs_info->trans_lock);

	syno_perf_indicator_dirty_limit_update(fs_info);
	if (global_node_page_state(NR_FILE_DIRTY) < fs_info->syno_perf_indicator.dirty_background_thresh)
		goto out;

	ret = true;
out:
	return ret;
}

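/*
 * Worker that writes back dirty btree pages in batches, looping with a short
 * sleep while btrfs_check_need_async_meta_flush() keeps asking for more.
 */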
static void btrfs_syno_async_metadata_flush(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct blk_plug plug;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.range_cyclic = 1,
	};

	fs_info = container_of(work, struct btrfs_fs_info, syno_async_metadata_flush_work);

again:
	blk_start_plug(&plug);
	wbc.nr_to_write = BTRFS_DIRTY_METADATA_THRESH >> (PAGE_SHIFT + 1);
	btree_write_cache_pages(fs_info->btree_inode->i_mapping, &wbc);
	blk_finish_plug(&plug);

	if (btrfs_check_need_async_meta_flush(fs_info, true)) {
		schedule_timeout_interruptible(msecs_to_jiffies(100));
		goto again;
	}
	schedule_timeout_interruptible(msecs_to_jiffies(1000));

	return;
}

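/*
 * Kick the async metadata flush worker if needed and, when @throttle is set
 * and dirty metadata is over threshold, throttle the caller via
 * balance_dirty_pages_ratelimited() on the btree inode.
 */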
void btrfs_syno_btree_balance_dirty(struct btrfs_fs_info *fs_info, bool throttle)
{
	bool async_meta_flush = false;

	async_meta_flush = btrfs_check_need_async_meta_flush(fs_info, false);
	if (async_meta_flush)
		queue_work(system_unbound_wq, &fs_info->syno_async_metadata_flush_work);

	if (!throttle)
		goto out;

	/*
	 * If we need the async meta flush we already checked the dirty
	 * metadata bytes, so skip the double check.
	 */
	if (!async_meta_flush) {
		if (__percpu_counter_compare(&fs_info->dirty_metadata_bytes,
				BTRFS_DIRTY_METADATA_THRESH, fs_info->dirty_metadata_batch) <= 0)
			goto out;
	}

	balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
out:
	return;
}
#endif /* MY_ABC_HERE */

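/*
 * Wire up all of the async reclaim/flush work items; called once at mount.
 */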
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
	INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
	INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
#ifdef MY_ABC_HERE
	INIT_WORK(&fs_info->syno_async_metadata_reclaim_work, btrfs_syno_async_reclaim_metadata_space);
#endif /* MY_ABC_HERE */
#ifdef MY_ABC_HERE
	INIT_WORK(&fs_info->syno_async_data_flush_work, btrfs_syno_async_data_flush);
#endif /* MY_ABC_HERE */
#ifdef MY_ABC_HERE
	INIT_WORK(&fs_info->syno_async_metadata_flush_work, btrfs_syno_async_metadata_flush);
#endif /* MY_ABC_HERE */
}

static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

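/*
 * Priority (non-waiting) reclaim: run the given flush states ourselves until
 * the ticket is satisfied or the states are exhausted.
 */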
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < states_nr);
}

static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					struct reserve_ticket *ticket)
{
	while (!space_info->full) {
		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	}
}

static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			/*
			 * Delete us from the list.  After we unlock the space
			 * info, we don't want the async reclaim job to reserve
			 * space for this ticket.  If that would happen, then the
			 * ticket's task would not know that space was reserved
			 * despite getting an error, resulting in a space leak
			 * (bytes_may_use counter of our space_info).
			 */
			remove_ticket(space_info, ticket);
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/**
 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
 * @fs_info - the fs
 * @space_info - the space_info for the reservation
 * @ticket - the ticket for the reservation
 * @flush - how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 enum btrfs_reserve_flush_enum flush)
{
	int ret;

	switch (flush) {
	case BTRFS_RESERVE_FLUSH_DATA:
	case BTRFS_RESERVE_FLUSH_ALL:
	case BTRFS_RESERVE_FLUSH_ALL_STEAL:
		wait_reserve_ticket(fs_info, space_info, ticket);
		break;
	case BTRFS_RESERVE_FLUSH_LIMIT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_EVICT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
		priority_reclaim_data_space(fs_info, space_info, ticket);
		break;
	default:
		ASSERT(0);
		break;
	}

	spin_lock(&space_info->lock);
	ret = ticket->error;
	if (ticket->bytes || ticket->error) {
		/*
		 * We were a priority ticket, so we need to delete ourselves
		 * from the list.  Because we could have other priority tickets
		 * behind us that require less space, run
		 * btrfs_try_granting_tickets() to see if their reservations can
		 * now be made.
		 */
		if (!list_empty(&ticket->list)) {
			remove_ticket(space_info, ticket);
			btrfs_try_granting_tickets(fs_info, space_info);
		}

		if (!ret)
			ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);
	ASSERT(list_empty(&ticket->list));
	/*
	 * Check that we can't have an error set if the reservation succeeded,
	 * as that would confuse tasks and lead them to error out without
	 * releasing reserved space (if an error happens the expectation is that
	 * space wasn't reserved at all).
	 */
	ASSERT(!(ticket->bytes == 0 && ticket->error));
	return ret;
}

/*
 * This returns true if this flush state will go through the ordinary flushing
 * code.
 */
static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
{
	return (flush == BTRFS_RESERVE_FLUSH_ALL) ||
		(flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
}

/**
 * __reserve_bytes - try to reserve bytes from the block_rsv's space
 * @fs_info - the filesystem
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int __reserve_bytes(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *space_info, u64 orig_bytes,
			   enum btrfs_reserve_flush_enum flush)
{
	struct work_struct *async_work;
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;
	bool pending_tickets;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	if (flush == BTRFS_RESERVE_FLUSH_DATA)
		async_work = &fs_info->async_data_reclaim_work;
	else
		async_work = &fs_info->async_reclaim_work;

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);

	/*
	 * We don't want NO_FLUSH allocations to jump everybody, they can
	 * generally handle ENOSPC in a different way, so treat them the same as
	 * normal flushers when it comes to skipping pending tickets.
	 */
	if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
		pending_tickets = !list_empty(&space_info->tickets) ||
			!list_empty(&space_info->priority_tickets);
	else
		pending_tickets = !list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= space_info->total_bytes) ||
	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		space_info->reclaim_size += ticket.bytes;
		init_waitqueue_head(&ticket.wait);
		ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
		if (flush == BTRFS_RESERVE_FLUSH_ALL ||
		    flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
		    flush == BTRFS_RESERVE_FLUSH_DATA) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq, async_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
#ifdef MY_ABC_HERE
		if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA &&
		    !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info, used)) {
			if (!work_busy(&fs_info->syno_async_metadata_reclaim_work))
				queue_work(system_unbound_wq, &fs_info->syno_async_metadata_reclaim_work);
#ifdef MY_ABC_HERE
			if (!work_busy(&fs_info->syno_async_data_flush_work) &&
			    !list_empty(&fs_info->syno_dirty_lru_inodes))
				queue_work(system_unbound_wq, &fs_info->syno_async_data_flush_work);
#endif /* MY_ABC_HERE */
		}
#endif /* MY_ABC_HERE */
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info, used)) {
			if (!work_busy(&fs_info->async_reclaim_work)) {
				trace_btrfs_trigger_flush(fs_info, space_info->flags,
							  orig_bytes, flush, "preempt");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
#ifdef MY_ABC_HERE
			if (!work_busy(&fs_info->syno_async_metadata_reclaim_work))
				queue_work(system_unbound_wq, &fs_info->syno_async_metadata_reclaim_work);
#ifdef MY_ABC_HERE
			if (!work_busy(&fs_info->syno_async_data_flush_work) &&
			    !list_empty(&fs_info->syno_dirty_lru_inodes))
				queue_work(system_unbound_wq, &fs_info->syno_async_data_flush_work);
#endif /* MY_ABC_HERE */
#endif /* MY_ABC_HERE */
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;

	ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}

/**
 * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation
 * @fs_info - the filesystem
 * @bytes - the number of bytes we need
 * @flush - how we are allowed to flush
 *
 * This will reserve bytes from the data space info.  If there is not enough
 * space then we will attempt to flush space as specified by flush.
 */
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
			     enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
	int ret;

	ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
	       flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);

	ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      data_sinfo->flags, bytes, 1);
		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
	}
	return ret;
}