2018-06-06 09:42:14 +07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
2005-11-02 10:58:39 +07:00
|
|
|
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
|
|
|
|
* All Rights Reserved.
|
2005-04-17 05:20:36 +07:00
|
|
|
*/
|
|
|
|
#ifndef __XFS_MOUNT_H__
|
|
|
|
#define __XFS_MOUNT_H__
|
|
|
|
|
2012-06-14 21:22:15 +07:00
|
|
|
struct xlog;
|
2005-04-17 05:20:36 +07:00
|
|
|
struct xfs_inode;
|
2007-07-11 08:09:12 +07:00
|
|
|
struct xfs_mru_cache;
|
2008-05-21 13:41:01 +07:00
|
|
|
struct xfs_nameops;
|
2008-10-30 13:38:26 +07:00
|
|
|
struct xfs_ail;
|
2009-06-08 20:33:32 +07:00
|
|
|
struct xfs_quotainfo;
|
2013-10-29 18:11:46 +07:00
|
|
|
struct xfs_dir_ops;
|
2014-06-06 12:01:58 +07:00
|
|
|
struct xfs_da_geometry;
|
2009-06-08 20:33:32 +07:00
|
|
|
|
2011-01-04 07:35:03 +07:00
|
|
|
/* dynamic preallocation free space thresholds, 5% down to 1% */
|
|
|
|
/*
 * Indices into the per-mount low free space threshold table
 * (m_low_space[]), one slot per percentage step, plus the slot count.
 */
enum {
	XFS_LOWSP_1_PCNT	= 0,
	XFS_LOWSP_2_PCNT	= 1,
	XFS_LOWSP_3_PCNT	= 2,
	XFS_LOWSP_4_PCNT	= 3,
	XFS_LOWSP_5_PCNT	= 4,
	XFS_LOWSP_MAX		= 5,	/* number of thresholds */
};
|
|
|
|
|
2016-05-18 07:58:51 +07:00
|
|
|
/*
|
|
|
|
* Error Configuration
|
|
|
|
*
|
|
|
|
* Error classes define the subsystem the configuration belongs to.
|
|
|
|
* Error numbers define the errors that are configurable.
|
|
|
|
*/
|
|
|
|
/* Error classes; XFS_ERR_CLASS_MAX is the number of classes. */
enum {
	XFS_ERR_METADATA	= 0,
	XFS_ERR_CLASS_MAX	= 1,
};
|
|
|
|
/* Configurable error numbers; XFS_ERR_ERRNO_MAX is the number of entries. */
enum {
	XFS_ERR_DEFAULT		= 0,
	XFS_ERR_EIO		= 1,
	XFS_ERR_ENOSPC		= 2,
	XFS_ERR_ENODEV		= 3,
	XFS_ERR_ERRNO_MAX	= 4,
};
|
|
|
|
|
2016-05-18 08:08:15 +07:00
|
|
|
/*
 * Special retry value meaning "retry forever".  Parenthesized so the
 * negative literal expands safely inside any larger expression.
 */
#define XFS_ERR_RETRY_FOREVER	(-1)
|
|
|
|
|
2016-09-14 04:51:30 +07:00
|
|
|
/*
|
|
|
|
* Although retry_timeout is in jiffies which is normally an unsigned long,
|
|
|
|
* we limit the retry timeout to 86400 seconds, or one day. So even a
|
|
|
|
* signed 32-bit long is sufficient for a HZ value up to 24855. Making it
|
|
|
|
* signed lets us store the special "-1" value, meaning retry forever.
|
|
|
|
*/
|
2016-05-18 07:58:51 +07:00
|
|
|
struct xfs_error_cfg {
|
|
|
|
struct xfs_kobj kobj;
|
|
|
|
int max_retries;
|
2016-09-14 04:51:30 +07:00
|
|
|
long retry_timeout; /* in jiffies, -1 = infinite */
|
2016-05-18 07:58:51 +07:00
|
|
|
};
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
typedef struct xfs_mount {
|
2007-08-30 14:21:30 +07:00
|
|
|
struct super_block *m_super;
|
2005-04-17 05:20:36 +07:00
|
|
|
xfs_tid_t m_tid; /* next unused tid for fs */
|
2019-04-12 21:40:25 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Bitsets of per-fs metadata that have been checked and/or are sick.
|
|
|
|
* Callers must hold m_sb_lock to access these two fields.
|
|
|
|
*/
|
|
|
|
uint8_t m_fs_checked;
|
|
|
|
uint8_t m_fs_sick;
|
|
|
|
/*
|
|
|
|
* Bitsets of rt metadata that have been checked and/or are sick.
|
|
|
|
* Callers must hold m_sb_lock to access this field.
|
|
|
|
*/
|
|
|
|
uint8_t m_rt_checked;
|
|
|
|
uint8_t m_rt_sick;
|
|
|
|
|
2008-10-30 13:38:26 +07:00
|
|
|
struct xfs_ail *m_ail; /* fs active log item list */
|
2015-02-23 17:19:28 +07:00
|
|
|
|
|
|
|
struct xfs_sb m_sb; /* copy of fs superblock */
|
2007-10-11 14:42:32 +07:00
|
|
|
spinlock_t m_sb_lock; /* sb counter lock */
|
2015-02-23 17:19:28 +07:00
|
|
|
struct percpu_counter m_icount; /* allocated inodes counter */
|
2015-02-23 17:19:53 +07:00
|
|
|
struct percpu_counter m_ifree; /* free inodes counter */
|
2015-02-23 17:22:03 +07:00
|
|
|
struct percpu_counter m_fdblocks; /* free block counter */
|
2019-04-26 08:26:22 +07:00
|
|
|
/*
|
|
|
|
* Count of data device blocks reserved for delayed allocations,
|
|
|
|
* including indlen blocks. Does not include allocated CoW staging
|
|
|
|
* extents or anything related to the rt device.
|
|
|
|
*/
|
|
|
|
struct percpu_counter m_delalloc_blks;
|
2015-02-23 17:19:28 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
struct xfs_buf *m_sb_bp; /* buffer for superblock */
|
|
|
|
char *m_fsname; /* filesystem name */
|
|
|
|
int m_fsname_len; /* strlen of fs name */
|
2005-11-02 07:44:33 +07:00
|
|
|
char *m_rtname; /* realtime device name */
|
|
|
|
char *m_logname; /* external log device name */
|
2005-04-17 05:20:36 +07:00
|
|
|
int m_bsize; /* fs logical block size */
|
|
|
|
xfs_agnumber_t m_agfrotor; /* last ag where space found */
|
|
|
|
xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
|
2007-10-11 14:43:43 +07:00
|
|
|
spinlock_t m_agirotor_lock;/* .. and lock protecting it */
|
2005-04-17 05:20:36 +07:00
|
|
|
xfs_agnumber_t m_maxagi; /* highest inode alloc group */
|
|
|
|
uint m_readio_log; /* min read size log bytes */
|
|
|
|
uint m_readio_blocks; /* min read size blocks */
|
|
|
|
uint m_writeio_log; /* min write size log bytes */
|
|
|
|
uint m_writeio_blocks; /* min write size blocks */
|
2014-06-06 12:01:58 +07:00
|
|
|
struct xfs_da_geometry *m_dir_geo; /* directory block geometry */
|
|
|
|
struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */
|
2012-06-14 21:22:15 +07:00
|
|
|
struct xlog *m_log; /* log specific stuff */
|
2005-04-17 05:20:36 +07:00
|
|
|
int m_logbufs; /* number of log buffers */
|
|
|
|
int m_logbsize; /* size of each log buffer */
|
|
|
|
uint m_rsumlevels; /* rt summary levels */
|
|
|
|
uint m_rsumsize; /* size of rt summary, bytes */
|
xfs: cache minimum realtime summary level
The realtime summary is a two-dimensional array on disk, effectively:
u32 rsum[log2(number of realtime extents) + 1][number of blocks in the bitmap]
rsum[log][bbno] is the number of extents of size 2**log which start in
bitmap block bbno.
xfs_rtallocate_extent_near() uses xfs_rtany_summary() to check whether
rsum[log][bbno] != 0 for any log level. However, the summary array is
stored in row-major order (i.e., like an array in C), so all of these
entries are not adjacent, but rather spread across the entire summary
file. In the worst case (a full bitmap block), xfs_rtany_summary() has
to check every level.
This means that on a moderately-used realtime device, an allocation will
waste a lot of time finding, reading, and releasing buffers for the
realtime summary. In particular, one of our storage services (which runs
on servers with 8 very slow CPUs and 15 8 TB XFS realtime filesystems)
spends almost 5% of its CPU cycles in xfs_rtbuf_get() and
xfs_trans_brelse() called from xfs_rtany_summary().
One solution would be to also store the summary with the dimensions
swapped. However, this would require a disk format change to a very old
component of XFS.
Instead, we can cache the minimum size which contains any extents. We do
so lazily; rather than guaranteeing that the cache contains the precise
minimum, it always contains a loose lower bound which we tighten when we
read or update a summary block. This only uses a few kilobytes of memory
and is already serialized via the realtime bitmap and summary inode
locks, so the cost is minimal. With this change, the same workload only
spends 0.2% of its CPU cycles in the realtime allocator.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2018-12-12 23:46:32 +07:00
|
|
|
/*
|
|
|
|
* Optional cache of rt summary level per bitmap block with the
|
|
|
|
* invariant that m_rsum_cache[bbno] <= the minimum i for which
|
|
|
|
* rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
|
|
|
|
* inode lock.
|
|
|
|
*/
|
|
|
|
uint8_t *m_rsum_cache;
|
2005-04-17 05:20:36 +07:00
|
|
|
struct xfs_inode *m_rbmip; /* pointer to bitmap inode */
|
|
|
|
struct xfs_inode *m_rsumip; /* pointer to summary inode */
|
|
|
|
struct xfs_inode *m_rootip; /* pointer to root directory */
|
|
|
|
struct xfs_quotainfo *m_quotainfo; /* disk quota information */
|
|
|
|
xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
|
|
|
|
xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
|
|
|
|
xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
|
2017-06-17 01:00:05 +07:00
|
|
|
uint8_t m_blkbit_log; /* blocklog + NBBY */
|
|
|
|
uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
|
|
|
|
uint8_t m_agno_log; /* log #ag's */
|
|
|
|
uint8_t m_agino_log; /* #bits for agino in inum */
|
xfs: increase inode cluster size for v5 filesystems
v5 filesystems use 512 byte inodes as a minimum, so read inodes in
clusters that are effectively half the size of a v4 filesystem with
256 byte inodes. For v5 fielsystems, scale the inode cluster size
with the size of the inode so that we keep a constant 32 inodes per
cluster ratio for all inode IO.
This only works if mkfs.xfs sets the inode alignment appropriately
for larger inode clusters, so this functionality is made conditional
on mkfs doing the right thing. xfs_repair needs to know about
the inode alignment changes, too.
Wall time:
create bulkstat find+stat ls -R unlink
v4 237s 161s 173s 201s 299s
v5 235s 163s 205s 31s 356s
patched 234s 160s 182s 29s 317s
System time:
create bulkstat find+stat ls -R unlink
v4 2601s 2490s 1653s 1656s 2960s
v5 2637s 2497s 1681s 20s 3216s
patched 2613s 2451s 1658s 20s 3007s
So, wall time same or down across the board, system time same or
down across the board, and cache hit rates all improve except for
the ls -R case which is a pure cold cache directory read workload
on v5 filesystems...
So, this patch removes most of the performance and CPU usage
differential between v4 and v5 filesystems on traversal related
workloads.
Note: while this patch is currently for v5 filesystems only, there
is no reason it can't be ported back to v4 filesystems. This hasn't
been done here because bringing the code back to v4 requires
forwards and backwards kernel compatibility testing. i.e. to
deterine if older kernels(*) do the right thing with larger inode
alignments but still only using 8k inode cluster sizes. None of this
testing and validation on v4 filesystems has been done, so for the
moment larger inode clusters is limited to v5 superblocks.
(*) a current default config v4 filesystem should mount just fine on
2.6.23 (when lazy-count support was introduced), and so if we change
the alignment emitted by mkfs without a feature bit then we have to
make sure it works properly on all kernels since 2.6.23. And if we
allow it to be changed when the lazy-count bit is not set, then it's
all kernels since v2 logs were introduced that need to be tested for
compatibility...
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-11-01 11:27:20 +07:00
|
|
|
uint m_inode_cluster_size;/* min inode buf size */
|
2018-12-12 23:46:25 +07:00
|
|
|
unsigned int m_inodes_per_cluster;
|
|
|
|
unsigned int m_blocks_per_cluster;
|
2018-12-12 23:46:25 +07:00
|
|
|
unsigned int m_cluster_align;
|
|
|
|
unsigned int m_cluster_align_inodes;
|
2005-04-17 05:20:36 +07:00
|
|
|
uint m_blockmask; /* sb_blocksize-1 */
|
|
|
|
uint m_blockwsize; /* sb_blocksize in words */
|
|
|
|
uint m_blockwmask; /* blockwsize-1 */
|
2008-10-30 13:11:19 +07:00
|
|
|
uint m_alloc_mxr[2]; /* max alloc btree records */
|
|
|
|
uint m_alloc_mnr[2]; /* min alloc btree records */
|
|
|
|
uint m_bmap_dmxr[2]; /* max bmap btree records */
|
|
|
|
uint m_bmap_dmnr[2]; /* min bmap btree records */
|
|
|
|
uint m_inobt_mxr[2]; /* max inobt btree records */
|
|
|
|
uint m_inobt_mnr[2]; /* min inobt btree records */
|
2016-08-03 08:36:07 +07:00
|
|
|
uint m_rmap_mxr[2]; /* max rmap btree records */
|
|
|
|
uint m_rmap_mnr[2]; /* min rmap btree records */
|
2016-10-03 23:11:18 +07:00
|
|
|
uint m_refc_mxr[2]; /* max refc btree records */
|
|
|
|
uint m_refc_mnr[2]; /* min refc btree records */
|
2005-04-17 05:20:36 +07:00
|
|
|
uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
|
|
|
|
uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
|
2009-02-09 14:37:14 +07:00
|
|
|
uint m_in_maxlevels; /* max inobt btree levels. */
|
2016-08-03 08:36:07 +07:00
|
|
|
uint m_rmap_maxlevels; /* max rmap btree levels */
|
2016-10-03 23:11:18 +07:00
|
|
|
uint m_refc_maxlevels; /* max refcount btree level */
|
2016-08-03 08:31:47 +07:00
|
|
|
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
|
2016-08-03 08:38:24 +07:00
|
|
|
uint m_alloc_set_aside; /* space we can't use */
|
|
|
|
uint m_ag_max_usable; /* max space per AG */
|
2010-01-11 18:47:44 +07:00
|
|
|
struct radix_tree_root m_perag_tree; /* per-ag accounting info */
|
|
|
|
spinlock_t m_perag_lock; /* lock for m_perag_tree */
|
2007-08-30 14:21:54 +07:00
|
|
|
struct mutex m_growlock; /* growfs mutex */
|
2005-04-17 05:20:36 +07:00
|
|
|
int m_fixedfsid[2]; /* unchanged for life of FS */
|
2017-06-17 01:00:05 +07:00
|
|
|
uint64_t m_flags; /* global mount flags */
|
2019-02-15 00:33:15 +07:00
|
|
|
bool m_finobt_nores; /* no per-AG finobt resv. */
|
2005-04-17 05:20:36 +07:00
|
|
|
int m_ialloc_inos; /* inodes in inode allocation */
|
|
|
|
int m_ialloc_blks; /* blocks in inode allocation */
|
2015-05-29 05:55:20 +07:00
|
|
|
int m_ialloc_min_blks;/* min blocks in sparse inode
|
|
|
|
* allocation */
|
2005-04-17 05:20:36 +07:00
|
|
|
int m_inoalign_mask;/* mask sb_inoalignmt if used */
|
|
|
|
uint m_qflags; /* quota status flags */
|
2013-08-12 17:49:56 +07:00
|
|
|
struct xfs_trans_resv m_resv; /* precomputed res values */
|
2017-06-17 01:00:05 +07:00
|
|
|
uint64_t m_maxicount; /* maximum inode count */
|
|
|
|
uint64_t m_resblks; /* total reserved blocks */
|
|
|
|
uint64_t m_resblks_avail;/* available reserved blocks */
|
|
|
|
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
|
2005-04-17 05:20:36 +07:00
|
|
|
int m_dalign; /* stripe unit */
|
|
|
|
int m_swidth; /* stripe width */
|
2006-03-29 05:55:14 +07:00
|
|
|
int m_sinoalign; /* stripe unit inode alignment */
|
2017-06-17 01:00:05 +07:00
|
|
|
uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
|
2008-05-21 13:41:01 +07:00
|
|
|
const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
|
2013-10-29 18:11:46 +07:00
|
|
|
const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
|
2013-10-29 18:11:51 +07:00
|
|
|
const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
|
2005-04-17 05:20:36 +07:00
|
|
|
uint m_chsize; /* size of next field */
|
|
|
|
atomic_t m_active_trans; /* number trans frozen */
|
2007-07-11 08:09:12 +07:00
|
|
|
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
|
2011-04-08 09:45:07 +07:00
|
|
|
struct delayed_work m_reclaim_work; /* background inode reclaim */
|
2012-11-06 21:50:47 +07:00
|
|
|
struct delayed_work m_eofblocks_work; /* background eof blocks
|
|
|
|
trimming */
|
2016-10-03 23:11:46 +07:00
|
|
|
struct delayed_work m_cowblocks_work; /* background cow blocks
|
|
|
|
trimming */
|
2015-01-22 05:10:31 +07:00
|
|
|
bool m_update_sb; /* sb needs update in mount */
|
2011-01-04 07:35:03 +07:00
|
|
|
int64_t m_low_space[XFS_LOWSP_MAX];
|
|
|
|
/* low free space thresholds */
|
2014-07-15 05:07:01 +07:00
|
|
|
struct xfs_kobj m_kobj;
|
2016-05-18 07:58:51 +07:00
|
|
|
struct xfs_kobj m_error_kobj;
|
2016-05-18 08:01:00 +07:00
|
|
|
struct xfs_kobj m_error_meta_kobj;
|
2016-05-18 07:58:51 +07:00
|
|
|
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
|
2015-10-12 14:21:19 +07:00
|
|
|
struct xstats m_stats; /* per-fs stats */
|
2012-02-29 16:53:48 +07:00
|
|
|
|
2014-11-28 09:59:58 +07:00
|
|
|
struct workqueue_struct *m_buf_workqueue;
|
2012-02-29 16:53:48 +07:00
|
|
|
struct workqueue_struct *m_unwritten_workqueue;
|
2012-04-23 14:54:32 +07:00
|
|
|
struct workqueue_struct *m_cil_workqueue;
|
2012-10-08 17:56:05 +07:00
|
|
|
struct workqueue_struct *m_reclaim_workqueue;
|
|
|
|
struct workqueue_struct *m_log_workqueue;
|
2012-11-06 21:50:47 +07:00
|
|
|
struct workqueue_struct *m_eofblocks_workqueue;
|
2017-03-29 04:51:44 +07:00
|
|
|
struct workqueue_struct *m_sync_workqueue;
|
2015-02-16 07:49:23 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Generation of the filesysyem layout. This is incremented by each
|
|
|
|
* growfs, and used by the pNFS server to ensure the client updates
|
|
|
|
* its view of the block device once it gets a layout that might
|
|
|
|
* reference the newly added blocks. Does not need to be persistent
|
|
|
|
* as long as we only allow file system size increments, but if we
|
|
|
|
* ever support shrinks it would have to be persisted in addition
|
|
|
|
* to various other kinds of pain inflicted on the pNFS server.
|
|
|
|
*/
|
2017-06-17 01:00:05 +07:00
|
|
|
uint32_t m_generation;
|
2016-03-15 07:42:44 +07:00
|
|
|
|
2019-02-19 00:38:49 +07:00
|
|
|
bool m_always_cow;
|
2016-05-18 08:11:27 +07:00
|
|
|
bool m_fail_unmount;
|
2016-03-15 07:42:44 +07:00
|
|
|
#ifdef DEBUG
|
2017-06-21 07:54:46 +07:00
|
|
|
/*
|
|
|
|
* Frequency with which errors are injected. Replaces xfs_etest; the
|
|
|
|
* value stored in here is the inverse of the frequency with which the
|
|
|
|
* error triggers. 1 = always, 2 = half the time, etc.
|
|
|
|
*/
|
|
|
|
unsigned int *m_errortag;
|
2017-06-21 07:54:47 +07:00
|
|
|
struct xfs_kobj m_errortag_kobj;
|
2016-03-15 07:42:44 +07:00
|
|
|
#endif
|
2005-04-17 05:20:36 +07:00
|
|
|
} xfs_mount_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Flags for m_flags.
|
|
|
|
*/
|
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unlink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means that when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 12:26:31 +07:00
|
|
|
/*
 * for nfs - all metadata ops must be synchronous except for space
 * allocations
 */
#define XFS_MOUNT_WSYNC		(1ULL << 0)
/* filesystem is unmounting */
#define XFS_MOUNT_UNMOUNTING	(1ULL << 1)
#define XFS_MOUNT_WAS_CLEAN	(1ULL << 3)
/*
 * atomic stop of all filesystem operations, typically for disk errors
 * in metadata
 */
#define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)
/* discard unused blocks */
#define XFS_MOUNT_DISCARD	(1ULL << 5)
/* turn off stripe alignment allocations */
#define XFS_MOUNT_NOALIGN	(1ULL << 7)
/* allow use of attr2 format */
#define XFS_MOUNT_ATTR2		(1ULL << 8)
/* group-ID assigned from directory */
#define XFS_MOUNT_GRPID		(1ULL << 9)
/* no recovery - dirty fs */
#define XFS_MOUNT_NORECOVERY	(1ULL << 10)
/* set default i/o size */
#define XFS_MOUNT_DFLT_IOSIZE	(1ULL << 12)
/* user wants 32bit inodes */
#define XFS_MOUNT_SMALL_INUMS	(1ULL << 14)
/* inode32 allocator active */
#define XFS_MOUNT_32BITINODES	(1ULL << 15)
/* ignore uuid during mount */
#define XFS_MOUNT_NOUUID	(1ULL << 16)
/* keep empty inode clusters */
#define XFS_MOUNT_IKEEP		(1ULL << 18)
/* turn on stripe width allocation */
#define XFS_MOUNT_SWALLOC	(1ULL << 19)
/* read-only fs */
#define XFS_MOUNT_RDONLY	(1ULL << 20)
/* synchronous directory ops */
#define XFS_MOUNT_DIRSYNC	(1ULL << 21)
/* don't report large preferred I/O size in stat() */
#define XFS_MOUNT_COMPAT_IOSIZE	(1ULL << 22)
/* enable the filestreams allocator */
#define XFS_MOUNT_FILESTREAMS	(1ULL << 24)
/* disable use of attr2 format */
#define XFS_MOUNT_NOATTR2	(1ULL << 25)

/* TEST ONLY! */
#define XFS_MOUNT_DAX		(1ULL << 62)
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Default minimum read and write sizes.
|
|
|
|
*/
|
|
|
|
#define XFS_READIO_LOG_LARGE	16
#define XFS_WRITEIO_LOG_LARGE	16

/*
 * Upper and lower bounds on the I/O preallocation size that can be set
 * through mount options.
 */
#define XFS_MAX_IO_LOG		30	/* 1G */
#define XFS_MIN_IO_LOG		PAGE_SHIFT

/*
 * Read and write sizes for synchronous operation.  This should be
 * better for NFSv2 wsync filesystems.
 */
#define XFS_WSYNC_READIO_LOG	15	/* 32k */
#define XFS_WSYNC_WRITEIO_LOG	14	/* 16k */
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2005-11-02 06:33:05 +07:00
|
|
|
/*
|
|
|
|
* Allow large block sizes to be reported to userspace programs if the
|
2008-04-10 09:22:07 +07:00
|
|
|
* "largeio" mount option is used.
|
2005-11-02 06:33:05 +07:00
|
|
|
*
|
|
|
|
* If compatibility mode is specified, simply return the basic unit of caching
|
|
|
|
* so that we don't get inefficient read/modify/write I/O from user apps.
|
|
|
|
* Otherwise....
|
|
|
|
*
|
|
|
|
* If the underlying volume is a stripe, then return the stripe width in bytes
|
|
|
|
* as the recommended I/O size. If it is not a stripe and we've set a default
|
|
|
|
* buffered I/O size, return that, otherwise return the compat default.
|
|
|
|
*/
|
|
|
|
static inline unsigned long
|
|
|
|
xfs_preferred_iosize(xfs_mount_t *mp)
|
|
|
|
{
|
|
|
|
if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 19:29:47 +07:00
|
|
|
return PAGE_SIZE;
|
2005-11-02 06:33:05 +07:00
|
|
|
return (mp->m_swidth ?
|
|
|
|
(mp->m_swidth << mp->m_sb.sb_blocklog) :
|
|
|
|
((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
|
2018-06-07 21:54:02 +07:00
|
|
|
(1 << (int)max(mp->m_readio_log, mp->m_writeio_log)) :
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 19:29:47 +07:00
|
|
|
PAGE_SIZE));
|
2005-11-02 06:33:05 +07:00
|
|
|
}
|
|
|
|
|
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unlink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? The answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means that when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 12:26:31 +07:00
|
|
|
/* Nonzero when XFS_MOUNT_WAS_CLEAN is set in the mount's m_flags. */
#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp)	\
				((mp)->m_flags & XFS_MOUNT_WAS_CLEAN)
|
2005-04-17 05:20:36 +07:00
|
|
|
/* Nonzero when XFS_MOUNT_FS_SHUTDOWN is set in the mount's m_flags. */
#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
|
2007-08-30 14:20:39 +07:00
|
|
|
/*
 * Out-of-line worker behind the xfs_force_shutdown() macro, which passes
 * __FILE__ as @fname and __LINE__ as @lnnum.  The implementation is not
 * visible in this header; @flags presumably carries the SHUTDOWN_* values
 * defined below (TODO confirm against the definition).
 */
void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
		int lnnum);
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
 * Call xfs_do_force_shutdown() on mount @m with flags @f, recording the
 * caller's source file and line number.
 */
#define xfs_force_shutdown(m,f)	\
	xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2008-11-28 10:23:36 +07:00
|
|
|
/*
 * Shutdown flags.  Each value occupies its own bit, so they can be OR-ed
 * together; presumably these are the @flags accepted by
 * xfs_force_shutdown() (TODO confirm against the callers).
 */
#define SHUTDOWN_META_IO_ERROR	0x0001	/* write attempt to metadata failed */
#define SHUTDOWN_LOG_IO_ERROR	0x0002	/* write attempt to the log failed */
#define SHUTDOWN_FORCE_UMOUNT	0x0004	/* shutdown from a forced unmount */
#define SHUTDOWN_CORRUPT_INCORE	0x0008	/* corrupt in-memory data structures */
#define SHUTDOWN_REMOTE_REQ	0x0010	/* shutdown came from remote cell */
#define SHUTDOWN_DEVICE_REQ	0x0020	/* failed all paths to the device */
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
 * Flags for xfs_mountfs
 */
#define XFS_MFSI_QUIET		0x40	/* Be silent if mount errors found */
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2005-11-02 10:38:42 +07:00
|
|
|
static inline xfs_agnumber_t
|
|
|
|
xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2017-04-20 05:19:32 +07:00
|
|
|
xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);
|
2005-11-02 10:38:42 +07:00
|
|
|
do_div(ld, mp->m_sb.sb_agblocks);
|
|
|
|
return (xfs_agnumber_t) ld;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
2005-11-02 10:38:42 +07:00
|
|
|
static inline xfs_agblock_t
|
|
|
|
xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2017-04-20 05:19:32 +07:00
|
|
|
xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);
|
2005-11-02 10:38:42 +07:00
|
|
|
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is in contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 07:30:52 +07:00
|
|
|
/* per-AG block reservation data structures */

/*
 * Reservation types.  xfs_perag_resv() maps XFS_AG_RESV_METADATA and
 * XFS_AG_RESV_RMAPBT to the reservation structures embedded in struct
 * xfs_perag and returns NULL for the other values.
 */
enum xfs_ag_resv_type {
	XFS_AG_RESV_NONE = 0,
	XFS_AG_RESV_AGFL,
	XFS_AG_RESV_METADATA,
	XFS_AG_RESV_RMAPBT,
};
|
|
|
|
|
|
|
|
struct xfs_ag_resv {
|
|
|
|
/* number of blocks originally reserved here */
|
|
|
|
xfs_extlen_t ar_orig_reserved;
|
|
|
|
/* number of blocks reserved here */
|
|
|
|
xfs_extlen_t ar_reserved;
|
|
|
|
/* number of blocks originally asked for */
|
|
|
|
xfs_extlen_t ar_asked;
|
|
|
|
};
|
|
|
|
|
2013-08-12 17:49:55 +07:00
|
|
|
/*
|
|
|
|
* Per-ag incore structure, copies of information in agf and agi, to improve the
|
2014-11-28 10:25:04 +07:00
|
|
|
* performance of allocation group selection.
|
2013-08-12 17:49:55 +07:00
|
|
|
*/
|
|
|
|
typedef struct xfs_perag {
|
|
|
|
struct xfs_mount *pag_mount; /* owner filesystem */
|
|
|
|
xfs_agnumber_t pag_agno; /* AG this structure belongs to */
|
|
|
|
atomic_t pag_ref; /* perag reference count */
|
|
|
|
char pagf_init; /* this agf's entry is initialized */
|
|
|
|
char pagi_init; /* this agi's entry is initialized */
|
|
|
|
char pagf_metadata; /* the agf is preferred to be metadata */
|
|
|
|
char pagi_inodeok; /* The agi is ok for inodes */
|
2017-06-17 01:00:05 +07:00
|
|
|
uint8_t pagf_levels[XFS_BTNUM_AGF];
|
2013-08-12 17:49:55 +07:00
|
|
|
/* # of levels in bno & cnt btree */
|
2018-03-16 00:51:58 +07:00
|
|
|
bool pagf_agflreset; /* agfl requires reset before use */
|
2017-06-17 01:00:05 +07:00
|
|
|
uint32_t pagf_flcount; /* count of blocks in freelist */
|
2013-08-12 17:49:55 +07:00
|
|
|
xfs_extlen_t pagf_freeblks; /* total free blocks */
|
|
|
|
xfs_extlen_t pagf_longest; /* longest free space */
|
2017-06-17 01:00:05 +07:00
|
|
|
uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
|
2013-08-12 17:49:55 +07:00
|
|
|
xfs_agino_t pagi_freecount; /* number of free inodes */
|
|
|
|
xfs_agino_t pagi_count; /* number of allocated inodes */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Inode allocation search lookup optimisation.
|
|
|
|
* If the pagino matches, the search for new inodes
|
|
|
|
* doesn't need to search the near ones again straight away
|
|
|
|
*/
|
|
|
|
xfs_agino_t pagl_pagino;
|
|
|
|
xfs_agino_t pagl_leftrec;
|
|
|
|
xfs_agino_t pagl_rightrec;
|
2019-04-12 21:40:25 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Bitsets of per-ag metadata that have been checked and/or are sick.
|
|
|
|
* Callers should hold pag_state_lock before accessing this field.
|
|
|
|
*/
|
|
|
|
uint16_t pag_checked;
|
|
|
|
uint16_t pag_sick;
|
|
|
|
spinlock_t pag_state_lock;
|
|
|
|
|
2013-08-12 17:49:55 +07:00
|
|
|
spinlock_t pagb_lock; /* lock for pagb_tree */
|
|
|
|
struct rb_root pagb_tree; /* ordered tree of busy extents */
|
2017-02-08 05:06:57 +07:00
|
|
|
unsigned int pagb_gen; /* generation count for pagb_tree */
|
|
|
|
wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */
|
2013-08-12 17:49:55 +07:00
|
|
|
|
|
|
|
atomic_t pagf_fstrms; /* # of filestreams active in this AG */
|
|
|
|
|
|
|
|
spinlock_t pag_ici_lock; /* incore inode cache lock */
|
|
|
|
struct radix_tree_root pag_ici_root; /* incore inode cache root */
|
|
|
|
int pag_ici_reclaimable; /* reclaimable inodes */
|
|
|
|
struct mutex pag_ici_reclaim_lock; /* serialisation point */
|
|
|
|
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
|
|
|
|
|
|
|
|
/* buffer cache index */
|
2016-12-07 13:36:36 +07:00
|
|
|
spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
|
|
|
|
struct rhashtable pag_buf_hash;
|
2013-08-12 17:49:55 +07:00
|
|
|
|
|
|
|
/* for rcu-safe freeing */
|
|
|
|
struct rcu_head rcu_head;
|
|
|
|
int pagb_count; /* pagb slots in use */
|
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 07:30:52 +07:00
|
|
|
|
|
|
|
/* Blocks reserved for all kinds of metadata. */
|
|
|
|
struct xfs_ag_resv pag_meta_resv;
|
2018-03-10 05:01:59 +07:00
|
|
|
/* Blocks reserved for the reverse mapping btree. */
|
|
|
|
struct xfs_ag_resv pag_rmapbt_resv;
|
2016-10-03 23:11:16 +07:00
|
|
|
|
|
|
|
/* reference count */
|
2017-06-17 01:00:05 +07:00
|
|
|
uint8_t pagf_refcount_level;
|
2019-02-08 01:37:16 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Unlinked inode information. This incore information reflects
|
|
|
|
* data stored in the AGI, so callers must hold the AGI buffer lock
|
|
|
|
* or have some other means to control concurrency.
|
|
|
|
*/
|
|
|
|
struct rhashtable pagi_unlinked_hash;
|
2013-08-12 17:49:55 +07:00
|
|
|
} xfs_perag_t;
|
|
|
|
|
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is in contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 07:30:52 +07:00
|
|
|
static inline struct xfs_ag_resv *
|
|
|
|
xfs_perag_resv(
|
|
|
|
struct xfs_perag *pag,
|
|
|
|
enum xfs_ag_resv_type type)
|
|
|
|
{
|
|
|
|
switch (type) {
|
|
|
|
case XFS_AG_RESV_METADATA:
|
|
|
|
return &pag->pag_meta_resv;
|
2018-03-10 05:01:59 +07:00
|
|
|
case XFS_AG_RESV_RMAPBT:
|
|
|
|
return &pag->pag_rmapbt_resv;
|
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 07:30:52 +07:00
|
|
|
default:
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-12-07 13:36:36 +07:00
|
|
|
int xfs_buf_hash_init(xfs_perag_t *pag);
|
|
|
|
void xfs_buf_hash_destroy(xfs_perag_t *pag);
|
|
|
|
|
2015-11-03 09:06:34 +07:00
|
|
|
extern void xfs_uuid_table_free(void);
|
2011-06-30 05:10:14 +07:00
|
|
|
extern int xfs_log_sbcount(xfs_mount_t *);
|
2017-06-17 01:00:05 +07:00
|
|
|
extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
|
2008-08-13 13:49:32 +07:00
|
|
|
extern int xfs_mountfs(xfs_mount_t *mp);
|
2013-08-12 17:49:41 +07:00
|
|
|
extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
|
|
|
|
xfs_agnumber_t *maxagi);
|
2008-08-13 13:49:57 +07:00
|
|
|
extern void xfs_unmountfs(xfs_mount_t *);
|
2015-02-23 17:24:37 +07:00
|
|
|
|
2015-02-23 17:19:28 +07:00
|
|
|
extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta);
|
2015-02-23 17:19:53 +07:00
|
|
|
extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta);
|
2015-02-23 17:22:03 +07:00
|
|
|
extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
|
|
|
|
bool reserved);
|
2015-02-23 17:22:54 +07:00
|
|
|
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
|
2006-03-31 10:04:17 +07:00
|
|
|
extern int xfs_readsb(xfs_mount_t *, int);
|
2005-04-17 05:20:36 +07:00
|
|
|
extern void xfs_freesb(xfs_mount_t *);
|
2014-11-28 10:02:59 +07:00
|
|
|
extern bool xfs_fs_writable(struct xfs_mount *mp, int level);
|
2017-06-17 01:00:05 +07:00
|
|
|
extern int xfs_sb_validate_fsb_count(struct xfs_sb *, uint64_t);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2010-02-18 02:36:13 +07:00
|
|
|
extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
|
|
|
|
|
2011-01-04 07:35:03 +07:00
|
|
|
extern void xfs_set_low_space_thresholds(struct xfs_mount *);
|
|
|
|
|
2015-11-03 08:27:22 +07:00
|
|
|
int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
|
|
|
|
xfs_off_t count_fsb);
|
|
|
|
|
2016-05-18 08:05:33 +07:00
|
|
|
struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
|
|
|
|
int error_class, int error);
|
2018-07-20 23:28:40 +07:00
|
|
|
void xfs_force_summary_recalc(struct xfs_mount *mp);
|
2019-04-26 08:26:22 +07:00
|
|
|
void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta);
|
2016-05-18 08:05:33 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
#endif /* __XFS_MOUNT_H__ */
|