2016-10-03 23:11:20 +07:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2016 Oracle. All Rights Reserved.
|
|
|
|
*
|
|
|
|
* Author: Darrick J. Wong <darrick.wong@oracle.com>
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version 2
|
|
|
|
* of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it would be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write the Free Software Foundation,
|
|
|
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
*/
|
|
|
|
#ifndef __XFS_REFCOUNT_ITEM_H__
|
|
|
|
#define __XFS_REFCOUNT_ITEM_H__
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There are (currently) two pairs of refcount btree redo item types:
|
|
|
|
* increase and decrease. The log items for these are CUI (refcount
|
|
|
|
* update intent) and CUD (refcount update done). The redo item type
|
|
|
|
* is encoded in the flags field of each xfs_map_extent.
|
|
|
|
*
|
|
|
|
* *I items should be recorded in the *first* of a series of rolled
|
|
|
|
* transactions, and the *D items should be recorded in the same
|
|
|
|
* transaction that records the associated refcountbt updates.
|
|
|
|
*
|
|
|
|
* Should the system crash after the commit of the first transaction
|
|
|
|
* but before the commit of the final transaction in a series, log
|
|
|
|
* recovery will use the redo information recorded by the intent items
|
|
|
|
* to replay the refcountbt metadata updates.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* kernel only CUI/CUD definitions */
|
|
|
|
|
|
|
|
/* Forward declarations; this header only refers to these types by pointer. */
struct xfs_mount;
struct kmem_zone;
|
|
|
|
|
|
|
|
/*
 * Max number of extents in fast allocation path.
 *
 * NOTE(review): presumably the largest extent count for which a CUI is
 * taken from xfs_cui_zone instead of being allocated separately --
 * confirm against xfs_cui_init().
 */
#define XFS_CUI_MAX_FAST_EXTENTS	16
|
|
|
|
|
|
|
|
/*
 * Define CUI flag bits.  Manipulated by set/clear/test_bit operators, so
 * each value here is a bit number rather than a bit mask.
 */
#define XFS_CUI_RECOVERED		1
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is the "refcount update intent" log item. It is used to log
|
|
|
|
* the fact that some reverse mappings need to change. It is used in
|
|
|
|
* conjunction with the "refcount update done" log item described
|
|
|
|
* below.
|
|
|
|
*
|
|
|
|
* These log items follow the same rules as struct xfs_efi_log_item;
|
|
|
|
* see the comments about that structure (in xfs_extfree_item.h) for
|
|
|
|
* more details.
|
|
|
|
*/
|
|
|
|
struct xfs_cui_log_item {
|
|
|
|
struct xfs_log_item cui_item;
|
|
|
|
atomic_t cui_refcount;
|
|
|
|
atomic_t cui_next_extent;
|
|
|
|
unsigned long cui_flags; /* misc flags */
|
|
|
|
struct xfs_cui_log_format cui_format;
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline size_t
|
|
|
|
xfs_cui_log_item_sizeof(
|
|
|
|
unsigned int nr)
|
|
|
|
{
|
|
|
|
return offsetof(struct xfs_cui_log_item, cui_format) +
|
|
|
|
xfs_cui_log_format_sizeof(nr);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is the "refcount update done" log item. It is used to log the
|
|
|
|
* fact that some refcountbt updates mentioned in an earlier cui item
|
|
|
|
* have been performed.
|
|
|
|
*/
|
|
|
|
struct xfs_cud_log_item {
|
|
|
|
struct xfs_log_item cud_item;
|
|
|
|
struct xfs_cui_log_item *cud_cuip;
|
|
|
|
struct xfs_cud_log_format cud_format;
|
|
|
|
};
|
|
|
|
|
|
|
|
extern struct kmem_zone *xfs_cui_zone;
|
|
|
|
extern struct kmem_zone *xfs_cud_zone;
|
|
|
|
|
|
|
|
struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint);
|
|
|
|
struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *,
|
|
|
|
struct xfs_cui_log_item *);
|
|
|
|
void xfs_cui_item_free(struct xfs_cui_log_item *);
|
|
|
|
void xfs_cui_release(struct xfs_cui_log_item *);
|
xfs: log recovery should replay deferred ops in order
As part of testing log recovery with dm_log_writes, Amir Goldstein
discovered an error in the deferred ops recovery that led to corruption
of the filesystem metadata if a reflink+rmap filesystem happened to shut
down midway through a CoW remap:
"This is what happens [after failed log recovery]:
"Phase 1 - find and verify superblock...
"Phase 2 - using internal log
" - zero log...
" - scan filesystem freespace and inode maps...
" - found root inode chunk
"Phase 3 - for each AG...
" - scan (but don't clear) agi unlinked lists...
" - process known inodes and perform inode discovery...
" - agno = 0
"data fork in regular inode 134 claims CoW block 376
"correcting nextents for inode 134
"bad data fork in inode 134
"would have cleared inode 134"
Hou Tao dissected the log contents of exactly such a crash:
"According to the implementation of xfs_defer_finish(), these ops should
be completed in the following sequence:
"Have been done:
"(1) CUI: Oper (160)
"(2) BUI: Oper (161)
"(3) CUD: Oper (194), for CUI Oper (160)
"(4) RUI A: Oper (197), free rmap [0x155, 2, -9]
"Should be done:
"(5) BUD: for BUI Oper (161)
"(6) RUI B: add rmap [0x155, 2, 137]
"(7) RUD: for RUI A
"(8) RUD: for RUI B
"Actually be done by xlog_recover_process_intents()
"(5) BUD: for BUI Oper (161)
"(6) RUI B: add rmap [0x155, 2, 137]
"(7) RUD: for RUI B
"(8) RUD: for RUI A
"So the rmap entry [0x155, 2, -9] for COW should be freed firstly,
then a new rmap entry [0x155, 2, 137] will be added. However, as we can see
from the log record in post_mount.log (generated after umount) and the trace
print, the new rmap entry [0x155, 2, 137] are added firstly, then the rmap
entry [0x155, 2, -9] are freed."
When reconstructing the internal log state from the log items found on
disk, it's required that deferred ops replay in exactly the same order
that they would have had the filesystem not gone down. However,
replaying unfinished deferred ops can create /more/ deferred ops. These
new deferred ops are finished in the wrong order. This causes fs
corruption and replay crashes, so let's create a single defer_ops to
handle the subsequent ops created during replay, then use one single
transaction at the end of log recovery to ensure that everything is
replayed in the same order as they're supposed to be.
Reported-by: Amir Goldstein <amir73il@gmail.com>
Analyzed-by: Hou Tao <houtao1@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2017-11-22 11:53:02 +07:00
|
|
|
/*
 * Recover a refcount update intent item.
 * NOTE(review): dfops is presumably the shared deferred-ops context that
 * collects any further work queued while replaying cuip -- confirm
 * against the implementation.
 */
int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip,
		struct xfs_defer_ops *dfops);
|
2016-10-03 23:11:20 +07:00
|
|
|
|
|
|
|
#endif /* __XFS_REFCOUNT_ITEM_H__ */
|