2007-06-12 20:07:21 +07:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public
|
|
|
|
* License v2 as published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public
|
|
|
|
* License along with this program; if not, write to the
|
|
|
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
|
|
* Boston, MA 021110-1307, USA.
|
|
|
|
*/
|
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
#include <linux/delay.h>
|
|
|
|
#include <linux/kthread.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
|
2007-03-21 01:38:32 +07:00
|
|
|
#include "ctree.h"
|
|
|
|
#include "disk-io.h"
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
#include "free-space-cache.h"
|
|
|
|
#include "inode-map.h"
|
2007-03-21 01:38:32 +07:00
|
|
|
#include "transaction.h"
|
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
static int caching_kthread(void *data)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = data;
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
u64 last = (u64)-1;
|
|
|
|
int slot;
|
|
|
|
int ret;
|
|
|
|
|
2016-06-10 08:38:35 +07:00
|
|
|
if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
|
2011-06-03 20:36:29 +07:00
|
|
|
return 0;
|
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/* Since the commit root is read-only, we can safely skip locking. */
|
|
|
|
path->skip_locking = 1;
|
|
|
|
path->search_commit_root = 1;
|
2015-11-27 22:31:35 +07:00
|
|
|
path->reada = READA_FORWARD;
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
|
|
|
|
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
|
|
|
|
key.offset = 0;
|
|
|
|
key.type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
again:
|
|
|
|
/* need to make sure the commit_root doesn't disappear */
|
2014-03-14 02:42:13 +07:00
|
|
|
down_read(&fs_info->commit_root_sem);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
while (1) {
|
2011-05-31 23:07:27 +07:00
|
|
|
if (btrfs_fs_closing(fs_info))
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
2011-05-26 13:38:30 +07:00
|
|
|
if (slot >= btrfs_header_nritems(leaf)) {
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
else if (ret > 0)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (need_resched() ||
|
|
|
|
btrfs_transaction_in_commit(fs_info)) {
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
2013-10-31 12:00:08 +07:00
|
|
|
if (WARN_ON(btrfs_header_nritems(leaf) == 0))
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
break;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Save the key so we can advances forward
|
|
|
|
* in the next search.
|
|
|
|
*/
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, 0);
|
2011-05-22 23:33:42 +07:00
|
|
|
btrfs_release_path(path);
|
2014-02-05 08:37:48 +07:00
|
|
|
root->ino_cache_progress = last;
|
2014-03-14 02:42:13 +07:00
|
|
|
up_read(&fs_info->commit_root_sem);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
schedule_timeout(1);
|
|
|
|
goto again;
|
|
|
|
} else
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, slot);
|
|
|
|
|
|
|
|
if (key.type != BTRFS_INODE_ITEM_KEY)
|
|
|
|
goto next;
|
|
|
|
|
2011-05-26 13:38:30 +07:00
|
|
|
if (key.objectid >= root->highest_objectid)
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
break;
|
|
|
|
|
|
|
|
if (last != (u64)-1 && last + 1 != key.objectid) {
|
|
|
|
__btrfs_add_free_space(ctl, last + 1,
|
|
|
|
key.objectid - last - 1);
|
2014-02-05 08:37:48 +07:00
|
|
|
wake_up(&root->ino_cache_wait);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
last = key.objectid;
|
|
|
|
next:
|
|
|
|
path->slots[0]++;
|
|
|
|
}
|
|
|
|
|
2011-05-26 13:38:30 +07:00
|
|
|
if (last < root->highest_objectid - 1) {
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
__btrfs_add_free_space(ctl, last + 1,
|
2011-05-26 13:38:30 +07:00
|
|
|
root->highest_objectid - last - 1);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
}
|
|
|
|
|
2014-02-05 08:37:48 +07:00
|
|
|
spin_lock(&root->ino_cache_lock);
|
|
|
|
root->ino_cache_state = BTRFS_CACHE_FINISHED;
|
|
|
|
spin_unlock(&root->ino_cache_lock);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
|
2014-02-05 08:37:48 +07:00
|
|
|
root->ino_cache_progress = (u64)-1;
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
btrfs_unpin_free_ino(root);
|
|
|
|
out:
|
2014-02-05 08:37:48 +07:00
|
|
|
wake_up(&root->ino_cache_wait);
|
2014-03-14 02:42:13 +07:00
|
|
|
up_read(&fs_info->commit_root_sem);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void start_caching(struct btrfs_root *root)
|
|
|
|
{
|
2011-05-26 13:38:30 +07:00
|
|
|
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
struct task_struct *tsk;
|
2011-04-20 09:33:24 +07:00
|
|
|
int ret;
|
2011-05-26 13:38:30 +07:00
|
|
|
u64 objectid;
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
|
2016-06-10 08:38:35 +07:00
|
|
|
if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
|
2011-06-03 20:36:29 +07:00
|
|
|
return;
|
|
|
|
|
2014-02-05 08:37:48 +07:00
|
|
|
spin_lock(&root->ino_cache_lock);
|
|
|
|
if (root->ino_cache_state != BTRFS_CACHE_NO) {
|
|
|
|
spin_unlock(&root->ino_cache_lock);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2014-02-05 08:37:48 +07:00
|
|
|
root->ino_cache_state = BTRFS_CACHE_STARTED;
|
|
|
|
spin_unlock(&root->ino_cache_lock);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
|
2011-04-20 09:33:24 +07:00
|
|
|
ret = load_free_ino_cache(root->fs_info, root);
|
|
|
|
if (ret == 1) {
|
2014-02-05 08:37:48 +07:00
|
|
|
spin_lock(&root->ino_cache_lock);
|
|
|
|
root->ino_cache_state = BTRFS_CACHE_FINISHED;
|
|
|
|
spin_unlock(&root->ino_cache_lock);
|
2011-04-20 09:33:24 +07:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2011-05-26 13:38:30 +07:00
|
|
|
/*
|
|
|
|
* It can be quite time-consuming to fill the cache by searching
|
|
|
|
* through the extent tree, and this can keep ino allocation path
|
|
|
|
* waiting. Therefore at start we quickly find out the highest
|
|
|
|
* inode number and we know we can use inode numbers which fall in
|
|
|
|
* [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
|
|
|
|
*/
|
|
|
|
ret = btrfs_find_free_objectid(root, &objectid);
|
|
|
|
if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
|
|
|
|
__btrfs_add_free_space(ctl, objectid,
|
|
|
|
BTRFS_LAST_FREE_OBJECTID - objectid + 1);
|
|
|
|
}
|
|
|
|
|
2014-05-16 20:15:45 +07:00
|
|
|
tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
root->root_key.objectid);
|
2014-04-23 18:33:34 +07:00
|
|
|
if (IS_ERR(tsk)) {
|
|
|
|
btrfs_warn(root->fs_info, "failed to start inode caching task");
|
2014-02-05 21:26:17 +07:00
|
|
|
btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE,
|
2014-04-23 18:33:34 +07:00
|
|
|
"disabling inode map caching");
|
|
|
|
}
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
|
|
|
|
{
|
2016-06-10 08:38:35 +07:00
|
|
|
if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
|
2011-06-03 20:36:29 +07:00
|
|
|
return btrfs_find_free_objectid(root, objectid);
|
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
again:
|
|
|
|
*objectid = btrfs_find_ino_for_alloc(root);
|
|
|
|
|
|
|
|
if (*objectid != 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
start_caching(root);
|
|
|
|
|
2014-02-05 08:37:48 +07:00
|
|
|
wait_event(root->ino_cache_wait,
|
|
|
|
root->ino_cache_state == BTRFS_CACHE_FINISHED ||
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
root->free_ino_ctl->free_space > 0);
|
|
|
|
|
2014-02-05 08:37:48 +07:00
|
|
|
if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
root->free_ino_ctl->free_space == 0)
|
|
|
|
return -ENOSPC;
|
|
|
|
else
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
|
|
|
|
{
|
|
|
|
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
|
2011-06-03 20:36:29 +07:00
|
|
|
|
2016-06-10 08:38:35 +07:00
|
|
|
if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
|
2011-06-03 20:36:29 +07:00
|
|
|
return;
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
again:
|
2014-02-05 08:37:48 +07:00
|
|
|
if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
|
2014-04-23 18:33:36 +07:00
|
|
|
__btrfs_add_free_space(pinned, objectid, 1);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
} else {
|
2014-03-14 02:42:13 +07:00
|
|
|
down_write(&root->fs_info->commit_root_sem);
|
2014-02-05 08:37:48 +07:00
|
|
|
spin_lock(&root->ino_cache_lock);
|
|
|
|
if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
|
|
|
|
spin_unlock(&root->ino_cache_lock);
|
2014-03-14 02:42:13 +07:00
|
|
|
up_write(&root->fs_info->commit_root_sem);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
goto again;
|
|
|
|
}
|
2014-02-05 08:37:48 +07:00
|
|
|
spin_unlock(&root->ino_cache_lock);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
|
|
|
|
start_caching(root);
|
|
|
|
|
2014-04-23 18:33:36 +07:00
|
|
|
__btrfs_add_free_space(pinned, objectid, 1);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
|
2014-03-14 02:42:13 +07:00
|
|
|
up_write(&root->fs_info->commit_root_sem);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2014-02-05 08:37:48 +07:00
|
|
|
* When a transaction is committed, we'll move those inode numbers which are
|
|
|
|
* smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
|
|
|
|
* others will just be dropped, because the commit root we were searching has
|
|
|
|
* changed.
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
*
|
2014-03-14 02:42:13 +07:00
|
|
|
* Must be called with root->fs_info->commit_root_sem held
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
*/
|
|
|
|
void btrfs_unpin_free_ino(struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
|
|
|
|
struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
|
Btrfs: fix race between caching kthread and returning inode to inode cache
While the inode cache caching kthread is calling btrfs_unpin_free_ino(),
we could have a concurrent call to btrfs_return_ino() that adds a new
entry to the root's free space cache of pinned inodes. This concurrent
call does not acquire the fs_info->commit_root_sem before adding a new
entry if the caching state is BTRFS_CACHE_FINISHED, which is a problem
because the caching kthread calls btrfs_unpin_free_ino() after setting
the caching state to BTRFS_CACHE_FINISHED and therefore races with
the task calling btrfs_return_ino(), which is adding a new entry, while
the former (caching kthread) is navigating the cache's rbtree, removing
and freeing nodes from the cache's rbtree without acquiring the spinlock
that protects the rbtree.
This race resulted in memory corruption due to double free of struct
btrfs_free_space objects because both tasks can end up doing freeing the
same objects. Note that adding a new entry can result in merging it with
other entries in the cache, in which case those entries are freed.
This is particularly important as btrfs_free_space structures are also
used for the block group free space caches.
This memory corruption can be detected by a debugging kernel, which
reports it with the following trace:
[132408.501148] slab error in verify_redzone_free(): cache `btrfs_free_space': double free detected
[132408.505075] CPU: 15 PID: 12248 Comm: btrfs-ino-cache Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1
[132408.505075] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[132408.505075] ffff880023e7d320 ffff880163d73cd8 ffffffff8145eec7 ffffffff81095dce
[132408.505075] ffff880009735d40 ffff880163d73ce8 ffffffff81154e1e ffff880163d73d68
[132408.505075] ffffffff81155733 ffffffffa054a95a ffff8801b6099f00 ffffffffa0505b5f
[132408.505075] Call Trace:
[132408.505075] [<ffffffff8145eec7>] dump_stack+0x4f/0x7b
[132408.505075] [<ffffffff81095dce>] ? console_unlock+0x356/0x3a2
[132408.505075] [<ffffffff81154e1e>] __slab_error.isra.28+0x25/0x36
[132408.505075] [<ffffffff81155733>] __cache_free+0xe2/0x4b6
[132408.505075] [<ffffffffa054a95a>] ? __btrfs_add_free_space+0x2f0/0x343 [btrfs]
[132408.505075] [<ffffffffa0505b5f>] ? btrfs_unpin_free_ino+0x8e/0x99 [btrfs]
[132408.505075] [<ffffffff810f3b30>] ? time_hardirqs_off+0x15/0x28
[132408.505075] [<ffffffff81084d42>] ? trace_hardirqs_off+0xd/0xf
[132408.505075] [<ffffffff811563a1>] ? kfree+0xb6/0x14e
[132408.505075] [<ffffffff811563d0>] kfree+0xe5/0x14e
[132408.505075] [<ffffffffa0505b5f>] btrfs_unpin_free_ino+0x8e/0x99 [btrfs]
[132408.505075] [<ffffffffa0505e08>] caching_kthread+0x29e/0x2d9 [btrfs]
[132408.505075] [<ffffffffa0505b6a>] ? btrfs_unpin_free_ino+0x99/0x99 [btrfs]
[132408.505075] [<ffffffff8106698f>] kthread+0xef/0xf7
[132408.505075] [<ffffffff810f3b08>] ? time_hardirqs_on+0x15/0x28
[132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad
[132408.505075] [<ffffffff814653d2>] ret_from_fork+0x42/0x70
[132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad
[132408.505075] ffff880023e7d320: redzone 1:0x9f911029d74e35b, redzone 2:0x9f911029d74e35b.
[132409.501654] slab: double free detected in cache 'btrfs_free_space', objp ffff880023e7d320
[132409.503355] ------------[ cut here ]------------
[132409.504241] kernel BUG at mm/slab.c:2571!
Therefore fix this by having btrfs_unpin_free_ino() acquire the lock
that protects the rbtree while doing the searches and removing entries.
Fixes: 1c70d8fb4dfa ("Btrfs: fix inode caching vs tree log")
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-06-13 12:52:57 +07:00
|
|
|
spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock;
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
struct btrfs_free_space *info;
|
|
|
|
struct rb_node *n;
|
|
|
|
u64 count;
|
|
|
|
|
2016-06-10 08:38:35 +07:00
|
|
|
if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
|
2011-06-03 20:36:29 +07:00
|
|
|
return;
|
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
while (1) {
|
Btrfs: fix race between caching kthread and returning inode to inode cache
While the inode cache caching kthread is calling btrfs_unpin_free_ino(),
we could have a concurrent call to btrfs_return_ino() that adds a new
entry to the root's free space cache of pinned inodes. This concurrent
call does not acquire the fs_info->commit_root_sem before adding a new
entry if the caching state is BTRFS_CACHE_FINISHED, which is a problem
because the caching kthread calls btrfs_unpin_free_ino() after setting
the caching state to BTRFS_CACHE_FINISHED and therefore races with
the task calling btrfs_return_ino(), which is adding a new entry, while
the former (caching kthread) is navigating the cache's rbtree, removing
and freeing nodes from the cache's rbtree without acquiring the spinlock
that protects the rbtree.
This race resulted in memory corruption due to double free of struct
btrfs_free_space objects because both tasks can end up doing freeing the
same objects. Note that adding a new entry can result in merging it with
other entries in the cache, in which case those entries are freed.
This is particularly important as btrfs_free_space structures are also
used for the block group free space caches.
This memory corruption can be detected by a debugging kernel, which
reports it with the following trace:
[132408.501148] slab error in verify_redzone_free(): cache `btrfs_free_space': double free detected
[132408.505075] CPU: 15 PID: 12248 Comm: btrfs-ino-cache Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1
[132408.505075] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[132408.505075] ffff880023e7d320 ffff880163d73cd8 ffffffff8145eec7 ffffffff81095dce
[132408.505075] ffff880009735d40 ffff880163d73ce8 ffffffff81154e1e ffff880163d73d68
[132408.505075] ffffffff81155733 ffffffffa054a95a ffff8801b6099f00 ffffffffa0505b5f
[132408.505075] Call Trace:
[132408.505075] [<ffffffff8145eec7>] dump_stack+0x4f/0x7b
[132408.505075] [<ffffffff81095dce>] ? console_unlock+0x356/0x3a2
[132408.505075] [<ffffffff81154e1e>] __slab_error.isra.28+0x25/0x36
[132408.505075] [<ffffffff81155733>] __cache_free+0xe2/0x4b6
[132408.505075] [<ffffffffa054a95a>] ? __btrfs_add_free_space+0x2f0/0x343 [btrfs]
[132408.505075] [<ffffffffa0505b5f>] ? btrfs_unpin_free_ino+0x8e/0x99 [btrfs]
[132408.505075] [<ffffffff810f3b30>] ? time_hardirqs_off+0x15/0x28
[132408.505075] [<ffffffff81084d42>] ? trace_hardirqs_off+0xd/0xf
[132408.505075] [<ffffffff811563a1>] ? kfree+0xb6/0x14e
[132408.505075] [<ffffffff811563d0>] kfree+0xe5/0x14e
[132408.505075] [<ffffffffa0505b5f>] btrfs_unpin_free_ino+0x8e/0x99 [btrfs]
[132408.505075] [<ffffffffa0505e08>] caching_kthread+0x29e/0x2d9 [btrfs]
[132408.505075] [<ffffffffa0505b6a>] ? btrfs_unpin_free_ino+0x99/0x99 [btrfs]
[132408.505075] [<ffffffff8106698f>] kthread+0xef/0xf7
[132408.505075] [<ffffffff810f3b08>] ? time_hardirqs_on+0x15/0x28
[132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad
[132408.505075] [<ffffffff814653d2>] ret_from_fork+0x42/0x70
[132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad
[132408.505075] ffff880023e7d320: redzone 1:0x9f911029d74e35b, redzone 2:0x9f911029d74e35b.
[132409.501654] slab: double free detected in cache 'btrfs_free_space', objp ffff880023e7d320
[132409.503355] ------------[ cut here ]------------
[132409.504241] kernel BUG at mm/slab.c:2571!
Therefore fix this by having btrfs_unpin_free_ino() acquire the lock
that protects the rbtree while doing the searches and removing entries.
Fixes: 1c70d8fb4dfa ("Btrfs: fix inode caching vs tree log")
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-06-13 12:52:57 +07:00
|
|
|
bool add_to_ctl = true;
|
|
|
|
|
|
|
|
spin_lock(rbroot_lock);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
n = rb_first(rbroot);
|
Btrfs: fix race between caching kthread and returning inode to inode cache
While the inode cache caching kthread is calling btrfs_unpin_free_ino(),
we could have a concurrent call to btrfs_return_ino() that adds a new
entry to the root's free space cache of pinned inodes. This concurrent
call does not acquire the fs_info->commit_root_sem before adding a new
entry if the caching state is BTRFS_CACHE_FINISHED, which is a problem
because the caching kthread calls btrfs_unpin_free_ino() after setting
the caching state to BTRFS_CACHE_FINISHED and therefore races with
the task calling btrfs_return_ino(), which is adding a new entry, while
the former (caching kthread) is navigating the cache's rbtree, removing
and freeing nodes from the cache's rbtree without acquiring the spinlock
that protects the rbtree.
This race resulted in memory corruption due to double free of struct
btrfs_free_space objects because both tasks can end up doing freeing the
same objects. Note that adding a new entry can result in merging it with
other entries in the cache, in which case those entries are freed.
This is particularly important as btrfs_free_space structures are also
used for the block group free space caches.
This memory corruption can be detected by a debugging kernel, which
reports it with the following trace:
[132408.501148] slab error in verify_redzone_free(): cache `btrfs_free_space': double free detected
[132408.505075] CPU: 15 PID: 12248 Comm: btrfs-ino-cache Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1
[132408.505075] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[132408.505075] ffff880023e7d320 ffff880163d73cd8 ffffffff8145eec7 ffffffff81095dce
[132408.505075] ffff880009735d40 ffff880163d73ce8 ffffffff81154e1e ffff880163d73d68
[132408.505075] ffffffff81155733 ffffffffa054a95a ffff8801b6099f00 ffffffffa0505b5f
[132408.505075] Call Trace:
[132408.505075] [<ffffffff8145eec7>] dump_stack+0x4f/0x7b
[132408.505075] [<ffffffff81095dce>] ? console_unlock+0x356/0x3a2
[132408.505075] [<ffffffff81154e1e>] __slab_error.isra.28+0x25/0x36
[132408.505075] [<ffffffff81155733>] __cache_free+0xe2/0x4b6
[132408.505075] [<ffffffffa054a95a>] ? __btrfs_add_free_space+0x2f0/0x343 [btrfs]
[132408.505075] [<ffffffffa0505b5f>] ? btrfs_unpin_free_ino+0x8e/0x99 [btrfs]
[132408.505075] [<ffffffff810f3b30>] ? time_hardirqs_off+0x15/0x28
[132408.505075] [<ffffffff81084d42>] ? trace_hardirqs_off+0xd/0xf
[132408.505075] [<ffffffff811563a1>] ? kfree+0xb6/0x14e
[132408.505075] [<ffffffff811563d0>] kfree+0xe5/0x14e
[132408.505075] [<ffffffffa0505b5f>] btrfs_unpin_free_ino+0x8e/0x99 [btrfs]
[132408.505075] [<ffffffffa0505e08>] caching_kthread+0x29e/0x2d9 [btrfs]
[132408.505075] [<ffffffffa0505b6a>] ? btrfs_unpin_free_ino+0x99/0x99 [btrfs]
[132408.505075] [<ffffffff8106698f>] kthread+0xef/0xf7
[132408.505075] [<ffffffff810f3b08>] ? time_hardirqs_on+0x15/0x28
[132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad
[132408.505075] [<ffffffff814653d2>] ret_from_fork+0x42/0x70
[132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad
[132408.505075] ffff880023e7d320: redzone 1:0x9f911029d74e35b, redzone 2:0x9f911029d74e35b.
[132409.501654] slab: double free detected in cache 'btrfs_free_space', objp ffff880023e7d320
[132409.503355] ------------[ cut here ]------------
[132409.504241] kernel BUG at mm/slab.c:2571!
Therefore fix this by having btrfs_unpin_free_ino() acquire the lock
that protects the rbtree while doing the searches and removing entries.
Fixes: 1c70d8fb4dfa ("Btrfs: fix inode caching vs tree log")
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-06-13 12:52:57 +07:00
|
|
|
if (!n) {
|
|
|
|
spin_unlock(rbroot_lock);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
break;
|
Btrfs: fix race between caching kthread and returning inode to inode cache
While the inode cache caching kthread is calling btrfs_unpin_free_ino(),
we could have a concurrent call to btrfs_return_ino() that adds a new
entry to the root's free space cache of pinned inodes. This concurrent
call does not acquire the fs_info->commit_root_sem before adding a new
entry if the caching state is BTRFS_CACHE_FINISHED, which is a problem
because the caching kthread calls btrfs_unpin_free_ino() after setting
the caching state to BTRFS_CACHE_FINISHED and therefore races with
the task calling btrfs_return_ino(), which is adding a new entry, while
the former (caching kthread) is navigating the cache's rbtree, removing
and freeing nodes from the cache's rbtree without acquiring the spinlock
that protects the rbtree.
This race resulted in memory corruption due to double free of struct
btrfs_free_space objects because both tasks can end up doing freeing the
same objects. Note that adding a new entry can result in merging it with
other entries in the cache, in which case those entries are freed.
This is particularly important as btrfs_free_space structures are also
used for the block group free space caches.
This memory corruption can be detected by a debugging kernel, which
reports it with the following trace:
[132408.501148] slab error in verify_redzone_free(): cache `btrfs_free_space': double free detected
[132408.505075] CPU: 15 PID: 12248 Comm: btrfs-ino-cache Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1
[132408.505075] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[132408.505075] ffff880023e7d320 ffff880163d73cd8 ffffffff8145eec7 ffffffff81095dce
[132408.505075] ffff880009735d40 ffff880163d73ce8 ffffffff81154e1e ffff880163d73d68
[132408.505075] ffffffff81155733 ffffffffa054a95a ffff8801b6099f00 ffffffffa0505b5f
[132408.505075] Call Trace:
[132408.505075] [<ffffffff8145eec7>] dump_stack+0x4f/0x7b
[132408.505075] [<ffffffff81095dce>] ? console_unlock+0x356/0x3a2
[132408.505075] [<ffffffff81154e1e>] __slab_error.isra.28+0x25/0x36
[132408.505075] [<ffffffff81155733>] __cache_free+0xe2/0x4b6
[132408.505075] [<ffffffffa054a95a>] ? __btrfs_add_free_space+0x2f0/0x343 [btrfs]
[132408.505075] [<ffffffffa0505b5f>] ? btrfs_unpin_free_ino+0x8e/0x99 [btrfs]
[132408.505075] [<ffffffff810f3b30>] ? time_hardirqs_off+0x15/0x28
[132408.505075] [<ffffffff81084d42>] ? trace_hardirqs_off+0xd/0xf
[132408.505075] [<ffffffff811563a1>] ? kfree+0xb6/0x14e
[132408.505075] [<ffffffff811563d0>] kfree+0xe5/0x14e
[132408.505075] [<ffffffffa0505b5f>] btrfs_unpin_free_ino+0x8e/0x99 [btrfs]
[132408.505075] [<ffffffffa0505e08>] caching_kthread+0x29e/0x2d9 [btrfs]
[132408.505075] [<ffffffffa0505b6a>] ? btrfs_unpin_free_ino+0x99/0x99 [btrfs]
[132408.505075] [<ffffffff8106698f>] kthread+0xef/0xf7
[132408.505075] [<ffffffff810f3b08>] ? time_hardirqs_on+0x15/0x28
[132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad
[132408.505075] [<ffffffff814653d2>] ret_from_fork+0x42/0x70
[132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad
[132408.505075] ffff880023e7d320: redzone 1:0x9f911029d74e35b, redzone 2:0x9f911029d74e35b.
[132409.501654] slab: double free detected in cache 'btrfs_free_space', objp ffff880023e7d320
[132409.503355] ------------[ cut here ]------------
[132409.504241] kernel BUG at mm/slab.c:2571!
Therefore fix this by having btrfs_unpin_free_ino() acquire the lock
that protects the rbtree while doing the searches and removing entries.
Fixes: 1c70d8fb4dfa ("Btrfs: fix inode caching vs tree log")
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-06-13 12:52:57 +07:00
|
|
|
}
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
|
|
|
|
info = rb_entry(n, struct btrfs_free_space, offset_index);
|
2012-03-12 22:03:00 +07:00
|
|
|
BUG_ON(info->bitmap); /* Logic error */
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
|
2014-02-05 08:37:48 +07:00
|
|
|
if (info->offset > root->ino_cache_progress)
|
Btrfs: fix race between caching kthread and returning inode to inode cache
While the inode cache caching kthread is calling btrfs_unpin_free_ino(),
we could have a concurrent call to btrfs_return_ino() that adds a new
entry to the root's free space cache of pinned inodes. This concurrent
call does not acquire the fs_info->commit_root_sem before adding a new
entry if the caching state is BTRFS_CACHE_FINISHED, which is a problem
because the caching kthread calls btrfs_unpin_free_ino() after setting
the caching state to BTRFS_CACHE_FINISHED and therefore races with
the task calling btrfs_return_ino(), which is adding a new entry, while
the former (caching kthread) is navigating the cache's rbtree, removing
and freeing nodes from the cache's rbtree without acquiring the spinlock
that protects the rbtree.
This race resulted in memory corruption due to double free of struct
btrfs_free_space objects because both tasks can end up doing freeing the
same objects. Note that adding a new entry can result in merging it with
other entries in the cache, in which case those entries are freed.
This is particularly important as btrfs_free_space structures are also
used for the block group free space caches.
This memory corruption can be detected by a debugging kernel, which
reports it with the following trace:
[132408.501148] slab error in verify_redzone_free(): cache `btrfs_free_space': double free detected
[132408.505075] CPU: 15 PID: 12248 Comm: btrfs-ino-cache Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1
[132408.505075] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[132408.505075] ffff880023e7d320 ffff880163d73cd8 ffffffff8145eec7 ffffffff81095dce
[132408.505075] ffff880009735d40 ffff880163d73ce8 ffffffff81154e1e ffff880163d73d68
[132408.505075] ffffffff81155733 ffffffffa054a95a ffff8801b6099f00 ffffffffa0505b5f
[132408.505075] Call Trace:
[132408.505075] [<ffffffff8145eec7>] dump_stack+0x4f/0x7b
[132408.505075] [<ffffffff81095dce>] ? console_unlock+0x356/0x3a2
[132408.505075] [<ffffffff81154e1e>] __slab_error.isra.28+0x25/0x36
[132408.505075] [<ffffffff81155733>] __cache_free+0xe2/0x4b6
[132408.505075] [<ffffffffa054a95a>] ? __btrfs_add_free_space+0x2f0/0x343 [btrfs]
[132408.505075] [<ffffffffa0505b5f>] ? btrfs_unpin_free_ino+0x8e/0x99 [btrfs]
[132408.505075] [<ffffffff810f3b30>] ? time_hardirqs_off+0x15/0x28
[132408.505075] [<ffffffff81084d42>] ? trace_hardirqs_off+0xd/0xf
[132408.505075] [<ffffffff811563a1>] ? kfree+0xb6/0x14e
[132408.505075] [<ffffffff811563d0>] kfree+0xe5/0x14e
[132408.505075] [<ffffffffa0505b5f>] btrfs_unpin_free_ino+0x8e/0x99 [btrfs]
[132408.505075] [<ffffffffa0505e08>] caching_kthread+0x29e/0x2d9 [btrfs]
[132408.505075] [<ffffffffa0505b6a>] ? btrfs_unpin_free_ino+0x99/0x99 [btrfs]
[132408.505075] [<ffffffff8106698f>] kthread+0xef/0xf7
[132408.505075] [<ffffffff810f3b08>] ? time_hardirqs_on+0x15/0x28
[132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad
[132408.505075] [<ffffffff814653d2>] ret_from_fork+0x42/0x70
[132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad
[132408.505075] ffff880023e7d320: redzone 1:0x9f911029d74e35b, redzone 2:0x9f911029d74e35b.
[132409.501654] slab: double free detected in cache 'btrfs_free_space', objp ffff880023e7d320
[132409.503355] ------------[ cut here ]------------
[132409.504241] kernel BUG at mm/slab.c:2571!
Therefore fix this by having btrfs_unpin_free_ino() acquire the lock
that protects the rbtree while doing the searches and removing entries.
Fixes: 1c70d8fb4dfa ("Btrfs: fix inode caching vs tree log")
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-06-13 12:52:57 +07:00
|
|
|
add_to_ctl = false;
|
2014-02-05 08:37:48 +07:00
|
|
|
else if (info->offset + info->bytes > root->ino_cache_progress)
|
|
|
|
count = root->ino_cache_progress - info->offset + 1;
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
else
|
|
|
|
count = info->bytes;
|
|
|
|
|
|
|
|
rb_erase(&info->offset_index, rbroot);
|
Btrfs: fix race between caching kthread and returning inode to inode cache
While the inode cache caching kthread is calling btrfs_unpin_free_ino(),
we could have a concurrent call to btrfs_return_ino() that adds a new
entry to the root's free space cache of pinned inodes. This concurrent
call does not acquire the fs_info->commit_root_sem before adding a new
entry if the caching state is BTRFS_CACHE_FINISHED, which is a problem
because the caching kthread calls btrfs_unpin_free_ino() after setting
the caching state to BTRFS_CACHE_FINISHED and therefore races with
the task calling btrfs_return_ino(), which is adding a new entry, while
the former (caching kthread) is navigating the cache's rbtree, removing
and freeing nodes from the cache's rbtree without acquiring the spinlock
that protects the rbtree.
This race resulted in memory corruption due to double free of struct
btrfs_free_space objects because both tasks can end up doing freeing the
same objects. Note that adding a new entry can result in merging it with
other entries in the cache, in which case those entries are freed.
This is particularly important as btrfs_free_space structures are also
used for the block group free space caches.
This memory corruption can be detected by a debugging kernel, which
reports it with the following trace:
[132408.501148] slab error in verify_redzone_free(): cache `btrfs_free_space': double free detected
[132408.505075] CPU: 15 PID: 12248 Comm: btrfs-ino-cache Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1
[132408.505075] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[132408.505075] ffff880023e7d320 ffff880163d73cd8 ffffffff8145eec7 ffffffff81095dce
[132408.505075] ffff880009735d40 ffff880163d73ce8 ffffffff81154e1e ffff880163d73d68
[132408.505075] ffffffff81155733 ffffffffa054a95a ffff8801b6099f00 ffffffffa0505b5f
[132408.505075] Call Trace:
[132408.505075] [<ffffffff8145eec7>] dump_stack+0x4f/0x7b
[132408.505075] [<ffffffff81095dce>] ? console_unlock+0x356/0x3a2
[132408.505075] [<ffffffff81154e1e>] __slab_error.isra.28+0x25/0x36
[132408.505075] [<ffffffff81155733>] __cache_free+0xe2/0x4b6
[132408.505075] [<ffffffffa054a95a>] ? __btrfs_add_free_space+0x2f0/0x343 [btrfs]
[132408.505075] [<ffffffffa0505b5f>] ? btrfs_unpin_free_ino+0x8e/0x99 [btrfs]
[132408.505075] [<ffffffff810f3b30>] ? time_hardirqs_off+0x15/0x28
[132408.505075] [<ffffffff81084d42>] ? trace_hardirqs_off+0xd/0xf
[132408.505075] [<ffffffff811563a1>] ? kfree+0xb6/0x14e
[132408.505075] [<ffffffff811563d0>] kfree+0xe5/0x14e
[132408.505075] [<ffffffffa0505b5f>] btrfs_unpin_free_ino+0x8e/0x99 [btrfs]
[132408.505075] [<ffffffffa0505e08>] caching_kthread+0x29e/0x2d9 [btrfs]
[132408.505075] [<ffffffffa0505b6a>] ? btrfs_unpin_free_ino+0x99/0x99 [btrfs]
[132408.505075] [<ffffffff8106698f>] kthread+0xef/0xf7
[132408.505075] [<ffffffff810f3b08>] ? time_hardirqs_on+0x15/0x28
[132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad
[132408.505075] [<ffffffff814653d2>] ret_from_fork+0x42/0x70
[132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad
[132408.505075] ffff880023e7d320: redzone 1:0x9f911029d74e35b, redzone 2:0x9f911029d74e35b.
[132409.501654] slab: double free detected in cache 'btrfs_free_space', objp ffff880023e7d320
[132409.503355] ------------[ cut here ]------------
[132409.504241] kernel BUG at mm/slab.c:2571!
Therefore fix this by having btrfs_unpin_free_ino() acquire the lock
that protects the rbtree while doing the searches and removing entries.
Fixes: 1c70d8fb4dfa ("Btrfs: fix inode caching vs tree log")
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-06-13 12:52:57 +07:00
|
|
|
spin_unlock(rbroot_lock);
|
|
|
|
if (add_to_ctl)
|
|
|
|
__btrfs_add_free_space(ctl, info->offset, count);
|
2015-06-13 12:52:56 +07:00
|
|
|
kmem_cache_free(btrfs_free_space_cachep, info);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-12-14 23:42:10 +07:00
|
|
|
#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space))
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 19:29:47 +07:00
|
|
|
#define INODES_PER_BITMAP (PAGE_SIZE * 8)
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The goal is to keep the memory used by the free_ino tree won't
|
|
|
|
* exceed the memory if we use bitmaps only.
|
|
|
|
*/
|
|
|
|
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
|
|
|
|
{
|
|
|
|
struct btrfs_free_space *info;
|
|
|
|
struct rb_node *n;
|
|
|
|
int max_ino;
|
|
|
|
int max_bitmaps;
|
|
|
|
|
|
|
|
n = rb_last(&ctl->free_space_offset);
|
|
|
|
if (!n) {
|
|
|
|
ctl->extents_thresh = INIT_THRESHOLD;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
info = rb_entry(n, struct btrfs_free_space, offset_index);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the maximum inode number in the filesystem. Note we
|
|
|
|
* ignore the fact that this can be a bitmap, because we are
|
|
|
|
* not doing precise calculation.
|
|
|
|
*/
|
|
|
|
max_ino = info->bytes - 1;
|
|
|
|
|
|
|
|
max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
|
|
|
|
if (max_bitmaps <= ctl->total_bitmaps) {
|
|
|
|
ctl->extents_thresh = 0;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 19:29:47 +07:00
|
|
|
PAGE_SIZE / sizeof(*info);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't fall back to bitmap, if we are below the extents threshold
|
|
|
|
* or this chunk of inode numbers is a big one.
|
|
|
|
*/
|
|
|
|
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
|
|
|
|
struct btrfs_free_space *info)
|
|
|
|
{
|
|
|
|
if (ctl->free_extents < ctl->extents_thresh ||
|
|
|
|
info->bytes > INODES_PER_BITMAP / 10)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-11-19 17:42:28 +07:00
|
|
|
static const struct btrfs_free_space_op free_ino_op = {
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
.recalc_thresholds = recalculate_thresholds,
|
|
|
|
.use_bitmap = use_bitmap,
|
|
|
|
};
|
|
|
|
|
|
|
|
static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
|
|
|
|
struct btrfs_free_space *info)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We always use extents for two reasons:
|
|
|
|
*
|
|
|
|
* - The pinned tree is only used during the process of caching
|
|
|
|
* work.
|
|
|
|
* - Make code simpler. See btrfs_unpin_free_ino().
|
|
|
|
*/
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-11-19 17:42:28 +07:00
|
|
|
static const struct btrfs_free_space_op pinned_free_ino_op = {
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
.recalc_thresholds = pinned_recalc_thresholds,
|
|
|
|
.use_bitmap = pinned_use_bitmap,
|
|
|
|
};
|
|
|
|
|
|
|
|
void btrfs_init_free_ino_ctl(struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
|
|
|
|
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
|
|
|
|
|
|
|
|
spin_lock_init(&ctl->tree_lock);
|
|
|
|
ctl->unit = 1;
|
|
|
|
ctl->start = 0;
|
|
|
|
ctl->private = NULL;
|
|
|
|
ctl->op = &free_ino_op;
|
Btrfs: fix race between writing free space cache and trimming
Trimming is completely transactionless, and the way it operates consists
of hiding free space entries from a block group, perform the trim/discard
and then make the free space entries visible again.
Therefore while a free space entry is being trimmed, we can have free space
cache writing running in parallel (as part of a transaction commit) which
will miss the free space entry. This means that an unmount (or crash/reboot)
after that transaction commit and mount again before another transaction
starts/commits after the discard finishes, we will have some free space
that won't be used again unless the free space cache is rebuilt. After the
unmount, fsck (btrfsck, btrfs check) reports the issue like the following
example:
*** fsck.btrfs output ***
checking extents
checking free space cache
There is no free space entry for 521764864-521781248
There is no free space entry for 521764864-1103101952
cache appears valid but isnt 29360128
Checking filesystem on /dev/sdc
UUID: b4789e27-4774-4626-98e9-ae8dfbfb0fb5
found 1235681286 bytes used err is -22
(...)
Another issue caused by this race is a crash while writing bitmap entries
to the cache, because while the cache writeout task accesses the bitmaps,
the trim task can be concurrently modifying the bitmap or worse might
be freeing the bitmap. The later case results in the following crash:
[55650.804460] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC
[55650.804835] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc loop parport_pc parport i2c_piix4 psmouse evdev pcspkr microcode processor i2ccore serio_raw thermal_sys button ext4 crc16 jbd2 mbcache sg sd_mod crc_t10dif sr_mod cdrom crct10dif_generic crct10dif_common ata_generic virtio_scsi floppy ata_piix libata virtio_pci virtio_ring virtio scsi_mod e1000 [last unloaded: btrfs]
[55650.806169] CPU: 1 PID: 31002 Comm: btrfs-transacti Tainted: G W 3.17.0-rc5-btrfs-next-1+ #1
[55650.806493] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
[55650.806867] task: ffff8800b12f6410 ti: ffff880071538000 task.ti: ffff880071538000
[55650.807166] RIP: 0010:[<ffffffffa037cf45>] [<ffffffffa037cf45>] write_bitmap_entries+0x65/0xbb [btrfs]
[55650.807514] RSP: 0018:ffff88007153bc30 EFLAGS: 00010246
[55650.807687] RAX: 000000005d1ec000 RBX: ffff8800a665df08 RCX: 0000000000000400
[55650.807885] RDX: ffff88005d1ec000 RSI: 6b6b6b6b6b6b6b6b RDI: ffff88005d1ec000
[55650.808017] RBP: ffff88007153bc58 R08: 00000000ddd51536 R09: 00000000000001e0
[55650.808017] R10: 0000000000000000 R11: 0000000000000037 R12: 6b6b6b6b6b6b6b6b
[55650.808017] R13: ffff88007153bca8 R14: 6b6b6b6b6b6b6b6b R15: ffff88007153bc98
[55650.808017] FS: 0000000000000000(0000) GS:ffff88023ec80000(0000) knlGS:0000000000000000
[55650.808017] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[55650.808017] CR2: 0000000002273b88 CR3: 00000000b18f6000 CR4: 00000000000006e0
[55650.808017] Stack:
[55650.808017] ffff88020e834e00 ffff880172d68db0 0000000000000000 ffff88019257c800
[55650.808017] ffff8801d42ea720 ffff88007153bd10 ffffffffa037d2fa ffff880224e99180
[55650.808017] ffff8801469a6188 ffff880224e99140 ffff880172d68c50 00000003000000b7
[55650.808017] Call Trace:
[55650.808017] [<ffffffffa037d2fa>] __btrfs_write_out_cache+0x1ea/0x37f [btrfs]
[55650.808017] [<ffffffffa037d959>] btrfs_write_out_cache+0xa1/0xd8 [btrfs]
[55650.808017] [<ffffffffa033936b>] btrfs_write_dirty_block_groups+0x4b5/0x505 [btrfs]
[55650.808017] [<ffffffffa03aa98e>] commit_cowonly_roots+0x15e/0x1f7 [btrfs]
[55650.808017] [<ffffffff813eb9c7>] ? _raw_spin_lock+0xe/0x10
[55650.808017] [<ffffffffa0346e46>] btrfs_commit_transaction+0x411/0x882 [btrfs]
[55650.808017] [<ffffffffa03432a4>] transaction_kthread+0xf2/0x1a4 [btrfs]
[55650.808017] [<ffffffffa03431b2>] ? btrfs_cleanup_transaction+0x3d8/0x3d8 [btrfs]
[55650.808017] [<ffffffff8105966b>] kthread+0xb7/0xbf
[55650.808017] [<ffffffff810595b4>] ? __kthread_parkme+0x67/0x67
[55650.808017] [<ffffffff813ebeac>] ret_from_fork+0x7c/0xb0
[55650.808017] [<ffffffff810595b4>] ? __kthread_parkme+0x67/0x67
[55650.808017] Code: 4c 89 ef 8d 70 ff e8 d4 fc ff ff 41 8b 45 34 41 39 45 30 7d 5c 31 f6 4c 89 ef e8 80 f6 ff ff 49 8b 7d 00 4c 89 f6 b9 00 04 00 00 <f3> a5 4c 89 ef 41 8b 45 30 8d 70 ff e8 a3 fc ff ff 41 8b 45 34
[55650.808017] RIP [<ffffffffa037cf45>] write_bitmap_entries+0x65/0xbb [btrfs]
[55650.808017] RSP <ffff88007153bc30>
[55650.815725] ---[ end trace 1c032e96b149ff86 ]---
Fix this by serializing both tasks in such a way that cache writeout
doesn't wait for the trim/discard of free space entries to finish and
doesn't miss any free space entry.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-12-02 00:04:09 +07:00
|
|
|
INIT_LIST_HEAD(&ctl->trimming_ranges);
|
|
|
|
mutex_init(&ctl->cache_writeout_mutex);
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Initially we allow to use 16K of ram to cache chunks of
|
|
|
|
* inode numbers before we resort to bitmaps. This is somewhat
|
|
|
|
* arbitrary, but it will be adjusted in runtime.
|
|
|
|
*/
|
|
|
|
ctl->extents_thresh = INIT_THRESHOLD;
|
|
|
|
|
|
|
|
spin_lock_init(&pinned->tree_lock);
|
|
|
|
pinned->unit = 1;
|
|
|
|
pinned->start = 0;
|
|
|
|
pinned->private = NULL;
|
|
|
|
pinned->extents_thresh = 0;
|
|
|
|
pinned->op = &pinned_free_ino_op;
|
|
|
|
}
|
|
|
|
|
2011-04-20 09:33:24 +07:00
|
|
|
int btrfs_save_ino_cache(struct btrfs_root *root,
|
|
|
|
struct btrfs_trans_handle *trans)
|
|
|
|
{
|
|
|
|
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct inode *inode;
|
2011-11-11 08:45:04 +07:00
|
|
|
struct btrfs_block_rsv *rsv;
|
|
|
|
u64 num_bytes;
|
2011-04-20 09:33:24 +07:00
|
|
|
u64 alloc_hint = 0;
|
|
|
|
int ret;
|
|
|
|
int prealloc;
|
|
|
|
bool retry = false;
|
|
|
|
|
2011-06-01 16:42:49 +07:00
|
|
|
/* only fs tree and subvol/snap needs ino cache */
|
|
|
|
if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
|
|
|
|
(root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
|
|
|
|
root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
|
|
|
|
return 0;
|
|
|
|
|
2011-06-01 02:33:33 +07:00
|
|
|
/* Don't save inode cache if we are deleting this root */
|
2013-09-05 21:58:43 +07:00
|
|
|
if (btrfs_root_refs(&root->root_item) == 0)
|
2011-06-01 02:33:33 +07:00
|
|
|
return 0;
|
|
|
|
|
2016-06-10 08:38:35 +07:00
|
|
|
if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
|
2011-06-03 20:36:29 +07:00
|
|
|
return 0;
|
|
|
|
|
2011-04-20 09:33:24 +07:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2011-06-03 20:36:29 +07:00
|
|
|
|
2011-11-11 08:45:04 +07:00
|
|
|
rsv = trans->block_rsv;
|
|
|
|
trans->block_rsv = &root->fs_info->trans_block_rsv;
|
|
|
|
|
|
|
|
num_bytes = trans->bytes_reserved;
|
|
|
|
/*
|
|
|
|
* 1 item for inode item insertion if need
|
2013-05-13 20:55:09 +07:00
|
|
|
* 4 items for inode item update (in the worst case)
|
|
|
|
* 1 items for slack space if we need do truncation
|
2011-11-11 08:45:04 +07:00
|
|
|
* 1 item for free space object
|
|
|
|
* 3 items for pre-allocation
|
|
|
|
*/
|
2013-05-13 20:55:09 +07:00
|
|
|
trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10);
|
Btrfs: improve the noflush reservation
In some places(such as: evicting inode), we just can not flush the reserved
space of delalloc, flushing the delayed directory index and delayed inode
is OK, but we don't try to flush those things and just go back when there is
no enough space to be reserved. This patch fixes this problem.
We defined 3 types of the flush operations: NO_FLUSH, FLUSH_LIMIT and FLUSH_ALL.
If we can in the transaction, we should not flush anything, or the deadlock
would happen, so use NO_FLUSH. If we flushing the reserved space of delalloc
would cause deadlock, use FLUSH_LIMIT. In the other cases, FLUSH_ALL is used,
and we will flush all things.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2012-10-16 18:33:38 +07:00
|
|
|
ret = btrfs_block_rsv_add(root, trans->block_rsv,
|
|
|
|
trans->bytes_reserved,
|
|
|
|
BTRFS_RESERVE_NO_FLUSH);
|
2011-11-11 08:45:04 +07:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2012-02-24 22:39:05 +07:00
|
|
|
trace_btrfs_space_reservation(root->fs_info, "ino_cache",
|
2012-03-29 20:57:44 +07:00
|
|
|
trans->transid, trans->bytes_reserved, 1);
|
2011-04-20 09:33:24 +07:00
|
|
|
again:
|
|
|
|
inode = lookup_free_ino_inode(root, path);
|
2012-03-12 22:03:00 +07:00
|
|
|
if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
|
2011-04-20 09:33:24 +07:00
|
|
|
ret = PTR_ERR(inode);
|
2011-11-11 08:45:04 +07:00
|
|
|
goto out_release;
|
2011-04-20 09:33:24 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
if (IS_ERR(inode)) {
|
2012-03-12 22:03:00 +07:00
|
|
|
BUG_ON(retry); /* Logic error */
|
2011-04-20 09:33:24 +07:00
|
|
|
retry = true;
|
|
|
|
|
|
|
|
ret = create_free_ino_inode(root, trans, path);
|
|
|
|
if (ret)
|
2011-11-11 08:45:04 +07:00
|
|
|
goto out_release;
|
2011-04-20 09:33:24 +07:00
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
BTRFS_I(inode)->generation = 0;
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2012-03-12 22:03:00 +07:00
|
|
|
if (ret) {
|
2016-06-11 05:19:25 +07:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 22:03:00 +07:00
|
|
|
goto out_put;
|
|
|
|
}
|
2011-04-20 09:33:24 +07:00
|
|
|
|
|
|
|
if (i_size_read(inode) > 0) {
|
2015-04-07 02:46:08 +07:00
|
|
|
ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
|
2012-03-12 22:03:00 +07:00
|
|
|
if (ret) {
|
2013-05-13 20:55:08 +07:00
|
|
|
if (ret != -ENOSPC)
|
2016-06-11 05:19:25 +07:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2011-04-20 09:33:24 +07:00
|
|
|
goto out_put;
|
2012-03-12 22:03:00 +07:00
|
|
|
}
|
2011-04-20 09:33:24 +07:00
|
|
|
}
|
|
|
|
|
2014-02-05 08:37:48 +07:00
|
|
|
spin_lock(&root->ino_cache_lock);
|
|
|
|
if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
|
2011-04-20 09:33:24 +07:00
|
|
|
ret = -1;
|
2014-02-05 08:37:48 +07:00
|
|
|
spin_unlock(&root->ino_cache_lock);
|
2011-04-20 09:33:24 +07:00
|
|
|
goto out_put;
|
|
|
|
}
|
2014-02-05 08:37:48 +07:00
|
|
|
spin_unlock(&root->ino_cache_lock);
|
2011-04-20 09:33:24 +07:00
|
|
|
|
|
|
|
spin_lock(&ctl->tree_lock);
|
|
|
|
prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 19:29:47 +07:00
|
|
|
prealloc = ALIGN(prealloc, PAGE_SIZE);
|
|
|
|
prealloc += ctl->total_bitmaps * PAGE_SIZE;
|
2011-04-20 09:33:24 +07:00
|
|
|
spin_unlock(&ctl->tree_lock);
|
|
|
|
|
|
|
|
/* Just to make sure we have enough space */
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 19:29:47 +07:00
|
|
|
prealloc += 8 * PAGE_SIZE;
|
2011-04-20 09:33:24 +07:00
|
|
|
|
2015-09-08 16:25:55 +07:00
|
|
|
ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
|
2011-04-20 09:33:24 +07:00
|
|
|
if (ret)
|
|
|
|
goto out_put;
|
|
|
|
|
|
|
|
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
|
|
|
|
prealloc, prealloc, &alloc_hint);
|
2011-08-30 21:19:10 +07:00
|
|
|
if (ret) {
|
2015-09-08 16:25:55 +07:00
|
|
|
btrfs_delalloc_release_space(inode, 0, prealloc);
|
2011-04-20 09:33:24 +07:00
|
|
|
goto out_put;
|
2011-08-30 21:19:10 +07:00
|
|
|
}
|
2015-09-08 16:25:55 +07:00
|
|
|
btrfs_free_reserved_data_space(inode, 0, prealloc);
|
2011-04-20 09:33:24 +07:00
|
|
|
|
2013-09-20 20:43:28 +07:00
|
|
|
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
|
2011-04-20 09:33:24 +07:00
|
|
|
out_put:
|
|
|
|
iput(inode);
|
2011-11-11 08:45:04 +07:00
|
|
|
out_release:
|
2012-02-24 22:39:05 +07:00
|
|
|
trace_btrfs_space_reservation(root->fs_info, "ino_cache",
|
2012-03-29 20:57:44 +07:00
|
|
|
trans->transid, trans->bytes_reserved, 0);
|
2011-11-11 08:45:04 +07:00
|
|
|
btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
|
2011-04-20 09:33:24 +07:00
|
|
|
out:
|
2011-11-11 08:45:04 +07:00
|
|
|
trans->block_rsv = rsv;
|
|
|
|
trans->bytes_reserved = num_bytes;
|
2011-04-20 09:33:24 +07:00
|
|
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-01-07 20:26:59 +07:00
|
|
|
int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
|
2007-04-06 00:35:25 +07:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
int ret;
|
2007-10-16 03:14:19 +07:00
|
|
|
struct extent_buffer *l;
|
2007-04-06 00:35:25 +07:00
|
|
|
struct btrfs_key search_key;
|
2007-10-16 03:14:19 +07:00
|
|
|
struct btrfs_key found_key;
|
2007-04-06 00:35:25 +07:00
|
|
|
int slot;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2011-03-23 15:14:16 +07:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2007-04-06 00:35:25 +07:00
|
|
|
|
2008-09-06 03:43:53 +07:00
|
|
|
search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
|
|
|
|
search_key.type = -1;
|
2007-04-06 00:35:25 +07:00
|
|
|
search_key.offset = (u64)-1;
|
|
|
|
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto error;
|
2012-03-12 22:03:00 +07:00
|
|
|
BUG_ON(ret == 0); /* Corruption */
|
2007-04-06 00:35:25 +07:00
|
|
|
if (path->slots[0] > 0) {
|
|
|
|
slot = path->slots[0] - 1;
|
2007-10-16 03:14:19 +07:00
|
|
|
l = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(l, &found_key, slot);
|
2009-09-22 02:56:00 +07:00
|
|
|
*objectid = max_t(u64, found_key.objectid,
|
|
|
|
BTRFS_FIRST_FREE_OBJECTID - 1);
|
2007-04-06 00:35:25 +07:00
|
|
|
} else {
|
2009-09-22 02:56:00 +07:00
|
|
|
*objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
|
2007-04-06 00:35:25 +07:00
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
error:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 09:06:11 +07:00
|
|
|
int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
|
2007-03-21 01:38:32 +07:00
|
|
|
{
|
|
|
|
int ret;
|
2008-06-26 03:01:30 +07:00
|
|
|
mutex_lock(&root->objectid_mutex);
|
2007-03-21 01:38:32 +07:00
|
|
|
|
2009-09-22 02:56:00 +07:00
|
|
|
if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
|
2016-03-09 13:18:57 +07:00
|
|
|
btrfs_warn(root->fs_info,
|
|
|
|
"the objectid of root %llu reaches its highest value",
|
|
|
|
root->root_key.objectid);
|
2009-09-22 02:56:00 +07:00
|
|
|
ret = -ENOSPC;
|
|
|
|
goto out;
|
2007-03-21 01:38:32 +07:00
|
|
|
}
|
2009-09-22 02:56:00 +07:00
|
|
|
|
|
|
|
*objectid = ++root->highest_objectid;
|
|
|
|
ret = 0;
|
|
|
|
out:
|
2008-06-26 03:01:30 +07:00
|
|
|
mutex_unlock(&root->objectid_mutex);
|
2007-03-21 01:38:32 +07:00
|
|
|
return ret;
|
|
|
|
}
|