mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-11-30 13:26:41 +07:00
285b2c4fdd
The maximum size of a shmem/tmpfs file has been limited by the maximum size of its triple-indirect swap vector. With 4kB page size, maximum filesize was just over 2TB on a 32-bit kernel, but sadly one eighth of that on a 64-bit kernel. (With 8kB page size, maximum filesize was just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, MAX_LFS_FILESIZE being then more restrictive than swap vector layout.) It's a shame that tmpfs should be more restrictive than ramfs, and this limitation has now been noticed. Add another level to the swap vector? No, it became obscure and hard to maintain, once I complicated it to make use of highmem pages nine years ago: better choose another way. Surely, if 2.4 had had the radix tree pagecache introduced in 2.5, then tmpfs would never have invented its own peculiar radix tree: we would have fitted swap entries into the common radix tree instead, in much the same way as we fit swap entries into page tables. And why should each file have a separate radix tree for its pages and for its swap entries? The swap entries are required precisely where and when the pages are not. We want to put them together in a single radix tree: which can then avoid much of the locking which was needed to prevent them from being exchanged underneath us. This also avoids the waste of memory devoted to swap vectors, first in the shmem_inode itself, then at least two more pages once a file grew beyond 16 data pages (pages accounted by df and du, but not by memcg). Allocated upfront, to avoid allocation when under swapping pressure, but pure waste when CONFIG_SWAP is not set - I have never spattered around the ifdefs to prevent that, preferring this move to sharing the common radix tree instead. There are three downsides to sharing the radix tree. One, that it binds tmpfs more tightly to the rest of mm, either requiring knowledge of swap entries in radix tree there, or duplication of its code here in shmem.c. 
I believe that the simplifications and memory savings (and probable higher performance, not yet measured) justify that. Two, that on HIGHMEM systems with SWAP enabled, it's the lowmem radix nodes that cannot be freed under memory pressure - whereas before it was the less precious highmem swap vector pages that could not be freed. I'm hoping that 64-bit has now been accessible for long enough, that the highmem argument has grown much less persuasive. Three, that swapoff is slower than it used to be on tmpfs files, since it's using a simple generic mechanism not tailored to it: I find this noticeable, and shall want to improve, but maybe nobody else will notice. So... now remove most of the old swap vector code from shmem.c. But, for the moment, keep the simple i_direct vector of 16 pages, with simple accessors shmem_put_swap() and shmem_get_swap(), as a toy implementation to help mark where swap needs to be handled in subsequent patches. Signed-off-by: Hugh Dickins <hughd@google.com> Acked-by: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
71 lines · 2.4 KiB · C
#ifndef __SHMEM_FS_H
|
|
#define __SHMEM_FS_H
|
|
|
|
#include <linux/swap.h>
|
|
#include <linux/mempolicy.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/percpu_counter.h>
|
|
|
|
/* inode in-kernel data */

/*
 * Number of swap entries kept directly in the inode's i_direct vector.
 * Per the commit message, this is a deliberately simple "toy"
 * implementation, retained while swap handling moves to the shared
 * radix tree in subsequent patches.
 */
#define SHMEM_NR_DIRECT 16

/*
 * Maximum length of a symlink target stored inline in the inode: it
 * overlays the i_direct vector via the union in struct shmem_inode_info
 * below, so the two sizes are tied together.
 */
#define SHMEM_SYMLINK_INLINE_LEN (SHMEM_NR_DIRECT * sizeof(swp_entry_t))
|
/*
 * Per-inode shmem/tmpfs state, embedded around the generic VFS inode.
 * Obtain it from a struct inode with SHMEM_I() below.
 */
struct shmem_inode_info {
	spinlock_t lock;			/* protects the fields below — NOTE(review): exact coverage defined in mm/shmem.c; confirm there */
	unsigned long flags;			/* shmem-specific inode flags */
	unsigned long alloced;			/* data pages alloced to file */
	unsigned long swapped;			/* subtotal assigned to swap */
	struct shared_policy policy;		/* NUMA memory alloc policy */
	union {
		/* swap slots and inline symlink target share storage:
		 * a symlink inode never needs the swap vector */
		swp_entry_t i_direct[SHMEM_NR_DIRECT];	/* first blocks */
		char inline_symlink[SHMEM_SYMLINK_INLINE_LEN];
	};
	struct list_head swaplist;		/* chain of maybes on swap */
	struct list_head xattr_list;		/* list of shmem_xattr */
	struct inode vfs_inode;			/* embedded VFS inode; basis of SHMEM_I()'s container_of() */
};
|
|
|
|
/*
 * Per-superblock shmem/tmpfs state: mount-wide block/inode limits,
 * usage counters, and the root directory's ownership/mode/policy.
 */
struct shmem_sb_info {
	unsigned long max_blocks;		/* How many blocks are allowed */
	struct percpu_counter used_blocks;	/* How many are allocated */
	unsigned long max_inodes;		/* How many inodes are allowed */
	unsigned long free_inodes;		/* How many are left for allocation */
	spinlock_t stat_lock;			/* Serialize shmem_sb_info changes */
	uid_t uid;				/* Mount uid for root directory */
	gid_t gid;				/* Mount gid for root directory */
	mode_t mode;				/* Mount mode for root directory */
	struct mempolicy *mpol;			/* default memory policy for mappings */
};
|
|
|
|
static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
|
|
{
|
|
return container_of(inode, struct shmem_inode_info, vfs_inode);
|
|
}
|
|
|
|
/*
 * Functions in mm/shmem.c called directly from elsewhere:
 */
extern int init_tmpfs(void);
extern int shmem_fill_super(struct super_block *sb, void *data, int silent);
/* Create an unlinked tmpfs file of @size bytes (e.g. for SysV shm) */
extern struct file *shmem_file_setup(const char *name,
					loff_t size, unsigned long flags);
/* Back an anonymous/zero mapping with a shmem object */
extern int shmem_zero_setup(struct vm_area_struct *);
extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
/* Read (allocating if needed) the page at @index, using @gfp_mask */
extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
					pgoff_t index, gfp_t gfp_mask);
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
/* Called from swapoff: give @page back to the file owning @entry */
extern int shmem_unuse(swp_entry_t entry, struct page *page);
extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
					struct page **pagep, swp_entry_t *ent);
|
|
|
/*
 * shmem_read_mapping_page - read the page at @index from @mapping.
 *
 * Convenience wrapper around shmem_read_mapping_page_gfp() that uses
 * the mapping's own gfp mask for any allocation.
 */
static inline struct page *shmem_read_mapping_page(
				struct address_space *mapping, pgoff_t index)
{
	gfp_t gfp = mapping_gfp_mask(mapping);

	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}
|
|
|
|
#endif
|