From: David Howells FS-Cache: Avoid ENFILE checking for kernel-specific open files Make it possible to avoid ENFILE checking for kernel specific open files, such as are used by the CacheFiles module. After, for example, tarring up a kernel source tree over the network, the CacheFiles module may easily have 20000+ files open in the backing filesystem, thus causing all non-root processes to be given error ENFILE when they try to open a file, socket, pipe, etc.. Signed-Off-By: David Howells Signed-off-by: Trond Myklebust --- Documentation/sysctl/fs.txt | 6 ++++- fs/file_table.c | 48 +++++++++++++++++++++++++++++++++++-------- fs/open.c | 20 ++++++++++++++++++ include/linux/file.h | 1 - include/linux/fs.h | 10 +++++++++ include/linux/sysctl.h | 1 + kernel/sysctl.c | 11 ++++++++++ 7 files changed, 86 insertions(+), 11 deletions(-) diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 0b62c62..ead15f0 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -71,7 +71,7 @@ you might want to raise the limit. ============================================================== -file-max & file-nr: +file-max, file-nr & file-kernel: The kernel allocates file handles dynamically, but as yet it doesn't free them again. @@ -88,6 +88,10 @@ close to the maximum, but the number of significantly greater than 0, you've encountered a peak in your usage of file handles and you don't need to increase the maximum. +The value in file-kernel denotes the number of internal file handles +that the kernel has open. These do not contribute to ENFILE +accounting. + ============================================================== inode-max, inode-nr & inode-state: diff --git a/fs/file_table.c b/fs/file_table.c index 0131ba0..894711d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -29,10 +29,13 @@ struct files_stat_struct files_stat = { .max_files = NR_FILE }; +struct files_kernel_stat_struct files_kernel_stat; + /* public. Not pretty! */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); static struct percpu_counter nr_files __cacheline_aligned_in_smp; +static atomic_t nr_kernel_files; static inline void file_free_rcu(struct rcu_head *head) { @@ -42,7 +45,10 @@ static inline void file_free_rcu(struct static inline void file_free(struct file *f) { - percpu_counter_dec(&nr_files); + if (f->f_kernel_flags & FKFLAGS_NO_ENFILE) + atomic_dec(&nr_kernel_files); + else + percpu_counter_dec(&nr_files); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -73,45 +79,64 @@ int proc_nr_files(ctl_table *table, int files_stat.nr_files = get_nr_files(); return proc_dointvec(table, write, filp, buffer, lenp, ppos); } +int proc_files_kernel(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + files_kernel_stat.nr_kernel_files = atomic_read(&nr_kernel_files); + return proc_dointvec(table, write, filp, buffer, lenp, ppos); +} #else int proc_nr_files(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } +int proc_files_kernel(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} #endif /* Find an unused file structure and return a pointer to it. * Returns NULL, if there are no more free file structures or * we run out of memory. */ -struct file *get_empty_filp(void) +struct file *get_empty_kernel_filp(unsigned short kflags) { struct task_struct *tsk; static int old_max; struct file * f; /* - * Privileged users can go above max_files + * Privileged users can go above max_files and internal kernel users + * can avoid it completely */ - if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + if (!(kflags & FKFLAGS_NO_ENFILE) && + get_nr_files() >= files_stat.max_files && + !capable(CAP_SYS_ADMIN) + ) { /* - * percpu_counters are inaccurate. Do an expensive check before - * we go and fail. + * percpu_counters are inaccurate. Do an expensive + * check before we go and fail. */ if (percpu_counter_sum(&nr_files) >= files_stat.max_files) goto over; } - f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); if (f == NULL) goto fail; - percpu_counter_inc(&nr_files); - memset(f, 0, sizeof(*f)); + if (kflags & FKFLAGS_NO_ENFILE) + atomic_inc(&nr_kernel_files); + else + percpu_counter_inc(&nr_files); + if (security_file_alloc(f)) goto fail_sec; + f->f_kernel_flags = kflags; tsk = current; INIT_LIST_HEAD(&f->f_u.fu_list); atomic_set(&f->f_count, 1); @@ -137,6 +162,11 @@ fail: return NULL; } +struct file *get_empty_filp(void) +{ + return get_empty_kernel_filp(0); +} + EXPORT_SYMBOL(get_empty_filp); void fastcall fput(struct file *file) diff --git a/fs/open.c b/fs/open.c index 303f06d..3cf6c00 100644 --- a/fs/open.c +++ b/fs/open.c @@ -974,6 +974,26 @@ struct file *dentry_open(struct dentry * EXPORT_SYMBOL(dentry_open); /* + * open a specifically in-kernel file + */ +struct file *dentry_open_kernel(struct dentry *dentry, struct vfsmount *mnt, int flags) +{ + int error; + struct file *f; + + error = -ENFILE; + f = get_empty_kernel_filp(FKFLAGS_NO_ENFILE); + if (f == NULL) { + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + + return __dentry_open(dentry, mnt, flags, f, NULL); +} +EXPORT_SYMBOL_GPL(dentry_open_kernel); + +/* * Find an empty file descriptor entry, and mark it busy. */ int get_unused_fd(void) diff --git a/include/linux/file.h b/include/linux/file.h index 9f7c251..da7be8f 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -79,7 +79,6 @@ extern void FASTCALL(set_close_on_exec(u extern void put_filp(struct file *); extern int get_unused_fd(void); extern void FASTCALL(put_unused_fd(unsigned int fd)); -struct kmem_cache; extern struct file ** alloc_fd_array(int); extern void free_fd_array(struct file **, int); diff --git a/include/linux/fs.h b/include/linux/fs.h index 43aef9b..623df73 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -33,7 +33,11 @@ struct files_stat_struct { int nr_free_files; /* read only */ int max_files; /* tunable */ }; +struct files_kernel_stat_struct { + int nr_kernel_files; /* read only */ +}; extern struct files_stat_struct files_stat; +extern struct files_kernel_stat_struct files_kernel_stat; extern int get_max_files(void); struct inodes_stat_t { @@ -69,6 +73,8 @@ #define FMODE_PWRITE FMODE_PREAD /* Thes behavior for cross-node execution/opening_for_writing of files */ #define FMODE_EXEC 16 +#define FKFLAGS_NO_ENFILE 1 /* kernel internal file (ignored for ENFILE accounting) */ + #define RW_MASK 1 #define RWA_MASK 2 #define READ 0 @@ -678,6 +684,7 @@ struct file { atomic_t f_count; unsigned int f_flags; mode_t f_mode; + unsigned short f_kernel_flags; loff_t f_pos; struct fown_struct f_owner; unsigned int f_uid, f_gid; @@ -1419,6 +1426,7 @@ extern long do_sys_open(int fdf, const c int mode); extern struct file *filp_open(const char *, int, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); +extern struct file * dentry_open_kernel(struct dentry *, struct vfsmount *, int); extern int filp_close(struct file *, fl_owner_t id); extern char * getname(const char __user *); @@ -1622,6 +1630,7 @@ static inline void insert_inode_hash(str } extern struct file * get_empty_filp(void); +extern struct file * get_empty_kernel_filp(unsigned short fkflags); extern void file_move(struct file *f, struct list_head *list); extern void file_kill(struct file *f); struct bio; @@ -1647,6 +1656,7 @@ extern ssize_t generic_file_direct_write unsigned long *, loff_t, loff_t *, size_t, size_t); extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, unsigned long, loff_t, loff_t *, size_t, ssize_t); +extern int generic_file_buffered_write_one_kernel_page(struct file *, pgoff_t, struct page *); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov, diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index e4b1a4d..5bbbc79 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -794,6 +794,7 @@ enum FS_AIO_NR=18, /* current system-wide number of aio requests */ FS_AIO_MAX_NR=19, /* system-wide maximum number of aio requests */ FS_INOTIFY=20, /* inotify submenu */ + FS_FILE_KERNEL=21, /* int: number of internal kernel files */ }; /* /proc/sys/fs/quota/ */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 362a0cc..32043a2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -52,6 +52,9 @@ #include extern int proc_nr_files(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int proc_files_kernel(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ @@ -993,6 +996,14 @@ static ctl_table fs_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = FS_FILE_KERNEL, + .procname = "file-kernel", + .data = &files_stat, + .maxlen = 1*sizeof(int), + .mode = 0444, + .proc_handler = &proc_files_kernel, + }, + { .ctl_name = FS_DENTRY, .procname = "dentry-state", .data = &dentry_stat,