file_table.c 9.99 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6 7 8 9 10
/*
 *  linux/fs/file_table.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
Al Viro's avatar
Al Viro committed
11
#include <linux/fdtable.h>
Linus Torvalds's avatar
Linus Torvalds committed
12 13 14 15
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/security.h>
16
#include <linux/cred.h>
Linus Torvalds's avatar
Linus Torvalds committed
17
#include <linux/eventpoll.h>
18
#include <linux/rcupdate.h>
Linus Torvalds's avatar
Linus Torvalds committed
19
#include <linux/mount.h>
20
#include <linux/capability.h>
Linus Torvalds's avatar
Linus Torvalds committed
21
#include <linux/cdev.h>
Robert Love's avatar
Robert Love committed
22
#include <linux/fsnotify.h>
23 24
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
Nick Piggin's avatar
Nick Piggin committed
25
#include <linux/percpu.h>
Al Viro's avatar
Al Viro committed
26
#include <linux/task_work.h>
27
#include <linux/ima.h>
28
#include <linux/swap.h>
29

30
#include <linux/atomic.h>
Linus Torvalds's avatar
Linus Torvalds committed
31

32 33
#include "internal.h"

Linus Torvalds's avatar
Linus Torvalds committed
34 35 36 37 38
/* sysctl tunables... */
struct files_stat_struct files_stat = {
	.max_files = NR_FILE
};

39 40 41
/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __read_mostly;

42
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
Linus Torvalds's avatar
Linus Torvalds committed
43

Al Viro's avatar
Al Viro committed
44
static void file_free_rcu(struct rcu_head *head)
Linus Torvalds's avatar
Linus Torvalds committed
45
{
46 47 48
	struct file *f = container_of(head, struct file, f_u.fu_rcuhead);

	put_cred(f->f_cred);
49
	kmem_cache_free(filp_cachep, f);
Linus Torvalds's avatar
Linus Torvalds committed
50 51
}

52
static inline void file_free(struct file *f)
Linus Torvalds's avatar
Linus Torvalds committed
53
{
54
	security_file_free(f);
55 56
	if (!(f->f_mode & FMODE_NOACCOUNT))
		percpu_counter_dec(&nr_files);
57
	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
Linus Torvalds's avatar
Linus Torvalds committed
58 59
}

60 61 62
/*
 * Return the total number of open files in the system
 */
63
static long get_nr_files(void)
Linus Torvalds's avatar
Linus Torvalds committed
64
{
65
	return percpu_counter_read_positive(&nr_files);
Linus Torvalds's avatar
Linus Torvalds committed
66 67
}

68 69 70
/*
 * Return the maximum number of open files in the system
 */
71
unsigned long get_max_files(void)
72
{
73
	return files_stat.max_files;
74
}
75 76 77 78 79 80
EXPORT_SYMBOL_GPL(get_max_files);

/*
 * Handle nr_files sysctl
 */
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
81
int proc_nr_files(struct ctl_table *table, int write,
82 83 84
                     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	files_stat.nr_files = get_nr_files();
85
	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
86 87
}
#else
88
int proc_nr_files(struct ctl_table *table, int write,
89 90 91 92 93
                     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return -ENOSYS;
}
#endif
94

95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
/*
 * Allocate a zeroed struct file from filp_cachep and do the setup shared
 * by alloc_empty_file() and alloc_empty_file_noaccount().
 *
 * @flags: open flags; stored in f_flags and used to derive f_mode.
 * @cred:  credentials to reference from the new file.
 *
 * Returns the new file with f_count set to 1, ERR_PTR(-ENOMEM) if the
 * allocation fails, or the error reported by security_file_alloc().
 * nr_files is not touched here.
 */
static struct file *__alloc_file(int flags, const struct cred *cred)
{
	struct file *file = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
	int err;

	if (unlikely(!file))
		return ERR_PTR(-ENOMEM);

	file->f_cred = get_cred(cred);
	err = security_file_alloc(file);
	if (unlikely(err)) {
		/* Free right away by calling the RCU callback directly. */
		file_free_rcu(&file->f_u.fu_rcuhead);
		return ERR_PTR(err);
	}

	atomic_long_set(&file->f_count, 1);
	rwlock_init(&file->f_owner.lock);
	spin_lock_init(&file->f_lock);
	mutex_init(&file->f_pos_lock);
	eventpoll_init_file(file);
	file->f_flags = flags;
	file->f_mode = OPEN_FMODE(flags);
	/* file->f_version stays 0 from the zeroing allocation */

	return file;
}

Linus Torvalds's avatar
Linus Torvalds committed
123
/* Find an unused file structure and return a pointer to it.
124 125
 * Returns an error pointer if some error happend e.g. we over file
 * structures limit, run out of memory or operation is not permitted.
126 127 128 129 130 131
 *
 * Be very careful using this.  You are responsible for
 * getting write access to any mount that you might assign
 * to this filp, if it is opened for write.  If this is not
 * done, you will imbalance int the mount's writer count
 * and a warning at __fput() time.
Linus Torvalds's avatar
Linus Torvalds committed
132
 */
133
struct file *alloc_empty_file(int flags, const struct cred *cred)
Linus Torvalds's avatar
Linus Torvalds committed
134
{
135
	static long old_max;
136
	struct file *f;
Linus Torvalds's avatar
Linus Torvalds committed
137 138 139 140

	/*
	 * Privileged users can go above max_files
	 */
141 142 143 144 145
	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
		/*
		 * percpu_counters are inaccurate.  Do an expensive check before
		 * we go and fail.
		 */
146
		if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
147 148
			goto over;
	}
149

150 151 152
	f = __alloc_file(flags, cred);
	if (!IS_ERR(f))
		percpu_counter_inc(&nr_files);
Linus Torvalds's avatar
Linus Torvalds committed
153

154 155 156
	return f;

over:
Linus Torvalds's avatar
Linus Torvalds committed
157
	/* Ran out of filps - report that */
158
	if (get_nr_files() > old_max) {
159
		pr_info("VFS: file-max limit %lu reached\n", get_max_files());
160
		old_max = get_nr_files();
Linus Torvalds's avatar
Linus Torvalds committed
161
	}
162
	return ERR_PTR(-ENFILE);
Linus Torvalds's avatar
Linus Torvalds committed
163 164
}

165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
/*
 * Like alloc_empty_file(), but neither checks nor modifies nr_files.
 * The returned file carries FMODE_NOACCOUNT, which makes file_free()
 * skip the matching counter decrement.
 *
 * Should not be used unless there's a very good reason to do so.
 */
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
	struct file *file;

	file = __alloc_file(flags, cred);
	if (IS_ERR(file))
		return file;

	file->f_mode |= FMODE_NOACCOUNT;
	return file;
}

180 181
/**
 * alloc_file - allocate and initialize a 'struct file'
182 183
 *
 * @path: the (dentry, vfsmount) pair for the new file
184
 * @flags: O_... flags with which the new file will be opened
185 186
 * @fop: the 'struct file_operations' for the new file
 */
Al Viro's avatar
Al Viro committed
187
static struct file *alloc_file(const struct path *path, int flags,
188
		const struct file_operations *fop)
189 190 191
{
	struct file *file;

192
	file = alloc_empty_file(flags, current_cred());
193
	if (IS_ERR(file))
194
		return file;
195

196
	file->f_path = *path;
197
	file->f_inode = path->dentry->d_inode;
198
	file->f_mapping = path->dentry->d_inode->i_mapping;
199
	file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
200
	if ((file->f_mode & FMODE_READ) &&
201
	     likely(fop->read || fop->read_iter))
202 203
		file->f_mode |= FMODE_CAN_READ;
	if ((file->f_mode & FMODE_WRITE) &&
204
	     likely(fop->write || fop->write_iter))
205
		file->f_mode |= FMODE_CAN_WRITE;
Al Viro's avatar
Al Viro committed
206
	file->f_mode |= FMODE_OPENED;
207
	file->f_op = fop;
208
	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
209
		i_readcount_inc(path->dentry->d_inode);
Al Viro's avatar
Al Viro committed
210
	return file;
211 212
}

213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
/*
 * Create a struct file for @inode backed by a freshly allocated pseudo
 * dentry named @name on @mnt's superblock.
 *
 * @inode: inode the new dentry is instantiated with
 * @mnt:   mount to reference from the file's path
 * @name:  name given to the pseudo dentry
 * @flags: open flags passed on to alloc_file()
 * @fops:  file operations for the new file
 *
 * Returns the new file, ERR_PTR(-ENOMEM) if the dentry cannot be
 * allocated, or the error pointer from alloc_file().
 */
struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
				const char *name, int flags,
				const struct file_operations *fops)
{
	/* Fallback dentry operations for superblocks that define none. */
	static const struct dentry_operations anon_ops = {
		.d_dname = simple_dname
	};
	struct qstr this = QSTR_INIT(name, strlen(name));
	struct path path;
	struct file *file;

	path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
	if (!path.dentry)
		return ERR_PTR(-ENOMEM);
	if (!mnt->mnt_sb->s_d_op)
		d_set_d_op(path.dentry, &anon_ops);
	path.mnt = mntget(mnt);
	d_instantiate(path.dentry, inode);
	file = alloc_file(&path, flags, fops);
	if (IS_ERR(file)) {
		/*
		 * NOTE(review): presumably d_instantiate() handed the caller's
		 * inode reference to the dentry, so an extra one is taken before
		 * path_put() tears the dentry down - confirm against callers.
		 */
		ihold(inode);
		path_put(&path);
	}
	return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);

Al Viro's avatar
Al Viro committed
240 241 242 243 244 245 246 247 248 249 250
/*
 * Allocate a new file that shares @base's path and mapping but is opened
 * with its own @flags and @fops.  On success an additional reference is
 * taken on the path for the new file.
 *
 * Returns the new file or the error pointer from alloc_file().
 */
struct file *alloc_file_clone(struct file *base, int flags,
				const struct file_operations *fops)
{
	struct file *clone;

	clone = alloc_file(&base->f_path, flags, fops);
	if (IS_ERR(clone))
		return clone;

	path_get(&clone->f_path);
	clone->f_mapping = base->f_mapping;
	return clone;
}

251
/* the real guts of fput() - releasing the last reference to file
Linus Torvalds's avatar
Linus Torvalds committed
252
 */
253
static void __fput(struct file *file)
Linus Torvalds's avatar
Linus Torvalds committed
254
{
255 256
	struct dentry *dentry = file->f_path.dentry;
	struct vfsmount *mnt = file->f_path.mnt;
257
	struct inode *inode = file->f_inode;
Linus Torvalds's avatar
Linus Torvalds committed
258

Al Viro's avatar
Al Viro committed
259 260 261
	if (unlikely(!(file->f_mode & FMODE_OPENED)))
		goto out;

Linus Torvalds's avatar
Linus Torvalds committed
262
	might_sleep();
Robert Love's avatar
Robert Love committed
263 264

	fsnotify_close(file);
Linus Torvalds's avatar
Linus Torvalds committed
265 266 267 268 269
	/*
	 * The function eventpoll_release() should be the first called
	 * in the file cleanup chain.
	 */
	eventpoll_release(file);
270
	locks_remove_file(file);
Linus Torvalds's avatar
Linus Torvalds committed
271

272
	ima_file_free(file);
273
	if (unlikely(file->f_flags & FASYNC)) {
Al Viro's avatar
Al Viro committed
274
		if (file->f_op->fasync)
275 276
			file->f_op->fasync(-1, file, 0);
	}
Al Viro's avatar
Al Viro committed
277
	if (file->f_op->release)
Linus Torvalds's avatar
Linus Torvalds committed
278
		file->f_op->release(inode, file);
279 280
	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
		     !(file->f_mode & FMODE_PATH))) {
Linus Torvalds's avatar
Linus Torvalds committed
281
		cdev_put(inode->i_cdev);
282
	}
Linus Torvalds's avatar
Linus Torvalds committed
283
	fops_put(file->f_op);
284
	put_pid(file->f_owner.pid);
285 286
	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
		i_readcount_dec(inode);
287 288 289 290
	if (file->f_mode & FMODE_WRITER) {
		put_write_access(inode);
		__mnt_drop_write(mnt);
	}
Linus Torvalds's avatar
Linus Torvalds committed
291 292
	dput(dentry);
	mntput(mnt);
Al Viro's avatar
Al Viro committed
293 294
out:
	file_free(file);
Linus Torvalds's avatar
Linus Torvalds committed
295 296
}

297
static LLIST_HEAD(delayed_fput_list);
Al Viro's avatar
Al Viro committed
298 299
static void delayed_fput(struct work_struct *unused)
{
300
	struct llist_node *node = llist_del_all(&delayed_fput_list);
301
	struct file *f, *t;
302

303 304
	llist_for_each_entry_safe(f, t, node, f_u.fu_llist)
		__fput(f);
Al Viro's avatar
Al Viro committed
305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326
}

/*
 * task_work callback installed by fput(): recover the struct file from
 * the callback head embedded in f_u.fu_rcuhead and release it.
 */
static void ____fput(struct callback_head *work)
{
	struct file *file = container_of(work, struct file, f_u.fu_rcuhead);

	__fput(file);
}

/*
 * If a kernel thread really needs the final fput() it has done to
 * complete, call this.  The only user right now is the boot code - we
 * *do* need to make sure our writes to binaries on initramfs have
 * not left us with an opened struct file waiting for __fput() - execve()
 * won't work without that.  Please, don't add more callers without
 * very good reasons; in particular, never call this with locks
 * held and never call it from a thread that might need to do
 * some work on any kind of umount.
 *
 * Implemented by running the delayed_fput() work handler synchronously.
 */
void flush_delayed_fput(void)
{
	delayed_fput(NULL);
}

327
/* Delayed work item that runs delayed_fput(); scheduled from fput(). */
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
Al Viro's avatar
Al Viro committed
328

329 330
void fput(struct file *file)
{
Al Viro's avatar
Al Viro committed
331 332
	if (atomic_long_dec_and_test(&file->f_count)) {
		struct task_struct *task = current;
333 334 335 336 337

		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
			init_task_work(&file->f_u.fu_rcuhead, ____fput);
			if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
				return;
338 339
			/*
			 * After this task has run exit_task_work(),
340
			 * task_work_add() will fail.  Fall through to delayed
341 342
			 * fput to avoid leaking *file.
			 */
Al Viro's avatar
Al Viro committed
343
		}
344 345

		if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
346
			schedule_delayed_work(&delayed_fput_work, 1);
Al Viro's avatar
Al Viro committed
347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362
	}
}

/*
 * Synchronous analog of fput(); for kernel threads that might be needed
 * in some umount() (and thus can't use flush_delayed_fput() without
 * risking deadlocks), need to wait for completion of __fput() and know
 * that for this specific struct file it won't involve anything that would
 * need them.  Use only if you really need it - at the very least,
 * don't blindly convert fput() by a kernel thread to this.
 *
 * Calling this from anything other than a kernel thread trips the BUG_ON()
 * when the last reference is dropped.
 */
void __fput_sync(struct file *file)
{
	if (atomic_long_dec_and_test(&file->f_count)) {
		struct task_struct *task = current;
		BUG_ON(!(task->flags & PF_KTHREAD));
		__fput(file);
	}
}

EXPORT_SYMBOL(fput);

369
/*
 * Create the "filp" slab cache used for struct file allocations and
 * initialise the nr_files per-cpu counter to zero.
 *
 * NOTE(review): the return value of percpu_counter_init() is not checked
 * here - confirm that is acceptable for this init path.
 */
void __init files_init(void)
{
	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
375

376 377 378 379 380 381 382
/*
 * One file with associated inode and dcache is very roughly 1K. Per default
 * do not use more than 10% of our memory for files.
 *
 * The estimate reserves 1.5 times the pages that are not currently free
 * (capped at nr_pages - 1), converts the remaining pages to KiB and takes
 * a tenth of that.  The result never goes below NR_FILE.
 */
void __init files_maxfiles_init(void)
{
	unsigned long n;
	unsigned long nr_pages = totalram_pages();
	unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;

	/* Keep the subtraction below from wrapping. */
	memreserve = min(memreserve, nr_pages - 1);
	n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;

	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
}