/*
 * NET		An implementation of the SOCKET network access protocol.
 *
 * Version:	@(#)socket.c	1.1.93	18/02/95
 *
 * Authors:	Orest Zborowski, <obz@Kodak.COM>
 *		Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 * Fixes:
 *		Anonymous	:	NOTSOCK/BADF cleanup. Error fix in
 *					shutdown()
 *		Alan Cox	:	verify_area() fixes
 *		Alan Cox	:	Removed DDI
 *		Jonathan Kamens	:	SOCK_DGRAM reconnect bug
 *		Alan Cox	:	Moved a load of checks to the very
 *					top level.
 *		Alan Cox	:	Move address structures to/from user
 *					mode above the protocol layers.
 *		Rob Janssen	:	Allow 0 length sends.
 *		Alan Cox	:	Asynchronous I/O support (cribbed from the
 *					tty drivers).
 *		Niibe Yutaka	:	Asynchronous I/O for writes (4.4BSD style)
 *		Jeff Uphoff	:	Made max number of sockets command-line
 *					configurable.
 *		Matti Aarnio	:	Made the number of sockets dynamic,
 *					to be allocated when needed, and mr.
 *					Uphoff's max is used as max to be
 *					allowed to allocate.
 *		Linus		:	Argh. removed all the socket allocation
 *					altogether: it's in the inode now.
 *		Alan Cox	:	Made sock_alloc()/sock_release() public
 *					for NetROM and future kernel nfsd type
 *					stuff.
 *		Alan Cox	:	sendmsg/recvmsg basics.
 *		Tom Dyas	:	Export net symbols.
 *		Marcin Dalecki	:	Fixed problems with CONFIG_NET="n".
 *		Alan Cox	:	Added thread locking to sys_* calls
 *					for sockets. May have errors at the
 *					moment.
 *		Kevin Buhr	:	Fixed the dumb errors in the above.
 *		Andi Kleen	:	Some small cleanups, optimizations,
 *					and fixed a copy_from_user() bug.
 *		Tigran Aivazian	:	sys_send(args) calls sys_sendto(args, NULL, 0)
 *		Tigran Aivazian	:	Made listen(2) backlog sanity checks
 *					protocol-independent
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *
 *	This module is effectively the top level interface to the BSD socket
 *	paradigm.
 *
 *	Based upon Swansea University Computer Society NET3.039
 */

#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/net.h>
#include <linux/interrupt.h>
Ulrich Drepper's avatar
Ulrich Drepper committed
66
#include <linux/thread_info.h>
67
#include <linux/rcupdate.h>
Linus Torvalds's avatar
Linus Torvalds committed
68 69 70
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
Arjan van de Ven's avatar
Arjan van de Ven committed
71
#include <linux/mutex.h>
Linus Torvalds's avatar
Linus Torvalds committed
72
#include <linux/if_bridge.h>
73 74
#include <linux/if_frad.h>
#include <linux/if_vlan.h>
75
#include <linux/ptp_classify.h>
Linus Torvalds's avatar
Linus Torvalds committed
76 77 78 79 80 81 82 83 84 85
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/cache.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/kmod.h>
86
#include <linux/audit.h>
87
#include <linux/wireless.h>
88
#include <linux/nsproxy.h>
Nick Black's avatar
Nick Black committed
89
#include <linux/magic.h>
90
#include <linux/slab.h>
91
#include <linux/xattr.h>
92
#include <linux/nospec.h>
Linus Torvalds's avatar
Linus Torvalds committed
93

94
#include <linux/uaccess.h>
Linus Torvalds's avatar
Linus Torvalds committed
95 96 97
#include <asm/unistd.h>

#include <net/compat.h>
98
#include <net/wext.h>
99
#include <net/cls_cgroup.h>
Linus Torvalds's avatar
Linus Torvalds committed
100 101 102 103

#include <net/sock.h>
#include <linux/netfilter.h>

104 105 106 107
#include <linux/if_tun.h>
#include <linux/ipv6_route.h>
#include <linux/route.h>
#include <linux/sockios.h>
108
#include <net/busy_poll.h>
109
#include <linux/errqueue.h>
110

111
#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Busy-poll tunables; only built when CONFIG_NET_RX_BUSY_POLL is enabled.
 * NOTE(review): presumably exposed through sysctl and consumed by the
 * busy-poll helpers in <net/busy_poll.h> - confirm against that header.
 */
unsigned int sysctl_net_busy_read __read_mostly;
unsigned int sysctl_net_busy_poll __read_mostly;
#endif
115

116 117
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to);
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
118
static int sock_mmap(struct file *file, struct vm_area_struct *vma);
Linus Torvalds's avatar
Linus Torvalds committed
119 120

static int sock_close(struct inode *inode, struct file *file);
121 122
static __poll_t sock_poll(struct file *file,
			      struct poll_table_struct *wait);
123
static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
124 125
#ifdef CONFIG_COMPAT
static long compat_sock_ioctl(struct file *file,
126
			      unsigned int cmd, unsigned long arg);
127
#endif
Linus Torvalds's avatar
Linus Torvalds committed
128 129 130
static int sock_fasync(int fd, struct file *filp, int on);
static ssize_t sock_sendpage(struct file *file, struct page *page,
			     int offset, size_t size, loff_t *ppos, int more);
Jens Axboe's avatar
Jens Axboe committed
131
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
132
				struct pipe_inode_info *pipe, size_t len,
Jens Axboe's avatar
Jens Axboe committed
133
				unsigned int flags);
Linus Torvalds's avatar
Linus Torvalds committed
134 135 136 137 138 139

/*
 *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *	in the operation structures but are done directly via the socketcall() multiplexor.
 */

140
static const struct file_operations socket_file_ops = {
Linus Torvalds's avatar
Linus Torvalds committed
141 142
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
143 144
	.read_iter =	sock_read_iter,
	.write_iter =	sock_write_iter,
Linus Torvalds's avatar
Linus Torvalds committed
145 146
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
147 148 149
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
Linus Torvalds's avatar
Linus Torvalds committed
150 151 152
	.mmap =		sock_mmap,
	.release =	sock_close,
	.fasync =	sock_fasync,
153 154
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
Jens Axboe's avatar
Jens Axboe committed
155
	.splice_read =	sock_splice_read,
Linus Torvalds's avatar
Linus Torvalds committed
156 157 158 159 160 161 162
};

/*
 *	The protocol list. Each protocol is registered in here.
 */

static DEFINE_SPINLOCK(net_family_lock);
163
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
Linus Torvalds's avatar
Linus Torvalds committed
164 165

/*
 * Support routines.
 * Move socket addresses back and forth across the kernel/user
 * divide and look after the messy bits.
 */

/**
 *	move_addr_to_kernel	-	copy a socket address into kernel space
 *	@uaddr: Address in user space
 *	@kaddr: Address in kernel space
 *	@ulen: Length in user space
 *
 *	The address is copied into kernel space. If the provided address is
 *	too long an error code of -EINVAL is returned. If the copy gives
 *	invalid addresses -EFAULT is returned. On a success 0 is returned.
 */

182
int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
Linus Torvalds's avatar
Linus Torvalds committed
183
{
184
	if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
Linus Torvalds's avatar
Linus Torvalds committed
185
		return -EINVAL;
186
	if (ulen == 0)
Linus Torvalds's avatar
Linus Torvalds committed
187
		return 0;
188
	if (copy_from_user(kaddr, uaddr, ulen))
Linus Torvalds's avatar
Linus Torvalds committed
189
		return -EFAULT;
190
	return audit_sockaddr(ulen, kaddr);
Linus Torvalds's avatar
Linus Torvalds committed
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
}

/**
 *	move_addr_to_user	-	copy an address to user space
 *	@kaddr: kernel space address
 *	@klen: length of address in kernel
 *	@uaddr: user space address
 *	@ulen: pointer to user length field
 *
 *	The value pointed to by ulen on entry is the buffer length available.
 *	This is overwritten with the buffer space used. -EINVAL is returned
 *	if an overlong buffer is specified or a negative buffer size. -EFAULT
 *	is returned if either the buffer or the length field are not
 *	accessible.
 *	After copying the data up to the limit the user specifies, the true
 *	length of the data is written over the length limit the user
 *	specified. Zero is returned for a success.
 */
209

210
static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
stephen hemminger's avatar
stephen hemminger committed
211
			     void __user *uaddr, int __user *ulen)
Linus Torvalds's avatar
Linus Torvalds committed
212 213 214 215
{
	int err;
	int len;

216
	BUG_ON(klen > sizeof(struct sockaddr_storage));
217 218
	err = get_user(len, ulen);
	if (err)
Linus Torvalds's avatar
Linus Torvalds committed
219
		return err;
220 221
	if (len > klen)
		len = klen;
222
	if (len < 0)
Linus Torvalds's avatar
Linus Torvalds committed
223
		return -EINVAL;
224
	if (len) {
Steve Grubb's avatar
Steve Grubb committed
225 226
		if (audit_sockaddr(klen, kaddr))
			return -ENOMEM;
227
		if (copy_to_user(uaddr, kaddr, len))
Linus Torvalds's avatar
Linus Torvalds committed
228 229 230
			return -EFAULT;
	}
	/*
231 232
	 *      "fromlen shall refer to the value before truncation.."
	 *                      1003.1g
Linus Torvalds's avatar
Linus Torvalds committed
233 234 235 236
	 */
	return __put_user(klen, ulen);
}

237
/* Slab cache of struct socket_alloc objects; created by init_inodecache(). */
static struct kmem_cache *sock_inode_cachep __ro_after_init;
Linus Torvalds's avatar
Linus Torvalds committed
238 239 240 241

static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;
242
	struct socket_wq *wq;
243

244
	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
Linus Torvalds's avatar
Linus Torvalds committed
245 246
	if (!ei)
		return NULL;
247 248
	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
	if (!wq) {
249 250 251
		kmem_cache_free(sock_inode_cachep, ei);
		return NULL;
	}
252 253
	init_waitqueue_head(&wq->wait);
	wq->fasync_list = NULL;
254
	wq->flags = 0;
255
	RCU_INIT_POINTER(ei->socket.wq, wq);
256

Linus Torvalds's avatar
Linus Torvalds committed
257 258 259 260 261 262 263 264 265 266 267
	ei->socket.state = SS_UNCONNECTED;
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;

	return &ei->vfs_inode;
}

static void sock_destroy_inode(struct inode *inode)
{
268
	struct socket_alloc *ei;
269
	struct socket_wq *wq;
270 271

	ei = container_of(inode, struct socket_alloc, vfs_inode);
272
	wq = rcu_dereference_protected(ei->socket.wq, 1);
273
	kfree_rcu(wq, rcu);
274
	kmem_cache_free(sock_inode_cachep, ei);
Linus Torvalds's avatar
Linus Torvalds committed
275 276
}

277
static void init_once(void *foo)
Linus Torvalds's avatar
Linus Torvalds committed
278
{
279
	struct socket_alloc *ei = (struct socket_alloc *)foo;
Linus Torvalds's avatar
Linus Torvalds committed
280

281
	inode_init_once(&ei->vfs_inode);
Linus Torvalds's avatar
Linus Torvalds committed
282
}
283

284
static void init_inodecache(void)
Linus Torvalds's avatar
Linus Torvalds committed
285 286
{
	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
287 288 289 290
					      sizeof(struct socket_alloc),
					      0,
					      (SLAB_HWCACHE_ALIGN |
					       SLAB_RECLAIM_ACCOUNT |
291
					       SLAB_MEM_SPREAD | SLAB_ACCOUNT),
292
					      init_once);
293
	BUG_ON(sock_inode_cachep == NULL);
Linus Torvalds's avatar
Linus Torvalds committed
294 295
}

296
static const struct super_operations sockfs_ops = {
297 298 299
	.alloc_inode	= sock_alloc_inode,
	.destroy_inode	= sock_destroy_inode,
	.statfs		= simple_statfs,
Linus Torvalds's avatar
Linus Torvalds committed
300 301
};

302 303 304 305 306 307
/*
 * sockfs_dname() is called from d_path().
 */
static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
308
				d_inode(dentry)->i_ino);
309 310
}

Al Viro's avatar
Al Viro committed
311
static const struct dentry_operations sockfs_dentry_operations = {
312
	.d_dname  = sockfs_dname,
Linus Torvalds's avatar
Linus Torvalds committed
313 314
};

315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335
/*
 * xattr ->get handler: report the dentry name (including its terminating
 * NUL) as the attribute value.
 *
 * With a NULL @value only the required size is returned. Otherwise the
 * name is copied into @value, or -ERANGE is returned when @size is too
 * small to hold it.
 */
static int sockfs_xattr_get(const struct xattr_handler *handler,
			    struct dentry *dentry, struct inode *inode,
			    const char *suffix, void *value, size_t size)
{
	/* Size query only: nothing to copy. */
	if (!value)
		return dentry->d_name.len + 1;

	if (dentry->d_name.len + 1 > size)
		return -ERANGE;

	memcpy(value, dentry->d_name.name, dentry->d_name.len + 1);
	return dentry->d_name.len + 1;
}

/*
 * Name of the read-only xattr served by sockfs_xattr_get(): the system
 * xattr prefix followed by "sockprotoname". The _LEN variant excludes the
 * terminating NUL.
 */
#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)

/* Handler matching exactly XATTR_NAME_SOCKPROTONAME; only ->get is provided. */
static const struct xattr_handler sockfs_xattr_handler = {
	.name = XATTR_NAME_SOCKPROTONAME,
	.get = sockfs_xattr_get,
};

336 337 338 339 340 341 342 343 344 345 346 347 348 349
/*
 * xattr ->set handler for the security prefix. It stores nothing itself and
 * unconditionally returns -EAGAIN; per the comment below, the attribute is
 * handled by the LSM.
 */
static int sockfs_security_xattr_set(const struct xattr_handler *handler,
				     struct dentry *dentry, struct inode *inode,
				     const char *suffix, const void *value,
				     size_t size, int flags)
{
	/* Handled by LSM. */
	return -EAGAIN;
}

/* Handler for every xattr under XATTR_SECURITY_PREFIX; only ->set is provided. */
static const struct xattr_handler sockfs_security_xattr_handler = {
	.prefix = XATTR_SECURITY_PREFIX,
	.set = sockfs_security_xattr_set,
};

350 351
static const struct xattr_handler *sockfs_xattr_handlers[] = {
	&sockfs_xattr_handler,
352
	&sockfs_security_xattr_handler,
353 354 355
	NULL
};

356 357 358
static struct dentry *sockfs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
359 360 361
	return mount_pseudo_xattr(fs_type, "socket:", &sockfs_ops,
				  sockfs_xattr_handlers,
				  &sockfs_dentry_operations, SOCKFS_MAGIC);
362 363 364 365 366 367 368 369 370 371
}

/*
 * Mount of the sockfs pseudo filesystem. Its superblock backs the inodes
 * and dentries created by sock_alloc() and sock_alloc_file().
 */
static struct vfsmount *sock_mnt __read_mostly;

/* Filesystem type descriptor for "sockfs". */
static struct file_system_type sock_fs_type = {
	.name =		"sockfs",
	.mount =	sockfs_mount,
	.kill_sb =	kill_anon_super,
};

Linus Torvalds's avatar
Linus Torvalds committed
372 373 374
/*
 *	Obtains the first available file descriptor and sets it up for use.
 *
375 376
 *	These functions create file structures and maps them to fd space
 *	of the current process. On success it returns file descriptor
Linus Torvalds's avatar
Linus Torvalds committed
377 378 379 380 381 382 383 384 385 386 387 388
 *	and file struct implicitly stored in sock->file.
 *	Note that another thread may close file descriptor before we return
 *	from this function. We use the fact that now we do not refer
 *	to socket after mapping. If one day we will need it, this
 *	function will increment ref. count on file by 1.
 *
 *	In any case returned fd MAY BE not valid!
 *	This race condition is unavoidable
 *	with shared fd spaces, we cannot solve it inside kernel,
 *	but we take care of internal coherence yet.
 */

389
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
Linus Torvalds's avatar
Linus Torvalds committed
390
{
391
	struct qstr name = { .name = "" };
392
	struct path path;
393
	struct file *file;
Linus Torvalds's avatar
Linus Torvalds committed
394

395 396 397 398 399 400 401
	if (dname) {
		name.name = dname;
		name.len = strlen(name.name);
	} else if (sock->sk) {
		name.name = sock->sk->sk_prot_creator->name;
		name.len = strlen(name.name);
	}
402
	path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
403 404
	if (unlikely(!path.dentry)) {
		sock_release(sock);
405
		return ERR_PTR(-ENOMEM);
406
	}
407
	path.mnt = mntget(sock_mnt);
408

409
	d_instantiate(path.dentry, SOCK_INODE(sock));
410

411
	file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
412
		  &socket_file_ops);
413
	if (IS_ERR(file)) {
414
		/* drop dentry, keep inode for a bit */
415
		ihold(d_inode(path.dentry));
416
		path_put(&path);
417 418
		/* ... and now kill it properly */
		sock_release(sock);
419
		return file;
420 421 422
	}

	sock->file = file;
423
	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
424
	file->private_data = sock;
425
	return file;
426
}
427
EXPORT_SYMBOL(sock_alloc_file);
428

429
static int sock_map_fd(struct socket *sock, int flags)
430 431
{
	struct file *newfile;
432
	int fd = get_unused_fd_flags(flags);
433 434
	if (unlikely(fd < 0)) {
		sock_release(sock);
435
		return fd;
436
	}
437

438
	newfile = sock_alloc_file(sock, flags, NULL);
439
	if (likely(!IS_ERR(newfile))) {
440
		fd_install(fd, newfile);
441 442
		return fd;
	}
443

444 445
	put_unused_fd(fd);
	return PTR_ERR(newfile);
Linus Torvalds's avatar
Linus Torvalds committed
446 447
}

448
struct socket *sock_from_file(struct file *file, int *err)
449 450 451 452
{
	if (file->f_op == &socket_file_ops)
		return file->private_data;	/* set in sock_map_fd */

Eric Dumazet's avatar
Eric Dumazet committed
453 454
	*err = -ENOTSOCK;
	return NULL;
455
}
456
EXPORT_SYMBOL(sock_from_file);
457

Linus Torvalds's avatar
Linus Torvalds committed
458
/**
459
 *	sockfd_lookup - Go from a file number to its socket slot
Linus Torvalds's avatar
Linus Torvalds committed
460 461 462 463
 *	@fd: file handle
 *	@err: pointer to an error code return
 *
 *	The file handle passed in is locked and the socket it is bound
464
 *	to is returned. If an error occurs the err pointer is overwritten
Linus Torvalds's avatar
Linus Torvalds committed
465 466 467 468 469 470 471 472 473 474 475
 *	with a negative errno code and NULL is returned. The function checks
 *	for both invalid handles and passing a handle which is not a socket.
 *
 *	On a success the socket object pointer is returned.
 */

struct socket *sockfd_lookup(int fd, int *err)
{
	struct file *file;
	struct socket *sock;

476 477
	file = fget(fd);
	if (!file) {
Linus Torvalds's avatar
Linus Torvalds committed
478 479 480
		*err = -EBADF;
		return NULL;
	}
481

482 483
	sock = sock_from_file(file, err);
	if (!sock)
Linus Torvalds's avatar
Linus Torvalds committed
484
		fput(file);
485 486
	return sock;
}
487
EXPORT_SYMBOL(sockfd_lookup);
Linus Torvalds's avatar
Linus Torvalds committed
488

489 490
static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
491
	struct fd f = fdget(fd);
492 493
	struct socket *sock;

494
	*err = -EBADF;
495 496 497 498
	if (f.file) {
		sock = sock_from_file(f.file, err);
		if (likely(sock)) {
			*fput_needed = f.flags;
499
			return sock;
500 501
		}
		fdput(f);
Linus Torvalds's avatar
Linus Torvalds committed
502
	}
503
	return NULL;
Linus Torvalds's avatar
Linus Torvalds committed
504 505
}

506 507 508 509 510 511
/*
 * ->listxattr for sockfs inodes: the names reported by
 * security_inode_listsecurity() followed by XATTR_NAME_SOCKPROTONAME.
 *
 * Returns the total number of bytes used (or needed, when @buffer is
 * NULL), a negative error from the security hook, or -ERANGE when @buffer
 * is too small.
 */
static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
				size_t size)
{
	ssize_t len;
	ssize_t used = 0;

	len = security_inode_listsecurity(d_inode(dentry), buffer, size);
	if (len < 0)
		return len;
	used += len;
	if (buffer) {
		if (size < used)
			return -ERANGE;
		buffer += len;
	}

	/* Append our own attribute name, including its terminating NUL. */
	len = (XATTR_NAME_SOCKPROTONAME_LEN + 1);
	used += len;
	if (buffer) {
		if (size < used)
			return -ERANGE;
		memcpy(buffer, XATTR_NAME_SOCKPROTONAME, len);
		buffer += len;
	}

	return used;
}

534
static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
535 536 537
{
	int err = simple_setattr(dentry, iattr);

538
	if (!err && (iattr->ia_valid & ATTR_UID)) {
539 540
		struct socket *sock = SOCKET_I(d_inode(dentry));

541 542 543 544
		if (sock->sk)
			sock->sk->sk_uid = iattr->ia_uid;
		else
			err = -ENOENT;
545 546 547 548 549
	}

	return err;
}

550 551
static const struct inode_operations sockfs_inode_ops = {
	.listxattr = sockfs_listxattr,
552
	.setattr = sockfs_setattr,
553 554
};

Linus Torvalds's avatar
Linus Torvalds committed
555 556
/**
 *	sock_alloc	-	allocate a socket
557
 *
Linus Torvalds's avatar
Linus Torvalds committed
558 559 560 561 562
 *	Allocate a new inode and socket object. The two are bound together
 *	and initialised. The socket is then returned. If we are out of inodes
 *	NULL is returned.
 */

Tom Herbert's avatar
Tom Herbert committed
563
struct socket *sock_alloc(void)
Linus Torvalds's avatar
Linus Torvalds committed
564
{
565 566
	struct inode *inode;
	struct socket *sock;
Linus Torvalds's avatar
Linus Torvalds committed
567

568
	inode = new_inode_pseudo(sock_mnt->mnt_sb);
Linus Torvalds's avatar
Linus Torvalds committed
569 570 571 572 573
	if (!inode)
		return NULL;

	sock = SOCKET_I(inode);

574
	inode->i_ino = get_next_ino();
575
	inode->i_mode = S_IFSOCK | S_IRWXUGO;
576 577
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
578
	inode->i_op = &sockfs_inode_ops;
Linus Torvalds's avatar
Linus Torvalds committed
579 580 581

	return sock;
}
Tom Herbert's avatar
Tom Herbert committed
582
EXPORT_SYMBOL(sock_alloc);
Linus Torvalds's avatar
Linus Torvalds committed
583 584 585 586 587 588 589

/**
 *	sock_release	-	close a socket
 *	@sock: socket to close
 *
 *	The socket is released from the protocol stack if it has a release
 *	callback, and the inode is then released if the socket is bound to
590
 *	an inode not a file.
Linus Torvalds's avatar
Linus Torvalds committed
591
 */
592

593
static void __sock_release(struct socket *sock, struct inode *inode)
Linus Torvalds's avatar
Linus Torvalds committed
594 595 596 597
{
	if (sock->ops) {
		struct module *owner = sock->ops->owner;

598 599
		if (inode)
			inode_lock(inode);
Linus Torvalds's avatar
Linus Torvalds committed
600
		sock->ops->release(sock);
601 602
		if (inode)
			inode_unlock(inode);
Linus Torvalds's avatar
Linus Torvalds committed
603 604 605 606
		sock->ops = NULL;
		module_put(owner);
	}

607
	if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
608
		pr_err("%s: fasync list not empty!\n", __func__);
Linus Torvalds's avatar
Linus Torvalds committed
609 610 611 612 613

	if (!sock->file) {
		iput(SOCK_INODE(sock));
		return;
	}
614
	sock->file = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
615
}
616 617 618 619 620

void sock_release(struct socket *sock)
{
	__sock_release(sock, NULL);
}
621
EXPORT_SYMBOL(sock_release);
Linus Torvalds's avatar
Linus Torvalds committed
622

623
/*
 * Translate socket TX timestamping options into skb tx flags.
 * @tsflags: SOF_TIMESTAMPING_* option bits
 * @tx_flags: in/out SKBTX_* flags; bits are only ever added
 *
 * TX_HARDWARE, TX_SOFTWARE and TX_SCHED map to SKBTX_HW_TSTAMP,
 * SKBTX_SW_TSTAMP and SKBTX_SCHED_TSTAMP respectively.
 */
void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags)
{
	u8 flags = *tx_flags;

	if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE)
		flags |= SKBTX_HW_TSTAMP;

	if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE)
		flags |= SKBTX_SW_TSTAMP;

	if (tsflags & SOF_TIMESTAMPING_TX_SCHED)
		flags |= SKBTX_SCHED_TSTAMP;

	*tx_flags = flags;
}
EXPORT_SYMBOL(__sock_tx_timestamp);
639

640
static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
Linus Torvalds's avatar
Linus Torvalds committed
641
{
Al Viro's avatar
Al Viro committed
642
	int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg));
643 644
	BUG_ON(ret == -EIOCBQUEUED);
	return ret;
Linus Torvalds's avatar
Linus Torvalds committed
645 646
}

647
/*
 * Send @msg on @sock after consulting security_socket_sendmsg().
 * Returns the hook's error if it is non-zero, otherwise the result of the
 * protocol's ->sendmsg().
 */
int sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
	int err = security_socket_sendmsg(sock, msg,
					  msg_data_left(msg));

	return err ?: sock_sendmsg_nosec(sock, msg);
}
EXPORT_SYMBOL(sock_sendmsg);
Linus Torvalds's avatar
Linus Torvalds committed
655 656 657 658

/*
 * Send a message built from a kernel kvec array.
 * @vec/@num describe the array and @size the total byte count; they are
 * loaded into msg->msg_iter before the message goes to sock_sendmsg(),
 * whose result is returned.
 */
int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
		   struct kvec *vec, size_t num, size_t size)
{
	iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size);
	return sock_sendmsg(sock, msg);
}
EXPORT_SYMBOL(kernel_sendmsg);
Linus Torvalds's avatar
Linus Torvalds committed
663

664 665 666 667 668 669
/*
 * kvec send that goes through the protocol's ->sendmsg_locked() hook.
 * NOTE(review): the name suggests the caller already holds the socket
 * lock - confirm against the ->sendmsg_locked() implementations.
 *
 * If the protocol provides no such hook, the result of
 * sock_no_sendmsg_locked() is returned. No security hook is called on
 * this path.
 */
int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg,
			  struct kvec *vec, size_t num, size_t size)
{
	struct socket *sock = sk->sk_socket;

	if (!sock->ops->sendmsg_locked)
		return sock_no_sendmsg_locked(sk, msg, size);

	iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size);

	return sock->ops->sendmsg_locked(sk, msg, msg_data_left(msg));
}
EXPORT_SYMBOL(kernel_sendmsg_locked);

678 679 680 681 682 683 684 685 686 687
/* Return true when @skb is marked as an error-queue skb (PACKET_OUTGOING). */
static bool skb_is_err_queue(const struct sk_buff *skb)
{
	/* pkt_type of skbs enqueued on the error queue are set to
	 * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do
	 * in recvmsg, since skbs received on a local socket will never
	 * have a pkt_type of PACKET_OUTGOING.
	 */
	return skb->pkt_type == PACKET_OUTGOING;
}

688 689 690 691 692 693 694 695 696 697 698 699 700
/* On transmit, software and hardware timestamps are returned independently.
 * As the two skb clones share the hardware timestamp, which may be updated
 * before the software timestamp is received, a hardware TX timestamp may be
 * returned only if there is no software TX timestamp. Ignore false software
 * timestamps, which may be made in the __sock_recv_timestamp() call when the
 * option SO_TIMESTAMP(NS) is enabled on the socket, even when the skb has a
 * hardware timestamp.
 */
static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp)
{
	return skb->tstamp && !false_tstamp && skb_is_err_queue(skb);
}

701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721
static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
	struct scm_ts_pktinfo ts_pktinfo;
	struct net_device *orig_dev;

	if (!skb_mac_header_was_set(skb))
		return;

	memset(&ts_pktinfo, 0, sizeof(ts_pktinfo));

	rcu_read_lock();
	orig_dev = dev_get_by_napi_id(skb_napi_id(skb));
	if (orig_dev)
		ts_pktinfo.if_index = orig_dev->ifindex;
	rcu_read_unlock();

	ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb);
	put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO,
		 sizeof(ts_pktinfo), &ts_pktinfo);
}

722 723 724 725 726 727
/*
 * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
 */
void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
	struct sk_buff *skb)
{
728
	int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
729
	struct scm_timestamping tss;
730
	int empty = 1, false_tstamp = 0;
731 732 733 734 735
	struct skb_shared_hwtstamps *shhwtstamps =
		skb_hwtstamps(skb);

	/* Race occurred between timestamp enabling and packet
	   receiving.  Fill in the current time for now. */
736
	if (need_software_tstamp && skb->tstamp == 0) {
737
		__net_timestamp(skb);
738 739
		false_tstamp = 1;
	}
740 741 742 743 744 745 746 747

	if (need_software_tstamp) {
		if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
			struct timeval tv;
			skb_get_timestamp(skb, &tv);
			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
				 sizeof(tv), &tv);
		} else {
748 749
			struct timespec ts;
			skb_get_timestampns(skb, &ts);
750
			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
751
				 sizeof(ts), &ts);
752 753 754
		}
	}

755
	memset(&tss, 0, sizeof(tss));
756
	if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
757
	    ktime_to_timespec_cond(skb->tstamp, tss.ts + 0))
758
		empty = 0;
759
	if (shhwtstamps &&
760
	    (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
761
	    !skb_is_swtx_tstamp(skb, false_tstamp) &&
762
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) {
763
		empty = 0;
764 765 766 767
		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
		    !skb_is_err_queue(skb))
			put_ts_pktinfo(msg, skb);
	}
768
	if (!empty) {
769
		put_cmsg(msg, SOL_SOCKET,
770
			 SCM_TIMESTAMPING, sizeof(tss), &tss);
771

772
		if (skb_is_err_queue(skb) && skb->len &&
773
		    SKB_EXT_ERR(skb)->opt_stats)
774 775 776
			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS,
				 skb->len, skb->data);
	}
777
}
778 779
EXPORT_SYMBOL_GPL(__sock_recv_timestamp);

780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795
/*
 * Report the wifi ack status of @skb as an SCM_WIFI_STATUS control
 * message. Nothing is emitted unless the socket has SOCK_WIFI_STATUS set
 * and the skb carries a valid ack indication.
 */
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
	struct sk_buff *skb)
{
	int acked;

	if (!sock_flag(sk, SOCK_WIFI_STATUS) || !skb->wifi_acked_valid)
		return;

	/* Widen the skb field to an int for the control message payload. */
	acked = skb->wifi_acked;
	put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(acked), &acked);
}
EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);

stephen hemminger's avatar
stephen hemminger committed
796 797
static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
				   struct sk_buff *skb)
798
{
799
	if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && SOCK_SKB_CB(skb)->dropcount)
800
		put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
801
			sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount);
802 803
}

804
/* Emit the timestamp control messages, then the drop-count one, for @skb. */
void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
	struct sk_buff *skb)
{
	sock_recv_timestamp(msg, sk, skb);
	sock_recv_drops(msg, sk, skb);
}
EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
811

812
static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
813
				     int flags)
Linus Torvalds's avatar
Linus Torvalds committed
814
{
815
	return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags);
Linus Torvalds's avatar
Linus Torvalds committed
816 817
}

818
/*
 * Receive into @msg from @sock after consulting security_socket_recvmsg().
 * Returns the hook's error if it is non-zero, otherwise the result of the
 * protocol's ->recvmsg().
 */
int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
	int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);

	return err ?: sock_recvmsg_nosec(sock, msg, flags);
}
EXPORT_SYMBOL(sock_recvmsg);
Linus Torvalds's avatar
Linus Torvalds committed
825

826 827 828 829 830 831 832 833 834 835 836 837 838 839 840
/**
 * kernel_recvmsg - Receive a message from a socket (kernel space)
 * @sock:       The socket to receive the message from
 * @msg:        Received message
 * @vec:        Input s/g array for message data
 * @num:        Size of input s/g array
 * @size:       Number of bytes to read
 * @flags:      Message flags (MSG_DONTWAIT, etc...)
 *
 * On return the msg structure contains the scatter/gather array passed in the
 * vec argument. The array is modified so that it consists of the unfilled
 * portion of the original array.
 *
 * The returned value is the total number of bytes received, or an error.
 */
841 842
int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
		   struct kvec *vec, size_t num, size_t size, int flags)
Linus Torvalds's avatar
Linus Torvalds committed
843 844 845 846
{
	mm_segment_t oldfs = get_fs();
	int result;

847
	iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size);
Linus Torvalds's avatar
Linus Torvalds committed
848
	set_fs(KERNEL_DS);
849
	result = sock_recvmsg(sock, msg, flags);
Linus Torvalds's avatar
Linus Torvalds committed
850 851 852
	set_fs(oldfs);
	return result;
}
853
EXPORT_SYMBOL(kernel_recvmsg);
Linus Torvalds's avatar
Linus Torvalds committed
854

855 856
/*
 * ->sendpage file operation: forward a page to kernel_sendpage().
 * MSG_DONTWAIT is set when the file is O_NONBLOCK; @more is OR-ed into
 * the flags unchanged. @ppos is not used.
 */
static ssize_t sock_sendpage(struct file *file, struct page *page,
			     int offset, size_t size, loff_t *ppos, int more)
{
	struct socket *sock;
	int flags;

	sock = file->private_data;

	flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
	/* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
	flags |= more;

	return kernel_sendpage(sock, page, offset, size, flags);
}
Linus Torvalds's avatar
Linus Torvalds committed
869

Jens Axboe's avatar
Jens Axboe committed
870
/*
 * ->splice_read file operation: delegate to the protocol's
 * ->splice_read(), or return -EINVAL when the protocol has none.
 */
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
				struct pipe_inode_info *pipe, size_t len,
				unsigned int flags)
{
	struct socket *sock = file->private_data;

	if (unlikely(!sock->ops->splice_read))
		return -EINVAL;

	return sock->ops->splice_read(sock, ppos, pipe, len, flags);
}

882
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
883
{
884 885
	struct file *file = iocb->ki_filp;
	struct socket *sock = file->private_data;
886 887
	struct msghdr msg = {.msg_iter = *to,
			     .msg_iocb = iocb};
888
	ssize_t res;
889

890 891 892 893
	if (file->f_flags & O_NONBLOCK)
		msg.msg_flags = MSG_DONTWAIT;

	if (iocb->ki_pos != 0)
Linus Torvalds's avatar
Linus Torvalds committed
894
		return -ESPIPE;
895

Christoph Hellwig's avatar
Christoph Hellwig committed
896
	if (!iov_iter_count(to))	/* Match SYS5 behaviour */
Linus Torvalds's avatar
Linus Torvalds committed
897 898
		return 0;

899
	res = sock_recvmsg(sock, &msg, msg.msg_flags);
900 901
	*to = msg.msg_iter;
	return res;
Linus Torvalds's avatar
Linus Torvalds committed
902 903
}

904
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
905
{
906 907
	struct file *file = iocb->ki_filp;
	struct socket *sock = file->private_data;
908 909
	struct msghdr msg = {.msg_iter = *from,
			     .msg_iocb = iocb};
910
	ssize_t res;
Linus Torvalds's avatar
Linus Torvalds committed
911

912
	if (iocb->ki_pos != 0)
913
		return -ESPIPE;
914

915 916 917
	if (file->f_flags & O_NONBLOCK)
		msg.msg_flags = MSG_DONTWAIT;

918 919 920
	if (sock->type == SOCK_SEQPACKET)
		msg.msg_flags |= MSG_EOR;

921
	res = sock_sendmsg(sock, &msg);
922 923
	*from = msg.msg_iter;
	return res;
Linus Torvalds's avatar
Linus Torvalds committed
924 925 926 927 928 929 930
}

/*
 * Atomic setting of ioctl hooks to avoid race
 * with module unload.
 */