linux_dsm_epyc7002/net/core/scm.c

356 lines
7.9 KiB
C
Raw Normal View History

/* scm.c - Socket level control messages processing.
*
* Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Alignment and value checking mods by Craig Metz
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/signal.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/net.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/security.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 15:04:11 +07:00
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/compat.h>
#include <net/scm.h>
#include <net/cls_cgroup.h>
/*
* Only allow a user to send credentials, that they could set with
* setu(g)id.
*/
static __inline__ int scm_check_creds(struct ucred *creds)
{
const struct cred *cred = current_cred();
kuid_t uid = make_kuid(cred->user_ns, creds->uid);
kgid_t gid = make_kgid(cred->user_ns, creds->gid);
if (!uid_valid(uid) || !gid_valid(gid))
return -EINVAL;
if ((creds->pid == task_tgid_vnr(current) || nsown_capable(CAP_SYS_ADMIN)) &&
((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) ||
uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) &&
((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) ||
gid_eq(gid, cred->sgid)) || nsown_capable(CAP_SETGID))) {
return 0;
}
return -EPERM;
}
static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
{
int *fdp = (int*)CMSG_DATA(cmsg);
struct scm_fp_list *fpl = *fplp;
struct file **fpp;
int i, num;
num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int);
if (num <= 0)
return 0;
if (num > SCM_MAX_FD)
return -EINVAL;
if (!fpl)
{
fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
if (!fpl)
return -ENOMEM;
*fplp = fpl;
fpl->count = 0;
fpl->max = SCM_MAX_FD;
}
fpp = &fpl->fp[fpl->count];
if (fpl->count + num > fpl->max)
return -EINVAL;
/*
* Verify the descriptors and increment the usage count.
*/
for (i=0; i< num; i++)
{
int fd = fdp[i];
struct file *file;
if (fd < 0 || !(file = fget_raw(fd)))
return -EBADF;
*fpp++ = file;
fpl->count++;
}
return num;
}
void __scm_destroy(struct scm_cookie *scm)
{
struct scm_fp_list *fpl = scm->fp;
int i;
if (fpl) {
scm->fp = NULL;
for (i=fpl->count-1; i>=0; i--)
fput(fpl->fp[i]);
kfree(fpl);
}
}
EXPORT_SYMBOL(__scm_destroy);
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
{
struct cmsghdr *cmsg;
int err;
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
{
err = -EINVAL;
/* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
/* The first check was omitted in <= 2.2.5. The reasoning was
that parser checks cmsg_len in any case, so that
additional check would be work duplication.
But if cmsg_level is not SOL_SOCKET, we do not check
for too short ancillary data object at all! Oops.
OK, let's add it...
*/
if (!CMSG_OK(msg, cmsg))
goto error;
if (cmsg->cmsg_level != SOL_SOCKET)
continue;
switch (cmsg->cmsg_type)
{
case SCM_RIGHTS:
if (!sock->ops || sock->ops->family != PF_UNIX)
goto error;
err=scm_fp_copy(cmsg, &p->fp);
if (err<0)
goto error;
break;
case SCM_CREDENTIALS:
{
struct ucred creds;
kuid_t uid;
kgid_t gid;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
goto error;
memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred));
err = scm_check_creds(&creds);
if (err)
goto error;
p->creds.pid = creds.pid;
if (!p->pid || pid_vnr(p->pid) != creds.pid) {
struct pid *pid;
err = -ESRCH;
pid = find_get_pid(creds.pid);
if (!pid)
goto error;
put_pid(p->pid);
p->pid = pid;
}
err = -EINVAL;
uid = make_kuid(current_user_ns(), creds.uid);
gid = make_kgid(current_user_ns(), creds.gid);
if (!uid_valid(uid) || !gid_valid(gid))
goto error;
p->creds.uid = uid;
p->creds.gid = gid;
af_unix: dont send SCM_CREDENTIALS by default Since commit 7361c36c5224 (af_unix: Allow credentials to work across user and pid namespaces) af_unix performance dropped a lot. This is because we now take a reference on pid and cred in each write(), and release them in read(), usually done from another process, eventually from another cpu. This triggers false sharing. # Events: 154K cycles # # Overhead Command Shared Object Symbol # ........ ....... .................. ......................... # 10.40% hackbench [kernel.kallsyms] [k] put_pid 8.60% hackbench [kernel.kallsyms] [k] unix_stream_recvmsg 7.87% hackbench [kernel.kallsyms] [k] unix_stream_sendmsg 6.11% hackbench [kernel.kallsyms] [k] do_raw_spin_lock 4.95% hackbench [kernel.kallsyms] [k] unix_scm_to_skb 4.87% hackbench [kernel.kallsyms] [k] pid_nr_ns 4.34% hackbench [kernel.kallsyms] [k] cred_to_ucred 2.39% hackbench [kernel.kallsyms] [k] unix_destruct_scm 2.24% hackbench [kernel.kallsyms] [k] sub_preempt_count 1.75% hackbench [kernel.kallsyms] [k] fget_light 1.51% hackbench [kernel.kallsyms] [k] __mutex_lock_interruptible_slowpath 1.42% hackbench [kernel.kallsyms] [k] sock_alloc_send_pskb This patch includes SCM_CREDENTIALS information in a af_unix message/skb only if requested by the sender, [man 7 unix for details how to include ancillary data using sendmsg() system call] Note: This might break buggy applications that expected SCM_CREDENTIAL from an unaware write() system call, and receiver not using SO_PASSCRED socket option. If SOCK_PASSCRED is set on source or destination socket, we still include credentials for mere write() syscalls. Performance boost in hackbench : more than 50% gain on a 16 thread machine (2 quad-core cpus, 2 threads per core) hackbench 20 thread 2000 4.228 sec instead of 9.102 sec Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-09-19 12:52:27 +07:00
if (!p->cred ||
!uid_eq(p->cred->euid, uid) ||
!gid_eq(p->cred->egid, gid)) {
struct cred *cred;
err = -ENOMEM;
cred = prepare_creds();
if (!cred)
goto error;
cred->uid = cred->euid = uid;
cred->gid = cred->egid = gid;
af_unix: dont send SCM_CREDENTIALS by default Since commit 7361c36c5224 (af_unix: Allow credentials to work across user and pid namespaces) af_unix performance dropped a lot. This is because we now take a reference on pid and cred in each write(), and release them in read(), usually done from another process, eventually from another cpu. This triggers false sharing. # Events: 154K cycles # # Overhead Command Shared Object Symbol # ........ ....... .................. ......................... # 10.40% hackbench [kernel.kallsyms] [k] put_pid 8.60% hackbench [kernel.kallsyms] [k] unix_stream_recvmsg 7.87% hackbench [kernel.kallsyms] [k] unix_stream_sendmsg 6.11% hackbench [kernel.kallsyms] [k] do_raw_spin_lock 4.95% hackbench [kernel.kallsyms] [k] unix_scm_to_skb 4.87% hackbench [kernel.kallsyms] [k] pid_nr_ns 4.34% hackbench [kernel.kallsyms] [k] cred_to_ucred 2.39% hackbench [kernel.kallsyms] [k] unix_destruct_scm 2.24% hackbench [kernel.kallsyms] [k] sub_preempt_count 1.75% hackbench [kernel.kallsyms] [k] fget_light 1.51% hackbench [kernel.kallsyms] [k] __mutex_lock_interruptible_slowpath 1.42% hackbench [kernel.kallsyms] [k] sock_alloc_send_pskb This patch includes SCM_CREDENTIALS information in a af_unix message/skb only if requested by the sender, [man 7 unix for details how to include ancillary data using sendmsg() system call] Note: This might break buggy applications that expected SCM_CREDENTIAL from an unaware write() system call, and receiver not using SO_PASSCRED socket option. If SOCK_PASSCRED is set on source or destination socket, we still include credentials for mere write() syscalls. Performance boost in hackbench : more than 50% gain on a 16 thread machine (2 quad-core cpus, 2 threads per core) hackbench 20 thread 2000 4.228 sec instead of 9.102 sec Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-09-19 12:52:27 +07:00
if (p->cred)
put_cred(p->cred);
p->cred = cred;
}
break;
}
default:
goto error;
}
}
if (p->fp && !p->fp->count)
{
kfree(p->fp);
p->fp = NULL;
}
return 0;
error:
scm_destroy(p);
return err;
}
EXPORT_SYMBOL(__scm_send);
int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
{
struct cmsghdr __user *cm
= (__force struct cmsghdr __user *)msg->msg_control;
struct cmsghdr cmhdr;
int cmlen = CMSG_LEN(len);
int err;
if (MSG_CMSG_COMPAT & msg->msg_flags)
return put_cmsg_compat(msg, level, type, len, data);
if (cm==NULL || msg->msg_controllen < sizeof(*cm)) {
msg->msg_flags |= MSG_CTRUNC;
return 0; /* XXX: return error? check spec. */
}
if (msg->msg_controllen < cmlen) {
msg->msg_flags |= MSG_CTRUNC;
cmlen = msg->msg_controllen;
}
cmhdr.cmsg_level = level;
cmhdr.cmsg_type = type;
cmhdr.cmsg_len = cmlen;
err = -EFAULT;
if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
goto out;
if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
goto out;
cmlen = CMSG_SPACE(len);
[NET]: Fix function put_cmsg() which may cause usr application memory overflow When used function put_cmsg() to copy kernel information to user application memory, if the memory length given by user application is not enough, by the bad length calculate of msg.msg_controllen, put_cmsg() function may cause the msg.msg_controllen to be a large value, such as 0xFFFFFFF0, so the following put_cmsg() can also write data to usr application memory even usr has no valid memory to store this. This may cause usr application memory overflow. int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { struct cmsghdr __user *cm = (__force struct cmsghdr __user *)msg->msg_control; struct cmsghdr cmhdr; int cmlen = CMSG_LEN(len); ~~~~~~~~~~~~~~~~~~~~~ int err; if (MSG_CMSG_COMPAT & msg->msg_flags) return put_cmsg_compat(msg, level, type, len, data); if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { msg->msg_flags |= MSG_CTRUNC; return 0; /* XXX: return error? check spec. */ } if (msg->msg_controllen < cmlen) { ~~~~~~~~~~~~~~~~~~~~~~~~ msg->msg_flags |= MSG_CTRUNC; cmlen = msg->msg_controllen; } cmhdr.cmsg_level = level; cmhdr.cmsg_type = type; cmhdr.cmsg_len = cmlen; err = -EFAULT; if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) goto out; if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) goto out; cmlen = CMSG_SPACE(len); ~~~~~~~~~~~~~~~~~~~~~~~~~~~ If MSG_CTRUNC flags is set, msg->msg_controllen is less than CMSG_SPACE(len), "msg->msg_controllen -= cmlen" will cause unsinged int type msg->msg_controllen to be a large value. ~~~~~~~~~~~~~~~~~~~~~~~~~~~ msg->msg_control += cmlen; msg->msg_controllen -= cmlen; ~~~~~~~~~~~~~~~~~~~~~ err = 0; out: return err; } The same promble exists in put_cmsg_compat(). This patch can fix this problem. Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2007-12-21 05:36:44 +07:00
if (msg->msg_controllen < cmlen)
cmlen = msg->msg_controllen;
msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
err = 0;
out:
return err;
}
EXPORT_SYMBOL(put_cmsg);
void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
{
struct cmsghdr __user *cm
= (__force struct cmsghdr __user*)msg->msg_control;
int fdmax = 0;
int fdnum = scm->fp->count;
struct file **fp = scm->fp->fp;
int __user *cmfptr;
int err = 0, i;
if (MSG_CMSG_COMPAT & msg->msg_flags) {
scm_detach_fds_compat(msg, scm);
return;
}
if (msg->msg_controllen > sizeof(struct cmsghdr))
fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
/ sizeof(int));
if (fdnum < fdmax)
fdmax = fdnum;
for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
i++, cmfptr++)
{
struct socket *sock;
int new_fd;
err = security_file_receive(fp[i]);
if (err)
break;
O_CLOEXEC for SCM_RIGHTS Part two in the O_CLOEXEC saga: adding support for file descriptors received through Unix domain sockets. The patch is once again pretty minimal, it introduces a new flag for recvmsg and passes it just like the existing MSG_CMSG_COMPAT flag. I think this bit is not used otherwise but the networking people will know better. This new flag is not recognized by recvfrom and recv. These functions cannot be used for that purpose and the asymmetry this introduces is not worse than the already existing MSG_CMSG_COMPAT situations. The patch must be applied on the patch which introduced O_CLOEXEC. It has to remove static from the new get_unused_fd_flags function but since scm.c cannot live in a module the function still hasn't to be exported. Here's a test program to make sure the code works. It's so much longer than the actual patch... #include <errno.h> #include <error.h> #include <fcntl.h> #include <stdio.h> #include <string.h> #include <unistd.h> #include <sys/socket.h> #include <sys/un.h> #ifndef O_CLOEXEC # define O_CLOEXEC 02000000 #endif #ifndef MSG_CMSG_CLOEXEC # define MSG_CMSG_CLOEXEC 0x40000000 #endif int main (int argc, char *argv[]) { if (argc > 1) { int fd = atol (argv[1]); printf ("child: fd = %d\n", fd); if (fcntl (fd, F_GETFD) == 0 || errno != EBADF) { puts ("file descriptor valid in child"); return 1; } return 0; } struct sockaddr_un sun; strcpy (sun.sun_path, "./testsocket"); sun.sun_family = AF_UNIX; char databuf[] = "hello"; struct iovec iov[1]; iov[0].iov_base = databuf; iov[0].iov_len = sizeof (databuf); union { struct cmsghdr hdr; char bytes[CMSG_SPACE (sizeof (int))]; } buf; struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 1, .msg_control = buf.bytes, .msg_controllen = sizeof (buf) }; struct cmsghdr *cmsg = CMSG_FIRSTHDR (&msg); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; cmsg->cmsg_len = CMSG_LEN (sizeof (int)); msg.msg_controllen = cmsg->cmsg_len; pid_t child = fork (); if (child == -1) error (1, errno, "fork"); if (child == 0) { int sock = socket (PF_UNIX, SOCK_STREAM, 0); if (sock < 0) error (1, errno, "socket"); if (bind (sock, (struct sockaddr *) &sun, sizeof (sun)) < 0) error (1, errno, "bind"); if (listen (sock, SOMAXCONN) < 0) error (1, errno, "listen"); int conn = accept (sock, NULL, NULL); if (conn == -1) error (1, errno, "accept"); *(int *) CMSG_DATA (cmsg) = sock; if (sendmsg (conn, &msg, MSG_NOSIGNAL) < 0) error (1, errno, "sendmsg"); return 0; } /* For a test suite this should be more robust like a barrier in shared memory. */ sleep (1); int sock = socket (PF_UNIX, SOCK_STREAM, 0); if (sock < 0) error (1, errno, "socket"); if (connect (sock, (struct sockaddr *) &sun, sizeof (sun)) < 0) error (1, errno, "connect"); unlink (sun.sun_path); *(int *) CMSG_DATA (cmsg) = -1; if (recvmsg (sock, &msg, MSG_CMSG_CLOEXEC) < 0) error (1, errno, "recvmsg"); int fd = *(int *) CMSG_DATA (cmsg); if (fd == -1) error (1, 0, "no descriptor received"); char fdname[20]; snprintf (fdname, sizeof (fdname), "%d", fd); execl ("/proc/self/exe", argv[0], fdname, NULL); puts ("execl failed"); return 1; } [akpm@linux-foundation.org: Fix fastcall inconsistency noted by Michael Buesch] [akpm@linux-foundation.org: build fix] Signed-off-by: Ulrich Drepper <drepper@redhat.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Michael Buesch <mb@bu3sch.de> Cc: Michael Kerrisk <mtk-manpages@gmx.net> Acked-by: David S. Miller <davem@davemloft.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-16 13:40:34 +07:00
err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
? O_CLOEXEC : 0);
if (err < 0)
break;
new_fd = err;
err = put_user(new_fd, cmfptr);
if (err) {
put_unused_fd(new_fd);
break;
}
/* Bump the usage count and install the file. */
sock = sock_from_file(fp[i], &err);
if (sock) {
sock_update_netprioidx(sock->sk, current);
sock_update_classid(sock->sk, current);
}
fd_install(new_fd, get_file(fp[i]));
}
if (i > 0)
{
int cmlen = CMSG_LEN(i*sizeof(int));
err = put_user(SOL_SOCKET, &cm->cmsg_level);
if (!err)
err = put_user(SCM_RIGHTS, &cm->cmsg_type);
if (!err)
err = put_user(cmlen, &cm->cmsg_len);
if (!err) {
cmlen = CMSG_SPACE(i*sizeof(int));
msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
}
}
if (i < fdnum || (fdnum && fdmax <= 0))
msg->msg_flags |= MSG_CTRUNC;
/*
* All of the files that fit in the message have had their
* usage counts incremented, so we just free the list.
*/
__scm_destroy(scm);
}
EXPORT_SYMBOL(scm_detach_fds);
struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
{
struct scm_fp_list *new_fpl;
int i;
if (!fpl)
return NULL;
new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
GFP_KERNEL);
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
new_fpl->max = new_fpl->count;
}
return new_fpl;
}
EXPORT_SYMBOL(scm_fp_dup);