aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--lib/libc/sys/Makefile.inc5
-rw-r--r--lib/libc/sys/Symbol.map3
-rw-r--r--lib/libc/sys/cap_new.24
-rw-r--r--lib/libc/sys/pdfork.2182
-rw-r--r--sys/compat/linux/linux_fork.c8
-rw-r--r--sys/conf/NOTES3
-rw-r--r--sys/conf/files1
-rw-r--r--sys/conf/options1
-rw-r--r--sys/kern/capabilities.conf2
-rw-r--r--sys/kern/init_main.c3
-rw-r--r--sys/kern/kern_descrip.c30
-rw-r--r--sys/kern/kern_exit.c82
-rw-r--r--sys/kern/kern_fork.c96
-rw-r--r--sys/kern/kern_kthread.c2
-rw-r--r--sys/kern/kern_sig.c31
-rw-r--r--sys/kern/sys_procdesc.c524
-rw-r--r--sys/kern/syscalls.master8
-rw-r--r--sys/sys/capability.h7
-rw-r--r--sys/sys/file.h1
-rw-r--r--sys/sys/proc.h6
-rw-r--r--sys/sys/procdesc.h119
-rw-r--r--sys/sys/unistd.h5
-rw-r--r--sys/sys/user.h4
23 files changed, 1074 insertions, 53 deletions
diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc
index ddc157ee1b57..fe5061d97f3c 100644
--- a/lib/libc/sys/Makefile.inc
+++ b/lib/libc/sys/Makefile.inc
@@ -96,7 +96,7 @@ MAN+= abort2.2 accept.2 access.2 acct.2 adjtime.2 \
mq_setattr.2 \
msgctl.2 msgget.2 msgrcv.2 msgsnd.2 \
msync.2 munmap.2 nanosleep.2 nfssvc.2 ntp_adjtime.2 open.2 \
- pathconf.2 pipe.2 poll.2 posix_fallocate.2 posix_openpt.2 profil.2 \
+ pathconf.2 pdfork.2 pipe.2 poll.2 posix_fallocate.2 posix_openpt.2 profil.2 \
pselect.2 ptrace.2 quotactl.2 \
read.2 readlink.2 reboot.2 recv.2 rename.2 revoke.2 rfork.2 rmdir.2 \
rtprio.2
@@ -178,6 +178,9 @@ MLINKS+=ntp_adjtime.2 ntp_gettime.2
MLINKS+=open.2 openat.2
MLINKS+=pathconf.2 fpathconf.2
MLINKS+=pathconf.2 lpathconf.2
+MLINKS+=pdfork.2 pdgetpid.2\
+ pdfork.2 pdkill.2 \
+ pdfork.2 pdwait4.2
MLINKS+=read.2 pread.2 read.2 preadv.2 read.2 readv.2
MLINKS+=readlink.2 readlinkat.2
MLINKS+=recv.2 recvfrom.2 recv.2 recvmsg.2
diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
index 547a2cff9720..095751a441cd 100644
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -366,6 +366,9 @@ FBSD_1.2 {
cap_new;
cap_getrights;
getloginclass;
+ pdfork;
+ pdgetpid;
+ pdkill;
posix_fallocate;
rctl_get_racct;
rctl_get_rules;
diff --git a/lib/libc/sys/cap_new.2 b/lib/libc/sys/cap_new.2
index 7710e12abdfa..206715e84b4e 100644
--- a/lib/libc/sys/cap_new.2
+++ b/lib/libc/sys/cap_new.2
@@ -260,7 +260,7 @@ Permit
.Xr pdkill 2 .
.It Dv CAP_PDWAIT
Permit
-.Xr pdwait 2 .
+.Xr pdwait4 2 .
.It Dv CAP_PEELOFF
Permit
.Xr sctp_peeloff 2 .
@@ -429,7 +429,7 @@ argument is not a capability.
.Xr openat 2 ,
.Xr pdgetpid 2 ,
.Xr pdkill 2 ,
-.Xr pdwait 2 ,
+.Xr pdwait4 2 ,
.Xr pipe 2 ,
.Xr poll 2 ,
.Xr pread 2 ,
diff --git a/lib/libc/sys/pdfork.2 b/lib/libc/sys/pdfork.2
new file mode 100644
index 000000000000..3f36e881ec9e
--- /dev/null
+++ b/lib/libc/sys/pdfork.2
@@ -0,0 +1,182 @@
+.\"
+.\" Copyright (c) 2009-2010 Robert N. M. Watson
+.\" All rights reserved.
+.\"
+.\" This software was developed at the University of Cambridge Computer
+.\" Laboratory with support from a grant from Google, Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd August 16, 2011
+.Dt PDFORK 2
+.Os
+.Sh NAME
+.Nm pdfork ,
+.Nm pdgetpid ,
+.Nm pdkill ,
+.Nm pdwait4
+.Nd System calls to manage process descriptors
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In sys/procdesc.h
+.Ft int
+.Fn pdfork "int *fdp" "int flags"
+.Ft int
+.Fn pdgetpid "int fd" "pid_t *pidp"
+.Ft int
+.Fn pdkill "int fd" "int signum"
+.Ft int
+.Fn pdwait4 "int fd" "int *status" "int options" "struct rusage *rusage"
+.Sh DESCRIPTION
+Process descriptors are special file descriptors that represent processes,
+and are created using
+.Fn pdfork ,
+a variant of
+.Xr fork 2 ,
+which, if successful, returns a process descriptor in the integer pointed to
+by
+.Fa fdp .
+Processes created via
+.Fn pdfork
+will not cause
+.Dv SIGCHLD
+on termination.
+.Fn pdfork
+can accept the flags:
+.Bl -tag -width ".Dv PD_DAEMON"
+.It Dv PD_DAEMON
+Instead of the default terminate-on-close behaviour, allow the process to
+live until it is explicitly killed with
+.Xr kill 2 .
+.Pp
+This option is not permitted in Capsicum capability mode (see
+.Xr cap_enter 2 ) .
+.El
+.Pp
+.Fn pdgetpid
+queries the process ID (PID) of the process descriptor
+.Fa fd .
+.Pp
+.Fn pdkill
+is functionally identical to
+.Xr kill 2 ,
+except that it accepts a process descriptor,
+.Fa fd ,
+rather than a PID.
+.Pp
+.Fn pdwait4
+behaves identically to
+.Xr wait4 2 ,
+but operates with respect to a process descriptor argument rather than a PID.
+.Pp
+The following system calls also have effects specific to process descriptors:
+.Pp
+.Xr fstat 2
+queries status of a process descriptor; currently only the
+.Fa st_mode ,
+.Fa st_birthtime ,
+.Fa st_atime ,
+.Fa st_ctime
+and
+.Fa st_mtime
+fields are defined. If the owner read, write, and execute bits are set then the
+process represented by the process descriptor is still alive.
+.Pp
+.Xr poll 2
+and
+.Xr select 2
+allow waiting for process state transitions; currently only
+.Dv POLLHUP
+is defined, and will be raised when the process dies.
+.Pp
+.Xr close 2
+will close the process descriptor unless
+.Dv PD_DAEMON
+is set; if the process is still alive and this is
+the last reference to the process descriptor, the process will be terminated
+with the signal
+.Dv SIGKILL .
+.Sh RETURN VALUES
+.Fn pdfork
+returns a PID, 0 or -1, as
+.Xr fork 2
+does.
+.Pp
+.Fn pdgetpid
+and
+.Fn pdkill
+return 0 on success and -1 on failure.
+.Pp
+.Fn pdwait4
+returns a PID on success and -1 on failure.
+.Sh ERRORS
+These functions may return the same error numbers as their PID-based equivalents
+(e.g.
+.Fn pdfork
+may return the same error numbers as
+.Xr fork 2 ) ,
+with the following additions:
+.Bl -tag -width Er
+.It Bq Er EINVAL
+The signal number given to
+.Fn pdkill
+is invalid.
+.It Bq Er ENOTCAPABLE
+The process descriptor being operated on has insufficient rights (e.g.
+.Dv CAP_PDKILL
+for
+.Fn pdkill ) .
+.El
+.Sh SEE ALSO
+.Xr close 2 ,
+.Xr fork 2 ,
+.Xr fstat 2 ,
+.Xr kill 2 ,
+.Xr poll 2 ,
+.Xr wait4 2
+.Sh HISTORY
+The
+.Fn pdfork ,
+.Fn pdgetpid ,
+.Fn pdkill
+and
+.Fn pdwait4
+system calls first appeared in
+.Fx 9.0 .
+.Pp
+Support for process descriptors was developed as part of the
+.Tn TrustedBSD
+Project.
+.Sh AUTHORS
+.An -nosplit
+These functions and the capability facility were created by
+.An "Robert N. M. Watson" Aq rwatson@FreeBSD.org
+and
+.An "Jonathan Anderson" Aq jonathan@FreeBSD.org
+at the University of Cambridge Computer Laboratory with support from a grant
+from Google, Inc.
+.Sh BUGS
+.Fn pdwait4
+has not yet been implemented.
diff --git a/sys/compat/linux/linux_fork.c b/sys/compat/linux/linux_fork.c
index bf1d45c9ea34..5d2ce5bdb0cb 100644
--- a/sys/compat/linux/linux_fork.c
+++ b/sys/compat/linux/linux_fork.c
@@ -64,7 +64,8 @@ linux_fork(struct thread *td, struct linux_fork_args *args)
printf(ARGS(fork, ""));
#endif
- if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2)) != 0)
+ if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2, NULL, 0))
+ != 0)
return (error);
td->td_retval[0] = p2->p_pid;
@@ -100,7 +101,8 @@ linux_vfork(struct thread *td, struct linux_vfork_args *args)
#endif
/* Exclude RFPPWAIT */
- if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2)) != 0)
+ if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2,
+ NULL, 0)) != 0)
return (error);
td->td_retval[0] = p2->p_pid;
@@ -190,7 +192,7 @@ linux_clone(struct thread *td, struct linux_clone_args *args)
if (args->parent_tidptr == NULL)
return (EINVAL);
- error = fork1(td, ff, 0, &p2);
+ error = fork1(td, ff, 0, &p2, NULL, 0);
if (error)
return (error);
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index 4a9ec35d0380..59f02c812a13 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1159,6 +1159,9 @@ options MAC_TEST
options CAPABILITIES # fine-grained rights on file descriptors
options CAPABILITY_MODE # sandboxes with no global namespace access
+# Support for process descriptors
+options PROCDESC
+
#####################################################################
# CLOCK OPTIONS
diff --git a/sys/conf/files b/sys/conf/files
index 0dc814e0934a..5c5d92d6b163 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2412,6 +2412,7 @@ kern/subr_witness.c optional witness
kern/sys_capability.c standard
kern/sys_generic.c standard
kern/sys_pipe.c standard
+kern/sys_procdesc.c standard
kern/sys_process.c standard
kern/sys_socket.c standard
kern/syscalls.c standard
diff --git a/sys/conf/options b/sys/conf/options
index f7026c134177..27fdbedacb60 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -149,6 +149,7 @@ PPC_DEBUG opt_ppc.h
PPC_PROBE_CHIPSET opt_ppc.h
PPS_SYNC opt_ntp.h
PREEMPTION opt_sched.h
+PROCDESC opt_procdesc.h
QUOTA
SCHED_4BSD opt_sched.h
SCHED_STATS opt_sched.h
diff --git a/sys/kern/capabilities.conf b/sys/kern/capabilities.conf
index 004c2ddf2077..4a62643623eb 100644
--- a/sys/kern/capabilities.conf
+++ b/sys/kern/capabilities.conf
@@ -475,7 +475,7 @@ openbsd_poll
pdfork
pdgetpid
pdkill
-pdwait4
+#pdwait4 # not yet implemented
##
## Allow pipe(2).
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index be5c26fec029..fc072457c191 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -790,7 +790,8 @@ create_init(const void *udata __unused)
struct ucred *newcred, *oldcred;
int error;
- error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, 0, &initproc);
+ error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, 0, &initproc,
+ NULL, 0);
if (error)
panic("cannot fork init: %d\n", error);
KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 85f866c39c0a..4aaed1f040e8 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_ktrace.h"
+#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -65,6 +66,7 @@ __FBSDID("$FreeBSD$");
#include <sys/pipe.h>
#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/procdesc.h>
#include <sys/protosw.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
@@ -120,6 +122,8 @@ static int fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
static int fill_socket_info(struct socket *so, struct kinfo_file *kif);
static int fill_pts_info(struct tty *tp, struct kinfo_file *kif);
static int fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
+static int fill_procdesc_info(struct procdesc *pdp,
+ struct kinfo_file *kif);
/*
* A process is initially started out with NDFILE descriptors stored within
@@ -3056,6 +3060,12 @@ sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
tp = fp->f_data;
break;
+#ifdef PROCDESC
+ case DTYPE_PROCDESC:
+ kif->kf_type = KF_TYPE_PROCDESC;
+ break;
+#endif
+
default:
kif->kf_type = KF_TYPE_UNKNOWN;
break;
@@ -3218,6 +3228,9 @@ export_fd_for_sysctl(void *data, int type, int fd, int fflags, int refcnt,
case KF_TYPE_PTS:
error = fill_pts_info((struct tty *)data, kif);
break;
+ case KF_TYPE_PROCDESC:
+ error = fill_procdesc_info((struct procdesc *)data, kif);
+ break;
default:
error = 0;
}
@@ -3391,6 +3404,13 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
data = fp->f_data;
break;
+#ifdef PROCDESC
+ case DTYPE_PROCDESC:
+ type = KF_TYPE_PROCDESC;
+ data = fp->f_data;
+ break;
+#endif
+
default:
type = KF_TYPE_UNKNOWN;
break;
@@ -3586,6 +3606,16 @@ fill_pipe_info(struct pipe *pi, struct kinfo_file *kif)
return (0);
}
+static int /* fill the procdesc-specific part of a kinfo_file record */
+fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif)
+{
+
+ if (pdp == NULL)
+ return (1); /* nothing to export; kif is left untouched */
+ kif->kf_un.kf_proc.kf_pid = pdp->pd_pid; /* pid recorded in the descriptor */
+ return (0);
+}
+
static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD,
sysctl_kern_proc_filedesc, "Process filedesc entries");
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index 30b94b6a28a4..e5d60942f7c0 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -40,16 +40,19 @@ __FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
+#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
+#include <sys/capability.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/procdesc.h>
#include <sys/pioctl.h>
#include <sys/jail.h>
#include <sys/tty.h>
@@ -461,39 +464,54 @@ exit1(struct thread *td, int rv)
knlist_clear(&p->p_klist, 1);
/*
- * Notify parent that we're gone. If parent has the PS_NOCLDWAIT
- * flag set, or if the handler is set to SIG_IGN, notify process
- * 1 instead (and hope it will handle this situation).
+ * If this is a process with a descriptor, we may not need to deliver
+ * a signal to the parent. proctree_lock is held over
+ * procdesc_exit() to serialize concurrent calls to close() and
+ * exit().
*/
- PROC_LOCK(p->p_pptr);
- mtx_lock(&p->p_pptr->p_sigacts->ps_mtx);
- if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) {
- struct proc *pp;
-
- mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
- pp = p->p_pptr;
- PROC_UNLOCK(pp);
- proc_reparent(p, initproc);
- p->p_sigparent = SIGCHLD;
- PROC_LOCK(p->p_pptr);
-
+#ifdef PROCDESC
+ if (p->p_procdesc == NULL || procdesc_exit(p)) {
+#endif
/*
- * Notify parent, so in case he was wait(2)ing or
- * executing waitpid(2) with our pid, he will
- * continue.
+ * Notify parent that we're gone. If parent has the
+ * PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN,
+ * notify process 1 instead (and hope it will handle this
+ * situation).
*/
- wakeup(pp);
+ PROC_LOCK(p->p_pptr);
+ mtx_lock(&p->p_pptr->p_sigacts->ps_mtx);
+ if (p->p_pptr->p_sigacts->ps_flag &
+ (PS_NOCLDWAIT | PS_CLDSIGIGN)) {
+ struct proc *pp;
+
+ mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
+ pp = p->p_pptr;
+ PROC_UNLOCK(pp);
+ proc_reparent(p, initproc);
+ p->p_sigparent = SIGCHLD;
+ PROC_LOCK(p->p_pptr);
+
+ /*
+ * Notify parent, so in case he was wait(2)ing or
+ * executing waitpid(2) with our pid, he will
+ * continue.
+ */
+ wakeup(pp);
+ } else
+ mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
+
+ if (p->p_pptr == initproc)
+ psignal(p->p_pptr, SIGCHLD);
+ else if (p->p_sigparent != 0) {
+ if (p->p_sigparent == SIGCHLD)
+ childproc_exited(p);
+ else /* LINUX thread */
+ psignal(p->p_pptr, p->p_sigparent);
+ }
+#ifdef PROCDESC
} else
- mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
-
- if (p->p_pptr == initproc)
- psignal(p->p_pptr, SIGCHLD);
- else if (p->p_sigparent != 0) {
- if (p->p_sigparent == SIGCHLD)
- childproc_exited(p);
- else /* LINUX thread */
- psignal(p->p_pptr, p->p_sigparent);
- }
+ PROC_LOCK(p->p_pptr);
+#endif
sx_xunlock(&proctree_lock);
/*
@@ -660,7 +678,7 @@ wait4(struct thread *td, struct wait_args *uap)
* rusage. Asserts and will release both the proctree_lock and the process
* lock as part of its work.
*/
-static void
+void
proc_reap(struct thread *td, struct proc *p, int *status, int options,
struct rusage *rusage)
{
@@ -722,6 +740,10 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options,
sx_xunlock(&allproc_lock);
LIST_REMOVE(p, p_sibling);
leavepgrp(p);
+#ifdef PROCDESC
+ if (p->p_procdesc != NULL)
+ procdesc_reap(p);
+#endif
sx_xunlock(&proctree_lock);
/*
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 9d3e22d224cf..32d00550a815 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -40,11 +40,13 @@ __FBSDID("$FreeBSD$");
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_kstack_pages.h"
+#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
+#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
@@ -55,6 +57,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/procdesc.h>
#include <sys/pioctl.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
@@ -104,7 +107,7 @@ fork(struct thread *td, struct fork_args *uap)
int error;
struct proc *p2;
- error = fork1(td, RFFDG | RFPROC, 0, &p2);
+ error = fork1(td, RFFDG | RFPROC, 0, &p2, NULL, 0);
if (error == 0) {
td->td_retval[0] = p2->p_pid;
td->td_retval[1] = 0;
@@ -112,6 +115,34 @@ fork(struct thread *td, struct fork_args *uap)
return (error);
}
+/* ARGSUSED */
+int
+pdfork(td, uap)
+ struct thread *td;
+ struct pdfork_args *uap;
+{
+#ifdef PROCDESC
+ int error, fd;
+ struct proc *p2;
+
+ /*
+ * pdfork(2): fork a child that is represented by a process
+ * descriptor. fork1() is called with RFPROCDESC in addition to the
+ * RFFDG | RFPROC flags used by fork(2), together with a pointer to fd
+ * for the new descriptor number and the caller's pdfork flags.
+ *
+ * It is necessary to return fd by reference because 0 is a valid file
+ * descriptor number, and the child needs to be able to distinguish
+ * itself from the parent using the return value.
+ *
+ * On success the child's pid is placed in td_retval[0] and fd is
+ * copied out to uap->fdp; a copyout() failure is returned as the
+ * error even though fork1() has already succeeded by then.
+ */
+ error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2,
+ &fd, uap->flags);
+ if (error == 0) {
+ td->td_retval[0] = p2->p_pid;
+ td->td_retval[1] = 0;
+ error = copyout(&fd, uap->fdp, sizeof(fd));
+ }
+ return (error);
+#else
+ return (ENOSYS); /* kernel compiled without the PROCDESC option */
+#endif
+}
+
/* ARGSUSED */
int
vfork(struct thread *td, struct vfork_args *uap)
@@ -124,7 +155,7 @@ vfork(struct thread *td, struct vfork_args *uap)
#else
flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
#endif
- error = fork1(td, flags, 0, &p2);
+ error = fork1(td, flags, 0, &p2, NULL, 0);
if (error == 0) {
td->td_retval[0] = p2->p_pid;
td->td_retval[1] = 0;
@@ -143,7 +174,7 @@ rfork(struct thread *td, struct rfork_args *uap)
return (EINVAL);
AUDIT_ARG_FFLAGS(uap->flags);
- error = fork1(td, uap->flags, 0, &p2);
+ error = fork1(td, uap->flags, 0, &p2, NULL, 0);
if (error == 0) {
td->td_retval[0] = p2 ? p2->p_pid : 0;
td->td_retval[1] = 0;
@@ -337,7 +368,7 @@ fail:
static void
do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
- struct vmspace *vm2)
+ struct vmspace *vm2, int pdflags)
{
struct proc *p1, *pptr;
int p2_held, trypid;
@@ -625,6 +656,16 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
p2->p_vmspace->vm_ssize);
}
+#ifdef PROCDESC
+ /*
+ * Associate the process descriptor with the process before anything
+ * can happen that might cause that process to need the descriptor.
+ * However, don't do this until after fork(2) can no longer fail.
+ */
+ if (flags & RFPROCDESC)
+ procdesc_new(p2, pdflags);
+#endif
+
/*
* Both processes are set up, now check if any loadable modules want
* to adjust anything.
@@ -710,7 +751,8 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
}
int
-fork1(struct thread *td, int flags, int pages, struct proc **procp)
+fork1(struct thread *td, int flags, int pages, struct proc **procp,
+ int *procdescp, int pdflags)
{
struct proc *p1;
struct proc *newproc;
@@ -721,6 +763,9 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp)
int error;
static int curfail;
static struct timeval lastfail;
+#ifdef PROCDESC
+ struct file *fp_procdesc = NULL;
+#endif
/* Check for the undefined or unimplemented flags. */
if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
@@ -738,6 +783,18 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp)
if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
return (EINVAL);
+#ifdef PROCDESC
+ if ((flags & RFPROCDESC) != 0) {
+ /* A process descriptor cannot be created without creating a process. */
+ if ((flags & RFPROC) == 0)
+ return (EINVAL);
+
+ /* Must provide a place to put a procdesc if creating one. */
+ if (procdescp == NULL)
+ return (EINVAL);
+ }
+#endif
+
p1 = td->td_proc;
/*
@@ -757,6 +814,25 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp)
return (EAGAIN);
#endif
+#ifdef PROCDESC
+ /*
+ * If required, create a process descriptor in the parent first; we
+ * will abandon it if something goes wrong. We don't finit() until
+ * later.
+ */
+ if (flags & RFPROCDESC) {
+ error = falloc(td, &fp_procdesc, procdescp, 0);
+ if (error != 0) {
+#ifdef RACCT
+ PROC_LOCK(p1);
+ racct_sub(p1, RACCT_NPROC, 1);
+ PROC_UNLOCK(p1);
+#endif
+ return (error);
+ }
+ }
+#endif
+
mem_charged = 0;
vm2 = NULL;
if (pages == 0)
@@ -868,12 +944,16 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp)
PROC_UNLOCK(p1);
}
if (ok) {
- do_fork(td, flags, newproc, td2, vm2);
+ do_fork(td, flags, newproc, td2, vm2, pdflags);
/*
* Return child proc pointer to parent.
*/
*procp = newproc;
+#ifdef PROCDESC
+ if (flags & RFPROCDESC)
+ procdesc_finit(newproc->p_procdesc, fp_procdesc);
+#endif
return (0);
}
@@ -892,6 +972,10 @@ fail1:
if (vm2 != NULL)
vmspace_free(vm2);
uma_zfree(proc_zone, newproc);
+#ifdef PROCDESC
+ if (((flags & RFPROCDESC) != 0) && (fp_procdesc != NULL))
+ fdrop(fp_procdesc, td);
+#endif
pause("fork", hz / 2);
#ifdef RACCT
PROC_LOCK(p1);
diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c
index 95f896fa5aa6..bb1246980d5b 100644
--- a/sys/kern/kern_kthread.c
+++ b/sys/kern/kern_kthread.c
@@ -88,7 +88,7 @@ kproc_create(void (*func)(void *), void *arg,
panic("kproc_create called too soon");
error = fork1(&thread0, RFMEM | RFFDG | RFPROC | RFSTOPPED | flags,
- pages, &p2);
+ pages, &p2, NULL, 0);
if (error)
return error;
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index e1861eb1e81d..26ef0d7f31d4 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -41,12 +41,14 @@ __FBSDID("$FreeBSD$");
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_core.h"
+#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/acct.h>
+#include <sys/capability.h>
#include <sys/condvar.h>
#include <sys/event.h>
#include <sys/fcntl.h>
@@ -59,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
+#include <sys/procdesc.h>
#include <sys/posix4.h>
#include <sys/pioctl.h>
#include <sys/racct.h>
@@ -1698,6 +1701,34 @@ kill(struct thread *td, struct kill_args *uap)
/* NOTREACHED */
}
+int /* pdkill(2): send a signal to the process behind a process descriptor */
+pdkill(td, uap)
+ struct thread *td;
+ struct pdkill_args *uap;
+{
+#ifdef PROCDESC
+ struct proc *p;
+ int error;
+
+ AUDIT_ARG_SIGNUM(uap->signum);
+ AUDIT_ARG_FD(uap->fd);
+ if ((u_int)uap->signum > _SIG_MAXSIG) /* unsigned compare; presumably also rejects negative signum - confirm */
+ return (EINVAL);
+
+ error = procdesc_find(td, uap->fd, CAP_PDKILL, &p); /* on success p comes back locked */
+ if (error)
+ return (error);
+ AUDIT_ARG_PROCESS(p);
+ error = p_cansignal(td, p, uap->signum);
+ if (error == 0 && uap->signum) /* signum 0: permission check only, nothing is delivered */
+ psignal(p, uap->signum);
+ PROC_UNLOCK(p); /* release the process lock taken by procdesc_find() */
+ return (error);
+#else
+ return (ENOSYS); /* kernel compiled without the PROCDESC option */
+#endif
+}
+
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct okillpg_args {
diff --git a/sys/kern/sys_procdesc.c b/sys/kern/sys_procdesc.c
new file mode 100644
index 000000000000..9993732527cd
--- /dev/null
+++ b/sys/kern/sys_procdesc.c
@@ -0,0 +1,524 @@
+/*-
+ * Copyright (c) 2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed at the University of Cambridge Computer
+ * Laboratory with support from a grant from Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*-
+ * FreeBSD process descriptor facility.
+ *
+ * Some processes are represented by a file descriptor, which will be used in
+ * preference to signaling and pids for the purposes of process management,
+ * and is, in effect, a form of capability. When a process descriptor is
+ * used with a process, that process ceases to be visible to certain
+ * traditional UNIX process facilities, such as waitpid(2).
+ *
+ * Some semantics:
+ *
+ * - At most one process descriptor will exist for any process, although
+ * references to that descriptor may be held from many processes (or even
+ * be in flight between processes over a local domain socket).
+ * - Last close on the process descriptor will terminate the process using
+ * SIGKILL and reparent it to init so that there's a process to reap it
+ * when it's done exiting.
+ * - If the process exits before the descriptor is closed, it will not
+ * generate SIGCHLD on termination, or be picked up by waitpid().
+ * - The pdkill(2) system call may be used to deliver a signal to the process
+ * using its process descriptor.
+ * - The pdwait4(2) system call may be used to block (or not) on a process
+ * descriptor to collect termination information.
+ *
+ * Open questions:
+ *
+ * - How to handle ptrace(2)?
+ * - Will we want to add a pidtoprocdesc(2) system call to allow process
+ * descriptors to be created for processes without pdfork(2)?
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_procdesc.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/procdesc.h>
+#include <sys/resourcevar.h>
+#include <sys/stat.h>
+#include <sys/sysproto.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/ucred.h>
+
+#include <security/audit/audit.h>
+
+#include <vm/uma.h>
+
+#ifdef PROCDESC
+
+FEATURE(process_descriptors, "Process Descriptors");
+
+static uma_zone_t procdesc_zone;
+
+static fo_rdwr_t procdesc_read;
+static fo_rdwr_t procdesc_write;
+static fo_truncate_t procdesc_truncate;
+static fo_ioctl_t procdesc_ioctl;
+static fo_poll_t procdesc_poll;
+static fo_kqfilter_t procdesc_kqfilter;
+static fo_stat_t procdesc_stat;
+static fo_close_t procdesc_close;
+static fo_chmod_t procdesc_chmod;
+static fo_chown_t procdesc_chown;
+
+static struct fileops procdesc_ops = { /* installed on the struct file by procdesc_finit() */
+ .fo_read = procdesc_read,
+ .fo_write = procdesc_write,
+ .fo_truncate = procdesc_truncate,
+ .fo_ioctl = procdesc_ioctl,
+ .fo_poll = procdesc_poll,
+ .fo_kqfilter = procdesc_kqfilter,
+ .fo_stat = procdesc_stat,
+ .fo_close = procdesc_close,
+ .fo_chmod = procdesc_chmod,
+ .fo_chown = procdesc_chown,
+ .fo_flags = DFLAG_PASSABLE, /* NOTE(review): presumably allows passing over local domain sockets, as the file header describes - confirm in sys/file.h */
+};
+
+/*
+ * procdesc_init() - create procdesc_zone, the UMA zone from which every
+ * struct procdesc is allocated (see procdesc_new()), and panic if the zone
+ * could not be created. The dummy argument is unused.
+ *
+ * Initialize with VFS so that process descriptors are available along with
+ * other file descriptor types. As long as it runs before init(8) starts,
+ * there shouldn't be a problem. The SYSINIT() below registers this
+ * function at SI_SUB_VFS with SI_ORDER_ANY.
+ */
+static void
+procdesc_init(void *dummy __unused)
+{
+
+ procdesc_zone = uma_zcreate("procdesc", sizeof(struct procdesc),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ if (procdesc_zone == NULL)
+ panic("procdesc_init: procdesc_zone not initialized");
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, procdesc_init, NULL);
+
+/*
+ * procdesc_find() - return a locked process given a process descriptor.
+ *
+ * The descriptor fd is looked up with fget() using the requested capability
+ * rights. On success 0 is returned, *p points at the process and that
+ * process is locked with PROC_LOCK(); the caller is responsible for
+ * unlocking it (pdkill() does so with PROC_UNLOCK()).
+ *
+ * Errors: whatever fget() returns; EBADF if fd does not refer to a
+ * DTYPE_PROCDESC file; ESRCH if the descriptor no longer has a process
+ * attached (pd_proc is NULL), i.e. it has died. *p is not written on
+ * failure.
+ *
+ * pd_proc is examined with proctree_lock held shared, and the file
+ * reference obtained from fget() is dropped before returning on every path
+ * that acquired it.
+ */
+int
+procdesc_find(struct thread *td, int fd, cap_rights_t rights,
+ struct proc **p)
+{
+ struct procdesc *pd;
+ struct file *fp;
+ int error;
+
+ error = fget(td, fd, rights, &fp);
+ if (error)
+ return (error);
+ if (fp->f_type != DTYPE_PROCDESC) {
+ error = EBADF;
+ goto out;
+ }
+ pd = fp->f_data;
+ sx_slock(&proctree_lock);
+ if (pd->pd_proc != NULL) {
+ *p = pd->pd_proc;
+ PROC_LOCK(*p);
+ } else
+ error = ESRCH;
+ sx_sunlock(&proctree_lock);
+out:
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * procdesc_pid() - return the pid recorded in a process descriptor.
+ *
+ * Function to be used by procstat(1) sysctls when returning procdesc
+ * information. The caller must pass a struct file of type DTYPE_PROCDESC
+ * (asserted). The value comes from pd_pid, which procdesc_new() fills in
+ * when the descriptor is created; pd_proc is not dereferenced here.
+ */
+pid_t
+procdesc_pid(struct file *fp_procdesc)
+{
+ struct procdesc *pd;
+
+ KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC,
+ ("procdesc_pid: !procdesc"));
+
+ pd = fp_procdesc->f_data;
+ return (pd->pd_pid);
+}
+
+/*
+ * kern_pdgetpid() - retrieve the PID associated with a process descriptor.
+ *
+ * fd is looked up with fget() using the supplied capability rights. On
+ * success 0 is returned and the pid is stored in the kernel-space location
+ * *pidp. Returns the fget() error if the lookup fails, or EBADF if fd is
+ * not a DTYPE_PROCDESC file; *pidp is not written in either case. The
+ * file reference is dropped before returning.
+ */
+int
+kern_pdgetpid(struct thread *td, int fd, cap_rights_t rights, pid_t *pidp)
+{
+ struct file *fp;
+ int error;
+
+ error = fget(td, fd, rights, &fp);
+ if (error)
+ return (error);
+ if (fp->f_type != DTYPE_PROCDESC) {
+ error = EBADF;
+ goto out;
+ }
+ *pidp = procdesc_pid(fp);
+out:
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * System call to return the pid of a process given its process descriptor.
+ *
+ * The descriptor number is recorded for audit, the pid is obtained through
+ * kern_pdgetpid() with the CAP_PDGETPID right, and on success it is copied
+ * out to the user address uap->pidp. Returns 0, the kern_pdgetpid() error,
+ * or the copyout() error.
+ */
+int
+pdgetpid(struct thread *td, struct pdgetpid_args *uap)
+{
+ pid_t pid;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ error = kern_pdgetpid(td, uap->fd, CAP_PDGETPID, &pid);
+ if (error == 0)
+ error = copyout(&pid, uap->pidp, sizeof(pid));
+ return (error);
+}
+
+/*
+ * procdesc_new() - allocate a process descriptor and attach it to process p.
+ *
+ * When a new process is forked by pdfork(), a file descriptor is allocated
+ * by the fork code first, then the process is forked, and then we get a
+ * chance to set up the process descriptor. Failure is not permitted at this
+ * point, so procdesc_new() must succeed; the allocation is made with
+ * M_WAITOK accordingly.
+ *
+ * The descriptor and the process are linked in both directions (pd_proc and
+ * p_procdesc) and the pid is cached in pd_pid. Of the caller's pdfork
+ * flags only PD_DAEMON is examined; it is recorded as PDF_DAEMON in
+ * pd_flags.
+ */
+void
+procdesc_new(struct proc *p, int flags)
+{
+ struct procdesc *pd;
+
+ pd = uma_zalloc(procdesc_zone, M_WAITOK | M_ZERO);
+ pd->pd_proc = p;
+ pd->pd_pid = p->p_pid;
+ p->p_procdesc = pd;
+ pd->pd_flags = 0;
+ if (flags & PD_DAEMON)
+ pd->pd_flags |= PDF_DAEMON;
+ PROCDESC_LOCK_INIT(pd);
+
+ /*
+ * Process descriptors start out with two references: one from their
+ * struct file, and the other from their struct proc.
+ */
+ refcount_init(&pd->pd_refcount, 2);
+}
+
+/*
+ * Initialize a file with a process descriptor: fp is set up via finit() as
+ * a DTYPE_PROCDESC file with FREAD | FWRITE, pdp as its private data and
+ * procdesc_ops as its operations vector.
+ */
+void
+procdesc_finit(struct procdesc *pdp, struct file *fp)
+{
+
+ finit(fp, FREAD | FWRITE, DTYPE_PROCDESC, pdp, &procdesc_ops);
+}
+
+static void
+procdesc_free(struct procdesc *pd)
+{
+
+ /*
+ * Drop one reference on the process descriptor and, if it was the
+ * last one, destroy its lock and return it to procdesc_zone.
+ *
+ * When the last reference is released, we assert that the descriptor
+ * has been closed, but not that the process has exited, as we will
+ * detach the descriptor before the process dies if the descriptor is
+ * closed, as we can't wait synchronously.
+ */
+ if (refcount_release(&pd->pd_refcount)) {
+ KASSERT(pd->pd_proc == NULL,
+ ("procdesc_free: pd_proc != NULL"));
+ KASSERT((pd->pd_flags & PDF_CLOSED),
+ ("procdesc_free: !PDF_CLOSED"));
+
+ PROCDESC_LOCK_DESTROY(pd);
+ uma_zfree(procdesc_zone, pd);
+ }
+}
+
+/*
+ * procdesc_exit() - notify a process descriptor that its process is exiting.
+ * We use the proctree_lock to ensure that process exit either happens
+ * strictly before or strictly after a concurrent call to procdesc_close().
+ *
+ * Must be called with proctree_lock held exclusively and the process lock
+ * on p owned (both asserted), on a process that has a descriptor. The
+ * descriptor is always marked PDF_EXITED.
+ *
+ * Returns 1 if the descriptor had already been closed; the descriptor is
+ * then detached from the process and the process's reference dropped, and
+ * exit1() goes on to notify the parent as for an ordinary process.
+ * Returns 0 if the descriptor is still open; any thread recorded as
+ * selecting on it is woken up, and exit1() skips the parent notification.
+ */
+int
+procdesc_exit(struct proc *p)
+{
+ struct procdesc *pd;
+
+ sx_assert(&proctree_lock, SA_XLOCKED);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL"));
+
+ pd = p->p_procdesc;
+
+ PROCDESC_LOCK(pd);
+ KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == initproc,
+ ("procdesc_exit: closed && parent not init"));
+
+ pd->pd_flags |= PDF_EXITED;
+
+ /*
+ * If the process descriptor has been closed, then we have nothing
+ * to do; return 1 so that init will get SIGCHLD and do the reaping.
+ * Clean up the procdesc now rather than letting it happen during
+ * that reap.
+ */
+ if (pd->pd_flags & PDF_CLOSED) {
+ PROCDESC_UNLOCK(pd);
+ pd->pd_proc = NULL;
+ p->p_procdesc = NULL;
+ procdesc_free(pd);
+ return (1);
+ }
+ if (pd->pd_flags & PDF_SELECTED) {
+ pd->pd_flags &= ~PDF_SELECTED;
+ selwakeup(&pd->pd_selinfo);
+ }
+ PROCDESC_UNLOCK(pd);
+ return (0);
+}
+
+/*
+ * When a process descriptor is reaped, perhaps as a result of close() or
+ * pdwait4(), release the process's reference on the process descriptor.
+ *
+ * The caller must hold proctree_lock exclusively (asserted) and p must
+ * still have a process descriptor attached.
+ */
+void
+procdesc_reap(struct proc *p)
+{
+	struct procdesc *pd;
+
+	sx_assert(&proctree_lock, SA_XLOCKED);
+	KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL"));
+
+	pd = p->p_procdesc;
+
+	/*
+	 * Sever the link in both directions before dropping the reference,
+	 * as procdesc_exit() and procdesc_close() do.  procdesc_free() may
+	 * free the descriptor, so p->p_procdesc must not be left pointing
+	 * at it.
+	 */
+	pd->pd_proc = NULL;
+	p->p_procdesc = NULL;
+	procdesc_free(pd);
+}
+
+/*
+ * procdesc_close() - last close on a process descriptor.  If the process is
+ * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let
+ * init(8) clean up the mess; if not, we have to clean up the zombie ourselves.
+ * If the process has already been reaped and detached (pd_proc == NULL),
+ * only the file's reference on the descriptor is left to release.
+ *
+ * Always returns 0.
+ */
+static int
+procdesc_close(struct file *fp, struct thread *td)
+{
+	struct procdesc *pd;
+	struct proc *p;
+
+	KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc"));
+
+	pd = fp->f_data;
+	fp->f_ops = &badfileops;
+	fp->f_data = NULL;
+
+	sx_xlock(&proctree_lock);
+	PROCDESC_LOCK(pd);
+	pd->pd_flags |= PDF_CLOSED;
+	PROCDESC_UNLOCK(pd);
+	p = pd->pd_proc;
+	if (p == NULL) {
+		/*
+		 * The process has already been reaped and detached by
+		 * procdesc_reap(); there is no process left to lock, reap
+		 * or signal.
+		 */
+		sx_xunlock(&proctree_lock);
+	} else {
+		PROC_LOCK(p);
+		if (p->p_state == PRS_ZOMBIE) {
+			/*
+			 * If the process is already dead and just awaiting
+			 * reaping, do that now.  This will release the
+			 * process's reference to the process descriptor when
+			 * it calls back into procdesc_reap().
+			 *
+			 * NOTE(review): neither the process lock nor
+			 * proctree_lock is released on this path, so
+			 * proc_reap() is presumably responsible for dropping
+			 * both -- confirm against its definition.
+			 */
+			PROC_SLOCK(p);
+			proc_reap(curthread, p, NULL, 0, NULL);
+		} else {
+			/*
+			 * If the process is not yet dead, we need to kill it,
+			 * but we can't wait around synchronously for it to go
+			 * away, as that path leads to madness (and
+			 * deadlocks).  First, detach the process from its
+			 * descriptor so that its exit status will be reported
+			 * normally.  This drops the process's reference; the
+			 * file's reference keeps pd valid until the final
+			 * procdesc_free() below.
+			 */
+			pd->pd_proc = NULL;
+			p->p_procdesc = NULL;
+			procdesc_free(pd);
+
+			/*
+			 * Next, reparent it to init(8) so that there's
+			 * someone to pick up the pieces; finally, terminate
+			 * with prejudice.
+			 *
+			 * pd_flags holds PDF_ values, so test PDF_DAEMON.
+			 * The pdfork(2) flag PD_DAEMON has the same numeric
+			 * value as PDF_CLOSED, which was set above; testing
+			 * it would mean the process is never signalled.
+			 */
+			p->p_sigparent = SIGCHLD;
+			proc_reparent(p, initproc);
+			if ((pd->pd_flags & PDF_DAEMON) == 0)
+				psignal(p, SIGKILL);
+			PROC_UNLOCK(p);
+			sx_xunlock(&proctree_lock);
+		}
+	}
+
+	/*
+	 * Release the file descriptor's reference on the process descriptor.
+	 */
+	procdesc_free(pd);
+	return (0);
+}
+
+static int
+procdesc_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ /* Reading a process descriptor is not supported; always EOPNOTSUPP. */
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ /* Writing a process descriptor is not supported; always EOPNOTSUPP. */
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+ /* Truncating a process descriptor is not supported; always EOPNOTSUPP. */
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_ioctl(struct file *fp, u_long com, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+ /* No ioctl commands are implemented; every request gets EOPNOTSUPP. */
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Poll a process descriptor.  Once the process has exited (PDF_EXITED) the
+ * result is POLLHUP.  Until then the calling thread is recorded on the
+ * descriptor's selinfo and PDF_SELECTED is set, which is the flag
+ * procdesc_exit() checks before calling selwakeup().  The requested events
+ * mask is not consulted.
+ */
+static int
+procdesc_poll(struct file *fp, int events, struct ucred *active_cred,
+    struct thread *td)
+{
+	struct procdesc *pd;
+	int ready;
+
+	pd = fp->f_data;
+	ready = 0;
+
+	PROCDESC_LOCK(pd);
+	if ((pd->pd_flags & PDF_EXITED) != 0) {
+		ready |= POLLHUP;
+	} else {
+		/* Nothing to report yet: arrange to be woken on exit. */
+		selrecord(td, &pd->pd_selinfo);
+		pd->pd_flags |= PDF_SELECTED;
+	}
+	PROCDESC_UNLOCK(pd);
+
+	return (ready);
+}
+
+static int
+procdesc_kqfilter(struct file *fp, struct knote *kn)
+{
+ /* No kqueue filter is implemented; always fails with EOPNOTSUPP. */
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Fill in a struct stat for a process descriptor.  The buffer is zeroed
+ * first.  While a process is still attached, its start time (offset by
+ * boottime) is reported as the birth, access, change and modification
+ * times, and its real uid/gid are reported as the owner.  The mode is
+ * S_IFREG | S_IRWXU for a process that is not a zombie, and plain S_IFREG
+ * for a zombie or a descriptor with no process attached.  Always returns 0.
+ */
+static int
+procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+    struct thread *td)
+{
+	struct procdesc *pd;
+	struct proc *target;
+	struct timeval started;
+
+	/*
+	 * XXXRW: Perhaps we should cache some more information from the
+	 * process so that we can return it reliably here even after it has
+	 * died.  For example, caching its credential data.
+	 */
+	bzero(sb, sizeof(*sb));
+	pd = fp->f_data;
+
+	sx_slock(&proctree_lock);
+	target = pd->pd_proc;
+	if (target == NULL) {
+		/* No process attached any more. */
+		sb->st_mode = S_IFREG;
+	} else {
+		PROC_LOCK(target);
+
+		/* Birth and [acm] times all report the process start time. */
+		started = target->p_stats->p_start;
+		timevaladd(&started, &boottime);
+		TIMEVAL_TO_TIMESPEC(&started, &sb->st_birthtim);
+		sb->st_atim = sb->st_birthtim;
+		sb->st_ctim = sb->st_birthtim;
+		sb->st_mtim = sb->st_birthtim;
+
+		if (target->p_state == PRS_ZOMBIE)
+			sb->st_mode = S_IFREG;
+		else
+			sb->st_mode = S_IFREG | S_IRWXU;
+
+		sb->st_uid = target->p_ucred->cr_ruid;
+		sb->st_gid = target->p_ucred->cr_rgid;
+		PROC_UNLOCK(target);
+	}
+	sx_sunlock(&proctree_lock);
+
+	return (0);
+}
+
+static int
+procdesc_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+ /* Changing the mode is not supported; always fails with EOPNOTSUPP. */
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+ /* Changing the owner is not supported; always fails with EOPNOTSUPP. */
+ return (EOPNOTSUPP);
+}
+
+#else /* !PROCDESC */
+
+int
+pdgetpid(struct thread *td, struct pdgetpid_args *uap)
+{
+ /* Built without PROCDESC: the system call is reported as ENOSYS. */
+ return (ENOSYS);
+}
+
+#endif /* PROCDESC */
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index 0b249a5b55ad..b79c6c7109cc 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -919,10 +919,10 @@
u_int64_t *rightsp); }
516 AUE_CAP_ENTER STD { int cap_enter(void); }
517 AUE_CAP_GETMODE STD { int cap_getmode(u_int *modep); }
-518 AUE_PDFORK UNIMPL pdfork
-519 AUE_PDKILL UNIMPL pdkill
-520 AUE_PDGETPID UNIMPL pdgetpid
-521 AUE_PDWAIT UNIMPL pdwait
+518 AUE_PDFORK STD { int pdfork(int *fdp, int flags); }
+519 AUE_PDKILL STD { int pdkill(int fd, int signum); }
+520 AUE_PDGETPID STD { int pdgetpid(int fd, pid_t *pidp); }
+521 AUE_PDWAIT UNIMPL pdwait4
522 AUE_SELECT STD { int pselect(int nd, fd_set *in, \
fd_set *ou, fd_set *ex, \
const struct timespec *ts, \
diff --git a/sys/sys/capability.h b/sys/sys/capability.h
index d67dc179b0de..81446a281904 100644
--- a/sys/sys/capability.h
+++ b/sys/sys/capability.h
@@ -131,8 +131,13 @@
#define CAP_IOCTL 0x0004000000000000ULL
#define CAP_TTYHOOK 0x0008000000000000ULL
+/* Process management via process descriptors. */
+#define CAP_PDGETPID 0x0010000000000000ULL
+#define CAP_PDWAIT 0x0020000000000000ULL
+#define CAP_PDKILL 0x0040000000000000ULL
+
/* The mask of all valid method rights. */
-#define CAP_MASK_VALID 0x000fffffffffffffULL
+#define CAP_MASK_VALID 0x007fffffffffffffULL
#ifdef _KERNEL
diff --git a/sys/sys/file.h b/sys/sys/file.h
index 5a4af332ebb9..57e7047e8b32 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -65,6 +65,7 @@ struct socket;
#define DTYPE_PTS 10 /* pseudo teletype master device */
#define DTYPE_DEV 11 /* Device specific fd type */
#define DTYPE_CAPABILITY 12 /* capability */
+#define DTYPE_PROCDESC 13 /* process descriptor */
#ifdef _KERNEL
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 233efe985640..67adbe5b5899 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -166,6 +166,7 @@ struct mqueue_notifier;
struct nlminfo;
struct p_sched;
struct proc;
+struct procdesc;
struct racct;
struct sleepqueue;
struct td_sched;
@@ -534,6 +535,7 @@ struct proc {
int p_boundary_count;/* (c) Num threads at user boundary */
int p_pendingcnt; /* how many signals are pending */
struct itimers *p_itimers; /* (c) POSIX interval timers. */
+ struct procdesc *p_procdesc; /* (e) Process descriptor, if any. */
/* End area that is zeroed on creation. */
#define p_endzero p_magic
@@ -822,7 +824,7 @@ int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp,
int enterthispgrp(struct proc *p, struct pgrp *pgrp);
void faultin(struct proc *p);
void fixjobc(struct proc *p, struct pgrp *pgrp, int entering);
-int fork1(struct thread *, int, int, struct proc **);
+int fork1(struct thread *, int, int, struct proc **, int *, int);
void fork_exit(void (*)(void *, struct trapframe *), void *,
struct trapframe *);
void fork_return(struct thread *, struct trapframe *);
@@ -844,6 +846,8 @@ void pargs_hold(struct pargs *pa);
void procinit(void);
void proc_linkup0(struct proc *p, struct thread *td);
void proc_linkup(struct proc *p, struct thread *td);
+void proc_reap(struct thread *td, struct proc *p, int *status, int options,
+ struct rusage *rusage);
void proc_reparent(struct proc *child, struct proc *newparent);
struct pstats *pstats_alloc(void);
void pstats_fork(struct pstats *src, struct pstats *dst);
diff --git a/sys/sys/procdesc.h b/sys/sys/procdesc.h
new file mode 100644
index 000000000000..cc8b7166f639
--- /dev/null
+++ b/sys/sys/procdesc.h
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed at the University of Cambridge Computer
+ * Laboratory with support from a grant from Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_PROCDESC_H_
+#define _SYS_PROCDESC_H_
+
+#ifdef _KERNEL
+#include <sys/selinfo.h> /* struct selinfo */
+#include <sys/_lock.h>
+#include <sys/_mutex.h>
+
+/*-
+ * struct procdesc describes a process descriptor, and essentially consists
+ * of two pointers -- one to the file descriptor, and one to the process.
+ * When both become NULL, the process descriptor will be freed. An important
+ * invariant is that there is only ever one process descriptor for a process,
+ * so a single file pointer will suffice.
+ *
+ * Locking key:
+ * (c) - Constant after initial setup.
+ * (p) - Protected by the process descriptor mutex.
+ * (r) - Atomic reference count.
+ * (s) - Protected by selinfo.
+ * (t) - Protected by the proctree_lock
+ */
+struct proc;
+struct sigio;
+struct procdesc {
+ /*
+ * Basic process descriptor state: the process, a cache of its pid to
+ * satisfy queries after the process exits, and process descriptor
+ * refcount.
+ */
+ struct proc *pd_proc; /* (t) Process; NULL once detached. */
+ pid_t pd_pid; /* (c) Cached pid. */
+ u_int pd_refcount; /* (r) Reference count. */
+
+ /*
+ * In-flight data and notification of events.
+ */
+ int pd_flags; /* (p) PDF_ flags. */
+ struct selinfo pd_selinfo; /* (p) Event notification. */
+ struct mtx pd_lock; /* Protect data + events. */
+};
+
+/*
+ * Locking macros for the procdesc itself.
+ */
+#define PROCDESC_LOCK_DESTROY(pd) mtx_destroy(&(pd)->pd_lock)
+#define PROCDESC_LOCK_INIT(pd) mtx_init(&(pd)->pd_lock, "procdesc", NULL, \
+ MTX_DEF)
+#define PROCDESC_LOCK(pd) mtx_lock(&(pd)->pd_lock)
+#define PROCDESC_UNLOCK(pd) mtx_unlock(&(pd)->pd_lock)
+
+/*
+ * Flags for the pd_flags field.
+ */
+#define PDF_CLOSED 0x00000001 /* Descriptor has closed. */
+#define PDF_SELECTED 0x00000002 /* Issue selwakeup(). */
+#define PDF_EXITED 0x00000004 /* Process exited. */
+#define PDF_DAEMON 0x00000008 /* Don't exit when procdesc closes. */
+
+/*
+ * In-kernel interfaces to process descriptors.
+ */
+int procdesc_exit(struct proc *);
+int procdesc_find(struct thread *, int fd, cap_rights_t, struct proc **);
+int kern_pdgetpid(struct thread *, int fd, cap_rights_t, pid_t *pidp);
+void procdesc_new(struct proc *, int);
+void procdesc_finit(struct procdesc *, struct file *);
+pid_t procdesc_pid(struct file *);
+void procdesc_reap(struct proc *);
+
+#else /* !_KERNEL */
+
+/*
+ * Process descriptor system calls.
+ */
+struct rusage;
+int pdfork(int *, int);
+int pdkill(int, int);
+int pdgetpid(int, pid_t *);
+
+#endif /* _KERNEL */
+
+/*
+ * Flags which can be passed to pdfork(2).
+ */
+#define PD_DAEMON 0x00000001 /* Don't exit when procdesc closes. */
+
+#endif /* !_SYS_PROCDESC_H_ */
diff --git a/sys/sys/unistd.h b/sys/sys/unistd.h
index 9d56a3a42730..9e7f7e6d8342 100644
--- a/sys/sys/unistd.h
+++ b/sys/sys/unistd.h
@@ -185,11 +185,12 @@
#define RFTSIGMASK 0xFF
#define RFTSIGNUM(flags) (((flags) >> RFTSIGSHIFT) & RFTSIGMASK)
#define RFTSIGFLAGS(signum) ((signum) << RFTSIGSHIFT)
+#define RFPROCDESC (1<<28) /* return a process descriptor */
#define RFPPWAIT (1<<31) /* parent sleeps until child exits (vfork) */
-#define RFKERNELONLY (RFSTOPPED | RFHIGHPID | RFPPWAIT)
#define RFFLAGS (RFFDG | RFPROC | RFMEM | RFNOWAIT | RFCFDG | \
RFTHREAD | RFSIGSHARE | RFLINUXTHPN | RFSTOPPED | RFHIGHPID | RFTSIGZMB | \
- RFPPWAIT)
+ RFPROCDESC | RFPPWAIT)
+#define RFKERNELONLY (RFSTOPPED | RFHIGHPID | RFPPWAIT | RFPROCDESC)
#endif /* __BSD_VISIBLE */
diff --git a/sys/sys/user.h b/sys/sys/user.h
index ecf4ea940138..a139d4fdfabe 100644
--- a/sys/sys/user.h
+++ b/sys/sys/user.h
@@ -252,6 +252,7 @@ struct user {
#define KF_TYPE_SEM 9
#define KF_TYPE_PTS 10
/* no KF_TYPE_CAPABILITY (11), since capabilities wrap other file objects */
+#define KF_TYPE_PROCDESC 12
#define KF_TYPE_UNKNOWN 255
#define KF_VTYPE_VNON 0
@@ -377,6 +378,9 @@ struct kinfo_file {
/* Round to 64 bit alignment. */
uint32_t kf_pts_pad0[7];
} kf_pts;
+ struct {
+ pid_t kf_pid;
+ } kf_proc;
} kf_un;
uint16_t kf_status; /* Status flags. */
uint16_t kf_pad1; /* Round to 32 bit alignment. */