PR_SET_SECCOMP:
A pre-existing option for enabling strict seccomp mode (1) or
- filtering seccomp (13).
+ filtering seccomp (13). Performing this call (13) will enable
+ the current set of defined filters.
Usage:
prctl(PR_SET_SECCOMP, 1); /* strict */
-----------
Changing the availability of the kernel ABI at runtime runs the risk of
-providing access to normally unreachable code paths in normal
-applications. To avoid the pitfalls that accompany this risk, seccomp
-filters inheritance is restricted.
-
-In general, filters can be inherited across fork/clone, but only when
-they are active (e.g., PR_SET_SECCOMP has been set to 13) and not prior
-to use. Inheriting only active filters stops a parent process from
-adding filters that may undermine the child process security or create
-unexpected behavior after an execve.
-
-For example, a parent process may add a rule to exposes a system call
-that was not normally part of the child process' filter set. When the
-child process configures its filters, it would have to check
-/proc/self/seccomp_filter to ensure nothing unexpected has been added.
-The standard inheritance behavior ensures this suboptimal situation is
-avoided.
-
-Inheritance across execve follows a subset of this behavior. In
-particular, execve can only be added to the allowed filter set by a
-process with CAP_SYS_ADMIN privileges. The result is that an
-unprivileged process can never create a seccomp filter set that can be
-inherited across execve. To further guarantee this behavior, any
-unprivileged modifications to a seccomp filter set will forcibly
-clear execve. The end result is that a privileged parent may install
-a set of seccomp filters and, at any point in the hierarchy, a child may
-make a private version of the inherited filter set with their own
-changes applied but execve blocked.
+providing access to unreachable code paths in normal applications. To
+avoid the pitfalls that accompany this risk, seccomp filter inheritance
+is restricted.
+
+Filters may be inherited across a fork/clone if they have been activated
+by a call to prctl(PR_SET_SECCOMP, 13). If the process had the
+CAP_SYS_ADMIN capability when configuring the filters, they may also be
+inherited across an execve call.
+
+Inherited filters may not be modified by the child process. If a child
+would like to further restrict the available system calls, it may
+perform the same calls as discussed earlier: set, clear, get, and
+finally prctl(PR_SET_SECCOMP, 13). Until the child calls
+PR_SET_SECCOMP, its own filters will be ignored and only the inherited
+filters will be evaluated. After a successful PR_SET_SECCOMP call, all
+system calls performed by the child will be checked against the filters
+that were specified by itself and then against the filters supplied by
+any ancestors. Any system call used must be allowed by all of the
+filters tested. This composition of the ancestral seccomp filters and
+process-local filters guarantees that only the minimal set of system calls
+will be permitted at any point and without child processes needing to be
+aware of any prior system call filtering.
+
+(If, for instance, the child process merely copied the parent filters
+and then extended them, the child would be required to enumerate all
+existing filters to determine which needed to be dropped.)
Caveats
(_tsk)->seccomp.filters = NULL; \
} while (0)
-/* Do nothing unless seccomp filtering is active. If not, the execve boundary
- * can not be cleanly enforced and preset filters may leak across execve calls.
- */
-#define seccomp_filter_fork(_tsk, _orig) do { \
- if ((_tsk)->seccomp.mode) { \
- (_tsk)->seccomp.mode = (_orig)->seccomp.mode; \
- mutex_lock(&(_orig)->seccomp.filters_guard); \
- (_tsk)->seccomp.filters = \
- get_seccomp_filters((_orig)->seccomp.filters); \
- mutex_unlock(&(_orig)->seccomp.filters_guard); \
- } \
-} while (0)
+extern void seccomp_filter_fork(struct task_struct *child,
+ struct task_struct *parent);
/* No locking is needed here because the task_struct will
* have no parallel consumers.
extern long seccomp_clear_filter(int);
extern long seccomp_get_filter(int, char *, unsigned long);
+extern long seccomp_enable_filters(void);
+
extern long prctl_set_seccomp_filter(unsigned long, unsigned long,
char __user *);
extern long prctl_get_seccomp_filter(unsigned long, unsigned long,
struct seccomp_filters { };
#define seccomp_filter_init_task(_tsk) do { } while (0)
-#define seccomp_filter_fork(_tsk, _orig) do { } while (0)
+static inline void seccomp_filter_fork(struct task_struct *child,
+ struct task_struct *parent) { }
#define seccomp_filter_free_task(_tsk) do { } while (0)
static inline int seccomp_show_filters(struct seccomp_filters *filters,
#include <linux/kallsyms.h>
#include <linux/kref.h>
#include <linux/perf_event.h>
+#include <linux/pid.h>
#include <linux/prctl.h>
#include <linux/seccomp.h>
#include <linux/security.h>
* get/put helpers should be used when accessing an instance
* outside of a lifetime-guarded section. In general, this
* is only needed for handling shared filters across tasks.
+ * @creator: pointer to the pid that created this filter
+ * @parent: pointer to the ancestor which this filter will be composed with.
* @count: size of @event_filters
* @filter: tree of pointers to seccomp filters
*
* seccomp_filters objects should never be modified after being attached
- * to a task_struct.
+ * to a task_struct (other than @usage).
*/
struct seccomp_filters {
struct kref usage;
+ struct pid *creator;
+ struct seccomp_filters *parent;
struct {
uint32_t compat:1,
- __reserved:31;
+ enabled:1,
+ __reserved:30;
} flags;
uint16_t count;
struct btree_head32 tree;
/*
* Make ftrace support optional
*/
-
#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PERF_EVENTS)
#include <asm/syscall.h>
kref_init(&f->usage);
if (btree_init32(&f->tree))
return ERR_PTR(-ENOMEM);
+ f->creator = get_task_pid(current, PIDTYPE_PID);
return f;
}
free_event_filter(ef);
}
btree_destroy32(&filters->tree);
+ put_seccomp_filters(filters->parent);
+ put_pid(filters->creator);
kfree(filters);
}
}
dst->count++;
}
+ dst->parent = get_seccomp_filters(src->parent);
+ if (dst->creator)
+ put_pid(dst->creator);
+ dst->creator = get_pid(src->creator);
done:
return ret;
return orig;
}
+/* filters_created_by_parent: tests whether @filters came from another task
+ * @filters: filter set to inspect; may be NULL
+ *
+ * Compares the pid recorded in @filters->creator against the pid of
+ * 'current'.  Returns 1 when they differ and 0 when they match or when
+ * @filters is NULL.
+ */
+static int filters_created_by_parent(struct seccomp_filters *filters)
+{
+	struct pid *self;
+	int inherited;
+
+	if (!filters)
+		return 0;
+
+	/* Hold a reference on our own pid only for the comparison. */
+	self = get_task_pid(current, PIDTYPE_PID);
+	inherited = (self != filters->creator) ? 1 : 0;
+	put_pid(self);
+
+	return inherited;
+}
+}
+
/**
* seccomp_test_filters - tests 'current' against the given syscall
* @syscall: number of the system call to test
mutex_lock(&current->seccomp.filters_guard);
/* No reference counting is done. filters_guard should protect the
- * lifetime of any existing pointer below.
+ * lifetime of any existing pointer below using the task reference.
+ * Parents will be protected by the held references.
*/
filters = current->seccomp.filters;
+
+ /* Inherited filters will always be enabled so skip ahead here. */
+ if (filters && !filters->flags.enabled)
+ filters = filters->parent;
+
+ /* Without any enabled filters, no system calls will be allowed. */
if (!filters)
goto out;
- if (filters_compat_mismatch(filters)) {
- pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
- current->comm, task_pid_nr(current));
- goto out;
- }
+ /* Only allow a system call if it is allowed in all ancestors. */
+ for ( ; filters != NULL; filters = filters->parent) {
+ if (filters_compat_mismatch(filters)) {
+ pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
+ current->comm, task_pid_nr(current));
+ goto out;
+ }
- filter = btree_lookup32(&filters->tree, syscall);
- if (!filter)
- goto out;
+ filter = btree_lookup32(&filters->tree, syscall);
+ if (!filter)
+ goto out;
- ret = 0;
- if (IS_ALLOW_FILTER(filter))
+ if (IS_ALLOW_FILTER(filter) || filter_match_current(filter))
+ continue;
goto out;
-
- if (!filter_match_current(filter))
- ret = -EACCES;
+ }
+ /* If the loop terminates normally, the syscall is approved. */
+ ret = 0;
out:
mutex_unlock(&current->seccomp.filters_guard);
return ret;
{
int nr;
struct event_filter *ef;
+
if (!filters)
goto out;
- btree_for_each_safe32(&filters->tree, nr, ef) {
- const char *filter_string = SECCOMP_FILTER_ALLOW;
- seq_printf(m, "%d (%s): ", nr, syscall_nr_to_name(nr));
- if (!IS_ALLOW_FILTER(ef))
- filter_string = get_filter_string(ef);
- seq_printf(m, "%s\n", filter_string);
+ for ( ; filters; filters = filters->parent) {
+ seq_printf(m, "Enabled: %d\n", filters->flags.enabled);
+ seq_printf(m, "Inherited: %d\n",
+ filters_created_by_parent(filters));
+ btree_for_each_safe32(&filters->tree, nr, ef) {
+ const char *filter_string = SECCOMP_FILTER_ALLOW;
+ seq_printf(m, "%d (%s): ", nr, syscall_nr_to_name(nr));
+ if (!IS_ALLOW_FILTER(ef))
+ filter_string = get_filter_string(ef);
+ seq_printf(m, "%s\n", filter_string);
+ }
+ seq_printf(m, "--\n");
}
out:
return 0;
mutex_lock(&current->seccomp.filters_guard);
filters = current->seccomp.filters;
- if (!filters)
+ if (!filters || filters_created_by_parent(filters))
goto out;
ret = -ENOENT;
mutex_lock(&current->seccomp.filters_guard);
orig_filters = current->seccomp.filters;
- if (!orig_filters)
+ if (!orig_filters || filters_created_by_parent(orig_filters))
goto out;
if (filters_compat_mismatch(orig_filters))
*/
long seccomp_set_filter(int syscall_nr, char *filter)
{
- struct seccomp_filters *filters = NULL, *orig_filters = NULL;
+ struct seccomp_filters *filters = NULL, *orig_filters = NULL,
+ *parent_filters = NULL;
struct event_filter *ef = NULL;
long ret = -EPERM;
orig_filters = current->seccomp.filters;
+ if (filters_created_by_parent(orig_filters)) {
+ parent_filters = orig_filters;
+ orig_filters = NULL;
+ }
+
/* After the first call, compatibility mode is selected permanently. */
ret = -EACCES;
if (filters_compat_mismatch(orig_filters))
ef = btree_lookup32(&orig_filters->tree, syscall_nr);
if (!ef) {
- /* Don't allow DENYs to be changed when in a seccomp mode */
+ /* A new filter cannot be added to an active filters set. */
ret = -EACCES;
- if (current->seccomp.mode)
+ if (current->seccomp.mode && orig_filters &&
+ orig_filters->flags.enabled)
goto out;
}
get_seccomp_filters(filters); /* simplify the error paths */
current->seccomp.filters = filters;
+ if (parent_filters)
+ filters->parent = parent_filters; /* already have a ref */
put_seccomp_filters(orig_filters); /* for the task */
out:
put_seccomp_filters(filters); /* for get or task, on err */
}
EXPORT_SYMBOL_GPL(seccomp_set_filter);
+/**
+ * seccomp_enable_filters - marks the filters set up by 'current' as enabled
+ *
+ * A seccomp_filters object is not modified once attached to a task, so
+ * the attached set is copied, the copy is flagged as enabled, and the
+ * copy then replaces current->seccomp.filters.  All of this happens
+ * under current->seccomp.filters_guard.
+ *
+ * Returns 0 on success or when no filters are attached, -EINVAL when the
+ * attached filters were not created by 'current' (i.e. were inherited),
+ * or the error produced by seccomp_filters_alloc()/seccomp_filters_copy().
+ */
+long seccomp_enable_filters(void)
+{
+	struct seccomp_filters *filters = NULL, *orig_filters = NULL;
+	long ret = 0;
+
+	mutex_lock(&current->seccomp.filters_guard);
+	/* Rely on the task reference */
+	orig_filters = current->seccomp.filters;
+	if (!orig_filters)
+		goto out;
+
+	/* Cannot re-enable inherited filters */
+	ret = -EINVAL;
+	if (filters_created_by_parent(orig_filters))
+		goto out;
+
+	filters = seccomp_filters_alloc();
+	if (IS_ERR(filters)) {
+		ret = PTR_ERR(filters);
+		/* Never hand an ERR_PTR value to put_seccomp_filters(). */
+		filters = NULL;
+		goto out;
+	}
+
+	ret = seccomp_filters_copy(filters, orig_filters);
+	if (ret)
+		goto out;
+
+	/* Do the real work */
+	filters->flags.enabled = 1;
+
+	get_seccomp_filters(filters);  /* simplify the error paths */
+
+	current->seccomp.filters = filters;
+	put_seccomp_filters(orig_filters);  /* for the task */
+out:
+	put_seccomp_filters(filters);  /* for get or task, on err */
+	mutex_unlock(&current->seccomp.filters_guard);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_enable_filters);
+
long prctl_set_seccomp_filter(unsigned long id_type,
unsigned long id,
char __user *user_filter)
kfree(buf);
return ret;
}
+
+/* seccomp_filter_fork: manages seccomp filter inheritance on fork
+ * @child: forkee; receives @parent's seccomp mode and a filters reference
+ * @parent: forker; nothing is inherited when its seccomp mode is unset
+ *
+ * Ensures that @child inherits a seccomp_filters iff seccomp is enabled
+ * and the set of filters is marked as 'enabled'.  When @parent's current
+ * set is not marked enabled, @child is handed a reference to that set's
+ * ->parent instead and the reference on the unenabled set is dropped.
+ */
+void seccomp_filter_fork(struct task_struct *child,
+ struct task_struct *parent)
+{
+ if (!parent->seccomp.mode)
+ return;
+ child->seccomp.mode = parent->seccomp.mode;
+ mutex_lock(&parent->seccomp.filters_guard);
+ child->seccomp.filters = get_seccomp_filters(parent->seccomp.filters);
+ mutex_unlock(&parent->seccomp.filters_guard);
+ /* If @parent's filters are not active, then inherit the enabled
+ * ancestor if there is one. The resulting pointer may be NULL, e.g.
+ * when the seccomp mode does not use seccomp_filters.
+ */
+ if (child->seccomp.filters && !child->seccomp.filters->flags.enabled) {
+ struct seccomp_filters *enabled =
+ get_seccomp_filters(child->seccomp.filters->parent);
+ put_seccomp_filters(child->seccomp.filters);
+ child->seccomp.filters = enabled;
+ }
+}