From e029c1dc2f01e3256d73f456fa27db64d9aaadf6 Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Wed, 11 Aug 2021 22:16:07 +0000 Subject: [PATCH 1/8] selinux-policy: add proc type Previously, an unprivileged process trying to write to its own files under `/proc/self` could trigger an "associate" denial since `/proc` has a filesystem context of `any_t`. Giving `/proc` its own label lets subject labels be associated without also letting them be created on filesystems like `/run` and `/tmp`. Signed-off-by: Ben Cressey --- packages/selinux-policy/fs.cil | 4 ++-- packages/selinux-policy/object.cil | 12 +++++++++--- packages/selinux-policy/rules.cil | 3 +++ 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/packages/selinux-policy/fs.cil b/packages/selinux-policy/fs.cil index 315f60e0ca7..9a45f65e75a 100644 --- a/packages/selinux-policy/fs.cil +++ b/packages/selinux-policy/fs.cil @@ -26,7 +26,7 @@ (genfscon debugfs / any) (genfscon kvmfs / any) (genfscon nsfs / any) -(genfscon proc / any) +(genfscon proc / proc) (genfscon pstore / any) (genfscon ramfs / any) (genfscon rootfs / any) @@ -81,7 +81,7 @@ (filecon "/var/lib/netdog/.*" any lease) ; Label kernel filesystem mounts. -(filecon "/proc" any any) +(filecon "/proc" any proc) (filecon "/proc/.*" any ()) (filecon "/sys" any any) (filecon "/sys/.*" any ()) diff --git a/packages/selinux-policy/object.cil b/packages/selinux-policy/object.cil index 37f398727e3..04ff11b9612 100644 --- a/packages/selinux-policy/object.cil +++ b/packages/selinux-policy/object.cil @@ -39,8 +39,13 @@ (roletype object_r runtime_exec_t) (context runtime_exec (system_u object_r runtime_exec_t s0)) +; Files under /proc. +(type proc_t) +(roletype object_r proc_t) +(context proc (system_u object_r proc_t s0)) + ; Files where we have no specific policy objectives, such as -; those on kernel filesystems like /proc and /dev. +; tmpfs mounts and various kernel filesystems. 
(type any_t) (roletype object_r any_t) (context any (system_u object_r any_t s0)) @@ -133,14 +138,15 @@ ; from the rest of the OS, such as tmpfs filesystems, EBS volumes, ; and EFS filesystems. (typeattribute ephemeral_o) -(typeattributeset ephemeral_o (any_t external_t unlabeled_t)) +(typeattributeset ephemeral_o ( + any_t external_t proc_t unlabeled_t)) ; The set of all objects. (typeattribute all_o) (typeattributeset all_o ( os_t init_exec_t api_exec_t clock_exec_t network_exec_t bus_exec_t runtime_exec_t - any_t etc_t unlabeled_t external_t + any_t etc_t external_t proc_t unlabeled_t local_t private_t secret_t cache_t lease_t measure_t state_t api_socket_t)) diff --git a/packages/selinux-policy/rules.cil b/packages/selinux-policy/rules.cil index 521f59ad8e2..4b7abfb2de1 100644 --- a/packages/selinux-policy/rules.cil +++ b/packages/selinux-policy/rules.cil @@ -200,6 +200,9 @@ ; be useful for containers, and we don't use it in the host. (neverallow all_s global (files (block))) +; All subject labels can be used for files on /proc. +(allow all_s proc_t (filesystem (associate))) + ; All object labels can be used for files on filesystems that have ; the same label, and for files on ephemeral storage. (allow all_o self (filesystem (associate))) From 7534d22f22684d9cd8159d8e3b5f9a47d32c7318 Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Wed, 11 Aug 2021 22:27:42 +0000 Subject: [PATCH 2/8] selinux-policy: drop external and unlabeled types These types were mostly treated like `any_t` and `local_t`, where all processes could freely write to the files. The special case was for trusted processes, where directories created on an `unlabeled_t` path would end up with the `local_t` label. In combination with some mount options, this caused files on `/local` to end up with the right labels, even if the filesystem was created on a system without SELinux enabled. This might happen when using a custom disk image as the source for the secondary storage volume. 
However, a filesystem that's created by a bootstrap container won't necessarily be mounted with the right options, and the `unlabeled_t` label would continue to propagate. That would prevent the named file transitions used to label Docker and containerd directories from taking place, which would make them less secure. We can simplify the policy and avoid this problem by treating unknown or unrecognized types as already having the `local_t` label. Signed-off-by: Ben Cressey --- packages/selinux-policy/fs.cil | 5 +++-- packages/selinux-policy/object.cil | 26 ++++++++++---------------- packages/selinux-policy/rules.cil | 5 ----- packages/selinux-policy/sid.cil | 8 ++++---- 4 files changed, 17 insertions(+), 27 deletions(-) diff --git a/packages/selinux-policy/fs.cil b/packages/selinux-policy/fs.cil index 9a45f65e75a..457aace294a 100644 --- a/packages/selinux-policy/fs.cil +++ b/packages/selinux-policy/fs.cil @@ -97,7 +97,8 @@ (filecon "/run/.*" any ()) ; Label external filesystem mounts. -(filecon "/mnt" any external) +(filecon "/mnt" any local) (filecon "/mnt/.*" any ()) -(filecon "/media" any external) +(filecon "/media" any local) +(filecon "/media/cdrom" any local) (filecon "/media/.*" any ()) diff --git a/packages/selinux-policy/object.cil b/packages/selinux-policy/object.cil index 04ff11b9612..162932250ed 100644 --- a/packages/selinux-policy/object.cil +++ b/packages/selinux-policy/object.cil @@ -55,16 +55,18 @@ (roletype object_r etc_t) (context etc (system_u object_r etc_t s0)) -; Files that have no label, or perhaps an invalid label. -(type unlabeled_t) -(roletype object_r unlabeled_t) -(context unlabeled (system_u object_r unlabeled_t s0)) - ; Files created on local storage. (type local_t) (roletype object_r local_t) (context local (system_u object_r local_t s0)) +; The "external_t" and "unlabeled_t" types were removed to simplify +; the policy. Add aliases for backwards compatibility. 
+(typealias external_t) +(typealias unlabeled_t) +(typealiasactual external_t local_t) +(typealiasactual unlabeled_t local_t) + ; Alias "container_file_t" to "local_t" for compatibility with ; the container-selinux policy. (typealias container_file_t) @@ -110,11 +112,6 @@ (roletype object_r secret_t) (context secret (system_u object_r secret_t s0)) -; Files that are mount points for external filesystems. -(type external_t) -(roletype object_r external_t) -(context external (system_u object_r external_t s0)) - ; Dynamic objects are files on temporary storage with special rules. (typeattribute dynamic_o) (typeattributeset dynamic_o (etc_t)) @@ -134,19 +131,16 @@ os_t init_exec_t api_exec_t clock_exec_t network_exec_t bus_exec_t runtime_exec_t)) -; Ephemeral objects reside on storage with a different lifecycle -; from the rest of the OS, such as tmpfs filesystems, EBS volumes, -; and EFS filesystems. +; Ephemeral objects reside on tmpfs filesystems. (typeattribute ephemeral_o) -(typeattributeset ephemeral_o ( - any_t external_t proc_t unlabeled_t)) +(typeattributeset ephemeral_o (any_t proc_t)) ; The set of all objects. (typeattribute all_o) (typeattributeset all_o ( os_t init_exec_t api_exec_t clock_exec_t network_exec_t bus_exec_t runtime_exec_t - any_t etc_t external_t proc_t unlabeled_t + any_t etc_t proc_t local_t private_t secret_t cache_t lease_t measure_t state_t api_socket_t)) diff --git a/packages/selinux-policy/rules.cil b/packages/selinux-policy/rules.cil index 4b7abfb2de1..4c1e76cfe6d 100644 --- a/packages/selinux-policy/rules.cil +++ b/packages/selinux-policy/rules.cil @@ -85,11 +85,6 @@ ; Allow containers to communicate with runtimes via pipes. (allow container_s runtime_t (files (mutate))) -; If a trusted process creates a file or directory when the parent -; directory has no label, it receives the "local_t" label. 
-(typetransition trusted_s unlabeled_t file local_t) -(typetransition trusted_s unlabeled_t dir local_t) - ; If a runtime process creates a directory for cached container archives ; or snapshot layers on local storage, it receives the "cache_t" label. ; ... containerd's pristine archives diff --git a/packages/selinux-policy/sid.cil b/packages/selinux-policy/sid.cil index f4152759ed7..8aff8cb7fd5 100644 --- a/packages/selinux-policy/sid.cil +++ b/packages/selinux-policy/sid.cil @@ -42,10 +42,10 @@ (sidcontext security kernel) (sidcontext devnull kernel) -; Apply the "unlabeled" context for entities with an invalid context, -; and for files with no context at all, which are treated the same. -(sidcontext unlabeled unlabeled) -(sidcontext file unlabeled) +; Apply the "local" context for entities with an invalid context, and +; for files with no context at all, which are treated the same. +(sidcontext unlabeled local) +(sidcontext file local) ; Apply the "any" context for entities like sockets, ports, and ; network interfaces if they are otherwise unlabeled. From eddc9a40f4352ff0c7e5e57b6c8e725fedc4b468 Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Fri, 13 Aug 2021 21:50:52 +0000 Subject: [PATCH 3/8] release: drop context mount options for /local The mount options are no longer needed, now that objects with missing or invalid labels are treated as `local_t`. 
Signed-off-by: Ben Cressey --- packages/release/prepare-local.service | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/release/prepare-local.service b/packages/release/prepare-local.service index 36536ee834d..d190ba6c0c1 100644 --- a/packages/release/prepare-local.service +++ b/packages/release/prepare-local.service @@ -10,7 +10,6 @@ After=dev-disk-by\x2dpartlabel-BOTTLEROCKET\x2dDATA.device Type=oneshot Environment=BOTTLEROCKET_DATA=/dev/disk/by-partlabel/BOTTLEROCKET-DATA Environment=LOCAL_DIR=/local -Environment=CONTEXT="system_u:object_r:local_t:s0" # To "grow" the partition, we delete it and recreate it at the larger size, then # write it back to the device. udevd observes the write via inotify, and tells @@ -28,7 +27,7 @@ ExecStart=/usr/sbin/growpart ${BOTTLEROCKET_DATA} # depend on the link, and would immediately transition to the failed state when the # link is removed. systemd will create local.mount for us as a side effect. ExecStart=/usr/bin/mount \ - -o defaults,noatime,nosuid,nodev,fscontext="${CONTEXT}",defcontext="${CONTEXT}",rootcontext="${CONTEXT}" \ + -o defaults,noatime,nosuid,nodev \ ${BOTTLEROCKET_DATA} ${LOCAL_DIR} # After the mount is active, we grow the filesystem to fill the resized partition, From 1533cde72d692bc8afb85ccafd2c0fb4a2aff6c8 Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Fri, 13 Aug 2021 23:55:09 +0000 Subject: [PATCH 4/8] release: restrict writes to /usr/src/kernels Loading a kernel module is a privileged operation, so writing to the location where build files like `objtool` are stored should also be privileged. 
Signed-off-by: Ben Cressey --- packages/release/usr-src-kernels.mount.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/release/usr-src-kernels.mount.in b/packages/release/usr-src-kernels.mount.in index 1c1eb7ddfab..65d6f012e1e 100644 --- a/packages/release/usr-src-kernels.mount.in +++ b/packages/release/usr-src-kernels.mount.in @@ -8,7 +8,7 @@ Before=local-fs.target umount.target What=overlay Where=PREFIX/src/kernels Type=overlay -Options=noatime,nosuid,nodev,lowerdir=/var/lib/kernel-devel/lower,upperdir=/var/lib/kernel-devel/upper,workdir=/var/lib/kernel-devel/work,context=system_u:object_r:local_t:s0 +Options=noatime,nosuid,nodev,lowerdir=/var/lib/kernel-devel/lower,upperdir=/var/lib/kernel-devel/upper,workdir=/var/lib/kernel-devel/work,context=system_u:object_r:state_t:s0 [Install] WantedBy=preconfigured.target From d2a2b5db172c2e61976822723b9f12b013cb94fd Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Fri, 13 Aug 2021 23:45:41 +0000 Subject: [PATCH 5/8] selinux-policy: add distinct type for container files Previously we had two use cases for `local_t`. It was the label used for most files and directories on `/local`, and therefore the label that most hostPath mounts would have. It was also the label of the container root filesystem, and therefore the label that external volumes, emptyDir mounts, and other private storage would have. For MCS isolation, it's useful to have distinct types to assert that one type always has a level with categories, and another type never does. That way, the constraints can be applied only to the files that are meant to be private to a pod or container. 
Signed-off-by: Ben Cressey --- packages/selinux-policy/lxc_contexts | 2 +- packages/selinux-policy/object.cil | 16 +++++++++++++--- packages/selinux-policy/rules.cil | 18 +++++++++++------- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/packages/selinux-policy/lxc_contexts b/packages/selinux-policy/lxc_contexts index 248efd243db..bd7c4c7493b 100644 --- a/packages/selinux-policy/lxc_contexts +++ b/packages/selinux-policy/lxc_contexts @@ -6,7 +6,7 @@ process = "system_u:system_r:container_t:s0" # The 'file' label should always be applied to the container's root # filesystem, regardless of privileged status or automatic labeling. -file = "system_u:object_r:local_t:s0" +file = "system_u:object_r:data_t:s0" # The 'ro_file' label is not currently used by the above runtimes. ro_file = "system_u:object_r:cache_t:s0" diff --git a/packages/selinux-policy/object.cil b/packages/selinux-policy/object.cil index 162932250ed..cce7e4e1acf 100644 --- a/packages/selinux-policy/object.cil +++ b/packages/selinux-policy/object.cil @@ -67,8 +67,14 @@ (typealiasactual external_t local_t) (typealiasactual unlabeled_t local_t) -; Alias "container_file_t" to "local_t" for compatibility with -; the container-selinux policy. +; Files created by containers, or on their behalf. +(type data_t) +(roletype object_r data_t) +(context data (system_u object_r data_t s0)) + +; Alias "container_file_t" to "local_t" for compatibility with the +; container-selinux policy. Ideally it would be aliased to `data_t` +; but then kubelet applies the wrong label to plugin directories. (typealias container_file_t) (typealiasactual container_file_t local_t) @@ -116,6 +122,10 @@ (typeattribute dynamic_o) (typeattributeset dynamic_o (etc_t)) +; Shared objects are files on local storage for containers. +(typeattribute shared_o) +(typeattributeset shared_o (local_t data_t)) + ; Protected objects are files on local storage with special rules. 
(typeattribute protected_o) (typeattributeset protected_o ( @@ -141,6 +151,6 @@ os_t init_exec_t api_exec_t clock_exec_t network_exec_t bus_exec_t runtime_exec_t any_t etc_t proc_t - local_t private_t secret_t cache_t + local_t data_t private_t secret_t cache_t lease_t measure_t state_t api_socket_t)) diff --git a/packages/selinux-policy/rules.cil b/packages/selinux-policy/rules.cil index 4c1e76cfe6d..dd43d3e1ada 100644 --- a/packages/selinux-policy/rules.cil +++ b/packages/selinux-policy/rules.cil @@ -70,11 +70,11 @@ ; Runtimes that use the Go SELinux library will override this label ; with the "process" label from the `lxc_contexts` when launching ; unprivileged containers, unless automatic labeling is disabled. -(typetransition runtime_t local_t process control_t) +(typetransition runtime_t data_t process control_t) (typetransition runtime_t cache_t process control_t) (typetransition runtime_t secret_t process control_t) (allow runtime_t container_s (processes (transform))) -(allow container_s local_t (file (entrypoint))) +(allow container_s data_t (file (entrypoint))) (allow container_s cache_t (file (entrypoint))) (allow container_s secret_t (file (entrypoint))) @@ -135,8 +135,8 @@ (neverallow other_s dynamic_o (files (mutate mount))) ; Most subjects are allowed to write to and manage mounts for -; "local" files and directories on /local. -(allow unconfined_s local_t (files (mutate mount))) +; most of the files and directories on /local. +(allow unconfined_s shared_o (files (mutate mount))) ; Subjects that control the OS, including helpers spawned by apiserver, can ; write to and manage mounts for "secret" files and directories on /local. @@ -154,8 +154,8 @@ (neverallow unprivileged_s state_t (files (mutate mount))) (neverallow unprivileged_s secret_t (files (mutate mount))) -; Confined subjects cannot modify "state", "secret", or "local" files. 
-(neverallow confined_s local_t (files (mutate mount))) +; Confined subjects cannot modify "state", "secret", or "shared" files. +(neverallow confined_s shared_o (files (mutate mount))) (neverallow confined_s state_t (files (mutate mount))) (neverallow confined_s secret_t (files (mutate mount))) @@ -203,9 +203,13 @@ (allow all_o self (filesystem (associate))) (allow all_o ephemeral_o (filesystem (associate))) -; Protected object labels can also be used on local storage. +; Protected object labels can be used on local storage. (allow protected_o local_t (filesystem (associate))) +; The data object label can also be used, so that volume types like +; emptyDir can be relabeled on behalf of containers. +(allow data_t local_t (filesystem (associate))) + ; Containers are allowed to relax security constraints, since we ; don't control what code they run or how it's built. (allow container_s self (processes (relax))) From b126b1ca0374fa3c5f41ee192894e9c2d0f7a260 Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Thu, 19 Aug 2021 22:44:10 +0000 Subject: [PATCH 6/8] selinux-policy: use target's range for new files If `defaultrange` is not specified in the policy, the lower part of the range from the source process is applied to all new files. Unprivileged containers will run with a process label that includes two category pairs, so the files get the label we expect. Privileged containers, on the other hand, may run with these labels: * `system_u:system_r:control_t:s0-s0:c0.c1023` * `system_u:system_r:super_t:s0` In both cases, the lower range of the process is just `s0`, and files would end up with that. This would allow unprivileged containers to also modify the files. We can avoid this by using the target's range instead, since Docker and containerd CRI will ensure that volume mounts are labeled with the appropriate range. 
Signed-off-by: Ben Cressey --- packages/selinux-policy/base.cil | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/packages/selinux-policy/base.cil b/packages/selinux-policy/base.cil index 4d25e920d39..18e66d1cf83 100644 --- a/packages/selinux-policy/base.cil +++ b/packages/selinux-policy/base.cil @@ -28,6 +28,13 @@ (userlevel system_u s0-s0) (userrange system_u s0-s0) +; Take the context from the target file rather than the source +; process when computing the level for new file objects. We can +; expect the directory where files are created to have the right +; range of categories applied, but the process creating the file +; may be privileged and have the full range or no range at all. +(defaultrange files target low-high) + ; Enable policy to use consolidated network peer controls. This ; avoids a function call to the compatibility mode helper, and ; will be faster when no network labeling rules are defined. From bb14322edd16bcf59d0dbe1baafe6f26ad6f76a3 Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Thu, 19 Aug 2021 23:21:36 +0000 Subject: [PATCH 7/8] selinux-policy: set range for privileged containers Conceptually, anything with the `control_t` label has access to all categories. Setting the range makes this explicit in the output of tools like `ps`. Signed-off-by: Ben Cressey --- packages/selinux-policy/rules.cil | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/selinux-policy/rules.cil b/packages/selinux-policy/rules.cil index dd43d3e1ada..be94e7e3e8d 100644 --- a/packages/selinux-policy/rules.cil +++ b/packages/selinux-policy/rules.cil @@ -57,8 +57,6 @@ (allow bus_t bus_exec_t (file (entrypoint))) ; PID 1 starts container runtimes as "runtime_t". -; The level range is adjusted to span all categories at the same time, -; to support Docker's use of MCS labels. 
(typetransition init_t runtime_exec_t process runtime_t) (allow init_t runtime_t (processes (transform))) (allow runtime_t runtime_exec_t (file (entrypoint))) @@ -78,6 +76,12 @@ (allow container_s cache_t (file (entrypoint))) (allow container_s secret_t (file (entrypoint))) +; Adjust the level range to span all categories, since privileged +; containers won't get an MCS pair assigned. +(rangetransition runtime_t data_t process s0-s0) +(rangetransition runtime_t cache_t process s0-s0) +(rangetransition runtime_t secret_t process s0-s0) + ; Also allow entry to container domains through `docker-init`, which ; is mounted from the root filesystem and used as the init process. (allow container_s runtime_exec_t (file (entrypoint))) From fa78efd1ddedf48d4292cb78fb44d9e6d0a9e796 Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Sat, 14 Aug 2021 20:07:20 +0000 Subject: [PATCH 8/8] host-ctr: label containers with all categories Setting the level gives our host and bootstrap containers the same range of categories as all other privileged containers, and means that all containers will run with some categories specified. 
Signed-off-by: Ben Cressey --- sources/host-ctr/cmd/host-ctr/main.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sources/host-ctr/cmd/host-ctr/main.go b/sources/host-ctr/cmd/host-ctr/main.go index 5cc8bf41c92..15fe975a169 100644 --- a/sources/host-ctr/cmd/host-ctr/main.go +++ b/sources/host-ctr/cmd/host-ctr/main.go @@ -622,7 +622,7 @@ func withSuperpowered() oci.SpecOpts { oci.WithParentCgroupDevices, oci.WithPrivileged, oci.WithNewPrivileges, - oci.WithSelinuxLabel("system_u:system_r:super_t:s0"), + oci.WithSelinuxLabel("system_u:system_r:super_t:s0-s0:c0.c1023"), oci.WithAllDevicesAllowed, ) } @@ -634,7 +634,7 @@ func withBootstrap() oci.SpecOpts { return oci.Compose( withPrivilegedMounts(), withRootFsShared(), - oci.WithSelinuxLabel("system_u:system_r:control_t:s0"), + oci.WithSelinuxLabel("system_u:system_r:control_t:s0-s0:c0.c1023"), // Bootstrap containers don't require all "privileges", we only add the // `CAP_SYS_ADMIN` capability. `WithDefaultProfile` will create the proper // seccomp profile based on the container's capabilities. @@ -647,7 +647,7 @@ func withBootstrap() oci.SpecOpts { // withDefault adds container options for non-privileged containers func withDefault() oci.SpecOpts { return oci.Compose( - oci.WithSelinuxLabel("system_u:system_r:control_t:s0"), + oci.WithSelinuxLabel("system_u:system_r:control_t:s0-s0:c0.c1023"), // Non-privileged containers only have access to a subset of the devices oci.WithDefaultUnixDevices, // No additional capabilities required for non-privileged containers