mirror of
https://github.com/NixOS/nixpkgs.git
synced 2025-06-12 04:35:41 +03:00

Our more thorough parametrised tests uncovered that with the changes for
supporting DynamicUser, we now have the situation that for static users
the root directory within the confined environment is now writable for
the user in question.

This is obviously not what we want and I'd consider that a regression.

However while discussing this with @ju1m and my suggestion being to set
TemporaryFileSystem to "/" (as we had previously), they had an even
better idea[1]:

> The goal is to deny write access to / to non-root users,
>
>   * TemporaryFileSystem=/ gives us that through the ownership of / by
>     root (instead of the service's user inherited from
>     RuntimeDirectory=).
>   * ProtectSystem=strict gives us that by mounting / read-only (while
>     keeping its ownership to the service's user).
>
> To avoid the incompatibilities of TemporaryFileSystem=/ mentioned
> above, I suggest to mount / read-only in all cases with
> ReadOnlyPaths = [ "+/" ]:
>
> ...
>
> I guess this would require at least two changes to the current tests:
>
>   1. to no longer expect root to be able to write to some paths (like
>      /bin) (at least not without first remounting / in read-write
>      mode).
>   2. to no longer expect non-root users to fail to write to certain
>      paths with a "permission denied" error code, but with a
>      "read-only file system" error code.

I like the solution with ReadOnlyPaths even more because it further
reduces the attack surface if the user is root.

In chroot-only mode this is especially useful, since if there are no
other bind-mounted paths involved in the unit configuration, the whole
file system within the confined environment is read-only.

[1]: https://github.com/NixOS/nixpkgs/pull/289593#discussion_r1586794215

Signed-off-by: aszlig <aszlig@nix.build>
205 lines
8.5 KiB
Nix
205 lines
8.5 KiB
Nix
{ config, pkgs, lib, utils, ... }:
|
|
|
|
let
|
|
# The module-level configuration, kept under its own name because the
# per-service submodule further down takes a `config` argument of its own.
toplevelConfig = config;

# Shorthand for the option type constructors used by the declarations below.
types = lib.types;

# Helper used below to derive runtime directory and derivation names from
# a service name.
mkPathSafeName = utils.systemdUtils.lib.mkPathSafeName;
|
|
in {
|
|
options.systemd.services = lib.mkOption {
|
|
type = types.attrsOf (types.submodule ({ name, config, ... }: {
|
|
# Master switch: the per-service configuration, the assertion and the
# generated drop-in defined in this module are all guarded by this flag.
options.confinement.enable = lib.mkOption {
  description = ''
    If set, all the required runtime store paths for this service are
    bind-mounted into a `tmpfs`-based
    {manpage}`chroot(2)`.
  '';
  default = false;
  type = types.bool;
};
|
|
|
|
# When enabled, the serialised unit definition (instead of only the values
# of the Exec* options) is used as the root of the chroot closure.
options.confinement.fullUnit = lib.mkOption {
  description = ''
    Whether to include the full closure of the systemd unit file into the
    chroot, instead of just the dependencies for the executables.

    ::: {.warning}
    While it may be tempting to just enable this option to
    make things work quickly, please be aware that this might add paths
    to the closure of the chroot that you didn't anticipate. It's better
    to use {option}`confinement.packages` to **explicitly** add additional store paths to the
    chroot.
    :::
  '';
  default = false;
  type = types.bool;
};
|
|
|
|
# Extra closure roots for the chroot. The documentation text is assembled
# from the list of Exec* option names so that they are rendered as
# cross-references.
options.confinement.packages = let
  # Renders a `serviceConfig` sub-option name as a documentation reference.
  scRef = optName: "{option}`serviceConfig.${optName}`";

  # All Exec* options except ExecStart, which is appended separately so the
  # sentence reads "A, B, ... and ExecStart".
  leadingRefs = lib.concatStringsSep ", " (map scRef [
    "ExecReload"
    "ExecStartPost"
    "ExecStartPre"
    "ExecStop"
    "ExecStopPost"
  ]);
in lib.mkOption {
  description = ''
    Additional packages or strings with context to add to the closure of
    the chroot. By default, this includes all the packages from the
    ${leadingRefs} and ${scRef "ExecStart"} options. If you want to have all the
    dependencies of this systemd unit, you can use
    {option}`confinement.fullUnit`.

    ::: {.note}
    The store paths listed in {option}`path` are
    **not** included in the closure as
    well as paths from other options except those listed
    above.
    :::
  '';
  default = [];
  type = types.listOf (types.either types.str types.package);
};
|
|
|
|
# Program exposed as /bin/sh inside the chroot; defaults to the system-wide
# `environment.binsh` and can be disabled by setting it to null.
options.confinement.binSh = lib.mkOption {
  description = ''
    The program to make available as {file}`/bin/sh` inside
    the chroot. If this is set to `null`, no
    {file}`/bin/sh` is provided at all.

    This is useful for some applications, which for example use the
    {manpage}`system(3)` library function to execute commands.
  '';
  example = lib.literalExpression "\"\${pkgs.dash}/bin/dash\"";
  defaultText = lib.literalExpression "config.environment.binsh";
  default = toplevelConfig.environment.binsh;
  type = types.nullOr types.path;
};
|
|
|
|
# Selects how much isolation is set up around the chroot. The service
# configuration in this module derives the defaults of the API-VFS related
# hardening options (MountAPIVFS, PrivateDevices, PrivateTmp, ...) from
# whether this is set to "full-apivfs".
options.confinement.mode = lib.mkOption {
  type = types.enum [ "full-apivfs" "chroot-only" ];
  default = "full-apivfs";
  description = ''
    The value `full-apivfs` (the default) sets up
    private {file}`/dev`, {file}`/proc`,
    {file}`/sys`, {file}`/tmp` and {file}`/var/tmp` file systems
    in a separate user name space.

    If this is set to `chroot-only`, only the file
    system name space is set up along with the call to
    {manpage}`chroot(2)`.

    In all cases, unless `serviceConfig.PrivateTmp=true` is set,
    both {file}`/tmp` and {file}`/var/tmp` paths are added to `InaccessiblePaths=`.
    This is to overcome options like `DynamicUser=true`
    implying `PrivateTmp=true` without letting it be turned off.
    Beware however that giving processes the `CAP_SYS_ADMIN` and `@mount` privileges
    can let them undo the effects of `InaccessiblePaths=`.

    ::: {.note}
    This doesn't cover network namespaces and is solely for
    file system level isolation.
    :::
  '';
};
|
|
|
|
# Per-service configuration that is applied only when confinement is
# enabled: the systemd sandboxing settings plus the default closure roots.
config = let
  opts = config.confinement;

  # Name of this service in a form that is safe to embed in a path.
  safeName = mkPathSafeName name;

  # Directory that serves as the root of the confined environment.
  chrootDir = "/run/confinement/${safeName}";

  # Default (overridable, hence mkDefault) for the options that only make
  # sense when the API file systems are wanted.
  apivfsDefault = lib.mkDefault (opts.mode == "full-apivfs");
in lib.mkIf opts.enable {
  serviceConfig = {
    ReadOnlyPaths = [ "+/" ];
    RuntimeDirectory = [ "confinement/${safeName}" ];
    RootDirectory = lib.mkDefault chrootDir;
    InaccessiblePaths = [ "-+${chrootDir}" ];
    PrivateMounts = lib.mkDefault true;

    # https://github.com/NixOS/nixpkgs/issues/14645 is a future attempt
    # to change some of these to default to true.
    #
    # In chroot-only mode, something like PrivateDevices being true by
    # default would mount /dev within the chroot, whereas "chroot-only"
    # is expected to provide no /dev, /proc and /sys file systems at all.
    #
    # Should such a default ever flip to true, the attack surface would
    # grow, so these options are pinned explicitly to true/false
    # according to the selected mode.
    MountAPIVFS = apivfsDefault;
    PrivateDevices = apivfsDefault;
    PrivateTmp = apivfsDefault;
    PrivateUsers = apivfsDefault;
    ProtectControlGroups = apivfsDefault;
    ProtectKernelModules = apivfsDefault;
    ProtectKernelTunables = apivfsDefault;
  };

  confinement.packages = let
    execOptNames = [
      "ExecReload"
      "ExecStart"
      "ExecStartPost"
      "ExecStartPre"
      "ExecStop"
      "ExecStopPost"
    ];

    # Values of all Exec* options that are set, flattened into one list
    # (unset options contribute nothing).
    execValues = lib.flatten
      (map (opt: config.serviceConfig.${opt} or []) execOptNames);

    # The unit definition serialised to JSON; used as the single closure
    # root when `fullUnit` is requested.
    unitJSON = builtins.toJSON toplevelConfig.systemd.units."${name}.service";

    closureRoots = if opts.fullUnit then [ unitJSON ] else execValues;
    shellRoot = lib.optionals (opts.binSh != null) [ opts.binSh ];
  in closureRoots ++ shellRoot;
};
|
|
}));
|
|
};
|
|
|
|
# One assertion per confined service: RootDirectoryStartOnly must not be
# enabled together with confinement.
config.assertions = let
  # Only services with confinement switched on are checked.
  confinedServices =
    lib.filterAttrs (_: cfg: cfg.confinement.enable) config.systemd.services;

  mkAssertion = name: cfg: {
    # An unset RootDirectoryStartOnly counts as false.
    assertion = !(cfg.serviceConfig.RootDirectoryStartOnly or false);
    message = lib.concatStrings [
      "The 'serviceConfig' option 'RootDirectoryStartOnly' for"
      " service '${name}' is enabled in conjunction with"
      " 'confinement.enable'"
      ", but right now systemd"
      " doesn't support restricting bind-mounts to 'ExecStart'."
      " Please either define a separate service or find a way to run"
      " commands other than ExecStart within the chroot."
    ];
  };
in lib.mapAttrsToList mkAssertion confinedServices;
|
|
|
|
# For every confined service, build a package containing a systemd drop-in
# (`<service>.service.d/confinement.conf`) that bind-mounts the store paths
# of the service's closure read-only into the chroot.
config.systemd.packages = lib.concatLists (lib.mapAttrsToList (name: cfg: let
  # Text file listing all closure roots, one per line. It is handed to
  # closureInfo as the root path and excluded from the bind mounts below.
  rootPaths = let
    contents = lib.concatStringsSep "\n" cfg.confinement.packages;
  in pkgs.writeText "${mkPathSafeName name}-string-contexts.txt" contents;

  chrootPaths = pkgs.runCommand "${mkPathSafeName name}-chroot-paths" {
    closureInfo = pkgs.closureInfo { inherit rootPaths; };
    serviceName = "${name}.service";
    excludedPath = rootPaths;
  } ''
    mkdir -p "$out/lib/systemd/system/$serviceName.d"
    serviceFile="$out/lib/systemd/system/$serviceName.d/confinement.conf"

    echo '[Service]' > "$serviceFile"

    # /bin/sh is special here, because the option value could contain a
    # symlink and we need to properly resolve it.
    ${lib.optionalString (cfg.confinement.binSh != null) ''
      binsh=${lib.escapeShellArg cfg.confinement.binSh}
      realprog="$(readlink -e "$binsh")"
      echo "BindReadOnlyPaths=$realprog:/bin/sh" >> "$serviceFile"
    ''}

    # If DynamicUser= is enabled, PrivateTmp=true is implied (and cannot be
    # turned off), so make /tmp and /var/tmp inaccessible unless
    # PrivateTmp=true is explicitly set.
    ${lib.optionalString (!cfg.serviceConfig.PrivateTmp) ''
      echo "InaccessiblePaths=-+/tmp" >> "$serviceFile"
      echo "InaccessiblePaths=-+/var/tmp" >> "$serviceFile"
    ''}

    # Read each path verbatim: `IFS=` keeps surrounding whitespace and `-r`
    # stops backslashes from being interpreted.
    while IFS= read -r storePath; do
      if [ -L "$storePath" ]; then
        # Currently, systemd can't cope with symlinks in Bind(ReadOnly)Paths,
        # so let's just bind-mount the target to that location.
        echo "BindReadOnlyPaths=$(readlink -e "$storePath"):$storePath"
      elif [ "$storePath" != "$excludedPath" ]; then
        echo "BindReadOnlyPaths=$storePath"
      fi
    done < "$closureInfo/store-paths" >> "$serviceFile"
  '';
in lib.optional cfg.confinement.enable chrootPaths) config.systemd.services);
|
|
}
|