tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

SandboxFilter.cpp (83852B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
      5 * You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include "SandboxFilter.h"
      8 
      9 #include <asm/ioctls.h>    // For TCGETS2
     10 #include <asm/termbits.h>  // For termios2
     11 #include <errno.h>
     12 #include <fcntl.h>
     13 #include <linux/ioctl.h>
     14 #include <linux/ipc.h>
     15 #include <linux/memfd.h>
     16 #include <linux/mman.h>
     17 #include <linux/net.h>
     18 #include <linux/sched.h>
     19 #include <linux/sockios.h>
     20 #include <string.h>
     21 #include <sys/ioctl.h>
     22 #include <sys/mman.h>
     23 #include <sys/prctl.h>
     24 #include <sys/socket.h>
     25 #include <sys/syscall.h>
     26 #include <sys/un.h>
     27 #include <sys/utsname.h>
     28 #include <time.h>
     29 #include <unistd.h>
     30 // This has to go after <sys/socket.h> for annoying reasons
     31 #include <linux/wireless.h>
     32 
     33 #include <algorithm>
     34 #include <utility>
     35 
     36 #include "PlatformMacros.h"
     37 #include "Sandbox.h"  // for ContentProcessSandboxParams
     38 #include "SandboxBrokerClient.h"
     39 #include "SandboxFilterUtil.h"
     40 #include "SandboxInfo.h"
     41 #include "SandboxInternal.h"
     42 #include "SandboxLogging.h"
     43 #include "SandboxOpenedFiles.h"
     44 #include "mozilla/PodOperations.h"
     45 #include "mozilla/ProcInfo_linux.h"
     46 #include "mozilla/UniquePtr.h"
     47 #include "prenv.h"
     48 #include "sandbox/linux/bpf_dsl/bpf_dsl.h"
     49 #include "sandbox/linux/system_headers/linux_seccomp.h"
     50 #include "sandbox/linux/system_headers/linux_syscalls.h"
     51 
     52 #if defined(GP_PLAT_amd64_linux) && defined(GP_ARCH_amd64) && \
     53    defined(MOZ_USING_WASM_SANDBOXING)
     54 #  include <asm/prctl.h>  // For ARCH_SET_GS
     55 #endif
     56 
     57 using namespace sandbox::bpf_dsl;
     58 
     59 // Fill in defines in case of old headers.
     60 // (Warning: these are wrong on PA-RISC.)
     61 #ifndef MADV_HUGEPAGE
     62 #  define MADV_HUGEPAGE 14
     63 #endif
     64 #ifndef MADV_NOHUGEPAGE
     65 #  define MADV_NOHUGEPAGE 15
     66 #endif
     67 #ifndef MADV_DONTDUMP
     68 #  define MADV_DONTDUMP 16
     69 #endif
     70 
     71 // Added in Linux 4.5; see bug 1303813.
     72 #ifndef MADV_FREE
     73 #  define MADV_FREE 8
     74 #endif
     75 
     76 #ifndef PR_SET_PTRACER
     77 #  define PR_SET_PTRACER 0x59616d61
     78 #endif
     79 
     80 // Linux 5.17+
     81 #ifndef PR_SET_VMA
     82 #  define PR_SET_VMA 0x53564d41
     83 #endif
     84 #ifndef PR_SET_VMA_ANON_NAME
     85 #  define PR_SET_VMA_ANON_NAME 0
     86 #endif
     87 
     88 // The GNU libc headers define O_LARGEFILE as 0 on x86_64, but we need the
     89 // actual value because it shows up in file flags.
     90 #if !defined(O_LARGEFILE) || O_LARGEFILE == 0
     91 #  define O_LARGEFILE_REAL 00100000
     92 #else
     93 #  define O_LARGEFILE_REAL O_LARGEFILE
     94 #endif
     95 
     96 // Not part of UAPI, but userspace sees it in F_GETFL; see bug 1650751.
     97 #define FMODE_NONOTIFY 0x4000000
     98 
     99 #ifndef F_LINUX_SPECIFIC_BASE
    100 #  define F_LINUX_SPECIFIC_BASE 1024
    101 #else
    102 static_assert(F_LINUX_SPECIFIC_BASE == 1024);
    103 #endif
    104 
    105 #ifndef F_ADD_SEALS
    106 #  define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
    107 #  define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
    108 #else
    109 static_assert(F_ADD_SEALS == (F_LINUX_SPECIFIC_BASE + 9));
    110 static_assert(F_GET_SEALS == (F_LINUX_SPECIFIC_BASE + 10));
    111 #endif
    112 
    113 // Added in Linux 6.13
    114 #ifndef MADV_GUARD_INSTALL
    115 #  define MADV_GUARD_INSTALL 102
    116 #  define MADV_GUARD_REMOVE 103
    117 #else
    118 static_assert(MADV_GUARD_INSTALL == 102);
    119 static_assert(MADV_GUARD_REMOVE == 103);
    120 #endif
    121 
    122 // Added in Linux 4.14
    123 #ifndef MFD_HUGETLB
    124 #  define MFD_HUGETLB 4U
    125 #  define MFD_HUGE_MASK MAP_HUGE_MASK
    126 #  define MFD_HUGE_SHIFT MAP_HUGE_SHIFT
    127 #else
    128 static_assert(MFD_HUGE_MASK == MAP_HUGE_MASK);
    129 static_assert(MFD_HUGE_SHIFT == MAP_HUGE_SHIFT);
    130 #endif
    131 
    132 // Added in Linux 6.10
    133 #ifndef F_DUPFD_QUERY
    134 #  define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3)
    135 #else
    136 static_assert(F_DUPFD_QUERY == (F_LINUX_SPECIFIC_BASE + 3));
    137 #endif
    138 
    139 // To avoid visual confusion between "ifdef ANDROID" and "ifndef ANDROID":
    140 #ifndef ANDROID
    141 #  define DESKTOP
    142 #endif
    143 
    144 namespace {
    145 static const unsigned long kIoctlTypeMask = _IOC_TYPEMASK << _IOC_TYPESHIFT;
    146 static const unsigned long kTtyIoctls = TIOCSTI & kIoctlTypeMask;
    147 // On some older architectures (but not x86 or ARM), ioctls are
    148 // assigned type fields differently, and the TIOC/TC/FIO group
    149 // isn't all the same type.  If/when we support those archs,
    150 // this would need to be revised (but really this should be a
    151 // default-deny policy; see below).
    152 static_assert(kTtyIoctls == (TCSETA & kIoctlTypeMask) &&
    153                  kTtyIoctls == (FIOASYNC & kIoctlTypeMask),
    154              "tty-related ioctls use the same type");
    155 };  // namespace
    156 
    157 // This file defines the seccomp-bpf system call filter policies.
    158 // See also SandboxFilterUtil.h, for the CASES_FOR_* macros and
    159 // SandboxFilterBase::Evaluate{Socket,Ipc}Call.
    160 //
    161 // One important difference from how Chromium bpf_dsl filters are
    162 // normally interpreted: returning -ENOSYS from a Trap() handler
    163 // indicates an unexpected system call; SigSysHandler() in Sandbox.cpp
    164 // will detect this, request a crash dump, and terminate the process.
    165 // This does not apply to using Error(ENOSYS) in the policy, so that
    166 // can be used if returning an actual ENOSYS is needed.
    167 
    168 namespace mozilla {
    169 
    170 // This class allows everything used by the sandbox itself, by the
    171 // core IPC code, by the crash reporter, or other core code.  It also
    172 // contains support for brokering file operations, but file access is
    173 // denied if no broker client is provided by the concrete class.
    174 class SandboxPolicyCommon : public SandboxPolicyBase {
    175 protected:
    176  // Subclasses can assign these in their constructors to loosen the
    177  // default settings.
    178  SandboxBrokerClient* mBroker = nullptr;
    179  bool mMayCreateShmem = false;
    180  bool mAllowUnsafeSocketPair = false;
    181  bool mBrokeredConnect = false;  // Can connect() be brokered?
    182 
    183  SandboxPolicyCommon() = default;
    184 
    185  typedef const arch_seccomp_data& ArgsRef;
    186 
    187  static intptr_t BlockedSyscallTrap(ArgsRef aArgs, void* aux) {
    188    MOZ_ASSERT(!aux);
    189    return -ENOSYS;
    190  }
    191 
    192  // Convert Unix-style "return -1 and set errno" APIs back into the
    193  // Linux ABI "return -err" style.
    194  static intptr_t ConvertError(long rv) { return rv < 0 ? -errno : rv; }
    195 
    196  template <typename... Args>
    197  static intptr_t DoSyscall(long nr, Args... args) {
    198    static_assert(std::conjunction_v<
    199                      std::conditional_t<(sizeof(Args) <= sizeof(void*)),
    200                                         std::true_type, std::false_type>...>,
    201                  "each syscall arg is at most one word");
    202    return ConvertError(syscall(nr, args...));
    203  }
    204 
    205  // Mesa's amdgpu driver uses kcmp with KCMP_FILE; see also bug
    206  // 1624743.  This policy restricts it to the process's own pid,
    207  // which should be sufficient on its own if we need to remove the
    208  // `type` restriction in the future.
    209  //
    210  // (Note: if we end up with more Mesa-specific hooks needed in
    211  // several process types, we could put them into this class's
    212  // EvaluateSyscall guarded by a boolean member variable, or
    213  // introduce another layer of subclassing.)
    214  ResultExpr KcmpPolicyForMesa() const {
    215    // The real KCMP_FILE is part of an anonymous enum in
    216    // <linux/kcmp.h>, but we can't depend on having that header,
    217    // and it's not a #define so the usual #ifndef approach
    218    // doesn't work.
    219    static const int kKcmpFile = 0;
    220    const pid_t myPid = getpid();
    221    Arg<pid_t> pid1(0), pid2(1);
    222    Arg<int> type(2);
    223    return If(AllOf(pid1 == myPid, pid2 == myPid, type == kKcmpFile), Allow())
    224        .Else(InvalidSyscall());
    225  }
    226 
    227  static intptr_t SchedTrap(ArgsRef aArgs, void* aux) {
    228    const pid_t tid = syscall(__NR_gettid);
    229    if (aArgs.args[0] == static_cast<uint64_t>(tid)) {
    230      return DoSyscall(aArgs.nr, 0, static_cast<uintptr_t>(aArgs.args[1]),
    231                       static_cast<uintptr_t>(aArgs.args[2]),
    232                       static_cast<uintptr_t>(aArgs.args[3]),
    233                       static_cast<uintptr_t>(aArgs.args[4]),
    234                       static_cast<uintptr_t>(aArgs.args[5]));
    235    }
    236    return -EPERM;
    237  }
    238 
    239 private:
    240  // Bug 1093893: Translate tkill to tgkill for pthread_kill; fixed in
    241  // bionic commit 10c8ce59a (in JB and up; API level 16 = Android 4.1).
    242  // Bug 1376653: musl also needs this, and security-wise it's harmless.
    243  static intptr_t TKillCompatTrap(ArgsRef aArgs, void* aux) {
    244    auto tid = static_cast<pid_t>(aArgs.args[0]);
    245    auto sig = static_cast<int>(aArgs.args[1]);
    246    return DoSyscall(__NR_tgkill, getpid(), tid, sig);
    247  }
    248 
    249  static intptr_t SetNoNewPrivsTrap(ArgsRef& aArgs, void* aux) {
    250    if (gSetSandboxFilter == nullptr) {
    251      // Called after BroadcastSetThreadSandbox finished, therefore
    252      // not our doing and not expected.
    253      return BlockedSyscallTrap(aArgs, nullptr);
    254    }
    255    // Signal that the filter is already in place.
    256    return -ETXTBSY;
    257  }
    258 
    259  // Trap handlers for filesystem brokering.
    260  // (The amount of code duplication here could be improved....)
    261 #ifdef __NR_open
    262  static intptr_t OpenTrap(ArgsRef aArgs, void* aux) {
    263    auto broker = static_cast<SandboxBrokerClient*>(aux);
    264    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    265    auto flags = static_cast<int>(aArgs.args[1]);
    266    return broker->Open(path, flags);
    267  }
    268 
    269  static intptr_t AccessTrap(ArgsRef aArgs, void* aux) {
    270    auto broker = static_cast<SandboxBrokerClient*>(aux);
    271    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    272    auto mode = static_cast<int>(aArgs.args[1]);
    273    return broker->Access(path, mode);
    274  }
    275 
    276  static intptr_t StatTrap(ArgsRef aArgs, void* aux) {
    277    auto broker = static_cast<SandboxBrokerClient*>(aux);
    278    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    279    auto buf = reinterpret_cast<statstruct*>(aArgs.args[1]);
    280    return broker->Stat(path, buf);
    281  }
    282 
    283  static intptr_t LStatTrap(ArgsRef aArgs, void* aux) {
    284    auto broker = static_cast<SandboxBrokerClient*>(aux);
    285    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    286    auto buf = reinterpret_cast<statstruct*>(aArgs.args[1]);
    287    return broker->LStat(path, buf);
    288  }
    289 
    290  static intptr_t ChmodTrap(ArgsRef aArgs, void* aux) {
    291    auto broker = static_cast<SandboxBrokerClient*>(aux);
    292    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    293    auto mode = static_cast<mode_t>(aArgs.args[1]);
    294    return broker->Chmod(path, mode);
    295  }
    296 
    297  static intptr_t LinkTrap(ArgsRef aArgs, void* aux) {
    298    auto broker = static_cast<SandboxBrokerClient*>(aux);
    299    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    300    auto path2 = reinterpret_cast<const char*>(aArgs.args[1]);
    301    return broker->Link(path, path2);
    302  }
    303 
    304  static intptr_t SymlinkTrap(ArgsRef aArgs, void* aux) {
    305    auto broker = static_cast<SandboxBrokerClient*>(aux);
    306    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    307    auto path2 = reinterpret_cast<const char*>(aArgs.args[1]);
    308    return broker->Symlink(path, path2);
    309  }
    310 
    311  static intptr_t RenameTrap(ArgsRef aArgs, void* aux) {
    312    auto broker = static_cast<SandboxBrokerClient*>(aux);
    313    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    314    auto path2 = reinterpret_cast<const char*>(aArgs.args[1]);
    315    return broker->Rename(path, path2);
    316  }
    317 
    318  static intptr_t MkdirTrap(ArgsRef aArgs, void* aux) {
    319    auto broker = static_cast<SandboxBrokerClient*>(aux);
    320    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    321    auto mode = static_cast<mode_t>(aArgs.args[1]);
    322    return broker->Mkdir(path, mode);
    323  }
    324 
    325  static intptr_t RmdirTrap(ArgsRef aArgs, void* aux) {
    326    auto broker = static_cast<SandboxBrokerClient*>(aux);
    327    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    328    return broker->Rmdir(path);
    329  }
    330 
    331  static intptr_t UnlinkTrap(ArgsRef aArgs, void* aux) {
    332    auto broker = static_cast<SandboxBrokerClient*>(aux);
    333    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    334    if (path && path[0] == '\0') {
    335      // If the path is empty, then just fail the call here
    336      return -ENOENT;
    337    }
    338    return broker->Unlink(path);
    339  }
    340 
    341  static intptr_t ReadlinkTrap(ArgsRef aArgs, void* aux) {
    342    auto broker = static_cast<SandboxBrokerClient*>(aux);
    343    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    344    auto buf = reinterpret_cast<char*>(aArgs.args[1]);
    345    auto size = static_cast<size_t>(aArgs.args[2]);
    346    return broker->Readlink(path, buf, size);
    347  }
    348 #endif  // __NR_open
    349 
    350  static intptr_t OpenAtTrap(ArgsRef aArgs, void* aux) {
    351    auto broker = static_cast<SandboxBrokerClient*>(aux);
    352    auto fd = static_cast<int>(aArgs.args[0]);
    353    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    354    auto flags = static_cast<int>(aArgs.args[2]);
    355    if (fd != AT_FDCWD && path[0] != '/') {
    356      SANDBOX_LOG("unsupported fd-relative openat(%d, \"%s\", 0%o)", fd, path,
    357                  flags);
    358      return BlockedSyscallTrap(aArgs, nullptr);
    359    }
    360    return broker->Open(path, flags);
    361  }
    362 
    363  static intptr_t AccessAtTrap(ArgsRef aArgs, void* aux) {
    364    auto broker = static_cast<SandboxBrokerClient*>(aux);
    365    auto fd = static_cast<int>(aArgs.args[0]);
    366    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    367    auto mode = static_cast<int>(aArgs.args[2]);
    368    // Linux's faccessat syscall has no "flags" argument.  Attempting
    369    // to handle the flags != 0 case is left to userspace; this is
    370    // impossible to do correctly in all cases, but that's not our
    371    // problem.
    372    //
    373    // Starting with kernel 5.8+ and glibc 2.33, there is faccessat2 that
    374    // supports flags, handled below.
    375    if (fd != AT_FDCWD && path[0] != '/') {
    376      SANDBOX_LOG("unsupported fd-relative faccessat(%d, \"%s\", %d)", fd, path,
    377                  mode);
    378      return BlockedSyscallTrap(aArgs, nullptr);
    379    }
    380    return broker->Access(path, mode);
    381  }
    382 
    383  static intptr_t AccessAt2Trap(ArgsRef aArgs, void* aux) {
    384    auto* broker = static_cast<SandboxBrokerClient*>(aux);
    385    auto fd = static_cast<int>(aArgs.args[0]);
    386    const auto* path = reinterpret_cast<const char*>(aArgs.args[1]);
    387    auto mode = static_cast<int>(aArgs.args[2]);
    388    auto flags = static_cast<int>(aArgs.args[3]);
    389    if (fd != AT_FDCWD && path[0] != '/') {
    390      SANDBOX_LOG("unsupported fd-relative faccessat2(%d, \"%s\", %d, %d)", fd,
    391                  path, mode, flags);
    392      return BlockedSyscallTrap(aArgs, nullptr);
    393    }
    394    if ((flags & ~AT_EACCESS) == 0) {
    395      return broker->Access(path, mode);
    396    }
    397    return ConvertError(ENOSYS);
    398  }
    399 
    400  static intptr_t StatAtTrap(ArgsRef aArgs, void* aux) {
    401    auto broker = static_cast<SandboxBrokerClient*>(aux);
    402    auto fd = static_cast<int>(aArgs.args[0]);
    403    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    404    auto buf = reinterpret_cast<statstruct*>(aArgs.args[2]);
    405    auto flags = static_cast<int>(aArgs.args[3]);
    406 
    407    if (fd != AT_FDCWD && (flags & AT_EMPTY_PATH) && path &&
    408        !strcmp(path, "")) {
    409 #ifdef __NR_fstat64
    410      return DoSyscall(__NR_fstat64, fd, buf);
    411 #else
    412      return DoSyscall(__NR_fstat, fd, buf);
    413 #endif
    414    }
    415 
    416    if (!broker) {
    417      return BlockedSyscallTrap(aArgs, nullptr);
    418    }
    419 
    420    if (fd != AT_FDCWD && path && path[0] != '/') {
    421      SANDBOX_LOG("unsupported fd-relative fstatat(%d, \"%s\", %p, 0x%x)", fd,
    422                  path, buf, flags);
    423      return BlockedSyscallTrap(aArgs, nullptr);
    424    }
    425 
    426    int badFlags = flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT);
    427    if (badFlags != 0) {
    428      SANDBOX_LOG("unsupported flags 0x%x in fstatat(%d, \"%s\", %p, 0x%x)",
    429                  badFlags, fd, path, buf, flags);
    430      return BlockedSyscallTrap(aArgs, nullptr);
    431    }
    432    return (flags & AT_SYMLINK_NOFOLLOW) == 0 ? broker->Stat(path, buf)
    433                                              : broker->LStat(path, buf);
    434  }
    435 
    436  static intptr_t ChmodAtTrap(ArgsRef aArgs, void* aux) {
    437    auto broker = static_cast<SandboxBrokerClient*>(aux);
    438    auto fd = static_cast<int>(aArgs.args[0]);
    439    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    440    auto mode = static_cast<mode_t>(aArgs.args[2]);
    441    auto flags = static_cast<int>(aArgs.args[3]);
    442    if (fd != AT_FDCWD && path[0] != '/') {
    443      SANDBOX_LOG("unsupported fd-relative chmodat(%d, \"%s\", 0%o, %d)", fd,
    444                  path, mode, flags);
    445      return BlockedSyscallTrap(aArgs, nullptr);
    446    }
    447    if (flags != 0) {
    448      SANDBOX_LOG("unsupported flags in chmodat(%d, \"%s\", 0%o, %d)", fd, path,
    449                  mode, flags);
    450      return BlockedSyscallTrap(aArgs, nullptr);
    451    }
    452    return broker->Chmod(path, mode);
    453  }
    454 
    455  static intptr_t LinkAtTrap(ArgsRef aArgs, void* aux) {
    456    auto broker = static_cast<SandboxBrokerClient*>(aux);
    457    auto fd = static_cast<int>(aArgs.args[0]);
    458    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    459    auto fd2 = static_cast<int>(aArgs.args[2]);
    460    auto path2 = reinterpret_cast<const char*>(aArgs.args[3]);
    461    auto flags = static_cast<int>(aArgs.args[4]);
    462    if ((fd != AT_FDCWD && path[0] != '/') ||
    463        (fd2 != AT_FDCWD && path2[0] != '/')) {
    464      SANDBOX_LOG(
    465          "unsupported fd-relative linkat(%d, \"%s\", %d, \"%s\", 0x%x)", fd,
    466          path, fd2, path2, flags);
    467      return BlockedSyscallTrap(aArgs, nullptr);
    468    }
    469    if (flags != 0) {
    470      SANDBOX_LOG("unsupported flags in linkat(%d, \"%s\", %d, \"%s\", 0x%x)",
    471                  fd, path, fd2, path2, flags);
    472      return BlockedSyscallTrap(aArgs, nullptr);
    473    }
    474    return broker->Link(path, path2);
    475  }
    476 
    477  static intptr_t SymlinkAtTrap(ArgsRef aArgs, void* aux) {
    478    auto broker = static_cast<SandboxBrokerClient*>(aux);
    479    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    480    auto fd2 = static_cast<int>(aArgs.args[1]);
    481    auto path2 = reinterpret_cast<const char*>(aArgs.args[2]);
    482    if (fd2 != AT_FDCWD && path2[0] != '/') {
    483      SANDBOX_LOG("unsupported fd-relative symlinkat(\"%s\", %d, \"%s\")", path,
    484                  fd2, path2);
    485      return BlockedSyscallTrap(aArgs, nullptr);
    486    }
    487    return broker->Symlink(path, path2);
    488  }
    489 
    490  static intptr_t RenameAtTrap(ArgsRef aArgs, void* aux) {
    491    auto broker = static_cast<SandboxBrokerClient*>(aux);
    492    auto fd = static_cast<int>(aArgs.args[0]);
    493    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    494    auto fd2 = static_cast<int>(aArgs.args[2]);
    495    auto path2 = reinterpret_cast<const char*>(aArgs.args[3]);
    496    if ((fd != AT_FDCWD && path[0] != '/') ||
    497        (fd2 != AT_FDCWD && path2[0] != '/')) {
    498      SANDBOX_LOG("unsupported fd-relative renameat(%d, \"%s\", %d, \"%s\")",
    499                  fd, path, fd2, path2);
    500      return BlockedSyscallTrap(aArgs, nullptr);
    501    }
    502    return broker->Rename(path, path2);
    503  }
    504 
    505  static intptr_t MkdirAtTrap(ArgsRef aArgs, void* aux) {
    506    auto broker = static_cast<SandboxBrokerClient*>(aux);
    507    auto fd = static_cast<int>(aArgs.args[0]);
    508    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    509    auto mode = static_cast<mode_t>(aArgs.args[2]);
    510    if (fd != AT_FDCWD && path[0] != '/') {
    511      SANDBOX_LOG("unsupported fd-relative mkdirat(%d, \"%s\", 0%o)", fd, path,
    512                  mode);
    513      return BlockedSyscallTrap(aArgs, nullptr);
    514    }
    515    return broker->Mkdir(path, mode);
    516  }
    517 
    518  static intptr_t UnlinkAtTrap(ArgsRef aArgs, void* aux) {
    519    auto broker = static_cast<SandboxBrokerClient*>(aux);
    520    auto fd = static_cast<int>(aArgs.args[0]);
    521    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    522    auto flags = static_cast<int>(aArgs.args[2]);
    523    if (path && path[0] == '\0') {
    524      // If the path is empty, then just fail the call here
    525      return -ENOENT;
    526    }
    527    if (fd != AT_FDCWD && path[0] != '/') {
    528      SANDBOX_LOG("unsupported fd-relative unlinkat(%d, \"%s\", 0x%x)", fd,
    529                  path, flags);
    530      return BlockedSyscallTrap(aArgs, nullptr);
    531    }
    532    int badFlags = flags & ~AT_REMOVEDIR;
    533    if (badFlags != 0) {
    534      SANDBOX_LOG("unsupported flags 0x%x in unlinkat(%d, \"%s\", 0x%x)",
    535                  badFlags, fd, path, flags);
    536      return BlockedSyscallTrap(aArgs, nullptr);
    537    }
    538    return (flags & AT_REMOVEDIR) == 0 ? broker->Unlink(path)
    539                                       : broker->Rmdir(path);
    540  }
    541 
    542  static intptr_t ReadlinkAtTrap(ArgsRef aArgs, void* aux) {
    543    auto broker = static_cast<SandboxBrokerClient*>(aux);
    544    auto fd = static_cast<int>(aArgs.args[0]);
    545    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    546    auto buf = reinterpret_cast<char*>(aArgs.args[2]);
    547    auto size = static_cast<size_t>(aArgs.args[3]);
    548    if (fd != AT_FDCWD && path[0] != '/') {
    549      SANDBOX_LOG("unsupported fd-relative readlinkat(%d, %s, %p, %d)", fd,
    550                  path, buf, size);
    551      return BlockedSyscallTrap(aArgs, nullptr);
    552    }
    553    return broker->Readlink(path, buf, size);
    554  }
    555 
    556  static intptr_t SocketpairDatagramTrap(ArgsRef aArgs, void* aux) {
    557    auto fds = reinterpret_cast<int*>(aArgs.args[3]);
    558    // Return sequential packet sockets instead of the expected
    559    // datagram sockets; see bug 1355274 for details.
    560    return ConvertError(socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds));
    561  }
    562 
    563  static intptr_t SocketcallUnpackTrap(ArgsRef aArgs, void* aux) {
    564 #ifdef __NR_socketcall
    565    auto argsPtr = reinterpret_cast<const unsigned long*>(aArgs.args[1]);
    566    int sysno = -1;
    567 
    568    // When Linux added separate syscalls for socket operations on the
    569    // old socketcall platforms, they had long since stopped adding
    570    // send and recv syscalls, because they can be trivially mapped
    571    // onto sendto and recvfrom (see also open vs. openat).
    572    //
    573    // But, socketcall itself *does* have separate calls for those.
    574    // So, we need to remap them; since send(to) and recv(from)
    575    // have basically the same types except for const, the code is
    576    // factored out here.
    577    unsigned long altArgs[6];
    578    auto legacySendRecvWorkaround = [&] {
    579      MOZ_ASSERT(argsPtr != altArgs);
    580      memcpy(altArgs, argsPtr, sizeof(unsigned long[4]));
    581      altArgs[4] = altArgs[5] = 0;
    582      argsPtr = altArgs;
    583    };
    584 
    585    switch (aArgs.args[0]) {
    586      // See also the other socketcall table in SandboxFilterUtil.cpp
    587 #  define DISPATCH_SOCKETCALL(this_sysno, this_call) \
    588    case this_call:                                  \
    589      sysno = this_sysno;                            \
    590      break
    591 
    592      DISPATCH_SOCKETCALL(__NR_socketpair, SYS_SOCKETPAIR);
    593      DISPATCH_SOCKETCALL(__NR_getsockopt, SYS_GETSOCKOPT);
    594      DISPATCH_SOCKETCALL(__NR_sendmsg, SYS_SENDMSG);
    595      DISPATCH_SOCKETCALL(__NR_recvmsg, SYS_RECVMSG);
    596      DISPATCH_SOCKETCALL(__NR_sendto, SYS_SENDTO);
    597      DISPATCH_SOCKETCALL(__NR_recvfrom, SYS_RECVFROM);
    598      DISPATCH_SOCKETCALL(__NR_sendmmsg, SYS_SENDMMSG);
    599      DISPATCH_SOCKETCALL(__NR_recvmmsg, SYS_RECVMMSG);
    600      // __NR_recvmmsg_time64 is not available as a socketcall; a
    601      // Y2K38-ready userland would call it directly.
    602 #  undef DISPATCH_SOCKETCALL
    603 
    604      case SYS_SEND:
    605        sysno = __NR_sendto;
    606        legacySendRecvWorkaround();
    607        break;
    608      case SYS_RECV:
    609        sysno = __NR_recvfrom;
    610        legacySendRecvWorkaround();
    611        break;
    612    }
    613 
    614    // This assert will fail if someone tries to map a socketcall to
    615    // this trap without adding it to the switch statement above.
    616    MOZ_RELEASE_ASSERT(sysno >= 0);
    617 
    618    return DoSyscall(sysno, argsPtr[0], argsPtr[1], argsPtr[2], argsPtr[3],
    619                     argsPtr[4], argsPtr[5]);
    620 
    621 #else  // no socketcall
    622    MOZ_CRASH("unreachable?");
    623    return -ENOSYS;
    624 #endif
    625  }
    626 
    627  // This just needs to return something to stand in for the
    628  // unconnected socket until ConnectTrap, below, and keep track of
    629  // the socket type somehow.  Half a socketpair *is* a socket, so it
    630  // should result in minimal confusion in the caller.
    631  static intptr_t FakeSocketTrapCommon(int domain, int type, int protocol) {
    632    int fds[2];
    633    // X11 client libs will still try to getaddrinfo() even for a
    634    // local connection.  Also, WebRTC still has vestigial network
    635    // code trying to do things in the content process.  Politely tell
    636    // them no.
    637    if (domain != AF_UNIX) {
    638      return -EAFNOSUPPORT;
    639    }
    640    if (socketpair(domain, type, protocol, fds) != 0) {
    641      return -errno;
    642    }
    643    close(fds[1]);
    644    return fds[0];
    645  }
    646 
    647  static intptr_t FakeSocketTrap(ArgsRef aArgs, void* aux) {
    648    return FakeSocketTrapCommon(static_cast<int>(aArgs.args[0]),
    649                                static_cast<int>(aArgs.args[1]),
    650                                static_cast<int>(aArgs.args[2]));
    651  }
    652 
    653  static intptr_t FakeSocketTrapLegacy(ArgsRef aArgs, void* aux) {
    654    const auto innerArgs = reinterpret_cast<unsigned long*>(aArgs.args[1]);
    655 
    656    return FakeSocketTrapCommon(static_cast<int>(innerArgs[0]),
    657                                static_cast<int>(innerArgs[1]),
    658                                static_cast<int>(innerArgs[2]));
    659  }
    660 
    661  static Maybe<int> DoGetSockOpt(int fd, int optname) {
    662    int optval;
    663    socklen_t optlen = sizeof(optval);
    664 
    665    if (getsockopt(fd, SOL_SOCKET, optname, &optval, &optlen) != 0) {
    666      return Nothing();
    667    }
    668    MOZ_RELEASE_ASSERT(static_cast<size_t>(optlen) == sizeof(optval));
    669    return Some(optval);
    670  }
    671 
    672  // Substitute the newly connected socket from the broker for the
    673  // original socket.  This is meant to be used on a fd from
    674  // FakeSocketTrap, above, but it should also work to simulate
    675  // re-connect()ing a real connected socket.
    676  //
    677  // Warning: This isn't quite right if the socket is dup()ed, because
    678  // other duplicates will still be the original socket, but hopefully
    679  // nothing we're dealing with does that.
    680  static intptr_t ConnectTrapCommon(SandboxBrokerClient* aBroker, int aFd,
    681                                    const struct sockaddr_un* aAddr,
    682                                    socklen_t aLen) {
    683    if (aFd < 0) {
    684      return -EBADF;
    685    }
    686    const auto maybeDomain = DoGetSockOpt(aFd, SO_DOMAIN);
    687    if (!maybeDomain) {
    688      return -errno;
    689    }
    690    if (*maybeDomain != AF_UNIX) {
    691      return -EAFNOSUPPORT;
    692    }
    693    const auto maybeType = DoGetSockOpt(aFd, SO_TYPE);
    694    if (!maybeType) {
    695      return -errno;
    696    }
    697    const int oldFlags = fcntl(aFd, F_GETFL);
    698    if (oldFlags == -1) {
    699      return -errno;
    700    }
    701    const int newFd = aBroker->Connect(aAddr, aLen, *maybeType);
    702    if (newFd < 0) {
    703      return newFd;
    704    }
    705    // Copy over the nonblocking flag.  The connect() won't be
    706    // nonblocking in that case, but that shouldn't matter for
    707    // AF_UNIX.  The other fcntl-settable flags are either irrelevant
    708    // for sockets (e.g., O_APPEND) or would be blocked by this
    709    // seccomp-bpf policy, so they're ignored.
    710    if (fcntl(newFd, F_SETFL, oldFlags & O_NONBLOCK) != 0) {
    711      close(newFd);
    712      return -errno;
    713    }
    714    if (dup2(newFd, aFd) < 0) {
    715      close(newFd);
    716      return -errno;
    717    }
    718    close(newFd);
    719    return 0;
    720  }
    721 
    722  static intptr_t ConnectTrap(ArgsRef aArgs, void* aux) {
    723    typedef const struct sockaddr_un* AddrPtr;
    724 
    725    return ConnectTrapCommon(static_cast<SandboxBrokerClient*>(aux),
    726                             static_cast<int>(aArgs.args[0]),
    727                             reinterpret_cast<AddrPtr>(aArgs.args[1]),
    728                             static_cast<socklen_t>(aArgs.args[2]));
    729  }
    730 
    731  static intptr_t ConnectTrapLegacy(ArgsRef aArgs, void* aux) {
    732    const auto innerArgs = reinterpret_cast<unsigned long*>(aArgs.args[1]);
    733    typedef const struct sockaddr_un* AddrPtr;
    734 
    735    return ConnectTrapCommon(static_cast<SandboxBrokerClient*>(aux),
    736                             static_cast<int>(innerArgs[0]),
    737                             reinterpret_cast<AddrPtr>(innerArgs[1]),
    738                             static_cast<socklen_t>(innerArgs[2]));
    739  }
    740 
    741  static intptr_t StatFsTrap(ArgsRef aArgs, void* aux) {
    742    // Warning: the kernel interface is not the C interface.  The
    743    // structs are different (<asm/statfs.h> vs. <sys/statfs.h>), and
    744    // the statfs64 version takes an additional size parameter.
    745    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    746    int fd = open(path, O_RDONLY | O_LARGEFILE);
    747    if (fd < 0) {
    748      return -errno;
    749    }
    750 
    751    intptr_t rv;
    752    switch (aArgs.nr) {
    753      case __NR_statfs: {
    754        auto buf = reinterpret_cast<void*>(aArgs.args[1]);
    755        rv = DoSyscall(__NR_fstatfs, fd, buf);
    756        break;
    757      }
    758 #ifdef __NR_statfs64
    759      case __NR_statfs64: {
    760        auto sz = static_cast<size_t>(aArgs.args[1]);
    761        auto buf = reinterpret_cast<void*>(aArgs.args[2]);
    762        rv = DoSyscall(__NR_fstatfs64, fd, sz, buf);
    763        break;
    764      }
    765 #endif
    766      default:
    767        MOZ_ASSERT(false);
    768        rv = -ENOSYS;
    769    }
    770 
    771    close(fd);
    772    return rv;
    773  }
    774 
    775 public:
    776  ResultExpr InvalidSyscall() const override {
    777    return Trap(BlockedSyscallTrap, nullptr);
    778  }
    779 
    780  virtual ResultExpr ClonePolicy(ResultExpr failPolicy) const {
    781    // Allow use for simple thread creation (pthread_create) only.
    782 
    783    // WARNING: s390 and cris pass the flags in the second arg -- see
    784    // CLONE_BACKWARDS2 in arch/Kconfig in the kernel source -- but we
    785    // don't support seccomp-bpf on those archs yet.
    786    Arg<int> flags(0);
    787 
    788    // The exact flags used can vary.  CLONE_DETACHED is used by musl
    789    // and by old versions of Android (<= JB 4.2), but it's been
    790    // ignored by the kernel since the beginning of the Git history.
    791    //
    792    // If we ever need to support Android <= KK 4.4 again, SETTLS
    793    // and the *TID flags will need to be made optional.
    794    static const int flags_required =
    795        CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD |
    796        CLONE_SYSVSEM | CLONE_SETTLS | CLONE_PARENT_SETTID |
    797        CLONE_CHILD_CLEARTID;
    798    static const int flags_optional = CLONE_DETACHED;
    799 
    800    return If((flags & ~flags_optional) == flags_required, Allow())
    801        .Else(failPolicy);
    802  }
    803 
    804  virtual ResultExpr PrctlPolicy() const {
    805    Arg<int> op(0);
    806    Arg<int> arg2(1);
    807    return Switch(op)
    808        .Case(PR_SET_VMA,  // Tagging of anonymous memory mappings
    809              If(arg2 == PR_SET_VMA_ANON_NAME, Allow()).Else(InvalidSyscall()))
    810        .Cases({PR_GET_SECCOMP,   // BroadcastSetThreadSandbox, etc.
    811                PR_SET_NAME,      // Thread creation
    812                PR_SET_DUMPABLE,  // Crash reporting
    813                PR_SET_PTRACER},  // Debug-mode crash handling
    814               Allow())
    815        .Case(PR_CAPBSET_READ,  // libcap.so.2 loaded by libpulse.so.0
    816                                // queries for capabilities
    817              Error(EINVAL))
    818 #if defined(MOZ_PROFILE_GENERATE)
    819        .Case(PR_GET_PDEATHSIG, Allow())
    820 #endif  // defined(MOZ_PROFILE_GENERATE)
    821        .Default(InvalidSyscall());
    822  }
    823 
    824  virtual BoolExpr MsgFlagsAllowed(const Arg<int>& aFlags) const {
    825    // MSG_DONTWAIT: used by IPC
    826    // MSG_NOSIGNAL: used by the sandbox (broker, reporter)
    827    // MSG_CMSG_CLOEXEC: should be used by anything that's passed fds
    828    static constexpr int kNeeded =
    829        MSG_DONTWAIT | MSG_NOSIGNAL | MSG_CMSG_CLOEXEC;
    830 
    831    // These don't appear to be used in our code at the moment, but
    832    // they seem low-risk enough to allow to avoid the possibility of
    833    // breakage.  (Necko might use MSG_PEEK, but the socket process
    834    // overrides this method.)
    835    static constexpr int kHarmless = MSG_PEEK | MSG_WAITALL | MSG_TRUNC;
    836 
    837    static constexpr int kAllowed = kNeeded | kHarmless;
    838    return (aFlags & ~kAllowed) == 0;
    839  }
    840 
    841  static ResultExpr UnpackSocketcallOrAllow() {
    842    // See bug 1066750.
    843    if (HasSeparateSocketCalls()) {
    844      // If this is a socketcall(2) platform, but the kernel also
    845      // supports separate syscalls (>= 4.3.0), we can unpack the
    846      // arguments and filter them.
    847      return Trap(SocketcallUnpackTrap, nullptr);
    848    }
    849    // Otherwise, we can't filter the args if the platform passes
    850    // them by pointer.
    851    return Allow();
    852  }
    853 
    854  Maybe<ResultExpr> EvaluateSocketCall(int aCall,
    855                                       bool aHasArgs) const override {
    856    switch (aCall) {
    857      case SYS_RECVMSG:
    858      case SYS_SENDMSG:
    859        if (aHasArgs) {
    860          Arg<int> flags(2);
    861          return Some(
    862              If(MsgFlagsAllowed(flags), Allow()).Else(InvalidSyscall()));
    863        }
    864        return Some(UnpackSocketcallOrAllow());
    865 
    866        // These next four weren't needed for IPC or other core
    867        // functionality when they were added, but they're subsets of
    868        // recvmsg/sendmsg so there's nothing gained by not allowing
    869        // them here (and simplifying subclasses).  Also, there may be
    870        // unknown dependencies on them now.
    871      case SYS_RECVFROM:
    872      case SYS_SENDTO:
    873      case SYS_RECV:
    874      case SYS_SEND:
    875        if (aHasArgs) {
    876          Arg<int> flags(3);
    877          return Some(
    878              If(MsgFlagsAllowed(flags), Allow()).Else(InvalidSyscall()));
    879        }
    880        return Some(UnpackSocketcallOrAllow());
    881 
    882      case SYS_SOCKETPAIR: {
    883        // We try to allow "safe" (always connected) socketpairs when using the
    884        // file broker, or for content processes, but we may need to fall back
    885        // and allow all socketpairs in some cases, see bug 1066750.
    886        if (!mBroker && !mAllowUnsafeSocketPair) {
    887          return Nothing();
    888        }
    889        if (!aHasArgs) {
    890          return Some(UnpackSocketcallOrAllow());
    891        }
    892        Arg<int> domain(0), type(1);
    893        return Some(
    894            If(domain == AF_UNIX,
    895               Switch(type & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
    896                   .Case(SOCK_STREAM, Allow())
    897                   .Case(SOCK_SEQPACKET, Allow())
    898                   // This is used only by content (and only for
    899                   // direct PulseAudio, which is deprecated) but it
    900                   // doesn't increase attack surface:
    901                   .Case(SOCK_DGRAM, Trap(SocketpairDatagramTrap, nullptr))
    902                   .Default(InvalidSyscall()))
    903                .Else(InvalidSyscall()));
    904      }
    905 
    906      case SYS_GETSOCKOPT: {
    907        // Best-effort argument filtering as for socketpair(2), above.
    908        if (!aHasArgs) {
    909          if (HasSeparateSocketCalls()) {
    910            return Some(Trap(SocketcallUnpackTrap, nullptr));
    911          }
    912          return Some(Allow());
    913        }
    914        Arg<int> level(1), optname(2);
    915        // SO_SNDBUF is used by IPC to avoid constructing
    916        // unnecessarily large gather arrays for `sendmsg`.
    917        //
    918        // SO_DOMAIN and SO_TYPE are needed for connect() brokering,
    919        // but they're harmless even when it's not enabled.
    920        return Some(If(AllOf(level == SOL_SOCKET,
    921                             AnyOf(optname == SO_SNDBUF, optname == SO_DOMAIN,
    922                                   optname == SO_TYPE)),
    923                       Allow())
    924                        .Else(InvalidSyscall()));
    925      }
    926 
    927        // These two cases are for connect() brokering, if enabled.
    928      case SYS_SOCKET:
    929        if (mBrokeredConnect) {
    930          const auto trapFn = aHasArgs ? FakeSocketTrap : FakeSocketTrapLegacy;
    931          MOZ_ASSERT(mBroker);
    932          return Some(Trap(trapFn, mBroker));
    933        }
    934        return Nothing();
    935 
    936      case SYS_CONNECT:
    937        if (mBrokeredConnect) {
    938          const auto trapFn = aHasArgs ? ConnectTrap : ConnectTrapLegacy;
    939          MOZ_ASSERT(mBroker);
    940          return Some(Trap(trapFn, mBroker));
    941        }
    942        return Nothing();
    943 
    944      default:
    945        return Nothing();
    946    }
    947  }
    948 
    949  ResultExpr EvaluateSyscall(int sysno) const override {
           // Builds the policy result for syscall number `sysno`.
           //
           // Path-based syscalls are looked at first: with a broker they trap
           // to the matching *Trap handler (statx gets ENOSYS); without one,
           // only statx and the fstatat variants are intercepted.  Everything
           // else goes through the main switch further down, and syscalls
           // with no case there are delegated to
           // SandboxPolicyBase::EvaluateSyscall().
           //
    950    // If a file broker client was provided, route syscalls to it;
    951    // otherwise, fall through to the main policy, which will deny
    952    // them.
    953    if (mBroker) {
    954      switch (sysno) {
    955 #ifdef __NR_open
    956        case __NR_open:
    957          return Trap(OpenTrap, mBroker);
    958        case __NR_access:
    959          return Trap(AccessTrap, mBroker);
    960        CASES_FOR_stat:
    961          return Trap(StatTrap, mBroker);
    962        CASES_FOR_lstat:
    963          return Trap(LStatTrap, mBroker);
    964        case __NR_chmod:
    965          return Trap(ChmodTrap, mBroker);
    966        case __NR_link:
    967          return Trap(LinkTrap, mBroker);
    968        case __NR_mkdir:
    969          return Trap(MkdirTrap, mBroker);
    970        case __NR_symlink:
    971          return Trap(SymlinkTrap, mBroker);
    972        case __NR_rename:
    973          return Trap(RenameTrap, mBroker);
    974        case __NR_rmdir:
    975          return Trap(RmdirTrap, mBroker);
    976        case __NR_unlink:
    977          return Trap(UnlinkTrap, mBroker);
    978        case __NR_readlink:
    979          return Trap(ReadlinkTrap, mBroker);
    980 #endif
    981        case __NR_openat:
    982          return Trap(OpenAtTrap, mBroker);
    983        case __NR_faccessat:
    984          return Trap(AccessAtTrap, mBroker);
    985        case __NR_faccessat2:
    986          return Trap(AccessAt2Trap, mBroker);
    987        CASES_FOR_fstatat:
    988          return Trap(StatAtTrap, mBroker);
    989        // Used by new libc and Rust's stdlib, if available.
    990        // We don't have broker support yet so claim it does not exist.
    991        case __NR_statx:
    992          return Error(ENOSYS);
    993        case __NR_fchmodat:
    994          return Trap(ChmodAtTrap, mBroker);
    995        case __NR_linkat:
    996          return Trap(LinkAtTrap, mBroker);
    997        case __NR_mkdirat:
    998          return Trap(MkdirAtTrap, mBroker);
    999        case __NR_symlinkat:
   1000          return Trap(SymlinkAtTrap, mBroker);
   1001        case __NR_renameat:
   1002          return Trap(RenameAtTrap, mBroker);
   1003        case __NR_unlinkat:
   1004          return Trap(UnlinkAtTrap, mBroker);
   1005        case __NR_readlinkat:
   1006          return Trap(ReadlinkAtTrap, mBroker);
   1007      }
   1008    } else {
   1009      // In the absence of a broker we still need to handle the
   1010      // fstat-equivalent subset of fstatat; see bug 1673770.
   1011      switch (sysno) {
   1012        // statx may be used for fstat (bug 1867673)
   1013        case __NR_statx:
   1014          return Error(ENOSYS);
   1015        CASES_FOR_fstatat:
   1016          return Trap(StatAtTrap, nullptr);
   1017      }
   1018    }
   1019 
           // Main policy: reached for every syscall the switches above did
           // not return for.
   1020    switch (sysno) {
   1021        // Timekeeping
   1022        //
   1023        // (Note: the switch needs to start with a literal case, not a
   1024        // macro; otherwise clang-format gets confused.)
   1025      case __NR_gettimeofday:
   1026 #ifdef __NR_time
   1027      case __NR_time:
   1028 #endif
   1029      case __NR_nanosleep:
   1030        return Allow();
   1031 
   1032      CASES_FOR_clock_gettime:
   1033      CASES_FOR_clock_getres:
   1034      CASES_FOR_clock_nanosleep: {
   1035        // clockid_t can encode a pid or tid to monitor another
   1036        // process or thread's CPU usage (see CPUCLOCK_PID and related
   1037        // definitions in include/linux/posix-timers.h in the kernel
   1038        // source).  For threads, the kernel allows only tids within
   1039        // the calling process, so it isn't a problem if we don't
   1040        // filter those; pids do need to be restricted to the current
   1041        // process in order to not leak information.
   1042        Arg<clockid_t> clk_id(0);
   1043 #ifdef MOZ_GECKO_PROFILER
   1044        clockid_t this_process =
   1045            MAKE_PROCESS_CPUCLOCK(getpid(), CPUCLOCK_SCHED);
   1046 #endif
   1047        return If(clk_id == CLOCK_MONOTONIC, Allow())
   1048 #ifdef CLOCK_MONOTONIC_COARSE
   1049            // Used by SandboxReporter, among other things.
   1050            .ElseIf(clk_id == CLOCK_MONOTONIC_COARSE, Allow())
   1051 #endif
   1052 #ifdef CLOCK_MONOTONIC_RAW
   1053            .ElseIf(clk_id == CLOCK_MONOTONIC_RAW, Allow())
   1054 #endif
   1055            .ElseIf(clk_id == CLOCK_PROCESS_CPUTIME_ID, Allow())
   1056            .ElseIf(clk_id == CLOCK_REALTIME, Allow())
   1057 #ifdef CLOCK_REALTIME_COARSE
   1058            .ElseIf(clk_id == CLOCK_REALTIME_COARSE, Allow())
   1059 #endif
   1060            .ElseIf(clk_id == CLOCK_THREAD_CPUTIME_ID, Allow())
   1061 #ifdef MOZ_GECKO_PROFILER
   1062            // Allow clock_gettime on the same process.
   1063            .ElseIf(clk_id == this_process, Allow())
   1064            // Allow clock_gettime on a thread.
   1065            .ElseIf((clk_id & 7u) == (CPUCLOCK_PERTHREAD_MASK | CPUCLOCK_SCHED),
   1066                    Allow())
   1067 #endif
   1068 #ifdef CLOCK_BOOTTIME
   1069            .ElseIf(clk_id == CLOCK_BOOTTIME, Allow())
   1070 #endif
   1071            .Else(InvalidSyscall());
   1072      }
   1073 
   1074        // Thread synchronization
   1075      CASES_FOR_futex:
   1076        // FIXME(bug 1441993): This could be more restrictive.
   1077        return Allow();
   1078 
   1079        // Asynchronous I/O
   1080      CASES_FOR_epoll_create:
   1081      CASES_FOR_epoll_wait:
   1082      case __NR_epoll_ctl:
   1083      CASES_FOR_poll:
   1084        return Allow();
   1085 
   1086        // Used when requesting a crash dump.
   1087      CASES_FOR_pipe:
   1088        return Allow();
   1089 
   1090        // Metadata of opened files
   1091      CASES_FOR_fstat:
   1092        return Allow();
   1093 
   1094      CASES_FOR_fcntl: {
   1095        Arg<int> cmd(1);
   1096        Arg<int> flags(2);
   1097        // Typical use of F_SETFL is to modify the flags returned by
   1098        // F_GETFL and write them back, including some flags that
   1099        // F_SETFL ignores.  This is a default-deny policy in case any
   1100        // new SETFL-able flags are added.  (In particular we want to
   1101        // forbid O_ASYNC; see bug 1328896, but also see bug 1408438.)
   1102        static const int ignored_flags =
   1103            O_ACCMODE | O_LARGEFILE_REAL | O_CLOEXEC | FMODE_NONOTIFY;
   1104        static const int allowed_flags = ignored_flags | O_APPEND | O_NONBLOCK;
   1105        return Switch(cmd)
   1106            // Close-on-exec is meaningless when execve isn't allowed, but
   1107            // NSPR reads the bit and asserts that it has the expected value.
   1108            .Case(F_GETFD, Allow())
   1109            .Case(
   1110                F_SETFD,
   1111                If((flags & ~FD_CLOEXEC) == 0, Allow()).Else(InvalidSyscall()))
   1112            // F_GETFL is also used by fdopen
   1113            .Case(F_GETFL, Allow())
   1114            .Case(F_SETFL, If((flags & ~allowed_flags) == 0, Allow())
   1115                               .Else(InvalidSyscall()))
   1116 #if defined(MOZ_PROFILE_GENERATE)
   1117            .Case(F_SETLKW, Allow())
   1118 #endif
   1119            // Not much different from other forms of dup(), and commonly used.
   1120            .Case(F_DUPFD_CLOEXEC, Allow())
   1121            // Used by Mesa, generally useful, and harmless: tests if
   1122            // two file descriptors refer to the same file description.
   1123            .Case(F_DUPFD_QUERY, Allow())
   1124            .Default(SandboxPolicyBase::EvaluateSyscall(sysno));
   1125      }
   1126 
   1127        // Simple I/O
   1128      case __NR_pread64:
   1129      case __NR_write:
   1130      case __NR_read:
   1131      case __NR_readv:
   1132      case __NR_writev:  // see SandboxLogging.cpp
   1133      CASES_FOR_lseek:
   1134        return Allow();
   1135 
   1136      CASES_FOR_getdents:
   1137        return Allow();
   1138 
               // Allowed only when mMayCreateShmem is set; otherwise these are
               // treated as invalid syscalls.
   1139      CASES_FOR_ftruncate:
   1140      case __NR_fallocate:
   1141        return mMayCreateShmem ? Allow() : InvalidSyscall();
   1142 
   1143        // Used by our fd/shm classes
   1144      case __NR_dup:
   1145        return Allow();
   1146 
   1147        // Memory mapping
   1148      CASES_FOR_mmap: {
   1149        Arg<int> flags(3);
   1150        // Explicit huge-page mapping has a history of bugs, and
   1151        // generally isn't used outside of server applications.
   1152        static constexpr int kBadFlags =
   1153            MAP_HUGETLB | (MAP_HUGE_MASK << MAP_HUGE_SHIFT);
   1154        // ENOSYS seems to be what the kernel would return if
   1155        // CONFIG_HUGETLBFS=n.  (This uses Error rather than
   1156        // InvalidSyscall because the latter would crash on Nightly,
   1157        // and I don't think those reports would be actionable.)
   1158        return If((flags & kBadFlags) != 0, Error(ENOSYS)).Else(Allow());
   1159      }
   1160      case __NR_munmap:
   1161        return Allow();
   1162 
   1163        // Shared memory
   1164      case __NR_memfd_create: {
   1165        Arg<unsigned> flags(1);
   1166        // See above about mmap MAP_HUGETLB.
   1167        static constexpr int kBadFlags =
   1168            MFD_HUGETLB | (MFD_HUGE_MASK << MFD_HUGE_SHIFT);
   1169        return If((flags & kBadFlags) != 0, Error(ENOSYS)).Else(Allow());
   1170      }
   1171 
   1172        // ipc::Shmem; also, glibc when creating threads:
   1173      case __NR_mprotect:
   1174        return Allow();
   1175 
   1176 #if !defined(MOZ_MEMORY)
   1177        // No jemalloc means using a system allocator like glibc
   1178        // that might use brk.
   1179      case __NR_brk:
   1180        return Allow();
   1181 
   1182        // Similarly, mremap (bugs: 1047620, 1286119, 1860267)
   1183      case __NR_mremap: {
   1184        Arg<int> flags(3);
   1185        return If((flags & ~MREMAP_MAYMOVE) == 0, Allow())
   1186            .Else(SandboxPolicyBase::EvaluateSyscall(sysno));
   1187      }
   1188 #endif
   1189 
   1190        // madvise hints used by malloc; see bug 1303813 and bug 1364533
   1191      case __NR_madvise: {
   1192        Arg<int> advice(2);
   1193        // The GMP specific sandbox duplicates this logic, so when adding
   1194        // allowed values here also add them to the GMP sandbox rules.
   1195        return If(advice == MADV_DONTNEED, Allow())
   1196            .ElseIf(advice == MADV_FREE, Allow())
   1197            // Used by glibc (and maybe someday mozjemalloc).
   1198            .ElseIf(advice == MADV_GUARD_INSTALL, Allow())
   1199            .ElseIf(advice == MADV_GUARD_REMOVE, Allow())
   1200            // Formerly used by mozjemalloc; unclear if current use:
   1201            .ElseIf(advice == MADV_HUGEPAGE, Allow())
   1202            .ElseIf(advice == MADV_NOHUGEPAGE, Allow())
   1203 #ifdef MOZ_ASAN
   1204            .ElseIf(advice == MADV_DONTDUMP, Allow())
   1205 #endif
   1206            .ElseIf(advice == MADV_MERGEABLE, Error(EPERM))  // bug 1705045
   1207            .Else(InvalidSyscall());
   1208      }
   1209 
   1210        // musl libc will set this up in pthreads support.
   1211      case __NR_membarrier:
   1212        return Allow();
   1213 
   1214        // Signal handling
   1215      case __NR_sigaltstack:
   1216      CASES_FOR_sigreturn:
   1217      CASES_FOR_sigprocmask:
   1218      CASES_FOR_sigaction:
   1219        return Allow();
   1220 
   1221        // Send signals within the process (raise(), profiling, etc.)
   1222      case __NR_tgkill: {
   1223        Arg<pid_t> tgid(0);
   1224        return If(tgid == getpid(), Allow()).Else(InvalidSyscall());
   1225      }
   1226 
   1227        // Polyfill with tgkill; see above.
   1228      case __NR_tkill:
   1229        return Trap(TKillCompatTrap, nullptr);
   1230 
   1231        // Yield
   1232      case __NR_sched_yield:
   1233        return Allow();
   1234 
   1235        // Thread creation.
   1236      case __NR_clone:
   1237        return ClonePolicy(InvalidSyscall());
   1238 
               // clone3 is answered with ENOSYS rather than being filtered.
               // NOTE(review): presumably so that callers fall back to
               // clone(), which ClonePolicy can inspect -- confirm.
   1239      case __NR_clone3:
   1240        return Error(ENOSYS);
   1241 
   1242        // More thread creation.
   1243 #ifdef __NR_set_robust_list
   1244      case __NR_set_robust_list:
   1245        return Allow();
   1246 #endif
   1247 #ifdef ANDROID
   1248      case __NR_set_tid_address:
   1249        return Allow();
   1250 #endif
   1251 
   1252        // prctl
   1253      case __NR_prctl: {
   1254        // WARNING: do not handle __NR_prctl directly in subclasses;
   1255        // override PrctlPolicy instead.  The special handling of
   1256        // PR_SET_NO_NEW_PRIVS is used to detect that a thread already
   1257        // has the policy applied; see also bug 1257361.
   1258 
   1259        if (SandboxInfo::Get().Test(SandboxInfo::kHasSeccompTSync)) {
   1260          return PrctlPolicy();
   1261        }
   1262 
   1263        Arg<int> option(0);
   1264        return If(option == PR_SET_NO_NEW_PRIVS,
   1265                  Trap(SetNoNewPrivsTrap, nullptr))
   1266            .Else(PrctlPolicy());
   1267      }
   1268 
   1269 #if defined(GP_PLAT_amd64_linux) && defined(GP_ARCH_amd64) && \
   1270    defined(MOZ_USING_WASM_SANDBOXING)
   1271        // arch_prctl
   1272      case __NR_arch_prctl: {
   1273        // Bug 1923701 - Needed for by RLBox-wasm2c: Buggy libraries are
   1274        // sandboxed with RLBox and wasm2c (Wasm). wasm2c offers an optimization
   1275        // for performance that uses the otherwise-unused GS register on x86.
   1276        // The GS register is only settable using the arch_prctl platforms on
   1277        // older x86 CPUs that don't have the wrgsbase instruction. This
   1278        // optimization is currently only supported on linux+clang+x86_64.
   1279        Arg<int> op(0);
   1280        return If(op == ARCH_SET_GS, Allow())
   1281            .Else(SandboxPolicyBase::EvaluateSyscall(sysno));
   1282      }
   1283 #endif
   1284 
   1285        // NSPR can call this when creating a thread, but it will accept a
   1286        // polite "no".
   1287      case __NR_getpriority:
   1288        // But if thread creation races with sandbox startup, that call
   1289        // could succeed, and then we get one of these:
   1290      case __NR_setpriority:
   1291        return Error(EACCES);
   1292 
   1293        // Stack bounds are obtained via pthread_getattr_np, which calls
   1294        // this but doesn't actually need it:
   1295      case __NR_sched_getaffinity:
   1296        return Error(ENOSYS);
   1297 
   1298        // Identifies the processor and node where this thread or process is
   1299        // running. This is used by "Awake" profiler markers.
   1300      case __NR_getcpu:
   1301        return Allow();
   1302 
   1303        // Read own pid/tid.
   1304      case __NR_getpid:
   1305      case __NR_gettid:
   1306        return Allow();
   1307 
   1308        // Discard capabilities
   1309      case __NR_close:
   1310        return Allow();
   1311 
   1312        // Machine-dependent stuff
   1313 #ifdef __arm__
   1314      case __ARM_NR_breakpoint:
   1315      case __ARM_NR_cacheflush:
   1316      case __ARM_NR_usr26:  // FIXME: do we actually need this?
   1317      case __ARM_NR_usr32:
   1318      case __ARM_NR_set_tls:
   1319        return Allow();
   1320 #endif
   1321 
   1322        // Needed when being debugged:
   1323      case __NR_restart_syscall:
   1324        return Allow();
   1325 
   1326        // Terminate threads or the process
   1327      case __NR_exit:
   1328      case __NR_exit_group:
   1329        return Allow();
   1330 
   1331      case __NR_getrandom:
   1332        return Allow();
   1333 
   1334        // Used by almost every process: GMP needs them for Clearkey
   1335        // because of bug 1576006 (but may not need them for other
   1336        // plugin types; see bug 1737092).  Given that fstat is
   1337        // allowed, the uid/gid are probably available anyway.
   1338      CASES_FOR_getuid:
   1339      CASES_FOR_getgid:
   1340      CASES_FOR_geteuid:
   1341      CASES_FOR_getegid:
   1342        return Allow();
   1343 
   1344 #ifdef DESKTOP
   1345        // Bug 1543858: glibc's qsort calls sysinfo to check the
   1346        // memory size; it falls back to assuming there's enough RAM.
   1347      case __NR_sysinfo:
   1348        return Error(EPERM);
   1349 #endif
   1350 
   1351        // Bug 1651701: an API for restartable atomic sequences and
   1352        // per-CPU data; exposing information about CPU numbers and
   1353        // when threads are migrated or preempted isn't great but the
   1354        // risk should be relatively low.
   1355      case __NR_rseq:
   1356        return Allow();
   1357 
   1358      case __NR_ioctl: {
   1359        Arg<unsigned long> request(1);
   1360 #ifdef MOZ_ASAN
   1361        Arg<int> fd(0);
   1362 #endif  // MOZ_ASAN
   1363        // Make isatty() return false, because none of the terminal
   1364        // ioctls will be allowed; libraries sometimes call this for
   1365        // various reasons (e.g., to decide whether to emit ANSI/VT
   1366        // color codes when logging to stderr).  glibc uses TCGETS and
   1367        // musl uses TIOCGWINSZ.
   1368        //
   1369        // This is required by ffmpeg
   1370        return If(AnyOf(request == TCGETS, request == TIOCGWINSZ,
   1371                        request == TCGETS2),
   1372                  Error(ENOTTY))
   1373 #ifdef MOZ_ASAN
   1374            // ASAN's error reporter wants to know if stderr is a tty.
   1375            .ElseIf(fd == STDERR_FILENO, Error(ENOTTY))
   1376 #endif  // MOZ_ASAN
   1377            .Else(SandboxPolicyBase::EvaluateSyscall(sysno));
   1378      }
   1379 
               // dup2 is allowed only when connect() is brokered, because
               // ConnectTrapCommon uses it to swap the newly connected socket
               // into the caller's fd; otherwise the base policy decides.
   1380      CASES_FOR_dup2:  // See ConnectTrapCommon
   1381        if (mBrokeredConnect) {
   1382          return Allow();
   1383        }
   1384        return SandboxPolicyBase::EvaluateSyscall(sysno);
   1385 
   1386 #ifdef MOZ_ASAN
   1387        // ASAN again (see the ioctl case above): before compiler-rt r209773,
   1388        // it will call readlink on /proc/self/exe and use the cached value
               // only if that fails:
   1389      case __NR_readlink:
   1390      case __NR_readlinkat:
   1391        return Error(ENOENT);
   1392 
   1393        // ...and if it found an external symbolizer, it will try to run it:
   1394        // (See also bug 1081242 comment #7.)
   1395      CASES_FOR_stat:
   1396        return Error(ENOENT);
   1397 #endif  // MOZ_ASAN
   1398 
   1399        // Replace statfs with open (which may be brokered) and
   1400        // fstatfs (which is not allowed in this policy, but may be
   1401        // allowed by subclasses if they wish to enable statfs).
   1402      CASES_FOR_statfs:
   1403        return Trap(StatFsTrap, nullptr);
   1404 
   1405        // GTK's theme parsing tries to getcwd() while sandboxed, but
   1406        // only during Talos runs.
   1407        // Also, Rust panics call getcwd to try to print relative paths
   1408        // in backtraces.
   1409      case __NR_getcwd:
   1410        return Error(ENOENT);
   1411 
   1412        // Basically every process type ends up using this for some
   1413        // reason (nsSystemInfo in content, Mesa in RDD, bug 1992904 for
   1414        // utility, etc.).  Other than GMP, which overrides this (see
   1415        // below), it's relatively safe to expose this information.
   1416      case __NR_uname:
   1417        return Allow();
   1418 
               // Anything not handled above is left to the base policy.
   1419      default:
   1420        return SandboxPolicyBase::EvaluateSyscall(sysno);
   1421    }
   1422  }
   1423 };
   1424 
   1425 // The process-type-specific syscall rules start here:
   1426 
   1427 // The seccomp-bpf filter for content processes is not a true sandbox
   1428 // on its own; its purpose is attack surface reduction and syscall
   1429 // interception in support of a semantic sandboxing layer.  On B2G
   1430 // this is the Android process permission model; on desktop,
   1431 // namespaces and chroot() will be used.
        //
        // Rules here are keyed on mParams.mLevel: several syscalls are allowed
        // outright only below level 4 (see AllowBelowLevel), and non-tty ioctls
        // only below level 6.  Syscalls listed in mParams.mSyscallWhitelist are
        // allowed unconditionally, and anything not matched is delegated to
        // SandboxPolicyCommon.
   1432 class ContentSandboxPolicy : public SandboxPolicyCommon {
   1433 private:
         // Caller-supplied parameters; this class reads mLevel and
         // mSyscallWhitelist from it.
   1434  ContentProcessSandboxParams mParams;
         // True when MOZ_SANDBOX_ALLOW_SYSV is set in the environment.
   1435  bool mAllowSysV;
         // True when RENDERDOC_CAPTUREOPTS is set in the environment.
   1436  bool mUsingRenderDoc;
   1437 
         // True when the configured level is strictly less than aLevel.
   1438  bool BelowLevel(int aLevel) const { return mParams.mLevel < aLevel; }
         // Allow() below aLevel; otherwise aOrElse (InvalidSyscall() in the
         // one-argument overload).
   1439  ResultExpr AllowBelowLevel(int aLevel, ResultExpr aOrElse) const {
   1440    return BelowLevel(aLevel) ? Allow() : std::move(aOrElse);
   1441  }
   1442  ResultExpr AllowBelowLevel(int aLevel) const {
   1443    return AllowBelowLevel(aLevel, InvalidSyscall());
   1444  }
   1445 
         // Trap handler installed for getppid(); ignores its arguments and
         // always reports 0.
   1446  static intptr_t GetPPidTrap(ArgsRef aArgs, void* aux) {
   1447    // In a pid namespace, getppid() will return 0. We will return 0 instead
   1448    // of the real parent pid to see what breaks when we introduce the
   1449    // pid namespace (Bug 1151624).
   1450    return 0;
   1451  }
   1452 
   1453 public:
         // Captures aParams, samples the two environment variables once, and
         // sets mBroker / mMayCreateShmem / mAllowUnsafeSocketPair /
         // mBrokeredConnect (not declared in this class; presumably inherited
         // from SandboxPolicyCommon).
   1454  ContentSandboxPolicy(SandboxBrokerClient* aBroker,
   1455                       ContentProcessSandboxParams&& aParams)
   1456      : mParams(std::move(aParams)),
   1457        mAllowSysV(PR_GetEnv("MOZ_SANDBOX_ALLOW_SYSV") != nullptr),
   1458        mUsingRenderDoc(PR_GetEnv("RENDERDOC_CAPTUREOPTS") != nullptr) {
   1459    mBroker = aBroker;
   1460    mMayCreateShmem = true;
   1461    mAllowUnsafeSocketPair = true;
   1462    mBrokeredConnect = true;
   1463  }
   1464 
   1465  ~ContentSandboxPolicy() override = default;
   1466 
         // Socket-call rules layered on top of SandboxPolicyCommon.  On ANDROID,
         // socket() fails with EACCES.  Otherwise socket()/connect() are allowed
         // outright below level 4 and deferred to the common policy from level 4
         // up; the sockopt/name/shutdown calls are always allowed; and
         // accept()/accept4() are allowed only when mUsingRenderDoc, otherwise
         // they fall through to the common policy.
   1467  Maybe<ResultExpr> EvaluateSocketCall(int aCall,
   1468                                       bool aHasArgs) const override {
   1469    switch (aCall) {
   1470 #ifdef ANDROID
   1471      case SYS_SOCKET:
   1472        return Some(Error(EACCES));
   1473 #else  // #ifdef DESKTOP
   1474      case SYS_SOCKET:
   1475      case SYS_CONNECT:
   1476        if (BelowLevel(4)) {
   1477          return Some(Allow());
   1478        }
   1479        return SandboxPolicyCommon::EvaluateSocketCall(aCall, aHasArgs);
   1480 
   1481        // FIXME (bug 1761134): sockopts should be filtered
   1482      case SYS_GETSOCKOPT:
   1483      case SYS_SETSOCKOPT:
   1484        // These next 3 were needed for X11; they may not be needed
   1485        // with X11 lockdown, but there's not much attack surface here.
   1486      case SYS_GETSOCKNAME:
   1487      case SYS_GETPEERNAME:
   1488      case SYS_SHUTDOWN:
   1489        return Some(Allow());
   1490 
   1491      case SYS_ACCEPT:
   1492      case SYS_ACCEPT4:
   1493        if (mUsingRenderDoc) {
   1494          return Some(Allow());
   1495        }
   1496        [[fallthrough]];
   1497 #endif
   1498      default:
   1499        return SandboxPolicyCommon::EvaluateSocketCall(aCall, aHasArgs);
   1500    }
   1501  }
   1502 
   1503 #ifdef DESKTOP
         // SysV IPC rules (DESKTOP builds only).  shmget() is allowed when
         // mAllowSysV and otherwise fails with EPERM instead of crashing; the
         // other listed calls are allowed when mAllowSysV and otherwise use the
         // common policy.
   1504  Maybe<ResultExpr> EvaluateIpcCall(int aCall, int aArgShift) const override {
   1505    switch (aCall) {
   1506        // These are a problem: SysV IPC follows the Unix "same uid
   1507        // policy" and can't be restricted/brokered like file access.
   1508        // We're not using it directly, but there are some library
   1509        // dependencies that do; see ContentNeedsSysVIPC() in
   1510        // SandboxLaunch.cpp.  Also, Cairo as used by GTK will sometimes
   1511        // try to use MIT-SHM, so shmget() is a non-fatal error.  See
   1512        // also bug 1376910 and bug 1438401.
   1513      case SHMGET:
   1514        return Some(mAllowSysV ? Allow() : Error(EPERM));
   1515      case SHMCTL:
   1516      case SHMAT:
   1517      case SHMDT:
   1518      case SEMGET:
   1519      case SEMCTL:
   1520      case SEMOP:
   1521        if (mAllowSysV) {
   1522          return Some(Allow());
   1523        }
   1524        return SandboxPolicyCommon::EvaluateIpcCall(aCall, aArgShift);
   1525      default:
   1526        return SandboxPolicyCommon::EvaluateIpcCall(aCall, aArgShift);
   1527    }
   1528  }
   1529 #endif
   1530 
   1531 #ifdef MOZ_PULSEAUDIO
         // prctl rules (MOZ_PULSEAUDIO builds only): below level 4,
         // prctl(PR_GET_NAME) is additionally allowed; everything else uses the
         // common prctl policy.
   1532  ResultExpr PrctlPolicy() const override {
   1533    if (BelowLevel(4)) {
   1534      Arg<int> op(0);
   1535      return If(op == PR_GET_NAME, Allow())
   1536          .Else(SandboxPolicyCommon::PrctlPolicy());
   1537    }
   1538    return SandboxPolicyCommon::PrctlPolicy();
   1539  }
   1540 #endif
   1541 
         // Per-syscall rules.  Whitelisted syscall numbers win over everything
         // else; anything the switch does not handle is delegated to
         // SandboxPolicyCommon::EvaluateSyscall.
   1542  ResultExpr EvaluateSyscall(int sysno) const override {
   1543    // Straight allow for anything that got overridden via prefs
   1544    const auto& whitelist = mParams.mSyscallWhitelist;
   1545    if (std::find(whitelist.begin(), whitelist.end(), sysno) !=
   1546        whitelist.end()) {
   1547      if (SandboxInfo::Get().Test(SandboxInfo::kVerbose)) {
   1548        SANDBOX_LOG("Allowing syscall nr %d via whitelist", sysno);
   1549      }
   1550      return Allow();
   1551    }
   1552 
   1553    // Level 1 has been removed.  If seccomp-bpf is used, then we're
   1554    // necessarily at level >= 2 and filesystem access is brokered.
   1555    MOZ_ASSERT(!BelowLevel(2));
   1556    MOZ_ASSERT(mBroker);
   1557 
   1558    switch (sysno) {
   1559 #ifdef DESKTOP
   1560      case __NR_getppid:
   1561        return Trap(GetPPidTrap, nullptr);
   1562 
   1563 #  ifdef MOZ_PULSEAUDIO
   1564      CASES_FOR_fchown:
   1565      case __NR_fchmod:
   1566        return AllowBelowLevel(4);
   1567 #  endif
   1568      CASES_FOR_fstatfs:  // fontconfig, pulseaudio, GIO (see also statfs)
   1569      case __NR_flock:    // graphics
   1570        return Allow();
   1571 
   1572        // Bug 1354731: proprietary GL drivers try to mknod() their devices
   1573 #  ifdef __NR_mknod
   1574      case __NR_mknod:
   1575 #  endif
   1576      case __NR_mknodat: {
               // The mode is argument 2 for mknodat and argument 1 for mknod.
               // Character-device requests fail with EPERM; anything else is
               // treated as an invalid syscall.
   1577        Arg<mode_t> mode(sysno == __NR_mknodat ? 2 : 1);
   1578        return If((mode & S_IFMT) == S_IFCHR, Error(EPERM))
   1579            .Else(InvalidSyscall());
   1580      }
   1581      // Bug 1438389: ...and nvidia GL will sometimes try to chown the devices
   1582 #  ifdef __NR_chown
   1583      case __NR_chown:
   1584 #  endif
   1585      case __NR_fchownat:
   1586        return Error(EPERM);
   1587 #endif
   1588 
   1589      CASES_FOR_select:
   1590        return Allow();
   1591 
   1592      case __NR_writev:
   1593 #ifdef DESKTOP
   1594      case __NR_pwrite64:
   1595      case __NR_readahead:
   1596 #endif
   1597        return Allow();
   1598 
   1599      case __NR_ioctl: {
   1600 #ifdef MOZ_ALSA
   1601        if (BelowLevel(4)) {
   1602          return Allow();
   1603        }
   1604 #endif
   1605        Arg<unsigned long> request(1);
   1606        auto shifted_type = request & kIoctlTypeMask;
   1607 
   1608        // Rust's stdlib seems to use FIOCLEX instead of equivalent fcntls.
   1609        return If(request == FIOCLEX, Allow())
   1610            // Rust's stdlib also uses FIONBIO instead of equivalent fcntls.
   1611            .ElseIf(request == FIONBIO, Allow())
   1612            // Allow anything that isn't a tty ioctl, if level < 6
   1613            .ElseIf(
   1614                BelowLevel(6) ? shifted_type != kTtyIoctls : BoolConst(false),
   1615                Allow())
   1616            .Else(SandboxPolicyCommon::EvaluateSyscall(sysno));
   1617      }
   1618 
   1619      CASES_FOR_fcntl: {
   1620        Arg<int> cmd(1);
   1621        return Switch(cmd)
   1622            // Nvidia GL and fontconfig (newer versions) use fcntl file locking.
   1623            .Case(F_SETLK, Allow())
   1624 #ifdef F_SETLK64
   1625            .Case(F_SETLK64, Allow())
   1626 #endif
   1627            // Pulseaudio uses F_SETLKW, as does fontconfig.
   1628            .Case(F_SETLKW, Allow())
   1629 #ifdef F_SETLKW64
   1630            .Case(F_SETLKW64, Allow())
   1631 #endif
   1632            // Wayland client libraries use file seals
   1633            .Case(F_ADD_SEALS, Allow())
   1634            .Case(F_GET_SEALS, Allow())
   1635            .Default(SandboxPolicyCommon::EvaluateSyscall(sysno));
   1636      }
   1637 
   1638      case __NR_brk:
   1639        // FIXME(bug 1510861) are we using any hints that aren't allowed
   1640        // in SandboxPolicyCommon now?
   1641      case __NR_madvise:
   1642        return Allow();
   1643 
   1644        // wasm uses mremap (always with zero flags)
   1645      case __NR_mremap: {
   1646        Arg<int> flags(3);
   1647        return If(flags == 0, Allow())
   1648            .Else(SandboxPolicyCommon::EvaluateSyscall(sysno));
   1649      }
   1650 
   1651        // Bug 1462640: Mesa libEGL uses mincore to test whether values
   1652        // are pointers, for reasons.
   1653      case __NR_mincore: {
               // Only single-page queries are allowed here; other lengths go to
               // the common policy.
   1654        Arg<size_t> length(1);
   1655        return If(length == getpagesize(), Allow())
   1656            .Else(SandboxPolicyCommon::EvaluateSyscall(sysno));
   1657      }
   1658 
   1659 #ifdef __NR_set_thread_area
   1660      case __NR_set_thread_area:
   1661        return Allow();
   1662 #endif
   1663 
   1664      case __NR_getrusage:
   1665      case __NR_times:
   1666        return Allow();
   1667 
   1668      case __NR_fsync:
   1669      case __NR_msync:
   1670        return Allow();
   1671 
   1672      case __NR_getpriority:
   1673      case __NR_setpriority:
   1674      case __NR_sched_getattr:
   1675      case __NR_sched_setattr:
   1676      case __NR_sched_get_priority_min:
   1677      case __NR_sched_get_priority_max:
   1678      case __NR_sched_getscheduler:
   1679      case __NR_sched_setscheduler:
   1680      case __NR_sched_getparam:
   1681      case __NR_sched_setparam:
   1682 #ifdef DESKTOP
   1683      case __NR_sched_getaffinity:
   1684 #endif
   1685        return Allow();
   1686 
   1687 #ifdef DESKTOP
   1688      case __NR_sched_setaffinity:
   1689        return Error(EPERM);
   1690 #endif
   1691 
   1692 #ifdef DESKTOP
   1693      case __NR_pipe2: {
   1694        // Restrict the flags; O_NOTIFICATION_PIPE in particular
   1695        // exposes enough attack surface to be a cause for concern
   1696        // (bug 1808320).  O_DIRECT isn't known to be used currently
   1697        // (Try passes with it blocked), but should be low-risk, and
   1698        // Chromium allows it.
   1699        static constexpr int allowed_flags = O_CLOEXEC | O_NONBLOCK | O_DIRECT;
   1700        Arg<int> flags(1);
   1701        return If((flags & ~allowed_flags) == 0, Allow())
   1702            .Else(InvalidSyscall());
   1703      }
   1704 
   1705      CASES_FOR_getrlimit:
   1706      CASES_FOR_getresuid:
   1707      CASES_FOR_getresgid:
   1708        return Allow();
   1709 
   1710      case __NR_prlimit64: {
   1711        // Allow only the getrlimit() use case.  (glibc seems to use
   1712        // only pid 0 to indicate the current process; pid == getpid()
   1713        // is equivalent and could also be allowed if needed.)
   1714        Arg<pid_t> pid(0);
   1715        // This is really a const struct ::rlimit*, but Arg<> doesn't
   1716        // work with pointers, only integer types.
   1717        Arg<uintptr_t> new_limit(2);
   1718        return If(AllOf(pid == 0, new_limit == 0), Allow())
   1719            .Else(InvalidSyscall());
   1720      }
   1721 
   1722        // PulseAudio calls umask, even though it's unsafe in
   1723        // multithreaded applications.  But, allowing it here doesn't
   1724        // really do anything one way or the other, now that file
   1725        // accesses are brokered to another process.
   1726      case __NR_umask:
   1727        return AllowBelowLevel(4);
   1728 
   1729      case __NR_kill: {
               // Below level 4, the kill(pid, 0) liveness probe fails with EPERM
               // instead of crashing; every other kill() is an invalid syscall.
   1730        if (BelowLevel(4)) {
   1731          Arg<int> sig(1);
   1732          // PulseAudio uses kill(pid, 0) to check if purported owners of
   1733          // shared memory files are still alive; see bug 1397753 for more
   1734          // details.
   1735          return If(sig == 0, Error(EPERM)).Else(InvalidSyscall());
   1736        }
   1737        return InvalidSyscall();
   1738      }
   1739 
   1740      case __NR_wait4:
   1741 #  ifdef __NR_waitpid
   1742      case __NR_waitpid:
   1743 #  endif
   1744        // NSPR will start a thread to wait for child processes even if
   1745        // fork() fails; see bug 227246 and bug 1299581.
   1746        return Error(ECHILD);
   1747 
   1748      case __NR_eventfd2:
   1749        return Allow();
   1750 
   1751 #  ifdef __NR_rt_tgsigqueueinfo
   1752        // Only allow to send signals within the process.
   1753      case __NR_rt_tgsigqueueinfo: {
   1754        Arg<pid_t> tgid(0);
   1755        return If(tgid == getpid(), Allow()).Else(InvalidSyscall());
   1756      }
   1757 #  endif
   1758 
   1759      case __NR_mlock:
   1760      case __NR_munlock:
   1761        return Allow();
   1762 
   1763        // We can't usefully allow fork+exec, even on a temporary basis;
   1764        // the child would inherit the seccomp-bpf policy and almost
   1765        // certainly die from an unexpected SIGSYS.  We also can't have
   1766        // fork() crash, currently, because there are too many system
   1767        // libraries/plugins that try to run commands.  But they can
   1768        // usually do something reasonable on error.
   1769      case __NR_clone:
   1770        return ClonePolicy(Error(EPERM));
   1771 #  ifdef __NR_fork
   1772      case __NR_fork:
   1773        return Error(ENOSYS);
   1774 #  endif
   1775 
   1776 #  ifdef __NR_fadvise64
   1777      case __NR_fadvise64:
   1778        return Allow();
   1779 #  endif
   1780 
   1781 #  ifdef __NR_fadvise64_64
   1782      case __NR_fadvise64_64:
   1783        return Allow();
   1784 #  endif
   1785 
   1786      case __NR_fallocate:
   1787        return Allow();
   1788 
   1789      case __NR_get_mempolicy:
   1790        return Allow();
   1791 
   1792      // Required by libnuma for FFmpeg
   1793      case __NR_set_mempolicy:
   1794        return Error(ENOSYS);
   1795 
   1796      case __NR_kcmp:
   1797        return KcmpPolicyForMesa();
   1798 
   1799 #endif  // DESKTOP
   1800 
   1801 #ifdef DESKTOP
   1802      case __NR_sysinfo:
   1803 #endif
               // NOTE(review): without DESKTOP the case label above is compiled
               // out, so this return directly follows the scheduler group's
               // return and is unreachable.
   1804        return Allow();
   1805 
   1806      default:
   1807        return SandboxPolicyCommon::EvaluateSyscall(sysno);
   1808    }
   1809  }
   1810 };
   1811 
        // Creates the seccomp-bpf policy object for a content process from the
        // given broker client and parameters; aParams is moved into the policy.
        // NOTE(review): despite the "Maybe" in aMaybeBroker,
        // ContentSandboxPolicy::EvaluateSyscall MOZ_ASSERTs that the broker is
        // non-null.
   1812 UniquePtr<sandbox::bpf_dsl::Policy> GetContentSandboxPolicy(
   1813    SandboxBrokerClient* aMaybeBroker, ContentProcessSandboxParams&& aParams) {
   1814  return MakeUnique<ContentSandboxPolicy>(aMaybeBroker, std::move(aParams));
   1815 }
   1816 
   1817 // Unlike for content, the GeckoMediaPlugin seccomp-bpf policy needs
   1818 // to be an effective sandbox by itself, because we allow GMP on Linux
   1819 // systems where that's the only sandboxing mechanism we can use.
   1820 //
   1821 // Be especially careful about what this policy allows.
   1822 class GMPSandboxPolicy : public SandboxPolicyCommon {
         // Trap handler for open()/openat().  Only read-only opens are honoured
         // (anything else returns -EROFS).  The path is looked up in the
         // SandboxOpenedFiles passed through aux; the descriptor from GetDesc()
         // is returned, or -ENOENT when GetDesc() fails.
   1823  static intptr_t OpenTrap(const arch_seccomp_data& aArgs, void* aux) {
   1824    const auto files = static_cast<const SandboxOpenedFiles*>(aux);
   1825    const char* path;
   1826    int flags;
   1827 
   1828    switch (aArgs.nr) {
   1829 #ifdef __NR_open
   1830      case __NR_open:
   1831        path = reinterpret_cast<const char*>(aArgs.args[0]);
   1832        flags = static_cast<int>(aArgs.args[1]);
   1833        break;
   1834 #endif
   1835      case __NR_openat:
   1836        // The path has to be absolute to match the pre-opened file (see
   1837        // assertion in ctor) so the dirfd argument is ignored.
   1838        path = reinterpret_cast<const char*>(aArgs.args[1]);
   1839        flags = static_cast<int>(aArgs.args[2]);
   1840        break;
   1841      default:
   1842        MOZ_CRASH("unexpected syscall number");
   1843    }
   1844 
   1845    if ((flags & O_ACCMODE) != O_RDONLY) {
   1846      SANDBOX_LOG("non-read-only open of file %s attempted (flags=0%o)", path,
   1847                  flags);
   1848      return -EROFS;
   1849    }
   1850    int fd = files->GetDesc(path);
   1851    if (fd < 0) {
   1852      // SandboxOpenedFile::GetDesc already logged about this, if appropriate.
   1853      return -ENOENT;
   1854    }
   1855    return fd;
   1856  }
   1857 
   1858 #if defined(__NR_stat64) || defined(__NR_stat)
         // Trap handler for path-based stat.  Looks the path (argument 0) up via
         // GetDesc() and answers with fstat64/fstat on that descriptor, writing
         // into the caller's buffer (argument 1); -ENOENT when GetDesc() fails.
   1859  static intptr_t StatTrap(const arch_seccomp_data& aArgs, void* aux) {
   1860    const auto* const files = static_cast<const SandboxOpenedFiles*>(aux);
   1861    const auto* path = reinterpret_cast<const char*>(aArgs.args[0]);
   1862    int fd = files->GetDesc(path);
   1863    if (fd < 0) {
   1864      // SandboxOpenedFile::GetDesc already logged about this, if appropriate.
   1865      return -ENOENT;
   1866    }
   1867    auto* buf = reinterpret_cast<statstruct*>(aArgs.args[1]);
   1868 #  ifdef __NR_fstat64
   1869    return DoSyscall(__NR_fstat64, fd, buf);
   1870 #  else
   1871    return DoSyscall(__NR_fstat, fd, buf);
   1872 #  endif
   1873  }
   1874 #endif
   1875 
         // Trap handler for uname(): zeroes the caller's utsname buffer and
         // fills in only sysname ("Linux") and version ("3").
   1876  static intptr_t UnameTrap(const arch_seccomp_data& aArgs, void* aux) {
   1877    const auto buf = reinterpret_cast<struct utsname*>(aArgs.args[0]);
   1878    PodZero(buf);
   1879    // The real uname() increases fingerprinting risk for no benefit.
   1880    // This is close enough.
   1881    strcpy(buf->sysname, "Linux");
   1882    strcpy(buf->version, "3");
   1883    return 0;
   1884  }
   1885 
         // Trap handler for fcntl(): F_GETFD and F_SETFD are emulated without
         // touching the descriptor; every other command returns -ENOSYS.
   1886  static intptr_t FcntlTrap(const arch_seccomp_data& aArgs, void* aux) {
   1887    const auto cmd = static_cast<int>(aArgs.args[1]);
   1888    switch (cmd) {
   1889        // This process can't exec, so the actual close-on-exec flag
   1890        // doesn't matter; have it always read as true and ignore writes.
               // F_GETFD reports file *descriptor* flags, so the bit callers
               // test is FD_CLOEXEC; O_CLOEXEC is an open(2) flag with a
               // different value and would read back as "not set".
   1891      case F_GETFD:
   1892        return FD_CLOEXEC;
   1893      case F_SETFD:
   1894        return 0;
   1895      default:
   1896        return -ENOSYS;
   1897    }
   1898  }
   1899 
         // Pre-opened files consulted by OpenTrap and StatTrap (passed to them
         // as the trap's aux pointer).
   1900  const SandboxOpenedFiles* mFiles;
   1901 
   1902 public:
   1903  explicit GMPSandboxPolicy(const SandboxOpenedFiles* aFiles) : mFiles(aFiles) {
   1904    // Used by the profiler to send data back to the parent process;
   1905    // we are not enabling the file broker, so this will only work if
   1906    // memfd_create is available.
   1907    mMayCreateShmem = true;
   1908  }
   1909 
   1910  ~GMPSandboxPolicy() override = default;
   1911 
         // Per-syscall rules; anything not handled here is delegated to
         // SandboxPolicyCommon::EvaluateSyscall.
   1912  ResultExpr EvaluateSyscall(int sysno) const override {
   1913    switch (sysno) {
   1914      // Simulate opening the plugin file.
   1915 #ifdef __NR_open
   1916      case __NR_open:
   1917 #endif
   1918      case __NR_openat:
   1919        return Trap(OpenTrap, mFiles);
   1920 
   1921 #if defined(__NR_stat64) || defined(__NR_stat)
   1922      CASES_FOR_stat:
   1923        return Trap(StatTrap, mFiles);
   1924 #endif
   1925 
   1926      case __NR_brk:
   1927        return Allow();
   1928      case __NR_sched_get_priority_min:
   1929      case __NR_sched_get_priority_max:
   1930        return Allow();
   1931      case __NR_sched_getparam:
   1932      case __NR_sched_getscheduler:
   1933      case __NR_sched_setscheduler: {
               // pid 0 is allowed directly; any other pid is routed to
               // SchedTrap (defined elsewhere in this file).
   1934        Arg<pid_t> pid(0);
   1935        return If(pid == 0, Allow()).Else(Trap(SchedTrap, nullptr));
   1936      }
   1937 
   1938      // For clock(3) on older glibcs; bug 1304220.
   1939      case __NR_times:
   1940        return Allow();
   1941 
   1942      // Bug 1372428
   1943      case __NR_uname:
   1944        return Trap(UnameTrap, nullptr);
   1945      CASES_FOR_fcntl:
   1946        return Trap(FcntlTrap, nullptr);
   1947 
   1948      // Allow the same advice values as the default policy, but return
   1949      // Error(ENOSYS) for other values. Because the Widevine CDM may probe
   1950      // advice arguments, including invalid values, we don't want to return
   1951      // InvalidSyscall(), as this will crash the process. So instead just
   1952      // indicate such calls are not available.
   1953      case __NR_madvise: {
   1954        Arg<int> advice(2);
   1955        return If(advice == MADV_DONTNEED, Allow())
   1956            .ElseIf(advice == MADV_FREE, Allow())
   1957            .ElseIf(advice == MADV_HUGEPAGE, Allow())
   1958            .ElseIf(advice == MADV_NOHUGEPAGE, Allow())
   1959 #ifdef MOZ_ASAN
   1960            .ElseIf(advice == MADV_DONTDUMP, Allow())
   1961 #endif
   1962            .ElseIf(advice == MADV_MERGEABLE, Error(EPERM))  // bug 1705045
   1963            .Else(Error(ENOSYS));
   1964      }
   1965 
   1966      // The profiler will try to readlink /proc/self/exe for native
   1967      // stackwalking, but that's broken for several other reasons;
   1968      // see discussion in bug 1770905.  (That can be emulated by
   1969      // pre-recording the result if/when we need it.)
   1970 #ifdef __NR_readlink
   1971      case __NR_readlink:
   1972 #endif
   1973      case __NR_readlinkat:
   1974        return Error(EINVAL);
   1975 
   1976      default:
   1977        return SandboxPolicyCommon::EvaluateSyscall(sysno);
   1978    }
   1979  }
   1980 };
   1981 
        // Creates the seccomp-bpf policy object for a media plugin (GMP)
        // process.  aFiles is the set of pre-opened files the policy's open/stat
        // traps look paths up in; only the pointer is stored.
        // Built with MakeUnique (as GetContentSandboxPolicy does) rather than a
        // naked `new`.
   1982 UniquePtr<sandbox::bpf_dsl::Policy> GetMediaSandboxPolicy(
   1983    const SandboxOpenedFiles* aFiles) {
   1984  return MakeUnique<GMPSandboxPolicy>(aFiles);
   1985 }
   1986 
   1987 // The policy for the data decoder process is similar to the one for
   1988 // media plugins, but the codec code is all in-tree so it's better
   1989 // behaved and doesn't need special exceptions (or the ability to load
   1990 // a plugin file).  However, it does directly create shared memory
   1991 // segments, so it may need file brokering.
   1992 class RDDSandboxPolicy final : public SandboxPolicyCommon {
   1993 public:
         // Records the broker client and enables shared-memory creation via the
         // mBroker / mMayCreateShmem members (not declared in this class;
         // presumably inherited from SandboxPolicyCommon).
   1994  explicit RDDSandboxPolicy(SandboxBrokerClient* aBroker) {
   1995    mBroker = aBroker;
   1996    mMayCreateShmem = true;
   1997  }
   1998 
   1999 #ifndef ANDROID
         // SysV IPC rules (non-Android builds only).  semget()/shmget() are
         // allowed only for the fixed key kIntelKey and are otherwise an invalid
         // syscall; the operations on existing identifiers are allowed
         // unconditionally; anything else uses the common policy.
   2000  Maybe<ResultExpr> EvaluateIpcCall(int aCall, int aArgShift) const override {
   2001    // The Intel media driver uses SysV IPC (semaphores and shared
   2002    // memory) on newer hardware models; it always uses this fixed
   2003    // key, so we can restrict semget and shmget.  Unfortunately, the
   2004    // calls that operate on these resources take "identifiers", which
   2005    // are unpredictable (by us) but guessable (by an adversary).
   2006    static constexpr key_t kIntelKey = 'D' << 24 | 'V' << 8 | 'X' << 0;
   2007 
   2008    switch (aCall) {
   2009      case SEMGET:
   2010      case SHMGET: {
               // The key is the first argument of the call, offset by aArgShift.
   2011        Arg<key_t> key(0 + aArgShift);
   2012        return Some(If(key == kIntelKey, Allow()).Else(InvalidSyscall()));
   2013      }
   2014 
   2015      case SEMCTL:
   2016      case SEMOP:
   2017      case SEMTIMEDOP:
   2018      case SHMCTL:
   2019      case SHMAT:
   2020      case SHMDT:
   2021        return Some(Allow());
   2022 
   2023      default:
   2024        return SandboxPolicyCommon::EvaluateIpcCall(aCall, aArgShift);
   2025    }
   2026  }
   2027 #endif
   2028 
         // Socket-call rules: getsockname/getpeername/shutdown are allowed,
         // socket() fails with EACCES (no crash), and everything else uses the
         // common policy.
   2029  Maybe<ResultExpr> EvaluateSocketCall(int aCall,
   2030                                       bool aHasArgs) const override {
   2031    switch (aCall) {
   2032      // These are for X11.
   2033      //
   2034      // FIXME (bug 1884449): X11 is blocked now so we probably don't
   2035      // need these, but they're relatively harmless.
   2036      case SYS_GETSOCKNAME:
   2037      case SYS_GETPEERNAME:
   2038      case SYS_SHUTDOWN:
   2039        return Some(Allow());
   2040 
   2041      case SYS_SOCKET:
   2042        // Hardware-accelerated decode uses EGL to manage hardware surfaces.
   2043        // When initialised it tries to connect to the Wayland server over a
   2044        // UNIX socket. It still works fine if it can't connect to Wayland, so
   2045        // don't let it create the socket (but don't kill the process for
   2046        // trying).
   2047        //
   2048        // We also see attempts to connect to an X server on desktop
   2049        // Linux sometimes (bug 1882598).
   2050        return Some(Error(EACCES));
   2051 
   2052      default:
   2053        return SandboxPolicyCommon::EvaluateSocketCall(aCall, aHasArgs);
   2054    }
   2055  }
   2056 
         // Per-syscall rules; anything not handled here is delegated to
         // SandboxPolicyCommon::EvaluateSyscall.
   2057  ResultExpr EvaluateSyscall(int sysno) const override {
   2058    switch (sysno) {
   2059      case __NR_getrusage:
   2060        return Allow();
   2061 
   2062      case __NR_ioctl: {
               // ioctls are classified by the type field of the request number
               // (request & kIoctlTypeMask) and compared against type
               // characters shifted into that field.
   2063        Arg<unsigned long> request(1);
   2064        auto shifted_type = request & kIoctlTypeMask;
   2065        static constexpr unsigned long kDrmType =
   2066            static_cast<unsigned long>('d') << _IOC_TYPESHIFT;
   2067        // Note: 'b' is also the Binder device on Android.
   2068        static constexpr unsigned long kDmaBufType =
   2069            static_cast<unsigned long>('b') << _IOC_TYPESHIFT;
   2070 #ifdef MOZ_ENABLE_V4L2
   2071        // Type 'V' for V4L2, used for hw accelerated decode
   2072        static constexpr unsigned long kVideoType =
   2073            static_cast<unsigned long>('V') << _IOC_TYPESHIFT;
   2074 #endif
   2075        // nvidia non-tegra uses some ioctls from this range (but not actual
   2076        // fbdev ioctls; nvidia uses values >= 200 for the NR field
   2077        // (low 8 bits))
   2078        static constexpr unsigned long kFbDevType =
   2079            static_cast<unsigned long>('F') << _IOC_TYPESHIFT;
   2080 
   2081 #if defined(__aarch64__)
   2082        // NVIDIA decoder, from Linux4Tegra
   2083        // http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2024-May/328552.html
   2084        static constexpr unsigned long kNvidiaNvmapType =
   2085            static_cast<unsigned long>('N') << _IOC_TYPESHIFT;
   2086        static constexpr unsigned long kNvidiaNvhostType =
   2087            static_cast<unsigned long>('H') << _IOC_TYPESHIFT;
   2088 #endif  // defined(__aarch64__)
   2089 
   2090        // Allow DRI and DMA-Buf for VA-API. Also allow V4L2 if enabled
   2091        return If(shifted_type == kDrmType, Allow())
   2092            .ElseIf(shifted_type == kDmaBufType, Allow())
   2093 #ifdef MOZ_ENABLE_V4L2
   2094            .ElseIf(shifted_type == kVideoType, Allow())
   2095 #endif
   2096        // NVIDIA decoder from Linux4Tegra, this is specific to Tegra ARM64 SoC
   2097 #if defined(__aarch64__)
   2098            .ElseIf(shifted_type == kNvidiaNvmapType, Allow())
   2099            .ElseIf(shifted_type == kNvidiaNvhostType, Allow())
   2100 #endif  // defined(__aarch64__)
   2101        // Hack for nvidia non-tegra devices, which isn't supported yet:
   2102            .ElseIf(shifted_type == kFbDevType, Error(ENOTTY))
   2103            .Else(SandboxPolicyCommon::EvaluateSyscall(sysno));
   2104      }
   2105 
   2106        // Mesa/amdgpu
   2107      case __NR_kcmp:
   2108        return KcmpPolicyForMesa();
   2109 
   2110        // We use this in our DMABuf support code.
   2111      case __NR_eventfd2:
   2112        return Allow();
   2113 
   2114        // Allow the sched_* syscalls for the current thread only.
   2115        // Mesa attempts to use them to optimize performance; often
   2116        // this involves passing other threads' tids, which we can't
   2117        // safely allow, but maybe a future Mesa version could fix that.
   2118      case __NR_sched_getaffinity:
   2119      case __NR_sched_setaffinity:
   2120      case __NR_sched_getparam:
   2121      case __NR_sched_setparam:
   2122      case __NR_sched_getscheduler:
   2123      case __NR_sched_setscheduler:
   2124      case __NR_sched_getattr:
   2125      case __NR_sched_setattr: {
               // pid 0 is allowed directly; any other pid is routed to
               // SchedTrap (defined elsewhere in this file).
   2126        Arg<pid_t> pid(0);
   2127        return If(pid == 0, Allow()).Else(Trap(SchedTrap, nullptr));
   2128      }
   2129 
   2130        // The priority bounds are also used, sometimes (bug 1838675):
   2131      case __NR_sched_get_priority_min:
   2132      case __NR_sched_get_priority_max:
   2133        return Allow();
   2134 
   2135        // nvidia tries to mknod(!) its devices; that won't work anyway,
   2136        // so quietly reject it.
   2137 #ifdef __NR_mknod
   2138      case __NR_mknod:
   2139 #endif
   2140      case __NR_mknodat:
   2141        return Error(EPERM);
   2142 
   2143        // Used by the nvidia GPU driver, including in multi-GPU
   2144        // systems when we intend to use a non-nvidia GPU.  (Also used
   2145        // by Mesa for its shader cache, but we disable that in this
   2146        // process.)
   2147      CASES_FOR_fstatfs:
   2148        return Allow();
   2149 
   2150        // nvidia drivers may attempt to spawn nvidia-modprobe
   2151      case __NR_clone:
   2152        return ClonePolicy(Error(EPERM));
   2153 #ifdef __NR_fork
   2154      case __NR_fork:
   2155        return Error(ENOSYS);
   2156 #endif
   2157 
   2158        // Pass through the common policy.
   2159      default:
   2160        return SandboxPolicyCommon::EvaluateSyscall(sysno);
   2161    }
   2162  }
   2163 };
   2164 
        // Creates the seccomp-bpf policy object for the data decoder (RDD)
        // process, recording aMaybeBroker as its broker client.
        // Built with MakeUnique (as GetContentSandboxPolicy does) rather than a
        // naked `new`.
   2165 UniquePtr<sandbox::bpf_dsl::Policy> GetDecoderSandboxPolicy(
   2166    SandboxBrokerClient* aMaybeBroker) {
   2167  return MakeUnique<RDDSandboxPolicy>(aMaybeBroker);
   2169 }
   2170 
   2171 // Basically a clone of RDDSandboxPolicy until we know exactly what
   2172 // the SocketProcess sandbox looks like.
   2173 class SocketProcessSandboxPolicy final : public SandboxPolicyCommon {
   2174 private:
   2175  SocketProcessSandboxParams mParams;
   2176 
         // True when the configured sandbox level (mParams.mLevel) is strictly
         // less than aLevel.
   2177  bool BelowLevel(int aLevel) const { return mParams.mLevel < aLevel; }
   2178 
   2179 public:
         // Moves aParams into the policy, records the broker client, and
         // enables shared-memory creation.  mBroker and mMayCreateShmem are not
         // declared in this class (presumably inherited from
         // SandboxPolicyCommon).
   2180  explicit SocketProcessSandboxPolicy(SandboxBrokerClient* aBroker,
   2181                                      SocketProcessSandboxParams&& aParams)
   2182      : mParams(std::move(aParams)) {
   2183    mBroker = aBroker;
   2184    mMayCreateShmem = true;
   2185  }
   2186 
         // Trap handler for fcntl(): F_GETFD and F_SETFD are emulated without
         // touching the descriptor; every other command returns -ENOSYS.
   2187  static intptr_t FcntlTrap(const arch_seccomp_data& aArgs, void* aux) {
   2188    const auto cmd = static_cast<int>(aArgs.args[1]);
   2189    switch (cmd) {
   2190        // This process can't exec, so the actual close-on-exec flag
   2191        // doesn't matter; have it always read as true and ignore writes.
               // F_GETFD reports file *descriptor* flags, so the bit callers
               // test is FD_CLOEXEC; O_CLOEXEC is an open(2) flag with a
               // different value and would read back as "not set".
   2192      case F_GETFD:
   2193        return FD_CLOEXEC;
   2194      case F_SETFD:
   2195        return 0;
   2196      default:
   2197        return -ENOSYS;
   2198    }
   2199  }
   2200 
         // Builds the filter condition for message flags: true exactly when
         // MSG_OOB is not set in aFlags.  Used below for recvmmsg/sendmmsg; it
         // is an override, so presumably the base policy consults it as well.
   2201  BoolExpr MsgFlagsAllowed(const Arg<int>& aFlags) const override {
   2202    // Necko might use advanced networking features, and the sandbox
   2203    // is relatively permissive compared to content, so this is a
   2204    // default-allow policy.
   2205    //
   2206    // However, `MSG_OOB` has historically been buggy, and the way it
   2207    // maps to TCP is notoriously broken (see RFC 6093), so it should
   2208    // be safe to block.
   2209    return (aFlags & MSG_OOB) == 0;
   2210  }
   2211 
         // Socket-call rules for the socket process.  socket/connect/bind and
         // the sockopt/name/shutdown/accept calls are allowed outright.
         // recvmmsg/sendmmsg are allowed when their flags (argument 3) pass
         // MsgFlagsAllowed and are otherwise an invalid syscall; when the
         // arguments are not available (aHasArgs is false) the result of
         // UnpackSocketcallOrAllow() is used instead.  Everything else uses the
         // common policy.
   2212  Maybe<ResultExpr> EvaluateSocketCall(int aCall,
   2213                                       bool aHasArgs) const override {
   2214    switch (aCall) {
   2215      case SYS_SOCKET:
   2216      case SYS_CONNECT:
   2217      case SYS_BIND:
   2218        return Some(Allow());
   2219 
   2220      // sendmsg and recvmmsg needed for HTTP3/QUIC UDP IO. Note sendmsg is
   2221      // allowed in SandboxPolicyCommon.
   2222      case SYS_RECVMMSG:
   2223      // Required for the DNS Resolver thread.
   2224      case SYS_SENDMMSG:
   2225        if (aHasArgs) {
   2226          Arg<int> flags(3);
   2227          return Some(
   2228              If(MsgFlagsAllowed(flags), Allow()).Else(InvalidSyscall()));
   2229        }
   2230        return Some(UnpackSocketcallOrAllow());
   2231 
   2232      case SYS_GETSOCKOPT:
   2233      case SYS_SETSOCKOPT:
   2234      case SYS_GETSOCKNAME:
   2235      case SYS_GETPEERNAME:
   2236      case SYS_SHUTDOWN:
   2237      case SYS_ACCEPT:
   2238      case SYS_ACCEPT4:
   2239        return Some(Allow());
   2240 
   2241      default:
   2242        return SandboxPolicyCommon::EvaluateSocketCall(aCall, aHasArgs);
   2243    }
   2244  }
   2245 
         // prctl rules for the socket process, switching on the operation
         // (argument 0).  PR_SET_VMA is allowed only with PR_SET_VMA_ANON_NAME
         // as argument 1; PR_SET_NAME, PR_SET_DUMPABLE and PR_SET_PTRACER are
         // allowed; PR_GET_PDEATHSIG is allowed only in MOZ_PROFILE_GENERATE
         // builds; every other operation is an invalid syscall.  This does not
         // chain to SandboxPolicyCommon::PrctlPolicy().
   2246  ResultExpr PrctlPolicy() const override {
   2247    Arg<int> op(0);
   2248    Arg<int> arg2(1);
   2249    return Switch(op)
   2250        .Case(PR_SET_VMA,  // Tagging of anonymous memory mappings
   2251              If(arg2 == PR_SET_VMA_ANON_NAME, Allow()).Else(InvalidSyscall()))
   2252        .Cases({PR_SET_NAME,      // Thread creation
   2253                PR_SET_DUMPABLE,  // Crash reporting
   2254                PR_SET_PTRACER},  // Debug-mode crash handling
   2255               Allow())
   2256 #if defined(MOZ_PROFILE_GENERATE)
   2257        .Case(PR_GET_PDEATHSIG, Allow())
   2258 #endif  // defined(MOZ_PROFILE_GENERATE)
   2259        .Default(InvalidSyscall());
   2260  }
   2261 
   2262  ResultExpr EvaluateSyscall(int sysno) const override {
   2263    switch (sysno) {
   2264      case __NR_getrusage:
   2265        return Allow();
   2266 
   2267      case __NR_ioctl: {
   2268        Arg<unsigned long> request(1);
   2269        auto shifted_type = request & kIoctlTypeMask;
   2270 
   2271        // Rust's stdlib seems to use FIOCLEX instead of equivalent fcntls.
   2272        return Switch(request)
   2273            .Case(FIOCLEX, Allow())
   2274            // Rust's stdlib also uses FIONBIO instead of equivalent fcntls.
   2275            .Case(FIONBIO, Allow())
   2276            // This is used by PR_Available in nsSocketInputStream::Available.
   2277            .Case(FIONREAD, Allow())
   2278            // WebRTC needs interface information (bug 1975576)
   2279            .Cases({SIOCGIFNAME, SIOCGIFFLAGS, SIOCETHTOOL, SIOCGIWRATE},
   2280                   Allow())
   2281            .Default(
   2282                // Allow anything that isn't a tty ioctl (if level < 2)
   2283                If(BelowLevel(2) ? shifted_type != kTtyIoctls
   2284                                 : BoolConst(false),
   2285                   Allow())
   2286                    .Else(SandboxPolicyCommon::EvaluateSyscall(sysno)));
   2287      }
   2288 
   2289      CASES_FOR_fcntl: {
   2290        Arg<int> cmd(1);
   2291        return Switch(cmd)
   2292            .Case(F_DUPFD_CLOEXEC, Allow())
   2293            // Nvidia GL and fontconfig (newer versions) use fcntl file locking.
   2294            .Case(F_SETLK, Allow())
   2295 #ifdef F_SETLK64
   2296            .Case(F_SETLK64, Allow())
   2297 #endif
   2298            // Pulseaudio uses F_SETLKW, as does fontconfig.
   2299            .Case(F_SETLKW, Allow())
   2300 #ifdef F_SETLKW64
   2301            .Case(F_SETLKW64, Allow())
   2302 #endif
   2303            .Default(SandboxPolicyCommon::EvaluateSyscall(sysno));
   2304      }
   2305 
   2306 #ifdef DESKTOP
   2307      // This section is borrowed from ContentSandboxPolicy
   2308      CASES_FOR_getrlimit:
   2309      CASES_FOR_getresuid:
   2310      CASES_FOR_getresgid:
   2311        return Allow();
   2312 
   2313      case __NR_prlimit64: {
   2314        // Allow only the getrlimit() use case.  (glibc seems to use
   2315        // only pid 0 to indicate the current process; pid == getpid()
   2316        // is equivalent and could also be allowed if needed.)
   2317        Arg<pid_t> pid(0);
   2318        // This is really a const struct ::rlimit*, but Arg<> doesn't
   2319        // work with pointers, only integer types.
   2320        Arg<uintptr_t> new_limit(2);
   2321        return If(AllOf(pid == 0, new_limit == 0), Allow())
   2322            .Else(InvalidSyscall());
   2323      }
   2324 #endif  // DESKTOP
   2325 
   2326      default:
   2327        return SandboxPolicyCommon::EvaluateSyscall(sysno);
   2328    }
   2329  }
   2330 };
   2331 
   2332 UniquePtr<sandbox::bpf_dsl::Policy> GetSocketProcessSandboxPolicy(
   2333    SandboxBrokerClient* aMaybeBroker, SocketProcessSandboxParams&& aParams) {
   2334  return UniquePtr<sandbox::bpf_dsl::Policy>(
   2335      new SocketProcessSandboxPolicy(aMaybeBroker, std::move(aParams)));
   2336 }
   2337 
   2338 class UtilitySandboxPolicy : public SandboxPolicyCommon {
   2339 public:
   2340  explicit UtilitySandboxPolicy(SandboxBrokerClient* aBroker) {
   2341    mBroker = aBroker;
   2342    mMayCreateShmem = true;
   2343  }
   2344 
   2345  ResultExpr PrctlPolicy() const override {
   2346    Arg<int> op(0);
   2347    Arg<int> arg2(1);
   2348    return Switch(op)
   2349        .Case(PR_SET_VMA,  // Tagging of anonymous memory mappings
   2350              If(arg2 == PR_SET_VMA_ANON_NAME, Allow()).Else(InvalidSyscall()))
   2351        .Cases({PR_SET_NAME,        // Thread creation
   2352                PR_SET_DUMPABLE,    // Crash reporting
   2353                PR_SET_PTRACER,     // Debug-mode crash handling
   2354                PR_GET_PDEATHSIG},  // PGO profiling, cf
   2355                                    // https://reviews.llvm.org/D29954
   2356               Allow())
   2357        .Case(PR_CAPBSET_READ,  // libcap.so.2 loaded by libpulse.so.0
   2358                                // queries for capabilities
   2359              Error(EINVAL))
   2360 #if defined(MOZ_PROFILE_GENERATE)
   2361        .Case(PR_GET_PDEATHSIG, Allow())
   2362 #endif  // defined(MOZ_PROFILE_GENERATE)
   2363        .Default(InvalidSyscall());
   2364  }
   2365 
   2366  ResultExpr EvaluateSyscall(int sysno) const override {
   2367    switch (sysno) {
   2368      case __NR_getrusage:
   2369        return Allow();
   2370 
   2371      // Required by FFmpeg
   2372      case __NR_get_mempolicy:
   2373        return Allow();
   2374 
   2375      // Required by libnuma for FFmpeg
   2376      case __NR_sched_getaffinity: {
   2377        Arg<pid_t> pid(0);
   2378        return If(pid == 0, Allow()).Else(Trap(SchedTrap, nullptr));
   2379      }
   2380 
   2381      // Required by libnuma for FFmpeg
   2382      case __NR_set_mempolicy:
   2383        return Error(ENOSYS);
   2384 
   2385      // Pass through the common policy.
   2386      default:
   2387        return SandboxPolicyCommon::EvaluateSyscall(sysno);
   2388    }
   2389  }
   2390 };
   2391 
   2392 UniquePtr<sandbox::bpf_dsl::Policy> GetUtilitySandboxPolicy(
   2393    SandboxBrokerClient* aMaybeBroker) {
   2394  return UniquePtr<sandbox::bpf_dsl::Policy>(
   2395      new UtilitySandboxPolicy(aMaybeBroker));
   2396 }
   2397 
   2398 }  // namespace mozilla