tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

process_watcher_posix_sigchld.cc (20590B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include <errno.h>
      8 #include <fcntl.h>
      9 #include <mutex>
     10 #include <signal.h>
     11 #include <sys/types.h>
     12 #include <sys/wait.h>
     13 #include <unistd.h>
     14 
     15 #include "base/eintr_wrapper.h"
     16 #include "base/logging.h"
     17 #include "base/message_loop.h"
     18 #include "base/process_util.h"
     19 #include "mozilla/DataMutex.h"
     20 #include "mozilla/StaticPtr.h"
     21 #include "mozilla/ipc/IOThread.h"
     22 #include "nsITimer.h"
     23 #include "nsTArray.h"
     24 #include "nsThreadUtils.h"
     25 #include "nsXULAppAPI.h"
     26 #include "prenv.h"
     27 
     28 #include "chrome/common/process_watcher.h"
     29 
     30 #ifdef MOZ_ENABLE_FORKSERVER
     31 #  include "mozilla/ipc/ForkServiceChild.h"
     32 #endif
     33 
     34 #if defined(XP_LINUX) && !defined(ANDROID)
     35 #  include "mozilla/AvailableMemoryWatcher.h"
     36 #  include "mozilla/glean/XpcomMetrics.h"
     37 #  include "nsPrintfCString.h"
     38 #endif
     39 
     40 // Just to make sure the moz.build is doing the right things with
     41 // TARGET_OS and/or OS_TARGET:
     42 #if defined(MOZ_WIDGET_ANDROID) || defined(MOZ_WIDGET_UIKIT)
     43 #  error Unsupported OS
     44 #endif
     45 
     46 #if !defined(XP_DARWIN)
     47 // Linux, {Free,Net,Open}BSD, and Solaris; but not macOS, yet.
     48 #  define HAVE_PIPE2 1
     49 #endif
     50 
     51 // The basic idea here is a minimal SIGCHLD handler which writes to a
     52 // pipe and a libevent callback on the I/O thread which fires when the
     53 // other end becomes readable.  When we start waiting for process
     54 // termination we check if it had already terminated, and otherwise
     55 // register it to be checked later when SIGCHLD fires.
     56 //
     57 // Making this more complicated is that we usually want to kill the
     58 // process after a timeout, in case it hangs trying to exit, but not
     59 // if it's already exited by that point (see `DelayedKill`).
     60 // But we also support waiting indefinitely, for debug/CI use cases
     61 // like refcount logging / leak detection / code coverage, and in that
     62 // case we block parent process shutdown until all children exit
     63 // (which is done by blocking the I/O thread late in shutdown, which
     64 // isn't ideal, but the Windows implementation has the same issue).
     65 
     66 // Maximum amount of time (in milliseconds) to wait for the process to exit.
     67 // XXX/cjones: fairly arbitrary, chosen to match process_watcher_win.cc
     68 static constexpr int kMaxWaitMs = 2000;
     69 
     70 // This is also somewhat arbitrary, but loosely based on Try results.
     71 // See also toolkit.asyncshutdown.crash_timeout (currently 60s) after
     72 // which the parent process will be killed.
     73 #ifdef MOZ_CODE_COVERAGE
     74 // Code coverage instrumentation can be slow (especially when writing
     75 // out data, which has to take a lock on the data files).
     76 static constexpr int kShutdownWaitMs = 80000;
     77 #elif defined(MOZ_ASAN) || defined(MOZ_TSAN)
     78 // Sanitizers slow things down in some cases; see bug 1806224.
     79 static constexpr int kShutdownWaitMs = 40000;
     80 #else
     81 static constexpr int kShutdownWaitMs = 8000;
     82 #endif
     83 
     84 namespace {
     85 
     86 using base::BlockingWait;
     87 
     88 // Represents a child process being awaited (which is expected to exit
     89 // soon, or already has).
     90 //
     91 // If `mForce` is null then we will wait indefinitely (and block
     92 // parent shutdown; see above); otherwise it will be killed after a
     93 // timeout (or during parent shutdown, if that happens first).
     94 struct PendingChild {
     95  pid_t mPid;
     96  nsCOMPtr<nsITimer> mForce;
     97 };
     98 
     99 // `EnsureProcessTerminated` is called when a process is expected to
    100 // be shutting down, so there should be relatively few `PendingChild`
    101 // instances at any given time, meaning that using an array and doing
    102 // O(n) operations should be fine.
    103 static mozilla::StaticDataMutex<mozilla::StaticAutoPtr<nsTArray<PendingChild>>>
    104    gPendingChildren("ProcessWatcher::gPendingChildren");
    105 static int gSignalPipe[2] = {-1, -1};
    106 static mozilla::Atomic<bool> gProcessWatcherShutdown;
    107 
    108 #if defined(XP_LINUX) && !defined(ANDROID)
    109 // Record Glean event when a content process is killed by OOM killer
    110 static void RecordContentProcessOOMKilled() {
    111  // Get PSI data
    112  mozilla::PSIInfo psi;
    113  nsresult rv = mozilla::GetLastPSISnapshot(psi);
    114 
    115  if (NS_SUCCEEDED(rv)) {
    116    // Record Glean event with PSI metrics
    117    mozilla::glean::memory_watcher::process_oom_killed.Record(
    118        mozilla::Some(mozilla::glean::memory_watcher::ProcessOomKilledExtra{
    119            mozilla::Some(nsPrintfCString("%lu", psi.some_avg10)),
    120            mozilla::Some(nsPrintfCString("%lu", psi.some_avg60)),
    121            mozilla::Some(nsPrintfCString("%lu", psi.full_avg10)),
    122            mozilla::Some(nsPrintfCString("%lu", psi.full_avg60)),
    123        }));
    124  }
    125 }
    126 #endif
    127 
    128 // A wrapper around WaitForProcess to simplify the result (true if the
    129 // process exited and the pid is now freed for reuse, false if it's
    130 // still running), and handle the case where "blocking" mode doesn't
    131 // block (so this function will always return true if `aBlock` is
    132 // `YES`), and log a warning message if the process didn't exit
    133 // successfully (as in `exit(0)`).
    134 static bool IsProcessDead(pid_t pid, BlockingWait aBlock) {
    135  int info = 0;
    136 
    137  auto status = WaitForProcess(pid, aBlock, &info);
    138  while (aBlock == BlockingWait::Yes &&
    139         status == base::ProcessStatus::Running) {
    140    // It doesn't matter if this is interrupted; we just need to
    141    // wait for some amount of time while the other process status
    142    // event is (hopefully) handled.  This is used only during an
    143    // error case at shutdown, so a 1s wait won't be too noticeable.
    144    sleep(1);
    145    status = WaitForProcess(pid, aBlock, &info);
    146  }
    147 
    148  switch (status) {
    149    case base::ProcessStatus::Running:
    150      return false;
    151 
    152    case base::ProcessStatus::Exited:
    153      if (info != 0) {
    154        CHROMIUM_LOG(WARNING)
    155            << "process " << pid << " exited with status " << info;
    156      }
    157      return true;
    158 
    159    case base::ProcessStatus::Killed:
    160      CHROMIUM_LOG(WARNING)
    161          << "process " << pid << " exited on signal " << info;
    162 #if defined(XP_LINUX) && !defined(ANDROID)
    163      // Record telemetry for OOM kills
    164      if (info == SIGKILL) {
    165        NS_DispatchToMainThread(
    166            NS_NewRunnableFunction("ContentProcessOOMTelemetry",
    167                                   []() { RecordContentProcessOOMKilled(); }));
    168      }
    169 #endif
    170      return true;
    171 
    172    case base::ProcessStatus::Error:
    173      CHROMIUM_LOG(ERROR) << "waiting for process " << pid
    174                          << " failed with error " << info;
    175      // Don't keep trying.
    176      return true;
    177 
    178    default:
    179      DCHECK(false) << "can't happen";
    180      return true;
    181  }
    182 }
    183 
    184 // Creates a timer to kill the process after a delay, for the
    185 // `force=true` case.  The timer is bound to the I/O thread, which
    186 // means it needs to be cancelled there (and thus that child exit
    187 // notifications need to be handled on the I/O thread).
    188 already_AddRefed<nsITimer> DelayedKill(pid_t aPid) {
    189  nsCOMPtr<nsITimer> timer;
    190 
    191  nsresult rv = NS_NewTimerWithCallback(
    192      getter_AddRefs(timer),
    193      [aPid](nsITimer*) {
    194        // If the process already exited, normally it would remain as
    195        // a zombie and the `SIGKILL` would be ignored.  But if the
    196        // fork server crashed, then the child would be reparented to
    197        // pid 1 and cleaned up immediately, so in that case we should
    198        // not try to signal.
    199        if (IsProcessDead(aPid, BlockingWait::No)) {
    200          return;
    201        }
    202        // In theory it's possible for the fork server to crash and
    203        // the child process to exit and have its pid reused by a new
    204        // process all between these two statements, but that is
    205        // *extremely* unlikely.
    206        if (kill(aPid, SIGKILL) != 0) {
    207          const int err = errno;
    208      // Bug 1944669: suppress logging if it's a forkserver child
    209      // process that already exited.  (Before bug 1658072 we
    210      // would kill(pid, 0) first, but that doesn't change
    211      // anything.)  This can be removed with bug 1752638.
    212 #ifdef MOZ_ENABLE_FORKSERVER
    213          const bool forkServed = mozilla::ipc::ForkServiceChild::WasUsed();
    214 #else
    215          constexpr bool forkServed = false;
    216 #endif
    217          if (err != ESRCH || !forkServed) {
    218            CHROMIUM_LOG(ERROR) << "failed to send SIGKILL to process " << aPid
    219                                << strerror(err);
    220          }
    221        }
    222        // If the process was still running, it will exit and the
    223        // SIGCHLD handler will waitpid it.
    224      },
    225      kMaxWaitMs, nsITimer::TYPE_ONE_SHOT, "ProcessWatcher::DelayedKill"_ns,
    226      XRE_GetAsyncIOEventTarget());
    227 
    228  // This should happen only during shutdown, in which case we're
    229  // about to kill the process anyway during I/O thread destruction.
    230  if (NS_FAILED(rv)) {
    231    CHROMIUM_LOG(WARNING) << "failed to start kill timer for process " << aPid
    232                          << "; killing immediately";
    233    kill(aPid, SIGKILL);
    234    return nullptr;
    235  }
    236 
    237  return timer.forget();
    238 }
    239 
    240 bool CrashProcessIfHanging(pid_t aPid) {
    241  if (IsProcessDead(aPid, BlockingWait::No)) {
    242    return false;
    243  }
    244 
    245  // If child processes seems to be hanging on shutdown, wait for a
    246  // reasonable time.  The wait is global instead of per-process
    247  // because the child processes should be shutting down in
    248  // parallel, and also we're potentially racing global timeouts
    249  // like nsTerminator.  (The counter doesn't need to be atomic;
    250  // this is always called on the I/O thread.)
    251  static int sWaitMs = kShutdownWaitMs;
    252  if (sWaitMs > 0) {
    253    CHROMIUM_LOG(WARNING) << "Process " << aPid
    254                          << " may be hanging at shutdown; will wait for up to "
    255                          << sWaitMs << "ms";
    256  }
    257  // There isn't a way to do a time-limited wait that's both
    258  // portable and doesn't require messing with signals.  Instead, we
    259  // sleep in short increments and poll the process status.
    260  while (sWaitMs > 0) {
    261    static constexpr int kWaitTickMs = 200;
    262    struct timespec ts = {kWaitTickMs / 1000, (kWaitTickMs % 1000) * 1000000};
    263    HANDLE_EINTR(nanosleep(&ts, &ts));
    264    sWaitMs -= kWaitTickMs;
    265 
    266    if (IsProcessDead(aPid, BlockingWait::No)) {
    267      return false;
    268    }
    269  }
    270 
    271  // We want TreeHerder to flag this log line as an error, so that
    272  // this is more obviously a deliberate crash; "fatal error" is one
    273  // of the strings it looks for.
    274  CHROMIUM_LOG(ERROR)
    275      << "Process " << aPid
    276      << " hanging at shutdown; attempting crash report (fatal error).";
    277 
    278  kill(aPid, SIGABRT);
    279  return true;
    280 }
    281 
    282 // Most of the logic is here.  Reponds to SIGCHLD via the self-pipe,
    283 // and handles shutdown behavior in `WillDestroyCurrentMessageLoop`.
    284 // There is one instance of this class; it's created the first time
    285 // it's used and destroys itself during IPC shutdown.
    286 class ProcessCleaner final : public MessageLoopForIO::Watcher,
    287                             public MessageLoop::DestructionObserver {
    288 public:
    289  // Safety: this must be called on the I/O thread.
    290  void Register() {
    291    MessageLoopForIO* loop = MessageLoopForIO::current();
    292    loop->AddDestructionObserver(this);
    293    loop->WatchFileDescriptor(gSignalPipe[0], /* persistent= */ true,
    294                              MessageLoopForIO::WATCH_READ, &mWatcher, this);
    295  }
    296 
    297  void OnFileCanReadWithoutBlocking(int fd) override {
    298    DCHECK(fd == gSignalPipe[0]);
    299    ssize_t rv;
    300    // Drain the pipe and prune dead processes.
    301    do {
    302      char msg[32];
    303      rv = HANDLE_EINTR(read(gSignalPipe[0], msg, sizeof msg));
    304      CHECK(rv != 0);
    305      if (rv < 0) {
    306        DCHECK(errno == EAGAIN || errno == EWOULDBLOCK);
    307      } else {
    308 #ifdef DEBUG
    309        for (size_t i = 0; i < (size_t)rv; ++i) {
    310          DCHECK(msg[i] == 0);
    311        }
    312 #endif
    313      }
    314    } while (rv > 0);
    315    PruneDeadProcesses();
    316  }
    317 
    318  void OnFileCanWriteWithoutBlocking(int fd) override {
    319    CHROMIUM_LOG(FATAL) << "unreachable";
    320  }
    321 
    322  void WillDestroyCurrentMessageLoop() override {
    323    gProcessWatcherShutdown = true;
    324    mWatcher.StopWatchingFileDescriptor();
    325    auto lock = gPendingChildren.Lock();
    326    auto& children = lock.ref();
    327    if (children) {
    328      for (const auto& child : *children) {
    329        // If the child still has force-termination pending, do that now.
    330        if (child.mForce) {
    331          // This is too late for timers to run, so no need to Cancel().
    332          //
    333          // FIXME (bug 1724337, approximately): This code isn't run at
    334          // all in practice, because the parent process will already have
    335          // exited (unless the fastShutdownStage pref is changed).
    336          if (kill(child.mPid, SIGKILL) != 0) {
    337            CHROMIUM_LOG(ERROR)
    338                << "failed to send SIGKILL to process " << child.mPid;
    339            continue;
    340          }
    341        } else {
    342          // Exception for the fake hang tests in ipc/glue/test/browser
    343          // (See also the comment in `~ProcessChild()`.)
    344          if (!PR_GetEnv("MOZ_TEST_CHILD_EXIT_HANG") &&
    345              !CrashProcessIfHanging(child.mPid)) {
    346            continue;
    347          }
    348        }
    349        // If the process was just killed, it should exit immediately;
    350        // otherwise, block until it exits on its own.
    351        IsProcessDead(child.mPid, BlockingWait::Yes);
    352      }
    353      children = nullptr;
    354    }
    355 #ifdef MOZ_ENABLE_FORKSERVER
    356    mozilla::ipc::ForkServiceChild::StopForkServer();
    357 #endif
    358    delete this;
    359  }
    360 
    361 private:
    362  MessageLoopForIO::FileDescriptorWatcher mWatcher;
    363 
    364  static void PruneDeadProcesses() {
    365    auto lock = gPendingChildren.Lock();
    366    auto& children = lock.ref();
    367    if (!children || children->IsEmpty()) {
    368      return;
    369    }
    370    nsTArray<PendingChild> live;
    371    for (const auto& child : *children) {
    372      if (IsProcessDead(child.mPid, BlockingWait::No)) {
    373        if (child.mForce) {
    374          child.mForce->Cancel();
    375        }
    376      } else {
    377        live.AppendElement(child);
    378      }
    379    }
    380    *children = std::move(live);
    381  }
    382 };
    383 
    384 static void HandleSigChld(int signum) {
    385  DCHECK(signum == SIGCHLD);
    386  char msg = 0;
    387  HANDLE_EINTR(write(gSignalPipe[1], &msg, 1));
    388  // Can't log here if this fails (at least not normally; SafeSPrintf
    389  // from security/sandbox/chromium could be used).
    390  //
    391  // (Note that this could fail with EAGAIN if the pipe buffer becomes
    392  // full; this is extremely unlikely, and it doesn't matter because
    393  // the reader will be woken up regardless and doesn't care about the
    394  // number of signals delivered.)
    395 }
    396 
    397 static void ProcessWatcherInit() {
    398  int rv;
    399 
    400 #ifdef HAVE_PIPE2
    401  rv = pipe2(gSignalPipe, O_NONBLOCK | O_CLOEXEC);
    402  CHECK(rv == 0)
    403  << "pipe2() failed";
    404 #else
    405  rv = pipe(gSignalPipe);
    406  CHECK(rv == 0)
    407  << "pipe() failed";
    408  for (int fd : gSignalPipe) {
    409    rv = fcntl(fd, F_SETFL, O_NONBLOCK);
    410    CHECK(rv == 0)
    411    << "O_NONBLOCK failed";
    412    rv = fcntl(fd, F_SETFD, FD_CLOEXEC);
    413    CHECK(rv == 0)
    414    << "FD_CLOEXEC failed";
    415  }
    416 #endif  // HAVE_PIPE2
    417 
    418  // Currently there are no other SIGCHLD handlers; this is debug
    419  // asserted.  If the situation changes, it should be relatively
    420  // simple to delegate; note that this ProcessWatcher doesn't
    421  // interfere with child processes it hasn't been asked to handle.
    422  auto oldHandler = signal(SIGCHLD, HandleSigChld);
    423  CHECK(oldHandler != SIG_ERR);
    424  DCHECK(oldHandler == SIG_DFL);
    425 
    426  // Start the ProcessCleaner; registering it with the I/O thread must
    427  // happen on the I/O thread itself.  It's okay for that to happen
    428  // asynchronously: the callback is level-triggered, so if the signal
    429  // handler already wrote to the pipe at that point then it will be
    430  // detected, and the signal itself is async so additional delay
    431  // doesn't change the semantics.
    432  XRE_GetAsyncIOEventTarget()->Dispatch(
    433      NS_NewRunnableFunction("ProcessCleaner::Register", [] {
    434        ProcessCleaner* pc = new ProcessCleaner();
    435        pc->Register();
    436      }));
    437 }
    438 
    439 static void EnsureProcessWatcher() {
    440  static std::once_flag sInited;
    441  std::call_once(sInited, ProcessWatcherInit);
    442 }
    443 
    444 }  // namespace
    445 
    446 mozilla::UniqueFileHandle ProcessWatcher::GetSignalPipe() {
    447  EnsureProcessWatcher();
    448  int fd = gSignalPipe[1];
    449  MOZ_ASSERT(fd >= 0);
    450  fd = dup(fd);
    451  MOZ_ASSERT(fd >= 0);
    452  return mozilla::UniqueFileHandle(fd);
    453 }
    454 
    455 /**
    456 * Do everything possible to ensure that |process| has been reaped
    457 * before this process exits.
    458 *
    459 * |force| decides how strict to be with the child's shutdown.
    460 *
    461 *                | child exit timeout | upon parent shutdown:
    462 *                +--------------------+----------------------------------
    463 *   force=true   | 2 seconds          | kill(child, SIGKILL)
    464 *   force=false  | infinite           | waitpid(child)
    465 *
    466 * If a child process doesn't shut down properly, and |force=false|
    467 * used, then the parent will wait on the child forever.  So,
    468 * |force=false| is expected to be used when an external entity can be
    469 * responsible for terminating hung processes, e.g. automated test
    470 * harnesses.
    471 */
    472 void ProcessWatcher::EnsureProcessTerminated(base::ProcessHandle process,
    473                                             bool force) {
    474  DCHECK(process != base::GetCurrentProcId());
    475  DCHECK(process > 0);
    476 
    477  if (gProcessWatcherShutdown) {
    478    // This late in shutdown, should only come from the I/O thread;
    479    // see further comments below.
    480    mozilla::ipc::AssertIOThread();
    481    // This should always be true given that gProcessWatcherShutdown
    482    // is set, but just in case something changes with MessageLoop
    483    // shutdown:
    484    DCHECK(!MessageLoop::current()->IsAcceptingTasks());
    485 
    486    // This is for the fork server itself, being torn down late
    487    // in shutdown.  Generally won't be reached with force=true,
    488    // because build types that default to it will QuickExit first.
    489    // It's not strictly necessary to wait for child processes when
    490    // the parent process is about to exit (pid 1 should clean them
    491    // up).
    492    //
    493    // However, if called in "wait forever" mode, let's wait for it
    494    // and log the exit status if it was abnormal:
    495    if (!force) {
    496      (void)IsProcessDead(process, BlockingWait::Yes);
    497    }
    498    return;
    499  }
    500 
    501  EnsureProcessWatcher();
    502 
    503  auto lock = gPendingChildren.Lock();
    504  auto& children = lock.ref();
    505 
    506  // Check if the process already exited.  This needs to happen under
    507  // the `gPendingChildren` lock to prevent this sequence:
    508  //
    509  // A1. this non-blocking wait fails
    510  // B1. the process exits
    511  // B2. SIGCHLD is handled
    512  // B3. the ProcessCleaner wakes up and drains the signal pipe
    513  // A2. the process is added to `gPendingChildren`
    514  //
    515  // Holding the lock prevents B3 from occurring between A1 and A2.
    516  if (IsProcessDead(process, BlockingWait::No)) {
    517    return;
    518  }
    519 
    520  if (!children) {
    521    children = new nsTArray<PendingChild>();
    522  }
    523  // Check for duplicate pids.  This is safe even in corner cases with
    524  // pid reuse: the pid can't be reused by the OS until the zombie
    525  // process has been waited, and both the `waitpid` and the following
    526  // removal of the `PendingChild` object occur while continually
    527  // holding the lock, which is also held here.
    528  for (const auto& child : *children) {
    529    if (child.mPid == process) {
    530 #ifdef MOZ_ENABLE_FORKSERVER
    531      if (mozilla::ipc::ForkServiceChild::WasUsed()) {
    532        // Ideally, this would never be reached.  But, in theory it's
    533        // possible if the fork server crashes and is restarted: the
    534        // process will be reparented to pid 1 which will clean it up
    535        // immediately, at which point the pid could be reused (but
    536        // it's very unlikely for that to happen so soon).  So, if
    537        // this is reached without any mistakes by the calling code,
    538        // in that case the old process has already terminated and
    539        // ProcessWatcher has no more responsibility for it.
    540 
    541        CHROMIUM_LOG(WARNING) << "EnsureProcessTerminated: duplicate process"
    542                                 " ID "
    543                              << process;
    544 
    545        // So, we want to end up with a PendingChild for the new
    546        // process; we can just use the old one.  Ideally we'd fix the
    547        // `mForce` value, but that would involve needing to cancel a
    548        // timer when we aren't necessarily on the right thread, and
    549        // in practice the `force` parameter depends only on the build
    550        // type.
    551        return;
    552      }
    553 #endif
    554      MOZ_ASSERT(false,
    555                 "EnsureProcessTerminated must be called at most once for a "
    556                 "given process");
    557      return;
    558    }
    559  }
    560 
    561  PendingChild child{};
    562  child.mPid = process;
    563  if (force) {
    564    child.mForce = DelayedKill(process);
    565  }
    566  children->AppendElement(std::move(child));
    567 }