thread_local_storage.cc (22927B)
1 // Copyright 2014 The Chromium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/threading/thread_local_storage.h" 6 7 #include <algorithm> 8 #include <atomic> 9 10 #include "base/check_op.h" 11 #include "base/compiler_specific.h" 12 #include "base/memory/raw_ptr_exclusion.h" 13 #include "base/notreached.h" 14 #include "base/synchronization/lock.h" 15 #include "build/build_config.h" 16 17 #if BUILDFLAG(IS_MAC) && defined(ARCH_CPU_X86_64) 18 #include <pthread.h> 19 #include <type_traits> 20 #endif 21 22 using base::internal::PlatformThreadLocalStorage; 23 24 // Chrome Thread Local Storage (TLS) 25 // 26 // This TLS system allows Chrome to use a single OS level TLS slot process-wide, 27 // and allows us to control the slot limits instead of being at the mercy of the 28 // platform. To do this, Chrome TLS replicates an array commonly found in the OS 29 // thread metadata. 30 // 31 // Overview: 32 // 33 // OS TLS Slots Per-Thread Per-Process Global 34 // ... 35 // [] Chrome TLS Array Chrome TLS Metadata 36 // [] ----------> [][][][][ ][][][][] [][][][][ ][][][][] 37 // [] | | 38 // ... V V 39 // Metadata Version Slot Information 40 // Your Data! 41 // 42 // Using a single OS TLS slot, Chrome TLS allocates an array on demand for the 43 // lifetime of each thread that requests Chrome TLS data. Each per-thread TLS 44 // array matches the length of the per-process global metadata array. 45 // 46 // A per-process global TLS metadata array tracks information about each item in 47 // the per-thread array: 48 // * Status: Tracks if the slot is allocated or free to assign. 49 // * Destructor: An optional destructor to call on thread destruction for that 50 // specific slot. 51 // * Version: Tracks the current version of the TLS slot. Each TLS slot 52 // allocation is associated with a unique version number. 
//
//              Most OS TLS APIs guarantee that a newly allocated TLS slot is
//              initialized to 0 for all threads. The Chrome TLS system provides
//              this guarantee by tracking the version for each TLS slot here
//              on each per-thread Chrome TLS array entry. Threads that access
//              a slot with a mismatched version will receive 0 as their value.
//              The metadata version is incremented when the client frees a
//              slot. The per-thread metadata version is updated when a client
//              writes to the slot. This scheme allows for constant time
//              invalidation and avoids the need to iterate through each Chrome
//              TLS array to mark the slot as zero.
//
// Just like an OS TLS API, clients of the Chrome TLS are responsible for
// managing any necessary lifetime of the data in their slots. The only
// convenience provided is automatic destruction when a thread ends. If a client
// frees a slot, that client is responsible for destroying the data in the slot.

namespace {
// In order to make TLS destructors work, we need to keep around a function
// pointer to the destructor for each slot. We keep this array of pointers in a
// global (static) array.
// We use the single OS-level TLS slot (giving us one pointer per thread) to
// hold a pointer to a per-thread array (table) of slots that we allocate to
// Chromium consumers.

// g_native_tls_key is the one native TLS key that we use. Its per-thread value
// stores our table. It holds TLS_KEY_OUT_OF_INDEXES until the first call to
// ConstructTlsVector() allocates the real key.

std::atomic<PlatformThreadLocalStorage::TLSKey> g_native_tls_key{
    PlatformThreadLocalStorage::TLS_KEY_OUT_OF_INDEXES};

// The OS TLS slot has the following states. The TLS slot's lower 2 bits contain
// the state, the upper bits the TlsVectorEntry*.
//   * kUninitialized: No per-thread TLS state exists yet. Slot::Initialize()
//     and Slot::Set() with a non-null value create it; Slot::Get() simply
//     returns nullptr. kUninitialized must be null.
//   * kInUse: value has been created and is in use.
//   * kDestroying: Set when the thread is exiting prior to deleting any of the
//     values stored in the TlsVectorEntry*. This state is necessary so that
//     sequence/task checks won't be done while in the process of deleting the
//     tls entries (see comments in SequenceCheckerImpl for more details).
//   * kDestroyed: All of the values in the vector have been deallocated and
//     the TlsVectorEntry has been deleted.
//
// Final States:
//   * Windows: kDestroyed. Windows does not iterate through the OS TLS to clean
//     up the values.
//   * POSIX: kUninitialized. POSIX iterates through TLS until all slots contain
//     nullptr.
//
// More details on this design:
//   We need some type of thread-local state to indicate that the TLS system has
//   been destroyed. To do so, we leverage the multi-pass nature of destruction
//   of pthread_key.
//
//    a) After destruction of TLS system, we set the pthread_key to a sentinel
//       kDestroyed.
//    b) Slot::Get() and Slot::Set() DCHECK that the state is not kDestroyed,
//       and any system which might potentially invoke them after destruction
//       of TLS must check ThreadLocalStorage::HasBeenDestroyed().
//    c) After a full pass of the pthread_keys, on the next invocation of the
//       key's destructor (PlatformThreadLocalStorage::OnThreadExit()), we'll
//       then set the key to nullptr.
//    d) At this stage, the TLS system is back in its uninitialized state.
//    e) If in the second pass of destruction of pthread_keys something were to
//       re-initialize TLS [this should never happen! Since the only code which
//       uses Chrome TLS is Chrome controlled, we should really be striving for
//       single-pass destruction], then TLS will be re-initialized and then go
//       through the 2-pass destruction system again. Everything should just
//       work (TM).

// The state of the tls-entry.
122 enum class TlsVectorState { 123 kUninitialized = 0, 124 125 // In the process of destroying the entries in the vector. 126 kDestroying, 127 128 // All of the entries and the vector has been destroyed. 129 kDestroyed, 130 131 // The vector has been initialized and is in use. 132 kInUse, 133 134 kMaxValue = kInUse 135 }; 136 137 // Bit-mask used to store TlsVectorState. 138 constexpr uintptr_t kVectorStateBitMask = 3; 139 static_assert(static_cast<int>(TlsVectorState::kMaxValue) <= 140 kVectorStateBitMask, 141 "number of states must fit in header"); 142 static_assert(static_cast<int>(TlsVectorState::kUninitialized) == 0, 143 "kUninitialized must be null"); 144 145 // The maximum number of slots in our thread local storage stack. 146 constexpr size_t kThreadLocalStorageSize = 256; 147 148 enum TlsStatus { 149 FREE, 150 IN_USE, 151 }; 152 153 struct TlsMetadata { 154 TlsStatus status; 155 base::ThreadLocalStorage::TLSDestructorFunc destructor; 156 // Incremented every time a slot is reused. Used to detect reuse of slots. 157 uint32_t version; 158 // Tracks slot creation order. Used to destroy slots in the reverse order: 159 // from last created to first created. 160 uint32_t sequence_num; 161 }; 162 163 struct TlsVectorEntry { 164 // `data` is not a raw_ptr<...> for performance reasons (based on analysis of 165 // sampling profiler data and tab_search:top100:2020). 166 RAW_PTR_EXCLUSION void* data; 167 168 uint32_t version; 169 }; 170 171 // This lock isn't needed until after we've constructed the per-thread TLS 172 // vector, so it's safe to use. 173 base::Lock* GetTLSMetadataLock() { 174 static auto* lock = new base::Lock(); 175 return lock; 176 } 177 TlsMetadata g_tls_metadata[kThreadLocalStorageSize]; 178 size_t g_last_assigned_slot = 0; 179 uint32_t g_sequence_num = 0; 180 181 // The maximum number of times to try to clear slots by calling destructors. 182 // Use pthread naming convention for clarity. 
183 constexpr size_t kMaxDestructorIterations = kThreadLocalStorageSize; 184 185 // Sets the value and state of the vector. 186 void SetTlsVectorValue(PlatformThreadLocalStorage::TLSKey key, 187 TlsVectorEntry* tls_data, 188 TlsVectorState state) { 189 DCHECK(tls_data || (state == TlsVectorState::kUninitialized) || 190 (state == TlsVectorState::kDestroyed)); 191 PlatformThreadLocalStorage::SetTLSValue( 192 key, reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(tls_data) | 193 static_cast<uintptr_t>(state))); 194 } 195 196 // Returns the tls vector and current state from the raw tls value. 197 TlsVectorState GetTlsVectorStateAndValue(void* tls_value, 198 TlsVectorEntry** entry = nullptr) { 199 if (entry) { 200 *entry = reinterpret_cast<TlsVectorEntry*>( 201 reinterpret_cast<uintptr_t>(tls_value) & ~kVectorStateBitMask); 202 } 203 return static_cast<TlsVectorState>(reinterpret_cast<uintptr_t>(tls_value) & 204 kVectorStateBitMask); 205 } 206 207 // Returns the tls vector and state using the tls key. 208 TlsVectorState GetTlsVectorStateAndValue(PlatformThreadLocalStorage::TLSKey key, 209 TlsVectorEntry** entry = nullptr) { 210 // Only on x86_64, the implementation is not stable on ARM64. For instance, in 211 // macOS 11, the TPIDRRO_EL0 registers holds the CPU index in the low bits, 212 // which is not the case in macOS 12. See libsyscall/os/tsd.h in XNU 213 // (_os_tsd_get_direct() is used by pthread_getspecific() internally). 214 #if BUILDFLAG(IS_MAC) && defined(ARCH_CPU_X86_64) 215 // On macOS, pthread_getspecific() is in libSystem, so a call to it has to go 216 // through PLT. However, and contrary to some other platforms, *all* TLS keys 217 // are in a static array in the thread structure. So they are *always* at a 218 // fixed offset from the segment register holding the thread structure 219 // address. 220 // 221 // We could use _pthread_getspecific_direct(), but it is not 222 // exported. 
However, on all macOS versions we support, the TLS array is at 223 // %gs. This is used in V8 and PartitionAlloc, and can also be seen by looking 224 // at pthread_getspecific() disassembly: 225 // 226 // libsystem_pthread.dylib`pthread_getspecific: 227 // libsystem_pthread.dylib[0x7ff800316099] <+0>: movq %gs:(,%rdi,8), %rax 228 // libsystem_pthread.dylib[0x7ff8003160a2] <+9>: retq 229 // 230 // This function is essentially inlining the content of pthread_getspecific() 231 // here. 232 // 233 // Note that this likely ends up being even faster than thread_local for 234 // typical Chromium builds where the code is in a dynamic library. For the 235 // static executable case, this is likely equivalent. 236 static_assert( 237 std::is_same_v<PlatformThreadLocalStorage::TLSKey, pthread_key_t>, 238 "The special-case below assumes that the platform TLS implementation is " 239 "pthread."); 240 241 intptr_t platform_tls_value; 242 asm("movq %%gs:(,%1,8), %0;" : "=r"(platform_tls_value) : "r"(key)); 243 244 return GetTlsVectorStateAndValue(reinterpret_cast<void*>(platform_tls_value), 245 entry); 246 #else 247 return GetTlsVectorStateAndValue(PlatformThreadLocalStorage::GetTLSValue(key), 248 entry); 249 #endif 250 } 251 252 // This function is called to initialize our entire Chromium TLS system. 253 // It may be called very early, and we need to complete most all of the setup 254 // (initialization) before calling *any* memory allocator functions, which may 255 // recursively depend on this initialization. 256 // As a result, we use Atomics, and avoid anything (like a singleton) that might 257 // require memory allocations. 
258 TlsVectorEntry* ConstructTlsVector() { 259 PlatformThreadLocalStorage::TLSKey key = 260 g_native_tls_key.load(std::memory_order_relaxed); 261 if (key == PlatformThreadLocalStorage::TLS_KEY_OUT_OF_INDEXES) { 262 CHECK(PlatformThreadLocalStorage::AllocTLS(&key)); 263 264 // The TLS_KEY_OUT_OF_INDEXES is used to find out whether the key is set or 265 // not in NoBarrier_CompareAndSwap, but Posix doesn't have invalid key, we 266 // define an almost impossible value be it. 267 // If we really get TLS_KEY_OUT_OF_INDEXES as value of key, just alloc 268 // another TLS slot. 269 if (key == PlatformThreadLocalStorage::TLS_KEY_OUT_OF_INDEXES) { 270 PlatformThreadLocalStorage::TLSKey tmp = key; 271 CHECK(PlatformThreadLocalStorage::AllocTLS(&key) && 272 key != PlatformThreadLocalStorage::TLS_KEY_OUT_OF_INDEXES); 273 PlatformThreadLocalStorage::FreeTLS(tmp); 274 } 275 // Atomically test-and-set the tls_key. If the key is 276 // TLS_KEY_OUT_OF_INDEXES, go ahead and set it. Otherwise, do nothing, as 277 // another thread already did our dirty work. 278 PlatformThreadLocalStorage::TLSKey old_key = 279 PlatformThreadLocalStorage::TLS_KEY_OUT_OF_INDEXES; 280 if (!g_native_tls_key.compare_exchange_strong(old_key, key, 281 std::memory_order_relaxed, 282 std::memory_order_relaxed)) { 283 // We've been shortcut. Another thread replaced g_native_tls_key first so 284 // we need to destroy our index and use the one the other thread got 285 // first. 286 PlatformThreadLocalStorage::FreeTLS(key); 287 key = g_native_tls_key.load(std::memory_order_relaxed); 288 } 289 } 290 CHECK_EQ(GetTlsVectorStateAndValue(key), TlsVectorState::kUninitialized); 291 292 // Some allocators, such as TCMalloc, make use of thread local storage. As a 293 // result, any attempt to call new (or malloc) will lazily cause such a system 294 // to initialize, which will include registering for a TLS key. 
If we are not 295 // careful here, then that request to create a key will call new back, and 296 // we'll have an infinite loop. We avoid that as follows: Use a stack 297 // allocated vector, so that we don't have dependence on our allocator until 298 // our service is in place. (i.e., don't even call new until after we're 299 // setup) 300 TlsVectorEntry stack_allocated_tls_data[kThreadLocalStorageSize]; 301 memset(stack_allocated_tls_data, 0, sizeof(stack_allocated_tls_data)); 302 // Ensure that any rentrant calls change the temp version. 303 SetTlsVectorValue(key, stack_allocated_tls_data, TlsVectorState::kInUse); 304 305 // Allocate an array to store our data. 306 TlsVectorEntry* tls_data = new TlsVectorEntry[kThreadLocalStorageSize]; 307 memcpy(tls_data, stack_allocated_tls_data, sizeof(stack_allocated_tls_data)); 308 SetTlsVectorValue(key, tls_data, TlsVectorState::kInUse); 309 return tls_data; 310 } 311 312 void OnThreadExitInternal(TlsVectorEntry* tls_data) { 313 DCHECK(tls_data); 314 // Some allocators, such as TCMalloc, use TLS. As a result, when a thread 315 // terminates, one of the destructor calls we make may be to shut down an 316 // allocator. We have to be careful that after we've shutdown all of the known 317 // destructors (perchance including an allocator), that we don't call the 318 // allocator and cause it to resurrect itself (with no possibly destructor 319 // call to follow). We handle this problem as follows: Switch to using a stack 320 // allocated vector, so that we don't have dependence on our allocator after 321 // we have called all g_tls_metadata destructors. (i.e., don't even call 322 // delete[] after we're done with destructors.) 323 TlsVectorEntry stack_allocated_tls_data[kThreadLocalStorageSize]; 324 memcpy(stack_allocated_tls_data, tls_data, sizeof(stack_allocated_tls_data)); 325 // Ensure that any re-entrant calls change the temp version. 
326 PlatformThreadLocalStorage::TLSKey key = 327 g_native_tls_key.load(std::memory_order_relaxed); 328 SetTlsVectorValue(key, stack_allocated_tls_data, TlsVectorState::kDestroying); 329 delete[] tls_data; // Our last dependence on an allocator. 330 331 size_t remaining_attempts = kMaxDestructorIterations + 1; 332 bool need_to_scan_destructors = true; 333 while (need_to_scan_destructors) { 334 need_to_scan_destructors = false; 335 336 // Snapshot the TLS Metadata so we don't have to lock on every access. 337 TlsMetadata tls_metadata[kThreadLocalStorageSize]; 338 { 339 base::AutoLock auto_lock(*GetTLSMetadataLock()); 340 memcpy(tls_metadata, g_tls_metadata, sizeof(g_tls_metadata)); 341 } 342 343 // We destroy slots in reverse order (i.e. destroy the first-created slot 344 // last), for the following reasons: 345 // 1) Slots that are created early belong to basic services (like an 346 // allocator) and might have to be recreated by destructors of other 347 // services. So we save iterations here by destroying them last. 348 // 2) Perfetto tracing service allocates a slot early and relies on it to 349 // keep emitting trace events while destructors of other slots are called, 350 // so it's important to keep it live to avoid use-after-free errors. 351 // To achieve this, we sort all slots in the order of decreasing sequence 352 // numbers. 
353 struct OrderedSlot { 354 uint32_t sequence_num; 355 uint16_t slot; 356 } slot_destruction_order[kThreadLocalStorageSize]; 357 for (uint16_t i = 0; i < kThreadLocalStorageSize; ++i) { 358 slot_destruction_order[i].sequence_num = tls_metadata[i].sequence_num; 359 slot_destruction_order[i].slot = i; 360 } 361 std::sort(std::begin(slot_destruction_order), 362 std::end(slot_destruction_order), 363 [](const OrderedSlot& s1, const OrderedSlot& s2) { 364 return s1.sequence_num > s2.sequence_num; 365 }); 366 367 for (const auto& ordered_slot : slot_destruction_order) { 368 size_t slot = ordered_slot.slot; 369 void* tls_value = stack_allocated_tls_data[slot].data; 370 if (!tls_value || tls_metadata[slot].status == TlsStatus::FREE || 371 stack_allocated_tls_data[slot].version != tls_metadata[slot].version) 372 continue; 373 374 base::ThreadLocalStorage::TLSDestructorFunc destructor = 375 tls_metadata[slot].destructor; 376 if (!destructor) 377 continue; 378 stack_allocated_tls_data[slot].data = nullptr; // pre-clear the slot. 379 destructor(tls_value); 380 // Any destructor might have called a different service, which then set a 381 // different slot to a non-null value. Hence we need to check the whole 382 // vector again. This is a pthread standard. 383 need_to_scan_destructors = true; 384 } 385 386 if (--remaining_attempts == 0) { 387 NOTREACHED(); // Destructors might not have been called. 388 break; 389 } 390 } 391 392 // Remove our stack allocated vector. 
393 SetTlsVectorValue(key, nullptr, TlsVectorState::kDestroyed); 394 } 395 396 } // namespace 397 398 namespace base { 399 400 namespace internal { 401 402 #if BUILDFLAG(IS_WIN) 403 void PlatformThreadLocalStorage::OnThreadExit() { 404 PlatformThreadLocalStorage::TLSKey key = 405 g_native_tls_key.load(std::memory_order_relaxed); 406 if (key == PlatformThreadLocalStorage::TLS_KEY_OUT_OF_INDEXES) 407 return; 408 TlsVectorEntry* tls_vector = nullptr; 409 const TlsVectorState state = GetTlsVectorStateAndValue(key, &tls_vector); 410 411 // On Windows, thread destruction callbacks are only invoked once per module, 412 // so there should be no way that this could be invoked twice. 413 DCHECK_NE(state, TlsVectorState::kDestroyed); 414 415 // Maybe we have never initialized TLS for this thread. 416 if (state == TlsVectorState::kUninitialized) 417 return; 418 OnThreadExitInternal(tls_vector); 419 } 420 #elif BUILDFLAG(IS_POSIX) || BUILDFLAG(IS_FUCHSIA) 421 void PlatformThreadLocalStorage::OnThreadExit(void* value) { 422 // On posix this function may be called twice. The first pass calls dtors and 423 // sets state to kDestroyed. The second pass sets kDestroyed to 424 // kUninitialized. 
425 TlsVectorEntry* tls_vector = nullptr; 426 const TlsVectorState state = GetTlsVectorStateAndValue(value, &tls_vector); 427 if (state == TlsVectorState::kDestroyed) { 428 PlatformThreadLocalStorage::TLSKey key = 429 g_native_tls_key.load(std::memory_order_relaxed); 430 SetTlsVectorValue(key, nullptr, TlsVectorState::kUninitialized); 431 return; 432 } 433 434 OnThreadExitInternal(tls_vector); 435 } 436 #endif // BUILDFLAG(IS_WIN) 437 438 } // namespace internal 439 440 // static 441 bool ThreadLocalStorage::HasBeenDestroyed() { 442 PlatformThreadLocalStorage::TLSKey key = 443 g_native_tls_key.load(std::memory_order_relaxed); 444 if (key == PlatformThreadLocalStorage::TLS_KEY_OUT_OF_INDEXES) 445 return false; 446 const TlsVectorState state = GetTlsVectorStateAndValue(key); 447 return state == TlsVectorState::kDestroying || 448 state == TlsVectorState::kDestroyed; 449 } 450 451 void ThreadLocalStorage::Slot::Initialize(TLSDestructorFunc destructor) { 452 PlatformThreadLocalStorage::TLSKey key = 453 g_native_tls_key.load(std::memory_order_relaxed); 454 if (key == PlatformThreadLocalStorage::TLS_KEY_OUT_OF_INDEXES || 455 GetTlsVectorStateAndValue(key) == TlsVectorState::kUninitialized) { 456 ConstructTlsVector(); 457 } 458 459 // Grab a new slot. 460 { 461 base::AutoLock auto_lock(*GetTLSMetadataLock()); 462 for (size_t i = 0; i < kThreadLocalStorageSize; ++i) { 463 // Tracking the last assigned slot is an attempt to find the next 464 // available slot within one iteration. Under normal usage, slots remain 465 // in use for the lifetime of the process (otherwise before we reclaimed 466 // slots, we would have run out of slots). This makes it highly likely the 467 // next slot is going to be a free slot. 
468 size_t slot_candidate = 469 (g_last_assigned_slot + 1 + i) % kThreadLocalStorageSize; 470 if (g_tls_metadata[slot_candidate].status == TlsStatus::FREE) { 471 g_tls_metadata[slot_candidate].status = TlsStatus::IN_USE; 472 g_tls_metadata[slot_candidate].destructor = destructor; 473 g_tls_metadata[slot_candidate].sequence_num = ++g_sequence_num; 474 g_last_assigned_slot = slot_candidate; 475 DCHECK_EQ(kInvalidSlotValue, slot_); 476 slot_ = slot_candidate; 477 version_ = g_tls_metadata[slot_candidate].version; 478 break; 479 } 480 } 481 } 482 CHECK_LT(slot_, kThreadLocalStorageSize); 483 } 484 485 void ThreadLocalStorage::Slot::Free() { 486 DCHECK_LT(slot_, kThreadLocalStorageSize); 487 { 488 base::AutoLock auto_lock(*GetTLSMetadataLock()); 489 g_tls_metadata[slot_].status = TlsStatus::FREE; 490 g_tls_metadata[slot_].destructor = nullptr; 491 ++(g_tls_metadata[slot_].version); 492 } 493 slot_ = kInvalidSlotValue; 494 } 495 496 void* ThreadLocalStorage::Slot::Get() const { 497 TlsVectorEntry* tls_data = nullptr; 498 const TlsVectorState state = GetTlsVectorStateAndValue( 499 g_native_tls_key.load(std::memory_order_relaxed), &tls_data); 500 DCHECK_NE(state, TlsVectorState::kDestroyed); 501 if (!tls_data) 502 return nullptr; 503 DCHECK_LT(slot_, kThreadLocalStorageSize); 504 // Version mismatches means this slot was previously freed. 
505 if (tls_data[slot_].version != version_) 506 return nullptr; 507 return tls_data[slot_].data; 508 } 509 510 void ThreadLocalStorage::Slot::Set(void* value) { 511 TlsVectorEntry* tls_data = nullptr; 512 const TlsVectorState state = GetTlsVectorStateAndValue( 513 g_native_tls_key.load(std::memory_order_relaxed), &tls_data); 514 DCHECK_NE(state, TlsVectorState::kDestroyed); 515 if (UNLIKELY(!tls_data)) { 516 if (!value) 517 return; 518 tls_data = ConstructTlsVector(); 519 } 520 DCHECK_LT(slot_, kThreadLocalStorageSize); 521 tls_data[slot_].data = value; 522 tls_data[slot_].version = version_; 523 } 524 525 ThreadLocalStorage::Slot::Slot(TLSDestructorFunc destructor) { 526 Initialize(destructor); 527 } 528 529 ThreadLocalStorage::Slot::~Slot() { 530 Free(); 531 } 532 533 } // namespace base