Linux: use futex_waitv syscall for atomic waiting

In order to make this possible, some features of the generic atomic-wait engine that futex_waitv cannot express (per-wait bit masks, non-equality comparison ops, and wait values wider than 32 bits) were removed.
Ivan Chikish authored on 2023-07-31 23:57:26 +03:00; committed by Ivan
parent 831a9fe012
commit d34287b2cc
51 changed files with 441 additions and 574 deletions
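For context, here is a minimal standalone sketch (not part of this commit) of the futex_waitv interface the new code path builds on: up to FUTEX_WAITV_MAX 32-bit futex words are passed to a single syscall together with an absolute timeout on a chosen clock. It assumes a kernel (5.16+) and uapi headers that provide SYS_futex_waitv, FUTEX_32 and struct futex_waitv (the commit adds fallback definitions for older headers); the variable names and the 100 ms timeout are purely illustrative.

#include <linux/futex.h>   // FUTEX_32, FUTEX_PRIVATE_FLAG, struct futex_waitv (recent uapi headers)
#include <sys/syscall.h>   // SYS_futex_waitv
#include <unistd.h>        // syscall
#include <ctime>           // clock_gettime, timespec
#include <cstdint>

static std::uint32_t word_a = 0;
static std::uint32_t word_b = 0;

// Block while both words still read 0. Returns the index of the entry that was woken,
// or -1 with errno set (EAGAIN if a value already differed, ETIMEDOUT on timeout, ...).
long wait_on_both()
{
    futex_waitv vec[2]{};
    vec[0].val = 0;
    vec[0].uaddr = reinterpret_cast<__u64>(&word_a);
    vec[0].flags = FUTEX_32 | FUTEX_PRIVATE_FLAG;
    vec[1].val = 0;
    vec[1].uaddr = reinterpret_cast<__u64>(&word_b);
    vec[1].flags = FUTEX_32 | FUTEX_PRIVATE_FLAG;

    // Unlike plain FUTEX_WAIT, futex_waitv takes an absolute timeout on the given clock
    timespec ts{};
    ::clock_gettime(CLOCK_MONOTONIC, &ts);
    ts.tv_nsec += 100'000'000; // ~100 ms from now
    if (ts.tv_nsec >= 1'000'000'000)
    {
        ts.tv_sec++;
        ts.tv_nsec -= 1'000'000'000;
    }

    return syscall(SYS_futex_waitv, +vec, 2, 0, &ts, CLOCK_MONOTONIC);
}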

View file

@ -2172,14 +2172,14 @@ u64 thread_base::finalize(thread_state result_state) noexcept
const u64 _self = m_thread;
// Set result state (errored or finalized)
m_sync.fetch_op([&](u64& v)
m_sync.fetch_op([&](u32& v)
{
v &= -4;
v |= static_cast<u32>(result_state);
});
// Signal waiting threads
m_sync.notify_all(2);
m_sync.notify_all();
return _self;
}
@ -2266,8 +2266,18 @@ thread_state thread_ctrl::state()
void thread_ctrl::wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
{
if (!usec)
{
return;
}
auto _this = g_tls_this_thread;
if (!alert && usec > 50000)
{
usec = 50000;
}
#ifdef __linux__
static thread_local struct linux_timer_handle_t
{
@ -2296,13 +2306,13 @@ void thread_ctrl::wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
}
} fd_timer;
if (!alert && usec > 0 && usec <= 1000 && fd_timer != -1)
if (!alert && fd_timer != -1)
{
struct itimerspec timeout;
u64 missed;
timeout.it_value.tv_nsec = usec * 1'000ull;
timeout.it_value.tv_sec = 0;
timeout.it_value.tv_nsec = usec % 1'000'000 * 1'000ull;
timeout.it_value.tv_sec = usec / 1'000'000;
timeout.it_interval.tv_sec = 0;
timeout.it_interval.tv_nsec = 0;
timerfd_settime(fd_timer, 0, &timeout, NULL);
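For reference, a self-contained sketch (not from the diff; Linux-only, with invented names) of the timerfd-based sleep this path performs, now used for any non-alert wait rather than only sub-millisecond ones: arm a one-shot CLOCK_MONOTONIC timer using the same microsecond split and block on read() until it fires.

#include <sys/timerfd.h>   // timerfd_create, timerfd_settime, itimerspec
#include <unistd.h>        // read, close
#include <cstdint>

// Caller guarantees usec != 0 (a zero it_value would disarm the timer and block forever).
static void sleep_us_via_timerfd(std::uint64_t usec)
{
    const int fd = timerfd_create(CLOCK_MONOTONIC, 0);
    if (fd < 0)
    {
        return; // creation failed; a real caller would fall back to another waiting method
    }

    itimerspec timeout{};
    timeout.it_value.tv_sec = usec / 1'000'000;              // whole seconds
    timeout.it_value.tv_nsec = usec % 1'000'000 * 1'000ull;  // remaining microseconds as nanoseconds
    timerfd_settime(fd, 0, &timeout, nullptr);               // one-shot, relative timeout

    std::uint64_t expirations = 0;
    if (read(fd, &expirations, sizeof(expirations)) < 0)     // blocks until the timer expires
    {
        // interrupted by a signal or failed; nothing to do in this sketch
    }

    close(fd);
}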
@ -2312,15 +2322,27 @@ void thread_ctrl::wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
}
#endif
if (_this->m_sync.bit_test_reset(2) || _this->m_taskq)
if (alert)
{
return;
if (_this->m_sync.bit_test_reset(2) || _this->m_taskq)
{
return;
}
}
// Wait for signal and thread state abort
atomic_wait::list<2> list{};
list.set<0>(_this->m_sync, 0, 4 + 1);
list.set<1>(_this->m_taskq, nullptr);
if (alert)
{
list.set<0>(_this->m_sync, 0);
list.set<1>(utils::bless<atomic_t<u32>>(&_this->m_taskq)[1], 0);
}
else
{
list.set<0>(_this->m_dummy, 0);
}
list.wait(atomic_wait_timeout{usec <= 0xffff'ffff'ffff'ffff / 1000 ? usec * 1000 : 0xffff'ffff'ffff'ffff});
}
@ -2331,29 +2353,27 @@ void thread_ctrl::wait_for_accurate(u64 usec)
return;
}
if (usec > 50000)
{
fmt::throw_exception("thread_ctrl::wait_for_accurate: unsupported amount");
}
#ifdef __linux__
return wait_for(usec, false);
#else
using namespace std::chrono_literals;
const auto until = std::chrono::steady_clock::now() + 1us * usec;
while (true)
{
#ifdef __linux__
// NOTE: Assumption that timer initialization has succeeded
u64 host_min_quantum = usec <= 1000 ? 10 : 50;
#else
// Host scheduler quantum for windows (worst case)
// NOTE: On ps3 this function has very high accuracy
constexpr u64 host_min_quantum = 500;
#endif
if (usec >= host_min_quantum)
{
#ifdef __linux__
// Do not wait for the last quantum to avoid loss of accuracy
wait_for(usec - ((usec % host_min_quantum) + host_min_quantum), false);
#else
// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
wait_for(usec - (usec % host_min_quantum), false);
#endif
}
// TODO: Determine best value for yield delay
else if (usec >= host_min_quantum / 2)
@ -2374,6 +2394,7 @@ void thread_ctrl::wait_for_accurate(u64 usec)
usec = (until - current).count();
}
#endif
}
std::string thread_ctrl::get_name_cached()
@ -2440,7 +2461,7 @@ bool thread_base::join(bool dtor) const
for (u64 i = 0; (m_sync & 3) <= 1; i++)
{
m_sync.wait(0, 2, timeout);
m_sync.wait(m_sync & ~2, timeout);
if (m_sync & 2)
{
@ -2460,7 +2481,7 @@ void thread_base::notify()
{
// Set notification
m_sync |= 4;
m_sync.notify_one(4);
m_sync.notify_all();
}
u64 thread_base::get_native_id() const
@ -2497,7 +2518,7 @@ u64 thread_base::get_cycles()
{
cycles = static_cast<u64>(thread_time.tv_sec) * 1'000'000'000 + thread_time.tv_nsec;
#endif
if (const u64 old_cycles = m_sync.fetch_op([&](u64& v){ v &= 7; v |= (cycles << 3); }) >> 3)
if (const u64 old_cycles = m_cycles.exchange(cycles))
{
return cycles - old_cycles;
}
@ -2507,7 +2528,7 @@ u64 thread_base::get_cycles()
}
else
{
return m_sync >> 3;
return m_cycles;
}
}
@ -2560,8 +2581,8 @@ void thread_base::exec()
}
// Notify waiters
ptr->exec.release(nullptr);
ptr->exec.notify_all();
ptr->done.release(1);
ptr->done.notify_all();
}
if (ptr->next)

View file

@ -100,17 +100,19 @@ class thread_future
protected:
atomic_t<void(*)(thread_base*, thread_future*)> exec{};
atomic_t<u32> done{0};
public:
// Get reference to the atomic variable for inspection and waiting for
const auto& get_wait() const
{
return exec;
return done;
}
// Wait (preset)
void wait() const
{
exec.wait<atomic_wait::op_ne>(nullptr);
done.wait(0);
}
};
@ -131,8 +133,13 @@ private:
// Thread handle (platform-specific)
atomic_t<u64> m_thread{0};
// Thread state and cycles
atomic_t<u64> m_sync{0};
// Thread cycles
atomic_t<u64> m_cycles{0};
atomic_t<u32> m_dummy{0};
// Thread state
atomic_t<u32> m_sync{0};
// Thread name
atomic_ptr<std::string> m_tname;
@ -284,16 +291,22 @@ public:
}
atomic_wait::list<Max + 2> list{};
list.template set<Max>(_this->m_sync, 0, 4 + 1);
list.template set<Max + 1>(_this->m_taskq, nullptr);
list.template set<Max>(_this->m_sync, 0);
list.template set<Max + 1>(_this->m_taskq);
setter(list);
list.wait(atomic_wait_timeout{usec <= 0xffff'ffff'ffff'ffff / 1000 ? usec * 1000 : 0xffff'ffff'ffff'ffff});
}
template <atomic_wait::op Op = atomic_wait::op::eq, typename T, typename U>
template <typename T, typename U>
static inline void wait_on(T& wait, U old, u64 usec = -1)
{
wait_on_custom<1>([&](atomic_wait::list<3>& list){ list.set<0, Op>(wait, old); }, usec);
wait_on_custom<1>([&](atomic_wait::list<3>& list) { list.template set<0>(wait, old); }, usec);
}
template <typename T>
static inline void wait_on(T& wait)
{
wait_on_custom<1>([&](atomic_wait::list<3>& list) { list.template set<0>(wait); });
}
// Exit.
@ -637,7 +650,7 @@ public:
{
bool notify_sync = false;
if (s >= thread_state::aborting && thread::m_sync.fetch_op([](u64& v){ return !(v & 3) && (v |= 1); }).second)
if (s >= thread_state::aborting && thread::m_sync.fetch_op([](u32& v) { return !(v & 3) && (v |= 1); }).second)
{
notify_sync = true;
}
@ -650,7 +663,7 @@ public:
if (notify_sync)
{
// Notify after context abortion has been made so all conditions for wake-up be satisfied by the time of notification
thread::m_sync.notify_one(1);
thread::m_sync.notify_all();
}
if (s == thread_state::finished)

View file

@ -9,7 +9,7 @@ void cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
ensure(_old);
// Wait with timeout
m_value.wait(_old, c_signal_mask, atomic_wait_timeout{_timeout > max_timeout ? umax : _timeout * 1000});
m_value.wait(_old, atomic_wait_timeout{_timeout > max_timeout ? umax : _timeout * 1000});
// Cleanup
m_value.atomic_op([](u32& value)
@ -47,10 +47,10 @@ void cond_variable::imp_wake(u32 _count) noexcept
if (_count > 1 || ((_old + (c_signal_mask & (0 - c_signal_mask))) & c_signal_mask) == c_signal_mask)
{
// Resort to notify_all if signal count reached max
m_value.notify_all(c_signal_mask);
m_value.notify_all();
}
else
{
m_value.notify_one(c_signal_mask);
m_value.notify_one();
}
}

View file

@ -2,6 +2,7 @@
#include "util/types.hpp"
#include "util/atomic.hpp"
#include "util/asm.hpp"
//! Simple unshrinkable array base for concurrent access. Only growths automatically.
//! There is no way to know the current size. The smaller index is, the faster it's accessed.
@ -280,12 +281,17 @@ public:
template <typename T>
class lf_queue final
{
atomic_t<lf_queue_item<T>*> m_head{nullptr};
atomic_t<u64> m_head{0};
lf_queue_item<T>* load(u64 value) const noexcept
{
return reinterpret_cast<lf_queue_item<T>*>(value >> 16);
}
// Extract all elements and reverse element order (FILO to FIFO)
lf_queue_item<T>* reverse() noexcept
{
if (auto* head = m_head.load() ? m_head.exchange(nullptr) : nullptr)
if (auto* head = load(m_head) ? load(m_head.exchange(0)) : nullptr)
{
if (auto* prev = head->m_link)
{
@ -311,35 +317,35 @@ public:
~lf_queue()
{
delete m_head.load();
delete load(m_head);
}
template <atomic_wait::op Flags = atomic_wait::op::eq>
void wait(std::nullptr_t /*null*/ = nullptr) noexcept
{
if (m_head == nullptr)
if (m_head == 0)
{
m_head.template wait<Flags>(nullptr);
utils::bless<atomic_t<u32>>(&m_head)[1].wait(0);
}
}
const volatile void* observe() const noexcept
{
return m_head.load();
return load(m_head);
}
explicit operator bool() const noexcept
{
return m_head != nullptr;
return m_head != 0;
}
template <typename... Args>
void push(Args&&... args)
{
auto _old = m_head.load();
auto oldv = m_head.load();
auto _old = load(oldv);
auto item = new lf_queue_item<T>(_old, std::forward<Args>(args)...);
while (!m_head.compare_exchange(_old, item))
while (!m_head.compare_exchange(oldv, reinterpret_cast<u64>(item) << 16))
{
item->m_link = _old;
}
@ -347,7 +353,7 @@ public:
if (!_old)
{
// Notify only if queue was empty
m_head.notify_one();
utils::bless<atomic_t<u32>>(&m_head)[1].notify_one();
}
}
@ -363,7 +369,7 @@ public:
lf_queue_slice<T> pop_all_reversed()
{
lf_queue_slice<T> result;
result.m_head = m_head.exchange(nullptr);
result.m_head = load(m_head.exchange(0));
return result;
}
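A standalone sketch (not part of the diff) of the pointer-packing scheme introduced above, assuming the 48-bit user-space pointers and little-endian layout of the x86-64/ARM64 hosts RPCS3 targets: the element pointer is stored in bits 16..63 of the 64-bit head word, so the upper 32-bit half (the word utils::bless<atomic_t<u32>>(&m_head)[1] exposes) is nonzero for any realistic queued pointer and zero while the queue is empty, making it usable as a plain 32-bit futex word. The sample pointer value is made up.

#include <cstdint>
#include <cstdio>

int main()
{
    // Hypothetical heap pointer; user-space allocations fit in 48 bits on x86-64/ARM64
    void* item = reinterpret_cast<void*>(0x7f12'3456'7890ull);

    // push(): shift the pointer into bits 16..63 of the head word
    const std::uint64_t head = reinterpret_cast<std::uint64_t>(item) << 16;

    // load(): shift back down to recover the pointer
    void* restored = reinterpret_cast<void*>(head >> 16);

    // On a little-endian host the second u32 half is the high half (bits 32..63);
    // this is the value the queue waits on and notifies through the futex path.
    const std::uint32_t waitable_half = static_cast<std::uint32_t>(head >> 32);

    std::printf("restored=%p waitable_half=0x%x\n", restored, waitable_half);
    return 0;
}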

View file

@ -74,14 +74,14 @@ void shared_mutex::imp_wait()
break;
}
m_value.wait(old, c_sig);
m_value.wait(old);
}
}
void shared_mutex::imp_signal()
{
m_value += c_sig;
m_value.notify_one(c_sig);
m_value.notify_one();
}
void shared_mutex::imp_lock(u32 val)

View file

@ -38,7 +38,29 @@ constexpr NTSTATUS NTSTATUS_ALERTED = 0x101;
constexpr NTSTATUS NTSTATUS_TIMEOUT = 0x102;
#endif
#ifndef __linux__
#ifdef __linux__
#ifndef SYS_futex_waitv
#if defined(ARCH_X64) || defined(ARCH_ARM64)
#define SYS_futex_waitv 449
#endif
#endif
#ifndef FUTEX_32
#define FUTEX_32 2
#endif
#ifndef FUTEX_WAITV_MAX
#define FUTEX_WAITV_MAX 128
#endif
struct futex_waitv
{
__u64 val;
__u64 uaddr;
__u32 flags;
__u32 __reserved;
};
#else
enum
{
FUTEX_PRIVATE_FLAG = 0,
@ -113,7 +135,7 @@ inline int futex(volatile void* uaddr, int futex_op, uint val, const timespec* t
}
else
{
// TODO
// TODO: absolute timeout
}
map.erase(std::find(map.find(uaddr), map.end(), ref));

View file

@ -261,7 +261,7 @@ struct cpu_prof
if (threads.empty())
{
// Wait for messages if no work (don't waste CPU)
thread_ctrl::wait_on(registered, nullptr);
thread_ctrl::wait_on(registered);
continue;
}
@ -939,7 +939,7 @@ bool cpu_thread::check_state() noexcept
else
{
// TODO: fix the workaround
g_suspend_counter.wait(ctr, -4, atomic_wait_timeout{100});
g_suspend_counter.wait(ctr, atomic_wait_timeout{10'000});
}
}
else
@ -972,8 +972,7 @@ bool cpu_thread::check_state() noexcept
}
// Short sleep when yield flag is present alone (makes no sense when other methods which can stop thread execution have been done)
// Pass a mask of a single bit which is often unused to avoid notifications
s_dummy_atomic.wait(0, 1u << 30, atomic_wait_timeout{80'000});
s_dummy_atomic.wait(0, atomic_wait_timeout{80'000});
}
}
}
@ -1010,13 +1009,13 @@ cpu_thread& cpu_thread::operator=(thread_state)
if (old & cpu_flag::wait && old.none_of(cpu_flag::again + cpu_flag::exit))
{
state.notify_one(cpu_flag::exit);
state.notify_one();
if (auto thread = try_get<spu_thread>())
{
if (u32 resv = atomic_storage<u32>::load(thread->raddr))
{
vm::reservation_notifier(resv).notify_all(-128);
vm::reservation_notifier(resv).notify_all();
}
}
}

View file

@ -73,7 +73,7 @@ void mic_context::operator()()
// Timestep in microseconds
constexpr u64 TIMESTEP = 256ull * 1'000'000ull / 48000ull;
u64 timeout = 0;
u64 oldvalue = 0;
u32 oldvalue = 0;
while (thread_ctrl::state() != thread_state::aborting)
{

View file

@ -374,7 +374,7 @@ public:
static constexpr auto thread_name = "Microphone Thread"sv;
protected:
atomic_t<u64> wakey = 0;
atomic_t<u32> wakey = 0;
// u32 signalStateLocalTalk = 9; // value is in range 0-10. 10 indicates talking, 0 indicating none.
// u32 signalStateFarTalk = 0; // value is in range 0-10. 10 indicates talking from far away, 0 indicating none.

View file

@ -164,7 +164,7 @@ error_code open_msg_dialog(bool is_blocking, u32 type, vm::cptr<char> msgString,
return CellSysutilError{ret + 0u};
}
const auto notify = std::make_shared<atomic_t<bool>>(false);
const auto notify = std::make_shared<atomic_t<u32>>(0);
const auto res = manager->create<rsx::overlays::message_dialog>()->show(is_blocking, msgString.get_ptr(), _type, [callback, userData, &return_code, is_blocking, notify](s32 status)
{
@ -186,7 +186,7 @@ error_code open_msg_dialog(bool is_blocking, u32 type, vm::cptr<char> msgString,
if (is_blocking && notify)
{
*notify = true;
*notify = 1;
notify->notify_one();
}
});

View file

@ -256,7 +256,7 @@ error_code cell_music_decode_read(vm::ptr<void> buf, vm::ptr<u32> startTime, u64
{
dec.read_pos = 0;
dec.decoder.clear();
dec.decoder.track_fully_consumed = true;
dec.decoder.track_fully_consumed = 1;
dec.decoder.track_fully_consumed.notify_one();
break;
}

View file

@ -2477,8 +2477,8 @@ s32 _spurs::add_workload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32>
spurs_res += 127;
spurs_res2 += 127;
spurs_res.notify_all(-128);
spurs_res2.notify_all(-128);
spurs_res.notify_all();
spurs_res2.notify_all();
u32 res_wkl;
const auto wkl = &spurs->wklInfo(wnum);

View file

@ -303,7 +303,7 @@ struct vdec_context final
return;
}
thread_ctrl::wait_on(in_cmd, nullptr);
thread_ctrl::wait_on(in_cmd);
slice = in_cmd.pop_all(); // Pop new command list
}())
{
@ -921,7 +921,7 @@ static error_code vdecOpen(ppu_thread& ppu, T type, U res, vm::cptr<CellVdecCb>
});
thrd->state -= cpu_flag::stop;
thrd->state.notify_one(cpu_flag::stop);
thrd->state.notify_one();
return CELL_OK;
}

View file

@ -1821,7 +1821,11 @@ void ppu_thread::cpu_task()
// Wait until the progress dialog is closed.
// We don't want to open a cell dialog while a native progress dialog is still open.
thread_ctrl::wait_on<atomic_wait::op_ne>(g_progr_ptotal, 0);
while (u32 v = g_progr_ptotal)
{
g_progr_ptotal.wait(v);
}
g_fxo->get<progress_dialog_workaround>().show_overlay_message_only = true;
// Sadly we can't postpone initializing guest time because we need to run PPU threads
@ -1839,7 +1843,7 @@ void ppu_thread::cpu_task()
}
ensure(spu.state.test_and_reset(cpu_flag::stop));
spu.state.notify_one(cpu_flag::stop);
spu.state.notify_one();
}
});
@ -2051,7 +2055,7 @@ ppu_thread::ppu_thread(utils::serial& ar)
struct init_pushed
{
bool pushed = false;
atomic_t<bool> inited = false;
atomic_t<u32> inited = false;
};
call_history.data.resize(g_cfg.core.ppu_call_history ? call_history_max_size : 1);
@ -2100,7 +2104,7 @@ ppu_thread::ppu_thread(utils::serial& ar)
{
while (!Emu.IsStopped() && !g_fxo->get<init_pushed>().inited)
{
thread_ctrl::wait_on(g_fxo->get<init_pushed>().inited, false);
thread_ctrl::wait_on(g_fxo->get<init_pushed>().inited, 0);
}
return false;
}
@ -2117,7 +2121,7 @@ ppu_thread::ppu_thread(utils::serial& ar)
{ppu_cmd::ptr_call, 0}, +[](ppu_thread&) -> bool
{
auto& inited = g_fxo->get<init_pushed>().inited;
inited = true;
inited = 1;
inited.notify_all();
return true;
}
@ -3046,7 +3050,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
if (ppu.cia < liblv2_begin || ppu.cia >= liblv2_end)
{
res.notify_all(-128);
res.notify_all();
}
if (addr == ppu.last_faddr)

View file

@ -21,7 +21,7 @@ inline void try_start(spu_thread& spu)
}).second)
{
spu.state -= cpu_flag::stop;
spu.state.notify_one(cpu_flag::stop);
spu.state.notify_one();
}
};
@ -273,7 +273,7 @@ bool spu_thread::write_reg(const u32 addr, const u32 value)
for (status_npc_sync_var old; (old = status_npc).status & SPU_STATUS_RUNNING;)
{
status_npc.wait(old);
utils::bless<atomic_t<u32>>(&status_npc)[0].wait(old.status);
}
}
}

View file

@ -647,7 +647,10 @@ void spu_cache::initialize()
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit || g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
// Initialize progress dialog (wait for previous progress done)
thread_ctrl::wait_on<atomic_wait::op_ne>(g_progr_ptotal, 0);
while (u32 v = g_progr_ptotal)
{
g_progr_ptotal.wait(v);
}
g_progr_ptotal += ::size32(func_list);
progr.emplace("Building SPU cache...");
@ -7795,7 +7798,7 @@ public:
{
minusb = eval(x);
}
const auto minusbx = bitcast<u8[16]>(minusb);
// Data with swapped endian from a load instruction
@ -11011,7 +11014,7 @@ struct spu_llvm_worker
return;
}
thread_ctrl::wait_on(registered, nullptr);
thread_ctrl::wait_on(utils::bless<atomic_t<u32>>(&registered)[1], 0);
slice = registered.pop_all();
}())
{
@ -11178,7 +11181,7 @@ struct spu_llvm
{
// Interrupt profiler thread and put it to sleep
static_cast<void>(prof_mutex.reset());
thread_ctrl::wait_on(registered, nullptr);
thread_ctrl::wait_on(utils::bless<atomic_t<u32>>(&registered)[1], 0);
continue;
}

View file

@ -2418,7 +2418,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
}
}
if (++i < 10)
if (true || ++i < 10)
{
busy_wait(500);
}
@ -2426,7 +2426,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
{
// Wait
_cpu->state += cpu_flag::wait + cpu_flag::temp;
bits->wait(old, wmask);
// bits->wait(old, wmask);
_cpu->check_state();
}
}())
@ -2542,7 +2542,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
v &= ~wmask;
});
bits->notify_all(wmask);
// bits->notify_all(wmask);
if (size == size0)
{
@ -3588,7 +3588,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
{
if (raddr)
{
vm::reservation_notifier(addr).notify_all(-128);
vm::reservation_notifier(addr).notify_all();
raddr = 0;
}
@ -3775,7 +3775,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
}
do_cell_atomic_128_store(addr, _ptr<spu_rdata_t>(args.lsa & 0x3ff80));
vm::reservation_notifier(addr).notify_all(-128);
vm::reservation_notifier(addr).notify_all();
}
bool spu_thread::do_mfc(bool can_escape, bool must_finish)
@ -4908,7 +4908,11 @@ s64 spu_thread::get_ch_value(u32 ch)
}
}
#ifdef __linux__
const bool reservation_busy_waiting = false;
#else
const bool reservation_busy_waiting = ((utils::get_tsc() >> 8) % 100 + ((raddr == spurs_addr) ? 50 : 0)) < g_cfg.core.spu_reservation_busy_waiting_percentage;
#endif
for (; !events.count; events = get_events(mask1 & ~SPU_EVENT_LR, true, true))
{
@ -4930,8 +4934,11 @@ s64 spu_thread::get_ch_value(u32 ch)
if (raddr && (mask1 & ~SPU_EVENT_TM) == SPU_EVENT_LR)
{
// Don't busy-wait with TSX - memory is sensitive
if (!reservation_busy_waiting)
if (g_use_rtm || !reservation_busy_waiting)
{
#ifdef __linux__
vm::reservation_notifier(raddr).wait(rtime, atomic_wait_timeout{50'000});
#else
if (raddr - spurs_addr <= 0x80 && !g_cfg.core.spu_accurate_reservations && mask1 == SPU_EVENT_LR)
{
atomic_wait_engine::set_one_time_use_wait_callback(+[](u64) -> bool
@ -4944,7 +4951,7 @@ s64 spu_thread::get_ch_value(u32 ch)
// Wait without timeout, in this situation we have notifications for all writes making it possible
// Abort notifications are handled specially for performance reasons
vm::reservation_notifier(raddr).wait(rtime, -128);
vm::reservation_notifier(raddr).wait(rtime);
continue;
}
@ -4976,7 +4983,8 @@ s64 spu_thread::get_ch_value(u32 ch)
return true;
});
vm::reservation_notifier(raddr).wait(rtime, -128, atomic_wait_timeout{80'000});
vm::reservation_notifier(raddr).wait(rtime, atomic_wait_timeout{80'000});
#endif
}
else
{
@ -5464,7 +5472,7 @@ extern void resume_spu_thread_group_from_waiting(spu_thread& spu)
{
group->run_state = SPU_THREAD_GROUP_STATUS_SUSPENDED;
spu.state += cpu_flag::signal;
spu.state.notify_one(cpu_flag::signal);
spu.state.notify_one();
return;
}
@ -5482,7 +5490,7 @@ extern void resume_spu_thread_group_from_waiting(spu_thread& spu)
thread->state -= cpu_flag::suspend;
}
thread->state.notify_one(cpu_flag::suspend + cpu_flag::signal);
thread->state.notify_one();
}
}
}
@ -6244,7 +6252,7 @@ s64 spu_channel::pop_wait(cpu_thread& spu, bool pop)
while (true)
{
thread_ctrl::wait_on(data, bit_wait);
thread_ctrl::wait_on(utils::bless<atomic_t<u32>>(&data)[1], u32{bit_wait >> 32});
old = data;
if (!(old & bit_wait))
@ -6325,7 +6333,7 @@ bool spu_channel::push_wait(cpu_thread& spu, u32 value, bool push)
return false;
}
thread_ctrl::wait_on(data, state);
thread_ctrl::wait_on(utils::bless<atomic_t<u32>>(&data)[1], u32(state >> 32));
state = data;
}
}
@ -6369,7 +6377,7 @@ std::pair<u32, u32> spu_channel_4_t::pop_wait(cpu_thread& spu)
while (true)
{
thread_ctrl::wait_on(values, old);
thread_ctrl::wait_on(utils::bless<atomic_t<u32>>(&values)[0], u32(u64(std::bit_cast<u128>(old))));
old = values;
if (!old.waiting)

View file

@ -235,7 +235,7 @@ public:
// Turn off waiting bit manually (must succeed because waiting bit can only be resetted by the thread pushed to jostling_value)
ensure(this->data.bit_test_reset(off_wait));
data.notify_one();
utils::bless<atomic_t<u32>>(&data)[1].notify_one();
}
// Return true if count has changed from 0 to 1, this condition is considered satisfied even if we pushed a value directly to the special storage for waiting SPUs
@ -294,7 +294,7 @@ public:
if ((old & mask) == mask)
{
data.notify_one();
utils::bless<atomic_t<u32>>(&data)[1].notify_one();
}
return static_cast<u32>(old);
@ -386,7 +386,7 @@ struct spu_channel_4_t
// Turn off waiting bit manually (must succeed because waiting bit can only be resetted by the thread pushing to jostling_value)
ensure(atomic_storage<u8>::exchange(values.raw().waiting, 0));
values.notify_one();
utils::bless<atomic_t<u32>>(&values)[0].notify_one();
}
return;

View file

@ -1631,7 +1631,7 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
if (is_paused(target->state - cpu_flag::suspend))
{
target->state.notify_one(cpu_flag::suspend);
target->state.notify_one();
}
}
}
@ -1684,7 +1684,7 @@ void lv2_obj::schedule_all(u64 current_time)
if (notify_later_idx == std::size(g_to_notify))
{
// Out of notification slots, notify locally (resizable container is not worth it)
target->state.notify_one(cpu_flag::signal + cpu_flag::suspend);
target->state.notify_one();
}
else
{
@ -1718,7 +1718,7 @@ void lv2_obj::schedule_all(u64 current_time)
if (notify_later_idx == std::size(g_to_notify))
{
// Out of notification slots, notify locally (resizable container is not worth it)
target->state.notify_one(cpu_flag::notify);
target->state.notify_one();
}
else
{
@ -1948,7 +1948,7 @@ bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep
u64 remaining = usec - passed;
#ifdef __linux__
// NOTE: Assumption that timer initialization has succeeded
u64 host_min_quantum = is_usleep && remaining <= 1000 ? 10 : 50;
constexpr u64 host_min_quantum = 10;
#else
// Host scheduler quantum for windows (worst case)
// NOTE: On ps3 this function has very high accuracy
@ -1965,8 +1965,7 @@ bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep
if (remaining > host_min_quantum)
{
#ifdef __linux__
// Do not wait for the last quantum to avoid loss of accuracy
wait_for(remaining - ((remaining % host_min_quantum) + host_min_quantum));
wait_for(remaining);
#else
// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
wait_for(remaining - (remaining % host_min_quantum));

View file

@ -183,7 +183,7 @@ error_code _sys_interrupt_thread_establish(ppu_thread& ppu, vm::ptr<u32> ih, u32
});
it->state -= cpu_flag::stop;
it->state.notify_one(cpu_flag::stop);
it->state.notify_one();
return result;
});

View file

@ -862,7 +862,7 @@ error_code mmapper_thread_recover_page_fault(cpu_thread* cpu)
if (cpu->state & cpu_flag::signal)
{
cpu->state.notify_one(cpu_flag::signal);
cpu->state.notify_one();
}
return CELL_OK;

View file

@ -1042,7 +1042,7 @@ error_code sys_spu_thread_group_start(ppu_thread& ppu, u32 id)
{
for (; index != umax; index--)
{
threads[index]->state.notify_one(cpu_flag::stop);
threads[index]->state.notify_one();
}
}
} notify_threads;
@ -1216,7 +1216,7 @@ error_code sys_spu_thread_group_resume(ppu_thread& ppu, u32 id)
{
for (; index != umax; index--)
{
threads[index]->state.notify_one(cpu_flag::suspend);
threads[index]->state.notify_one();
}
}
} notify_threads;
@ -1397,7 +1397,7 @@ error_code sys_spu_thread_group_terminate(ppu_thread& ppu, u32 id, s32 value)
if (prev_resv && prev_resv != resv)
{
// Batch reservation notifications if possible
vm::reservation_notifier(prev_resv).notify_all(-128);
vm::reservation_notifier(prev_resv).notify_all();
}
prev_resv = resv;
@ -1407,7 +1407,7 @@ error_code sys_spu_thread_group_terminate(ppu_thread& ppu, u32 id, s32 value)
if (prev_resv)
{
vm::reservation_notifier(prev_resv).notify_all(-128);
vm::reservation_notifier(prev_resv).notify_all();
}
group->exit_status = value;

View file

@ -434,7 +434,7 @@ public:
{
// Note: by the time of notification the thread could have been deallocated which is why the direct function is used
// TODO: Pass a narrower mask
atomic_wait_engine::notify_one(cpu, 4, atomic_wait::default_mask<atomic_bs_t<cpu_flag>>);
atomic_wait_engine::notify_one(cpu);
}
}

View file

@ -135,7 +135,7 @@ namespace vm
_xend();
#endif
if constexpr (Ack)
res.notify_all(-128);
res.notify_all();
return;
}
else
@ -149,7 +149,7 @@ namespace vm
_xend();
#endif
if constexpr (Ack)
res.notify_all(-128);
res.notify_all();
return result;
}
else
@ -204,7 +204,7 @@ namespace vm
#endif
res += 127;
if (Ack)
res.notify_all(-128);
res.notify_all();
return;
}
else
@ -218,7 +218,7 @@ namespace vm
#endif
res += 127;
if (Ack)
res.notify_all(-128);
res.notify_all();
return result;
}
else
@ -253,7 +253,7 @@ namespace vm
});
if constexpr (Ack)
res.notify_all(-128);
res.notify_all();
return;
}
else
@ -273,7 +273,7 @@ namespace vm
});
if (Ack && result)
res.notify_all(-128);
res.notify_all();
return result;
}
}
@ -293,7 +293,7 @@ namespace vm
}
if constexpr (Ack)
res.notify_all(-128);
res.notify_all();
return;
}
else
@ -313,7 +313,7 @@ namespace vm
}
if (Ack && result)
res.notify_all(-128);
res.notify_all();
return result;
}
}
@ -405,7 +405,7 @@ namespace vm
if constexpr (Ack)
{
res.notify_all(-128);
res.notify_all();
}
}
else
@ -415,7 +415,7 @@ namespace vm
if constexpr (Ack)
{
res.notify_all(-128);
res.notify_all();
}
return result;

View file

@ -55,7 +55,7 @@ namespace gl
job.completion_callback(result);
}
thread_ctrl::wait_on(m_work_queue, nullptr);
thread_ctrl::wait_on(m_work_queue);
}
}

View file

@ -161,7 +161,7 @@ namespace rsx
this->on_close = std::move(on_close);
visible = true;
const auto notify = std::make_shared<atomic_t<bool>>(false);
const auto notify = std::make_shared<atomic_t<u32>>(0);
auto& overlayman = g_fxo->get<display_manager>();
overlayman.attach_thread_input(

View file

@ -295,7 +295,7 @@ namespace rsx
}
else if (!m_input_thread_abort)
{
thread_ctrl::wait_on(m_input_token_stack, nullptr);
thread_ctrl::wait_on(m_input_token_stack);
}
}
}

View file

@ -295,7 +295,7 @@ namespace rsx
{
if (!m_stop_input_loop)
{
const auto notify = std::make_shared<atomic_t<bool>>(false);
const auto notify = std::make_shared<atomic_t<u32>>(0);
auto& overlayman = g_fxo->get<display_manager>();
if (interactive)

View file

@ -1621,7 +1621,7 @@ namespace rsx
update_panel();
const auto notify = std::make_shared<atomic_t<bool>>(false);
const auto notify = std::make_shared<atomic_t<u32>>(0);
auto& overlayman = g_fxo->get<display_manager>();
overlayman.attach_thread_input(
@ -1631,7 +1631,7 @@ namespace rsx
while (!Emu.IsStopped() && !*notify)
{
notify->wait(false, atomic_wait_timeout{1'000'000});
notify->wait(0, atomic_wait_timeout{1'000'000});
}
}
}

View file

@ -240,7 +240,7 @@ namespace rsx
this->on_close = std::move(on_close);
visible = true;
const auto notify = std::make_shared<atomic_t<bool>>(false);
const auto notify = std::make_shared<atomic_t<u32>>(0);
auto& overlayman = g_fxo->get<display_manager>();
overlayman.attach_thread_input(
@ -250,7 +250,7 @@ namespace rsx
while (!Emu.IsStopped() && !*notify)
{
notify->wait(false, atomic_wait_timeout{1'000'000});
notify->wait(0, atomic_wait_timeout{1'000'000});
}
return CELL_OK;

View file

@ -20,7 +20,7 @@ namespace rsx
u64 user_interface::alloc_thread_bit()
{
auto [_old, ok] = this->thread_bits.fetch_op([](u64& bits)
auto [_old, ok] = this->thread_bits.fetch_op([](u32& bits)
{
if (~bits)
{
@ -385,7 +385,7 @@ namespace rsx
m_stop_pad_interception.release(stop_pad_interception);
m_stop_input_loop.release(true);
while (u64 b = thread_bits)
while (u32 b = thread_bits)
{
if (b == g_thread_bit)
{

View file

@ -85,7 +85,7 @@ namespace rsx
bool m_start_pad_interception = true;
atomic_t<bool> m_stop_pad_interception = false;
atomic_t<bool> m_input_thread_detached = false;
atomic_t<u64> thread_bits = 0;
atomic_t<u32> thread_bits = 0;
bool m_keyboard_input_enabled = false; // Allow keyboard input
bool m_keyboard_pad_handler_active = true; // Initialized as true to prevent keyboard input until proven otherwise.
bool m_allow_input_on_pause = false;

View file

@ -853,9 +853,8 @@ namespace rsx
g_fxo->get<vblank_thread>().set_thread(std::shared_ptr<named_thread<std::function<void()>>>(new named_thread<std::function<void()>>("VBlank Thread"sv, [this]() -> void
{
// See sys_timer_usleep for details
#ifdef __linux__
constexpr u32 host_min_quantum = 50;
constexpr u32 host_min_quantum = 10;
#else
constexpr u32 host_min_quantum = 500;
#endif
@ -878,8 +877,12 @@ namespace rsx
// Calculate time remaining to that time (0 if we passed it)
const u64 wait_for = current >= post_event_time ? 0 : post_event_time - current;
#ifdef __linux__
const u64 wait_sleep = wait_for;
#else
// Substract host operating system min sleep quantom to get sleep time
const u64 wait_sleep = wait_for - u64{wait_for >= host_min_quantum} * host_min_quantum;
#endif
if (!wait_for)
{
@ -3116,7 +3119,7 @@ namespace rsx
{
#ifdef __linux__
// NOTE: Assumption that timer initialization has succeeded
u64 host_min_quantum = remaining <= 1000 ? 10 : 50;
constexpr u64 host_min_quantum = 10;
#else
// Host scheduler quantum for windows (worst case)
// NOTE: On ps3 this function has very high accuracy
@ -3125,8 +3128,7 @@ namespace rsx
if (remaining >= host_min_quantum)
{
#ifdef __linux__
// Do not wait for the last quantum to avoid loss of accuracy
thread_ctrl::wait_for(remaining - ((remaining % host_min_quantum) + host_min_quantum), false);
thread_ctrl::wait_for(remaining, false);
#else
// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
thread_ctrl::wait_for(remaining - (remaining % host_min_quantum), false);

View file

@ -213,7 +213,7 @@ namespace rsx
u32 last_known_code_start = 0;
atomic_t<u32> external_interrupt_lock{ 0 };
atomic_t<bool> external_interrupt_ack{ false };
atomic_t<bool> is_initialized{ false };
atomic_t<u32> is_initialized{0};
bool is_fifo_idle() const;
void flush_fifo();

View file

@ -48,7 +48,7 @@ namespace vk
}
}
thread_ctrl::wait_on(m_work_queue, nullptr);
thread_ctrl::wait_on(m_work_queue);
}
}

View file

@ -1699,7 +1699,7 @@ struct registers_decoder<NV4097_SET_SHADER_CONTROL>
static void dump(std::string& out, const decoded_type& decoded)
{
fmt::append(out, "Shader control: raw_value: 0x%x reg_count: %u%s%s",
decoded.shader_ctrl(), ((decoded.shader_ctrl() >> 24) & 0xFF), ((decoded.shader_ctrl() & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT) ? " depth_replace" : ""),
decoded.shader_ctrl(), ((decoded.shader_ctrl() >> 24) & 0xFF), ((decoded.shader_ctrl() & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT) ? " depth_replace" : ""),
((decoded.shader_ctrl() & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS) ? " 32b_exports" : ""));
}
};

View file

@ -153,7 +153,7 @@ void fmt_class_string<cfg_mode>::format(std::string& out, u64 arg)
});
}
void Emulator::CallFromMainThread(std::function<void()>&& func, atomic_t<bool>* wake_up, bool track_emu_state, u64 stop_ctr) const
void Emulator::CallFromMainThread(std::function<void()>&& func, atomic_t<u32>* wake_up, bool track_emu_state, u64 stop_ctr) const
{
if (!track_emu_state)
{
@ -174,14 +174,14 @@ void Emulator::CallFromMainThread(std::function<void()>&& func, atomic_t<bool>*
void Emulator::BlockingCallFromMainThread(std::function<void()>&& func) const
{
atomic_t<bool> wake_up = false;
atomic_t<u32> wake_up = 0;
CallFromMainThread(std::move(func), &wake_up);
while (!wake_up)
{
ensure(thread_ctrl::get_current());
wake_up.wait(false);
wake_up.wait(0);
}
}
@ -424,7 +424,7 @@ void Emulator::Init()
make_path_verbose(dev_flash, true);
make_path_verbose(dev_flash2, true);
make_path_verbose(dev_flash3, true);
if (make_path_verbose(dev_usb, true))
{
make_path_verbose(dev_usb + "MUSIC/", false);
@ -2152,7 +2152,7 @@ void Emulator::RunPPU()
}
ensure(cpu.state.test_and_reset(cpu_flag::stop));
cpu.state.notify_one(cpu_flag::stop);
cpu.state.notify_one();
signalled_thread = true;
});
@ -2165,7 +2165,7 @@ void Emulator::RunPPU()
if (auto thr = g_fxo->try_get<named_thread<rsx::rsx_replay_thread>>())
{
thr->state -= cpu_flag::stop;
thr->state.notify_one(cpu_flag::stop);
thr->state.notify_one();
}
}
@ -2234,7 +2234,7 @@ void Emulator::FinalizeRunRequest()
}
ensure(spu.state.test_and_reset(cpu_flag::stop));
spu.state.notify_one(cpu_flag::stop);
spu.state.notify_one();
};
if (m_savestate_extension_flags1 & SaveStateExtentionFlags1::ShouldCloseMenu)
@ -2437,7 +2437,7 @@ void Emulator::Resume()
auto on_select = [](u32, cpu_thread& cpu)
{
cpu.state -= cpu_flag::dbg_global_pause;
cpu.state.notify_one(cpu_flag::dbg_global_pause);
cpu.state.notify_one();
};
idm::select<named_thread<ppu_thread>>(on_select);

View file

@ -54,7 +54,7 @@ constexpr bool is_error(game_boot_result res)
struct EmuCallbacks
{
std::function<void(std::function<void()>, atomic_t<bool>*)> call_from_main_thread;
std::function<void(std::function<void()>, atomic_t<u32>*)> call_from_main_thread;
std::function<void(bool)> on_run; // (start_playtime) continuing or going ingame, so start the clock
std::function<void()> on_pause;
std::function<void()> on_resume;
@ -180,7 +180,7 @@ public:
}
// Call from the GUI thread
void CallFromMainThread(std::function<void()>&& func, atomic_t<bool>* wake_up = nullptr, bool track_emu_state = true, u64 stop_ctr = umax) const;
void CallFromMainThread(std::function<void()>&& func, atomic_t<u32>* wake_up = nullptr, bool track_emu_state = true, u64 stop_ctr = umax) const;
// Blocking call from the GUI thread
void BlockingCallFromMainThread(std::function<void()>&& func) const;

View file

@ -68,7 +68,7 @@ void progress_dialog_server::operator()()
{
// Some backends like OpenGL actually initialize a lot of driver objects in the "on_init" method.
// Wait for init to complete within reasonable time. Abort just in case we have hardware/driver issues.
renderer->is_initialized.wait(false, atomic_wait_timeout(5 * 1000000000ull));
renderer->is_initialized.wait(0, atomic_wait_timeout(5 * 1000000000ull));
auto manager = g_fxo->try_get<rsx::overlays::display_manager>();
show_overlay_message = g_fxo->get<progress_dialog_workaround>().show_overlay_message_only;

View file

@ -60,7 +60,7 @@ void headless_application::InitializeCallbacks()
return false;
};
callbacks.call_from_main_thread = [this](std::function<void()> func, atomic_t<bool>* wake_up)
callbacks.call_from_main_thread = [this](std::function<void()> func, atomic_t<u32>* wake_up)
{
RequestCallFromMainThread(std::move(func), wake_up);
};
@ -166,7 +166,7 @@ void headless_application::InitializeCallbacks()
/**
* Using connects avoids timers being unable to be used in a non-qt thread. So, even if this looks stupid to just call func, it's succinct.
*/
void headless_application::CallFromMainThread(const std::function<void()>& func, atomic_t<bool>* wake_up)
void headless_application::CallFromMainThread(const std::function<void()>& func, atomic_t<u32>* wake_up)
{
func();

View file

@ -30,8 +30,8 @@ private:
}
Q_SIGNALS:
void RequestCallFromMainThread(std::function<void()> func, atomic_t<bool>* wake_up);
void RequestCallFromMainThread(std::function<void()> func, atomic_t<u32>* wake_up);
private Q_SLOTS:
static void CallFromMainThread(const std::function<void()>& func, atomic_t<bool>* wake_up);
static void CallFromMainThread(const std::function<void()>& func, atomic_t<u32>* wake_up);
};

View file

@ -55,6 +55,7 @@ DYNAMIC_IMPORT("ntdll.dll", NtSetTimerResolution, NTSTATUS(ULONG DesiredResoluti
#ifdef __linux__
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/prctl.h>
#endif
#if defined(__APPLE__)
@ -443,6 +444,11 @@ int main(int argc, char** argv)
const u64 intro_time = (intro_stats.ru_utime.tv_sec + intro_stats.ru_stime.tv_sec) * 1000000000ull + (intro_stats.ru_utime.tv_usec + intro_stats.ru_stime.tv_usec) * 1000ull;
#endif
#ifdef __linux__
// Set timerslack value for Linux. The default value is 50,000ns. Change this to just 1 since we value precise timers.
prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0);
#endif
s_argv0 = argv[0]; // Save for report_fatal_error
// Only run RPCS3 to display an error

View file

@ -1366,7 +1366,7 @@ void debugger_frame::DoStep(bool step_over)
}
});
cpu->state.notify_one(s_pause_flags);
cpu->state.notify_one();
}
}
@ -1412,7 +1412,7 @@ void debugger_frame::RunBtnPress()
Emu.Resume();
}
cpu->state.notify_one(s_pause_flags);
cpu->state.notify_one();
m_debugger_list->EnableThreadFollowing();
}
}

View file

@ -421,7 +421,7 @@ void gui_application::InitializeCallbacks()
return false;
};
callbacks.call_from_main_thread = [this](std::function<void()> func, atomic_t<bool>* wake_up)
callbacks.call_from_main_thread = [this](std::function<void()> func, atomic_t<u32>* wake_up)
{
RequestCallFromMainThread(std::move(func), wake_up);
};
@ -792,7 +792,7 @@ void gui_application::OnChangeStyleSheetRequest()
/**
* Using connects avoids timers being unable to be used in a non-qt thread. So, even if this looks stupid to just call func, it's succinct.
*/
void gui_application::CallFromMainThread(const std::function<void()>& func, atomic_t<bool>* wake_up)
void gui_application::CallFromMainThread(const std::function<void()>& func, atomic_t<u32>* wake_up)
{
func();

View file

@ -118,8 +118,8 @@ Q_SIGNALS:
void OnEnableDiscEject(bool enabled);
void OnEnableDiscInsert(bool enabled);
void RequestCallFromMainThread(std::function<void()> func, atomic_t<bool>* wake_up);
void RequestCallFromMainThread(std::function<void()> func, atomic_t<u32>* wake_up);
private Q_SLOTS:
static void CallFromMainThread(const std::function<void()>& func, atomic_t<bool>* wake_up);
static void CallFromMainThread(const std::function<void()>& func, atomic_t<u32>* wake_up);
};

View file

@ -1,6 +1,7 @@
#include "atomic.hpp"
#if defined(__linux__)
// This definition is unused on Linux
#define USE_FUTEX
#elif !defined(_WIN32)
#define USE_STD
@ -40,8 +41,8 @@ namespace utils
// Total number of entries.
static constexpr usz s_hashtable_size = 1u << 17;
// Reference counter combined with shifted pointer (which is assumed to be 47 bit)
static constexpr uptr s_ref_mask = (1u << 17) - 1;
// Reference counter combined with shifted pointer (which is assumed to be 48 bit)
static constexpr uptr s_ref_mask = 0xffff;
// Fix for silly on-first-use initializer
static bool s_null_wait_cb(const void*, u64, u64){ return true; };
@ -55,163 +56,17 @@ static thread_local bool(*s_tls_one_time_wait_cb)(u64 attempts) = nullptr;
// Callback for notification functions for optimizations
static thread_local void(*s_tls_notify_cb)(const void* data, u64 progress) = nullptr;
static inline bool operator &(atomic_wait::op lhs, atomic_wait::op_flag rhs)
{
return !!(static_cast<u8>(lhs) & static_cast<u8>(rhs));
}
// Compare data in memory with old value, and return true if they are equal
static NEVER_INLINE bool ptr_cmp(const void* data, u32 _size, u128 old128, u128 mask128, atomic_wait::info* ext = nullptr)
static NEVER_INLINE bool ptr_cmp(const void* data, u32 old, atomic_wait::info* ext = nullptr)
{
using atomic_wait::op;
using atomic_wait::op_flag;
const u8 size = static_cast<u8>(_size);
const op flag{static_cast<u8>(_size >> 8)};
bool result = false;
if (size <= 8)
{
u64 new_value = 0;
u64 old_value = static_cast<u64>(old128);
u64 mask = static_cast<u64>(mask128) & (u64{umax} >> ((64 - size * 8) & 63));
// Don't load memory on empty mask
switch (mask ? size : 0)
{
case 0: break;
case 1: new_value = reinterpret_cast<const atomic_t<u8>*>(data)->load(); break;
case 2: new_value = reinterpret_cast<const atomic_t<u16>*>(data)->load(); break;
case 4: new_value = reinterpret_cast<const atomic_t<u32>*>(data)->load(); break;
case 8: new_value = reinterpret_cast<const atomic_t<u64>*>(data)->load(); break;
default:
{
fmt::throw_exception("Bad size (arg=0x%x)", _size);
}
}
if (flag & op_flag::bit_not)
{
new_value = ~new_value;
}
if (!mask) [[unlikely]]
{
new_value = 0;
old_value = 0;
}
else
{
if (flag & op_flag::byteswap)
{
switch (size)
{
case 2:
{
new_value = stx::se_storage<u16>::swap(static_cast<u16>(new_value));
old_value = stx::se_storage<u16>::swap(static_cast<u16>(old_value));
mask = stx::se_storage<u16>::swap(static_cast<u16>(mask));
break;
}
case 4:
{
new_value = stx::se_storage<u32>::swap(static_cast<u32>(new_value));
old_value = stx::se_storage<u32>::swap(static_cast<u32>(old_value));
mask = stx::se_storage<u32>::swap(static_cast<u32>(mask));
break;
}
case 8:
{
new_value = stx::se_storage<u64>::swap(new_value);
old_value = stx::se_storage<u64>::swap(old_value);
mask = stx::se_storage<u64>::swap(mask);
break;
}
default:
{
break;
}
}
}
// Make most significant bit sign bit
const auto shv = std::countl_zero(mask);
new_value &= mask;
old_value &= mask;
new_value <<= shv;
old_value <<= shv;
}
s64 news = new_value;
s64 olds = old_value;
u64 newa = news < 0 ? (0ull - new_value) : new_value;
u64 olda = olds < 0 ? (0ull - old_value) : old_value;
switch (op{static_cast<u8>(static_cast<u8>(flag) & 0xf)})
{
case op::eq: result = old_value == new_value; break;
case op::slt: result = olds < news; break;
case op::sgt: result = olds > news; break;
case op::ult: result = old_value < new_value; break;
case op::ugt: result = old_value > new_value; break;
case op::alt: result = olda < newa; break;
case op::agt: result = olda > newa; break;
case op::pop:
{
// Count is taken from least significant byte and ignores some flags
const u64 count = static_cast<u64>(old128) & 0xff;
result = count < utils::popcnt64(new_value);
break;
}
default:
{
fmt::throw_exception("ptr_cmp(): unrecognized atomic wait operation.");
}
}
}
else if (size == 16 && (flag == op::eq || flag == (op::eq | op_flag::inverse)))
{
u128 new_value = 0;
u128 old_value = old128;
u128 mask = mask128;
// Don't load memory on empty mask
if (mask) [[likely]]
{
new_value = atomic_storage<u128>::load(*reinterpret_cast<const u128*>(data));
}
// TODO
result = !((old_value ^ new_value) & mask);
}
else if (size > 16 && !~mask128 && (flag == op::eq || flag == (op::eq | op_flag::inverse)))
{
// Interpret old128 as a pointer to the old value
ensure(!(old128 >> (64 + 17)));
result = std::memcmp(data, reinterpret_cast<const void*>(static_cast<uptr>(old128)), size) == 0;
}
else
{
fmt::throw_exception("ptr_cmp(): no alternative operations are supported for non-standard atomic wait yet.");
}
if (flag & op_flag::inverse)
{
result = !result;
}
// Check other wait variables if provided
if (result)
if (reinterpret_cast<const atomic_t<u32>*>(data)->load() == old)
{
if (ext) [[unlikely]]
{
for (auto e = ext; e->data; e++)
{
if (!ptr_cmp(e->data, e->size, e->old, e->mask))
if (!ptr_cmp(e->data, e->old))
{
return false;
}
@ -283,18 +138,15 @@ namespace
#endif
// Essentially a fat semaphore
struct cond_handle
struct alignas(64) cond_handle
{
// Combined pointer (most significant 47 bits) and ref counter (17 least significant bits)
// Combined pointer (most significant 48 bits) and ref counter (16 least significant bits)
atomic_t<u64> ptr_ref;
u64 tid;
u128 mask;
u128 oldv;
u32 oldv;
u64 tsc0;
u16 link;
u8 size;
u8 flag;
atomic_t<u32> sync;
#ifdef USE_STD
@ -316,7 +168,7 @@ namespace
mtx.init(mtx);
#endif
ensure(!ptr_ref.exchange((iptr << 17) | 1));
ensure(!ptr_ref.exchange((iptr << 16) | 1));
}
void destroy()
@ -324,10 +176,7 @@ namespace
tid = 0;
tsc0 = 0;
link = 0;
size = 0;
flag = 0;
sync.release(0);
mask = 0;
oldv = 0;
#ifdef USE_STD
@ -517,7 +366,7 @@ namespace
// TLS storage for few allocaded "semaphores" to allow skipping initialization
static thread_local tls_cond_handler s_tls_conds{};
static u32 cond_alloc(uptr iptr, u128 mask, u32 tls_slot = -1)
static u32 cond_alloc(uptr iptr, u32 tls_slot = -1)
{
// Try to get cond from tls slot instead
u16* ptls = tls_slot >= std::size(s_tls_conds.cond) ? nullptr : s_tls_conds.cond + tls_slot;
@ -526,8 +375,7 @@ static u32 cond_alloc(uptr iptr, u128 mask, u32 tls_slot = -1)
{
// Fast reinitialize
const u32 id = std::exchange(*ptls, 0);
s_cond_list[id].mask = mask;
s_cond_list[id].ptr_ref.release((iptr << 17) | 1);
s_cond_list[id].ptr_ref.release((iptr << 16) | 1);
return id;
}
@ -581,7 +429,6 @@ static u32 cond_alloc(uptr iptr, u128 mask, u32 tls_slot = -1)
const u32 id = level3 * 64 + std::countr_one(bits);
// Initialize new "semaphore"
s_cond_list[id].mask = mask;
s_cond_list[id].init(iptr);
return id;
}
@ -625,8 +472,6 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1)
{
// Fast finalization
cond->sync.release(0);
cond->size = 0;
cond->mask = 0;
*ptls = static_cast<u16>(cond_id);
return;
}
@ -652,7 +497,7 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1)
s_cond_sem1.atomic_op(FN(x -= u128{1} << (level1 * 14)));
}
static cond_handle* cond_id_lock(u32 cond_id, u128 mask, uptr iptr = 0)
static cond_handle* cond_id_lock(u32 cond_id, uptr iptr = 0)
{
bool did_ref = false;
@ -673,7 +518,7 @@ static cond_handle* cond_id_lock(u32 cond_id, u128 mask, uptr iptr = 0)
return false;
}
if (iptr && (val >> 17) != iptr)
if (iptr && (val >> 16) != iptr)
{
// Pointer mismatch
return false;
@ -686,11 +531,6 @@ static cond_handle* cond_id_lock(u32 cond_id, u128 mask, uptr iptr = 0)
return false;
}
if (!(mask & cond->mask) && cond->size)
{
return false;
}
if (!did_ref)
{
val++;
@ -702,7 +542,7 @@ static cond_handle* cond_id_lock(u32 cond_id, u128 mask, uptr iptr = 0)
if (ok)
{
// Check other fields again
if (const u32 sync_val = cond->sync; sync_val == 0 || sync_val == 3 || (cond->size && !(mask & cond->mask)))
if (const u32 sync_val = cond->sync; sync_val == 0 || sync_val == 3)
{
did_ref = true;
continue;
@ -713,7 +553,7 @@ static cond_handle* cond_id_lock(u32 cond_id, u128 mask, uptr iptr = 0)
if ((old & s_ref_mask) == s_ref_mask)
{
fmt::throw_exception("Reference count limit (131071) reached in an atomic notifier.");
fmt::throw_exception("Reference count limit (%u) reached in an atomic notifier.", s_ref_mask);
}
break;
@ -736,8 +576,8 @@ namespace
u64 bits: 24; // Allocated bits
u64 prio: 24; // Reserved
u64 ref : 17; // Ref counter
u64 iptr: 47; // First pointer to use slot (to count used slots)
u64 ref : 16; // Ref counter
u64 iptr: 48; // First pointer to use slot (to count used slots)
};
// Need to spare 16 bits for ref counter
@ -760,7 +600,7 @@ namespace
static void slot_free(uptr ptr, atomic_t<u16>* slot, u32 tls_slot) noexcept;
template <typename F>
static auto slot_search(uptr iptr, u128 mask, F func) noexcept;
static auto slot_search(uptr iptr, F func) noexcept;
};
static_assert(sizeof(root_info) == 64);
@ -944,7 +784,7 @@ void root_info::slot_free(uptr iptr, atomic_t<u16>* slot, u32 tls_slot) noexcept
}
template <typename F>
FORCE_INLINE auto root_info::slot_search(uptr iptr, u128 mask, F func) noexcept
FORCE_INLINE auto root_info::slot_search(uptr iptr, F func) noexcept
{
u32 index = 0;
[[maybe_unused]] u32 total = 0;
@ -974,7 +814,7 @@ FORCE_INLINE auto root_info::slot_search(uptr iptr, u128 mask, F func) noexcept
for (u32 i = 0; i < cond_count; i++)
{
if (cond_id_lock(cond_ids[i], mask, iptr))
if (cond_id_lock(cond_ids[i], iptr))
{
if (func(cond_ids[i]))
{
@ -994,18 +834,82 @@ FORCE_INLINE auto root_info::slot_search(uptr iptr, u128 mask, F func) noexcept
}
}
SAFE_BUFFERS(void) atomic_wait_engine::wait(const void* data, u32 size, u128 old_value, u64 timeout, u128 mask, atomic_wait::info* ext)
SAFE_BUFFERS(void)
atomic_wait_engine::wait(const void* data, u32 old_value, u64 timeout, atomic_wait::info* ext)
{
const auto stamp0 = utils::get_unique_tsc();
uint ext_size = 0;
if (!s_tls_wait_cb(data, 0, stamp0))
#ifdef __linux__
::timespec ts{};
if (timeout + 1)
{
if (ext) [[unlikely]]
{
// futex_waitv uses absolute timeout
::clock_gettime(CLOCK_MONOTONIC, &ts);
}
ts.tv_sec += timeout / 1'000'000'000;
ts.tv_nsec += timeout % 1'000'000'000;
if (ts.tv_nsec > 1'000'000'000)
{
ts.tv_sec++;
ts.tv_nsec -= 1'000'000'000;
}
}
futex_waitv vec[atomic_wait::max_list]{};
vec[0].flags = FUTEX_32 | FUTEX_PRIVATE_FLAG;
vec[0].uaddr = reinterpret_cast<__u64>(data);
vec[0].val = old_value;
if (ext) [[unlikely]]
{
for (auto e = ext; e->data; e++)
{
ext_size++;
vec[ext_size].flags = FUTEX_32 | FUTEX_PRIVATE_FLAG;
vec[ext_size].uaddr = reinterpret_cast<__u64>(e->data);
vec[ext_size].val = e->old;
}
}
if (ext_size) [[unlikely]]
{
if (syscall(SYS_futex_waitv, +vec, ext_size + 1, 0, timeout + 1 ? &ts : nullptr, CLOCK_MONOTONIC) == -1)
{
if (errno == ENOSYS)
{
fmt::throw_exception("futex_waitv is not supported (Linux kernel is too old)");
}
if (errno == EINVAL)
{
fmt::throw_exception("futex_waitv: bad param");
}
}
}
else
{
if (futex(const_cast<void*>(data), FUTEX_WAIT_PRIVATE, old_value, timeout + 1 ? &ts : nullptr) == -1)
{
if (errno == EINVAL)
{
fmt::throw_exception("futex: bad param");
}
}
}
return;
#endif
if (!s_tls_wait_cb(data, 0, 0))
{
return;
}
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 17);
const auto stamp0 = utils::get_unique_tsc();
uint ext_size = 0;
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 16);
uptr iptr_ext[atomic_wait::max_list - 1]{};
@ -1026,18 +930,18 @@ SAFE_BUFFERS(void) atomic_wait_engine::wait(const void* data, u32 size, u128 old
}
}
iptr_ext[ext_size] = reinterpret_cast<uptr>(e->data) & (~s_ref_mask >> 17);
iptr_ext[ext_size] = reinterpret_cast<uptr>(e->data) & (~s_ref_mask >> 16);
ext_size++;
}
}
const u32 cond_id = cond_alloc(iptr, mask, 0);
const u32 cond_id = cond_alloc(iptr, 0);
u32 cond_id_ext[atomic_wait::max_list - 1]{};
for (u32 i = 0; i < ext_size; i++)
{
cond_id_ext[i] = cond_alloc(iptr_ext[i], ext[i].mask, i + 1);
cond_id_ext[i] = cond_alloc(iptr_ext[i], i + 1);
}
const auto slot = root_info::slot_alloc(iptr);
@ -1060,8 +964,6 @@ SAFE_BUFFERS(void) atomic_wait_engine::wait(const void* data, u32 size, u128 old
// Store some info for notifiers (some may be unused)
cond->link = 0;
cond->size = static_cast<u8>(size);
cond->flag = static_cast<u8>(size >> 8);
cond->oldv = old_value;
cond->tsc0 = stamp0;
@ -1071,8 +973,6 @@ SAFE_BUFFERS(void) atomic_wait_engine::wait(const void* data, u32 size, u128 old
{
// Extensions point to original cond_id, copy remaining info
cond_ext[i]->link = cond_id;
cond_ext[i]->size = static_cast<u8>(ext[i].size);
cond_ext[i]->flag = static_cast<u8>(ext[i].size >> 8);
cond_ext[i]->oldv = ext[i].old;
cond_ext[i]->tsc0 = stamp0;
@ -1105,7 +1005,7 @@ SAFE_BUFFERS(void) atomic_wait_engine::wait(const void* data, u32 size, u128 old
u64 attempts = 0;
while (ptr_cmp(data, size, old_value, mask, ext))
while (ptr_cmp(data, old_value, ext))
{
if (s_tls_one_time_wait_cb)
{
@ -1263,7 +1163,7 @@ SAFE_BUFFERS(void) atomic_wait_engine::wait(const void* data, u32 size, u128 old
}
template <bool NoAlert = false>
static u32 alert_sema(u32 cond_id, u32 size, u128 mask)
static u32 alert_sema(u32 cond_id, u32 size)
{
ensure(cond_id);
@ -1271,11 +1171,11 @@ static u32 alert_sema(u32 cond_id, u32 size, u128 mask)
u32 ok = 0;
if (!cond->size || mask & cond->mask)
if (true)
{
// Redirect if necessary
const auto _old = cond;
const auto _new = _old->link ? cond_id_lock(_old->link, u128(-1)) : _old;
const auto _new = _old->link ? cond_id_lock(_old->link) : _old;
if (_new && _new->tsc0 == _old->tsc0)
{
@ -1336,50 +1236,58 @@ void atomic_wait_engine::set_notify_callback(void(*cb)(const void*, u64))
s_tls_notify_cb = cb;
}
void atomic_wait_engine::notify_one(const void* data, u32 size, u128 mask)
void atomic_wait_engine::notify_one(const void* data)
{
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 17);
if (s_tls_notify_cb)
s_tls_notify_cb(data, 0);
root_info::slot_search(iptr, mask, [&](u32 cond_id)
#ifdef __linux__
futex(const_cast<void*>(data), FUTEX_WAKE_PRIVATE, 1);
#else
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 16);
root_info::slot_search(iptr, [&](u32 cond_id)
{
if (alert_sema(cond_id, size, mask))
if (alert_sema(cond_id, 4))
{
return true;
}
return false;
});
#endif
if (s_tls_notify_cb)
s_tls_notify_cb(data, -1);
}
SAFE_BUFFERS(void) atomic_wait_engine::notify_all(const void* data, u32 size, u128 mask)
SAFE_BUFFERS(void)
atomic_wait_engine::notify_all(const void* data)
{
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 17);
if (s_tls_notify_cb)
s_tls_notify_cb(data, 0);
#ifdef __linux__
futex(const_cast<void*>(data), FUTEX_WAKE_PRIVATE, 1);
#else
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 16);
// Array count for batch notification
u32 count = 0;
// Array itself.
u32 cond_ids[128];
root_info::slot_search(iptr, mask, [&](u32 cond_id)
root_info::slot_search(iptr, [&](u32 cond_id)
{
if (count >= 128)
{
// Unusual big amount of sema: fallback to notify_one alg
alert_sema(cond_id, size, mask);
alert_sema(cond_id, 4);
return false;
}
u32 res = alert_sema<true>(cond_id, size, mask);
u32 res = alert_sema<true>(cond_id, 4);
if (~res <= u16{umax})
{
@ -1395,7 +1303,7 @@ SAFE_BUFFERS(void) atomic_wait_engine::notify_all(const void* data, u32 size, u1
{
const u32 cond_id = *(std::end(cond_ids) - i - 1);
if (!s_cond_list[cond_id].wakeup(size ? 1 : 2))
if (!s_cond_list[cond_id].wakeup(1))
{
*(std::end(cond_ids) - i - 1) = ~cond_id;
}
@ -1434,6 +1342,7 @@ SAFE_BUFFERS(void) atomic_wait_engine::notify_all(const void* data, u32 size, u1
{
cond_free(~*(std::end(cond_ids) - i - 1));
}
#endif
if (s_tls_notify_cb)
s_tls_notify_cb(data, -1);

View file

@ -129,54 +129,21 @@ enum class atomic_wait_timeout : u64
inf = 0xffffffffffffffff,
};
template <typename T>
class lf_queue;
namespace stx
{
template <typename T>
class atomic_ptr;
}
// Various extensions for atomic_t::wait
namespace atomic_wait
{
// Max number of simultaneous atomic variables to wait on (can be extended if really necessary)
constexpr uint max_list = 8;
enum class op : u8
{
eq, // Wait while value is bitwise equal to
slt, // Wait while signed value is less than
sgt, // Wait while signed value is greater than
ult, // Wait while unsigned value is less than
ugt, // Wait while unsigned value is greater than
alt, // Wait while absolute value is less than
agt, // Wait while absolute value is greater than
pop, // Wait while set bit count of the value is less than
__max
};
static_assert(static_cast<u8>(op::__max) == 8);
enum class op_flag : u8
{
inverse = 1 << 4, // Perform inverse operation (negate the result)
bit_not = 1 << 5, // Perform bitwise NOT on loaded value before operation
byteswap = 1 << 6, // Perform byteswap on both arguments and masks when applicable
};
constexpr op_flag op_be = std::endian::native == std::endian::little ? op_flag::byteswap : op_flag{0};
constexpr op_flag op_le = std::endian::native == std::endian::little ? op_flag{0} : op_flag::byteswap;
constexpr op operator |(op_flag lhs, op_flag rhs)
{
return op{static_cast<u8>(static_cast<u8>(lhs) | static_cast<u8>(rhs))};
}
constexpr op operator |(op_flag lhs, op rhs)
{
return op{static_cast<u8>(static_cast<u8>(lhs) | static_cast<u8>(rhs))};
}
constexpr op operator |(op lhs, op_flag rhs)
{
return op{static_cast<u8>(static_cast<u8>(lhs) | static_cast<u8>(rhs))};
}
constexpr op op_ne = op::eq | op_flag::inverse;
constexpr struct any_value_t
{
template <typename T>
@ -186,46 +153,10 @@ namespace atomic_wait
}
} any_value;
template <typename X>
using payload_type = decltype(std::declval<X>().observe());
template <typename X, typename T = payload_type<X>>
constexpr u128 default_mask = sizeof(T) <= 8 ? u128{u64{umax} >> ((64 - sizeof(T) * 8) & 63)} : u128(-1);
template <typename X, typename T = payload_type<X>>
constexpr u128 get_value(X&, T value = T{}, ...)
{
static_assert((sizeof(T) & (sizeof(T) - 1)) == 0);
static_assert(sizeof(T) <= 16);
return std::bit_cast<get_uint_t<sizeof(T)>, T>(value);
}
struct info
{
const void* data;
u32 size;
u128 old;
u128 mask;
template <typename X, typename T = payload_type<X>>
constexpr void set_value(X& a, T value = T{})
{
old = get_value(a, value);
}
template <typename X, typename T = payload_type<X>>
constexpr void set_mask(T value)
{
static_assert((sizeof(T) & (sizeof(T) - 1)) == 0);
static_assert(sizeof(T) <= 16);
mask = std::bit_cast<get_uint_t<sizeof(T)>, T>(value);
}
template <typename X, typename T = payload_type<X>>
constexpr void set_mask()
{
mask = default_mask<X>;
}
u32 old;
};
template <uint Max, typename... T>
@ -243,9 +174,9 @@ namespace atomic_wait
constexpr list& operator=(const list&) noexcept = default;
template <typename... U, typename = std::void_t<decltype(std::declval<U>().template wait<op::eq>(any_value))...>>
template <typename... U, typename = std::void_t<decltype(std::declval<U>().wait(any_value))...>>
constexpr list(U&... vars)
: m_info{{&vars, sizeof(vars.observe()), get_value(vars), default_mask<U>}...}
: m_info{{&vars, 0}...}
{
static_assert(sizeof...(U) == Max, "Inconsistent amount of atomics.");
}
@ -256,40 +187,37 @@ namespace atomic_wait
static_assert(sizeof...(U) == Max, "Inconsistent amount of values.");
auto* ptr = m_info;
((ptr->template set_value<T>(*static_cast<T*>(ptr->data), values), ptr++), ...);
(((ptr->old = std::bit_cast<u32>(values)), ptr++), ...);
return *this;
}
template <typename... U>
constexpr list& masks(U... masks)
{
static_assert(sizeof...(U) <= Max, "Too many masks.");
auto* ptr = m_info;
((ptr++)->template set_mask<T>(masks), ...);
return *this;
}
template <uint Index, op Flags = op::eq, typename T2, typename U, typename = std::void_t<decltype(std::declval<T2>().template wait<op::eq>(any_value))>>
template <uint Index, typename T2, typename U, typename = std::void_t<decltype(std::declval<T2>().wait(any_value))>>
constexpr void set(T2& var, U value)
{
static_assert(Index < Max);
m_info[Index].data = &var;
m_info[Index].size = sizeof(var.observe()) | (static_cast<u8>(Flags) << 8);
m_info[Index].template set_value<T2>(var, value);
m_info[Index].template set_mask<T2>();
m_info[Index].old = std::bit_cast<u32>(value);
}
template <uint Index, op Flags = op::eq, typename T2, typename U, typename V, typename = std::void_t<decltype(std::declval<T2>().template wait<op::eq>(any_value))>>
constexpr void set(T2& var, U value, V mask)
template <uint Index, typename T2>
constexpr void set(lf_queue<T2>& var, std::nullptr_t = nullptr)
{
static_assert(Index < Max);
static_assert(sizeof(var) == sizeof(uptr));
m_info[Index].data = &var;
m_info[Index].size = sizeof(var.observe()) | (static_cast<u8>(Flags) << 8);
m_info[Index].template set_value<T2>(var, value);
m_info[Index].template set_mask<T2>(mask);
m_info[Index].data = reinterpret_cast<char*>(&var) + sizeof(u32);
m_info[Index].old = 0;
}
template <uint Index, typename T2>
constexpr void set(stx::atomic_ptr<T2>& var, std::nullptr_t = nullptr)
{
static_assert(Index < Max);
static_assert(sizeof(var) == sizeof(uptr));
m_info[Index].data = reinterpret_cast<char*>(&var) + sizeof(u32);
m_info[Index].old = 0;
}
// Timeout is discouraged
@ -302,7 +230,7 @@ namespace atomic_wait
}
};
template <typename... T, typename = std::void_t<decltype(std::declval<T>().template wait<op::eq>(any_value))...>>
template <typename... T, typename = std::void_t<decltype(std::declval<T>().wait(any_value))...>>
list(T&... vars) -> list<sizeof...(T), T...>;
}
@ -322,20 +250,15 @@ private:
template <uint Max, typename... T>
friend class atomic_wait::list;
static void wait(const void* data, u32 size, u128 old_value, u64 timeout, u128 mask, atomic_wait::info* ext = nullptr);
static void wait(const void* data, u32 old_value, u64 timeout, atomic_wait::info* ext = nullptr);
public:
static void notify_one(const void* data, u32 size, u128 mask128);
static void notify_all(const void* data, u32 size, u128 mask128);
static void notify_one(const void* data);
static void notify_all(const void* data);
static void set_wait_callback(bool(*cb)(const void* data, u64 attempts, u64 stamp0));
static void set_notify_callback(void(*cb)(const void* data, u64 progress));
static void set_one_time_use_wait_callback(bool(*cb)(u64 progress));
static void notify_all(const void* data)
{
notify_all(data, 0, u128(-1));
}
static void set_one_time_use_wait_callback(bool (*cb)(u64 progress));
};
template <uint Max, typename... T>
@ -343,7 +266,7 @@ void atomic_wait::list<Max, T...>::wait(atomic_wait_timeout timeout)
{
static_assert(!!Max, "Cannot initiate atomic wait with empty list.");
atomic_wait_engine::wait(m_info[0].data, m_info[0].size, m_info[0].old, static_cast<u64>(timeout), m_info[0].mask, m_info + 1);
atomic_wait_engine::wait(m_info[0].data, m_info[0].old, static_cast<u64>(timeout), m_info + 1);
}
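
Each slot of the trimmed-down list is now just an address plus a 32-bit expected value, and the first slot is handed straight to the engine with the remaining slots as extra info. A minimal usage sketch with the reduced API; the names are illustrative, the timeout value is interpreted as nanoseconds, and this assumes the surrounding header is included:

// Illustrative only: sleep until either 32-bit variable leaves the value given for it.
static void wait_until_ready_or_cancel(atomic_t<u32>& ready, atomic_t<u32>& cancel)
{
    atomic_wait::list<2> waiters{};
    waiters.set<0>(ready, 0);   // sleep while ready == 0
    waiters.set<1>(cancel, 0);  // ...and while cancel == 0
    waiters.wait(atomic_wait_timeout{10'000'000}); // ~10 ms in ns; wakeups may be spurious, so callers re-check
}
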
// Helper class, provides access to compiler-specific atomic intrinsics
@ -1759,46 +1682,31 @@ public:
});
}
// Timeout is discouraged
template <atomic_wait::op Flags = atomic_wait::op::eq>
void wait(type old_value, atomic_wait_timeout timeout = atomic_wait_timeout::inf) const noexcept
void wait(type old_value, atomic_wait_timeout timeout = atomic_wait_timeout::inf) const
requires(sizeof(type) == 4)
{
const u128 old = std::bit_cast<get_uint_t<sizeof(T)>>(old_value);
const u128 mask = atomic_wait::default_mask<atomic_t>;
atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast<u8>(Flags) << 8), old, static_cast<u64>(timeout), mask);
atomic_wait_engine::wait(&m_data, std::bit_cast<u32>(old_value), static_cast<u64>(timeout));
}
// Overload with mask (only selected bits are checked), timeout is discouraged
template <atomic_wait::op Flags = atomic_wait::op::eq>
void wait(type old_value, type mask_value, atomic_wait_timeout timeout = atomic_wait_timeout::inf) const noexcept
[[deprecated]] void wait(type old_value, atomic_wait_timeout timeout = atomic_wait_timeout::inf) const
requires(sizeof(type) == 8)
{
const u128 old = std::bit_cast<get_uint_t<sizeof(T)>>(old_value);
const u128 mask = std::bit_cast<get_uint_t<sizeof(T)>>(mask_value);
atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast<u8>(Flags) << 8), old, static_cast<u64>(timeout), mask);
atomic_wait::info ext[2]{};
ext[0].data = reinterpret_cast<const char*>(&m_data) + 4;
ext[0].old = std::bit_cast<u64>(old_value) >> 32;
atomic_wait_engine::wait(&m_data, std::bit_cast<u64>(old_value), static_cast<u64>(timeout), ext);
}
void notify_one() noexcept
void notify_one()
requires(sizeof(type) == 4 || sizeof(type) == 8)
{
atomic_wait_engine::notify_one(&m_data, sizeof(T), atomic_wait::default_mask<atomic_t>);
atomic_wait_engine::notify_one(&m_data);
}
// Notify with mask, allowing to not wake up thread which doesn't wait on this mask
void notify_one(type mask_value) noexcept
void notify_all()
requires(sizeof(type) == 4 || sizeof(type) == 8)
{
const u128 mask = std::bit_cast<get_uint_t<sizeof(T)>>(mask_value);
atomic_wait_engine::notify_one(&m_data, sizeof(T), mask);
}
void notify_all() noexcept
{
atomic_wait_engine::notify_all(&m_data, sizeof(T), atomic_wait::default_mask<atomic_t>);
}
// Notify all threads with mask, allowing to not wake up threads which don't wait on them
void notify_all(type mask_value) noexcept
{
const u128 mask = std::bit_cast<get_uint_t<sizeof(T)>>(mask_value);
atomic_wait_engine::notify_all(&m_data, sizeof(T), mask);
atomic_wait_engine::notify_all(&m_data);
}
};
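
The per-atomic overloads are now a thin wrapper over a single 32-bit futex word: no masks, no comparison ops, and the 8-byte overload survives only as a deprecated shim that registers the upper half of the value as an extra wait slot. The usual pattern on the new API is a checked loop, sketched below with illustrative names:

// Illustrative only: block on a 4-byte atomic with a bounded timeout and re-check after wakeup.
static void wait_for_nonzero(const atomic_t<u32>& value)
{
    while (!value.load())
    {
        value.wait(0, atomic_wait_timeout{1'000'000}); // ~1 ms in nanoseconds; wakeups may be spurious
    }
}
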
@ -1874,23 +1782,6 @@ public:
{
return base::fetch_xor(1) != 0;
}
// Timeout is discouraged
template <atomic_wait::op Flags = atomic_wait::op::eq>
void wait(bool old_value, atomic_wait_timeout timeout = atomic_wait_timeout::inf) const noexcept
{
base::template wait<Flags>(old_value, 1, timeout);
}
void notify_one() noexcept
{
base::notify_one(1);
}
void notify_all() noexcept
{
base::notify_all(1);
}
};
// Specializations
@ -1904,12 +1795,6 @@ struct std::common_type<atomic_t<T, Align>, T2> : std::common_type<T, std::commo
template <typename T, typename T2, usz Align2>
struct std::common_type<T, atomic_t<T2, Align2>> : std::common_type<std::common_type_t<T>, T2> {};
namespace atomic_wait
{
template <usz Align>
constexpr u128 default_mask<atomic_t<bool, Align>> = 1;
}
#ifndef _MSC_VER
#pragma GCC diagnostic pop
#pragma GCC diagnostic pop

View file

@ -6,33 +6,35 @@
// Mutex that tries to maintain the order of acquisition
class fifo_mutex
{
// Low 8 bits are incremented on acquisition, high 8 bits are incremented on release
atomic_t<u16> m_value{0};
// Low 16 bits are incremented on acquisition, high 16 bits are incremented on release
atomic_t<u32> m_value{0};
public:
constexpr fifo_mutex() noexcept = default;
void lock() noexcept
{
const u16 val = m_value.fetch_op([](u16& val)
// clang-format off
const u32 val = m_value.fetch_op([](u32& val)
{
val = (val & 0xff00) | ((val + 1) & 0xff);
val = (val & 0xffff0000) | ((val + 1) & 0xffff);
});
// clang-format on
if (val >> 8 != (val & 0xff)) [[unlikely]]
if (val >> 16 != (val & 0xffff)) [[unlikely]]
{
// TODO: implement busy waiting along with moving to cpp file
m_value.wait<atomic_wait::op_ne>(((val + 1) & 0xff) << 8, 0xff00);
m_value.wait((val & 0xffff0000) | ((val + 1) & 0xffff));
}
}
bool try_lock() noexcept
{
const u16 val = m_value.load();
const u32 val = m_value.load();
if (val >> 8 == (val & 0xff))
if (val >> 16 == (val & 0xffff))
{
if (m_value.compare_and_swap(val, ((val + 1) & 0xff) | (val & 0xff00)))
if (m_value.compare_and_swap(val, ((val + 1) & 0xffff) | (val & 0xffff0000)))
{
return true;
}
@ -43,9 +45,9 @@ public:
void unlock() noexcept
{
const u16 val = m_value.add_fetch(0x100);
const u32 val = m_value.add_fetch(0x10000);
if (val >> 8 != (val & 0xff))
if (val >> 16 != (val & 0xffff))
{
m_value.notify_one();
}
@ -53,9 +55,9 @@ public:
bool is_free() const noexcept
{
const u16 val = m_value.load();
const u32 val = m_value.load();
return (val >> 8) == (val & 0xff);
return (val >> 16) == (val & 0xffff);
}
void lock_unlock() noexcept
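
The state word is widened here because the wait path above now accepts only 4-byte atomics, so the old u16 ticket word could no longer be slept on directly; the layout is otherwise unchanged, with the low 16 bits counting issued tickets and the high 16 bits counting released ones. A small standalone trace of the field arithmetic (illustration, not project code):

// Illustrative only: trace of the [serving:16 | tickets:16] word for two contenders A and B.
#include <cassert>
#include <cstdint>

int main()
{
    std::uint32_t v = 0; // serving == tickets == 0: unlocked

    auto take_ticket = [&v] { std::uint32_t old = v; v = (v & 0xffff0000) | ((v + 1) & 0xffff); return old; };

    const std::uint32_t a = take_ticket(); // A: serving(0) == tickets(0), so A owns the lock
    assert((a >> 16) == (a & 0xffff));

    const std::uint32_t b = take_ticket(); // B: serving(0) != tickets(1), so B must wait
    assert((b >> 16) != (b & 0xffff));
    assert(v == 0x00000002);               // ...and this is exactly the value lock() passes to wait()

    v += 0x10000;                          // A unlocks: serving becomes 1, the word changes, B is woken
    assert((v >> 16) == (b & 0xffff));     // serving now equals B's ticket, so B owns the lock
    return 0;
}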

View file

@ -259,8 +259,8 @@ namespace utils
void audio_decoder::clear()
{
track_fully_decoded = false;
track_fully_consumed = false;
track_fully_decoded = 0;
track_fully_consumed = 0;
has_error = false;
m_size = 0;
duration_ms = 0;
@ -274,7 +274,7 @@ namespace utils
{
auto& thread = *m_thread;
thread = thread_state::aborting;
track_fully_consumed = true;
track_fully_consumed = 1;
track_fully_consumed.notify_one();
thread();
m_thread.reset();
@ -511,7 +511,7 @@ namespace utils
media_log.notice("audio_decoder: about to decode: %s (index=%d)", ::at32(m_context.playlist, m_context.current_track), m_context.current_track);
decode_track(::at32(m_context.playlist, m_context.current_track));
track_fully_decoded = true;
track_fully_decoded = 1;
if (has_error)
{
@ -521,7 +521,7 @@ namespace utils
// Let's only decode one track at a time. Wait for the consumer to finish reading the track.
media_log.notice("audio_decoder: waiting until track is consumed...");
thread_ctrl::wait_on(track_fully_consumed, false);
thread_ctrl::wait_on(track_fully_consumed, 0);
track_fully_consumed = false;
}
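
The two hand-off flags lose their bool type here because only 4-byte atomics can be waited on now; the signalling pattern itself is unchanged. A minimal sketch of that pattern with placeholder names, using the plain atomic wait where the decoder uses thread_ctrl::wait_on(consumed, 0) in the same role:

// Illustrative only: 32-bit flag hand-off between two threads.
static atomic_t<u32> consumed{0};

static void consumer_done()
{
    consumed = 1;
    consumed.notify_one();
}

static void decoder_wait_for_consumer()
{
    while (!consumed.load())
    {
        consumed.wait(0); // sleeps while consumed == 0
    }
    consumed = 0; // re-arm for the next track
}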

View file

@ -77,8 +77,8 @@ namespace utils
std::vector<u8> data;
atomic_t<u64> m_size = 0;
atomic_t<u64> duration_ms = 0;
atomic_t<bool> track_fully_decoded{false};
atomic_t<bool> track_fully_consumed{false};
atomic_t<u32> track_fully_decoded{0};
atomic_t<u32> track_fully_consumed{0};
atomic_t<bool> has_error{false};
std::deque<std::pair<u64, u64>> timestamps_ms;

View file

@ -3,6 +3,7 @@
#include <cstdint>
#include <memory>
#include "atomic.hpp"
#include "asm.hpp"
namespace stx
{
@ -21,7 +22,7 @@ namespace stx
// Basic assumption of userspace pointer size
constexpr uint c_ptr_size = 48;
// Use lower 17 bits as atomic_ptr internal counter of borrowed refs (pointer itself is shifted)
// Use lower 16 bits as atomic_ptr internal counter of borrowed refs (pointer itself is shifted)
constexpr uint c_ref_mask = 0xffff, c_ref_size = 16;
// Remaining pointer bits
@ -1054,20 +1055,19 @@ namespace stx
return observe() == r.get();
}
template <atomic_wait::op Flags = atomic_wait::op::eq>
void wait(const volatile void* value, atomic_wait_timeout timeout = atomic_wait_timeout::inf)
void wait(std::nullptr_t, atomic_wait_timeout timeout = atomic_wait_timeout::inf)
{
m_val.wait<Flags>(reinterpret_cast<uptr>(value) << c_ref_size, c_ptr_mask, timeout);
utils::bless<atomic_t<u32>>(&m_val)[1].wait(0, timeout);
}
void notify_one()
{
m_val.notify_one(c_ptr_mask);
utils::bless<atomic_t<u32>>(&m_val)[1].notify_one();
}
void notify_all()
{
m_val.notify_all(c_ptr_mask);
utils::bless<atomic_t<u32>>(&m_val)[1].notify_all();
}
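
wait(nullptr) and the notifications now go through the upper 32 bits of the packed control word instead of a masked 64-bit wait; utils::bless appears to reinterpret the 64-bit atomic as two adjacent 32-bit atomic words. Because the pointer is stored shifted left by c_ref_size (16) above the borrow counter, any realistic non-null userspace pointer sets bits in the high word, so "high word still 0" is in practice "pointer still null"; the [1] index assumes the little-endian layout targeted here. A small standalone check of that packing, as an illustration:

// Illustrative only: why the high 32-bit word is enough to observe null vs. non-null.
#include <cassert>
#include <cstdint>

int main()
{
    const std::uint64_t ptr_bits = 0x0000'7f12'3456'7000ull; // a typical 48-bit userspace address
    const std::uint64_t packed   = ptr_bits << 16;           // low 16 bits hold borrowed references
    assert(static_cast<std::uint32_t>(packed >> 32) != 0);   // a stored pointer is visible in the high word
    assert(static_cast<std::uint32_t>(std::uint64_t{0} >> 32) == 0); // the null state keeps it at zero
    return 0;
}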
};
@ -1110,18 +1110,6 @@ namespace stx
} null_ptr;
}
namespace atomic_wait
{
template <typename T>
constexpr u128 default_mask<stx::atomic_ptr<T>> = stx::c_ptr_mask;
template <typename T>
constexpr u128 get_value(stx::atomic_ptr<T>&, const volatile void* value = nullptr)
{
return reinterpret_cast<uptr>(value) << stx::c_ref_size;
}
}
using stx::null_ptr;
using stx::single_ptr;
using stx::shared_ptr;