// ppsspp/Windows/WASAPIContext.cpp

#include <windows.h>
#include <mmdeviceapi.h>
#include <functiondiscoverykeys_devpkey.h>
#include <audioclient.h>
#include <avrt.h>
#include <comdef.h>
#include <atomic>
#include <thread>
#include <vector>
#include <string_view>
#include <wrl/client.h>

#include "Common/Data/Encoding/Utf8.h"
#include "Common/Log.h"
#include "Common/Thread/ThreadUtil.h"
#include "WASAPIContext.h"

using Microsoft::WRL::ComPtr;

// We must have one of these already...
static inline s16 ClampFloatToS16(float f) {
	f *= 32768.0f;
	if (f >= 32767) {
		return 32767;
	} else if (f < -32767) {
		return -32767;
	} else {
		return (s16)(s32)f;
	}
}

void BuildStereoFloatFormat(const WAVEFORMATEXTENSIBLE *original, WAVEFORMATEXTENSIBLE *output) {
	// Zero‑init all fields first.
	ZeroMemory(output, sizeof(WAVEFORMATEXTENSIBLE));

	// Fill the WAVEFORMATEX base part.
	output->Format.wFormatTag = WAVE_FORMAT_EXTENSIBLE;
	output->Format.nChannels = 2;
	output->Format.nSamplesPerSec = original->Format.nSamplesPerSec;
	output->Format.wBitsPerSample = 32;                                 // 32‑bit float
	output->Format.nBlockAlign = output->Format.nChannels *
		output->Format.wBitsPerSample / 8;
	output->Format.nAvgBytesPerSec = output->Format.nSamplesPerSec *
		output->Format.nBlockAlign;
	output->Format.cbSize = sizeof(WAVEFORMATEXTENSIBLE) - sizeof(WAVEFORMATEX);

	// Fill the extensible fields.
	output->Samples.wValidBitsPerSample = 32;
	output->dwChannelMask = SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT;
	output->SubFormat = KSDATAFORMAT_SUBTYPE_IEEE_FLOAT;
}

WASAPIContext::WASAPIContext() : notificationClient_(this) {
	HRESULT hr = CoCreateInstance(__uuidof(MMDeviceEnumerator), nullptr, CLSCTX_ALL, IID_PPV_ARGS(&enumerator_));
	if (FAILED(hr)) {
		// Bad!
		enumerator_ = nullptr;
		return;
	}
	enumerator_->RegisterEndpointNotificationCallback(&notificationClient_);
}

WASAPIContext::~WASAPIContext() {
	if (!enumerator_) {
		// Nothing can have been happening.
		return;
	}
	Stop();
	enumerator_->UnregisterEndpointNotificationCallback(&notificationClient_);
	delete[] tempBuf_;
}

WASAPIContext::AudioFormat WASAPIContext::Classify(const WAVEFORMATEX *format) {
	if (format->wFormatTag == WAVE_FORMAT_PCM && format->wBitsPerSample == 2) {
		return AudioFormat::S16;
	} else if (format->wFormatTag == WAVE_FORMAT_EXTENSIBLE) {
		const WAVEFORMATEXTENSIBLE *ex = (const WAVEFORMATEXTENSIBLE *)format;
		if (ex->SubFormat == KSDATAFORMAT_SUBTYPE_IEEE_FLOAT) {
			return AudioFormat::Float;
		}
	} else {
		WARN_LOG(Log::Audio, "Unhandled output format!");
	}
	return AudioFormat::Unhandled;
}

bool GetDeviceDesc(IMMDevice *device, AudioDeviceDesc *desc) {
	ComPtr<IPropertyStore> props;
	device->OpenPropertyStore(STGM_READ, &props);
	PROPVARIANT nameProp;
	PropVariantInit(&nameProp);
	props->GetValue(PKEY_Device_FriendlyName, &nameProp);
	LPWSTR id_str = 0;
	bool success = false;
	if (SUCCEEDED(device->GetId(&id_str))) {
		desc->name = ConvertWStringToUTF8(nameProp.pwszVal);
		desc->uniqueId = ConvertWStringToUTF8(id_str);
		CoTaskMemFree(id_str);
		success = true;
	}
	PropVariantClear(&nameProp);
	return success;
}

void WASAPIContext::EnumerateDevices(std::vector<AudioDeviceDesc> *output, bool captureDevices) {
	ComPtr<IMMDeviceCollection> collection;
	enumerator_->EnumAudioEndpoints(captureDevices ? eCapture : eRender, DEVICE_STATE_ACTIVE, &collection);

	if (!collection) {
		ERROR_LOG(Log::Audio, "Failed to enumerate devices");
		return;
	}

	UINT count = 0;
	collection->GetCount(&count);

	for (UINT i = 0; i < count; ++i) {
		ComPtr<IMMDevice> device;
		collection->Item(i, &device);

		AudioDeviceDesc desc{};
		if (GetDeviceDesc(device.Get(), &desc)) {
			output->push_back(desc);
		}
	}
}

bool WASAPIContext::InitOutputDevice(std::string_view uniqueId, LatencyMode latencyMode, bool *revertedToDefault) {
	Stop();

	*revertedToDefault = false;

	ComPtr<IMMDevice> device;
	if (uniqueId.empty()) {
		// Use the default device.
		if (FAILED(enumerator_->GetDefaultAudioEndpoint(eRender, eConsole, &device))) {
			return false;
		}
	} else {
		// Use whatever device.
		std::wstring wId = ConvertUTF8ToWString(uniqueId);
		if (FAILED(enumerator_->GetDevice(wId.c_str(), &device))) {
			// Fallback to default device
			INFO_LOG(Log::Audio, "Falling back to default device...\n");
			*revertedToDefault = true;
			if (FAILED(enumerator_->GetDefaultAudioEndpoint(eRender, eConsole, &device))) {
				return false;
			}
		}
	}

	AudioDeviceDesc desc{};
	GetDeviceDesc(device.Get(), &desc);
	INFO_LOG(Log::Audio, "Activating audio device: %s", desc.name.c_str());

	deviceId_ = uniqueId;

	HRESULT hr = E_FAIL;
	// Try IAudioClient3 first if not in "safe" mode. It's probably safe anyway, but still, let's use the legacy client as a safe fallback option.
	if (latencyMode != LatencyMode::Safe) {
		hr = device->Activate(__uuidof(IAudioClient3), CLSCTX_ALL, nullptr, (void**)&audioClient3_);
	}

	// Get rid of any old tempBuf_.
	delete[] tempBuf_;
	tempBuf_ = nullptr;

	if (SUCCEEDED(hr)) {
		audioClient3_->GetMixFormat(&format_);
		// We only use AudioClient3 if we got the format we wanted (stereo float).
		if (format_->nChannels != 2 || Classify(format_) != AudioFormat::Float) {
			// Let's fall back to the old path. The docs seem to be wrong, if you try to create an
			// AudioClient3 with low latency audio with AUTOCONVERTPCM, you get the error 0x88890021.
			audioClient3_.Reset();
			// Fall through to AudioClient creation below.
		} else {
			audioClient3_->GetSharedModeEnginePeriod(format_, &defaultPeriodFrames, &fundamentalPeriodFrames, &minPeriodFrames, &maxPeriodFrames);

			INFO_LOG(Log::Audio, "AudioClient3: default: %d fundamental: %d min: %d max: %d\n", (int)defaultPeriodFrames, (int)fundamentalPeriodFrames, (int)minPeriodFrames, (int)maxPeriodFrames);
			INFO_LOG(Log::Audio, "initializing with %d frame period at %d Hz, meaning %0.1fms\n", (int)minPeriodFrames, (int)format_->nSamplesPerSec, FramesToMs(minPeriodFrames, format_->nSamplesPerSec));

			audioEvent_ = CreateEvent(nullptr, FALSE, FALSE, nullptr);
			HRESULT result = audioClient3_->InitializeSharedAudioStream(
				AUDCLNT_STREAMFLAGS_EVENTCALLBACK,
				minPeriodFrames,
				format_,
				nullptr
			);
			if (FAILED(result)) {
				WARN_LOG(Log::Audio, "Error initializing AudioClient3 shared audio stream: %08lx", result);
				audioClient3_.Reset();
				return false;
			}
			actualPeriodFrames_ = minPeriodFrames;

			audioClient3_->GetBufferSize(&reportedBufferSize_);
			audioClient3_->SetEventHandle(audioEvent_);
			audioClient3_->GetService(IID_PPV_ARGS(&renderClient_));
		}
	}

	if (!audioClient3_) {
		// Fallback to IAudioClient (older OS)
		HRESULT hr = device->Activate(__uuidof(IAudioClient), CLSCTX_ALL, nullptr, (void**)&audioClient_);
		if (FAILED(hr)) {
			ERROR_LOG(Log::Audio, "Failed to activate audio device: %08lx", hr);
			return false;
		}

		audioClient_->GetMixFormat(&format_);

		// If there are too many channels, try asking for a 2-channel output format.
		DWORD extraStreamFlags = 0;
		const AudioFormat fmt = Classify(format_);

		bool createBuffer = false;
		if (fmt == AudioFormat::Float) {
			if (format_->nChannels != 2) {
				INFO_LOG(Log::Audio, "Got %d channels, asking for stereo instead", format_->nChannels);
				WAVEFORMATEXTENSIBLE stereo;
				BuildStereoFloatFormat((const WAVEFORMATEXTENSIBLE *)format_, &stereo);

				WAVEFORMATEX *closestMatch = nullptr;
				const HRESULT result = audioClient_->IsFormatSupported(AUDCLNT_SHAREMODE_SHARED, (const WAVEFORMATEX *)&stereo, &closestMatch);
				if (result == S_OK) {
					// We got the format! Use it and set as current.
					_dbg_assert_(!closestMatch);
					format_ = (WAVEFORMATEX *)CoTaskMemAlloc(sizeof(WAVEFORMATEXTENSIBLE));
					memcpy(format_, &stereo, sizeof(WAVEFORMATEX) + stereo.Format.cbSize);
					extraStreamFlags = AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM | AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY;
					INFO_LOG(Log::Audio, "Successfully asked for two channels");
				} else if (result == S_FALSE) {
					// We got another format. Meh, let's just use what we got.
					if (closestMatch) {
						WARN_LOG(Log::Audio, "Didn't get the format we wanted, but got: %lu ch=%d", closestMatch->nSamplesPerSec, closestMatch->nChannels);
						CoTaskMemFree(closestMatch);
					} else {
						WARN_LOG(Log::Audio, "Failed to fall back to two channels. Using workarounds.");
					}
					createBuffer = true;
				} else {
					WARN_LOG(Log::Audio, "Got other error %08lx", result);
					_dbg_assert_(!closestMatch);
				}
			} else {
				// All good, nothing to convert.
			}
		} else {
			// Some other format.
			WARN_LOG(Log::Audio, "Format not float, applying conversion.");
			createBuffer = true;
		}

		// Get engine period info
		REFERENCE_TIME defaultPeriod = 0, minPeriod = 0;
		audioClient_->GetDevicePeriod(&defaultPeriod, &minPeriod);

		audioEvent_ = CreateEvent(nullptr, FALSE, FALSE, nullptr);

		const REFERENCE_TIME duration = minPeriod;
		hr = audioClient_->Initialize(
			AUDCLNT_SHAREMODE_SHARED,
			AUDCLNT_STREAMFLAGS_EVENTCALLBACK | extraStreamFlags,
			duration,  // This is a minimum, the result might be larger. We use GetBufferSize to check.
			0,  // ref duration, always 0 in shared mode.
			format_,
			nullptr
		);

		if (FAILED(hr)) {
			WARN_LOG(Log::Audio, "ERROR: Failed to initialize audio with all attempted buffer sizes\n");
			audioClient_.Reset();
			return false;
		}

		audioClient_->GetBufferSize(&reportedBufferSize_);
		actualPeriodFrames_ = reportedBufferSize_;  // we don't have a better estimate.
		audioClient_->SetEventHandle(audioEvent_);
		audioClient_->GetService(IID_PPV_ARGS(&renderClient_));

		if (createBuffer) {
			tempBuf_ = new float[reportedBufferSize_ * 2];
		}
	}

	latencyMode_ = latencyMode;

	_dbg_assert_(audioClient_ || audioClient3_);

	Start();

	return true;
}

void WASAPIContext::Start() {
	running_ = true;
	audioThread_ = std::thread([this]() { AudioLoop(); });
}

void WASAPIContext::Stop() {
	running_ = false;
	if (audioClient_) audioClient_->Stop();
	if (audioEvent_) SetEvent(audioEvent_);
	if (audioThread_.joinable()) audioThread_.join();

	renderClient_.Reset();
	audioClient_.Reset();
	if (audioEvent_) {
		CloseHandle(audioEvent_);
		audioEvent_ = nullptr;
	}
	if (format_) {
		CoTaskMemFree(format_);
		format_ = nullptr;
	}
}

void WASAPIContext::FrameUpdate(bool allowAutoChange) {
	if (deviceId_.empty() && defaultDeviceChanged_ && allowAutoChange) {
		defaultDeviceChanged_ = false;
		Stop();
		Start();
	}
}

void WASAPIContext::AudioLoop() {
	SetCurrentThreadName("WASAPIAudioLoop");

	DWORD taskID = 0;
	HANDLE mmcssHandle = nullptr;
	if (latencyMode_ == LatencyMode::Aggressive) {
		mmcssHandle = AvSetMmThreadCharacteristics(L"Pro Audio", &taskID);
	}

	UINT32 available;
	if (audioClient3_) {
		audioClient3_->Start();
		audioClient3_->GetBufferSize(&available);
	} else if (audioClient_) {
		audioClient_->Start();
		audioClient_->GetBufferSize(&available);
	} else {
		// No audio client, nothing to do.
		WARN_LOG(Log::Audio, "No audio client");
		return;
	}

	const AudioFormat format = Classify(format_);
	const int nChannels = format_->nChannels;

	while (running_) {
		const DWORD waitResult = WaitForSingleObject(audioEvent_, INFINITE);
		if (waitResult != WAIT_OBJECT_0) {
			// Something bad happened.
			break;
		}

		UINT32 padding = 0;
		if (audioClient3_) {
			audioClient3_->GetCurrentPadding(&padding);
		} else {
			audioClient_->GetCurrentPadding(&padding);
		}

		const UINT32 framesToWrite = available - padding;
		BYTE* buffer = nullptr;
		if (framesToWrite > 0 && SUCCEEDED(renderClient_->GetBuffer(framesToWrite, &buffer))) {
			if (!tempBuf_) {
				// Mix directly to the output buffer, avoiding a copy.
				if (buffer) {
					callback_(reinterpret_cast<float *>(buffer), framesToWrite, format_->nSamplesPerSec, userdata_);
				}
			} else {
				// We decided previously that we need conversion, so mix to our temp buffer...
				callback_(tempBuf_, framesToWrite, format_->nSamplesPerSec, userdata_);
				// .. and convert according to format (we support multi-channel float and s16)
				if (format == AudioFormat::S16 && buffer) {
					// Need to convert.
					s16 *dest = reinterpret_cast<s16 *>(buffer);
					for (UINT32 i = 0; i < framesToWrite; i++) {
						if (nChannels == 1) {
							// Maybe some bluetooth speakers? Mixdown.
							float sum = 0.5f * (tempBuf_[i * 2] + tempBuf_[i * 2 + 1]);
							dest[i] = ClampFloatToS16(sum);
						} else {
							dest[i * nChannels] = ClampFloatToS16(tempBuf_[i * 2]);
							dest[i * nChannels + 1] = ClampFloatToS16(tempBuf_[i * 2 + 1]);
							// Zero other channels.
							for (int j = 2; j < nChannels; j++) {
								dest[i * nChannels + j] = 0;
							}
						}
					}
				} else if (format == AudioFormat::Float && buffer) {
					// We have a non-2 number of channels (since we're in the tempBuf_ 'if'), so we contract/expand.
					float *dest = reinterpret_cast<float *>(buffer);
					for (UINT32 i = 0; i < framesToWrite; i++) {
						if (nChannels == 1) {
							// Maybe some bluetooth speakers? Mixdown.
							dest[i] = 0.5f * (tempBuf_[i * 2] + tempBuf_[i * 2 + 1]);
						} else {
							dest[i * nChannels] = tempBuf_[i * 2];
							dest[i * nChannels + 1] = tempBuf_[i * 2 + 1];
							// Zero other channels.
							for (int j = 2; j < nChannels; j++) {
								dest[i * nChannels + j] = 0;
							}
						}
					}
				}
			}

			renderClient_->ReleaseBuffer(framesToWrite, 0);
		}

		// In the old mode, we just estimate the "actualPeriodFrames_" from the framesToWrite.
		if (audioClient_ && framesToWrite < actualPeriodFrames_) {
			actualPeriodFrames_ = framesToWrite;
		}
	}

	if (audioClient3_) {
		audioClient3_->Stop();
	} else {
		audioClient_->Stop();
	}

	if (mmcssHandle) {
		AvRevertMmThreadCharacteristics(mmcssHandle);
	}
}

void WASAPIContext::DescribeOutputFormat(char *buffer, size_t bufferSize) const {
	const int numChannels = format_->nChannels;
	const int sampleBits = format_->wBitsPerSample;
	const int sampleRateHz = format_->nSamplesPerSec;
	const char *fmt = "N/A";
	if (format_->wFormatTag == WAVE_FORMAT_EXTENSIBLE) {
		const WAVEFORMATEXTENSIBLE *ex = (const WAVEFORMATEXTENSIBLE *)format_;
		if (ex->SubFormat == KSDATAFORMAT_SUBTYPE_IEEE_FLOAT) {
			fmt = "float";
		} else {
			fmt = "PCM";
		}
	} else {
		fmt = "PCM";  // probably
	}
	snprintf(buffer, bufferSize, "%d Hz %s %d-bit, %d ch%s", sampleRateHz, fmt, sampleBits, numChannels, audioClient3_ ? " (ac3)" : " (ac)");
}