Blog Post

Depths of Windows APC

Aspects of internals of the Asynchronous Procedure Calls from the kernel mode.

Depths of Windows APC - Aspects of internals of the Asynchronous Procedure Calls from the kernel mode.
This article contains functions and features that are not documented by the original manufacturer. By following advice in this article, you're doing so at your own risk. The methods presented in this article may rely on internal implementation and may not work in the future.

Intro

After our first blog post on the intricacies of the user-mode APCs, we decided to expand this subject with additional in-depth details about the internals of the Asynchronous Procedure Calls (APC) implemented in the Windows OS.

Let's begin, in no particular order.

Table of Contents

The following topics are just loosely connected to each other, so you may want to use the table of contents for easier navigation:

General APC Internals

For an in-depth understanding of the internals of the kernel APCs refer to the following article: "Inside NT's Asynchronous Procedure Call". We won't be repeating what has been said there. We will add some additional, lesser known APC-related details instead.

To mention briefly, technically an APC is just a few dozen bytes in the kernel memory, known as the KAPC struct:

C++
// Kernel APC object. The article's rule of thumb is that it must be allocated
// from NonPagedPool memory only.
typedef struct _KAPC {
	UCHAR Type;
	UCHAR SpareByte0;
	UCHAR Size;
	UCHAR SpareByte1;
	ULONG SpareLong0;
	_KTHREAD * Thread;			// target thread (read as Apc->Thread in the KiInsertQueueApc excerpt)
	_LIST_ENTRY ApcListEntry;	// links this APC into one of the KAPC_STATE::ApcListHead lists
	// Callback signatures: the first routine receives the KAPC plus pointers to the
	// normal routine and its three arguments.
	void (* KernelRoutine)( _KAPC * , void (* * )( void * , void * , void * ), void * * , void * * , void * * );
	void (* RundownRoutine)( _KAPC * );	// called instead of the other routines when a thread exits with this APC still queued
	void (* NormalRoutine)( void * , void * , void * );
	void * Reserved[0x3];
	// The three values handed to NormalRoutine.
	void * NormalContext;
	void * SystemArgument1;
	void * SystemArgument2;
	CHAR ApcStateIndex;			// stores the KAPC_ENVIRONMENT given to KeInitializeApc
	CHAR ApcMode;				// MODE value; index into KAPC_STATE::ApcListHead
	UCHAR Inserted;				// NOTE(review): presumably non-zero while the APC is queued - confirm
}KAPC, *PKAPC;

That struct is a part of a doubly-linked LIST_ENTRY list inside the KAPC_STATE struct:

C++
// Per-thread APC bookkeeping; embedded in KTHREAD as ApcState and SavedApcState.
// NOTE(review): this looks like a flattened symbol dump - InProgressFlags and
// UserApcPendingAll presumably overlay the bitfields that follow them (unions).
// Confirm against the real symbols before relying on the layout.
typedef struct _KAPC_STATE {
	_LIST_ENTRY ApcListHead[0x2];	// one list per MODE: [KernelMode] and [UserMode]
	_KPROCESS * Process;			// KeStackAttachProcess sets this to NULL in the caller's copy when the state went to SavedApcState
	UCHAR InProgressFlags;
	UCHAR KernelApcInProgress : 01; // 0x01;
	UCHAR SpecialApcInProgress : 01; // 0x02;
	UCHAR KernelApcPending;
	UCHAR UserApcPendingAll;
	UCHAR SpecialUserApcPending : 01; // 0x01;
	UCHAR UserApcPending : 01; // 0x02;
}KAPC_STATE, *PKAPC_STATE;

And KAPC_STATE itself is a part of the thread object, stored in the KTHREAD struct in the kernel:

C++
typedef struct _KTHREAD {
	_DISPATCHER_HEADER Header;
	void * SListFaultAddress;
	ULONGLONG QuantumTarget;
	void * InitialStack;
	void * volatile StackLimit;
	void * StackBase;
	ULONGLONG ThreadLock;
	ULONGLONG volatile CycleTime;
	ULONG CurrentRunTime;
	ULONG ExpectedRunTime;
	void * KernelStack;
	_XSAVE_FORMAT * StateSaveArea;
	_KSCHEDULING_GROUP * volatile SchedulingGroup;
	_KWAIT_STATUS_REGISTER WaitRegister;
	UCHAR volatile Running;
	UCHAR Alerted[0x2];
	ULONG AutoBoostActive : 01; // 0x00000001;
	ULONG ReadyTransition : 01; // 0x00000002;
	ULONG WaitNext : 01; // 0x00000004;
	ULONG SystemAffinityActive : 01; // 0x00000008;
	ULONG Alertable : 01; // 0x00000010;
	ULONG UserStackWalkActive : 01; // 0x00000020;
	ULONG ApcInterruptRequest : 01; // 0x00000040;
	ULONG QuantumEndMigrate : 01; // 0x00000080;
	ULONG UmsDirectedSwitchEnable : 01; // 0x00000100;
	ULONG TimerActive : 01; // 0x00000200;
	ULONG SystemThread : 01; // 0x00000400;
	ULONG ProcessDetachActive : 01; // 0x00000800;
	ULONG CalloutActive : 01; // 0x00001000;
	ULONG ScbReadyQueue : 01; // 0x00002000;
	ULONG ApcQueueable : 01; // 0x00004000;
	ULONG ReservedStackInUse : 01; // 0x00008000;
	ULONG UmsPerformingSyscall : 01; // 0x00010000;
	ULONG TimerSuspended : 01; // 0x00020000;
	ULONG SuspendedWaitMode : 01; // 0x00040000;
	ULONG SuspendSchedulerApcWait : 01; // 0x00080000;
	ULONG CetUserShadowStack : 01; // 0x00100000;
	ULONG BypassProcessFreeze : 01; // 0x00200000;
	ULONG Reserved : 10; // 0xffc00000;
	LONG MiscFlags;
	ULONG BamQosLevel : 02; // 0x00000003;
	ULONG AutoAlignment : 01; // 0x00000004;
	ULONG DisableBoost : 01; // 0x00000008;
	ULONG AlertedByThreadId : 01; // 0x00000010;
	ULONG QuantumDonation : 01; // 0x00000020;
	ULONG EnableStackSwap : 01; // 0x00000040;
	ULONG GuiThread : 01; // 0x00000080;
	ULONG DisableQuantum : 01; // 0x00000100;
	ULONG ChargeOnlySchedulingGroup : 01; // 0x00000200;
	ULONG DeferPreemption : 01; // 0x00000400;
	ULONG QueueDeferPreemption : 01; // 0x00000800;
	ULONG ForceDeferSchedule : 01; // 0x00001000;
	ULONG SharedReadyQueueAffinity : 01; // 0x00002000;
	ULONG FreezeCount : 01; // 0x00004000;
	ULONG TerminationApcRequest : 01; // 0x00008000;
	ULONG AutoBoostEntriesExhausted : 01; // 0x00010000;
	ULONG KernelStackResident : 01; // 0x00020000;
	ULONG TerminateRequestReason : 02; // 0x000c0000;
	ULONG ProcessStackCountDecremented : 01; // 0x00100000;
	ULONG RestrictedGuiThread : 01; // 0x00200000;
	ULONG VpBackingThread : 01; // 0x00400000;
	ULONG ThreadFlagsSpare : 01; // 0x00800000;
	ULONG EtwStackTraceApcInserted : 08; // 0xff000000;
	LONG volatile ThreadFlags;
	UCHAR volatile Tag;
	UCHAR SystemHeteroCpuPolicy;
	UCHAR UserHeteroCpuPolicy : 07; // 0x7f;
	UCHAR ExplicitSystemHeteroCpuPolicy : 01; // 0x80;
	UCHAR RunningNonRetpolineCode : 01; // 0x01;
	UCHAR SpecCtrlSpare : 07; // 0xfe;
	UCHAR SpecCtrl;
	ULONG SystemCallNumber;
	ULONG ReadyTime;
	void * FirstArgument;
	_KTRAP_FRAME * TrapFrame;
	_KAPC_STATE ApcState;
	UCHAR ApcStateFill[0x2b];
	CHAR Priority;
	ULONG UserIdealProcessor;
	LONGLONG volatile WaitStatus;
	_KWAIT_BLOCK * WaitBlockList;
	_LIST_ENTRY WaitListEntry;
	_SINGLE_LIST_ENTRY SwapListEntry;
	_DISPATCHER_HEADER * volatile Queue;
	void * Teb;
	ULONGLONG RelativeTimerBias;
	_KTIMER Timer;
	_KWAIT_BLOCK WaitBlock[0x4];
	UCHAR WaitBlockFill4[0x14];
	ULONG ContextSwitches;
	UCHAR WaitBlockFill5[0x44];
	UCHAR volatile State;
	CHAR Spare13;
	UCHAR WaitIrql;
	CHAR WaitMode;
	UCHAR WaitBlockFill6[0x74];
	ULONG WaitTime;
	UCHAR WaitBlockFill7[0xa4];
	SHORT KernelApcDisable;
	SHORT SpecialApcDisable;
	ULONG CombinedApcDisable;
	UCHAR WaitBlockFill8[0x28];
	_KTHREAD_COUNTERS * ThreadCounters;
	UCHAR WaitBlockFill9[0x58];
	_XSTATE_SAVE * XStateSave;
	UCHAR WaitBlockFill10[0x88];
	void * volatile Win32Thread;
	UCHAR WaitBlockFill11[0xb0];
	_UMS_CONTROL_BLOCK * Ucb;
	_KUMS_CONTEXT_HEADER * volatile Uch;
	void * Spare21;
	_LIST_ENTRY QueueListEntry;
	ULONG volatile NextProcessor;
	ULONG NextProcessorNumber : 31; // 0x7fffffff;
	ULONG SharedReadyQueue : 01; // 0x80000000;
	LONG QueuePriority;
	_KPROCESS * Process;
	_GROUP_AFFINITY UserAffinity;
	UCHAR UserAffinityFill[0xa];
	CHAR PreviousMode;
	CHAR BasePriority;
	CHAR PriorityDecrement;
	UCHAR ForegroundBoost : 04; // 0x0f;
	UCHAR UnusualBoost : 04; // 0xf0;
	UCHAR Preempted;
	UCHAR AdjustReason;
	CHAR AdjustIncrement;
	ULONGLONG AffinityVersion;
	_GROUP_AFFINITY Affinity;
	UCHAR AffinityFill[0xa];
	UCHAR ApcStateIndex;
	UCHAR WaitBlockCount;
	ULONG IdealProcessor;
	ULONGLONG NpxState;
	_KAPC_STATE SavedApcState;
	UCHAR SavedApcStateFill[0x2b];
	UCHAR WaitReason;
	CHAR SuspendCount;
	CHAR Saturation;
	USHORT SListFaultCount;
	_KAPC SchedulerApc;
	UCHAR SchedulerApcFill0[0x1];
	UCHAR ResourceIndex;
	UCHAR SchedulerApcFill1[0x3];
	UCHAR QuantumReset;
	UCHAR SchedulerApcFill2[0x4];
	ULONG KernelTime;
	UCHAR SchedulerApcFill3[0x40];
	_KPRCB * volatile WaitPrcb;
	UCHAR SchedulerApcFill4[0x48];
	void * LegoData;
	UCHAR SchedulerApcFill5[0x53];
	UCHAR CallbackNestingLevel;
	ULONG UserTime;
	_KEVENT SuspendEvent;
	_LIST_ENTRY ThreadListEntry;
	_LIST_ENTRY MutantListHead;
	UCHAR AbEntrySummary;
	UCHAR AbWaitEntryCount;
	UCHAR AbAllocationRegionCount;
	CHAR SystemPriority;
	ULONG SecureThreadCookie;
	_KLOCK_ENTRY LockEntries[0x6];
	_SINGLE_LIST_ENTRY PropagateBoostsEntry;
	_SINGLE_LIST_ENTRY IoSelfBoostsEntry;
	UCHAR PriorityFloorCounts[0x10];
	ULONG PriorityFloorSummary;
	LONG volatile AbCompletedIoBoostCount;
	LONG volatile AbCompletedIoQoSBoostCount;
	SHORT volatile KeReferenceCount;
	UCHAR AbOrphanedEntrySummary;
	UCHAR AbOwnedEntryCount;
	ULONG ForegroundLossTime;
	_LIST_ENTRY GlobalForegroundListEntry;
	_SINGLE_LIST_ENTRY ForegroundDpcStackListEntry;
	ULONGLONG InGlobalForegroundList;
	LONGLONG ReadOperationCount;
	LONGLONG WriteOperationCount;
	LONGLONG OtherOperationCount;
	LONGLONG ReadTransferCount;
	LONGLONG WriteTransferCount;
	LONGLONG OtherTransferCount;
	_KSCB * QueuedScb;
	ULONG volatile ThreadTimerDelay;
	LONG volatile ThreadFlags2;
	ULONG PpmPolicy : 02; // 0x00000003;
	ULONG ThreadFlags2Reserved : 30; // 0xfffffffc;
	ULONGLONG TracingPrivate[0x1];
	void * SchedulerAssist;
	void * volatile AbWaitObject;
}KTHREAD, *PKTHREAD;

Attaching a Thread To Another Process

One thing worth noting here is that any thread can be temporarily attached to another process through a call to KeStackAttachProcess (and receive a KAPC_STATE object, see its ApcState parameter), or be detached via a call to KeUnstackDetachProcess. But it's a subtle nuance that can lead to problems, so kernel developers need to be aware of it.

Thus, it is important to understand that when we initialize an APC object using undocumented but exported KeInitializeApc call:

C++
// Undocumented but exported: fills in a caller-allocated KAPC before it is queued.
VOID KeInitializeApc(
	IN PRKAPC Apc,									//pointer to KAPC
	IN PKTHREAD Thread,								//thread the APC is targeted at
	IN KAPC_ENVIRONMENT Environment,				//later saved in KAPC::ApcStateIndex
	IN PKKERNEL_ROUTINE KernelRoutine,				//required callback
	IN PKRUNDOWN_ROUTINE RundownRoutine OPTIONAL,	//called if the APC is discarded, e.g. on thread exit
	IN PKNORMAL_ROUTINE NormalRoutine OPTIONAL,
	IN KPROCESSOR_MODE ApcMode,						//KernelMode or UserMode
	IN PVOID NormalContext							//first argument passed to NormalRoutine
);

We provide its KAPC_ENVIRONMENT parameter, that is enumerated as:

C++
/* Selects which APC state of the target thread an APC is bound to.
 * The chosen value ends up in KAPC::ApcStateIndex. */
typedef enum _KAPC_ENVIRONMENT {
	OriginalApcEnvironment = 0,	/* run only in the thread's original (pre-attach) process */
	AttachedApcEnvironment = 1,
	CurrentApcEnvironment  = 2
} KAPC_ENVIRONMENT;

This parameter specifies APC environment. Or, in other words, when we insert an APC we tell the system whether it should be activated for the current thread, or if it should be activated for the saved state (KTHREAD::SavedApcState) before the thread was attached to another process. This parameter is later saved in the KAPC::ApcStateIndex member.

To illustrate this concept let's review the code inside KiInsertQueueApc that has the following logic:

C++
// KiInsertQueueApc() excerpt:
// Picks the KAPC_STATE that the APC will be queued to.

Thread = Apc->Thread;
PKAPC_STATE ApcState;

// APC asked for the original environment (index 0) while the thread is
// currently attached elsewhere (thread index non-zero): use the saved state.
if (Apc->ApcStateIndex == 0 && Thread->ApcStateIndex != 0)
{
	ApcState = &Thread->SavedApcState;
}
else
{
	// Otherwise bind the APC to whatever state the thread is using right now.
	Apc->ApcStateIndex = Thread->ApcStateIndex;
	ApcState = &Thread->ApcState;
}

So basically KAPC::ApcStateIndex is a boolean value:

  • Non-0: means that APC is inserted into the current thread. Or, in other words, that the APC should be executed in the context of the current process, in which the thread is currently running.
  • 0: means that the APC should be executed only in the original process, or the one before the thread was attached to the current process.

Then inside the KeStackAttachProcess function there's the following logic:

C++
// KeStackAttachProcess() excerpt:
// Decides where the thread's current APC state is stored during the attach.

if (Thread->ApcStateIndex != 0)
{
	// Already attached (nested attach): save into the caller-supplied ApcState.
	KiAttachProcess(Thread, Process, &LockHandle, ApcState);
}
else
{
	// First attach: save into KTHREAD::SavedApcState and mark the caller's
	// ApcState as unused by clearing its Process pointer.
	KiAttachProcess(Thread, Process, &LockHandle, &Thread->SavedApcState);
	ApcState->Process = NULL;
}

Which means that when we first attach a thread to another process, i.e. if its KTHREAD::ApcStateIndex is 0, the current KTHREAD::ApcState is saved in KTHREAD::SavedApcState, and the passed ApcState is not used (apart from setting its KAPC_STATE::Process to 0 to signal that the state was saved in KTHREAD::SavedApcState.)

But if we have a recursive attachment, or when a thread was already attached to another process when KeStackAttachProcess was called again, in that case the APC state is saved in the ApcState object that was passed into the function.

The reason for this logic is to have the original APC state for the thread to be always accessible by the system. This can be used either to insert an APC into the original thread, or to detach the thread back to the original process via a call to KeUnstackDetachProcess.

APC Types

APCs come in two basic flavors: kernel- and user-mode APCs. Kernel-mode APCs give developers more flexibility in the way they are queued and processed. (We discussed user-mode APCs in this blog post already.) Kernel-mode APCs are not accessible directly to the user-mode programmers.

Internally KAPC_STATE::ApcListHead contains 2 lists for kernel-mode and user-mode APCs that were queued for the thread, respectively:

C++
/* Processor mode. KernelMode and UserMode double as indexes into
 * KAPC_STATE::ApcListHead; MaximumMode is the number of modes. */
typedef enum _MODE {
	KernelMode  = 0,
	UserMode    = 1,
	MaximumMode = 2
} MODE;

The kernel uses those lists to maintain the state of each type of APCs. The KAPC::ApcMode serves as an index into KAPC_STATE::ApcListHead when APC is queued or processed by a call to KeInsertQueueApc:

C++
// NOTE(review): the sentence introducing this listing refers to KeInsertQueueApc,
// but the prototype shown is NtQueueApcThread - confirm which one was intended.
NTSTATUS NtQueueApcThread(
	IN HANDLE Thread,						// handle of the thread to queue the APC to
	IN PKNORMAL_ROUTINE NormalRoutine,		// callback to run as the APC
	IN PVOID NormalContext,					// the three values below are the callback's arguments
	IN PVOID SystemArgument1,
	IN PVOID SystemArgument2
);

Memory Imperative for Kernel APCs

Many novice kernel developers make a mistake of specifying the wrong type of memory for kernel-mode APCs. This is important to realize to prevent all sorts of unexpected BSODs.

The rule of thumb to remember is that KAPC struct has to be allocated from the NonPagedPool memory only (or from a similar NonPagedPool* type.) This is also true even if you initialize and insert your APC at the PASSIVE_LEVEL IRQL.

The reason for this restriction is that some other APC can also be inserted into the same thread by code running at the higher DISPATCH_LEVEL IRQL. During insertion into the doubly-linked APC list, the system will access the other KAPC structs that are already in the list. So if any of them were allocated from the PagedPool, you will get an indirect access to paged memory from DISPATCH_LEVEL, which is a guaranteed way to get a BSOD.

The tricky nature of the situation that I described above is that it is very rare and may not come up during the development and testing stage. This will be very hard to diagnose in your production code, since BSOD as I explained above, may happen at a later time in an environment that you do not control.

Interrupts & Blocking Kernel APCs

The important thing to remember about kernel-mode APC is that it works as an interrupt, which means that it can happen between (almost) any two CPU instructions in your code.

Kernel mode development allows us to prevent execution of APCs. This should be resorted to only in some exceptional parts of the code by raising the IRQL to APC_LEVEL or above, or by placing your code between calls to KeEnterCriticalRegion and KeLeaveCriticalRegion. (Note that those functions would not prevent execution of so called special kernel APCs, that can be blocked only by raising the IRQL level.)

An interesting fact about the restriction that I showed above is that if an APC arrives within the critical region, it won't be lost, and will be processed later inside either of the following functions: KeLeaveGuardedRegion, KeLeaveCriticalRegion, KeLowerIrql*, or at the end of the critical region.

RundownRoutine Details

If I quote this blog post again:

Optionally, either kind of APC may define a valid RundownRoutine. This routine must reside in kernel memory and is only called when the system needs to discard the contents of the APC queues, such as when the thread exits. In this case, neither KernelRoutine nor NormalRoutine are executed, just the RundownRoutine. An APC without such a routine will be deleted.

There are a couple of additional points that could be added to it:

  • The RundownRoutine callback is only invoked when a thread is exiting while it still has pending APCs queued. (Which is quite possible for user-mode APCs.) But it will not be invoked otherwise.
  • If RundownRoutine is NULL, then the kernel simply calls ExFreePool(Apc), which is what was assumed under "APC without such a routine will be deleted" in that blog post. But of course, if the programmer allocated memory with a call to ExAllocatePool(NonPagedPool, sizeof(KAPC)) and no additional allocations were involved after that, then we can rely on the system to deallocate it for us. But if KAPC was allocated differently, or if the address of KAPC does not match the beginning of allocated memory, or due to other reasons, then all deallocations must be performed within the RundownRoutine callback override.

APC & Driver Unloading Nuances

There's one subtle moment when it comes to invoking kernel APC callback routines. For instance, the KernelRoutine callback must be always provided, and thus the driver itself cannot be unloaded from memory while its APC callback may be still running. Otherwise, it's a sure recipe for BSOD.

One can easily replicate the BSOD tied to a pending APC for the driver that is being unloaded. Put a breakpoint on some thread and queue an APC to it. Force the driver to unload and then resume the thread and invoke an APC with a call to NtTestAlert. Such will guarantee a BSOD.

Ideally, the system implementation of APCs should have been the following:

  • It must have a reference to DriverObject in KAPC, and before insertion of the APC the KeInsertQueueApc function should have done ObfReferenceObject(Apc->DriverObject) (and additionally, if KeInsertQueueApc fails, also call ObfDereferenceObject(Apc->DriverObject) internally.) With these steps, the driver will not be unloaded while there are queued APCs.
  • Then before the final invocation of the KernelRoutine, NormalRoutine, or RundownRoutine, the system should've read DriverObject = Apc->DriverObject into the local stack, invoked the appropriate APC callback, and then called ObfDereferenceObject(DriverObject), since the Apc itself will not be valid after the callback returns.
  • Additionally, it would be also very helpful if RundownRoutine was invoked unconditionally, and not how it's currently done now.

With the changes that I proposed above, the coding of the kernel-mode APC callback routines would be much more simple. But unfortunately the invocation of those callbacks was not coded correctly. 😒

Incidentally, such functionality has been realized for the WorkItem objects. See IoInitializeWorkItem function. We pass into it a pointer to the DriverObject or device object, which will hold our driver in memory and won't let it unload while WorkItem is still active. Or, in other words, when we add a WorkItem, the system calls ObfReferenceObject for us, and then when our final callback is invoked, the system then calls ObfDereferenceObject. Which is the correct way to implement it.

So what's the workaround for setting up the kernel APC callbacks correctly?

Obviously we can call ObfReferenceObject from the driver itself during initialization. But how do we call ObfDereferenceObject at the end of the lifetime of our object from within it? If we do it, and the execution returns back from the ObfDereferenceObject function, we will create a situation in which the driver code that we're running is already unloaded. This is a good way to cause a BSOD.

My solution to this problem is to use the assembly language and to invoke ObfDereferenceObject function using the JMP instruction instead of a conventional CALL instruction, like most compilers do. By using the JMP instruction, we're guaranteeing that the execution will not return back to the code that is being unloaded. Unfortunately though such solution is not currently available through C or C++ languages.

Check this assembly code for an example of implementation of this technique, or check my GitHub for the full sample.

Case Study - Pitfalls of Early Injection Into Kernel32.dll

This is the actual case that I helped to resolve while freelancing for one antivirus company (that should remain nameless.)

Let's say, that an antivirus company wanted to inject their own DLL into all running processes. Additionally, they wanted to run code in their DLL very early, even before other loaded DLLs had a chance to receive DLL_PROCESS_ATTACH notification.

This worked well for them, except when one competing product was also installed on the system, everything crashed.

They later discovered that the other AV was inserting an APC into the loading of kernel32.dll, which made their injected DLL load earlier, and they couldn't figure out why that was causing the crash.

The answer to that conundrum was to understand the early DLL loading process that I describe here. When the custom DLL of our AV company was injected and loaded before kernel32.dll, that DLL should not have had any dependencies on any other DLL except the native ntdll.dll (directly, or indirectly via dependencies in other modules.) But that was not the case, and that is what was causing the crash.

If a driver, like I show here, invokes a user-mode APC callback, that in turn was invoking LoadLibrary on some custom DLL, and if such callback was invoked before kernel32.dll had a chance to load itself, then a call to LoadLibrary will attempt to import ntdll.dll, while the imports were not set up yet. So the first imported call to any function in ntdll.dll from within kernel32.dll will crash the process.

As a workaround for AV company, they needed to write their injector in a different way. APC was not the best solution because of the limitations that I described above, and because of the fact that their DLL was supposed to be loaded into every module in the system.

If we are using APC callback, we must be ready that our callback can be invoked at any moment after we queued it. But if we call LoadLibrary[Ex] type function from our callback, that in itself is imported from kernel32.dll, we're breaking that rule because that library may not be yet initialized in our process.

In that case, a specially crafted shellcode could be a better approach, that will load the DLL using native functions, such as ntdll!LdrLoadDll:

C++
// Native (ntdll) DLL loader entry point, suggested in the text as the
// alternative to LoadLibrary[Ex] for code that runs before kernel32.dll is ready.
// NOTE(review): parameter meanings below are inferred from their names - confirm.
NTSTATUS LdrLoadDll( 
	IN PCWSTR SearchPaths,
	IN PULONG pFlags,
	IN PCUNICODE_STRING DllName,		// presumably the name/path of the DLL to load
	OUT HMODULE* pDllBase				// presumably receives the base address of the loaded module
);

Additionally, such custom DLL itself must only have static imports from ntdll.dll, or alternatively use delay-loaded imports from kernel32.dll. Such DLL cannot use any of the C Run-Time Libraries (CRT) and many of the C++ constructs either, as they (even if linked statically) will bring implicit imports to kernel32.dll and other libraries.

User-Mode APCs From The Kernel

For the user-mode APC the situation is different in the following ways:

  • It can't execute between any two CPU instructions, or in other words, it is not delivered via a CPU interrupt.
  • It has to run in ring-3 code, or with the user-mode context.
  • It runs only after execution of specific waitable Windows functions when thread is in an alertable state.

To accomplish this, the kernel and the native-subsystem are coded in such a way that user-mode APCs are executed when the CPU leaves the system call. Many Windows functions (or WinAPIs) require a call to the kernel, which is delivered via the sysenter CPU instruction. Upon its execution, the CPU first enters the part of the Windows kernel that is responsible for routing system calls, known as the System Service Dispatcher. Then the system call itself is processed depending on the system function index supplied in the EAX register. And only after that, but before leaving the kernel space, the System Service Dispatcher checks for the presence of the user-mode APCs and adjusts the KTRAP_FRAME on the kernel stack to handle user-mode APC later.

The checks for the presence of the user-mode APCs are done in the nt!KiDeliverApc function in the kernel. In a nutshell, after processing kernel-mode APCs for the thread, it checks if KTHREAD::PreviousMode == UserMode, and that KTHREAD.SpecialApcDisable is not set, and if so it then checks that KTHREAD.ApcState.UserApcPending is not zero, signifying the presence of the user-mode APC. Then it calls nt!KiInitializeUserApc that modifies the user mode context for the return from the system call to process the user-mode APC.

For that, nt!KiInitializeUserApc remembers the original ring-3 context where the system call was supposed to return before adjusting KTRAP_FRAME to return execution into the special ntdll!KiUserApcDispatcher function in the native subsystem. After that nt!KiInitializeUserApc returns.

And only later, upon execution of the sysexit CPU instruction, due to the modified KTRAP_FRAME context, CPU returns into the ntdll!KiUserApcDispatcher function in ring-3. That function in turn processes a single user-mode APC and then calls ntdll!NtContinue(context, TRUE) that returns execution back to the kernel. And the cycle that I described above continues until there's no more user-mode APCs left in the queue for the thread.

Implementation of User-mode APCs

There are some specific aspects of user-mode APCs that I need to point out:

  • Even though CPU can enter kernel-mode at any moment between any two instructions following an interrupt, a user-mode APC callback does not get invoked at that time. User-mode APCs can be invoked only after execution of special Windows API calls, as I described here.
  • Hypothetically any Windows API that requires sysenter can be used to process user-mode APCs upon return, provided that some kernel code sets KTHREAD.ApcState.UserApcPending for the thread, and a user-mode APC is queued prior to the call.
  • Setting the KTHREAD.ApcState.UserApcPending is what MSDN calls alertable state for a thread. Which is a somewhat confusing terminology.
  • Which APIs can set that KTHREAD.ApcState.UserApcPending flag? Obviously the following documented functions can do it: SleepEx, SignalObjectAndWait, MsgWaitForMultipleObjectsEx, WaitForMultipleObjectsEx, or WaitForSingleObjectEx. But there are also these undocumented functions that can do it too:
    • ntdll!NtTestAlert, that has no input parameters. It seems like its only function is to prepare all queued user-mode APCs. Internally it calls nt!KiInitializeUserApc itself, that I described here:
      C++
      // No input parameters; per the text it prepares the queued user-mode APCs for delivery.
      NTSTATUS NtTestAlert();
    • ntdll!NtContinue, that returns execution back to the kernel for continued processing (like I described here) and then passes the execution to provided user-mode ThreadContext, while optionally setting KTHREAD.ApcState.UserApcPending if RaiseAlert is set:
      C++
      // Re-enters the kernel and resumes execution at the supplied user-mode context.
      NTSTATUS NtContinue(
      	IN PCONTEXT ThreadContext,		// user-mode context to continue from
      	IN BOOLEAN RaiseAlert			// when set, KTHREAD.ApcState.UserApcPending is set as well
      );

"Special" User-mode APCs

There's also a new member in the KAPC_STATE struct, called SpecialUserApcPending. There's not much known about it, except some bits and pieces from the true "Windows internals spelunkers":

It's been a while since APCs got messed around with. RS5 now adds "Special User APCs" (KTHREAD->SpecialUserApcPending) which can be queued with NtQueueApcThreadEx passing in 1 as the reserve handle. These are delivered with Mode == KernelMode to force a thread signal. Big change.

Broken User-Mode APC Implementation in Windows XP

This information applies only to legacy implementation on Windows XP and earlier systems.

If we follow the documentation for the QueueUserAPC function, we can see the following section about APCs:

If an application queues an APC before the thread begins running, the thread begins by calling the APC function ...

Prior to Windows Vista, when a thread began running (from the kernel this happened after a call to KiStartUserThread and then to PspUserThreadStartup) the kernel would queue a user-mode APC with a callback set to ntdll!LdrInitializeThunk. But this meant that in user-mode, the thread would begin running from the special post-System-Service-Dispatcher function ntdll!KiUserApcDispatcher (as I described here) and not from the intended ntdll!LdrInitializeThunk.

The problem in this case was that if we ourselves added our APC into that thread, it could've begun running before ntdll!LdrInitializeThunk, and thus we would receive a thread context that was not yet initialized. That could lead to some intermittent crashes and nasty timing bugs.

The solution back then was to call GetThreadContext that would guarantee that the thread context was initialized before returning. And only after that it was safe to queue an APC:

C++ (Outdated code. Do not use!)
//WARNING: Deprecated code - do not use!
//Windows XP-era workaround: forces the new thread's context to be initialized
//before a user-mode APC is queued to it.
HANDLE hThread = CreateThread(NULL, 0, ThreadProc, 0, CREATE_SUSPENDED, NULL);
if (hThread)
{
	//XP bug workaround. GetThreadContext reads ctx.ContextFlags to decide what
	//to retrieve, so it must be set before the call; the retrieved values are
	//not used, only the side effect of the call matters here.
	CONTEXT ctx;
	ctx.ContextFlags = CONTEXT_CONTROL;
	GetThreadContext(hThread, &ctx);

	//Now it's safe to queue APC
	QueueUserAPC(Papcfunc, hThread, 0);

	//Because thread is originally suspended, this will ensure that our APC callback 
	//in 'Papcfunc' is executed before 'ThreadProc'
	ResumeThread(hThread);

	CloseHandle(hThread);
}
The reason GetThreadContext was able to solve that timing bug lies in the way the thread context is retrieved. It is done by queuing a special kernel-mode APC into the target thread, with a callback function that collects the thread's context and then sets an internal event. The calling thread (the one that called GetThreadContext) waits on that event and reads the context once the event is set.

Intricacies of DLL Injection Via User-Mode APC

There is a technique to perform DLL injection into a process that we start ourselves. It works as such:

  • Create a process that is originally suspended (CreateProcess with CREATE_SUSPENDED flag.) We only need it for its initial thread.
  • Add an APC into that thread (QueueUserAPC) with a callback set to LoadLibrary function and resume it (ResumeThread).
  • Our APC callback, or a call to LoadLibrary is guaranteed to be called in the target process before its entry point code.

But when will our APC callback be called? This should technically happen before the entry point code in the process has a chance to run, at the exit from the ntdll!LdrInitializeThunk function call (when the code inside it invokes NtTestAlert.) So we're guaranteed that our APC callback will not be called later than that. But can it be called earlier?

What if one of the DLLs that are loaded into the process at its creation calls one of the alertable wait functions in its DLL_PROCESS_ATTACH handler? This is highly unlikely for the Windows system DLLs, but is still possible for a custom DLL that is also loaded into the process. The bottom line is that such scenario will lead to our APC callback being called earlier.

But really, who cares if we call LoadLibrary and inject our DLL earlier? In most cases this won't matter that much.

PsSetLoadImageNotifyRoutine Gotcha

There's one intricate situation that can be very critical for when DLL is loaded. Say, a driver may use PsSetLoadImageNotifyRoutine function to intercept loading of some DLLs. To do that it queues its own APC early into the DLL loading process. A driver then usually sets the KAPC_STATE::UserApcPending flag (implicitly) with a call to KeDelayExecutionThread, or using the undocumented function KeTestAlertThread, and thus forcing the user-mode code (in the APC callback) to run before the code in the DLL that is being loaded has any chance to run itself.

This can be illustrated in the following pseudo-code:

The full version of the code below can be found at my GitHub.
C++
#ifndef _WIN64
#error Showing this for 64-bit builds only!
#endif

// Bit set of driver state flags; bit positions come from the enum below.
LONG gFlags;
// Driver object saved by DriverEntry; the APC code references/dereferences it
// to keep the driver loaded while its APCs are outstanding.
PDRIVER_OBJECT g_DriverObject;

// Bit indexes into gFlags.
enum{
	flImageNotifySet,	// set once PsSetLoadImageNotifyRoutine succeeded, cleared by FreeLoadImageData
};

// Driver entry point: remembers the driver object, installs the unload
// callback and registers the image-load notification.
// Returns the status of PsSetLoadImageNotifyRoutine.
extern "C" NTSTATUS NTAPI DriverEntry(PDRIVER_OBJECT DriverObject, PUNICODE_STRING RegistryPath)
{
	g_DriverObject = DriverObject;
	DriverObject->DriverUnload = DriverUnload;

	const NTSTATUS registered = PsSetLoadImageNotifyRoutine(OnLoadImage);

	if (registered < 0)
	{
		// Registration failed: nothing to undo later, so the flag stays clear.
		return registered;
	}

	// Remember that the notify routine must be removed on unload.
	_bittestandset(&gFlags, flImageNotifySet);

	return registered;
}

// Unload callback installed by DriverEntry; all teardown lives in
// FreeLoadImageData(). The DriverObject argument is not used.
void NTAPI DriverUnload(PDRIVER_OBJECT DriverObject)
{
	FreeLoadImageData();
}

// Removes the image-load notify routine, but only if DriverEntry registered it.
// The test-and-reset clears the flag, so a second call does nothing.
void FreeLoadImageData()
{
	if (!_bittestandreset(&gFlags, flImageNotifySet))
	{
		return;
	}

	PsRemoveLoadImageNotifyRoutine(OnLoadImage);
}

// Image-load notification callback. Starts the injection when a user-mode
// image whose name ends in "\kernel32.dll" is being mapped into the current
// process through LdrLoadDll. The checks run in the order listed and stop at
// the first one that fails.
VOID CALLBACK OnLoadImage(
	IN PUNICODE_STRING FullImageName,
	IN HANDLE ProcessId, // Process where image is mapped
	IN PIMAGE_INFO ImageInfo
	)
{
	STATIC_UNICODE_STRING(kernel32, "\\kernel32.dll");

	// Kernel-mode images are of no interest.
	if (ImageInfo->SystemModeImage)
	{
		return;
	}

	// section can be "remotely" mapped from another process
	if (ProcessId != PsGetCurrentProcessId())
	{
		return;
	}

	if (!SuffixUnicodeString(FullImageName, &kernel32))
	{
		return;
	}

	if (!IsByLdrLoadDll(&kernel32))
	{
		return;
	}

	BeginInject(&NATIVE_DLL::di);
}

// APC callbacks handed to KeInitializeApc. Their bodies are not in this
// listing; the C++ code below defines the `_`-prefixed counterparts instead.
// NOTE(review): presumably these are assembly thunks that call the `_` versions
// and then JMP to ObfDereferenceObject, as the article describes - confirm
// against the full sample.
VOID CALLBACK RundownRoutine(PKAPC );
VOID CALLBACK KernelRoutine(PKAPC , PKNORMAL_ROUTINE *, PVOID * , PVOID * ,PVOID * );
VOID CALLBACK NormalRoutine(PVOID , PVOID ,PVOID );

// Queues a kernel-mode APC to the current thread to continue the injection of the DLL
// described by 'pdi'.
//
// pdi - DLL description; its GetSection() supplies the section object to map later.
//
// References held while the APC is pending:
//  - g_DriverObject: released by the assembly thunks (RundownRoutine/KernelRoutine/
//    NormalRoutine), which tail-jump to ObfDereferenceObject after their C++ part ran.
//  - Section: released by _NormalRoutine, or right here if the APC could not be queued.
void BeginInject(DLL_INFORMATION* pdi)
{
	PVOID Section;

	if (0 <= pdi->GetSection(&Section))
	{
		// ExAllocatePool returns PVOID: C++ requires an explicit cast to PKAPC
		if (PKAPC Apc = (PKAPC)ExAllocatePool(NonPagedPool, sizeof(KAPC)))
		{
			// The KAPC itself is passed as NormalContext so that _NormalRoutine can reuse and free it
			KeInitializeApc(Apc, KeGetCurrentThread(), OriginalApcEnvironment, 
				KernelRoutine, RundownRoutine, NormalRoutine, KernelMode, Apc);

			ObfReferenceObject(g_DriverObject);
			ObfReferenceObject(Section);

			if (!KeInsertQueueApc(Apc, Section, pdi, IO_NO_INCREMENT))
			{
				// Not queued: nobody else will drop these references
				ObfDereferenceObject(Section);

				// Frees the KAPC and dereferences g_DriverObject (see the assembly thunk)
				RundownRoutine(Apc);
			}
		}
	}
}

// Undocumented kernel export; _NormalRoutine calls it with UserMode right after queuing
// the user-mode APC ("Force user-mode APC callback").
extern "C" NTSYSAPI BOOLEAN NTAPI KeTestAlertThread(IN KPROCESSOR_MODE 	AlertMode);

// C++ part of the kernel-mode APC's normal routine (called from the NormalRoutine
// assembly thunk). Maps the DLL section and re-queues the same KAPC as a user-mode APC
// that targets a routine inside the mapped image.
//
// Apc     - the KAPC queued by BeginInject (it was passed as NormalContext); reused here.
// Section - section object referenced by BeginInject; the reference is dropped here.
// pdi     - DLL description (MapSection(), rva_1).
//
// On any failure the KAPC is freed via _RundownRoutine before returning.
VOID CALLBACK _NormalRoutine (
							  PKAPC Apc,
							  PVOID Section,
							  DLL_INFORMATION* pdi
							  )
{
	PVOID BaseAddress;
	NTSTATUS status = pdi->MapSection(BaseAddress);

	// Balances ObfReferenceObject(Section) in BeginInject, whether or not mapping worked
	ObfDereferenceObject(Section);

	if (0 <= status)
	{
		// Lets the computed address be used both as PVOID and as a routine pointer.
		// NOTE: inside this block 'NormalRoutine' names this union member, not the global thunk.
		union {
			PVOID pvNormalRoutine;
			PKNORMAL_ROUTINE NormalRoutine;
		};

		PVOID NormalContext = BaseAddress;
		// User-mode routine = mapped image base + rva_1 from the DLL description
		pvNormalRoutine = (PBYTE)BaseAddress + pdi->rva_1;

		// For the WOW64 flavor of the DLL, wrap context/routine before initializing the APC
		if (pdi == &WOW_DLL::di) PsWrapApcWow64Thread(&NormalContext, &pvNormalRoutine);

		// Re-initialize the same KAPC, this time as a user-mode APC
		KeInitializeApc(Apc, KeGetCurrentThread(), OriginalApcEnvironment, 
			KernelRoutine, RundownRoutine, NormalRoutine, UserMode, NormalContext);

		// Reference for the newly queued user-mode APC (dropped by the thunk that finishes it)
		ObfReferenceObject(g_DriverObject);

		if (KeInsertQueueApc(Apc, NtCurrentProcess(), BaseAddress, IO_NO_INCREMENT))
		{
			//Force user-mode APC callback
			KeTestAlertThread(UserMode);

			// The KAPC is now owned by the queued user-mode APC: do not free it here
			return;
		}

		// Queuing failed: undo the reference and the mapping made above
		ObfDereferenceObject(g_DriverObject);

		MmUnmapViewOfSection(IoGetCurrentProcess(), BaseAddress);
	}

	_RundownRoutine(Apc);
}

// C++ part of the APC kernel routine (called from the KernelRoutine assembly thunk,
// which dereferences g_DriverObject afterwards).
//  - User-mode APC: the KAPC is no longer needed, so it is freed here.
//  - Kernel-mode APC: the normal routine will still be called, so an extra reference
//    on the driver object is taken.
VOID CALLBACK _KernelRoutine(
	PKAPC Apc,
	PKNORMAL_ROUTINE * /*NormalRoutine*/,
	PVOID * /*NormalContext*/,
	PVOID * /*SystemArgument1*/,
	PVOID * /*SystemArgument2*/
	)
{
	if (Apc->ApcMode != KernelMode)
	{
		//User-mode APC -> free Apc object
		_RundownRoutine(Apc);
		return;
	}

	//Kernel-mode APC: NormalRoutine will be called
	ObfReferenceObject(g_DriverObject);
}

// C++ part of the APC rundown routine: releases the KAPC that BeginInject allocated
// with ExAllocatePool. Also used by the other routines to free the KAPC.
VOID CALLBACK _RundownRoutine(PKAPC Apc)
{
	ExFreePool(Apc);
}

With special supplementary assembly language implementation:

Note that I'm writing these functions in assembly to be able to use the JMP instruction to safely dereference KAPC objects. Read more details here.
x86-64
extern g_DriverObject:QWORD
extern __imp_ObfDereferenceObject:QWORD

extern ?_RundownRoutine@NT@@YAXPEAU_KAPC@1@@Z : PROC
extern ?_NormalRoutine@NT@@YAXPEAU_KAPC@1@PEAXPEAUDLL_INFORMATION@1@@Z : PROC
extern ?_KernelRoutine@NT@@YAXPEAU_KAPC@1@PEAP6AXPEAX11@ZPEAPEAX33@Z : PROC

_TEXT segment

; VOID CALLBACK RundownRoutine(PKAPC );
; Thunk: calls the C++ _RundownRoutine (rcx is passed through untouched), then
; tail-jumps to ObfDereferenceObject(g_DriverObject). Because of the JMP, the
; dereference returns straight to this thunk's caller and no driver code runs after it.
?RundownRoutine@NT@@YAXPEAU_KAPC@1@@Z proc
	sub    rsp,40				; 32 bytes of home space for the callee + 8 to keep the stack 16-byte aligned at the call
	;      void __cdecl NT::_RundownRoutine(struct NT::_KAPC *)
	call   ?_RundownRoutine@NT@@YAXPEAU_KAPC@1@@Z
	add    rsp,40				; restore the stack: [rsp] is our return address again
	mov    rcx,g_DriverObject	; argument for ObfDereferenceObject
	jmp    __imp_ObfDereferenceObject
?RundownRoutine@NT@@YAXPEAU_KAPC@1@@Z endp

; VOID CALLBACK KernelRoutine(PKAPC , PKNORMAL_ROUTINE *, PVOID * , PVOID * ,PVOID * );
; Thunk: forwards all five arguments to the C++ _KernelRoutine, then tail-jumps to
; ObfDereferenceObject(g_DriverObject).
; The first four arguments stay in rcx/rdx/r8/r9 (only rax is used as scratch).
; The fifth is a stack argument at [rsp+40] on entry. Instead of building a new frame,
; the code shifts things inside this function's own home space so that, after
; 'push' + 'call' (16 bytes), the callee finds the fifth argument at its [rsp+40].
?KernelRoutine@NT@@YAXPEAU_KAPC@1@PEAP6AXPEAX11@ZPEAPEAX33@Z proc
	mov    rax,[rsp + 40]		; fifth argument ...
	mov    [rsp + 24],rax		; ... copied to where the callee will see it at [rsp+40]
	mov    rax,[rsp]			; our return address ...
	mov    [rsp + 32],rax		; ... saved higher up: the slot at [rsp] falls inside the callee's home space
	push   rax					; 8 bytes so the stack is 16-byte aligned at the call
	;      void __cdecl NT::_KernelRoutine(struct NT::_KAPC *,void (__cdecl **)(void *,void *,void *),void **,void **,void **)
	call   ?_KernelRoutine@NT@@YAXPEAU_KAPC@1@PEAP6AXPEAX11@ZPEAPEAX33@Z
	pop    rax					; undo the push (the value itself is not used)
	mov    rax,[rsp + 32]		; saved return address ...
	mov    [rsp],rax			; ... put back in place
	mov    rcx,g_DriverObject	; argument for ObfDereferenceObject
	jmp    __imp_ObfDereferenceObject
?KernelRoutine@NT@@YAXPEAU_KAPC@1@PEAP6AXPEAX11@ZPEAPEAX33@Z endp

; VOID CALLBACK NormalRoutine(PVOID , PVOID ,PVOID );
; Thunk: calls the C++ _NormalRoutine with the three register arguments passed through
; untouched, then tail-jumps to ObfDereferenceObject(g_DriverObject) so that no driver
; code runs after the dereference.
?NormalRoutine@NT@@YAXPEAX00@Z proc
	sub    rsp,40				; 32 bytes of home space for the callee + 8 to keep the stack 16-byte aligned at the call
	;      void __cdecl NT::_NormalRoutine(struct NT::_KAPC *,void *,struct NT::DLL_INFORMATION *)
	call   ?_NormalRoutine@NT@@YAXPEAU_KAPC@1@PEAXPEAUDLL_INFORMATION@1@@Z
	add    rsp,40				; restore the stack: [rsp] is our return address again
	mov    rcx,g_DriverObject	; argument for ObfDereferenceObject
	jmp    __imp_ObfDereferenceObject
?NormalRoutine@NT@@YAXPEAX00@Z endp

_TEXT ends
end
The "crazy" externs that you see above are mangled (decorated) C++ function names. You can obtain them at compile time using the __FUNCDNAME__ predefined macro, by placing it inside the function in question as such:
C++
// Example function used only to show how to print a decorated name during compilation.
// The parameters are placeholders and are not used.
int SomeFunction(WCHAR* pstr, int value)
{
	// Compile-time only: prints a ready-to-paste MASM 'extern' line with the decorated
	// name of this function, followed by its readable signature as a comment
	__pragma(message("extern " __FUNCDNAME__ " : PROC ; "  __FUNCSIG__))

	// A non-void function must return a value, otherwise the sample does not build
	return 0;
}

When that code compiles, the Output window in Visual Studio will contain the required C++ mangled function name:

extern ?SomeFunction@@YAHPEA_WH@Z : PROC ; int __cdecl SomeFunction(wchar_t *,int)

It is important to understand that the PsSetLoadImageNotifyRoutine callback is executed inside a call to the ZwMapViewOfSection function that maps the DLL into memory. This callback happens before the loader finishes setting up the DLL, which means that the DLL is mapped but not yet initialized. For instance, its imported functions are not yet processed. So in other words, that DLL cannot be used yet!

As a consequence of the statement above, there is one rule of thumb that must be followed if you decide to load your own module into other processes using the PsSetLoadImageNotifyRoutine function: your module cannot import from any DLL except ntdll.dll. That DLL, and no other, is guaranteed to be mapped into any user-mode process.

ZwQueueApcThread vs QueueUserAPC

Let me ask, which function would you use?

QueueUserAPC is obviously documented (more or less), and thus should be safer to use, and ZwQueueApcThread or NtQueueApcThread are not.

For the user-mode code there's no difference between ZwQueueApcThread and NtQueueApcThread functions. It's just the matter of what prefix you like.

Before continuing, let's check how native ZwQueueApcThread function is declared:

C++
// Native function that queues a user-mode APC to a thread.
// Unlike QueueUserAPC (one dwData parameter), it carries three caller-defined values.
NTSTATUS ZwQueueApcThread(
	HANDLE hThread,					// thread to queue the APC to
	PKNORMAL_ROUTINE ApcRoutine,	// APC callback
	PVOID ApcContext,				// 1st custom parameter
	PVOID Argument1,				// 2nd custom parameter
	PVOID Argument2					// 3rd custom parameter
);

As you can see, instead of a single custom parameter, or dwData in QueueUserAPC, we have a chance to pass 3 custom parameters with a native function. OK. That simplifies things a little bit for a native function, but still as long as we can pass a pointer we can pass as many parameters as we want. So no big deal for QueueUserAPC, right?

Well, as we shall see below, the difference actually lies with an activation context used by QueueUserAPC. And not only the difference, but actually a bug.

Activation Context Handle Bug

The way user-mode APCs deal with the activation context is not mentioned in the documentation for the QueueUserAPC function at all. Instead it is only briefly touched here:

Asynchronous procedure calls, completion port callbacks, and any other callbacks on other threads automatically get the activation context of the source.

You can see what this means from the implementation of QueueUserAPC. It roughly goes as such on my Windows 10:

C++ pseudo-code
// Output of RtlQueryInformationActivationContext for the "basic information" class
// (class value 1 in the pseudo-code below).
typedef struct _ACTIVATION_CONTEXT_BASIC_INFORMATION {
	HANDLE  hActCtx;	// activation context "handle" (really a pointer, as the article explains)
	DWORD   dwFlags;	// QueueUserAPC tests bit 0: when set, the handle is not forwarded to the APC
} ACTIVATION_CONTEXT_BASIC_INFORMATION, *PACTIVATION_CONTEXT_BASIC_INFORMATION;

// Pseudo-code of the documented QueueUserAPC.
// Queries the caller's active activation context and queues RtlDispatchAPC as the
// real APC routine; all three native APC parameters are used up for the user callback,
// the user parameter and the activation context "handle".
// Returns TRUE on success; on failure sets the last error from the NTSTATUS and returns FALSE.
DWORD QueueUserAPC(PAPCFUNC pfnAPC, HANDLE hThread, ULONG_PTR dwData)
{
	ACTIVATION_CONTEXT_BASIC_INFORMATION acbi = {};

	NTSTATUS nt = RtlQueryInformationActivationContext(
						1,		//RTL_QUERY_ACTIVATION_CONTEXT_FLAG_USE_ACTIVE_ACTIVATION_CONTEXT,
						NULL,
						NULL,
						1,		//ActivationContextBasicInformation,
						&acbi,
						sizeof(acbi),
						NULL);

	if(!FAILED(nt))
	{
		// Bit 0 of the flags set -> do not hand the context over to the APC
		HANDLE hCtxForApc = (acbi.dwFlags & 1) ? INVALID_HANDLE_VALUE : acbi.hActCtx;

		nt = ZwQueueApcThread(hThread, RtlDispatchAPC, pfnAPC, dwData, hCtxForApc);

		if(!FAILED(nt))
			return TRUE;
	}

	// Either the query or the queuing failed
	BaseSetLastNTError(nt);
	return FALSE;
}

// One frame of the activation context stack.
typedef struct _RTL_ACTIVATION_CONTEXT_STACK_FRAME
{
	// Link to the previous frame. Spelled with the struct tag because the
	// PRTL_ACTIVATION_CONTEXT_STACK_FRAME typedef is only declared at the end of this definition.
	struct _RTL_ACTIVATION_CONTEXT_STACK_FRAME * Previous;
	_ACTIVATION_CONTEXT * ActivationContext;
	ULONG Flags;
} RTL_ACTIVATION_CONTEXT_STACK_FRAME, *PRTL_ACTIVATION_CONTEXT_STACK_FRAME;

// Caller-allocated frame passed to RtlActivateActivationContextUnsafeFast /
// RtlDeactivateActivationContextUnsafeFast (see RtlDispatchAPC below).
typedef struct _RTL_CALLER_ALLOCATED_ACTIVATION_CONTEXT_STACK_FRAME_EXTENDED
{
	SIZE_T Size;		// RtlDispatchAPC sets it to sizeof of this struct
	ULONG Format;		// RtlDispatchAPC sets it to 1
	RTL_ACTIVATION_CONTEXT_STACK_FRAME Frame;
	PVOID Extra1;
	PVOID Extra2;
	PVOID Extra3;
	PVOID Extra4;
} RTL_CALLER_ALLOCATED_ACTIVATION_CONTEXT_STACK_FRAME_EXTENDED,
 *PRTL_CALLER_ALLOCATED_ACTIVATION_CONTEXT_STACK_FRAME_EXTENDED;

// Pseudo-code of ntdll!RtlDispatchAPC - the actual APC routine that QueueUserAPC queues.
// pfnAPC  - user callback given to QueueUserAPC
// dwData  - user parameter given to QueueUserAPC
// hActCtx - activation context "handle" captured by QueueUserAPC, or INVALID_HANDLE_VALUE
void RtlDispatchAPC(PAPCFUNC pfnAPC, ULONG_PTR dwData, HANDLE hActCtx)
{
	RTL_CALLER_ALLOCATED_ACTIVATION_CONTEXT_STACK_FRAME_EXTENDED Frame = {};
	Frame.Size = sizeof(Frame);
	Frame.Format = 1;

	if(hActCtx == INVALID_HANDLE_VALUE)
	{
		// No activation context to apply: just run the user callback
		pfnAPC(dwData);
		return;
	}

	// Run the user callback under the captured activation context ...
	RtlActivateActivationContextUnsafeFast(&Frame, hActCtx);
	pfnAPC(dwData);
	RtlDeactivateActivationContextUnsafeFast(&Frame);

	// ... and release it afterwards
	RtlReleaseActivationContext(hActCtx);
}

As you can see, they take the current activation context (with added reference to it) and then call ZwQueueApcThread to queue the APC with a callback function pointing to ntdll!RtlDispatchAPC. In it they pass the original callback function, specified by the user, and also user-provided parameter for the call to QueueUserAPC, and finally the handle to the activation context.

This is, by the way, where all 3 parameters are used up in QueueUserAPC. So the user has only 1 parameter left out of available 3.

Inside the APC callback, the ntdll!RtlDispatchAPC implementation activates the context, invokes user-provided callback with a parameter, and then deactivates and releases it.

What is important to note, and where the bug lies, is that activation context "handle" is not really a handle. It is just a pointer to some internal data structure. It is easier to understand it if we reverse engineer the code in the RtlReleaseActivationContext function:

x86-64
	; RtlReleaseActivationContext function
	; rcx = activation context handle

	test    rcx, rcx
	jnz     @@1
	retn						; NULL "handle": nothing to do
@@1:
	mov     [rsp+0x8], rbx
	push    rdi
	sub     rsp, 20h
	lea     rax, [rcx-1]
	mov     rbx, rcx			; rbx = the "handle"
	or      rax, 7
	cmp     rax, 0FFFFFFFFFFFFFFFFh
	jz      @@exit				; (handle - 1) | 7 is all ones, i.e. handle is one of -7..-1: skip
	mov     eax, [rcx]			; potential crash: the "handle" is dereferenced as a pointer
	mov     ecx, 1
	sub     eax, ecx
	cmp     eax, 7FFFFFFDh
	ja      @@exit				; continue only if the 32-bit value read is in 1..7FFFFFFEh
	mov     eax, [rbx]
	lea     edi, [rax-1]		; value read minus one
	lock cmpxchg [rbx], edi		; potential overwrite of memory
	; ....

As you can see RtlReleaseActivationContext expects only one input parameter, that is the activation context handle, which is passed in the rcx register. But follow it later in the assembly code. As you can see, this function does a quick check if it is 0 and if so exits. It then does another rudimentary check for the handle bits not to be all 1's, except for lower 3 bits, and if so it also exits.

But this leaves the vast majority of non-zero activation context "handle" values free to pass through to the mov eax, [rcx] instruction, which merely treats the value as an address in memory. Furthermore, the lock cmpxchg [rbx], edi instruction may then write to that address.

A true handle is an index into a dictionary or a map of objects in a handle table in kernel memory. It should not be used as a mere pointer, especially if such handle can be passed between processes!

Such handling of the activation context "handle" does not pose a problem when used in the same process. But what if we use QueueUserAPC to queue an APC in another process? Then their use of the "handle"/pointer will only mean:

Application crash

But such a crash will not be the worst thing. Consider what happens if the activation context "handle" points to valid memory in the target process. The RtlReleaseActivationContext function, for example, will overwrite some writable memory in that process, which would not only lead to undefined behavior (UB) but would also be very difficult to diagnose and debug afterwards.

So why didn't this bug cause a lot of ruckus? Activation context is not a new concept after all.

The reason is that usually an activation context for a process is not present. So a call to RtlQueryInformationActivationContext with ActivationContextBasicInformation, or to its documented equivalent GetCurrentActCtx, will return NULL as the activation context "handle". And NULLs are handled gracefully by Microsoft's callback function.

The issue does happen, though, when a module has an activation context. For instance, in DllMain, if the module itself has a manifest with the ISOLATIONAWARE_MANIFEST_RESOURCE_ID identifier. But this is quite rare and thus, my guess is, this issue went unnoticed.

Cagey APC Documentation

Let's check MSDN documentation concerning the activation context "handle" bug that I explained here:

Note Queuing APCs to threads outside the caller's process is not recommended for a number of reasons. ...

😊 Really? That is because you have an implementation bug in it. So why not just write, that the activation context "handle" cannot be used in another process? Or better still, that it may lead to crashes, undefined behavior and corrupted memory.

But ideally, there should be a separate parameter for the QueueUserAPC function, or maybe a new function QueueUserAPCEx, that should tell it whether or not to use the activation context at all. And, they should technically also modify the current implementation of QueueUserAPC, and internally pass NULL for the activation context into the APC callback function if the hThread input handle points to a thread in a different process.

Then this:

... Similarly, if a 64-bit process queues an APC to a 32-bit process or vice versa, addresses will be incorrect and the application will crash.

Again, they are not telling the whole truth.

You cannot queue a 32-bit APC callback into a 64-bit process. But from a 64-bit process you can queue an APC into a 32-bit (WOW64) process: ZwQueueApcThread will queue a 64-bit callback there, while for a 32-bit callback one needs to use another lesser known and undocumented native function, RtlQueueApcWow64Thread, that queues a 32-bit APC callback in a 32-bit WOW64 process:

C++
// Undocumented native function; it has the same parameter list as ZwQueueApcThread.
NTSTATUS RtlQueueApcWow64Thread (
	HANDLE hThread,					// thread to queue the APC to
	PKNORMAL_ROUTINE ApcRoutine,	// APC callback
	PVOID ApcContext,				// 1st custom parameter
	PVOID Argument1,				// 2nd custom parameter
	PVOID Argument2					// 3rd custom parameter
);

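Alternatively, from kernel mode one needs to call PsWrapApcWow64Thread on the APC context and routine before passing them to KeInitializeApc and queuing the APC with KeInsertQueueApc (as shown in the _NormalRoutine driver code above):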

C++
// Kernel function; both parameters are in/out. The driver sample above calls it on its
// NormalContext and routine pointer before handing them to KeInitializeApc.
NTSTATUS PsWrapApcWow64Thread (
    _Inout_ PVOID *ApcContext,
	_Inout_ PVOID *ApcRoutine
);

But why would someone need to queue a 64-bit APC into a 32-bit process? We'll review it later.

User-Mode APC Demo Code

To illustrate the concepts and pitfalls of the user-mode APCs that I explained above, we wrote a small sample code:

Make sure to check comments in the code below for more details.
C++
// Demo entry point: creates an activation context from the manifest of IMAGEHLP,
// activates it, and then queues APCs locally and into a remote process to show how
// the activation context "handle" travels with QueueUserAPC.
int main()
{
	//Create activation context
	ACTCTX ActCtx = { sizeof(ActCtx), ACTCTX_FLAG_HMODULE_VALID | ACTCTX_FLAG_RESOURCE_NAME_VALID };
	HANDLE hActCtx = INVALID_HANDLE_VALUE;

	ActCtx.hModule = LoadLibraryW(L"IMAGEHLP");
	if (ActCtx.hModule)
	{
		ActCtx.lpResourceName = CREATEPROCESS_MANIFEST_RESOURCE_ID;
		hActCtx = CreateActCtxW(&ActCtx);

		//The module was only needed as the source of the manifest
		FreeLibrary(ActCtx.hModule);
	}

	//Nothing to demonstrate without an activation context
	if (hActCtx == INVALID_HANDLE_VALUE)
		return 0;

	//Check that we don't have an activation context yet
	QueryCtx();

	//Set our activation context for this process
	ULONG_PTR dwCookie;
	if (ActivateActCtx(hActCtx, &dwCookie))
	{
		//Check that we have an activation context now
		QueryCtx();

		//Queue APC in this process on this thread
		QueueUserAPC(OnApc, GetCurrentThread(), 0);

		//Make APC callback execute now
		ZwTestAlert();			//same as: SleepEx(0, TRUE);

		//Queue APC in a remote process (using native API)
		//It will succeed
		TestAPC_InRemoteProcess(true);

		//Queue APC in a remote process (using Win32 API)
		//It will crash the remote process!
		TestAPC_InRemoteProcess(false);

		DeactivateActCtx(0, dwCookie);
	}

	ReleaseActCtx(hActCtx);

	return 0;
}

void TestAPC_InRemoteProcess(bool bUseNativeApi)
{
	//Invoke a user-mode APC callback in a remote process

	//Get path to cmd.exe
	WCHAR appname[MAX_PATH];
	if (GetEnvironmentVariableW(L"comspec", appname, _countof(appname)))
	{
		PROCESS_INFORMATION pi;
		STARTUPINFO si = { sizeof(si) };

		//Run cmd.exe suspended
		if (CreateProcessW(appname, 0, 0, 0, 0, CREATE_SUSPENDED, 0, 0, &si, &pi))
		{
			//Invoke APC in cmd.exe, using either a native or documented Win32 function
			//We don't care about the callback function itself, for as long as it can
			//handle our input parameters. Thus I will use LPVOID TlsGetValue(DWORD)
			bUseNativeApi
				? ZwQueueApcThread(pi.hThread, (PKNORMAL_ROUTINE)TlsGetValue, 0, 0, 0)
				: QueueUserAPC((PAPCFUNC)TlsGetValue, pi.hThread, 0);

			//Resume thread to let APC execute
			ResumeThread(pi.hThread);

			CloseHandle(pi.hThread);
			CloseHandle(pi.hProcess);
		}
	}
}

// Prints information about the activation context that is active on the calling thread:
// the manifest path and encoded identity of one assembly, and the run level.
// Output goes through DbgPrint. Prints nothing for a query that fails.
void QueryCtx()
{
	//Query activation context in this process and output it into (debugger) console
	SIZE_T cb = 0;
	ACTIVATION_CONTEXT_RUN_LEVEL_INFORMATION acrli;
	// The same buffer viewed either as raw memory or as the detailed-information struct
	union {
		PVOID buf;
		PACTIVATION_CONTEXT_ASSEMBLY_DETAILED_INFORMATION pacadi;
	};
	buf = 0;
	// presumably { assembly index, file index } - confirm against the ACTIVATION_CONTEXT_QUERY_INDEX declaration
	ACTIVATION_CONTEXT_QUERY_INDEX QueryIndex = { 1, 0 };

	// Two-pass query: the first pass runs with no buffer (buf = 0, cb = 0); on
	// ERROR_INSUFFICIENT_BUFFER 'cb' holds the size written back by QueryActCtxW,
	// a buffer of that size is taken from the stack and the query is repeated.
__again:
	switch (QueryActCtxW(QUERY_ACTCTX_FLAG_USE_ACTIVE_ACTCTX, 0, &QueryIndex,
		AssemblyDetailedInformationInActivationContext, buf, cb, &cb) ? NOERROR : GetLastError())
	{
	case ERROR_INSUFFICIENT_BUFFER:
		// Stack allocation: released automatically when this function returns
		buf = alloca(cb);
		goto __again;
		break;	// not reached (after goto)
	case NOERROR:
		// Success without a buffer carries no data to print
		if (buf)
		{
			DbgPrint("==========\nPID=%u: %S\n%S\n", 
				GetCurrentProcessId(), 
				pacadi->lpAssemblyManifestPath, 
				pacadi->lpAssemblyEncodedAssemblyIdentity);
		}
		break;
	}

	// Run level: fixed-size output, so a single call is enough
	if (QueryActCtxW(QUERY_ACTCTX_FLAG_USE_ACTIVE_ACTCTX, 0, 0, 
		RunlevelInformationInActivationContext, &acrli, sizeof(acrli), &cb))
	{
		DbgPrint("PID=%u: RunLevel = %x\n", GetCurrentProcessId(), acrli.RunLevel);
	}
}

// APC callback queued by main() with QueueUserAPC: prints the activation context that
// is active while the APC runs. The parameter is not used.
VOID NTAPI OnApc(
	_In_ ULONG_PTR /*Parameter*/
)
{
	//User-mode APC callback
	QueryCtx();
}

To compile this code sample in Visual Studio without WDK, you will need the following declarations:

C++
#pragma comment(lib, "ntdll.lib")		//For native function calls

// Signature of an APC routine as taken by ZwQueueApcThread: three pointer-sized arguments.
typedef
VOID
KNORMAL_ROUTINE(
	__in_opt PVOID NormalContext,
	__in_opt PVOID SystemArgument1,
	__in_opt PVOID SystemArgument2
);
// Pointer to an APC routine
typedef KNORMAL_ROUTINE* PKNORMAL_ROUTINE;

// Native functions used by the demo; they are resolved through ntdll.lib (see the #pragma above).
extern "C" {
	// Queues a user-mode APC with three custom parameters
	__declspec(dllimport) NTSTATUS CALLBACK ZwQueueApcThread(HANDLE hThread,
		PKNORMAL_ROUTINE ApcRoutine, 
		PVOID ApcContext, 
		PVOID Argument1, 
		PVOID Argument2);

	// Lets pending user-mode APCs of the calling thread run (the demo notes: same as SleepEx(0, TRUE))
	__declspec(dllimport) NTSTATUS CALLBACK ZwTestAlert();

	// printf-style output to the debugger
	__declspec(dllimport) ULONG CALLBACK
		DbgPrint(
			_In_z_ _Printf_format_string_ PCSTR Format,
			...
		);
}

64-bit User-Mode APC In a 32-bit Process

One reason to queue a 64-bit user-mode APC into a 32-bit process would be to inject a DLL into it. But that is not the only use-case.

Say, what if you need to know a list of modules that were loaded into a process?

One way to do it for your own process is to call undocumented LdrQueryProcessModuleInformation function. It will write the full list in provided memory buffer:

C++
// Undocumented ntdll function: writes the list of modules loaded in the calling process
// into the provided buffer. Its three parameters line up with the three custom
// arguments of an APC routine, which is why it can be used directly as an APC callback.
NTSTATUS LdrQueryProcessModuleInformation 
(
	PRTL_PROCESS_MODULES psmi,	// output buffer
	ULONG BufferSize,			// size of the output buffer, in bytes
	PULONG RealSize				// receives the size needed for the full list
);

But how do you call it for modules in a remote process, that may also be of a different bitness?

Let me give you the steps:

  1. We need to create a section (NtCreateSection) that we will use to collect and pass the information about the modules in the target process (in the Win32 parlance, it is called a file mapping object.)
  2. Map that section into the target process (ZwMapViewOfSection) for writing.
  3. Create suspended thread in the target process with the address of its entry point set to RtlExitUserThread. We don't really need the thread function itself, and thus we will shunt it to exit as soon as possible.
    It is important in this case to use the native function RtlCreateUserThread to start the thread instead of the documented CreateRemoteThread. Such is needed to ensure that we can control the bitness of the entry point of the thread. CreateRemoteThread would not allow it since the actual entry point that it uses is kernel32!BaseThreadInitThunk and not the function that we provide into it in its lpStartAddress parameter.

    To define which context the thread will start in: 64-bit or 32-bit, the system will use the bitness of the module that the entry point of the thread is located in. (Or if there's no module, like in a plain shellcode, the thread will receive a 32-bit context by default.)

    Note that it is possible to run a 64-bit thread in a 32-bit (so called WOW64) process in a 64-bit OS. There is also a 64-bit version of the ntdll.dll module that is mapped into every 32-bit WOW64 process.
  4. Insert a user-mode APC into our suspended thread. The bitness of the callback will depend on the bitness of the target process:
    • 64-bit Process: We only need ZwQueueApcThread function to queue 64-bit APC callback natively. Quite straightforward here.
    • 32-bit Process: First use ZwQueueApcThread to queue a 64-bit callback to retrieve all mapped 64-bit modules. (As I said above, any 32-bit WOW64 process will have at least one 64-bit module loaded into it.) And then use RtlQueueApcWow64Thread to queue a 32-bit APC callback.

    We will use the LdrQueryProcessModuleInformation function as a callback for the APC of the appropriate bitness. Very conveniently for us it has 3 input parameters that match custom arguments for the ZwQueueApcThread and RtlQueueApcWow64Thread functions. This is also another reason why we chose those native functions versus the documented QueueUserAPC.

  5. Resume thread, that will run our queued APC in the target process. Since we set its callback to LdrQueryProcessModuleInformation, that function will fill in the memory in our mapped section with the needed information about the modules in the target process.
  6. The thread itself will run RtlExitUserThread function that will terminate it. (Unlike Create[Remote]Thread that will pass control to an internal wrapper function upon the thread return.)
  7. In our own process we simply wait for the remote thread to finish running.
  8. Then we can unmap the section from the target process, and map it into our own process and read the modules information that we collected.
  9. Destroy the section and do other cleanup.

Having run the algorithm above on an older (32-bit) Microsoft Word process, we can get its list of loaded modules:

Console App

Code Sample to Get Process Modules

To better illustrate the concepts outlined here let me give you the following code sample that will retrieve modules that are mapped into an arbitrary process:

Note: Below is an unoptimized code intended for better readability for the reader. We're formatting it with goto statements only to prevent the need for horizontal scrolling. Please refer to comments for additional details.
C++
// Prints the modules mapped into the process with ID 'dwPID': the 64-bit modules, and
// for a WOW64 (32-bit) process also the 32-bit ones.
//
// Returns a status that callers test with FAILED()/SUCCEEDED(). Win32 API failures
// (OpenProcess) are converted to a failing value, so they are never mistaken for success.
NTSTATUS ListModulesForProc(DWORD dwPID)
{
	//'dwPID' = process ID of the process to retrieve modules for
	NTSTATUS status = STATUS_UNSUCCESSFUL;

	HANDLE hProcess = NULL;
	NTDLL_FN_PTRS nfp = {};
	ULONG_PTR wow = 0;
	DWORD dwErr = 0;

#ifndef _WIN64
#error Must be compiled as x64 only!
#endif

	hProcess = OpenProcess(PROCESS_VM_OPERATION | PROCESS_CREATE_THREAD | PROCESS_QUERY_INFORMATION, FALSE, dwPID);
	if (!hProcess)
	{
		//INFO: GetLastError() returns a small positive Win32 code, which FAILED() would treat
		//      as success - convert it to a failing value first.
		dwErr = GetLastError();
		status = dwErr ? HRESULT_FROM_WIN32(dwErr) : STATUS_UNSUCCESSFUL;
		goto cleanup;
	}

	//Collect 64-bit modules
	//INFO: 64-bit ntdll.dll is mapped at the same address in our (64-bit) process,
	//      so our own function pointers can be used for the target process.
	nfp.pRtlExitUserThread.pstrName = "RtlExitUserThread";
	nfp.pRtlExitUserThread.pfn = (FARPROC)RtlExitUserThread;
	nfp.pLdrQueryProcessModuleInformation.pstrName = "LdrQueryProcessModuleInformation";
	nfp.pLdrQueryProcessModuleInformation.pfn = (FARPROC)LdrQueryProcessModuleInformation;

	status = CollectModules(hProcess, TRUE, &nfp);
	if (FAILED(status))
		goto cleanup;

	//Get process bitness
	status = NtQueryInformationProcess(hProcess, ProcessWow64Information, &wow, sizeof(wow), NULL);
	if (FAILED(status))
		goto cleanup;

	if (wow)
	{
		//Collect 32-bit modules
		//(This replaces the function pointers in 'nfp' with the ones from the 32-bit ntdll.dll)
		status = ResolveNtDllFuncs32bit(&nfp);
		if (FAILED(status))
			goto cleanup;

		status = CollectModules(hProcess, FALSE, &nfp);
		if (FAILED(status))
			goto cleanup;
	}
	else
		status = STATUS_SUCCESS;

cleanup:
	//Clean-up process

	if(hProcess)
		CloseHandle(hProcess);

	assert(SUCCEEDED(status));
	return status;
}

The actual work of injecting an APC into a target process is done in the following function:

C++
// Retrieves and prints the list of modules loaded in the target process, by running
// LdrQueryProcessModuleInformation inside it as a user-mode APC callback.
//
// hProcess - handle to the target process
// b64bit   - TRUE:  collect 64-bit modules (APC queued with ZwQueueApcThread)
//            FALSE: collect 32-bit modules of a WOW64 process (RtlQueueApcWow64Thread)
// pfnPtrs  - addresses of RtlExitUserThread and LdrQueryProcessModuleInformation that are
//            valid in the target process for the requested bitness
//
// Each attempt: create a section, map it into the target, create a suspended thread whose
// entry point is RtlExitUserThread, queue the APC, resume the thread, wait for it, then map
// the section into this process and read the result. If the section was too small, the
// attempt is repeated with a larger one.
//
// Returns a status that callers test with FAILED(); Win32 API failures are converted to a
// failing value.
NTSTATUS CollectModules(HANDLE hProcess, BOOL b64bit, NTDLL_FN_PTRS* pfnPtrs)
{
	//INFO: It is not the most efficient way of calling this function twice with
	//      repeated creation of the section and then mapping it into a process.
	//      Ideally, you'd create it once and then close and re-create it ONLY if its
	//      original size is too small to fit all the modules.
	//
	//      But, I will leave this code as-is for brevity, as such optimization
	//      has nothing to do with the APC concepts that we discuss in this blog post.

	NTSTATUS status;

	HANDLE hThread = NULL;
	BYTE* pThisBaseAddr = NULL;
	SIZE_T ViewSize = 0;
	ULONG uiRealSize = 0;
	ULONG cbUsable = 0;			//Part of the section given to LdrQueryProcessModuleInformation as its buffer
	DWORD dwErr = 0;
	PRTL_PROCESS_MODULES pRPMs = NULL;
	PRTL_PROCESS_MODULES32 pRPMs32 = NULL;
	HANDLE hSection = NULL;
	LARGE_INTEGER liSectionSize = {};
	PVOID pBaseAddr = NULL;
	ULONG szBufferSz = 0;
	bool bExportSuppression = false;
	bool bDone = false;

	typedef NTSTATUS(CALLBACK PFN_PTR)(HANDLE hThread,
		PKNORMAL_ROUTINE ApcRoutine,
		PVOID ApcContext,
		PVOID Argument1,
		PVOID Argument2);
	PFN_PTR* pQueueAPC;

	assert(pfnPtrs);
	assert(pfnPtrs->pLdrQueryProcessModuleInformation.pfn);
	assert(pfnPtrs->pRtlExitUserThread.pfn);

	//Assume 8 memory pages as the original section size
	SYSTEM_INFO si = {};
	GetSystemInfo(&si);
	szBufferSz = si.dwPageSize * 8;
	assert(szBufferSz);

	//See if export suppression is enabled in Control Flow Guard (CFG) for the target process
	//INFO: If so, we need to enable our thread's EP function and APC callback for CFG, 
	//      since calling them otherwise will crash the target process as a security measure!
	//
	//      Nothing has been allocated yet, so failures here return directly. (Jumping to the
	//      clean-up label inside the loop below would re-enter the loop, as 'bDone' is still false.)
	status = IsExportSuppressionEnabled(hProcess, &bExportSuppression);
	if (FAILED(status))
		return status;

	if (bExportSuppression)
	{
		//Enable our function pointers for CFG in the process
		status = SetValidExport(hProcess, pfnPtrs->pRtlExitUserThread.pfn);
		if (FAILED(status))
			return status;

		status = SetValidExport(hProcess, pfnPtrs->pLdrQueryProcessModuleInformation.pfn);
		if (FAILED(status))
			return status;
	}

	while (!bDone)
	{
		//Any 'goto cleanup' below ends the loop, unless the retry branch resets this flag
		bDone = true;

		liSectionSize.QuadPart = szBufferSz;

		//We reserve the last ULONG in the section for LdrQueryProcessModuleInformation to return its RequiredSize
		cbUsable = szBufferSz - (ULONG)sizeof(ULONG);

		//Create section
		assert(!hSection);
		status = NtCreateSection(&hSection, SECTION_ALL_ACCESS, NULL, &liSectionSize, PAGE_READWRITE, SEC_COMMIT, 0);
		if (FAILED(status))
			goto cleanup;

		assert(!pBaseAddr);
		pBaseAddr = NULL;
		ViewSize = 0;

		//Map section into target process for writing
		status = ZwMapViewOfSection(hSection, hProcess, &pBaseAddr, 0, 0, NULL, &ViewSize, ViewShare, 0, PAGE_READWRITE);
		if (FAILED(status))
			goto cleanup;

		//Create remote thread in the target process (and shunt it to RtlExitUserThread)
		//Ensure that the thread is created suspended!
		assert(!hThread);
		status = RtlCreateUserThread(hProcess, NULL, TRUE, 0, 0, 0, pfnPtrs->pRtlExitUserThread.pfn, NULL, &hThread, NULL);
		if (FAILED(status))
			goto cleanup;

		//(Optional call)
		//INFO: Notifications about creation and termination of this thread will not be passed to an attached debugger.
		//      And, exceptions in such thread will not be passed to a debugger either.
		NtSetInformationThread(hThread, ThreadHideFromDebugger, 0, 0);

		//Pick which APC function to use (depending on the bitness)
		pQueueAPC = b64bit ? ZwQueueApcThread : RtlQueueApcWow64Thread;

		//APC callback = LdrQueryProcessModuleInformation(buffer, buffer size, where to store RequiredSize)
		status = pQueueAPC(hThread, 
			(PKNORMAL_ROUTINE)pfnPtrs->pLdrQueryProcessModuleInformation.pfn, 
			pBaseAddr, 
			(PVOID)(ULONG_PTR)cbUsable,
			(BYTE*)pBaseAddr + cbUsable);

		if (FAILED(status))
		{
			//Don't leave a suspended thread behind in the target process:
			//its entry point is RtlExitUserThread, so resuming it simply lets it exit.
			ResumeThread(hThread);
			goto cleanup;
		}

		//Let our APC callback and the thread itself run
		//(The thread was created suspended, so the previous suspend count must be 1)
		if (ResumeThread(hThread) != 1)
		{
			//INFO: Win32 error codes are positive and would pass FAILED() as success - convert
			dwErr = GetLastError();
			status = dwErr ? HRESULT_FROM_WIN32(dwErr) : STATUS_UNSUCCESSFUL;
			goto cleanup;
		}

		//Wait for the thread to finish
		if (WaitForSingleObject(hThread, INFINITE) != WAIT_OBJECT_0)
		{
			dwErr = GetLastError();
			status = dwErr ? HRESULT_FROM_WIN32(dwErr) : STATUS_UNSUCCESSFUL;
			goto cleanup;
		}

		//Unmap the section from the target process
		status = ZwUnmapViewOfSection(hProcess, pBaseAddr);
		if (FAILED(status))
			goto cleanup;

		pBaseAddr = NULL;

		assert(!pThisBaseAddr);
		pThisBaseAddr = NULL;
		ViewSize = 0;

		//Map the same section into our own process so that we can read it
		status = ZwMapViewOfSection(hSection, GetCurrentProcess(), 
					(PVOID*)&pThisBaseAddr, 0, 0, NULL, &ViewSize, ViewShare, 0, PAGE_READONLY);
		if (FAILED(status))
			goto cleanup;

		assert(ViewSize <= szBufferSz);

		//Check if the size of the section that we assumed earlier was enough to fill in all modules
		//INFO: The callback was given only 'cbUsable' bytes (the section minus the trailing ULONG),
		//      so that is what the required size must be compared against.
		uiRealSize = *(ULONG*)(pThisBaseAddr + cbUsable);
		if (uiRealSize <= cbUsable)
		{
			//Unfortunately we cannot check the return value from the LdrQueryProcessModuleInformation() call. Here's why:
			//The LdrQueryProcessModuleInformation() function is called from an APC callback, and by the time
			//our remote thread gets to calling RtlExitUserThread() its context will be restored by a call to ntdll!NtContinue()

			if (b64bit)
			{
				//64-bit modules
				pRPMs = (PRTL_PROCESS_MODULES)pThisBaseAddr;
				ULONG nNumberOfModules = pRPMs->NumberOfModules;

				//Check that we have at least one module loaded, otherwise it's an error
				//(This also catches the case when the callback did not write anything)
				if (!nNumberOfModules)
				{
					status = STATUS_PROCEDURE_NOT_FOUND;
					goto cleanup;
				}

				//Output results to the console
				wprintf(L"64-bit Modules (%u):\n", nNumberOfModules);

				RTL_PROCESS_MODULE_INFORMATION* pPMI = pRPMs->Modules;

				do
				{
					printf("%p sz=%08X flg=%08X Ord=%02X %s\n"
						,
						pPMI->ImageBase,
						pPMI->ImageSize,
						pPMI->Flags,
						pPMI->InitOrderIndex,
						pPMI->FullPathName
					);
				}
				while (pPMI++, --nNumberOfModules);
			}
			else
			{
				//32-bit modules
				pRPMs32 = (PRTL_PROCESS_MODULES32)pThisBaseAddr;
				ULONG nNumberOfModules = pRPMs32->NumberOfModules;

				//Check that we have at least one module loaded, otherwise it's an error
				if (!nNumberOfModules)
				{
					status = STATUS_PROCEDURE_NOT_FOUND;
					goto cleanup;
				}

				//Output results to the console
				wprintf(L"32-bit Modules (%u):\n", nNumberOfModules);

				//NOTE(review): the element type here should be whatever PRTL_PROCESS_MODULES32::Modules
				//              is declared as (a 32-bit layout of the entry) - confirm against its declaration.
				RTL_PROCESS_MODULE_INFORMATION* pPMI32 = pRPMs32->Modules;

				do
				{
					printf("%08X sz=%08X flg=%08X Ord=%02X %s\n"
						,
						pPMI32->ImageBase,
						pPMI32->ImageSize,
						pPMI32->Flags,
						pPMI32->InitOrderIndex,
						pPMI32->FullPathName
					);
				}
				while (pPMI32++, --nNumberOfModules);
			}

			status = STATUS_SUCCESS;
		}
		else
		{
			//Need more memory: the required size plus our trailing ULONG, rounded up to a page boundary
			ULONGLONG cbNeeded = (ULONGLONG)uiRealSize + sizeof(ULONG);
			cbNeeded = (cbNeeded + si.dwPageSize - 1) / si.dwPageSize * si.dwPageSize;

			if (cbNeeded > MAXDWORD)
			{
				//The size read from the target process does not fit our ULONG size variable
				status = STATUS_INTEGER_OVERFLOW;
				goto cleanup;
			}

			szBufferSz = (ULONG)cbNeeded;

			//Retry
			bDone = false;
		}

cleanup:
		//Clean-up

		if (pBaseAddr)
		{
			//This view lives in the TARGET process, so it must be unmapped from there
			ZwUnmapViewOfSection(hProcess, pBaseAddr);
			pBaseAddr = NULL;
		}

		if (pThisBaseAddr)
		{
			ZwUnmapViewOfSection(GetCurrentProcess(), pThisBaseAddr);
			pThisBaseAddr = NULL;
		}

		if (hSection)
		{
			ZwClose(hSection);
			hSection = NULL;
		}

		if (hThread)
		{
			ZwClose(hThread);
			hThread = NULL;
		}
	}

	return status;
}
You might have noticed that the function above calls NtSetInformationThread with the ThreadHideFromDebugger flag. This is an optional call that may be used by a debugger process to ensure that its own thread that was injected into the target process does not cause notifications, such as thread creation, termination, etc. Usually these notifications are passed to a debugger that is attached to a debuggee process. By using ThreadHideFromDebugger a debugger can prevent that.

Additionally, by specifying ThreadHideFromDebugger for the thread all exceptions in it will not be passed to an attached debugger either.

Other important functions resolve the 32-bit export pointers for the mapped ntdll!LdrQueryProcessModuleInformation and ntdll!RtlExitUserThread native functions that we will need to inject our APC callback into a 32-bit WOW64 process:

C++
NTSTATUS ResolveNtDllFuncs32bit(NTDLL_FN_PTRS* pfnPtrs)
{
	//Resolves the exports named in 'pfnPtrs' against the 32-bit ntdll.dll image section
	//'pfnPtrs' = [in/out] 'pstrName' of both entries must be set by the caller;
	//            both 'pfn' members are reset to NULL first and then receive the resolved pointers
	//RETURN:
	//		STATUS_SUCCESS if both exports were resolved, or
	//		Failure NTSTATUS otherwise
	NTSTATUS status;

	//Must be NULL-initialized: the clean-up code below tests this handle even when
	//ZwOpenSection fails and may not have stored anything in it
	HANDLE hSection = NULL;
	SECTION_IMAGE_INFORMATION sii;
	PVOID pBaseAddr = NULL;
	SIZE_T ViewSize = 0;

	//We'll need the special 32-bit image section for ntdll.dll
	static const WCHAR oa_ntdll_str[] = L"\\KnownDlls32\\ntdll.dll";
	static const UNICODE_STRING oa_ntdll_ustr = { sizeof(oa_ntdll_str) - sizeof((oa_ntdll_str)[0]), sizeof(oa_ntdll_str), const_cast<PWSTR>(oa_ntdll_str) };
	static OBJECT_ATTRIBUTES oa_ntdll = { sizeof(oa_ntdll), 0, const_cast<PUNICODE_STRING>(&oa_ntdll_ustr), OBJ_CASE_INSENSITIVE };

	pfnPtrs->pLdrQueryProcessModuleInformation.pfn = NULL;
	pfnPtrs->pRtlExitUserThread.pfn = NULL;

	status = ZwOpenSection(&hSection, SECTION_QUERY | SECTION_MAP_READ, &oa_ntdll);
	if (FAILED(status))
		goto cleanup;

	//Needed for 'TransferAddress' that is used below to compute the image base
	status = ZwQuerySection(hSection, SectionImageInformation, &sii, sizeof(sii), 0);
	if (FAILED(status))
		goto cleanup;

	//Map the whole section (ViewSize == 0) as read-only into our own process, for parsing only
	status = ZwMapViewOfSection(hSection, GetCurrentProcess(), &pBaseAddr, 0, 0, 0, &ViewSize, ViewUnmap, 0, PAGE_READONLY);
	if (FAILED(status))
		goto cleanup;

	__try
	{
		//We will have to parse PE structure manually
		//(Remember, the image section here is of a different bitness than our own process!)
		if (PIMAGE_NT_HEADERS32 pinth = (PIMAGE_NT_HEADERS32)RtlImageNtHeader(pBaseAddr))
		{
			//We'll do a really quick-and-dirty parsing here ...
			//The entry point address minus the entry point RVA gives the base that export RVAs are added to.
			//'pfnPtrs' is passed as an array of 2 EXPORT_ENTRY elements, which relies on the layout of NTDLL_FN_PTRS.
			status = ResolveModuleExports((PBYTE)sii.TransferAddress - pinth->OptionalHeader.AddressOfEntryPoint,
				pBaseAddr, (EXPORT_ENTRY *)pfnPtrs, 2);
		}
		else
			status = STATUS_BAD_FILE_TYPE;
	}
	__except (EXCEPTION_EXECUTE_HANDLER)
	{
		//Catch exceptions in case the section is not a valid PE file
		status = STATUS_BAD_DATA;
	}

cleanup:
	//Clean-up

	if (pBaseAddr)
		ZwUnmapViewOfSection(GetCurrentProcess(), pBaseAddr);

	if (hSection)
		ZwClose(hSection);

	return status;
}

NTSTATUS ResolveModuleExports(PVOID ImageBase, PVOID pBaseAddr, EXPORT_ENTRY* pfnExports, int nCntExports)
{
	//Resolve exported functions by their names provided in 'pfnExports', using the image section mapped in memory
	//'ImageBase' = base address that each export RVA is added to when producing the resulting pointer
	//'pBaseAddr' = address where the image is mapped in this process (used for parsing the export tables only)
	//'pfnExports' = [in/out] array of entries: 'pstrName' is the name to look up, 'pfn' receives the resulting pointer
	//'nCntExports' = number of entries in 'pfnExports'
	//RETURN:
	//		STATUS_SUCCESS if all entries were resolved, or
	//		Failure NTSTATUS for the first entry that could not be resolved (processing stops at that entry)
	ULONG exportSize = 0;

	PIMAGE_EXPORT_DIRECTORY pied = (PIMAGE_EXPORT_DIRECTORY)
		RtlImageDirectoryEntryToData(pBaseAddr, TRUE, IMAGE_DIRECTORY_ENTRY_EXPORT, &exportSize);
	if (!pied)
	{
		return STATUS_INVALID_IMAGE_FORMAT;
	}

	//RVA of the export directory itself - needed later to recognize forwarded exports
	ULONG exportRVA = RtlPointerToOffset(pBaseAddr, pied);

	ULONG NumberOfFunctions = pied->NumberOfFunctions;
	if (!NumberOfFunctions)
	{
		return STATUS_SOURCE_ELEMENT_EMPTY;
	}

	ULONG NumberOfNames = pied->NumberOfNames;

	//All three tables are stored as RVAs - convert them relative to where the image is mapped in this process
	PULONG AddressOfFunctions = (PULONG)RtlOffsetToPointer(pBaseAddr, pied->AddressOfFunctions);
	PULONG AddressOfNames = (PULONG)RtlOffsetToPointer(pBaseAddr, pied->AddressOfNames);
	PWORD AddressOfNameOrdinals = (PWORD)RtlOffsetToPointer(pBaseAddr, pied->AddressOfNameOrdinals);

	for (EXPORT_ENTRY* pEnd = pfnExports + nCntExports; pfnExports < pEnd; pfnExports++)
	{
		PCSTR Name = pfnExports->pstrName;

		assert(*Name != '#');	//Can't process ordinals

		//Match each export by name
		ULONG i = GetNameOrdinal(pBaseAddr, AddressOfNames, NumberOfNames, Name);
		if (i == UINT_MAX)
		{
			return STATUS_OBJECT_NAME_NOT_FOUND;
		}

		//Translate the index in the name table into an index in the function table
		if (i < NumberOfNames)
			i = AddressOfNameOrdinals[i];

		if (i >= NumberOfFunctions)
		{
			return STATUS_FOUND_OUT_OF_SCOPE;
		}

		DWORD Rva = AddressOfFunctions[i];

		//An RVA that points inside of the export directory is treated as a forwarded export
		//(the unsigned subtraction also covers RVAs that lie below the directory)
		if ((ULONG_PTR)Rva - (ULONG_PTR)exportRVA < exportSize)
		{
			//For brevity, we won't handle forwarded function exports ...
			//(This has nothing to do with the subject of this blog post.)
			return STATUS_ILLEGAL_FUNCTION;
		}

		(FARPROC&)pfnExports->pfn = (FARPROC)RtlOffsetToPointer(ImageBase, Rva);
	}

	return STATUS_SUCCESS;
}

ULONG GetNameOrdinal(PVOID pBaseAddr, PDWORD AddressOfNames, DWORD NumberOfNames, PCSTR Name)
{
	//Resolve the index of a function name in the table of export names, using a binary search
	//(this relies on the names in the table being sorted in the ascending strcmp() order)
	//'pBaseAddr' = address where the image is mapped - each name RVA is added to it
	//'AddressOfNames' = table of 'NumberOfNames' RVAs, each pointing to a null-terminated export name
	//'Name' = null-terminated name to look for (case-sensitive)
	//RETURN:
	//		Such index, or
	//		UINT_MAX if error
	DWORD lo = 0;
	DWORD hi = NumberOfNames;		//Search range is [lo, hi)

	while (lo < hi)
	{
		//Unsigned midpoint that cannot overflow or turn into a negative array index
		DWORD mid = lo + ((hi - lo) >> 1);

		PCSTR pNm = RtlOffsetToPointer(pBaseAddr, AddressOfNames[mid]);
		int cmp = strcmp(pNm, Name);

		if (!cmp)
		{
			return mid;
		}

		if (cmp < 0)
			lo = mid + 1;		//Table entry sorts before 'Name' - continue in the upper half
		else
			hi = mid;			//Table entry sorts after 'Name' - continue in the lower half
	}

	//Name was not found
	return UINT_MAX;
}

We also need to account for something else that may interfere with our method above. This has technically nothing to do with the subject of APCs, so I will touch on it very briefly.

I'm talking about Control Flow Guard, or CFG. If it is enabled for the target process with its Export Suppression feature turned on, this will prevent our APC code injection from going through. Namely, if our APC callback and the remote thread entry point are not in the CFG bitmap, the target process will be forced by CFG to crash. Which is a good security measure, but not very good for our purpose.

For our use case, though, we need to bypass CFG. Luckily for us, this is quite easy to do. All we need is to call the SetProcessValidCallTargets function on the needed export functions to disable export suppression for them. This is what the following code accomplishes for us.

The first function below (IsExportSuppressionEnabled) determines if CFG with the Export Suppression is enabled. And the second function (SetValidExport) disables Export Suppression for our exports in the target process:

For completeness it would be also prudent to enable those exports back when our main function exits. It is trivial to do and thus we won't dwell on it here.
Note that the following function is subject to a race condition, in the sense that some other thread, or even another process, may re-enable CFG export suppression on our exports after we disable it.
C++
NTSTATUS IsExportSuppressionEnabled(HANDLE hProcess, bool* enabled)
{
	//Queries the Control Flow Guard policy of 'hProcess' and reports whether CFG is on together with its export suppression
	//The 'hProcess' handle must be opened with the PROCESS_QUERY_INFORMATION permission flag
	//'enabled' = if not NULL, receives true only when both CFG and export suppression are enabled (false if the query fails)
	//RETURN:
	//		Status returned by NtQueryInformationProcess

	//In/out buffer for the query: the policy type goes in, its flags come out
	struct CFG_POLICY_QUERY {
		PROCESS_MITIGATION_POLICY Policy;
		ULONG Flags;
	};

	CFG_POLICY_QUERY query = { ProcessControlFlowGuardPolicy };
	bool bSuppressionOn = false;

	NTSTATUS status = NtQueryInformationProcess(hProcess, ProcessMitigationPolicy, &query, sizeof(query), 0);
	if (SUCCEEDED(status))
	{
		//Interpret the returned flags as the CFG policy bit-fields
		PROCESS_MITIGATION_CONTROL_FLOW_GUARD_POLICY* pPolicy =
			(PROCESS_MITIGATION_CONTROL_FLOW_GUARD_POLICY*)&query.Flags;

		if (pPolicy->EnableControlFlowGuard && pPolicy->EnableExportSuppression)
		{
			bSuppressionOn = true;
		}
	}

	if (enabled)
	{
		*enabled = bSuppressionOn;
	}

	return status;
}

#pragma comment(lib, "mincore.lib") 
NTSTATUS SetValidExport(HANDLE hProcess, LPCVOID pv)
{
	//Marks the 'pv' function in 'hProcess' as a valid CFG call target, lifting export suppression from it
	//'pv' = address of the function in the target process; it must lie in committed memory of a mapped image
	//RETURN:
	//		STATUS_SUCCESS if the call target was processed, or
	//		STATUS_INVALID_ADDRESS if 'pv' is not in committed image memory, or
	//		STATUS_STRICT_CFG_VIOLATION if the call target could not be set, or
	//		Failure status of NtQueryVirtualMemory
	MEMORY_BASIC_INFORMATION mbi;
	NTSTATUS status = NtQueryVirtualMemory(hProcess, (void*)pv, MemoryBasicInformation, &mbi, sizeof(mbi), 0);
	if (!SUCCEEDED(status))
	{
		return status;
	}

	if (mbi.State != MEM_COMMIT || mbi.Type != MEM_IMAGE)
	{
		return STATUS_INVALID_ADDRESS;
	}

	//The call target is given as an offset from the beginning of the memory region that contains it
	CFG_CALL_TARGET_INFO target = {
		(ULONG_PTR)pv - (ULONG_PTR)mbi.BaseAddress,
		CFG_CALL_TARGET_CONVERT_EXPORT_SUPPRESSED_TO_VALID | CFG_CALL_TARGET_VALID
	};

	if (!SetProcessValidCallTargets(hProcess, mbi.BaseAddress, mbi.RegionSize, 1, &target))
	{
		return STATUS_STRICT_CFG_VIOLATION;
	}

	//The API call alone is not enough - the entry itself must also be flagged as processed
	if (target.Flags & CFG_CALL_TARGET_PROCESSED)
	{
		return STATUS_SUCCESS;
	}

	return STATUS_STRICT_CFG_VIOLATION;
}

And finally, to compile the code above in Visual Studio you would ideally need the WDK installed. Optionally, you can use the following declarations to compile it without the WDK:

C++ - Native API/Struct Declarations
#include <iostream>
#include <Windows.h>
#include <assert.h>

#pragma comment(lib, "ntdll.lib")		//For native API calls

//One export to resolve by name: the caller provides 'pstrName' and ResolveModuleExports() fills in 'pfn'
struct EXPORT_ENTRY {
	FARPROC pfn;			//[out] Resolved pointer for the export
	PCSTR pstrName;			//[in] Null-terminated name of the export to look up
};
//The ntdll exports resolved by ResolveNtDllFuncs32bit()
//IMPORTANT: ResolveNtDllFuncs32bit() passes this struct to ResolveModuleExports() as an array of 2 EXPORT_ENTRY
//           elements, so it must consist of EXPORT_ENTRY members only, and their count must match that call
struct NTDLL_FN_PTRS {
	EXPORT_ENTRY pLdrQueryProcessModuleInformation;
	EXPORT_ENTRY pRtlExitUserThread;
};

//Signature of an APC callback routine with its three pointer-sized arguments
//(This is the type of the 'ApcRoutine' parameter of ZwQueueApcThread and RtlQueueApcWow64Thread declared below)
typedef
VOID
KNORMAL_ROUTINE(
	__in_opt PVOID NormalContext,
	__in_opt PVOID SystemArgument1,
	__in_opt PVOID SystemArgument2
);
typedef KNORMAL_ROUTINE* PKNORMAL_ROUTINE;

//Counted wide-character string used by the native API
//(See ResolveNtDllFuncs32bit() for an example of initializing it from a string literal)
typedef struct _UNICODE_STRING {
	USHORT Length;			//Size of the string in 'Buffer' in bytes (the sample code does not count the terminating null here)
	USHORT MaximumLength;	//Size of 'Buffer' in bytes (the sample code includes the terminating null here)
	_Field_size_bytes_part_opt_(MaximumLength, Length) PWCH   Buffer;
} UNICODE_STRING;
typedef UNICODE_STRING* PUNICODE_STRING;
typedef const UNICODE_STRING* PCUNICODE_STRING;

//Describes the named object to open - passed to ZwOpenSection and NtCreateSection declared below
typedef struct _OBJECT_ATTRIBUTES {
	ULONG Length;					// Set to sizeof(OBJECT_ATTRIBUTES) by the sample code
	HANDLE RootDirectory;			// Set to 0 by the sample code, which uses a full object path in ObjectName
	PUNICODE_STRING ObjectName;		// Name of the object
	ULONG Attributes;				// Flags, such as OBJ_CASE_INSENSITIVE
	PVOID SecurityDescriptor;        // Points to type SECURITY_DESCRIPTOR
	PVOID SecurityQualityOfService;  // Points to type SECURITY_QUALITY_OF_SERVICE
} OBJECT_ATTRIBUTES;
typedef OBJECT_ATTRIBUTES* POBJECT_ATTRIBUTES;
typedef CONST OBJECT_ATTRIBUTES* PCOBJECT_ATTRIBUTES;

//Values for the 'InheritDisposition' parameter of ZwMapViewOfSection
//(The sample code in this post passes ViewUnmap)
typedef enum _SECTION_INHERIT {
	ViewShare = 1,
	ViewUnmap = 2
} SECTION_INHERIT;

//Pair of process and thread identifiers - type of the 'pCid' parameter of RtlCreateUserThread declared below
typedef struct _CLIENT_ID {
	HANDLE UniqueProcess;
	HANDLE UniqueThread;
} CLIENT_ID;
typedef CLIENT_ID* PCLIENT_ID;

//Information about one module, in the layout of the process that this code is compiled for
//(RTL_PROCESS_MODULE_INFORMATION32 below is the fixed 32-bit counterpart with the same field names)
typedef struct RTL_PROCESS_MODULE_INFORMATION {
	HANDLE Section;                 // Not filled in
	PVOID MappedBase;
	PVOID ImageBase;
	ULONG ImageSize;
	ULONG Flags;
	USHORT LoadOrderIndex;
	USHORT InitOrderIndex;
	USHORT LoadCount;
	USHORT OffsetToFileName;		// NOTE(review): presumably the offset of the file name within FullPathName - confirm
	CHAR  FullPathName[256];		// 8-bit character string
} *PRTL_PROCESS_MODULE_INFORMATION;

//Buffer type of the 'psmi' parameter of LdrQueryProcessModuleInformation declared below
typedef struct RTL_PROCESS_MODULES {
	ULONG NumberOfModules;
	RTL_PROCESS_MODULE_INFORMATION Modules[1];	// NOTE(review): presumably holds NumberOfModules entries, like the 32-bit variant - confirm
} *PRTL_PROCESS_MODULES;

//Stand-ins for a handle and for a pointer in the 32-bit variants of the structs below
typedef int HANDLE32;
typedef int PVOID32;

//32-bit variants of RTL_PROCESS_MODULE_INFORMATION and RTL_PROCESS_MODULES
//(Declared with the 4-byte packing, so that their layout does not depend on the default packing of this build)
#pragma pack(push)
#pragma pack(4)
typedef struct RTL_PROCESS_MODULE_INFORMATION32 {
	HANDLE32 Section;                 // Not filled in
	PVOID32 MappedBase;
	PVOID32 ImageBase;
	ULONG ImageSize;
	ULONG Flags;
	USHORT LoadOrderIndex;
	USHORT InitOrderIndex;
	USHORT LoadCount;
	USHORT OffsetToFileName;
	CHAR  FullPathName[256];
} *PRTL_PROCESS_MODULE_INFORMATION32;

typedef struct RTL_PROCESS_MODULES32 {
	ULONG NumberOfModules;
	RTL_PROCESS_MODULE_INFORMATION32 Modules[1];	// The sample code walks this array as consecutive module entries
} *PRTL_PROCESS_MODULES32;
#pragma pack(pop)


//Values for the 'ProcessInformationClass' parameter of NtQueryInformationProcess declared below
//(Note that the numbering has gaps after ProcessSubsystemProcess, so the values are assigned explicitly)
typedef enum _PROCESSINFOCLASS {
	ProcessBasicInformation = 0,
	ProcessQuotaLimits = 1,
	ProcessIoCounters = 2,
	ProcessVmCounters = 3,
	ProcessTimes = 4,
	ProcessBasePriority = 5,
	ProcessRaisePriority = 6,
	ProcessDebugPort = 7,
	ProcessExceptionPort = 8,
	ProcessAccessToken = 9,
	ProcessLdtInformation = 10,
	ProcessLdtSize = 11,
	ProcessDefaultHardErrorMode = 12,
	ProcessIoPortHandlers = 13,   // Note: this is kernel mode only
	ProcessPooledUsageAndLimits = 14,
	ProcessWorkingSetWatch = 15,
	ProcessUserModeIOPL = 16,
	ProcessEnableAlignmentFaultFixup = 17,
	ProcessPriorityClass = 18,
	ProcessWx86Information = 19,
	ProcessHandleCount = 20,
	ProcessAffinityMask = 21,
	ProcessPriorityBoost = 22,
	ProcessDeviceMap = 23,
	ProcessSessionInformation = 24,
	ProcessForegroundInformation = 25,
	ProcessWow64Information = 26,
	ProcessImageFileName = 27,
	ProcessLUIDDeviceMapsEnabled = 28,
	ProcessBreakOnTermination = 29,
	ProcessDebugObjectHandle = 30,
	ProcessDebugFlags = 31,
	ProcessHandleTracing = 32,
	ProcessIoPriority = 33,
	ProcessExecuteFlags = 34,
	ProcessTlsInformation = 35,
	ProcessCookie = 36,
	ProcessImageInformation = 37,
	ProcessCycleTime = 38,
	ProcessPagePriority = 39,
	ProcessInstrumentationCallback = 40,
	ProcessThreadStackAllocation = 41,
	ProcessWorkingSetWatchEx = 42,
	ProcessImageFileNameWin32 = 43,
	ProcessImageFileMapping = 44,
	ProcessAffinityUpdateMode = 45,
	ProcessMemoryAllocationMode = 46,
	ProcessGroupInformation = 47,
	ProcessTokenVirtualizationEnabled = 48,
	ProcessOwnerInformation = 49,
	ProcessWindowInformation = 50,
	ProcessHandleInformation = 51,
	ProcessMitigationPolicy = 52,	// Used by IsExportSuppressionEnabled() to query the CFG policy
	ProcessDynamicFunctionTableInformation = 53,
	ProcessHandleCheckingMode = 54,
	ProcessKeepAliveCount = 55,
	ProcessRevokeFileHandles = 56,
	ProcessWorkingSetControl = 57,
	ProcessHandleTable = 58,
	ProcessCheckStackExtentsMode = 59,
	ProcessCommandLineInformation = 60,
	ProcessProtectionInformation = 61,
	ProcessMemoryExhaustion = 62,
	ProcessFaultInformation = 63,
	ProcessTelemetryIdInformation = 64,
	ProcessCommitReleaseInformation = 65,
	ProcessReserved1Information = 66,
	ProcessReserved2Information = 67,
	ProcessSubsystemProcess = 68,
	ProcessInPrivate = 70,
	ProcessRaiseUMExceptionOnInvalidHandleClose = 71,
	ProcessSubsystemInformation = 75,
	ProcessWin32kSyscallFilterInformation = 79,
	ProcessEnergyTrackingState = 82,
	MaxProcessInfoClass                             // MaxProcessInfoClass should always be the last enum
} PROCESSINFOCLASS;

//Flag for OBJECT_ATTRIBUTES::Attributes (used when opening the section in ResolveNtDllFuncs32bit)
#define OBJ_CASE_INSENSITIVE                0x00000040L

//NTSTATUS codes returned by the sample code
//(All codes except STATUS_SUCCESS have their high bit set, thus the FAILED() macro treats them as failures)
#define STATUS_SUCCESS                   ((NTSTATUS)0x00000000L)
#define STATUS_BAD_DATA                  ((NTSTATUS)0xC000090BL)
#define STATUS_BAD_FILE_TYPE             ((NTSTATUS)0xC0000903L)
#define STATUS_INVALID_IMAGE_FORMAT      ((NTSTATUS)0xC000007BL)
#define STATUS_SOURCE_ELEMENT_EMPTY      ((NTSTATUS)0xC0000283L)
#define STATUS_FOUND_OUT_OF_SCOPE        ((NTSTATUS)0xC000022EL)
#define STATUS_ILLEGAL_FUNCTION          ((NTSTATUS)0xC00000AFL)
#define STATUS_OBJECT_NAME_NOT_FOUND     ((NTSTATUS)0xC0000034L)
#define STATUS_PROCEDURE_NOT_FOUND       ((NTSTATUS)0xC000007AL)
#define STATUS_INVALID_ADDRESS           ((NTSTATUS)0xC0000141L)
#define STATUS_STRICT_CFG_VIOLATION      ((NTSTATUS)0xC0000606L)

//RtlPointerToOffset: distance in bytes from the base 'B' to the pointer 'P', converted to ULONG
//RtlOffsetToPointer: pointer (as PCHAR) to the byte located 'O' bytes past the base 'B'
#define RtlPointerToOffset(B,P)  ((ULONG)( ((PCHAR)(P)) - ((PCHAR)(B)) ))
#define RtlOffsetToPointer(B,O)  ((PCHAR)( ((PCHAR)(B)) + ((ULONG_PTR)(O)) ))


//Information about an image section - retrieved with ZwQuerySection for the SectionImageInformation class
struct SECTION_IMAGE_INFORMATION
{
	PVOID TransferAddress;		//ResolveNtDllFuncs32bit() subtracts the entry point RVA from it to get the image base
	ULONG ZeroBits;
	SIZE_T MaximumStackSize;
	SIZE_T CommittedStackSize;
	ULONG SubSystemType;
	union
	{
		struct
		{
			USHORT SubSystemMinorVersion;
			USHORT SubSystemMajorVersion;
		};
		ULONG SubSystemVersion;
	};
	ULONG GpValue;
	USHORT ImageCharacteristics;
	USHORT DllCharacteristics;
	USHORT Machine;
	BOOLEAN ImageContainsCode;
	union
	{
		UCHAR ImageFlags;		//All of the bit flags below as a single byte
		struct
		{
			UCHAR ComPlusNativeReady : 1;
			UCHAR ComPlusILOnly : 1;
			UCHAR ImageDynamicallyRelocated : 1;
			UCHAR ImageMappedFlat : 1;
			UCHAR BaseBelow4gb : 1;
			UCHAR Reserved : 3;
		};
	};
	ULONG LoaderFlags;
	ULONG ImageFileSize;
	ULONG CheckSum;
};

//Values for the 'SectionInformationClass' parameter of ZwQuerySection declared below
enum SECTION_INFORMATION_CLASS
{
	SectionBasicInformation,
	SectionImageInformation		//Used with the SECTION_IMAGE_INFORMATION struct
};

//Values for the 'ThreadInformationClass' parameter of NtSetInformationThread declared below
//(Note that the numbering has gaps after ThreadSuspendCount, so the values are assigned explicitly)
typedef enum _THREADINFOCLASS {
	ThreadBasicInformation = 0,
	ThreadTimes = 1,
	ThreadPriority = 2,
	ThreadBasePriority = 3,
	ThreadAffinityMask = 4,
	ThreadImpersonationToken = 5,
	ThreadDescriptorTableEntry = 6,
	ThreadEnableAlignmentFaultFixup = 7,
	ThreadEventPair_Reusable = 8,
	ThreadQuerySetWin32StartAddress = 9,
	ThreadZeroTlsCell = 10,
	ThreadPerformanceCount = 11,
	ThreadAmILastThread = 12,
	ThreadIdealProcessor = 13,
	ThreadPriorityBoost = 14,
	ThreadSetTlsArrayAddress = 15,   // Obsolete
	ThreadIsIoPending = 16,
	ThreadHideFromDebugger = 17,     // The flag discussed in the text of this post
	ThreadBreakOnTermination = 18,
	ThreadSwitchLegacyState = 19,
	ThreadIsTerminated = 20,
	ThreadLastSystemCall = 21,
	ThreadIoPriority = 22,
	ThreadCycleTime = 23,
	ThreadPagePriority = 24,
	ThreadActualBasePriority = 25,
	ThreadTebInformation = 26,
	ThreadCSwitchMon = 27,   // Obsolete
	ThreadCSwitchPmu = 28,
	ThreadWow64Context = 29,
	ThreadGroupInformation = 30,
	ThreadUmsInformation = 31,   // UMS
	ThreadCounterProfiling = 32,
	ThreadIdealProcessorEx = 33,
	ThreadCpuAccountingInformation = 34,
	ThreadSuspendCount = 35,
	ThreadActualGroupAffinity = 41,
	ThreadDynamicCodePolicyInfo = 42,
	ThreadSubsystemInformation = 45,

	MaxThreadInfoClass = 51,
} THREADINFOCLASS;

//Values for the 'MemoryInformationClass' parameter of NtQueryVirtualMemory declared below
//(Only the class used by SetValidExport() is declared here)
typedef enum _MEMORY_INFORMATION_CLASS {
	MemoryBasicInformation		//Used with the MEMORY_BASIC_INFORMATION struct
} MEMORY_INFORMATION_CLASS;


//Imported native functions from ntdll
//(All are declared with C linkage and the CALLBACK calling convention; they are resolved through ntdll.lib at link time)
extern "C" {
	//Takes a thread handle, an APC routine and the three arguments for that routine
	__declspec(dllimport) NTSTATUS CALLBACK ZwQueueApcThread
	(
		HANDLE hThread,
		PKNORMAL_ROUTINE ApcRoutine, 
		PVOID ApcContext, 
		PVOID Argument1, 
		PVOID Argument2
	);

	//Returns the handle of the section in 'SectionHandle'
	__declspec(dllimport) NTSTATUS CALLBACK NtCreateSection
	(
		_Out_ PHANDLE SectionHandle,
		_In_ ACCESS_MASK DesiredAccess,
		_In_opt_ POBJECT_ATTRIBUTES ObjectAttributes,
		_In_opt_ PLARGE_INTEGER MaximumSize,
		_In_ ULONG SectionPageProtection,
		_In_ ULONG AllocationAttributes,
		_In_opt_ HANDLE FileHandle
	);

	//Used by the sample code to close section and thread handles
	__declspec(dllimport) NTSTATUS CALLBACK ZwClose
	(
		_In_ HANDLE Handle
	);

	//Maps a view of the section into 'ProcessHandle'; the address and the size of the view
	//are returned in 'BaseAddress' and 'ViewSize'
	__declspec(dllimport) NTSTATUS CALLBACK ZwMapViewOfSection
	(
		_In_ HANDLE SectionHandle,
		_In_ HANDLE ProcessHandle,
		_Outptr_result_bytebuffer_(*ViewSize) PVOID* BaseAddress,
		_In_ ULONG_PTR ZeroBits,
		_In_ SIZE_T CommitSize,
		_Inout_opt_ PLARGE_INTEGER SectionOffset,
		_Inout_ PSIZE_T ViewSize,
		_In_ SECTION_INHERIT InheritDisposition,
		_In_ ULONG AllocationType,
		_In_ ULONG Win32Protect
	);

	//Unmaps the view located at 'BaseAddress' in 'ProcessHandle'
	__declspec(dllimport) NTSTATUS CALLBACK ZwUnmapViewOfSection
	(
		_In_ HANDLE ProcessHandle,
		_In_opt_ PVOID BaseAddress
	);

	//Returns the handle of the thread in 'phThread' and its IDs in 'pCid'
	__declspec(dllimport) NTSTATUS CALLBACK RtlCreateUserThread
	(
		IN HANDLE hProcess,
		PVOID   SecurityDescriptor,
		BOOLEAN CreateSuspended,
		ULONG	ZeroBits,
		SIZE_T	StackReserve,
		SIZE_T	StackCommit,
		PVOID	EntryPoint,
		const void* Argument,
		PHANDLE	phThread,
		PCLIENT_ID pCid
	);

	//One of the two exports that ResolveNtDllFuncs32bit() resolves in the 32-bit ntdll
	__declspec(dllimport) NTSTATUS CALLBACK RtlExitUserThread
	(
		DWORD dwExitCode
	);

	//Same parameters as ZwQueueApcThread
	//NOTE(review): judging by its name this is the variant for threads of WOW64 processes - confirm
	__declspec(dllimport) NTSTATUS CALLBACK RtlQueueApcWow64Thread
	(
		HANDLE hThread, 
		PKNORMAL_ROUTINE ApcRoutine, 
		PVOID ApcContext, 
		PVOID Argument1, 
		PVOID Argument2
	);

	//One of the two exports that ResolveNtDllFuncs32bit() resolves in the 32-bit ntdll
	__declspec(dllimport) NTSTATUS CALLBACK LdrQueryProcessModuleInformation
	(
		PRTL_PROCESS_MODULES psmi,
		ULONG BufferSize,
		PULONG RealSize
	);

	//Used by IsExportSuppressionEnabled() with the ProcessMitigationPolicy class
	__declspec(dllimport) NTSTATUS CALLBACK NtQueryInformationProcess
	(
		IN HANDLE ProcessHandle,
		IN  PROCESSINFOCLASS ProcessInformationClass,
		OUT PVOID ProcessInformation,
		IN ULONG ProcessInformationLength,
		OUT PULONG ReturnLength OPTIONAL
	);

	//Opens an existing section by the name given in 'ObjectAttributes'
	__declspec(dllimport) NTSTATUS CALLBACK ZwOpenSection
	(
		_Out_ PHANDLE SectionHandle,
		_In_ ACCESS_MASK DesiredAccess,
		_In_ POBJECT_ATTRIBUTES ObjectAttributes
	);

	//'SectionInformationClass' takes one of the SECTION_INFORMATION_CLASS values
	__declspec(dllimport) NTSTATUS CALLBACK ZwQuerySection
	(
		IN HANDLE SectionHandle,
		IN ULONG SectionInformationClass,
		OUT PVOID SectionInformation,
		IN ULONG SectionInformationLength,
		OUT PSIZE_T ResultLength OPTIONAL
	);

	//Returns the NT headers of the image mapped at 'Base' (the sample code treats NULL as an invalid image)
	__declspec(dllimport) PIMAGE_NT_HEADERS CALLBACK RtlImageNtHeader
	(
		PVOID Base
	);

	//Returns the pointer to the data of the 'DirectoryEntry' directory and its size in 'Size'
	//(the sample code treats NULL as a missing directory)
	__declspec(dllimport) PVOID CALLBACK RtlImageDirectoryEntryToData
	(
		PVOID Base,
		BOOLEAN MappedAsImage,
		USHORT DirectoryEntry,
		PULONG Size
	);

	//NOTE(review): ThreadManageWritesToExecutableMemory is referenced only by the annotations below and is not
	//              among the THREADINFOCLASS values declared in this listing - confirm that the annotations
	//              expand to nothing in your build
	__declspec(dllimport) NTSTATUS CALLBACK NtSetInformationThread(
		_In_ HANDLE ThreadHandle,
		_In_ THREADINFOCLASS ThreadInformationClass,
		_When_((ThreadInformationClass != ThreadManageWritesToExecutableMemory),
			_In_reads_bytes_(ThreadInformationLength))
		_When_((ThreadInformationClass == ThreadManageWritesToExecutableMemory),
			_Inout_updates_(ThreadInformationLength))
		PVOID ThreadInformation,
		_In_ ULONG ThreadInformationLength
	);

	//Used by SetValidExport() with the MemoryBasicInformation class
	__declspec(dllimport) NTSTATUS CALLBACK NtQueryVirtualMemory(
		_In_ HANDLE ProcessHandle,
		_In_opt_ PVOID BaseAddress,
		_In_ MEMORY_INFORMATION_CLASS MemoryInformationClass,
		_Out_writes_bytes_(MemoryInformationLength) PVOID MemoryInformation,
		_In_ SIZE_T MemoryInformationLength,
		_Out_opt_ PSIZE_T ReturnLength
	);
}

Epilogue

As you can tell by the size of this blog post, Asynchronous Procedure Calls is a tricky subject in Windows. The best way to understand it is to code it yourself and to test it in practice. And if you run into an interesting situation dealing with APCs yourself, feel free to leave a comment below.

Or, if you want to contact me (Rbmm) or Dennis A. Babkin directly, feel free to do that.

Related Articles