cuD3D9CtxCreate returns CUDA_ERROR_UNKNOWN

Hi everyone, I need some help.

(Sorry, my English is not very good.)

Recently I have been developing with the NVIDIA Video Codec SDK.
My current main goal is to play multi-channel Full HD H.264 streams with D3D9.

With an NVIDIA GTX 960M (2nd-generation Maxwell),
I hope it can decode more than 25 channels at the same time.
However, it fails to create a CUDA context with D3D9 interop once the number of channels (instances) exceeds 16.

the source code (C++) :
CUresult cuResult = cuD3D9CtxCreate(&ctx, &device, CU_CTX_BLOCKING_SYNC, (IDirect3DDevice9 *)d3ddevice);

Result of CUDA context creation:
1~16 channels: OK
more than 16 channels: FAIL (CUDA_ERROR_UNKNOWN)

What am I doing wrong?

** D3D9 device creation code (C#):
// Behavior flags passed to the DeviceEx constructor.
var createFlags =
    CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded | CreateFlags.FpuPreserve;

// Presentation settings: windowed, one A8R8G8B8 back buffer sized to
// backBufferSize, presenting into hWnd.
var presentParams = new PresentParameters(backBufferSize.Width, backBufferSize.Height)
{
    PresentationInterval = PresentInterval.One,
    SwapEffect = SwapEffect.Discard,
    Windowed = true,
    DeviceWindowHandle = hWnd,
    BackBufferCount = 1,
    BackBufferFormat = Format.A8R8G8B8,
};

// Create the hardware D3D9Ex device on the selected adapter.
var device = new SharpDX.Direct3D9.DeviceEx(
    d3d,
    adapterId,
    SharpDX.Direct3D9.DeviceType.Hardware,
    hWnd,
    createFlags,
    presentParams);

The result is the same in the tests below:

int create_d3d9_interop(int chId, int adapter, void *d3ddevice)
{
CUcontext ctx = NULL;
CUdevice device = NULL;

auto cuResult = cuD3D9CtxCreate(&ctx, &device, CU_CTX_BLOCKING_SYNC, (IDirect3DDevice9 *)d3ddevice);
printf("%s() ch_%d) create: %s (result code: %d)\n", __FUNCTION__, chId, cuResult == CUDA_SUCCESS ? "TRUE" : "FALSE", cuResult);
return cuResult == CUDA_SUCCESS ? 1 : 0;

}
int create_d3d11_interop(int chId, int adapter, void *d3ddevice)
{
CUcontext ctx = NULL;
CUdevice device = NULL;

auto cuResult = cuD3D11CtxCreate(&ctx, &device, CU_CTX_BLOCKING_SYNC, (ID3D11Device *)d3ddevice);
printf("%s() ch_%d) create: %s (result code: %d)\n", __FUNCTION__, chId, cuResult == CUDA_SUCCESS ? "TRUE" : "FALSE", cuResult);
return cuResult == CUDA_SUCCESS ? 1 : 0;

}

//
// Test driver: starts numThreads threads that each try to create a CUDA
// interop context on one shared Direct3D device, first for D3D9 and then for
// D3D11. Each worker prints its own result line.
//
// Each D3D environment lives in its own scope and all of its worker threads
// are joined before that scope ends, so the device outlives every thread that
// received its pointer.
//
// Always returns 0; success or failure of the individual context creations is
// only reported through the workers' printf output.
int main(int argc, char *argv[])
{
    // Command-line arguments are not used by this test.
    (void)argc;
    (void)argv;

    HWND hwnd = NULL;
    const int adapter = cuda_get_proper_adapter(); // auto detect cuda adapter

    const int numThreads = 25;

    // --- Direct3D 9 interop ---
    {
        CD3D9Env d3d(adapter, hwnd);
        std::vector<std::thread> threads;
        threads.reserve(numThreads);
        for (int i = 0; i < numThreads; i++) {
            threads.emplace_back(create_d3d9_interop, i, adapter, d3d.GetDevice());
        }
        for (auto &worker : threads) {
            if (worker.joinable())
                worker.join();
        }
    }
    printf("\n");

    // --- Direct3D 11 interop ---
    {
        CD3D11Env d3d(adapter, hwnd);
        std::vector<std::thread> threads;
        threads.reserve(numThreads);
        for (int i = 0; i < numThreads; i++) {
            threads.emplace_back(create_d3d11_interop, i, adapter, d3d.GetDevice());
        }
        for (auto &worker : threads) {
            if (worker.joinable())
                worker.join();
        }
    }
    printf("\n");

    // Keep the console window open so the output can be read (Windows shell command).
    system("pause");
    return 0;
}

// results:
create_d3d9_interop() ch_21) create: TRUE (result code: 0)
create_d3d9_interop() ch_3) create: TRUE (result code: 0)
create_d3d9_interop() ch_24) create: TRUE (result code: 0)
create_d3d9_interop() ch_6) create: TRUE (result code: 0)
create_d3d9_interop() ch_4) create: TRUE (result code: 0)
create_d3d9_interop() ch_11) create: TRUE (result code: 0)
create_d3d9_interop() ch_14) create: TRUE (result code: 0)
create_d3d9_interop() ch_19) create: TRUE (result code: 0)
create_d3d9_interop() ch_5) create: TRUE (result code: 0)
create_d3d9_interop() ch_12) create: TRUE (result code: 0)
create_d3d9_interop() ch_22) create: TRUE (result code: 0)
create_d3d9_interop() ch_16) create: TRUE (result code: 0)
create_d3d9_interop() ch_10) create: TRUE (result code: 0)
create_d3d9_interop() ch_2) create: TRUE (result code: 0)
create_d3d9_interop() ch_1) create: TRUE (result code: 0)
create_d3d9_interop() ch_23) create: TRUE (result code: 0)
create_d3d9_interop() ch_9) create: FALSE (result code: 999)
create_d3d9_interop() ch_7) create: FALSE (result code: 999)
create_d3d9_interop() ch_17) create: FALSE (result code: 999)
create_d3d9_interop() ch_15) create: FALSE (result code: 999)
create_d3d9_interop() ch_8) create: FALSE (result code: 999)
create_d3d9_interop() ch_20) create: FALSE (result code: 999)
create_d3d9_interop() ch_0) create: FALSE (result code: 999)
create_d3d9_interop() ch_18) create: FALSE (result code: 999)
create_d3d9_interop() ch_13) create: FALSE (result code: 999)

create_d3d11_interop() ch_21) create: TRUE (result code: 0)
create_d3d11_interop() ch_14) create: TRUE (result code: 0)
create_d3d11_interop() ch_0) create: TRUE (result code: 0)
create_d3d11_interop() ch_10) create: TRUE (result code: 0)
create_d3d11_interop() ch_7) create: TRUE (result code: 0)
create_d3d11_interop() ch_4) create: TRUE (result code: 0)
create_d3d11_interop() ch_8) create: TRUE (result code: 0)
create_d3d11_interop() ch_5) create: TRUE (result code: 0)
create_d3d11_interop() ch_16) create: TRUE (result code: 0)
create_d3d11_interop() ch_24) create: TRUE (result code: 0)
create_d3d11_interop() ch_19) create: TRUE (result code: 0)
create_d3d11_interop() ch_23) create: TRUE (result code: 0)
create_d3d11_interop() ch_6) create: TRUE (result code: 0)
create_d3d11_interop() ch_15) create: TRUE (result code: 0)
create_d3d11_interop() ch_11) create: TRUE (result code: 0)
create_d3d11_interop() ch_22) create: FALSE (result code: 999)
create_d3d11_interop() ch_18) create: FALSE (result code: 999)
create_d3d11_interop() ch_13) create: FALSE (result code: 999)
create_d3d11_interop() ch_1) create: FALSE (result code: 999)
create_d3d11_interop() ch_2) create: FALSE (result code: 999)
create_d3d11_interop() ch_3) create: FALSE (result code: 999)
create_d3d11_interop() ch_20) create: FALSE (result code: 999)
create_d3d11_interop() ch_17) create: FALSE (result code: 999)
create_d3d11_interop() ch_9) create: FALSE (result code: 999)
create_d3d11_interop() ch_12) create: FALSE (result code: 999)