I need several host-side threads. One thread acquires data from hardware, puts it into CUDA host memory, and then signals another thread. The other thread runs the calculation and then copies the result back to host memory.
I found that all threads have to use the same context in order to see the same address space, so I wrote a test program like the one below.
But the result is valid only for the first iteration. From the second iteration on, the result does not change.
What is the problem?
// ContextTest.cpp
#include "stdafx.h"
#include <cuda.h>
#include <cutil.h>
// C-linkage entry points called by the host threads and by _tmain below.
extern "C" {
    void  ctx_init();            // allocate the buffers
    void  ctx_close();           // release the buffers
    float ctx_mem_set(float a);  // upload one input value
    void  ctx_kernel();          // launch the computation
    float ctx_mem_get();         // download the result
}
// Per-thread argument bundle handed to thread1/thread2 through CreateThread.
typedef struct {
    HANDLE    src;  // event signalled by thread1 and waited on by thread2
    HANDLE    dst;  // second event; stored by _tmain but not used by either thread
    CUcontext ctx;  // driver-API context each thread pushes before doing CUDA work
} thread_t;
// Producer thread: once per second, uploads an increasing counter value via
// ctx_mem_set() and signals t->src so the consumer can process it.
//
// t - shared thread arguments; t->ctx is made current around the upload and
//     t->src is the event that is signalled after each upload.
//
// Never returns (infinite loop).
DWORD WINAPI thread1(thread_t *t)
{
    float ha;
    float cnt = 1.0f;
    while (1)
    {
        // Make the shared context current for this thread. If that fails,
        // nothing below can work, so report it and do not signal the consumer.
        CUresult rc = cuCtxPushCurrent(t->ctx);
        if (rc != CUDA_SUCCESS)
        {
            fprintf(stderr, "thread1: cuCtxPushCurrent failed (%d)\n", (int)rc);
            Sleep(1000);
            continue;
        }

        ha = ctx_mem_set(cnt);

        // Release the context again so the other thread can push it.
        rc = cuCtxPopCurrent(NULL);
        if (rc != CUDA_SUCCESS)
            fprintf(stderr, "thread1: cuCtxPopCurrent failed (%d)\n", (int)rc);

        SetEvent(t->src);
        printf("ha=%f / ", ha);
        cnt += 1.0f;
        Sleep(1000);
    }
}
// Consumer thread: waits for t->src, then runs the kernel and reads the result
// back through ctx_kernel()/ctx_mem_get() with t->ctx made current.
//
// t - shared thread arguments; t->src is the event to wait on and t->ctx is
//     the context pushed around the CUDA work.
//
// Never returns (infinite loop).
DWORD WINAPI thread2(thread_t *t)
{
    float hc = 0.0f;
    while (1)
    {
        // A timeout simply loops and waits again.
        if (WaitForSingleObject(t->src, 1000) != WAIT_OBJECT_0)
            continue;

        // Without a current context the kernel launch and the copy cannot
        // succeed, so report the failure instead of printing a stale value.
        CUresult rc = cuCtxPushCurrent(t->ctx);
        if (rc != CUDA_SUCCESS)
        {
            fprintf(stderr, "thread2: cuCtxPushCurrent failed (%d)\n", (int)rc);
            continue;
        }

        ctx_kernel();
        hc = ctx_mem_get();

        rc = cuCtxPopCurrent(NULL);
        if (rc != CUDA_SUCCESS)
            fprintf(stderr, "thread2: cuCtxPopCurrent failed (%d)\n", (int)rc);

        printf("hc=%f\n", hc);
    }
}
int _tmain(int argc, _TCHAR* argv[])
{
CUcontext hcuContext;
HANDLE e1, e2;
thread_t t;
cuInit(0);
cuCtxCreate(&hcuContext, 0, 0);
ctx_init();
cuCtxPopCurrent(NULL);
e1 = CreateEvent(NULL, false, false, L"event1");
e2 = CreateEvent(NULL, false, false, L"event2");
t.src = e1;
t.dst = e2;
t.ctx = hcuContext;
CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)thread1, &t, 0, 0);
CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)thread2, &t, 0, 0);
while (1)
;
ctx_close();
return 0;
}
// ctx.cu
#include <stdio.h>
#include <cuda.h>
#include <cutil.h>
// Host-side staging buffers (allocated with cuMemAllocHost in ctx_init).
float *ha;       // input value written by ctx_mem_set
float *hc;       // result value read back by ctx_mem_get

// Device-side buffers (allocated with cuMemAlloc in ctx_init).
CUdeviceptr da;  // kernel input
CUdeviceptr dc;  // kernel output
extern "C" void ctx_init()
{
cuMemAllocHost((void **)&ha, sizeof(float));
cuMemAlloc(&da, sizeof(float));
cuMemAllocHost((void **)&hc, sizeof(float));
cuMemAlloc(&dc, sizeof(float));
}
extern "C" void ctx_close()
{
cuMemFreeHost(ha);
cuMemFree(da);
cuMemFreeHost(hc);
cuMemFree(dc);
}
extern "C" float ctx_mem_set(float a)
{
*ha = a;
cuMemcpyHtoD(da, ha, sizeof(float));
return *ha;
}
// Writes 1.5 times the float at `a` into the float at `c`.
//
// a - device pointer to the single input value (read only).
// c - device pointer to the single output value.
//
// Every thread reads and writes the same two locations, so this is meant to
// be launched with a single thread.
__global__ static void kernel(float *a, float *c)
{
    // Float literal keeps the multiply in single precision (a bare 1.5 is a
    // double and would promote the whole expression).
    (*c) = (*a) * 1.5f;
}
extern "C" void ctx_kernel()
{
dim3 b(1, 1, 1);
dim3 t(1, 1, 1);
kernel<<<b, t>>>((float *)da, (float *)dc);
}
extern "C" float ctx_mem_get()
{
cuMemcpyDtoH(hc, dc, sizeof(float));
return *hc;
}
The resulting output is:
ha=1.000000 / hc=1.500000
ha=2.000000 / hc=1.500000
ha=3.000000 / hc=1.500000
ha=4.000000 / hc=1.500000
ha=5.000000 / hc=1.500000