context and host thread question

I need several threads on the host side. One thread acquires data from the hardware, puts it into CUDA host memory, and then signals another thread. The other thread executes the calculation and then copies the result back to host memory.

I found that I have to use the same context in all threads so that they see the same address space, so I wrote the test program below.

But the result is valid only for the first iteration. From the second iteration on, the result does not change.

What is the problem?

// ContextTest.cpp

#include "stdafx.h"

#include <cuda.h>

#include <cutil.h>

extern "C" void ctx_init();

extern "C" void ctx_close();

extern "C" float ctx_mem_set(float a);

extern "C" void ctx_kernel();

extern "C" float ctx_mem_get();

/* Arguments handed to both worker threads. */
typedef struct {
	/* src: event set by thread1 and waited on by thread2.
	 * dst: second event; stored here but not used by the threads shown. */
	HANDLE src, dst;
	/* CUDA driver context that each thread pushes before calling into ctx.cu. */
	CUcontext ctx;
} thread_t;

/*
 * Producer thread.
 *
 * Roughly once per second: makes the shared context current, uploads an
 * increasing counter through ctx_mem_set(), releases the context, signals
 * t->src and prints the value that was uploaded. Never returns.
 */
DWORD WINAPI thread1(thread_t *t)
{
	float counter = 1.0;

	for (;;)
	{
		/* The context is current only for the duration of the upload. */
		cuCtxPushCurrent(t->ctx);
		float uploaded = ctx_mem_set(counter);
		cuCtxPopCurrent(NULL);

		/* Tell the consumer that new input is available. */
		SetEvent(t->src);
		printf("ha=%f / ", uploaded);

		counter += 1.0;
		Sleep(1000);
	}
}

/*
 * Consumer thread.
 *
 * Waits (with a one second timeout) for t->src; each time it is signalled,
 * makes the shared context current, runs ctx_kernel(), reads the result back
 * with ctx_mem_get(), releases the context and prints the result.
 * Never returns.
 */
DWORD WINAPI thread2(thread_t *t)
{
	float result = 0.0;

	for (;;)
	{
		/* Nothing signalled within the timeout: just wait again. */
		if (WaitForSingleObject(t->src, 1000) != WAIT_OBJECT_0)
			continue;

		cuCtxPushCurrent(t->ctx);
		ctx_kernel();
		result = ctx_mem_get();
		cuCtxPopCurrent(NULL);

		printf("hc=%f\n", result);
	}
}

int _tmain(int argc, _TCHAR* argv[])

{

	CUcontext hcuContext;

	HANDLE e1, e2;

	thread_t t;

	cuInit(0);

	cuCtxCreate(&hcuContext, 0, 0);

	ctx_init();

	cuCtxPopCurrent(NULL);

	e1 = CreateEvent(NULL, false, false, L"event1");

	e2 = CreateEvent(NULL, false, false, L"event2");

	t.src = e1;

	t.dst = e2;

	t.ctx = hcuContext;

	CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)thread1, &t, 0, 0);

	CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)thread2, &t, 0, 0);

	

	while (1)

	;

	

	ctx_close();

	return 0;

}
// ctx.cu

#include <stdio.h>

#include <cuda.h>

#include <cutil.h>

float *ha;

CUdeviceptr da;

float *hc;

CUdeviceptr dc;

/*
 * Allocates the four one-float buffers used by this file in the current
 * driver context: host buffers ha/hc (cuMemAllocHost) and their device
 * counterparts da/dc (cuMemAlloc). Return codes are not inspected.
 */
extern "C" void ctx_init()
{
	const size_t bytes = sizeof(float);

	/* Host-side staging buffers. */
	cuMemAllocHost((void **)&ha, bytes);
	cuMemAllocHost((void **)&hc, bytes);

	/* Device-side input and output. */
	cuMemAlloc(&da, bytes);
	cuMemAlloc(&dc, bytes);
}

/*
 * Releases the buffers obtained in ctx_init(): device buffers da/dc and host
 * buffers ha/hc. Return codes are not inspected.
 */
extern "C" void ctx_close()
{
	/* Device allocations. */
	cuMemFree(da);
	cuMemFree(dc);

	/* Host allocations. */
	cuMemFreeHost(ha);
	cuMemFreeHost(hc);
}

/*
 * Stores `a` in the host buffer ha and copies that single float to the
 * device buffer da.
 *
 * Returns the value now held in the host buffer.
 */
extern "C" float ctx_mem_set(float a)
{
	ha[0] = a;
	cuMemcpyHtoD(da, ha, sizeof(*ha));
	return ha[0];
}

/*
 * Writes 1.5 times the float at *a into *c.
 *
 * No thread indexing is used, so every launched thread performs the same
 * load and store; the caller in this file launches one block of one thread.
 */
__global__ static void kernel(float *a, float *c)
{
	/* 1.5f keeps the multiply in single precision; a bare 1.5 is a double
	 * literal and would promote the expression to double. */
	*c = *a * 1.5f;
}

/*
 * Launches `kernel` on one block of one thread with the device buffers da
 * (input) and dc (output).
 *
 * NOTE(review): this is a runtime-API style <<<...>>> launch inside code that
 * otherwise manages its context through the driver API. The accompanying
 * post reports that with this version the result stops updating after the
 * first iteration.
 */
extern "C" void ctx_kernel()
{
	const dim3 grid(1, 1, 1);
	const dim3 block(1, 1, 1);

	float *input = (float *)da;
	float *output = (float *)dc;

	kernel<<<grid, block>>>(input, output);
}

/*
 * Copies the single float in device buffer dc into host buffer hc.
 *
 * Returns the value now held in the host buffer.
 */
extern "C" float ctx_mem_get()
{
	cuMemcpyDtoH(hc, dc, sizeof(*hc));
	return hc[0];
}

The resulting output is:

ha=1.000000 / hc=1.500000

ha=2.000000 / hc=1.500000

ha=3.000000 / hc=1.500000

ha=4.000000 / hc=1.500000

ha=5.000000 / hc=1.500000

I solved this!

The kernel has to be compiled to a cubin and then loaded with cuModuleLoad before it can be called.

I took the kernel function out of ctx.cu and put it in a new file, kernel.cu, so that this file can be compiled to a cubin.

Here are ctx.cu and kernel.cu:

// ctx.cu

#include <stdio.h>

#include <cuda.h>

#include <cutil.h>

float *ha;

CUdeviceptr da;

float *hc;

CUdeviceptr dc;

CUmodule hcuModule;

CUfunction hcuFunction;

/*
 * Allocates the one-float host buffers (ha, hc) and device buffers (da, dc)
 * in the current driver context, loads "kernel.cubin" and looks up the
 * function named "kernel" in it.
 *
 * argv is accepted but not used.
 *
 * Every driver call is checked; failures are reported on stderr. If the
 * module cannot be loaded the function lookup is skipped.
 */
extern "C" void ctx_init(char **argv)
{
	CUresult status;

	(void)argv;	/* unused; kept so the signature does not change */

	status = cuMemAllocHost((void **)&ha, sizeof(float));
	if (status != CUDA_SUCCESS)
		fprintf(stderr, "ctx_init: cuMemAllocHost(ha) failed (%d)\n", (int)status);

	status = cuMemAlloc(&da, sizeof(float));
	if (status != CUDA_SUCCESS)
		fprintf(stderr, "ctx_init: cuMemAlloc(da) failed (%d)\n", (int)status);

	status = cuMemAllocHost((void **)&hc, sizeof(float));
	if (status != CUDA_SUCCESS)
		fprintf(stderr, "ctx_init: cuMemAllocHost(hc) failed (%d)\n", (int)status);

	status = cuMemAlloc(&dc, sizeof(float));
	if (status != CUDA_SUCCESS)
		fprintf(stderr, "ctx_init: cuMemAlloc(dc) failed (%d)\n", (int)status);

	status = cuModuleLoad(&hcuModule, "kernel.cubin");
	if (status != CUDA_SUCCESS)
	{
		fprintf(stderr, "ctx_init: cuModuleLoad(kernel.cubin) failed (%d)\n", (int)status);
		return;	/* no module, so there is nothing to look up */
	}

	status = cuModuleGetFunction(&hcuFunction, hcuModule, "kernel");
	if (status != CUDA_SUCCESS)
		fprintf(stderr, "ctx_init: cuModuleGetFunction(kernel) failed (%d)\n", (int)status);
}

/*
 * Releases everything ctx_init() acquired: the host buffers ha/hc, the
 * device buffers da/dc, and the module loaded from kernel.cubin.
 */
extern "C" void ctx_close()
{
	cuMemFreeHost(ha);
	cuMemFree(da);
	cuMemFreeHost(hc);
	cuMemFree(dc);

	/* The module was loaded in ctx_init() but never unloaded; release it
	 * here. hcuModule is a zero-initialised global, so it is still NULL if
	 * the load never succeeded. */
	if (hcuModule != NULL)
	{
		cuModuleUnload(hcuModule);
		hcuModule = NULL;
	}
}

/*
 * Writes `a` to the host buffer ha, then uploads that one float to the
 * device buffer da.
 *
 * Returns the value read back from the host buffer.
 */
extern "C" float ctx_mem_set(float a)
{
	const size_t bytes = sizeof(float);

	ha[0] = a;
	cuMemcpyHtoD(da, ha, bytes);

	return ha[0];
}

/*
 * Launches the function loaded into hcuFunction on a single 1x1x1 block,
 * passing the device buffers da and dc as its two pointer arguments.
 *
 * The arguments are passed with cuParamSetv as pointer-sized values at
 * pointer-aligned offsets. cuParamSeti takes a 32-bit unsigned int, so it
 * would truncate the addresses on a 64-bit build.
 *
 * Any driver error is reported on stderr; after the first failure the
 * remaining calls are skipped.
 */
extern "C" void ctx_kernel()
{
	void *args[2];
	const int align = (int)sizeof(void *);
	int offset = 0;
	int i;
	CUresult status;

	/* Go through size_t so the conversion is well defined whatever integer
	 * width CUdeviceptr has. */
	args[0] = (void *)(size_t)da;
	args[1] = (void *)(size_t)dc;

	status = cuFuncSetBlockShape(hcuFunction, 1, 1, 1);

	for (i = 0; i < 2 && status == CUDA_SUCCESS; ++i)
	{
		/* Round the offset up to the pointer's alignment. */
		offset = (offset + align - 1) & ~(align - 1);
		status = cuParamSetv(hcuFunction, offset, &args[i], sizeof(args[i]));
		offset += (int)sizeof(args[i]);
	}

	if (status == CUDA_SUCCESS)
		status = cuParamSetSize(hcuFunction, offset);

	if (status == CUDA_SUCCESS)
		status = cuLaunch(hcuFunction);

	if (status != CUDA_SUCCESS)
		fprintf(stderr, "ctx_kernel: kernel launch setup failed (%d)\n", (int)status);
}

/*
 * Downloads the one float held in device buffer dc into host buffer hc.
 *
 * Returns the value read back from the host buffer.
 */
extern "C" float ctx_mem_get()
{
	const size_t bytes = sizeof(float);

	cuMemcpyDtoH(hc, dc, bytes);

	return hc[0];
}
// kernel.cu

/*
 * Writes 1.5 times the float at *a into *c.
 *
 * Declared extern "C" so the symbol is exported under the plain name
 * "kernel". No thread indexing is used, so every launched thread performs
 * the same load and store.
 */
extern "C" __global__ void kernel(float *a, float *c)
{
	/* 1.5f keeps the multiply in single precision; a bare 1.5 is a double
	 * literal. The trailing __syncthreads() was dropped: nothing follows the
	 * store and no shared memory is involved, so there is nothing to
	 * synchronise. */
	*c = *a * 1.5f;
}