device memory must be set just before calling kernel ?

I’am a cuda beginner.

I wrote a cpu thread which copies host memory to device memory. And cpu main thread runs cuda kernel to calculate the data. But it is not working. Below is test code for it(some codes are eliminated such as malloc).

What I want to do is split copying some data from host to device and calculating data on GPU by making two threads on CPU. One thread on CPU is just copying data from host to device and another is just run the calculation.

Below test code is not working. But if i move stream_test_1()'s contents to just before kernel calling of stream_test_2(), it works.

So my question is do I have to put memory copying just before kernel calling?

//--- CPU Thread ---

void thread1(void *p)

{

	float cnt = 0.0;

	float *ha = (float *)p;

	while (1)

	{

		for (int i = 0; i < 256; i++)

			ha[i] = cnt;

		stream_test_1();

		cnt += 1.0;

	}

}

int _tmain(int argc, _TCHAR* argv[])

{

	float *ha = stream_init();

	_beginthread(thread1, 0, (void *)ha);

	while (1)

	{

		float *hc = stream_test_2();

	}

	return 0;

}
//--- GPU Kernel ---

extern "C" float *stream_init()

{

	CUT_DEVICE_INIT(1, "");

	CUDA_SAFE_CALL(cudaMallocHost((void **)&ha, 1024));

	CUDA_SAFE_CALL(cudaMallocHost((void **)&hc, 1024));

	CUDA_SAFE_CALL(cudaMalloc((void **)&da, 1024));

	CUDA_SAFE_CALL(cudaMalloc((void **)&db, 1024));

	CUDA_SAFE_CALL(cudaMalloc((void **)&dc, 1024));

	CUDA_SAFE_CALL(cudaMemset(db, 0, 1024));

	return ha;

}

__global__ void add_array(float *a, float *b, float *c)

{

	int x = threadIdx.x;

	int y = threadIdx.y;

	c[y*16+x] = a[y*16+x] + b[y*16+x];

	__syncthreads();

}

extern "C" void stream_test_1()

{

	cudaMemcpy(da, ha, 1024, cudaMemcpyHostToDevice);

}

extern "C" float *stream_test_2()

{

	dim3 blocks(1, 1, 1);

	dim3 threads(16, 16, 1);

	add_array<<<blocks, threads, 0>>>(da, db, dc);

	cudaMemcpy(hc, dc, 1024, cudaMemcpyDeviceToHost);

	printf("%.1f %.1f\r", ha[0], hc[0]);

	return hc;

}

2 separate host threads live in different CUDA contexts and cannot access each others device memory. If you want asynchronous memory copies to the device, use the stream API and cudaMemcpyAsync.