QUIT CUDA? Kernel and pinned memory give strange results

Hello,

I do not understand why my kernel is not doing the appropriate computations. I allocated pinned memory on the host and wrote a very simple kernel that takes the pinned buffer as input and adds 2.0 to each of its elements. I do this with a sequence of kernel launches (this is in fact a test case for a more complicated problem that I would like to solve), repeated NBLOCKS times. So every element of the pinned buffer should end up increased by NBLOCKS*2 (here +6). The output is wrong… Can someone help me please? I really need to make this work; otherwise I will have to fall back to traditional CPU parallelism.

Here is the code:

/* --------------------------- header section ----------------------------*/

#include<stdio.h>

#include<cuda.h>

#define NNEUR 10

#define NVarNEUR 4

#define NBLOCKS	(int)	3

/* --------------------------- target code ------------------------------*/

__global__ void TEST(float *IN)

{

	int idx=threadIdx.x;

	int i;

	float yout[NVarNEUR];

	for (i=0;i<NVarNEUR;i++) yout[i]=IN[idx*NVarNEUR+i];

	for (i=0;i<NVarNEUR;i++) {

		IN[idx*NVarNEUR+i]=yout[i]+2.0;

	}

	

	__syncthreads();

}

/* --------------------------- host code ------------------------------*/

int main (int argc, char *argv[])

{

int i,j;

float *IN,*IN_d;

cudaSetDevice(0);

cudaSetDeviceFlags(cudaDeviceMapHost);

cudaHostAlloc((void **)&IN, sizeof(float)*NVarNEUR*NNEUR, cudaHostAllocMapped);

for (i=0;i<(NNEUR);i++) {

	IN[i*NVarNEUR]=1.0;

	IN[i*NVarNEUR+1]=2.0;

	IN[i*NVarNEUR+2]=3.0;

	IN[i*NVarNEUR+3]=4.0;

}

cudaHostGetDevicePointer(&IN_d, IN, 0);

// allocate and initialize an array of stream handles and events

cudaStream_t *streams = (cudaStream_t*) malloc((NBLOCKS) * sizeof(cudaStream_t));

for(i = 0; i < (NBLOCKS); i++) cudaStreamCreate(&(streams[i]));

cudaEvent_t *kernelEvent;

kernelEvent = (cudaEvent_t*) malloc(NBLOCKS * sizeof(cudaEvent_t));

for(int i = 0; i < NBLOCKS; i++) cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming);

TEST<<<1,NNEUR,0,streams[0]>>>(IN_d);

cudaEventRecord(kernelEvent[0], streams[0]);

for (i=1;i<NBLOCKS;i++){

	TEST<<<1,NNEUR,0,streams[i]>>>(IN_d);

	cudaEventRecord(kernelEvent[i], streams[i]);

	cudaStreamWaitEvent(streams[i], kernelEvent[i-1],0);

}

for (i=0;i<NNEUR;i++) {

	for (j=0;j<NVarNEUR;j++) printf("IN: %f \t\t",IN[i*NVarNEUR+j]);

	printf("\n");

}

cudaDeviceSynchronize();

for(int i = 0; i < NBLOCKS; i++) {

	cudaStreamDestroy(streams[i]); 

	cudaEventDestroy(kernelEvent[i]);

}

cudaFreeHost(IN);

}

And the results:

IN: 3.000000 		IN: 6.000000 		IN: 5.000000 		IN: 6.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000

Thank you for helping me…

Pi-r

Hi,

you say: "I do this using a serialization of kernels". That is not true in your case, because of the streams. By putting commands in different streams you tell the GPU that they may be executed concurrently and out of order.

To achieve what you want, launch all the kernels on the same stream.

Remove everything that has to do with streams from your code.

Add instead:

for (int i = 0; i < NBLOCKS; i++)

	TEST<<<1,NNEUR>>>(IN_d);

cudaDeviceSynchronize();
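
For completeness: if you really did need to chain work across different streams, the cudaStreamWaitEvent would have to be issued before the dependent launch, not after it as in your loop. A sketch, reusing the streams and kernelEvent arrays from your code:

TEST<<<1,NNEUR,0,streams[0]>>>(IN_d);
cudaEventRecord(kernelEvent[0], streams[0]);
for (int i = 1; i < NBLOCKS; i++) {
    cudaStreamWaitEvent(streams[i], kernelEvent[i-1], 0); /* wait first */
    TEST<<<1,NNEUR,0,streams[i]>>>(IN_d);                 /* then launch */
    cudaEventRecord(kernelEvent[i], streams[i]);
}
cudaDeviceSynchronize();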

Hello,

First of all, thank you Brano for your quick answer. I tried what you said, but the results are still not right. Not every thread seems to compute the same thing, and the results are not reproducible: from run to run I get different values, which is annoying… Is there something I am missing here?

In parallel, I checked with the CUDA profiler when the kernels from my original post are launched, and they do execute in sequence as I expected. The problem seems to be in the memory transfer…

Pierre

Check the return values of your CUDA calls; it's entirely possible that you're generating an out-of-bounds access or something similar and the context is being destroyed (meaning the copy isn't happening at all).
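
A minimal error-checking sketch (the CUDA_CHECK name here is just an illustration, not an official macro):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Wrap every CUDA runtime call; abort with a readable message on failure. */
#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    } while (0)

/* Kernel launches return no status, so check them separately:

    TEST<<<1,NNEUR>>>(IN_d);
    CUDA_CHECK(cudaGetLastError());       // launch errors
    CUDA_CHECK(cudaDeviceSynchronize());  // execution errors
*/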

Hi,

I changed your code to this and it produces the correct results.


/* --------------------------- header section ----------------------------*/

#include<stdio.h>

#include<cuda.h>

#define NNEUR 10

#define NVarNEUR 4

#define NBLOCKS (int)   3

/* --------------------------- target code ------------------------------*/

__global__ void TEST(float *IN)

{

	int idx=threadIdx.x;

	int i;

	float yout[NVarNEUR];

	for (i=0;i<NVarNEUR;i++) yout[i]=IN[idx*NVarNEUR+i];

	for (i=0;i<NVarNEUR;i++) {

		IN[idx*NVarNEUR+i]=yout[i]+2.0;

	}

	__syncthreads();

}

/* --------------------------- host code ------------------------------*/

int main (int argc, char *argv[])

{

int i,j;

float *IN,*IN_d;

cudaSetDevice(0);

cudaSetDeviceFlags(cudaDeviceMapHost);

cudaHostAlloc((void **)&IN, sizeof(float)*NVarNEUR*NNEUR, cudaHostAllocMapped);

for (i=0;i<(NNEUR);i++) {

        IN[i*NVarNEUR]=1.0;

        IN[i*NVarNEUR+1]=2.0;

        IN[i*NVarNEUR+2]=3.0;

        IN[i*NVarNEUR+3]=4.0;

}

cudaHostGetDevicePointer(&IN_d, IN, 0);

for (int i = 0; i < NBLOCKS; i++)

	TEST<<<1,NNEUR>>>(IN_d);

cudaDeviceSynchronize(); /* must come before the host reads IN; use cudaThreadSynchronize() on older CUDA versions */

for (i=0;i<NNEUR;i++) {

        for (j=0;j<NVarNEUR;j++) printf("IN: %f \t\t",IN[i*NVarNEUR+j]);

        printf("\n");

}

cudaFreeHost(IN);

}

And the output:
IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

Press any key to continue . . .

Hello Brano,

Thank you for your quick help. I found my bug: the cudaDeviceSynchronize() came after the printf loop, so the host was reading the mapped memory before the kernels had finished… Now it is working. Thank you!
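
In other words, with mapped pinned memory the kernels write straight into host memory, so the synchronization has to come before the host reads the buffer:

cudaDeviceSynchronize(); /* wait for the kernels to finish writing IN */
for (i=0;i<NNEUR;i++) {
    for (j=0;j<NVarNEUR;j++) printf("IN: %f \t\t",IN[i*NVarNEUR+j]);
    printf("\n");
}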

However, I am still a bit disappointed: my first version did serialize the kernel executions (I checked with computeprof), and execution was faster in my original program (the code here was just a benchmark of what was not working…)!

I think that for my problem (solving a neural network in which each neuron is represented by a stiff four-state ODE), I will have to use a more traditional multi-CPU approach.

Thank you,

Pierre

PS: I did not observe any CUDA errors during execution of the original code…

Hi,

computeprof serializes all kernel executions by default. You will not be able to see concurrent kernel execution with computeprof.

You could post a new topic describing your problem, and people may be able to help you find a way to solve it with CUDA.