QUIT CUDA? Kernel and pinned memory give strange results

Hello,

I do not understand why my kernel is not doing the appropriate computations. I allocated pinned memory on the host and wrote a very simple kernel that takes the pinned buffer as input and adds 2.0 to each of its elements. I do this with a sequence of kernel launches (this is in fact a test case for a more complicated problem that I would like to solve), repeated NBLOCKS times. So every element of the pinned buffer should end up increased by NBLOCKS*2 (here +6). The output is wrong… Can someone help me please? I really need to make this work; otherwise I will have to fall back to traditional CPU parallelism.

Here is the code:

/* --------------------------- header section ----------------------------*/

#include<stdio.h>

#include<cuda.h>

#define NNEUR 10

#define NVarNEUR 4

#define NBLOCKS	(int)	3

/* --------------------------- target code ------------------------------*/

__global__ void TEST(float *IN)

{

	int idx=threadIdx.x;

	int i;

	float yout[NVarNEUR];

	for (i=0;i<NVarNEUR;i++) yout[i]=IN[idx*NVarNEUR+i];

	for (i=0;i<NVarNEUR;i++) {

		IN[idx*NVarNEUR+i]=yout[i]+2.0;

	}

	

	__syncthreads();

}

/* --------------------------- host code ------------------------------*/

int main (int argc, char *argv[])

{

int i,j;

float *IN,*IN_d;

cudaSetDevice(0);

cudaSetDeviceFlags(cudaDeviceMapHost);

cudaHostAlloc((void **)&IN, sizeof(float)*NVarNEUR*NNEUR, cudaHostAllocMapped);

for (i=0;i<(NNEUR);i++) {

	IN[i*NVarNEUR]=1.0;

	IN[i*NVarNEUR+1]=2.0;

	IN[i*NVarNEUR+2]=3.0;

	IN[i*NVarNEUR+3]=4.0;

}

cudaHostGetDevicePointer(&IN_d, IN, 0);

// allocate and initialize an array of stream handles and events

cudaStream_t *streams = (cudaStream_t*) malloc((NBLOCKS) * sizeof(cudaStream_t));

for(i = 0; i < (NBLOCKS); i++) cudaStreamCreate(&(streams[i]));

cudaEvent_t *kernelEvent;

kernelEvent = (cudaEvent_t*) malloc(NBLOCKS * sizeof(cudaEvent_t));

for(int i = 0; i < NBLOCKS; i++) cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming);

TEST<<<1,NNEUR,0,streams[0]>>>(IN_d);

cudaEventRecord(kernelEvent[0], streams[0]);

for (i=1;i<NBLOCKS;i++){

	TEST<<<1,NNEUR,0,streams[i]>>>(IN_d);

	cudaEventRecord(kernelEvent[i], streams[i]);

	cudaStreamWaitEvent(streams[i], kernelEvent[i-1],0);

}

for (i=0;i<NNEUR;i++) {

	for (j=0;j<NVarNEUR;j++) printf("IN: %f \t\t",IN[i*NVarNEUR+j]);

	printf("\n");

}

cudaDeviceSynchronize();

for(int i = 0; i < NBLOCKS; i++) {

	cudaStreamDestroy(streams[i]); 

	cudaEventDestroy(kernelEvent[i]);

}

cudaFreeHost(IN);

}

And the results:

IN: 3.000000 		IN: 6.000000 		IN: 5.000000 		IN: 6.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000 		

IN: 7.000000 		IN: 6.000000 		IN: 7.000000 		IN: 8.000000

Thank you for helping me…

Pi-r

Hi,

you say: "I do this using a serialization of kernels". That is not true in your case, because of the streams. By putting commands in different streams you tell the GPU that they may be executed concurrently and out of order.

To achieve what you want, launch all the kernels on the same stream.

Remove everything that has to do with streams from your code.

Add instead:

for (int i = 0; i < NBLOCKS; i++)

	TEST<<<1,NNEUR>>>(IN_d);

cudaDeviceSynchronize();
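
For completeness: if you really did need to chain work across different streams, the cudaStreamWaitEvent would have to be issued before the dependent launch, not after it as in your loop. A sketch, reusing the streams and kernelEvent arrays from your code:

TEST<<<1,NNEUR,0,streams[0]>>>(IN_d);
cudaEventRecord(kernelEvent[0], streams[0]);
for (int i = 1; i < NBLOCKS; i++) {
    cudaStreamWaitEvent(streams[i], kernelEvent[i-1], 0); /* wait first */
    TEST<<<1,NNEUR,0,streams[i]>>>(IN_d);                 /* then launch */
    cudaEventRecord(kernelEvent[i], streams[i]);
}
cudaDeviceSynchronize();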

Hello,

First of all, thank you Brano for your quick answer. I tried what you said, but the results are still not right. Not every thread seems to compute the same thing, and the results are not reproducible: from run to run I get different values, which is annoying… Is there something I am missing here?

In parallel, I checked with the CUDA profiler when the kernels from my original post are launched, and they do execute in sequence as I expected. The problem seems to be in the memory transfer…

Pierre

Check the return values of your CUDA calls; it's entirely possible that you're generating an out-of-bounds access or something similar and the context is being destroyed (meaning the copy isn't happening at all).
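
A minimal error-checking sketch (the CUDA_CHECK name here is just an illustration, not an official macro):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Wrap every CUDA runtime call; abort with a readable message on failure. */
#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    } while (0)

/* Kernel launches return no status, so check them separately:

    TEST<<<1,NNEUR>>>(IN_d);
    CUDA_CHECK(cudaGetLastError());       // launch errors
    CUDA_CHECK(cudaDeviceSynchronize());  // execution errors
*/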

Hi,

I changed your code to this and it produces the correct results.


/* --------------------------- header section ----------------------------*/

#include<stdio.h>

#include<cuda.h>

#define NNEUR 10

#define NVarNEUR 4

#define NBLOCKS (int)   3

/* --------------------------- target code ------------------------------*/

__global__ void TEST(float *IN)

{

	int idx=threadIdx.x;

	int i;

	float yout[NVarNEUR];

	for (i=0;i<NVarNEUR;i++) yout[i]=IN[idx*NVarNEUR+i];

	for (i=0;i<NVarNEUR;i++) {

		IN[idx*NVarNEUR+i]=yout[i]+2.0;

	}

	__syncthreads();

}

/* --------------------------- host code ------------------------------*/

int main (int argc, char *argv[])

{

int i,j;

float *IN,*IN_d;

cudaSetDevice(0);

cudaSetDeviceFlags(cudaDeviceMapHost);

cudaHostAlloc((void **)&IN, sizeof(float)*NVarNEUR*NNEUR, cudaHostAllocMapped);

for (i=0;i<(NNEUR);i++) {

        IN[i*NVarNEUR]=1.0;

        IN[i*NVarNEUR+1]=2.0;

        IN[i*NVarNEUR+2]=3.0;

        IN[i*NVarNEUR+3]=4.0;

}

cudaHostGetDevicePointer(&IN_d, IN, 0);

for (int i = 0; i < NBLOCKS; i++)

	TEST<<<1,NNEUR>>>(IN_d);

cudaDeviceSynchronize(); /* must come before the host reads IN; use cudaThreadSynchronize() on older CUDA versions */

for (i=0;i<NNEUR;i++) {

        for (j=0;j<NVarNEUR;j++) printf("IN: %f \t\t",IN[i*NVarNEUR+j]);

        printf("\n");

}

cudaFreeHost(IN);

}

And the output:
IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

IN: 7.000000            IN: 8.000000            IN: 9.000000            IN: 10.000000

Press any key to continue . . .

Hello Brano,

Thank you for your quick help. I found my bug: the cudaDeviceSynchronize() came after the printf loop, so the host was reading the mapped memory before the kernels had finished… Now it is working. Thank you!
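
In other words, with mapped pinned memory the kernels write straight into host memory, so the synchronization has to come before the host reads the buffer:

cudaDeviceSynchronize(); /* wait for the kernels to finish writing IN */
for (i=0;i<NNEUR;i++) {
    for (j=0;j<NVarNEUR;j++) printf("IN: %f \t\t",IN[i*NVarNEUR+j]);
    printf("\n");
}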

However, I am still a bit disappointed: my first version did serialize the kernel executions (I checked with computeprof), and execution was faster in my original program (the code here was just a benchmark of what was not working…)!

I think that for my problem (solving a neural network in which each neuron is represented by a stiff four-state ODE), I will have to use a more traditional multi-CPU approach.

Thank you,

Pierre

PS: I did not observe any CUDA errors during execution of the original code…

Hi,

computeprof serializes all kernel executions by default. You will not be able to see concurrent kernel execution with computeprof.

You could post a new topic describing your problem, and people may be able to help you find a way to solve it with CUDA.