Different results on device and Emulation mode

Hi everyone,

I am having a problem running my CUDA program on the device. :(

It works fine in EmuDebug mode but gives invalid results on the device.

Can anyone tell me what the problem could be in the following code?

// simple cuda program to find circular convolution

#define MEM_SIZE 4

__global__ void flip(int *x, int *fx)
{
    // Thread index
    int tx = threadIdx.x;
    fx[(MEM_SIZE - tx) % MEM_SIZE] = x[tx];
}

__global__ void conv(int *fx, int *h, int *y)
{
    // Block index
    int bx = blockIdx.x;
    // Thread index
    int tx = threadIdx.x;
    int ind = (MEM_SIZE + (tx - bx)) % MEM_SIZE;

    __shared__ __device__ int sum;

    sum += h[tx] * fx[ind];
    __syncthreads();

    y[bx] = sum;
    __syncthreads();

    sum = 0;
}

int main()
{
    CUT_DEVICE_INIT(); // must call to initialize CUDA

    int x[16] = {1, 2, 3, 0};
    int h[16] = {10, 20, 0, 0};

    int *d_x;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_x, MEM_SIZE * sizeof(int)));
    int *d_h;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_h, MEM_SIZE * sizeof(int)));

    // copy host memory to device
    CUDA_SAFE_CALL(cudaMemcpy(d_x, x, MEM_SIZE * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_h, h, MEM_SIZE * sizeof(int), cudaMemcpyHostToDevice));

    int fx[MEM_SIZE] = {0};
    int y[MEM_SIZE] = {0};

    int *d_fx;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_fx, MEM_SIZE * sizeof(int)));

    // setup execution parameters
    dim3 threads(MEM_SIZE, 1);
    dim3 grid(1, 1);

    // execute the kernel
    flip <<< grid, threads >>> (d_x, d_fx);

    // copy result from device to host
    CUDA_SAFE_CALL(cudaMemcpy(fx, d_fx, MEM_SIZE * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(d_x));

    printf("\n FlippedX = ");
    for (int i = 0; i < MEM_SIZE; i++) {
        printf(" %d,", fx[i]);
    }

    int *d_y;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_y, MEM_SIZE * sizeof(int)));

    // setup execution parameters
    dim3 convThread(MEM_SIZE, 1);
    dim3 convGrid(MEM_SIZE, 1);

    // execute the kernel
    conv <<< convGrid, convThread >>> (d_fx, d_h, d_y);

    // copy result from device to host
    CUDA_SAFE_CALL(cudaMemcpy(y, d_y, MEM_SIZE * sizeof(int), cudaMemcpyDeviceToHost));

    // check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    printf("\n Result Y = ");
    for (int i = 0; i < MEM_SIZE; i++) {
        printf(" %d,", y[i]);
    }

    // clean up memory
    CUDA_SAFE_CALL(cudaFree(d_fx));
    CUDA_SAFE_CALL(cudaFree(d_h));
    CUDA_SAFE_CALL(cudaFree(d_y));
}

You are using a shared memory variable to accumulate the sum. This will work in emulation mode because it is serialized. Use a reduction tree (there are several examples in the SDK) to do this.
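Something like this would do it for your conv kernel (an untested sketch; it assumes one block per output element with MEM_SIZE threads, and that MEM_SIZE is a power of two):

__global__ void conv(int *fx, int *h, int *y)
{
    __shared__ int partial[MEM_SIZE];
    int bx = blockIdx.x;
    int tx = threadIdx.x;
    int ind = (MEM_SIZE + (tx - bx)) % MEM_SIZE;

    partial[tx] = h[tx] * fx[ind];  // each thread writes its own slot, so no race
    __syncthreads();

    // reduction tree: halve the number of active threads at each step
    for (int s = MEM_SIZE / 2; s > 0; s >>= 1) {
        if (tx < s)
            partial[tx] += partial[tx + s];
        __syncthreads();
    }

    if (tx == 0)
        y[bx] = partial[0];
}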

__shared__ __device__ int sum;

sum += h[tx] * fx[ind];

^^ There is your problem. The sum += line is executed in parallel. So all threads in a warp read the SAME value of sum (which is uninitialized, too!), then add their local value to it, and then write back to sum. Since you are writing many different values to sum at the same time, who knows what will happen.
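In other words, that one line expands to roughly a load/modify/store sequence, which is where the updates get lost:

    int tmp = sum;            // every thread loads the same (uninitialized) value
    tmp += h[tx] * fx[ind];   // each thread adds only its own product
    sum = tmp;                // all threads store at once; only one write survives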

It works in emulation mode because emulation mode processes the threads of a warp serially.

To perform that sum efficiently in parallel, see this post: http://forums.nvidia.com/index.php?showtop…ndpost&p=167921

I have solved the problem; it's working fine now.

I used the atomicAdd instruction in my case, and now it gives correct results.

But do suggest if there's a better way to do the same kind of thing in this case.

// circular convolution
///////////////////////////

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cutil.h>

#define MEM_SIZE 16

__global__ void flip(int *x, int *fx)
{
    // Thread index
    int tx = threadIdx.x;
    fx[(MEM_SIZE - tx) % MEM_SIZE] = x[tx];
}

__global__ void conv(int *fx, int *h, int *y)
{
    // Block index
    int bx = blockIdx.x;
    // Thread index
    int tx = threadIdx.x;
    int ind = (MEM_SIZE + (tx - bx)) % MEM_SIZE;

    // atomicAdd serializes the conflicting updates to y[bx];
    // it needs compute capability 1.1+ (compile with -arch=sm_11)
    atomicAdd(y + bx, h[tx] * fx[ind]);
    __syncthreads();
}

int main()
{
    CUT_DEVICE_INIT(); // must call to initialize CUDA

    int x[16] = {1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int h[16] = {10, 20, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    printf("\n X = ");
    for (int i = 0; i < MEM_SIZE; i++) {
        printf(" %d,", x[i]);
    }

    printf("\n H = ");
    for (int i = 0; i < MEM_SIZE; i++) {
        printf(" %d,", h[i]);
    }

    int *d_x;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_x, MEM_SIZE * sizeof(int))); // in device global memory
    int *d_h;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_h, MEM_SIZE * sizeof(int)));

    // copy host memory to device
    CUDA_SAFE_CALL(cudaMemcpy(d_x, x, MEM_SIZE * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_h, h, MEM_SIZE * sizeof(int), cudaMemcpyHostToDevice));

    int fx[MEM_SIZE] = {0};
    int y[MEM_SIZE] = {0};

    int *d_fx;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_fx, MEM_SIZE * sizeof(int)));

    // setup execution parameters
    dim3 threads(MEM_SIZE, 1);
    dim3 grid(1, 1);

    // execute the kernel
    flip <<< grid, threads >>> (d_x, d_fx);

    // copy result from device to host
    CUDA_SAFE_CALL(cudaMemcpy(fx, d_fx, MEM_SIZE * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(d_x));

    printf("\n FlippedX = ");
    for (int i = 0; i < MEM_SIZE; i++) {
        printf(" %d,", fx[i]);
    }

    int *d_y;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_y, MEM_SIZE * sizeof(int)));
    // the accumulators must start at zero since the kernel only adds into them
    CUDA_SAFE_CALL(cudaMemset(d_y, 0, MEM_SIZE * sizeof(int)));

    // setup execution parameters
    dim3 convThread(MEM_SIZE, 1);
    dim3 convGrid(MEM_SIZE, 1);

    // execute the kernel
    conv <<< convGrid, convThread >>> (d_fx, d_h, d_y);

    // copy result from device to host
    CUDA_SAFE_CALL(cudaMemcpy(y, d_y, MEM_SIZE * sizeof(int), cudaMemcpyDeviceToHost));

    // check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    printf("\n Result Y = ");
    for (int i = 0; i < MEM_SIZE; i++) {
        printf(" %d,", y[i]);
    }

    // clean up memory
    CUDA_SAFE_CALL(cudaFree(d_fx));
    CUDA_SAFE_CALL(cudaFree(d_h));
    CUDA_SAFE_CALL(cudaFree(d_y));
}

Hello everyone,

I chose not to open a new topic because I had the same problem as described above.

Host code line:

cudaMemcpyToSymbol("d_kernel", h_kernel,MAX_KERSIZE*MAX_KERSIZE*sizeof(double), 0, cudaMemcpyHostToDevice);

__device__ __constant__ double d_kernel[MAX_KERSIZE*MAX_KERSIZE];

__global__ void convp(pgm d_image, pgm d_oimage, kernel d_ker)
{
    __shared__ double block[BLOCK_SIZE][BLOCK_SIZE];
    int val, out;
    int tx, ty, u, v, x, y;
    double sum;
    int sz;

    tx = threadIdx.x;
    ty = threadIdx.y;

    block[ty][tx] = (float)d_image.paddedIm[ty*BLOCK_SIZE + tx];
    __syncthreads();

    if (tx >= d_ker.kernelRadius && tx <= (blockDim.x - 1 - d_ker.kernelRadius) &&
        ty >= d_ker.kernelRadius && ty <= (blockDim.y - 1 - d_ker.kernelRadius))
    {
        sum = 0.0;
        for (v = 0; v < d_ker.size; v++)
        {
            for (u = 0; u < d_ker.size; u++)
            {
                x = tx + u - d_ker.kernelRadius;
                y = ty + v - d_ker.kernelRadius;
                sum += block[y][x] * d_kernel[v*MAX_KERSIZE + u];
            }
        }

        val = abs((int)(sum / (double)d_ker.scale));
        out = (ty - d_ker.kernelRadius)*(BLOCK_SIZE - 2*d_ker.kernelRadius) + (tx - d_ker.kernelRadius);
        d_oimage.pData[out] = val;
    }
    __syncthreads();
}

The problem is that I get the correct solution in emulation mode but all zeroes in device mode. What I observed is that the values of the array d_kernel[v*MAX_KERSIZE + u] are apparently zero; I have declared d_kernel as a constant variable, as shown above. Before this I had tried using global memory instead of constant memory, and the output was still wrong, that is, all zeroes.

If you ask why I am so sure these values are zeroes: if I simply perform sum += block[y][x], i.e. a summation of all elements in that 3*3 window, then I can see the average in the output.
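One quick way to confirm what the constant memory actually contains is to copy it back to the host with cudaMemcpyFromSymbol and print it (a minimal sketch, reusing the same names as above):

    double h_check[MAX_KERSIZE*MAX_KERSIZE];
    cudaMemcpyFromSymbol(h_check, "d_kernel",
                         MAX_KERSIZE*MAX_KERSIZE*sizeof(double), 0,
                         cudaMemcpyDeviceToHost);
    for (int i = 0; i < MAX_KERSIZE*MAX_KERSIZE; i++)
        printf(" %f", h_check[i]);  // should print the kernel coefficients, not zeroes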

I don't think what you said above applies here, that the same variable 'sum' is being accessed by all threads: here sum is a local variable, so each thread is allocated its own copy. Of course it is true that the same code is executed by all threads, but that is a totally different thing.

Can someone please tell me why it is giving all zeroes as output?

I found what was wrong with it: doubles are not supported in device mode on my card, so I changed them to floats and now it works :)
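For anyone who runs into the same thing: double precision needs a GPU with compute capability 1.3 or higher, and for earlier targets the compiler demotes doubles to floats. You can query what your card supports with cudaGetDeviceProperties (a minimal sketch):

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);  // query device 0
    printf("Compute capability: %d.%d\n", prop.major, prop.minor);
    // native double support requires major > 1, or major == 1 && minor >= 3
    if (prop.major == 1 && prop.minor < 3)
        printf("No native double precision on this device.\n");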