Different results on device and Emulation mode

Hi everyone,

I am having a problem running my CUDA program on the device. :(

It works fine in EmuDebug mode but gives invalid results on the device.

Can anyone tell me what the problem could be in the following code?

// simple cuda program to find circular convolution

#define MEM_SIZE 4

__global__ void flip(int *x, int *fx)
{
    // Thread index
    int tx = threadIdx.x;
    fx[(MEM_SIZE - tx) % MEM_SIZE] = x[tx];
}

__global__ void conv(int *fx, int *h, int *y)
{
    // Block index
    int bx = blockIdx.x;
    // Thread index
    int tx = threadIdx.x;
    int ind = (MEM_SIZE + (tx - bx)) % MEM_SIZE;

    __shared__ __device__ int sum;

    sum += h[tx] * fx[ind];
    __syncthreads();

    y[bx] = sum;
    __syncthreads();

    sum = 0;
}

int main()
{
    CUT_DEVICE_INIT(); // must call to initialize CUDA

    int x[16] = {1, 2, 3, 0};
    int h[16] = {10, 20, 0, 0};

    int *d_x;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_x, MEM_SIZE * sizeof(int)));
    int *d_h;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_h, MEM_SIZE * sizeof(int)));

    // copy host memory to device
    CUDA_SAFE_CALL(cudaMemcpy(d_x, x, MEM_SIZE * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_h, h, MEM_SIZE * sizeof(int), cudaMemcpyHostToDevice));

    int fx[MEM_SIZE] = {0};
    int y[MEM_SIZE] = {0};

    int *d_fx;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_fx, MEM_SIZE * sizeof(int)));

    // setup execution parameters
    dim3 threads(MEM_SIZE, 1);
    dim3 grid(1, 1);

    // execute the kernel
    flip <<< grid, threads >>> (d_x, d_fx);

    // copy result from device to host
    CUDA_SAFE_CALL(cudaMemcpy(fx, d_fx, MEM_SIZE * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(d_x));

    printf("\n FlippedX = ");
    for (int i = 0; i < MEM_SIZE; i++) {
        printf(" %d,", fx[i]);
    }

    int *d_y;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_y, MEM_SIZE * sizeof(int)));

    // setup execution parameters
    dim3 convThread(MEM_SIZE, 1);
    dim3 convGrid(MEM_SIZE, 1);

    // execute the kernel
    conv <<< convGrid, convThread >>> (d_fx, d_h, d_y);

    // copy result from device to host
    CUDA_SAFE_CALL(cudaMemcpy(y, d_y, MEM_SIZE * sizeof(int), cudaMemcpyDeviceToHost));

    // check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    printf("\n Result Y = ");
    for (int i = 0; i < MEM_SIZE; i++) {
        printf(" %d,", y[i]);
    }

    // clean up memory
    CUDA_SAFE_CALL(cudaFree(d_fx));
    CUDA_SAFE_CALL(cudaFree(d_h));
    CUDA_SAFE_CALL(cudaFree(d_y));
}

You are using a shared memory variable to accumulate the sum. This will work in emulation mode because it is serialized. Use a reduction tree (there are several examples in the SDK) to do this.
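Something like this would do it for your conv kernel (an untested sketch; it assumes one block per output element with MEM_SIZE threads, and that MEM_SIZE is a power of two):

__global__ void conv(int *fx, int *h, int *y)
{
    __shared__ int partial[MEM_SIZE];
    int bx = blockIdx.x;
    int tx = threadIdx.x;
    int ind = (MEM_SIZE + (tx - bx)) % MEM_SIZE;

    partial[tx] = h[tx] * fx[ind];  // each thread writes its own slot, so no race
    __syncthreads();

    // reduction tree: halve the number of active threads at each step
    for (int s = MEM_SIZE / 2; s > 0; s >>= 1) {
        if (tx < s)
            partial[tx] += partial[tx + s];
        __syncthreads();
    }

    if (tx == 0)
        y[bx] = partial[0];
}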

__shared__ __device__ int sum;

sum += h[tx] * fx[ind];

^^ There is your problem. The sum += line is executed in parallel. So all threads in a warp read the SAME value of sum (which is uninitialized, too!), then add their local value to it, and then write back to sum. Since you are writing many different values to sum at the same time, who knows what will happen.
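In other words, that one line expands to roughly a load/modify/store sequence, which is where the updates get lost:

    int tmp = sum;            // every thread loads the same (uninitialized) value
    tmp += h[tx] * fx[ind];   // each thread adds only its own product
    sum = tmp;                // all threads store at once; only one write survives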

It works in emulation mode because emulation mode processes the threads of a warp serially.

To perform that sum efficiently in parallel, see this post: http://forums.nvidia.com/index.php?showtop…ndpost&p=167921

I have solved the problem; it's working fine now.

I used the atomicAdd instruction in my case, and now it gives correct results.

But do suggest if there's a better way to do the same kind of thing in this case.

// circular convolution
///////////////////////////

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cutil.h>

#define MEM_SIZE 16

__global__ void flip(int *x, int *fx)
{
    // Thread index
    int tx = threadIdx.x;
    fx[(MEM_SIZE - tx) % MEM_SIZE] = x[tx];
}

__global__ void conv(int *fx, int *h, int *y)
{
    // Block index
    int bx = blockIdx.x;
    // Thread index
    int tx = threadIdx.x;
    int ind = (MEM_SIZE + (tx - bx)) % MEM_SIZE;

    // atomicAdd serializes the conflicting updates to y[bx];
    // it needs compute capability 1.1+ (compile with -arch=sm_11)
    atomicAdd(y + bx, h[tx] * fx[ind]);
    __syncthreads();
}

int main()
{
    CUT_DEVICE_INIT(); // must call to initialize CUDA

    int x[16] = {1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int h[16] = {10, 20, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    printf("\n X = ");
    for (int i = 0; i < MEM_SIZE; i++) {
        printf(" %d,", x[i]);
    }

    printf("\n H = ");
    for (int i = 0; i < MEM_SIZE; i++) {
        printf(" %d,", h[i]);
    }

    int *d_x;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_x, MEM_SIZE * sizeof(int))); // in device global memory
    int *d_h;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_h, MEM_SIZE * sizeof(int)));

    // copy host memory to device
    CUDA_SAFE_CALL(cudaMemcpy(d_x, x, MEM_SIZE * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_h, h, MEM_SIZE * sizeof(int), cudaMemcpyHostToDevice));

    int fx[MEM_SIZE] = {0};
    int y[MEM_SIZE] = {0};

    int *d_fx;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_fx, MEM_SIZE * sizeof(int)));

    // setup execution parameters
    dim3 threads(MEM_SIZE, 1);
    dim3 grid(1, 1);

    // execute the kernel
    flip <<< grid, threads >>> (d_x, d_fx);

    // copy result from device to host
    CUDA_SAFE_CALL(cudaMemcpy(fx, d_fx, MEM_SIZE * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(d_x));

    printf("\n FlippedX = ");
    for (int i = 0; i < MEM_SIZE; i++) {
        printf(" %d,", fx[i]);
    }

    int *d_y;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_y, MEM_SIZE * sizeof(int)));
    // the accumulators must start at zero since the kernel only adds into them
    CUDA_SAFE_CALL(cudaMemset(d_y, 0, MEM_SIZE * sizeof(int)));

    // setup execution parameters
    dim3 convThread(MEM_SIZE, 1);
    dim3 convGrid(MEM_SIZE, 1);

    // execute the kernel
    conv <<< convGrid, convThread >>> (d_fx, d_h, d_y);

    // copy result from device to host
    CUDA_SAFE_CALL(cudaMemcpy(y, d_y, MEM_SIZE * sizeof(int), cudaMemcpyDeviceToHost));

    // check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    printf("\n Result Y = ");
    for (int i = 0; i < MEM_SIZE; i++) {
        printf(" %d,", y[i]);
    }

    // clean up memory
    CUDA_SAFE_CALL(cudaFree(d_fx));
    CUDA_SAFE_CALL(cudaFree(d_h));
    CUDA_SAFE_CALL(cudaFree(d_y));
}

Hello everyone,

I chose not to open a new topic because I had the same problem as described above.

Host code line:

cudaMemcpyToSymbol("d_kernel", h_kernel,MAX_KERSIZE*MAX_KERSIZE*sizeof(double), 0, cudaMemcpyHostToDevice);

__device__ __constant__ double d_kernel[MAX_KERSIZE*MAX_KERSIZE];

__global__ void convp(pgm d_image, pgm d_oimage, kernel d_ker)
{
    __shared__ double block[BLOCK_SIZE][BLOCK_SIZE];
    int val, out;
    int tx, ty, u, v, x, y;
    double sum;
    int sz;

    tx = threadIdx.x;
    ty = threadIdx.y;

    block[ty][tx] = (float)d_image.paddedIm[ty*BLOCK_SIZE + tx];
    __syncthreads();

    if (tx >= d_ker.kernelRadius && tx <= (blockDim.x - 1 - d_ker.kernelRadius) &&
        ty >= d_ker.kernelRadius && ty <= (blockDim.y - 1 - d_ker.kernelRadius))
    {
        sum = 0.0;
        for (v = 0; v < d_ker.size; v++)
        {
            for (u = 0; u < d_ker.size; u++)
            {
                x = tx + u - d_ker.kernelRadius;
                y = ty + v - d_ker.kernelRadius;
                sum += block[y][x] * d_kernel[v*MAX_KERSIZE + u];
            }
        }

        val = abs((int)(sum / (double)d_ker.scale));
        out = (ty - d_ker.kernelRadius)*(BLOCK_SIZE - 2*d_ker.kernelRadius) + (tx - d_ker.kernelRadius);
        d_oimage.pData[out] = val;
    }
    __syncthreads();
}

The problem is that I get the correct solution in emulation mode but all zeroes in device mode. What I observed is that the values of the array d_kernel[v*MAX_KERSIZE + u] are apparently zero; I have declared d_kernel as a constant variable, as shown above. Before this I had tried using global memory instead of constant memory, and the output was still wrong, that is, all zeroes.

If you ask why I am so sure these values are zeroes: if I simply perform sum += block[y][x], i.e. a summation of all elements in that 3*3 window, then I can see the average in the output.
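One quick way to confirm what the constant memory actually contains is to copy it back to the host with cudaMemcpyFromSymbol and print it (a minimal sketch, reusing the same names as above):

    double h_check[MAX_KERSIZE*MAX_KERSIZE];
    cudaMemcpyFromSymbol(h_check, "d_kernel",
                         MAX_KERSIZE*MAX_KERSIZE*sizeof(double), 0,
                         cudaMemcpyDeviceToHost);
    for (int i = 0; i < MAX_KERSIZE*MAX_KERSIZE; i++)
        printf(" %f", h_check[i]);  // should print the kernel coefficients, not zeroes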

I don't think what you said above applies here, that the same variable 'sum' is being accessed by all threads: here sum is a local variable, so each thread is allocated its own copy. Of course it is true that the same code is executed by all threads, but that is a totally different thing.

Can someone please tell me why it is giving all zeroes as output?

I found what was wrong with it: doubles are not supported in device mode on my card, so I changed them to floats and now it works :)
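For anyone who runs into the same thing: double precision needs a GPU with compute capability 1.3 or higher, and for earlier targets the compiler demotes doubles to floats. You can query what your card supports with cudaGetDeviceProperties (a minimal sketch):

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);  // query device 0
    printf("Compute capability: %d.%d\n", prop.major, prop.minor);
    // native double support requires major > 1, or major == 1 && minor >= 3
    if (prop.major == 1 && prop.minor < 3)
        printf("No native double precision on this device.\n");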