device code not executing?

I have a simple test program that should be outputting two zeros, but instead it outputs two large numbers.

output:

1921233664.000000 1921233664.000000

but it should output:

0.000000 0.000000

Judging by the execution speed, I don't think the kernel is even running. But then I would expect an error, which is not the case. The relevant parts of the code are below. If anyone has any idea why this isn't working, that would be great. Thanks.

#define THREADS_PER_BLOCK 128
#define BLOCKS 24
#define THREADS THREADS_PER_BLOCK*BLOCKS

__device float d_sum_squared_error[THREADS];
__device float d_sum_squared_error_probe[THREADS];

void init(int argc, char** argv) {
CUT_DEVICE_INIT(argc,argv);
}

__global__ void zero_rmse(float *g_sum_squared_error, float *g_sum_squared_error_probe) {
g_sum_squared_error[blockIdx.x * THREADS_PER_BLOCK + threadIdx.x] = 0;
g_sum_squared_error_probe[blockIdx.x * THREADS_PER_BLOCK + threadIdx.x] = 0;
}

__global__ void sum_rmse(float *g_sum_squared_error, float *g_sum_squared_error_probe) {
__shared__ float sum_squared_error[THREADS_PER_BLOCK];
__shared__ float sum_squared_error_probe[THREADS_PER_BLOCK];

if( blockIdx.x == 0) {
	float c = 0;
	c = 0;
	for( int i = threadIdx.x; i < NUM_THREADS; i += THREADS_PER_BLOCK)
		c += g_sum_squared_error[i];
	sum_squared_error[threadIdx.x] = c;
	__syncthreads();
	if( threadIdx.x < warpSize ) {
		c = 0;
		for( int i = threadIdx.x; i < THREADS_PER_BLOCK; i += warpSize)
			c += sum_squared_error[i];
		sum_squared_error[threadIdx.x] = c;
	}
	if( threadIdx.x == 0 ) {
		for( int i = 1; i < warpSize; i ++)
			c += sum_squared_error[i];
		g_sum_squared_error[0] = c;
	}
}
if( blockIdx.x == 1) {
	float c = 0;
	c = 0;
	for( int i = threadIdx.x; i < NUM_THREADS; i += THREADS_PER_BLOCK)
		c += g_sum_squared_error_probe[i];
	sum_squared_error_probe[threadIdx.x] = c;
	__syncthreads();
	if( threadIdx.x < warpSize ) {
		c = 0;
		for( int i = threadIdx.x; i < THREADS_PER_BLOCK; i += warpSize)
			c += sum_squared_error_probe[i];
		sum_squared_error_probe[threadIdx.x] = c;
	}
	if( threadIdx.x == 0 ) {
		for( int i = 1; i < warpSize; i ++)
			c += sum_squared_error_probe[i];
		g_sum_squared_error_probe[0] = c;
	}
}

}

void exec() {
float sum_squared_error, sum_squared_error_probe;

    ...

dim3 dimGrid(BLOCKS,1);
dim3 dimBlock(THREADS_PER_BLOCK,1);

zero_rmse<<<dimGrid,dimBlock>>>(d_sum_squared_error,d_sum_squared_error_probe);
CUT_CHECK_ERROR("error in zero_rmse");

CUDA_SAFE_CALL(cudaThreadSynchronize());

sum_rmse<<<dimGrid,dimBlock>>>(d_sum_squared_error,d_sum_squared_error_probe);
CUT_CHECK_ERROR("error in sum_rmse");

CUDA_SAFE_CALL(cudaThreadSynchronize());

CUDA_SAFE_CALL(cudaMemcpy(&sum_squared_error,d_sum_squared_error,sizeof(float), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(&sum_squared_error_probe,d_sum_squared_error_probe,sizeof(float), cudaMemcpyDeviceToHost));

printf("%f %f\n",sum_squared_error,sum_squared_error_probe);

    ...

}

void main( int argc, char** argv) {

init(argc,argv);

exec();

}

Just for slightly better readability:

#define THREADS_PER_BLOCK 128
#define BLOCKS 24
#define THREADS THREADS_PER_BLOCK*BLOCKS

__device float d_sum_squared_error[THREADS];
__device float d_sum_squared_error_probe[THREADS];

void init(int argc, char** argv) {
	CUT_DEVICE_INIT(argc,argv);
}

__global__ void zero_rmse(float *g_sum_squared_error, float *g_sum_squared_error_probe) {
	g_sum_squared_error[blockIdx.x * THREADS_PER_BLOCK + threadIdx.x] = 0;
	g_sum_squared_error_probe[blockIdx.x * THREADS_PER_BLOCK + threadIdx.x] = 0;
}

__global__ void sum_rmse(float *g_sum_squared_error, float *g_sum_squared_error_probe) {
	__shared__ float sum_squared_error[THREADS_PER_BLOCK];
	__shared__ float sum_squared_error_probe[THREADS_PER_BLOCK];

	if( blockIdx.x == 0) {
		float c = 0;
		c = 0;
		for( int i = threadIdx.x; i < NUM_THREADS; i += THREADS_PER_BLOCK)
			c += g_sum_squared_error[i];
		sum_squared_error[threadIdx.x] = c;
		__syncthreads();
		if( threadIdx.x < warpSize ) {
			c = 0;
			for( int i = threadIdx.x; i < THREADS_PER_BLOCK; i += warpSize)
				c += sum_squared_error[i];
			sum_squared_error[threadIdx.x] = c;
		}
		if( threadIdx.x == 0 ) {
			for( int i = 1; i < warpSize; i ++)
				c += sum_squared_error[i];
			g_sum_squared_error[0] = c;
		}
	}
	if( blockIdx.x == 1) {
		float c = 0;
		c = 0;
		for( int i = threadIdx.x; i < NUM_THREADS; i += THREADS_PER_BLOCK)
			c += g_sum_squared_error_probe[i];
		sum_squared_error_probe[threadIdx.x] = c;
		__syncthreads();
		if( threadIdx.x < warpSize ) {
			c = 0;
			for( int i = threadIdx.x; i < THREADS_PER_BLOCK; i += warpSize)
				c += sum_squared_error_probe[i];
			sum_squared_error_probe[threadIdx.x] = c;
		}
		if( threadIdx.x == 0 ) {
			for( int i = 1; i < warpSize; i ++)
				c += sum_squared_error_probe[i];
			g_sum_squared_error_probe[0] = c;
		}
	}
}

void exec() {
	float sum_squared_error, sum_squared_error_probe;

	...

	dim3 dimGrid(BLOCKS,1);
	dim3 dimBlock(THREADS_PER_BLOCK,1);

	zero_rmse<<<dimGrid,dimBlock>>>(d_sum_squared_error,d_sum_squared_error_probe);
	CUT_CHECK_ERROR("error in zero_rmse");

	CUDA_SAFE_CALL(cudaThreadSynchronize());

	sum_rmse<<<dimGrid,dimBlock>>>(d_sum_squared_error,d_sum_squared_error_probe);
	CUT_CHECK_ERROR("error in sum_rmse");

	CUDA_SAFE_CALL(cudaThreadSynchronize());

	CUDA_SAFE_CALL(cudaMemcpy(&sum_squared_error,d_sum_squared_error,sizeof(float), cudaMemcpyDeviceToHost));
	CUDA_SAFE_CALL(cudaMemcpy(&sum_squared_error_probe,d_sum_squared_error_probe,sizeof(float), cudaMemcpyDeviceToHost));

	printf("%f %f\n",sum_squared_error,sum_squared_error_probe);

	...

}

void main( int argc, char** argv) {

	...

	init(argc,argv);

	...

	exec();

	...

}

I don't know if the "__device" is working; I've never seen those before.

Try putting this after your kernel invocation to see if there is an error; it always works for me:

cudaThreadSynchronize();
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess)
	printf("error: %s\n", cudaGetErrorString(error));
// check if kernel execution generated an error
// CUT_CHECK_ERROR("Kernel execution failed");

Also try running the program in device emulation mode by passing nvcc the -deviceemu argument, and add some printfs to see the values inside your kernel.
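For example, something like this (just a sketch reusing your zero_rmse kernel; the file and output names are placeholders):

nvcc -deviceemu -o rmse_emu rmse.cu

__global__ void zero_rmse(float *g_sum_squared_error, float *g_sum_squared_error_probe) {
	int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
	g_sum_squared_error[idx] = 0;
	g_sum_squared_error_probe[idx] = 0;
	// in emulation mode the kernel runs on the host, so printf is allowed here
	if (idx == 0)
		printf("zero_rmse ran, first element = %f\n", g_sum_squared_error[0]);
}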

If the values inside the kernel are good, then the memcpy probably isn't doing what you planned, because those numbers look like uninitialized values.

The __device was actually supposed to be __device__, of course - that was just a typo in copying over the code.

I put the error check in as you suggested and get an "invalid device function" error after each kernel invocation. In device emulation mode, however, I just get a segmentation fault.

Am I allocating device memory wrong? (I have a lot of other data structures that need allocating, and I want to allocate some of them as textures for performance reasons.)

I figured it out, thanks to the getLastError and device emulation mode suggestions. Apparently the __device__ variables don't do anything. What one needs to do is declare a normal (host-side) pointer (for instance: float* var;) and then use cudaMalloc.
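In case it helps anyone else, a minimal sketch of the change, using the same names as the code above (where exactly the allocation happens is up to you):

float *d_sum_squared_error;        // plain host-side pointers to device memory
float *d_sum_squared_error_probe;

// during initialization, allocate the arrays with cudaMalloc instead of
// declaring them as __device__ globals
CUDA_SAFE_CALL(cudaMalloc((void**)&d_sum_squared_error, THREADS * sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_sum_squared_error_probe, THREADS * sizeof(float)));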

But then I don't understand how one would declare a region of device memory as constant memory or texture memory, nor how one would use copy-to-symbol (cudaMemcpyToSymbol).

I've never worked with textures, but constants are not that hard. What I do is declare a constant variable in the .h file for my kernel, like:

__constant__ float planes[1024];

Then in main, or whatever C/C++ function you want to use it from, I have some code like this:

CUDA_SAFE_CALL(cudaMemcpyToSymbol("planes", _planes, (1024)*sizeof(float)));

where _planes is, of course, already filled with the values you want.

Hope this helps

Thanks,

Jordy

EDIT: This example shows it for an array, but the same can be done for just a single variable.
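Putting the pieces together, a self-contained sketch might look like this (the use_planes kernel is just an illustration of reading the constant by name, not code from a real project):

__constant__ float planes[1024];               // lives in constant memory on the device

__global__ void use_planes(float *out) {
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	out[idx] = planes[idx % 1024];             // kernels read the constant directly by name
}

void upload_planes(const float *_planes) {
	// copy host data to the constant symbol; no cudaMalloc is needed for __constant__ variables
	// (newer CUDA versions take the symbol itself rather than a string name)
	CUDA_SAFE_CALL(cudaMemcpyToSymbol("planes", _planes, 1024 * sizeof(float)));
}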