Problem report: CUDA 3.1, wrong variable value, possible incorrect optimization

The following code, compiled on Linux 64-bit or Windows 7 64-bit (further specs below), produces unexpected output. This happens only in release mode, for both 32-bit and 64-bit targets.

In debug mode (GPU, G0) the results are as expected: an array of 64 floats with integer values starting at 0 and ending at 63.

The problem may be linked to optimizations of the kernel-code by the compiler.

The problem was originally reported by Pcasto. The workaround is to declare the affected variable volatile; see the code.

[codebox]// Code to show a compilation problem in CUDA 3.1, release mode, SM targets (at least) 1.0 - 1.3
// N.B. kernel code is trivial but derived from something more meaningful, exhibiting the same problem

#include <stdio.h>
#include <cuda.h>
#include "cutil_inline.h"

__global__ void kernel( float *g_array_out, const int *g_table, const int dummy1, const int dummy2 )
{
	int index_l;					// declaring index_l simply as int produces incorrect output when compiled for release
	//volatile int index_l;				// declaring index_l volatile produces correct output when compiled for release
	int j, b_pid, g_pid;
	int tid = threadIdx.x;

	for( int k = 0; k < dummy1; k++ )			// loop will only run for k = 0
	{
		b_pid = k * blockDim.x + tid;			// b_pid will be equal to tid, since k = 0
		g_pid = blockIdx.x * dummy1 * blockDim.x + b_pid;	// g_pid will be equal to b_pid and tid since blockIdx.x will be zero
		if( g_pid < dummy2 )				// will always be true
		{
			//index_l = g_pid;			// equating index_l to g_pid (instead of g_table[g_pid]) produces correct output
			index_l = g_table[ g_pid ];
			for( j = 0; j < 4; j++ )
			{
				g_array_out[index_l] = blockDim.x*j + g_pid;	// r-value is chosen to put consecutive values in g_array_out
				index_l += dummy2;		// stride equal to block size
			}
		}
	}
}

int main()
{
	float *h_array_out = 0;
	float *d_array_out = 0;
	int *d_table = 0;
	const int h_table[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
	int block_size = 16;
	int grid_size = 1;
	int device;
	cudaDeviceProp props;

	cudaGetDeviceCount(&device);				// test on all available cuda devices
	while (--device >= 0)
	{
		cutilSafeCall(cudaGetDeviceProperties(&props, device));
		if (props.major == 9999 && props.minor == 9999) continue;	// no cuda
		printf("Device %d: \"%s\" with Compute %d.%d capability, %d multiprocessors\n", device, props.name, props.major, props.minor, props.multiProcessorCount);
		cudaSetDevice(device);				// select device

		// allocate look-up table
		if (d_table==0) cudaMalloc( (void**) &d_table, sizeof(h_table) );
		cudaMemcpy( d_table, h_table, sizeof(h_table), cudaMemcpyHostToDevice );

		// allocate device array_out
		if (d_array_out==0) cudaMalloc( (void**) &d_array_out, 4*16*sizeof(float) );
		if (d_array_out == 0) { printf("couldn't allocate device memory\n"); exit(1); }

		// allocate host copy of array_out
		if (h_array_out==0) h_array_out = (float*) malloc( 4*16*sizeof(float) );

		// fill device array_out with values to see which positions are written to by the kernel
		// use host array to fill device array
		for (int i=0; i < 4*16; i++) h_array_out[i] = -1.f;
		cudaMemcpy( d_array_out, h_array_out, 4*16*sizeof(float), cudaMemcpyHostToDevice );

		// execute kernel
		kernel <<< grid_size, block_size >>>( d_array_out, d_table, 1, 16 );
		cutilCheckMsg( "Kernel execution failed" );

		// copy device array_out to host
		cudaMemcpy( h_array_out, d_array_out, 4*16*sizeof(float), cudaMemcpyDeviceToHost );

		// print values of array_out
		int error = 0;
		for( int i=0; i < 4*16; i++ )
		{
			printf( "%f\n", h_array_out[i] );
			if (h_array_out[i] != (float)i) error++;	// expected value at position i is i
		}
		if (error) printf("%d errors\n", error);
		else puts("Passed");

		// clean up device
		cudaFree(d_array_out); d_array_out = 0;
		cudaFree(d_table); d_table = 0;
	}

	// clean up host
	free(h_array_out);

	return 0;
}[/codebox]

The problem occurs on several GPUs, such as the GTX 275 and 8500 GT.

Other compiler versions, GPUs, and SM targets > 1.3 have not been tested.

Compilers: VS2008 and GCC.

Expected output: an array of 64 floating-point values with integer values starting at 0 and continuing to 63.

For reference, the original thread was here: . We should follow up in that thread.

You are right, of course. If you could move the report there (and delete this message), I would be grateful.
