Hi,
below is a very very stripped down version of some code that runs fine on pre-Fermi HW and ULFs on Fermi. Basically, I cudaMalloc some float array *x, pass the pointer x+2 to the kernel, and index x[-1] in the kernel. Works fine, but unfortunately only if I mix uints (from threadIdx) and ints (the literal “-1”) in the correct way, where “correct” doesn’t match my understanding. The code should be self-explanatory, and I’ve listed 4 1/2 different versions of indexing in the kernel, two of which work everywhere and two of which only work for pre-Fermi.
Tested on 64-bit Ubuntu 10.04 LTS with toolkit 3.2 final and the matching driver.
Instructions: nvcc -arch sm_20 -g -G -O0 test.cu && ./a.out for Fermi (C2070 in my case)
and nvcc -arch sm_23 -g -G -O0 test.cu && ./a.out (GTX280 in my case)
Any insight is appreciated. Thanks!
dom
#include <stdio.h>
int N=32;
int NPADDED=34;
// demo kernel to illustrate the problem
__global__ void dummyKernel(float *x, float *y, int n)
{
// this version works on all devices, because the index passed to []
// is an int and the subtraction yields an int which stores the
// correct value -1 for thread 0
int idx = threadIdx.x;
if (idx < n)
y[idx] = x[idx-1];
// this version works on pre-Fermi devices but ULFs on Fermi. Apparently,
// the subtraction is performed as uint, and the computation for thread 0
// overflows (assigning a negative value (implicitly) to an uint gives
// garbage)
// if (threadIdx.x < n)
// y[threadIdx.x] = x[threadIdx.x-1];
// this results in an "integer conversion resulted in a change of sign"
// warning and consequently fails afterwards on Fermi, but works fine
// on pre-Fermi (despite the same warning!)
// if (threadIdx.x < n)
// y[threadIdx.x] = x[threadIdx.x + (-1)];
// same behaviour as the previous version, based on my understanding,
// both versions should do the index arithmetic in int instead of uint
// because of the literal "-1". It does not make a difference in which
// order the int literal and the uint variable are evaluated.
// if (threadIdx.x < n)
// y[threadIdx.x] = x[(-1) + threadIdx.x];
// this version again works as expected on all devices, because it is
// completely equivalent to the first version
// if (threadIdx.x < n)
// y[threadIdx.x] = x[(int)(threadIdx.x-1)];
}
// simple error checking routine
void coproc_cuda_checkErrors(const int iline, const char *sfile, const char *sroutine)
{
cudaStreamSynchronize(0);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
char *e = (char*) cudaGetErrorString(err);
fprintf(stderr, "CUDA ERROR: %s (Routine: %s, line %d, file %s\n", e, sroutine, iline, sfile);
}
}
// inits and allocates a device array of length NPADDED=N+1
float * allocinit(float multiplier, int vec)
{
int i;
float *f;
float * input = (float*)malloc(NPADDED * sizeof(float));
cudaMalloc((void**)&f, NPADDED * sizeof(float));
coproc_cuda_checkErrors(__LINE__, __FILE__, "cudaMalloc");
for (i=0; i<NPADDED; i++)
{
input[i] = multiplier * i;
printf("vector %d before kernel: %d\t%.2f\n",vec, i, input[i]);
}
cudaMemcpy(f, input, NPADDED * sizeof(float), cudaMemcpyHostToDevice);
coproc_cuda_checkErrors(__LINE__, __FILE__, "codaMemcpy");
free(input);
return f;
}
int main (int argc, char **argv)
{
// edit appropriately
cudaSetDevice(0);
// allocate two device vectors of length NPADDED and fill with data:
// first vector: [0]=0.0 ... [33]=33.0
float * x = allocinit(1.0f, 0);
// second vector: [0]=0.0 ... [33]=-33.0
float * y = allocinit(-1.0f, 1);
// launch a kernel that copies N values from x to y with some index shifts.
// x is read from at [1] to [32] (physical) which translates to
// [-1]..[30] kernel-local addresses
// y is written to at [2] to [33] (physical) which translates to
// [0]..[31] kernel-local addresses
// Note that this does not introduce out of bounds accesses!
dummyKernel<<<1,32,0,0>>>(x+2, y+2, N);
coproc_cuda_checkErrors(__LINE__, __FILE__, "dummyKernel");
// allocate and init memory for result
float * result = (float*)malloc(N*sizeof(float));
int i;
for (i=0; i<N; i++)
result[i] = -10000.f;
// copy y[2] to y[33] into result (must be the initial x[1]..x[32])
cudaMemcpy(result, y+2, N*sizeof(float), cudaMemcpyDeviceToHost);
for (i=0; i<N; i++)
printf("result %d: %.1f\n", i, result[i]);
// clean up
cudaFree (x);
cudaFree (y);
free(result);
return 0;
}