The following code produces this output:
debug[0]=0.100000
debug[1]=0.100000
debug[2]=2.100000
debug[3]=2.100000
debug[4]=1.100000
debug[5]=-1.100000
debug[6]=5.600000
debug[7]=3.400000
debug[4] should be -1.1 as well, and debug[6] should then be 3.4 instead of 5.6. It seems like CUDA forgets the sign of temp3 once it sits in shared memory: 0.1 * 3.0 + 2.1 * 2.0 + 1.1 * 1.0 = 5.6, which is exactly the result you get with a positive temp3.
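For comparison, this is the output I expect (only debug[4] and debug[6] differ):
debug[0]=0.100000
debug[1]=0.100000
debug[2]=2.100000
debug[3]=2.100000
debug[4]=-1.100000
debug[5]=-1.100000
debug[6]=3.400000
debug[7]=3.400000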
Here’s the code:
#include <cstdio>

#define MAXTHREADS 32

__constant__ double recf[3] = {0.17, 0.10, 1.1};

__global__ void bug(double *debug, int m)
{
    __shared__ double temp1;
    __shared__ double temp2;
    __shared__ double temp3;
    __shared__ double y[MAXTHREADS];
    __shared__ double y1[MAXTHREADS];
    __shared__ double y2[MAXTHREADS];

    __syncthreads();
    y2[threadIdx.x] = 1.0;
    y1[threadIdx.x] = 2.0;
    y[threadIdx.x] = 3.0;
    if (threadIdx.x == 0)
    {
        temp1 = recf[m+1];   // assigns 0.1
        temp3 = -recf[m+2];  // assigns -1.1
        temp2 = 1.0 - temp3; // and this should be 2.1
    }
    __syncthreads();

    // calculate some stuff... should be 3.4
    y[threadIdx.x] = temp1 * y[threadIdx.x] + temp2 * y1[threadIdx.x] + temp3 * y2[threadIdx.x];
    __syncthreads();

    if (threadIdx.x == 0)
    {
        debug[0] = temp1; // store the temps
        debug[2] = temp2;
        debug[4] = temp3;
        debug[6] = y[0];  // ... and the calculated stuff
    }
    __syncthreads();

    // same computation again, but with the temps held in registers
    y2[threadIdx.x] = 1.0;
    y1[threadIdx.x] = 2.0;
    y[threadIdx.x] = 3.0;
    __syncthreads();

    if (threadIdx.x == 0)
    {
        double t1 = recf[m+1]; // calculate the same stuff in registers
        double t3 = -recf[m+2];
        double t2 = 1.0 - t3;
        debug[1] = t1; // and store
        debug[3] = t2;
        debug[5] = t3;
        debug[7] = t1 * y[threadIdx.x] + t2 * y1[threadIdx.x] + t3 * y2[threadIdx.x];
    }
}
int main()
{
    double *debug = new double[8];
    double *cudebug;
    cudaMalloc((void**)&cudebug, sizeof(double) * 8);
    bug<<<1, MAXTHREADS>>>(cudebug, 0);
    cudaMemcpy(debug, cudebug, sizeof(double) * 8, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 8; i++)
    {
        printf("debug[%d]=%f\n", i, debug[i]);
    }
    cudaFree(cudebug);
    delete[] debug;
    return 0;
}
Since debug[2] comes out as 2.1, temp2 must have been calculated with the correct (negative) temp3; the sign apparently gets lost only afterwards. The effect doesn't appear if the kernel parameter m (which is passed via shared memory on this hardware) is first copied into a local variable, or if it is replaced by its literal value. It also doesn't appear if I access global memory after the line
temp2 = 1.0 - temp3; // and this should be 2.1
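To make the workarounds concrete, here is roughly what I mean (sketched from memory; mLocal is just an illustrative name, not the exact code I ran):

// Workaround 1: copy the kernel parameter into a local variable first
// (or replace m by its literal value, i.e. recf[1] and recf[2]).
if (threadIdx.x == 0)
{
    int mLocal = m;          // read the parameter once, into a register
    temp1 = recf[mLocal+1];
    temp3 = -recf[mLocal+2];
    temp2 = 1.0 - temp3;     // temp3 now keeps its sign
}

// Workaround 2: any global memory access right after the subtraction.
if (threadIdx.x == 0)
{
    temp1 = recf[m+1];
    temp3 = -recf[m+2];
    temp2 = 1.0 - temp3;
    debug[0] = temp1;        // a global store like this also hides the bug
}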
I have experimented a lot with this, and as a result I think I should report it as a bug. Can anyone reproduce the error?
Edit:
I forgot to mention: I'm using Linux x64, CUDA 2.3, and a GeForce GTX 285.
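For anyone trying to reproduce this: the GTX 285 is compute capability 1.3, so the code has to be built with double-precision support, e.g. (assuming the file is saved as bug.cu):

nvcc -arch=sm_13 bug.cu -o bug

Without -arch=sm_13, nvcc 2.3 demotes all doubles to single precision (with a warning), and you are no longer testing the double-precision path.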