Double precision problem: kernel returns cudaErrorUnknown in double precision mode

Here is a small piece of code demonstrating the problem.

The two-line kernel Problem() returns cudaErrorUnknown only in double precision mode. Both emulation mode and single precision mode return cudaSuccess.

If the FTYPE and TTYPE typedefs are both changed to float, the pure single-precision version works correctly.

Also, any one of the following changes to the kernel:

for(int n=0; n<1; n++)

or

FTYPEfetch(texU, texUOfs)*dummy[1];

or

*array1 = FTYPEfetch(texU, texUOfs)*dummy[n];

gives no error.

Compiler’s input

"C:\CUDA\bin\nvcc.exe" -arch sm_13 -ccbin "c:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin" -use_fast_math -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT " -maxrregcount=32 --compile -o x64\Release\Problem.cu.obj Problem.cu

[codebox]

/* Problem.cu

The two-line kernel Problem() returns cudaErrorUnknown only in double precision mode. Both emulation mode and single precision mode return cudaSuccess.

Also

for(int n=0; n<1; n++)

or

FTYPEfetch(texU, texUOfs)*dummy[1];

or

*array1 = FTYPEfetch(texU, texUOfs)*dummy[n];

gives no error.

Compiler’s input

"C:\CUDA\bin\nvcc.exe" -arch sm_13 -ccbin "c:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin" -use_fast_math -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT " -maxrregcount=32 --compile -o x64\Release\Problem.cu.obj Problem.cu

Copyright Anatoly Vershinin, Denis Sabitov

*/

#include <stdio.h>

// Scalar type under test. The failure described in the header comment occurs
// with double; the nvcc command line there targets sm_13 for that reason.
typedef double FTYPE;

// Texel type of the texture that carries FTYPE values. A double is fetched as
// an int2 (two 32-bit halves) and reassembled by FTYPEfetch().
typedef int2 TTYPE;

// Multipliers indexed by the loop counter in Problem(): dummy[0] and dummy[1].
// The forum markup had stripped the "__" of __constant__ and the "[2]"
// declarator; both are restored here.
__constant__ const FTYPE dummy[2] = {1.0, 2.0};

// 1D texture reference bound to linear device memory in main(). The template
// arguments were stripped by the forum markup; <TTYPE, 1> matches the
// texture<int2, 1> parameter of FTYPEfetch().
texture<TTYPE, 1> texU;

/*
 * Fetches one double-precision value from a 1D int2 texture.
 *
 * t - texture reference whose texels are int2 pairs.
 * i - texel index passed straight to tex1Dfetch().
 *
 * Returns the double rebuilt from the texel: v.y supplies the high 32 bits
 * and v.x the low 32 bits, as expected by __hiloint2double(hi, lo).
 *
 * The "__" of __device__ had been stripped by the forum markup and is
 * restored here.
 */
static inline __device__ FTYPE FTYPEfetch(texture<int2, 1> t, int i)
{
    int2 v = tex1Dfetch(t, i);
    return __hiloint2double(v.y, v.x);
}

/*
 * Single-precision overload: fetches one float from a 1D float texture.
 *
 * t - texture reference whose texels are float.
 * i - texel index passed straight to tex1Dfetch().
 *
 * Returns the fetched texel unchanged. This overload is the one selected
 * when the texture is declared with float texels.
 *
 * The "__" of __device__ had been stripped by the forum markup and is
 * restored here.
 */
static inline __device__ float FTYPEfetch(texture<float, 1> t, int i)
{
    return tex1Dfetch(t, i);
}

/*
 * Minimal reproducer kernel: for n = 0 and n = 1, adds
 * FTYPEfetch(texU, texUOfs) * dummy[n] to *array1.
 *
 * array1  - device pointer to a single accumulator; it is read before being
 *           written, so the caller must initialise it.
 * texUOfs - texel index handed to the texture fetch.
 *
 * No thread or block index is used, so every thread updates the same
 * address without synchronisation; main() launches it as <<<1, 1>>>.
 *
 * The loop bound, the "+=" and the dummy[n] indexing are intentionally left
 * exactly as reported, since changing any of them hides the failure. Only
 * the "__" of __global__, stripped by the forum markup, is restored.
 */
__global__ void Problem(FTYPE* array1, const int texUOfs)
{
    for(int n=0; n<2; n++)
        *array1 += FTYPEfetch(texU, texUOfs)*dummy[n];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaStepOk(cudaError_t status, const char* step)
{
    if (status == cudaSuccess)
        return true;

    fprintf(stderr, "%s failed: %s (%d)\n",
            step, cudaGetErrorString(status), static_cast<int>(status));
    return false;
}

/*
 * Host driver for the reproducer.
 *
 * Allocates and zeroes two one-element device buffers, binds the first to
 * texU, launches Problem<<<1, 1>>> on the second and prints the resulting
 * CUDA status code as "Cuda returned <code>".
 *
 * Returns 0 when the kernel completed with cudaSuccess, 1 otherwise
 * (including when a setup call failed, which is reported on stderr so that
 * it cannot be mistaken for the kernel failure under investigation).
 */
int main()
{
    FTYPE* source = 0;
    FTYPE* result = 0;
    size_t hostOfs = 0;
    bool textureBound = false;
    int exitCode = 1;

    // Short-circuit chain: the first failing setup step stops the sequence.
    if (cudaStepOk(cudaMalloc((void**)&source, sizeof(FTYPE)), "cudaMalloc(source)") &&
        cudaStepOk(cudaMemset(source, 0, sizeof(FTYPE)), "cudaMemset(source)") &&
        cudaStepOk(cudaMalloc((void**)&result, sizeof(FTYPE)), "cudaMalloc(result)") &&
        cudaStepOk(cudaMemset(result, 0, sizeof(FTYPE)), "cudaMemset(result)") &&
        cudaStepOk(cudaBindTexture(&hostOfs, texU, source, sizeof(FTYPE)), "cudaBindTexture"))
    {
        textureBound = true;

        // cudaBindTexture reports the offset in bytes; the kernel wants a
        // texel index.
        hostOfs /= sizeof(FTYPE);

        // The kernel parameter is an int, so make the narrowing explicit.
        Problem<<<1, 1>>>(result, static_cast<int>(hostOfs));

        // cudaThreadSynchronize() is kept (rather than a newer equivalent)
        // to stay compatible with the toolchain this report was written for.
        const cudaError_t syncStatus = cudaThreadSynchronize();
        const cudaError_t lastStatus = cudaGetLastError();
        // Prefer the last-error value, as the original report printed; fall
        // back to the synchronize status if the last error reads as success.
        const cudaError_t error = (lastStatus != cudaSuccess) ? lastStatus : syncStatus;

        printf("Cuda returned %d\n", static_cast<int>(error));
        exitCode = (error == cudaSuccess) ? 0 : 1;
    }

    // Best-effort cleanup: return codes are deliberately ignored because
    // these calls may themselves fail after a kernel error.
    if (textureBound)
        cudaUnbindTexture(texU);
    cudaFree(source);
    cudaFree(result);

    return exitCode;
}

[/codebox]

Thank you to everyone who tries to run this code and reports their results.