Hi All,
I’ve been trying to run the following code on a Tesla C1060. I know that it’s double-precision capable, but after compiling with ‘nvcc -arch=sm_13 file.cu’ I get the following error: ‘Invalid Device Function’. What could be the problem? Do I need to add some more options while compiling? It does seem to run in emulation mode.
Thanks,
Vandhan
[codebox]
//cudacomplex is basically a struct with 4 doubles
//kernel
// Kernel: touches the first three elements of `gpurows`.
//   - element 0: each of its four fields (real, row, img, column) is incremented by 1
//   - element 1: assigned from a fixed array of four doubles
//   - element 2: set to complexsqrt(element 1)
//
// Preconditions: `gpurows` must point to at least 3 cudacomplex elements in
// device memory. The kernel does no thread indexing, so every launched thread
// performs the same writes; the host code in this file launches it as <<<1,1>>>.
// Uses double arithmetic, so it must be built for a double-capable target
// (the post compiles with -arch=sm_13).
__global__ void cholesky(cudacomplex *gpurows)
{
    gpurows[0].real   += 1;
    gpurows[0].row    += 1;
    gpurows[0].img    += 1;
    gpurows[0].column += 1;

    double b[4] = {9.3, 3.6, 9.3, 3.2};
    // NOTE(review): relies on cudacomplex being assignable from a double[4]
    // in device code (operator= defined elsewhere) - confirm it is __device__.
    gpurows[1] = b;

    // complexsqrt is defined outside this snippet; presumably a __device__ function.
    gpurows[2] = complexsqrt(gpurows[1]);
}
// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed - %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: fills three cudacomplex elements on the host, copies them to the
// device, runs the cholesky kernel on a single thread, and copies the result back.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main()
{
    const int count = 3;
    const size_t bytes = count * sizeof(cudacomplex);

    //row - column, real , imaginary
    double a[4] = {1.0, 2.0, 3.0, 4.0};
    cudacomplex *gpurows2 = new cudacomplex[count];   // host-side copy
    for (int i = 0; i < count; ++i)
        gpurows2[i] = a;

    // Device buffer. A flat 1-D array needs plain cudaMalloc; no host
    // allocation is made for this pointer, since it would only be leaked.
    cudacomplex *gpurows = 0;
    int rc = 1;

    if (cudaOk(cudaMalloc((void**) &gpurows, bytes), "cudaMalloc") &&
        cudaOk(cudaMemcpy(gpurows, gpurows2, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy (host to device)"))
    {
        cholesky<<<1,1>>>(gpurows);

        // Launch failures are only visible through cudaGetLastError().
        // cudaErrorInvalidDeviceFunction here means the binary holds no kernel
        // image usable on the current device (architecture flag vs. the GPU
        // actually selected).
        cudaError_t err = cudaGetLastError();
        fprintf(stderr, "Kernel Error - %s\n", cudaGetErrorString(err));
        fflush(stderr);

        // The blocking copy also waits for the kernel, so an error raised
        // during kernel execution is returned by this call.
        if (err == cudaSuccess &&
            cudaOk(cudaMemcpy(gpurows2, gpurows, bytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy (device to host)"))
        {
            rc = 0;
        }
    }

    cudaFree(gpurows);      // no-op when the pointer is still 0
    delete[] gpurows2;
    return rc;
} [/codebox]