Using linear memory textures with double precision.

Hi all, I’m trying to use textures in my cuda code but I’m getting a load of errors and I’m not sure why. Any help or insight into what I might be doing wrong would be much appreciated.

I’m trying to bind a double precision one dimensional array to texture. I understand that textures don’t support double so I’ve used the built-in int2 vector type and hiloint2double. Here’s the test code which I’ve been developing with:

File scope

[codebox]texture<int2,1> texrefa;[/codebox]

Host code

[codebox]int main()

{

int n=100;

double a[n], b[n];

int i;

for ( i=0; i<n; i++){

	a[i]=(i+3)/2;

}



int size;

size = n * sizeof(double);

double* Ad;

cudaMalloc((void**)&Ad, size);

double* Bd;

cudaMalloc((void**)&Bd, size);



cudaMemcpy(Ad, a, size, cudaMemcpyHostToDevice);

cudaMemcpy(Bd, b, size, cudaMemcpyHostToDevice);



cudaBindTexture(0, texrefa, &Ad, size);



texkernel<<<10,10>>>(Ad ,n, Bd);



cudaThreadSynchronize();

return 0;

}[/codebox]

Kernel code

[codebox]global void texkernel(double *a, int n, double b )

{

int i;

/*Block index*/

int bx=blockIdx.x;



/*Thread index*/

int tx=threadIdx.x;



int2 sha=make_int2(n,n);



sha=tex1Dfetch(texrefa, a);



 i=bx*10+tx;



double aa[n];



for (i=0;i<n;i++){

aa[i]=__hiloint2double(sha.x,sha.y);

}



b[i]=aa[i];

}[/codebox]

If I keep argument a as a pointer in the kernel argument list (as above) then I get this compiler error:

[html]texture.cu:65:2: warning: no newline at end of file

texture.cu(26): error: no instance of overloaded function “tex1Dfetch” matches the argument list

        argument types are: (texture<int2, 1, cudaReadModeElementType>, double *)[/html]

If I change the above code so the argument is no longer double *a instead justdouble a I get the following compile time errors:

[html]texture.cu:66:2: warning: no newline at end of file

Signal: Segmentation fault in Code_Expansion phase.

(0): Error: Signal Segmentation fault in phase Code_Expansion – processing aborted

*** Internal stack backtrace:

/usr/local/cuda2.3/open64/lib//be [0x6c09bf]

/usr/local/cuda2.3/open64/lib//be [0x6c1609]

/usr/local/cuda2.3/open64/lib//be [0x6c0d5d]

/usr/local/cuda2.3/open64/lib//be [0x6c1fa6]

/lib64/libc.so.6 [0x347b0302d0]

/usr/local/cuda2.3/open64/lib//be [0x544449]

/usr/local/cuda2.3/open64/lib//be [0x56fcec]

/usr/local/cuda2.3/open64/lib//be [0x56c7bc]

/usr/local/cuda2.3/open64/lib//be [0x56dc31]

/usr/local/cuda2.3/open64/lib//be [0x56c7a8]

/usr/local/cuda2.3/open64/lib//be [0x57132b]

/usr/local/cuda2.3/open64/lib//be [0x571e46]

/usr/local/cuda2.3/open64/lib//be [0x5723fc]

/usr/local/cuda2.3/open64/lib//be [0x54e980]

/usr/local/cuda2.3/open64/lib//be [0x405443]

/usr/local/cuda2.3/open64/lib//be [0x4061f1]

/usr/local/cuda2.3/open64/lib//be [0x40751d]

/lib64/libc.so.6(__libc_start_main+0xf4) [0x347b01d994]

/usr/local/cuda2.3/open64/lib//be [0x4038da]

nvopencc INTERNAL ERROR: /usr/local/cuda2.3/open64/lib//be died due to signal 4[/html]

Thanks in advance.

External Image

Can anyone familiar with using textures advise me?

Can anyone familiar with using textures advise me?

“tex1Dfetch(texrefa, a)” needs an integer index as second parameter, not a pointer.

So should the second parameter be the size of the array I’m trying to bind to the texture?

No, it’s the index where you want to read the texture.

Cheers for the help guys, I’m still struggling with this. My understanding of textures is pretty poor.

I’ve managed to get it to compile with this code.

File scope

[codebox]texture<int2,1> texrefa;

[/codebox]

Host code

[codebox]int main()

{

int n=100;

double a[n], b[n];

int i;

for ( i=0; i<n; i++){

	a[i]=0.567;

}

for (i=0;i<n;i++){

printf("a[%d]=%f\n", i, a[i]);

}



int size;

size = n * sizeof(double);

double* Ad;

cudaMalloc((void**)&Ad, size);

double* Bd;

cudaMalloc((void**)&Bd, size);



cudaMemcpy(Ad, a, size, cudaMemcpyHostToDevice);

cudaMemcpy(Bd, b, size, cudaMemcpyHostToDevice);



cudaBindTexture(0, texrefa, &Ad, size);



texkernel<<<10,10>>>(Ad ,n, Bd);



cudaThreadSynchronize();



cudaMemcpy(b, Bd, size, cudaMemcpyDeviceToHost);

cudaThreadSynchronize();



for (i=0;i<n;i++){

printf("b[%d]=%f\n", i, b[i]);

}

return 0;

}[/codebox]

Kernel code

[codebox]global void texkernel(double a, int n, double b )

{

int i;

/*Block index*/

int bx=blockIdx.x;



/*Thread index*/

int tx=threadIdx.x;



i=bx*10+tx;

/*

b[i]= fetch_double(texrefa, i, n);

*/

int2 sha=make_int2(n,n);

for (i=0;i<n;i++){

sha=tex1Dfetch(texrefa, i);

b[i]=__hiloint2double(sha.x,sha.y);

}

}

[/codebox]

But the results for array b are just zero. I don’t think I’m using the int2 and hiloint2double feature correctly. Does anyone know what I’m doing wrong?

Forgot to say, all the results in array b are zero except for b[0] which is some ridiculously large number.

[font=“Courier New”]cudaBindTexture(0, texrefa, &Ad, size);[/font]
should be
[font=“Courier New”]cudaBindTexture(0, texrefa, Ad, size);[/font]

Okay, I’ve made that amendment and now the entire b array is zero including b[0]. I’m trying to fill array b with the values in array a. So, still no joy.

Any more ideas what I’m doing wrong?

Ah yes, you also need to reverse the endianness - x86 CPUs are little endian:
[font=“Courier New”] b[i]=__hiloint2double(sha.y, sha.x);[/font]

No that’s still giving me zero’s in b, am I declaring sha as an int2 type correctly? I’m not familiar with their usage so I’m a little unsure if it’s correct:

int2 sha=make_int2(n,n);

Or should it be

int2 sha=make_int2(n,0); ?

I tried with both anyway, still zero’s. Be good to get it clarified though.

Cheers.

Does not matter, as it gets overwritten on the next line anyway.

I can’t find anything else wrong with it. And it works for me:

#include <stdio.h>

texture<int2, 1> texrefa;

__global__ void texkernel(int n, double b[] )

{

	int i = blockIdx.x * blockDim.x + threadIdx.x;

	int2 sha = tex1Dfetch(texrefa, i);

	b[i] = __hiloint2double(sha.y,sha.x);

}

int main()

{

	const int n=100;

	double a[n], b[n];

	for (int i=0; i<n; i++) {

		a[i] = i + 0.567;

	}

	for (int i=0; i<n; i++) {

		printf("a[%d]=%f\n", i, a[i]);

	}

	size_t size = n * sizeof(double);

	double* Ad;

	cudaMalloc((void**)&Ad, size);

	double* Bd;

	cudaMalloc((void**)&Bd, size);

	cudaMemcpy(Ad, a, size, cudaMemcpyHostToDevice);

	cudaBindTexture(0, texrefa, Ad, size);

	texkernel<<<10,10>>>(n, Bd);

	cudaMemcpy(b, Bd, size, cudaMemcpyDeviceToHost);

	for (int i=0; i<n; i++) {

		printf("b[%d]=%f\n", i, b[i]);

	}

	return 0;

}

You are trying this on a compute capability 1.3, 2.0 or 2.1 card (and compiling it for the correct capability), I presume? Anything else doesn’t have double precision support and that would go a long way to explaining the zero output, even when the texture code is correct…

Found the problem, whoop!

I had this:

for (i=0;i<n;i++){
sha=tex1Dfetch(texrefa, i);
b[i]=__hiloint2double(sha.x,sha.y);
}

When it should be this:

int2 sha = tex1Dfetch(texrefa, i);
b[i] = __hiloint2double(sha.y,sha.x);

The loop isn’t needed.

Thanks for the help.