Problem with texture memory

Hi!

I’ve allocated two float2 arrays and bound them to texture memory like this:

int main()

{

....

// HOST ALLOCATION

const float2 *hsig = load_input("sig.txt",&sig_length);

const float2 *hsig_d = load_input("sig_d.txt",&sig_length);

// DEVICE ALLOCATION

float2 *dsig;

if(cudaMalloc((void **) &dsig, sig_length*sizeof(float2))!=cudaSuccess)

				PrintCUDAError("Error cudaMalloc() dsig:");

cudaMemcpy(dsig,hsig,sig_length*sizeof(float2),cudaMemcpyHostToDevice);

cudaBindTexture(0, texRefsig, dsig, sig_length*sizeof(float2));

float2 *dsig_d;

if(cudaMalloc((void **) &dsig_d, sig_length*sizeof(float2))!=cudaSuccess)

				PrintCUDAError("Error cudaMalloc() dsig_d:");

cudaMemcpy(dsig_d,hsig_d,sig_length*sizeof(float2),cudaMemcpyHostToDevice);

cudaBindTexture(0, texRefsig, dsig, sig_length*sizeof(float2));

....

}

Inside the kernel, I access the arrays’ elements like this:

// Texture references

texture<float2, 1, cudaReadModeElementType> texRefsig;

texture<float2, 1, cudaReadModeElementType> texRefsig_d;

__global__ void myKernel()

{

float2 texsig,texsig_d;

....

for(n=0; n<sig_length; ++n){

				texsig = tex1Dfetch(texRefsig,n);

				texsig_d = tex1Dfetch(texRefsig_d,n);

				.....

}

}

But I get different results at every kernel launch. What’s wrong?

I’ve also tried to use the offset parameter of cudaBindTexture(), but without success (still different results at every kernel launch). I used it like this:

__global__ void myKernel(size_t sigoff, size_t sig_doff)

{

float2 texsig,texsig_d;

....

for(n=0; n<sig_length; ++n){

		 int idxsig = n + sigoff / sizeof(float2);

		 int idxsig_d = n + sig_doff / sizeof(float2);

		 texsig = tex1Dfetch(texRefsig,idxsig);

		 texsig_d = tex1Dfetch(texRefsig_d,idxsig_d);

		 .....

}

......

}

This is wrong; the approach in the original post was the correct one.

The code looks OK to me. Try to isolate the problem: have the kernel output just the 100th value (for example), without multiple threads/blocks/…, something like this:

__global__ void kernel( float2 *gout )

{

  gout[ 0 ] =  tex1Dfetch(texRefsig, 100 );

  gout[ 1 ] =  tex1Dfetch(texRefsig_d, 100 );

}

...

kernel<<< 1, 1 >>>( gout );

and see if you get the correct values.

Work from there - maybe the problem is due to race conditions in the kernel, reading the input incorrectly on the CPU, …

If you have CUDA < 3.0 you can use device emulation mode to verify that the correct values are used.

eyal.

Thanks for the answer. ;)

For testing purposes I have only one thread executing this kernel, so I can’t have race conditions. However, I’ll try to fetch single values and check their correctness (with the first method).

Is it normal that cudaBindTexture() is called twice with the same texture reference and the same array?
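If each array is meant to have its own texture reference, the second call should probably bind texRefsig_d to dsig_d instead, i.e. something like this (using the names from your first post):

cudaBindTexture(0, texRefsig, dsig, sig_length*sizeof(float2));

cudaBindTexture(0, texRefsig_d, dsig_d, sig_length*sizeof(float2));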

@eyalhir74: I’ve run the test and the fetched values are correct.

@kr1_karin: You’re right! I made a big mistake using the same texture reference twice. Now I get the same results at every kernel launch, but they’re still wrong.

As you can see, I’m using float2 to represent complex numbers. Is it correct to access them by using the x and y components as the real and imaginary parts of the complex numbers, respectively?

Yes, it’s correct.
I don’t know what processing you do on these complex numbers, but using a float2 texture to represent them and tex1Dfetch(tex, n) to access them (with texsig.x the real part and texsig.y the imaginary part) is perfectly fine.
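For example, a complex multiply of two such values just combines the components like this (a generic sketch, nothing specific to your code):

__device__ float2 cmul(float2 a, float2 b)

{

	float2 c;

	c.x = a.x * b.x - a.y * b.y;	// real part

	c.y = a.x * b.y + a.y * b.x;	// imaginary part

	return c;

}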

Perhaps I’ve found the problem. The arguments that I pass to the kernel don’t seem to be correct. In main() I’ve allocated several arrays like this:

int main(){

....

// HOST MEMORY ALLOCATION

	const float a = 1e-2f;   // small positive constant

	const float2 *hsig = load_input("sig.txt",&sig_length);

	const float2 *hsig_d = load_input("sig_d.txt",&sig_length);

	float *Energy = (float *) malloc (M_DEFAULT*sizeof(float));	// total energy of prediction error at the input of the m-th stage

	if(Energy==NULL)

		PrintError("Error malloc() Energy:");	

	for(i=0; i<M_DEFAULT; ++i)

		Energy[i] = a;

	

	// DEVICE MEMORY ALLOCATION

	float2 *dsig;

	if(cudaMalloc((void **) &dsig, sig_length*sizeof(float2))!=cudaSuccess)

		PrintCUDAError("Error cudaMalloc() dsig:");

	cudaMemcpy(dsig,hsig,sig_length*sizeof(float2),cudaMemcpyHostToDevice);

	cudaBindTexture(0, texRefsig, dsig, sig_length*sizeof(float2));

	float2 *dsig_d;

	if(cudaMalloc((void **) &dsig_d, sig_length*sizeof(float2))!=cudaSuccess)

		PrintCUDAError("Error cudaMalloc() dsig_d:");

	cudaMemcpy(dsig_d,hsig_d,sig_length*sizeof(float2),cudaMemcpyHostToDevice);	

	cudaBindTexture(0, texRefsig_d, dsig_d, sig_length*sizeof(float2));

	float *dEnergy;

	if(cudaMalloc((void **) &dEnergy, M_DEFAULT*sizeof(float))!=cudaSuccess)

		PrintCUDAError("Error cudaMalloc() dEnergy:");

	cudaMemcpy(dEnergy,Energy,M_DEFAULT*sizeof(float),cudaMemcpyHostToDevice);

	float2 *dh;	// filter taps

	if(cudaMalloc((void **) &dh, (M_DEFAULT+1)*sizeof(float2))!=cudaSuccess)

		PrintCUDAError("Error cudaMalloc() dh:");

	cudaMemset((void *) dh, 0, (M_DEFAULT+1)*sizeof(float2));

	float *dnorm_b;

	if(cudaMalloc((void **) &dnorm_b, (M_DEFAULT+1)*sizeof(float))!=cudaSuccess)

		PrintCUDAError("Error cudaMalloc() dnorm_b:");

	cudaMemset((void *) dnorm_b, 0, (M_DEFAULT+1)*sizeof(float));

	float2 *dE_f;	// forward error

	if(cudaMalloc((void **) &dE_f, (M_DEFAULT+1)*sizeof(float2))!=cudaSuccess)

		PrintCUDAError("Error cudaMalloc() dE_f:");

	float2 *dE_b;	// backward error, old 1st row, new 2nd row

	if(cudaMalloc((void **) &dE_b, M_DEFAULT2*sizeof(float2))!=cudaSuccess)

		PrintCUDAError("Error cudaMalloc() dE_b:");

	cudaMemset((void *) dE_b, 0, M_DEFAULT2*sizeof(float2));

	float2 *dk;	// reflection coefficient

	if(cudaMalloc((void **) &dk, M_DEFAULT*sizeof(float2))!=cudaSuccess)

		PrintCUDAError("Error cudaMalloc() dk):");	

	float2 *dy;

	if(cudaMalloc((void **) &dy, (M_DEFAULT+1)*sizeof(float2))!=cudaSuccess)

		PrintCUDAError("Error cudaMalloc() dy:");

	float2 *derr;

	if(cudaMalloc((void **) &derr, (M_DEFAULT+1)*sizeof(float2))!=cudaSuccess)

		PrintCUDAError("Error cudaMalloc() derr:");

	float2 *dsig_filt;		

	if(cudaMalloc((void **) &dsig_filt, sig_length*sizeof(float2))!=cudaSuccess)

		PrintCUDAError("Error cudaMalloc() dsig_filt:");

...

}

Part of the kernel is the following:

__global__ void GALfilterKern(int sig_length, float2 *sig_d, float2 *sig, float *Energy, float *norm_b, float2 *h, float2 *E_f, float2 *E_b, float2 *k, float2 *y, float2 *err, float2 *sig_filt)

{

	__shared__ float s_Energy[M_DEFAULT];

	__shared__ float2 s_k[M_DEFAULT];

	__shared__ float2 s_E_f[M_DEFAULT1];

	__shared__ float2 s_E_b[M_DEFAULT2];

	__shared__ float s_norm_b[M_DEFAULT1];

	__shared__ float2 s_h[M_DEFAULT1];

	__shared__ float2 s_err[M_DEFAULT1];

	__shared__ float2 s_y[M_DEFAULT1];

	

	int i,n,m;

	float2 res,u,d,addBF,partMul,firstMul,secondMul;

	

	// ===== INITIALIZATION ===

	for(i=0; i<M_DEFAULT2; ++i){

		if(i<M_DEFAULT){		

			s_Energy[i] = Energy[i];

			s_k[i] = k[i];

		}

		if(i<M_DEFAULT1){

			s_E_f[i] = E_f[i];

			s_norm_b[i] = norm_b[i];

			s_h[i] = h[i];

			s_err[i] = err[i];

			s_y[i] = y[i];

		}

		s_E_b[i] = E_b[i];

	}

.....

}

After some testing, I’ve noticed that inside the kernel these arrays don’t have the right values. For example, all the elements of the Energy array should be equal to 1e-2f; instead it contains random values. I don’t know what’s wrong. :(


It’s hard to see what’s wrong with only parts of the code.

But I have one question: what are the values of M_DEFAULT, M_DEFAULT1 and M_DEFAULT2?

I ask because of the shared-memory declarations at the top of your kernel.

You may be trying to allocate too much shared memory.
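Just to illustrate with hypothetical values (I don’t know your actual defines): if M_DEFAULT = 256, M_DEFAULT1 = 257 and M_DEFAULT2 = 512, the shared arrays declared above would need

(256 + 257) * sizeof(float) = 2,052 bytes

(256 + 4*257 + 512) * sizeof(float2) = 14,368 bytes

i.e. about 16,420 bytes in total, which already exceeds the 16 KB of shared memory available per block on pre-Fermi GPUs.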

Have you put a cudaGetLastError() after the kernel execution?
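Something like this, for example (just a sketch; the error messages are only illustrative):

	GALfilterKern<<<1, 1>>>( /* ... your arguments ... */ );

	cudaError_t err = cudaGetLastError();	// catches launch errors (bad configuration, etc.)

	if (err != cudaSuccess)

		printf("Kernel launch failed: %s\n", cudaGetErrorString(err));

	err = cudaThreadSynchronize();	// waits for the kernel to finish and catches execution errors

	if (err != cudaSuccess)

		printf("Kernel execution failed: %s\n", cudaGetErrorString(err));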

No, I hadn’t.

However, I’ve done it now. In main() I also called cudaMemset() for each device array and initialized all of them to zero. Now the results are correct.
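For completeness, the added initialization looks roughly like this (for the device arrays that weren’t already set or copied):

	cudaMemset((void *) dE_f, 0, (M_DEFAULT+1)*sizeof(float2));

	cudaMemset((void *) dk, 0, M_DEFAULT*sizeof(float2));

	cudaMemset((void *) dy, 0, (M_DEFAULT+1)*sizeof(float2));

	cudaMemset((void *) derr, 0, (M_DEFAULT+1)*sizeof(float2));

	cudaMemset((void *) dsig_filt, 0, sig_length*sizeof(float2));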

Thanks for everything. ;)