problem with texture fetching using tex2D

I cannot fetch the second column of an N-by-M matrix using tex2D when M is larger than N.

The output of the simple texture program below is

x[0]=1.000000   y[0]=1.000000

x[1]=2.000000   y[1]=1.000000

but y[1] should be equal to x[1]=2.000000. I would appreciate it if you could find the bug in my program.

Here is the code

/*
 * HANDLE_ERROR( err ): check the cudaError_t produced by a CUDA runtime call.
 *
 * If the status is not cudaSuccess, print the CUDA error string together with
 * the source file and line of the call site, then terminate the process with
 * EXIT_FAILURE.
 *
 * The argument is evaluated exactly once (so a failing call is not issued a
 * second time just to format the message), and the body is wrapped in
 * do { ... } while (0) so the macro behaves as a single statement, including
 * inside an unbraced if/else. Every continuation backslash must be the last
 * character on its line, with no blank line following it.
 */
#define HANDLE_ERROR( err ) \
    do { \
        cudaError_t handle_error_status_ = (err); \
        if (handle_error_status_ != cudaSuccess) { \
            printf( "%s in %s at line %d\n", \
                    cudaGetErrorString( handle_error_status_ ), __FILE__, __LINE__ ); \
            exit( EXIT_FAILURE ); \
        } \
    } while (0)

#include <stdio.h>

texture<float,2> tex;

/*
 * Copy column 0 of the 2D texture `tex` into y, one row per thread.
 *
 * Thread t fetches the texel at (x = 0, y = t) and stores it in y[t].
 * There is no bounds check: the launch must not use more threads than
 * y has elements.
 */
__global__ void kernel( float *y ) {
  const int row = threadIdx.x;

  // Integer coordinates are converted to float by tex2D: column 0, row `row`.
  const float texel = tex2D( tex, 0, row );

  y[row] = texel;
}

/*
 * Host driver for the reproduction case: uploads the two floats {1, 2},
 * binds them as a 1-texel-wide, 2-row 2D texture, has the kernel read the
 * single column back through tex2D, and prints input next to output.
 *
 * Returns 0 on success; any failing CUDA call terminates the program through
 * HANDLE_ERROR.
 */
int main( void ) {
  float *x, *y, xh[2]={1.0f,2.0f}, yh[2];
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();

  HANDLE_ERROR( cudaMalloc( (void**)&x, 2*sizeof(float) ) );
  HANDLE_ERROR( cudaMalloc( (void**)&y, 2*sizeof(float) ) );

  // NOTE(review): the pitch passed here is sizeof(float) (4 bytes) on memory
  // obtained from plain cudaMalloc. The discussion following this listing
  // identifies that unaligned pitch as the reason row 1 reads back as row 0.
  // It is left unchanged so the listing still reproduces the reported output;
  // allocating with cudaMallocPitch and passing the returned pitch fixes it.
  HANDLE_ERROR( cudaBindTexture2D( NULL, tex, x, desc, 1, 2, sizeof(float) ) );
  HANDLE_ERROR( cudaMemcpy( x, xh, 2*sizeof(float), cudaMemcpyHostToDevice ) );

  kernel<<<1,2>>>( y );
  // A kernel launch returns no status of its own; pick up launch errors here.
  HANDLE_ERROR( cudaGetLastError() );
  HANDLE_ERROR( cudaMemcpy( yh, y, 2*sizeof(float), cudaMemcpyDeviceToHost ) );

  // xh and yh hold two elements each, so the valid indices are 0 and 1.
  for (int i = 0; i < 2; i++) {
    printf("x[%d]=%f\ty[%d]=%f\n", i, xh[i], i, yh[i]);
  }

  HANDLE_ERROR( cudaUnbindTexture( tex ) );
  HANDLE_ERROR( cudaFree( x ) );
  HANDLE_ERROR( cudaFree( y ) );

  return 0;
}

tex.cu (1013 Bytes)

The indexing in the texture fetch is incorrect.

Should be:

y[tid]=tex2D(tex,tid,0);

not:

y[tid]=tex2D(tex, 0, tid);

In addition, the width, height and pitch arguments passed to the texture reference binding function are incorrect; the call should be:

cudaBindTexture2D( NULL, tex, x, desc, 2, 1, 2*sizeof(float) );

This code should work fine:

#include <cuda_runtime.h>

/*
 * HANDLE_ERROR( err ): check the cudaError_t produced by a CUDA runtime call.
 *
 * If the status is not cudaSuccess, print the CUDA error string together with
 * the source file and line of the call site, then terminate the process with
 * EXIT_FAILURE.
 *
 * The argument is evaluated exactly once (so a failing call is not issued a
 * second time just to format the message), and the body is wrapped in
 * do { ... } while (0) so the macro behaves as a single statement, including
 * inside an unbraced if/else. Every continuation backslash must be the last
 * character on its line, with no blank line following it.
 */
#define HANDLE_ERROR( err ) \
    do { \
        cudaError_t handle_error_status_ = (err); \
        if (handle_error_status_ != cudaSuccess) { \
            printf( "%s in %s at line %d\n", \
                    cudaGetErrorString( handle_error_status_ ), __FILE__, __LINE__ ); \
            exit( EXIT_FAILURE ); \
        } \
    } while (0)

#include <stdio.h>

texture<float,2> tex;

/*
 * Copy row 0 of the 2D texture `tex` into y, one texel per thread.
 *
 * Thread t fetches the texel in column t of row 0 and stores it in y[t].
 * Coordinates are given at the texel centre (index + 0.5): with point
 * sampling this selects the same texel as the bare integer index, and it
 * stays correct if the texture is later switched to linear filtering.
 *
 * There is no bounds check: the launch must not use more threads than
 * y has elements.
 */
__global__ void kernel( float *y ) {
	int tid=threadIdx.x;

	y[tid]=tex2D(tex, (float)tid + 0.5f, 0.5f);
}

/*
 * Host driver: uploads the two floats {1, 2} as a 2-texel-wide, 1-row 2D
 * texture backed by pitched linear memory, has the kernel read them back
 * through tex2D, and prints input next to output.
 *
 * Returns 0 on success and 1 if the texture binding reports a non-zero
 * offset; any failing CUDA call terminates the program through HANDLE_ERROR.
 */
int main( void ) {
	const size_t width  = 2;                      // texels per row
	const size_t height = 1;                      // number of rows
	const size_t rowBytes = width*sizeof(float);  // payload bytes per row

	float *x = NULL, *y = NULL, xh[2]={1.0f,2.0f}, yh[2];
	size_t pitch = 0;   // device row stride in bytes, chosen by cudaMallocPitch
	size_t offset = 0;  // byte offset reported by cudaBindTexture2D

	cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();

	// 2D textures bound to linear memory need a suitably aligned row pitch;
	// cudaMallocPitch pads each row as required and reports the stride used.
	HANDLE_ERROR( cudaMallocPitch( (void**)&x, &pitch, rowBytes, height ) );
	HANDLE_ERROR( cudaMalloc( (void**)&y, width*sizeof(float) ) );

	// The host rows are tightly packed (rowBytes apart), the device rows are
	// `pitch` bytes apart, so a 2D copy is needed to translate the strides.
	HANDLE_ERROR( cudaMemcpy2D( x, pitch, xh, rowBytes, rowBytes, height,
	                            cudaMemcpyHostToDevice ) );

	// Pass a real offset pointer instead of NULL so that a non-zero texture
	// offset is detected rather than silently shifting every fetch.
	HANDLE_ERROR( cudaBindTexture2D( &offset, tex, x, desc, width, height, pitch ) );
	if (offset != 0) {
		printf( "unexpected texture offset of %lu bytes\n", (unsigned long)offset );
		return 1;
	}

	kernel<<<1,2>>>( y );
	// A kernel launch returns no status of its own; pick up launch errors here.
	HANDLE_ERROR( cudaGetLastError() );

	HANDLE_ERROR( cudaMemcpy( yh, y, width*sizeof(float), cudaMemcpyDeviceToHost ) );

	for(int i=0; i<2; i++) {
		printf("x[%d]=%f\ty[%d]=%f\n",i,xh[i],i,yh[i]);
	}

	HANDLE_ERROR( cudaUnbindTexture( tex ) );
	HANDLE_ERROR( cudaFree( x ) );
	HANDLE_ERROR( cudaFree( y ) );

	return 0;
}

Thank you for your quick response. Actually, I know that fetching by row works as you described. The problem occurs when I try to fetch the columns of a texture whose number of columns is larger than its number of rows, as in the example I gave, where 2 columns is more than 1 row.
I know that it should work, because it works if I use a texture bound to a CUDA array instead. But for some reason textures bound to linear memory are not working. Either I have to resort to one-dimensional textures, or I have to transpose my data so that the number of columns is less than the number of rows.

2D-textures have certain layout requirements, so one would want to use cudaMallocPitch() instead of plain cudaMalloc() when binding a 2D-texture to linear memory. As a defensive programming practice I would recommend to never pass NULL as the first argument of a cudaBindTexture* call. Instead, pass in a suitable pointer so the function can pass back the texture offset and have the code take appropriate action should it be non-zero.

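Thank you very much. cudaMallocPitch solved my problem. It turns out that the pitch parameter in cudaBindTexture2D must be a multiple of 256 bytes on my device (the required value is device-dependent and is reported by cudaDeviceProp::texturePitchAlignment). Since my pitch was only 4 bytes, the second column was clamped to the first column.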

The textures should be indexed with + 0.5f, i.e.

y[tid]=tex2D(tex,(float)tid + 0.5f,0);

Hi

I am having a problem fetching the second row of a texture and performing linear interpolation on it. My code is as follows:

=============================

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <errno.h>
#include <err.h>
#include <sysexits.h>
#include <sys/types.h>
#include <sys/time.h>
#include <cuda.h>
#include <cutil_inline_runtime.h>
#include <cutil_math.h>

// Dimensions of the interpolated output table: numrow rows of numcol values.
#define numrow 2
#define numcol 360

#include "cuPrintf.cu"

// Number of output samples per unit step of the texture x coordinate.
#define WIND_INTERVAL 30

// 2D float texture reference sampled by computeInterp.
texture<float, 2, cudaReadModeElementType> texRef;

/*
 * Resample the 2D texture `texRef` into a numrow x numcol row-major table.
 *
 * Launch layout: a 2D grid of 2D blocks in which the x dimension covers
 * output columns and the y dimension covers output rows. Threads that fall
 * outside the numcol x numrow table return without writing.
 *
 * numInter  not used by the kernel; kept so existing call sites still compile.
 * pInterp   device buffer of at least numrow * numcol floats; element
 *           (row y, column x) is written to pInterp[y * numcol + x].
 *
 * The texture coordinates are unnormalized (texel units). With linear
 * filtering, texel i has its centre at coordinate i + 0.5, so the row
 * coordinate must be y + 0.5 to read row y alone. Passing the bare integer y
 * makes row 1 a 50/50 blend of rows 0 and 1, while row 0 still looks correct
 * because of clamping.
 */
__global__ void computeInterp(int numInter, float* pInterp)
{
    (void)numInter;

    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;  // output column
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;  // output row

    // This guard already guarantees y * numcol + x < numcol * numrow, so no
    // further range check on the flat index is needed.
    if (x >= numcol || y >= numrow)
    {
        return;
    }

    // Horizontal sampling position in texel units.
    // NOTE(review): the 2.5f + 1.0f/WIND_INTERVAL start offset is kept from the
    // original formula; confirm it matches the intended first sample position.
    const float tu = 2.5f + 1.0f / WIND_INTERVAL + (float)x / WIND_INTERVAL;

    // Vertical sampling position: the centre of row y.
    const float tv = (float)y + 0.5f;

    // Read from the texture and write to global memory.
    pInterp[y * numcol + x] = tex2D(texRef, tu, tv);
}

// Host code

/*
 * Host driver: uploads a 2 x 17 table into a CUDA array, binds it to texRef
 * with linear filtering and unnormalized coordinates, runs computeInterp to
 * produce a numrow x numcol table, and prints every interpolated value.
 *
 * Returns 0 on success, EXIT_FAILURE if host allocation fails. CUDA calls are
 * checked with cutilSafeCall.
 */
int main()
{
    float A[2][17] =
    { { 0.8000, 1.3, 2.3000, 3.2000, 4.0000, 5.9000, 6.3000, 7.0000, 8.2000, 9.9000, 10.4200, 11.1000, 12.8000, 13.3000, 14.3000, 15.2000, 16.0000},
      { 17.3000, 18.3000, 19.2000, 20.0000, 21.9000, 22.3000, 23.0000, 24.2000, 25.9000, 26.4200, 27.1000, 28.8000, 29.3000, 30.3000, 31.2000, 32.0000, 33.8000}
    };

    const int num_wei_a = 17;        // texels per texture row
    const int num_wind_inter = 360;  // forwarded to the kernel

    // Host buffers: the flattened input table and the interpolated result.
    float* p_A = (float*)malloc(2 * num_wei_a * sizeof(float));
    float* p_AInterp = (float*)malloc(numrow * numcol * sizeof(float));
    if (p_A == NULL || p_AInterp == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(p_A);
        free(p_AInterp);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < 2; ++i)
    {
        for (int j = 0; j < num_wei_a; ++j)
        {
            p_A[i * num_wei_a + j] = A[i][j];
        }
    }

    float* p_dev_AInterp = NULL;
    cutilSafeCall(cudaMalloc((void**)&p_dev_AInterp, numrow * numcol * sizeof(float)));

    // Launch just enough 16x16 blocks to cover the numcol x numrow output.
    dim3 block(16, 16);
    dim3 grid((numcol + block.x - 1) / block.x, (numrow + block.y - 1) / block.y);

    // Allocate the CUDA array in device memory.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

    // cudaMallocArray takes its width in ELEMENTS, not bytes. Passing
    // sizeof(float) * num_wei_a creates a 68-texel-wide array, and the 34
    // input floats then all land in row 0, leaving row 1 unwritten.
    cudaArray* cuArray = NULL;
    cutilSafeCall(cudaMallocArray(&cuArray, &channelDesc, num_wei_a, 2));

    // Copy the table row by row: 2 rows of num_wei_a floats, tightly packed
    // on the host.
    cutilSafeCall(cudaMemcpy2DToArray(cuArray, 0, 0, p_A,
                                      num_wei_a * sizeof(float),
                                      num_wei_a * sizeof(float), 2,
                                      cudaMemcpyHostToDevice));

    // Set the texture parameters before binding so the binding uses them.
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = false;

    // Bind the array to the texture reference.
    cutilSafeCall(cudaBindTextureToArray(texRef, cuArray));

    computeInterp<<<grid, block>>>(num_wind_inter, p_dev_AInterp);
    // A kernel launch returns no status of its own; pick up launch errors here.
    cutilSafeCall(cudaGetLastError());

    cutilSafeCall(cudaUnbindTexture(texRef));

    cutilSafeCall(cudaMemcpy(p_AInterp, p_dev_AInterp, numrow * numcol * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i = 0; i < numrow; i++)
    {
        for (int j = 0; j < numcol; j++)
        {
            printf("pAInterp[%d][%d]=%f\n", i, j, p_AInterp[i * numcol + j]);
        }
    }

    cutilSafeCall(cudaFree(p_dev_AInterp));
    cutilSafeCall(cudaFreeArray(cuArray));

    free(p_AInterp);
    free(p_A);

    return 0;
}

=============================

The interval is 30; to take only 12 elements from this interval, we compute the texture index tu as shown in the kernel.

My problem is that the interpolation is done only for the first row of WeibullA. How do I perform the interpolation for the second row?