2D texture bound to pitch linear memory doesn't access the correct values

I’m trying to perform a simple addition with textures:

  1. I allocate an array with cudaMallocPitch

  2. I bind my texture to this array

  3. In a kernel I add one to every texel and write back to a cudaMalloc array (linear memory)

  4. I don’t get the results I want (i.e. the output is not the original data plus 1)

#include<stdio.h>
#include<cuda.h>
#include<iostream>

#define height 16
#define width 11
#define BLOCKSIZE 16

using namespace std;

// Device Kernels

// Texture reference Declaration
texture<float,2> texRefEx;

__global__ void kernel_w_textures(float* devMPPtr, float * devMPtr, int pitch)
{
	// Thread indexes
	unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
	unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;

	// Texture Coordinates
	float u=(idx)/float(width);
	float v=(idy)/float(height);
	devMPtr[idy*width+idx]=devMPPtr[idy*pitch/sizeof(float)+idx];
	// Write Texture Contents to malloc array +1
	devMPtr[idy*width+idx]= tex2D(texRefEx,u,v)+1.0f;
}

//Host Code
int main()
{
	// memory size
	size_t memsize=height*width;
	float * data,		// input from host
		*h_out,		// host space for output
		*devMPPtr,	// malloc Pitch ptr
		*devMPtr;	// malloc ptr
	size_t pitch;

	// Allocate space on the host
	data=(float *)malloc(sizeof(float)*memsize);
	h_out=(float *)malloc(sizeof(float)*memsize);

	// Define data
	for (int i = 0; i < height; i++)
		for (int j=0; j < width; j++)
			data[i*width+j]=float(j);

	// Define the grid
	dim3 grid((int)(width/BLOCKSIZE)+1,(int)(height/BLOCKSIZE)+1), threads(BLOCKSIZE,BLOCKSIZE);

	// allocate Malloc Pitch
	cudaMallocPitch((void**)&devMPPtr,&pitch, width * sizeof(float), height);

	// Print the pitch
	printf("The pitch is %d \n",pitch/sizeof(float));

	// Texture Channel Description
	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

	// Bind texture to pitch mem:
	cudaBindTexture2D(0,&texRefEx,devMPPtr,&channelDesc,width,height,pitch);

	// Set mutable properties:
	texRefEx.normalized=true;
	texRefEx.addressMode[0]=cudaAddressModeWrap;
	texRefEx.addressMode[1]=cudaAddressModeWrap;
	texRefEx.filterMode= cudaFilterModePoint;

	// Allocate cudaMalloc memory
	cudaMalloc((void**)&devMPtr,memsize*sizeof(float));

	// Read data from host to device
	cudaMemcpy2D((void*)devMPPtr,pitch,(void*)data,sizeof(float)*width,
			sizeof(float)*width,height,cudaMemcpyHostToDevice);

	// Read back and check this memory
	cudaMemcpy2D((void*)h_out,width*sizeof(float),(void*)devMPPtr,pitch,
			sizeof(float)*width,height,cudaMemcpyDeviceToHost);

	// Print the memory
	for (int i=0; i<height; i++){
		for (int j=0; j<width; j++){
			printf("%2.2f ",h_out[i*width+j]);
		}
		cout << endl;
	}
	cout << "Done" << endl;

	// Memory is fine...
	kernel_w_textures<<<grid,threads>>>(devMPPtr, devMPtr, pitch);

	// Copy back data to host
	cudaMemcpy((void*)h_out,(void*)devMPtr,width*height*sizeof(float),cudaMemcpyDeviceToHost);

	// Print the Result
	cout << endl;
	for (int i=0; i<height; i++){
		for (int j=0; j<width; j++){
			printf("%2.2f ",h_out[i*width+j]);
		}
		cout << endl;
	}
	cout << "Done" << endl;

	return(0);
}

I can’t figure out why my results are wrong (it looks like I’ve bound everything according to the programming guide). On a side note, this program works fine for square dimensions. I’ve been having issues with this for a while and have not seen a similar problem reported, although I’m sure it’s something easy to fix.

Perhaps the issue is due to your graphics card hardware. It is possible that the texture dimensions must be powers of 2, since that is normally how texture memory is packed. I believe the grid must have power-of-two dimensions (e.g. 128 x 64), unless the graphics card supports textures with non-power-of-two dimensions.

I see three issues with the posted code:

(1) The computation of texture coordinates does not account for the (0.5,0.5) offset (i.e. the center of each texel). Proposed fix:

// Texture Coordinates
float u = (idx + 0.5f) / WIDTH;
float v = (idy + 0.5f) / HEIGHT;
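To see why the missing offset matters: with normalized coordinates and point filtering, texel j spans u in [j/WIDTH, (j+1)/WIDTH) and its center is at (j + 0.5)/WIDTH. The original u = idx/float(width) lands exactly on the left edge of texel idx. For width = 16 the conversion back to a texel index is exact, but for width = 11 the rounded float result can fall just below the edge, in which case point filtering fetches texel idx-1 instead. That would be consistent with the observation that the square 16 x 16 case happened to work.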

(2) The kernel code does not guard against threads that write outside the result array. Proposed fix:

if ((idx < WIDTH) && (idy < HEIGHT)) {
    [...]
}
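To put numbers on it: the original launch uses grid = ((width/BLOCKSIZE)+1, (height/BLOCKSIZE)+1) = (1, 2) with 16 x 16 blocks, i.e. 16 x 32 threads for an 11 x 16 array, so every thread with idx >= 11 or idy >= 16 writes past the end of devMPtr.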

(3) [minor] The grid computation can overestimate the size of the grid needed. Proposed improvement:

dim3 grid((int)(WIDTH  + BLOCKSIZE - 1) / BLOCKSIZE,
          (int)(HEIGHT + BLOCKSIZE - 1) / BLOCKSIZE);
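For example, with height = 16 and BLOCKSIZE = 16 the original expression (height/BLOCKSIZE)+1 yields 2 block rows where (HEIGHT + BLOCKSIZE - 1)/BLOCKSIZE yields 1. The extra row is harmless once the bounds check from (2) is in place, but it launches a full row of blocks that do no work whenever a dimension is an exact multiple of BLOCKSIZE.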

BTW, the texture defaults should be fine except for the coordinate normalization mode, so apart from that, no explicit setup is needed. Here is the entire program modified according to the comments above, lightly tested on a C2050 and a Quadro FX5800.

#include<stdio.h>
#include<cuda.h>
#include<iostream>

#define USE_TEXTURE 1
#define HEIGHT      17
#define WIDTH       18
#define BLOCKSIZE   16
#define ADD         1.f

using namespace std;

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Device Kernels

// Texture reference Declaration
texture<float,2> texRefEx;

__global__ void kernel_w_textures (float* devMPPtr, float * devMPtr, int pitch)
{
    // Thread indexes
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if ((idx < WIDTH) && (idy < HEIGHT)) {
#if USE_TEXTURE
        // Texture Coordinates
        float u = (idx + 0.5f) / WIDTH;
        float v = (idy + 0.5f) / HEIGHT;
        devMPtr[idy*WIDTH+idx] = tex2D (texRefEx, u, v) + ADD;
#else  /* USE_TEXTURE */
        devMPtr[idy*WIDTH+idx] = devMPPtr[idy*(pitch/sizeof(float))+idx] + ADD;
#endif /* USE_TEXTURE */
    }
}

//Host Code
int main()
{
    // memory size
    size_t memsize=HEIGHT*WIDTH;
    float *data,         // input from host
          *h_out,        // host space for output
          *devMPPtr,     // malloc Pitch ptr
          *devMPtr;      // malloc ptr
    size_t pitch;
    size_t texOfs;

    // Allocate space on the host
    data=(float *)malloc(sizeof(float)*memsize);
    h_out=(float *)malloc(sizeof(float)*memsize);

    // Define data
    for (int i = 0; i < HEIGHT; i++) {
        for (int j = 0; j < WIDTH; j++) {
            data[i*WIDTH+j] = float(i*100+j);
        }
    }
    printf ("array dimensions: %d x %d\n", HEIGHT, WIDTH);
    printf ("Read access via: %s\n", USE_TEXTURE ? "texture" : "array");

    // Define the grid
    dim3 threads(BLOCKSIZE, BLOCKSIZE);
    dim3 grid((int)(WIDTH  + BLOCKSIZE - 1) / BLOCKSIZE,
              (int)(HEIGHT + BLOCKSIZE - 1) / BLOCKSIZE);
    printf ("grid = %d x %d   block = %d x %d\n",
            grid.x, grid.y, threads.x, threads.y);

    // allocate Malloc Pitch
    CUDA_SAFE_CALL (cudaMallocPitch ((void**)&devMPPtr, &pitch,
                                     WIDTH * sizeof(float), HEIGHT));

    // Print the pitch
    printf ("Pitch in elements is %d\n", (int)(pitch / sizeof(float)));

    // Texture Channel Description
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

    // Set mutable properties (defaults except for coordinate normalization)
    texRefEx.normalized = true;

    // Bind texture to pitch mem
    CUDA_SAFE_CALL (cudaBindTexture2D (&texOfs, &texRefEx, devMPPtr,
                                       &channelDesc, WIDTH, HEIGHT, pitch));
    printf ("texOfs = %lld\n", (long long)texOfs);

    // Allocate cudaMalloc memory
    CUDA_SAFE_CALL (cudaMalloc ((void**)&devMPtr, memsize*sizeof(float)));

    // Read data from host to device
    CUDA_SAFE_CALL (cudaMemcpy2D ((void*)devMPPtr, pitch,
                                  (void*)data, sizeof(float) * WIDTH,
                                  sizeof(float) * WIDTH, HEIGHT,
                                  cudaMemcpyHostToDevice));

    // Read back and check this memory
    CUDA_SAFE_CALL (cudaMemcpy2D ((void*)h_out, WIDTH * sizeof(float),
                                  (void*)devMPPtr, pitch,
                                  sizeof(float) * WIDTH, HEIGHT,
                                  cudaMemcpyDeviceToHost));

    // Print the memory
    for (int i = 0; i < HEIGHT; i++){
        for (int j = 0; j < WIDTH; j++){
            printf ("%4.0f ", h_out[i*WIDTH+j]);
        }
        cout << endl;
    }
    cout << "Done" << endl;

    // Memory is fine...
    kernel_w_textures<<<grid,threads>>>(devMPPtr, devMPtr, pitch);

    // Copy back data to host
    CUDA_SAFE_CALL (cudaMemcpy ((void*)h_out, (void*)devMPtr,
                                WIDTH * HEIGHT * sizeof(float),
                                cudaMemcpyDeviceToHost));

    // Print the Result
    cout << endl;
    for (int i = 0; i < HEIGHT; i++){
        for (int j = 0; j < WIDTH; j++){
            printf ("%4.0f ", h_out[i*WIDTH+j]);
        }
        cout << endl;
    }
    cout << "Done" << endl;

    return(0);
}
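A forward-looking note for readers on newer toolkits: texture references such as texRefEx were later deprecated and no longer compile with CUDA 12 and later, where the same binding is expressed through a texture object. Below is a minimal sketch of the equivalent host-side setup, reusing devMPPtr, pitch, WIDTH, HEIGHT and CUDA_SAFE_CALL from the program above (texObj is a name introduced here for illustration):

// Describe the pitched allocation as a 2D texture resource
cudaResourceDesc resDesc = {};
resDesc.resType                  = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr       = devMPPtr;
resDesc.res.pitch2D.desc         = cudaCreateChannelDesc<float>();
resDesc.res.pitch2D.width        = WIDTH;
resDesc.res.pitch2D.height       = HEIGHT;
resDesc.res.pitch2D.pitchInBytes = pitch;

// Sampling state; normalizedCoords plays the role of texRefEx.normalized = true
cudaTextureDesc texDesc = {};
texDesc.normalizedCoords = 1;
texDesc.filterMode       = cudaFilterModePoint;
texDesc.readMode         = cudaReadModeElementType;

cudaTextureObject_t texObj = 0;
CUDA_SAFE_CALL (cudaCreateTextureObject (&texObj, &resDesc, &texDesc, NULL));
// ... launch the kernel with texObj as an argument, sampling via tex2D<float>(texObj, u, v) ...
CUDA_SAFE_CALL (cudaDestroyTextureObject (texObj));

The kernel would then take an extra cudaTextureObject_t parameter instead of referencing the file-scope texRefEx; the (idx + 0.5f) texel centering and the bounds check carry over unchanged.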