tex2D help

Hello,

I am having some trouble with tex2D; I have realized that I do not fully understand how tex2D(reference, col, row) works. In this simple sample I want to copy a 2D matrix from the host into a device texture and then copy it back into a 2D matrix on the host. It is very simple, but something does not work, because the output is:

0 1 2 3 4 5 6 7 8 9

9 9 9 9 9 9 9 9 9 9

9 9 9 9 9 9 9 9 9 9

but it should be:

0 1 2 3 4 5 6 7 8 9

10 11 12 13 14 15 16 17 18 19

20 21 22 23 24 25 26 27 28 29

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include "cuda_texture_types.h"

#include "texture_fetch_functions.h"

#include "texture_types.h"

#include <stdio.h>

// File-scope 2D texture reference with int texels. main() binds it to pitched
// device memory via cudaBindTexture2D, and kernel() reads it with tex2D.
texture<int, 2> textureD;

/**
 * Copies the 2D texture bound to textureD into a dense row-major buffer.
 *
 * Expected launch layout: a 2D grid of 2D blocks in which the x dimension
 * indexes columns and the y dimension indexes rows. Each thread handles the
 * single element (row, col) derived from its global 2D index. A 1D launch
 * leaves threadIdx.y == 0 for every thread, so only row 0 would be produced.
 *
 * @param dOutput device buffer holding at least width * height ints,
 *                written as dOutput[row * width + col]
 * @param width   number of columns of the matrix
 * @param height  number of rows of the matrix
 */
__global__ void kernel(int *dOutput, int width, int height)
{
	const int row = blockIdx.y * blockDim.y + threadIdx.y;
	const int col = blockIdx.x * blockDim.x + threadIdx.x;

	// Threads that fall outside the matrix must not write: without this guard
	// an oversized launch would store past the end of a row (or of dOutput).
	if (row >= height || col >= width)
	{
		return;
	}

	// tex2D takes the x (column) coordinate first, then y (row).
	dOutput[row * width + col] = tex2D(textureD, col, row);
}

// Evaluates a CUDA runtime call; on failure prints the error string with the
// source location to stderr and makes the enclosing function return 1.
// Only usable inside functions that return int (such as main below).
#define CUDA_CHECK(call)                                                   \
	do {                                                                   \
		const cudaError_t status_ = (call);                                \
		if (status_ != cudaSuccess) {                                      \
			fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
			        __LINE__, cudaGetErrorString(status_));                \
			return 1;                                                      \
		}                                                                  \
	} while (0)

// Prints a row-major width x height int matrix, one row per output line.
static void printMatrix(const int *matrix, int width, int height)
{
	for (int row = 0; row < height; row++)
	{
		for (int col = 0; col < width; col++)
		{
			printf("%d ", matrix[row * width + col]);
		}
		printf("\n");
	}
}

/**
 * Round-trips a small matrix through a 2D texture:
 *   host matrix -> pitched device memory -> bound to textureD ->
 *   kernel copies every texel to a dense device buffer -> host.
 * Prints the input matrix, a blank line, then the matrix read back.
 *
 * @return 0 on success, 1 if any CUDA call fails
 */
int main()
{
	const int width = 10;
	const int height = 10;
	const int size = width * height;

	// Input matrix in pinned host memory, filled with 0, 1, 2, ... row by row.
	int *h;
	CUDA_CHECK(cudaHostAlloc<int>(&h, size * sizeof(int), cudaHostAllocDefault));
	for (int i = 0; i < size; i++)
	{
		h[i] = i;
	}

	// Pitched device allocation; the pitch (row stride in bytes) returned here
	// is what the 2D copy and the texture binding must use.
	int *d;
	size_t pitch;
	CUDA_CHECK(cudaMallocPitch<int>(&d, &pitch, width * sizeof(int), height));
	CUDA_CHECK(cudaMemcpy2D(d, pitch, h, width * sizeof(int), width * sizeof(int), height, cudaMemcpyHostToDevice));

	cudaChannelFormatDesc channel = cudaCreateChannelDesc<int>();
	CUDA_CHECK(cudaBindTexture2D(NULL, &textureD, d, &channel, width, height, pitch));

	int *hOutput;
	CUDA_CHECK(cudaHostAlloc<int>(&hOutput, size * sizeof(int), cudaHostAllocDefault));

	int *dOutput;
	CUDA_CHECK(cudaMalloc<int>(&dOutput, size * sizeof(int)));

	// The kernel derives (row, col) from the y and x thread indices, so the
	// block has to be two-dimensional. A 1D launch of width * height threads
	// leaves threadIdx.y at 0 for every thread, which makes every thread read
	// texture row 0. One block of exactly width x height threads covers the
	// matrix with no spare threads; this only works while width * height stays
	// within the device's threads-per-block limit (a violation is reported by
	// the cudaGetLastError check below).
	const dim3 block(width, height);
	const dim3 grid(1, 1);
	kernel<<<grid, block>>>(dOutput, width, height);
	CUDA_CHECK(cudaGetLastError());

	// Blocking copy: also waits for the kernel and surfaces execution errors.
	CUDA_CHECK(cudaMemcpy(hOutput, dOutput, size * sizeof(int), cudaMemcpyDeviceToHost));

	printMatrix(h, width, height);
	printf("\n");
	printMatrix(hOutput, width, height);

	// Keep the console window open until a key is pressed.
	getchar();

	// Release the texture binding before freeing the memory it refers to.
	CUDA_CHECK(cudaUnbindTexture(&textureD));
	CUDA_CHECK(cudaFreeHost(h));
	CUDA_CHECK(cudaFree(d));
	CUDA_CHECK(cudaFree(dOutput));
	CUDA_CHECK(cudaFreeHost(hOutput));

	return 0;
}

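Oops, my mistake was in the kernel call: it was launched with a one-dimensional block (`<<<1, width * height>>>`), so `threadIdx.y` was always 0 and every thread read row 0 of the texture. Launching with a two-dimensional block such as `dim3(width, height)` fixes it… now it works.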