Texture fetching problem: the tex1Dfetch function is not working...

Hi,

I need some help with using texture memory and fetching from a 1D texture.

In the small attached example, I create two tables: one that is filled with the thread indexes, and another that is a copy of the first, read back through a texture fetch. Each thread copies one element from the source table to the destination.

And for some reason, the result is a table full of zeros.

Any ideas?

OK, maybe I’m dumb, but I can’t manage to attach my code, so here it is:

#include <cutil.h>

#include <cuda.h>

#include <stdio.h>

// File-scope texture reference over float elements, 1-dimensional, read as the
// raw element type. main() binds it to tab_d and myker2 reads through it.
// NOTE(review): texture references are the legacy texture API; check that the
// toolkit in use still supports them before building.
texture<float, 1, cudaReadModeElementType> tex;

// Fills the first table: every thread stores its own threadIdx.x, converted to
// float, into the element of t that has the same index. Only threadIdx.x is
// used, so the index does not depend on the block the thread belongs to.
__global__ void myker(float *t){
  const unsigned int tid = threadIdx.x;
  t[tid] = tid;
}

// Does the copy: every thread fetches element threadIdx.x through the
// file-scope texture reference `tex` and writes it to the same index of r.
__global__ void myker2(float *r){
  const unsigned int tid = threadIdx.x;
  const float fetched = tex1Dfetch(tex, tid);
  r[tid] = fetched;
}

// Host driver for the texture-fetch experiment. It allocates two 16-float
// device buffers, launches myker on tab_d, binds tab_d to the texture
// reference `tex`, launches myker2 to copy through the texture into r_d, and
// prints both tables side by side.
//
// NOTE(review): both launches below are written <<< BLOCK, GRID >>>, but the
// execution configuration order is <<< grid, block >>>. As written, each launch
// starts 16 blocks of a single thread, threadIdx.x is 0 in every thread, and
// only element 0 of each buffer is ever written.
int main(){

  // Host-side copy of the source table.
  float tab[16];

  // Device buffer that myker writes and that gets bound to `tex`.
  float *tab_d;

  // Host-side copy of the destination table.
  float r[16];

  // Device buffer that myker2 writes.
  float *r_d;

 CUDA_SAFE_CALL( cudaMalloc((void**) &tab_d, 16*sizeof(float)));

  CUDA_SAFE_CALL( cudaMalloc((void**) &r_d, 16*sizeof(float)));

  // BLOCK holds 16 x 1, GRID holds 1 x 1.
 dim3 BLOCK(16, 1);

  dim3 GRID(1,1);

  // NOTE(review): BLOCK is passed in the grid position and GRID in the block
  // position here.
 myker <<< BLOCK, GRID >>> (tab_d);

  // Bind the 16 floats of tab_d to `tex`; 0 is passed for the offset output.
  CUDA_SAFE_CALL( cudaBindTexture( 0, tex, tab_d,  16*sizeof(float)));

// Copy the source table back to the host so it can be printed.

  CUDA_SAFE_CALL( cudaMemcpy(tab, tab_d, 16*sizeof(float) , cudaMemcpyDeviceToHost));

  // NOTE(review): same swapped launch configuration as the myker launch.
 myker2 <<< BLOCK, GRID >>> (r_d);

  // Copy the destination table back to the host.
  CUDA_SAFE_CALL( cudaMemcpy(r, r_d,  16*sizeof(float), cudaMemcpyDeviceToHost));

  // Print source and destination values side by side, one index per line.
 for(int i = 0; i < 16; i++)

    printf("tab[%d]= %f, r[%d] =  %f\n", i, tab[i], i,  r[i]);

  // Release the texture binding and both device buffers.
 CUDA_SAFE_CALL( cudaUnbindTexture(tex));

  CUDA_SAFE_CALL( cudaFree(tab_d));

  CUDA_SAFE_CALL( cudaFree(r_d));

  return 0;

}

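OK, after some hours of tests and tears, I’ve found what was wrong… including the kernel launches! I had swapped the two launch arguments: the execution configuration is <<< GRID, BLOCK >>>, not <<< BLOCK, GRID >>>, so I was launching 16 blocks of one thread each and threadIdx.x was always 0.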

Here is my solution, in case someone needs it:

#include <cutil.h>

#include <cuda.h>

#include <stdio.h>

// File-scope texture reference over float elements, 1-dimensional, read as the
// raw element type. main() binds it to tab_d and myker2 reads through it.
// NOTE(review): texture references are the legacy texture API; check that the
// toolkit in use still supports them before building.
texture<float, 1, cudaReadModeElementType> tex;

// Fills the source table: every thread writes (threadIdx.x + 10), converted to
// float, into the element of t at index threadIdx.x. The +10 shift keeps valid
// results distinguishable from a buffer that merely contains zeros.
// Expects a launch whose threads are laid out along x; only threadIdx.x is used.
__global__ void myker(float *t){
  const unsigned int slot = threadIdx.x;
  const unsigned int value = slot + 10;
  t[slot] = value;
}

// Copies through the texture: every thread fetches element threadIdx.x via the
// file-scope texture reference `tex` and stores it at the same index of r.
// The texture must already be bound when this kernel runs.
__global__ void myker2(float *r){
  const unsigned int slot = threadIdx.x;
  const float fetched = tex1Dfetch(tex, slot);
  r[slot] = fetched;
}

// Host driver for the texture-fetch example.
//
// Steps:
//   1. allocate two device buffers of N floats (tab_d = source, r_d = copy);
//   2. bind tab_d to the texture reference `tex`;
//   3. launch myker to fill tab_d, then myker2 to copy it into r_d via
//      tex1Dfetch;
//   4. copy both buffers back and print them side by side;
//   5. unbind the texture and free the device buffers.
//
// Both kernels are launched as one block of N threads, written
// <<< GRID, BLOCK >>>: the grid dimensions come first, the block dimensions
// second. Returns 0.
int main(){
  // Element count; also the number of threads in the single block launched.
  const int N = 16;
  const size_t bytes = N * sizeof(float);

  float tab[N];          // host copy of the source table
  float r[N];            // host copy of the table read through the texture
  float *tab_d = 0;      // device source buffer, bound to `tex`
  float *r_d = 0;        // device destination buffer
  size_t offset = 0;     // byte offset reported by cudaBindTexture

  CUDA_SAFE_CALL( cudaMalloc((void**) &tab_d, bytes));
  CUDA_SAFE_CALL( cudaMalloc((void**) &r_d, bytes));

  // myker2 fetches with the plain thread index, i.e. it assumes the reported
  // offset is 0. NOTE(review): the runtime documentation states this holds for
  // pointers returned by cudaMalloc - confirm for the toolkit in use.
  CUDA_SAFE_CALL( cudaBindTexture( &offset, tex, tab_d, bytes));

  dim3 BLOCK(N, 1);
  dim3 GRID(1, 1);

  // A kernel launch returns no status; query it explicitly so that a bad
  // execution configuration is reported instead of silently yielding zeros.
  myker <<< GRID, BLOCK >>> (tab_d);
  CUDA_SAFE_CALL( cudaGetLastError());

  myker2 <<< GRID, BLOCK >>> (r_d);
  CUDA_SAFE_CALL( cudaGetLastError());

  // The blocking copies also wait for the kernels above to finish.
  CUDA_SAFE_CALL( cudaMemcpy(r, r_d, bytes, cudaMemcpyDeviceToHost));
  CUDA_SAFE_CALL( cudaMemcpy(tab, tab_d, bytes, cudaMemcpyDeviceToHost));

  for(int i = 0; i < N; i++)
    printf("tab[%d]= %f, r[%d] =  %f\n", i, tab[i], i,  r[i]);

  CUDA_SAFE_CALL( cudaUnbindTexture(tex));
  CUDA_SAFE_CALL( cudaFree(tab_d));
  CUDA_SAFE_CALL( cudaFree(r_d));

  return 0;
}