I have a very simple kernel to copy data from the texture to linear device memory at d2_tex. However, I get unexpected results. This being my first code, I am pretty sure I am missing something. Please help. Data stored in host memory at h_tex is copied to d_tex in device memory. Texture reference tex is bound to linear memory d_tex and this is copied to d2_tex in the kernel. The code below should print 32 5s followed by 32 10s. However, it prints 64 5s.
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
texture<int, 1, cudaReadModeElementType> tex;
global void TextureCopyKernel(int*,int*);
int main ()
{
int d_tex=NULL, d2_tex=NULL;
int size = 64sizeof(int);
int h_tex[64],h2_tex[64];
for (int i=0;i<=63;i++)
{
h_tex[i]=(i<32)?5:10; // assigning data to the hosr memory
}
cudaMalloc((void*) &d_tex, size);
cudaMalloc((void**) &d2_tex, size);
cudaMemcpy(d_tex, h_tex, size, cudaMemcpyHostToDevice);
cudaBindTexture(0, tex, d_tex, size);
int dimGrid = 2;
int dimBlock = 32;
TextureCopyKernel<<<dimGrid, dimBlock>>>(d_tex, d2_tex);
cudaMemcpy(h2_tex, d2_tex, size, cudaMemcpyDeviceToHost);
for (int i=0;i<64;i++)
{
printf(“\n %d \n”,h2_tex[i]);
}
cudaFree(d_tex);
cudaFree(d2_tex);
}
global void TextureCopyKernel( int *g_idata, int g_odata)
{
// calculate normalized texture coordinates
unsigned int x = blockIdx.xblockDim.x + threadIdx.x;
if (x<32)
{
g_odata=tex1Dfetch(tex,x);
}
}