All I'm trying to do is add the elements of one matrix to another, using textures. The idea is to split the workload into groups of 8 elements and process them in parallel. In emulation mode the following example works fine, but when running on the GPU it seems that tex1D() always returns 0... I'm not quite sure what's going on...
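For example, with a 4x4 grid of 8x1 blocks, block (1,0) computes id = ((0*4) + 1) * 8 * 1 * 8 = 64, so it processes elements 64 through 71.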
Am I missing something here??
Here's a very simple example:
KERNEL:
__global__ void dAdd (float* xq)
{
    unsigned int id, str;
    // Linear block index: blockIdx.y * gridDim.x + blockIdx.x
    id = blockIdx.y; id *= gridDim.x; id += blockIdx.x;
    // Scale by the block size, then by the stride of 8 elements
    id *= blockDim.x; id *= blockDim.y;
    str = 8;
    id *= str;
    // Add 8 consecutive texels to the corresponding elements of xq
    for (unsigned int k = id; k < id + str; k++)
        xq[k] += tex1D(ts, k);   // always reads 0 on the GPU, works in emulation
}
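For what it's worth, my understanding is that with the default point filtering and unnormalized coordinates, tex1D(ts, k) should return texel k exactly, so even a stripped-down copy kernel like this (hypothetical name, same ts texture) ought to show the same behaviour:

__global__ void texCopy (float* xq)
{
    unsigned int k = blockIdx.x * blockDim.x + threadIdx.x;
    // With cudaFilterModePoint and unnormalized coordinates,
    // tex1D() at coordinate k should fetch texel k directly.
    xq[k] = tex1D(ts, (float)k);
}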
HOST:
void add (float* x, float* s)
{
    unsigned int msize = (Nx+2) * (Ny+2);   // number of elements
    size_t size = msize * sizeof(float);    // same amount in bytes
    float *d_x;
    cudaArray *ca_s;
    // cudaMallocArray takes the width in elements; the copies take bytes
    CUDA_SAFE_CALL(cudaMallocArray(&ca_s, &chDesc_float, msize));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_x, size));
    CUDA_SAFE_CALL(cudaMemcpyToArray(ca_s, 0, 0, s, size, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_x, x, size, cudaMemcpyHostToDevice));
    // Bind the array to the texture reference before the launch
    CUDA_SAFE_CALL(cudaBindTextureToArray(ts, ca_s, chDesc_float));
    dAdd <<< blockss, threadss >>> (d_x);   // blockss/threadss are set up elsewhere
    CUT_CHECK_ERROR("* Error\n");
    CUDA_SAFE_CALL(cudaMemcpy(x, d_x, size, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFreeArray(ca_s));
    CUDA_SAFE_CALL(cudaFree(d_x));
}
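In case the cudaArray path itself is the problem, the fallback I'm considering is binding the texture to plain linear memory and reading it with tex1Dfetch(), which takes an integer index. An untested sketch (ts_lin, dAddFetch, and d_s are names I made up):

texture<float, 1, cudaReadModeElementType> ts_lin;   // bound to linear memory instead of a cudaArray

__global__ void dAddFetch (float* xq, unsigned int n)
{
    unsigned int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k < n)
        xq[k] += tex1Dfetch(ts_lin, k);   // integer texel index, no filtering involved
}

// host side, replacing the cudaArray setup:
float* d_s;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_s, size));
CUDA_SAFE_CALL(cudaMemcpy(d_s, s, size, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaBindTexture(0, ts_lin, d_s, size));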
The id, blockss, and threadss values are definitely correct (tested in other programs).
The texture reference and channel descriptor are defined as:
texture<float, 1, cudaReadModeElementType> ts;
cudaChannelFormatDesc chDesc_float = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
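I'm not setting any texture attributes explicitly, so the defaults apply; if it makes a difference, I could pin them down on the host before binding, something like:

ts.filterMode = cudaFilterModePoint;        // no interpolation between texels
ts.normalized = false;                      // coordinates in [0, msize), not [0, 1)
ts.addressMode[0] = cudaAddressModeClamp;   // clamp out-of-range coordinates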
Running on a GeForce 9800 under Windows XP with CUDA 2.0.
Many thanks in advance!