Hello everyone,
I am having a lot of trouble trying to retrieve the matrix element A(m,n) from a 2D CUDA array. Interestingly, I obtain the correct element when m = n; i.e. the element is along the diagonal. Otherwise, I get some unexpected behavior; if, for example, I want to fetch element A(13,12), and I attempt to retrieve it using tex2D(tex, row + 0.5f, col + 0.5f), I get A(14,11). As far as I know, I am doing everything as I should, so I am really interested in knowing where I went wrong.
Below follows the kernel I am talking about, for reference. The error occurs right after the first two tex2D fetches, so the rest is not relevant. It is an implementation of matrix multiplication that uses tiling and prefetching, and works correctly when I do not use texture memory.
Thank you very much for your help!
texture<float, 2, cudaReadModeElementType> tex_a;
texture<float, 2, cudaReadModeElementType> tex_b;
// Assume that BinaryFunc is multiplication, and AccumulationFunc is addition,
// so that the kernel computes the standard matrix product.
template <unsigned TileSize, class T, class SizeType, class BinaryFunc,
class AccumulationFunc>
__global__ void
matrix_prod_tex_prefetch(T* c, const SizeType dim, BinaryFunc binary_func,
AccumulationFunc accum_func)
{
__shared__ T as[TileSize][TileSize];
__shared__ T bs[TileSize][TileSize];
SizeType row = blockIdx.y * TileSize + threadIdx.y;
SizeType col = blockIdx.x * TileSize + threadIdx.x;
T p = 0;
T l = tex2D(tex_a, row + 0.5f, threadIdx.x + 0.5f);
T m = tex2D(tex_b, threadIdx.y + 0.5f, col + 0.5f);
// ERROR: l and m are incorrect if row != col.
__syncthreads();
for (SizeType i = 1; i != dim / TileSize; ++i) {
as[threadIdx.y][threadIdx.x] = l;
bs[threadIdx.y][threadIdx.x] = m;
__syncthreads();
l = tex2D(tex_a, row + 0.5f, i * TileSize + threadIdx.x + 0.5f);
m = tex2D(tex_b, i * TileSize + threadIdx.y + 0.5f, col + 0.5f);
for (SizeType k = 0; k != TileSize; ++k) {
p = accum_func(p, binary_func(
as[threadIdx.y][k],
bs[k][threadIdx.x]
));
}
__syncthreads();
}
as[threadIdx.y][threadIdx.x] = l;
bs[threadIdx.y][threadIdx.x] = m;
__syncthreads();
for (SizeType k = 0; k != TileSize; ++k) {
p = accum_func(p, binary_func(
as[threadIdx.y][k],
bs[k][threadIdx.x]
));
}
c[dim * row + col] = p;
}