hi, i am writing a program for convolution which is working fine otherwise but creates a problem when i use texture memory.
i managed solve the syntactical & compilation issues but perhaps i am missing some information which is causing this problem.
i have a matrix named N(5x5) which i want to copy to the texture memory. the elements of the matrix are as follows:
[N] = 2 3 4 5 6
3 4 5 6 7
4 5 6 7 8
5 6 7 8 9
6 7 8 9 10
when i access the values in the device code (before the computation) they are somewhat like 2, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5
i cant understand why? please help, thanks…
#include<stdio.h>
#include<cuda.h>
texture<float, 1, cudaReadModeElementType> tex;
__global__ void MatrixMulKernel(float* M, float* P, int Width)
{
float Pvalue = 0;
int modval = (Width - 1) / 2;
for (int k = 0; k < Width; ++k)
{
for (int j = 0; j < Width; ++j)
{
float Melement = M[k*Width+j];
float Nelement = 0;
if (threadIdx.y-modval+k >= 0 && threadIdx.y-modval+k < 5 && threadIdx.x-modval+j >= 0 && threadIdx.x-modval+j < 5)
{
Nelement = tex1D(tex, float((threadIdx.y-modval+k)*5+(threadIdx.x-modval+j)));
printf("k = %d\tj = %d\ttx = %d\tty = %d\tnelem = %f\n", k, j, threadIdx.x, threadIdx.y, Nelement);
}
Pvalue += Melement * Nelement;
}
}
P[threadIdx.y*5+threadIdx.x] = Pvalue;
}
int main()
{
float M[3][3], N[5][5], P[5][5];
float *Md, *Nd, *Pd;
int height = 5;
int width = 5;
int wd = 3;
int size = width * height * sizeof(float);
cudaMalloc((void **)&Md, 9*sizeof(float));
cudaMalloc((void **)&Nd, 25*sizeof(float));
cudaMalloc((void **)&Pd, size);
int i, j;
for (i = 0; i < 3; i++)
for (j = 0; j < 3; j++)
{
M[i][j] = i+j+1;
}
for (i = 0; i < 5; i++)
for (j = 0; j < 5; j++)
{
N[i][j] = i+j+2;
P[i][j] = 0;
}
cudaMemcpy(Md, &M, 9*sizeof(float), cudaMemcpyHostToDevice);
cudaArray* cuArray;
cudaMallocArray (&cuArray, &tex.channelDesc, width*height, 1);
cudaMemcpyToArray(cuArray, 0, 0, &N, sizeof(float)*width*height, cudaMemcpyHostToDevice); // bind a texture to the CUDA array
cudaBindTextureToArray(tex, cuArray);
tex.normalized = false; tex.filterMode = cudaFilterModeLinear;
dim3 dimGrid(1, 1);
dim3 dimBlock(5, 5);
MatrixMulKernel <<< dimGrid, dimBlock >>> (Md, Pd, wd);
cudaMemcpy(&P, Pd, sizeof(float)*width*height, cudaMemcpyDeviceToHost);
for (i = 0; i < 5; i++)
{
for (j = 0; j < 5; j++)
{
printf("P[%d][%d] = %f\t", i, j, P[i][j]);
}
printf("\n");
}
cudaUnbindTexture(tex);
cudaFreeArray(cuArray);
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
return 0;
}