Hello to All,
I am trying to make some matrix computation, and I am using cudaMemcpy2D and cudaMallocPitch.
Since I am having some trouble, I developed a simple kernel, which copy a matrix into another.
Here it is the code:
[codebox]global void matrixCopy(float* a, float* c, int a_pitch, int c_pitch, int width)
{
int x = blockIdx.x*blockDim.x+threadIdx.x;
int y = blockIdx.y*blockDim.y+threadIdx.y;
c[yc_pitch+x]=a[ya_pitch+x];
}
void matrixCopyCaller(float* hA, float* hC, int width, float& compute_time)
{
float* dA;
size_t matA_gpu_pitch;
float* dC;
size_t matC_gpu_pitch;
//int memsize=widthwidthsizeof(float);
unsigned int timer = 0;
cudaSetDevice(0);
cudaMallocPitch((void**)&dA,&matA_gpu_pitch, width*sizeof(float),width);
cudaMemcpy2D(dA, matA_gpu_pitch, hA, widthsizeof(float), widthsizeof(float),width,cudaMemcpyHostToDevice);
cudaMallocPitch((void**)&dC,&matC_gpu_pitch, width*sizeof(float),width);
dim3 block(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(width/block.x,width/block.y);
matrixMul<<<grid,block>>>(dA,dB,dC, matA_gpu_pitch, matB_gpu_pitch, matC_gpu_pitch, width);
cudaThreadSynchronize();
cudaMemcpy2D(hC, widthsizeof(float), dC, matC_gpu_pitch, widthsizeof(float), width, cudaMemcpyDeviceToHost);
cudaFree(dA);
cudaFree(dC);
}
[/codebox]
could you explain me why it does not work?
Thanks in advance
Francesco