Trying this matrix vector multiplication. The code is simple and it should work fine, but i get some garbage printed…
Got me cramming for hours and still nothing.
I found some posts asking why their memcpy wouldn’t work, but I hope this is not the case.
Any ideas how to fix it?
[codebox]#include<stdio.h>
global void prodMatVetGpu(float *A, float *x, int m, int n, float *pblocks){
/* Returns pblocks on global mem */
extern __shared__ float p[];
int bdimx = blockDim.x;
int bidx = blockIdx.x;
int bidy = blockIdx.y;
int tidx = threadIdx.x;
int i;
/* Block multiplications */
p[tidx] = A[n * bidy + bdimx * bidx + tidx] * x[bdimx * bidx + tidx];
__syncthreads();
/* Simple reduction of p */
for(i=bdimx/2; i>0; i=i/2){
if(tidx < i){
p[tidx] = p[tidx] + p[tidx + i];
}
__syncthreads();
}
/* Thread 0 copies to global mem */
if(tidx == 0) pblocks[bidy * (n/bdimx) + bidx] = p[0];
}
int main(){
float *Ac, *Ag; /* matrix A/cpu and matrix A/gpu */
float *vc, *vg; /* vector v/cpu and vector v/gpu */
float *Axc, *Axg; /* partial results per block */
float *y; /* final result */
int m, n; /* A's m per n matrix (9 x 8 in this example) */
int i, j;
/* Initializes and allocates mem */
m = 20;
n = 8;
int tBlock = 8;
dim3 tamGrid(m,(n/tBlock),1);
dim3 tamBlock(tBlock,1,1);
Ac = (float*)malloc(m * n * sizeof(float));
vc = (float*)malloc(n * sizeof(float));
Axc = (float*)malloc(m * (n/tBlock) * sizeof(float));
y = (float*)malloc(m * sizeof(float));
cudaMalloc((void**)&Ag, m * n * sizeof(float));
cudaMalloc((void**)&vg, n * sizeof(float));
cudaMalloc((void**)&Axg, m * (n/tBlock) * sizeof(float));
/* Initializes vectors */
for(i=0; i<m; i++)
for(j=0; j<n; j++)
Ac[i * n + j] = 1.0;
for(j=0; j<n; j++)
vc[j] = 1.0;
for(i=0; i<m; i++)
y[i] = 0.0;
cudaMemcpy(Ag, Ac, m * n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(vg, vc, n * sizeof(float), cudaMemcpyHostToDevice);
/* Computation */
/* Decomposes A in rows of flattened blocks */
prodMatVetGpu<<<tamGrid, tamBlock, tBlock * sizeof(float)>>>(Ag, vg, m, n, Axg);
cudaMemcpy(Axc, Axg, m * (n/tBlock) * sizeof(float), cudaMemcpyDeviceToHost);
for(i=0; i<m; i++)
for(j=0; j<(n/tBlock); j++)
y[i] = y[i] + Axc[i * (n/tBlock) + j];
/* print results */
for(i=0; i<m; i++) printf("%1.0f ", y[i]);
/* Frees mem */
free(Ac); free(vc); free(Axc); free(y);
cudaFree(Ag); cudaFree(vg); cudaFree(Axg);
return 0;
}
[/codebox]