Dear Friends,
I have a code for Very Simple Matrix Multiplication on CUDA.
Everything seems OK, but the results come back as 0. I have checked the timing for different dimension sizes and it looks fine.
I need your help.
Best Regards
My code is below:
// includes
#include <cutil_inline.h>
#include <shrUtils.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
// defines, project
// Matrices are square, N x N, stored row-major in flat arrays of N*N floats.
const int N = 1024;
// Edge length of the square thread block; each block has blocksize*blocksize threads.
// 16x16 = 256 threads is valid on every CUDA device. A 32x32 block needs 1024
// threads, which exceeds the 512-threads-per-block limit of compute capability 1.x
// hardware: the launch then fails and the output buffer is never written.
const int blocksize = 16;
/*
 * mul_matrix: computes c = a * b for square N x N float matrices stored
 * row-major in flat device arrays of N*N elements.
 *
 * Launch layout: a 2D grid of 2D blocks, one thread per output element.
 * The x index selects the output column and the y index selects the row.
 * Threads that fall outside the N x N matrix exit without touching memory,
 * so the grid may overshoot N.
 *
 * a, b : input matrices (device pointers, read-only)
 * c    : output matrix (device pointer); must not overlap a or b, because
 *        other threads read those while this one writes
 * N    : matrix edge length
 */
__global__
void mul_matrix( const float* a, const float* b, float* c, int N )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // output column
    int j = blockIdx.y * blockDim.y + threadIdx.y;  // output row

    // Guard before any memory access: out-of-range threads must not write c.
    if ( i >= N || j >= N )
        return;

    // Accumulate in a register and store once, instead of a global
    // read-modify-write on c for every k.
    float sum = 0.0f;
    for ( int k = 0; k < N; k++ )
        sum += a[j * N + k] * b[k * N + i];

    c[j * N + i] = sum;
}
// Prints a readable message and exits if a CUDA runtime call did not succeed.
static void checkCuda( cudaError_t status, const char* what )
{
    if ( status != cudaSuccess ) {
        fprintf( stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString( status ) );
        exit( EXIT_FAILURE );
    }
}

/*
 * Fills two N x N matrices with constants, multiplies them on the GPU with
 * mul_matrix, copies the product back and prints the first three elements of
 * each matrix. Every CUDA call is checked so that a failed allocation, copy
 * or kernel launch is reported instead of silently leaving the result unset.
 */
int main() {
    const size_t count = (size_t)N * N;          // elements per matrix
    const size_t size  = count * sizeof(float);  // bytes per matrix

    // Host matrices.
    float* a = new float[count];
    float* b = new float[count];
    float* c = new float[count];
    for ( size_t i = 0; i < count; i++ ) { a[i] = 1.0f; b[i] = 2.9f; c[i] = 3.5f; } // initializing

    // Device matrices.
    float *ad, *bd, *cd;
    checkCuda( cudaMalloc( (void**)&ad, size ), "cudaMalloc(ad)" );
    checkCuda( cudaMalloc( (void**)&bd, size ), "cudaMalloc(bd)" );
    checkCuda( cudaMalloc( (void**)&cd, size ), "cudaMalloc(cd)" );
    checkCuda( cudaMemcpy( ad, a, size, cudaMemcpyHostToDevice ), "copy a to device" );
    checkCuda( cudaMemcpy( bd, b, size, cudaMemcpyHostToDevice ), "copy b to device" );

    // One thread per output element; round the grid up so it covers the whole
    // matrix even when N is not a multiple of the block edge.
    dim3 dimBlock( blocksize, blocksize );
    dim3 dimGrid( ( N + dimBlock.x - 1 ) / dimBlock.x,
                  ( N + dimBlock.y - 1 ) / dimBlock.y );
    mul_matrix<<<dimGrid, dimBlock>>>( ad, bd, cd, N );
    // A launch does not return a status itself: configuration errors (for
    // example too many threads per block) are reported by cudaGetLastError,
    // and faults during execution show up at the next synchronizing call.
    checkCuda( cudaGetLastError(), "mul_matrix launch" );
    checkCuda( cudaDeviceSynchronize(), "mul_matrix execution" );

    checkCuda( cudaMemcpy( c, cd, size, cudaMemcpyDeviceToHost ), "copy c to host" );

    for ( int k = 0; k < 3; k++ )
    {
        printf("a %d = %f\n", k, a[k]);
        printf("b %d = %f\n", k, b[k]);
        printf("c %d = %f\n", k, c[k]);
    }

    cudaFree( ad ); cudaFree( bd ); cudaFree( cd );
    delete[] a; delete[] b; delete[] c;

    // Wait for input so a console window opened just for this run stays visible.
    int d;
    scanf("%d",&d);
    return EXIT_SUCCESS;
}