Matrix Mult Result is zero!

Dear Friends,

I have a code for Very Simple Matrix Multiplication on CUDA.
All things are OK, but results return 0. I had check the time for different dimension sizes and its OK.

I need your help.

Best Regards

my code is below:

// includes
#include <cutil_inline.h>
#include <shrUtils.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

// defines, project
const int N = 1024;
const int blocksize = 32; //Max Size of HW block is 32, then values greater than 32 don’t effect in processing time.

global
void mul_matrix( float* a, float b, float c, int N )
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int index = i + j
N;
c[index] = 0;
if ( i < N && j < N )
for (int k=0; k<N; k++)
c[index] += a[j
N+k] * b[i+k*N];
}

int main() {
float a = new float[NN];
float b = new float[NN];
float c = new float[NN];

for ( int i = 0; i < N*N; i++) {	a[i] = 1.0f; 	b[i] = 2.9f; 	c[i] = 3.5f;} // initializing

float *ad, *bd, *cd;
const int size = N*N*sizeof(float);

cudaMalloc( (void**)&ad, size );
cudaMalloc( (void**)&bd, size );
cudaMalloc( (void**)&cd, size );

cudaMemcpy( ad, a, size, cudaMemcpyHostToDevice );
cudaMemcpy( bd, b, size, cudaMemcpyHostToDevice );

dim3 dimBlock( blocksize, blocksize );
dim3 dimGrid( N/dimBlock.x, N/dimBlock.y );
mul_matrix<<<dimGrid, dimBlock>>>( ad, bd, cd, N );

cudaMemcpy( c, cd, size, cudaMemcpyDeviceToHost );

for (int k=0; k<3; k++)
{
   printf("a %d = %f\n", k, a[k]);
   printf("b %d = %f\n", k, b[k]);
   printf("c %d = %f\n", k, c[k]);
}

cudaFree( ad ); cudaFree( bd ); cudaFree( cd );
delete[] a; delete[] b; delete[] c;

int d;
scanf("%d",&d);

return EXIT_SUCCESS;

}

Hello,

You’re using blocks of size 32*32 = 1024 threads : isn’t that too much for your hardware ? :)

(launch the deviceQuery sample from the SDK to know the maximum number of threads/block your card supports)

Thank you my friend

It really works OK.