Matrix Multiplication In CUDA

Hi ,
I was trying to check the performance of nvidia tegra k1 using a jetson kit.I was trying to perform a matrix multiplication using an example code . Here is the test code
:

//Matrix multiplication using shared and non shared kernal

#include <stdio.h>

#include <math.h>

#define TILE_WIDTH 2
const int WIDTH = 2000 ;
clock_t start, end;

double cpu_time_used;

/matrix multiplication kernels/

//non shared
global void
MatrixMul( float *Md , float *Nd , float *Pd , const int WIDTH )
{

       // calculate thread id

       unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;

       unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;

     for (int k = 0 ; k<WIDTH ; k++ )
     {
              Pd[row*WIDTH + col]+= Md[row * WIDTH + k ] * Nd[ k * WIDTH + col] ;
      }

}

// shared
global void
MatrixMulSh( float *Md , float *Nd , float *Pd , const int WIDTH )
{

    //Taking shared array to break the MAtrix in Tile widht and fatch them in that array per ele

      __shared__ float Mds [TILE_WIDTH][TILE_WIDTH] ;

       __shared__ float Nds [TILE_WIDTH][TILE_WIDTH] ;

     // calculate thread id
      unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
      unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;

    for (int m = 0 ; m<WIDTH/TILE_WIDTH ; m++ ) // m indicate number of phase
   {
        Mds[threadIdx.y][threadIdx.x] =  Md[row*WIDTH + (m*TILE_WIDTH + threadIdx.x)]  ;
        Nds[threadIdx.y][threadIdx.x] =  Nd[ ( m*TILE_WIDTH + threadIdx.y) * WIDTH + col] ;
     __syncthreads() ; // for syncronizeing the threads

     // Do for tile
       for ( int k = 0; k<TILE_WIDTH ; k++ )
                   Pd[row*WIDTH + col]+= Mds[threadIdx.x][k] * Nds[k][threadIdx.y] ;
     __syncthreads() ; // for syncronizeing the threads

 }

}

// main routine
int main ()
{

start = clock();

//printf("%d",sizeof(float));

static float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH], M_result_array_h[WIDTH][WIDTH] ;
printf(“DEBUG 2”);
float *array1_d , *array2_d ,*M_result_array_d ; // device array
int i , j ,m=0,n=0 ;
//input in host array
for ( i = 0 ; i<WIDTH ; i++ )
{
for (j = 0 ; j<WIDTH ; j++ )
{
array1_h[i][j] = m ;
array2_h[i][j] = n ;
m++;
n++;
}

}

//create device array cudaMalloc ( (void **)&array_name, sizeofmatrixinbytes) ;

cudaMalloc((void **) &array1_d , WIDTHWIDTHsizeof (int) ) ;

cudaMalloc((void **) &array2_d , WIDTHWIDTHsizeof (int) ) ;

//copy host array to device array; cudaMemcpy ( dest , source , WIDTH , direction )

cudaMemcpy ( array1_d , array1_h , WIDTHWIDTHsizeof (int) , cudaMemcpyHostToDevice ) ;

cudaMemcpy ( array2_d , array2_h , WIDTHWIDTHsizeof (int) , cudaMemcpyHostToDevice ) ;

//allocating memory for resultent device array

//cudaMalloc((void **) &result_array_d , WIDTHWIDTHsizeof (int) ) ;

cudaMalloc((void **) &M_result_array_d , WIDTHWIDTHsizeof (int) ) ;

//calling kernal

dim3 dimGrid ( WIDTH/TILE_WIDTH , WIDTH/TILE_WIDTH ,1 ) ;

dim3 dimBlock( TILE_WIDTH, TILE_WIDTH, 1 ) ;

// Change if 0 to if 1 for running non shared code and make if 0 for shared memory code
#if 1

            MatrixMul <<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;

#endif

#if 0

           MatrixMulSh<<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;

#endif

// all gpu function blocked till kernel is working
//copy back result_array_d to result_array_h

cudaMemcpy(M_result_array_h , M_result_array_d , WIDTHWIDTHsizeof(int) ,
cudaMemcpyDeviceToHost) ;
/*
//printf the result array
for ( i = 0 ; i<WIDTH ; i++ )
{
for ( j = 0 ; j < WIDTH ; j++ )
{
printf ("%f “,M_result_array_h[i][j] ) ;
}
printf (”\n") ;
}
*/
//system(“pause”) ;
end = clock();

cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("\n CPU TIME USED IS : %.5lf seconds \n",(cpu_time_used));

return 0;
}

I was able to compute the matrix with max width value of 800 .after that it showed segemntation fault ,so i made the array declaration as static .Now i used width value as 1500 .it executed sucessfully within few seconds.
But in case when i give the width value as 2000 or more ,it compiles and hangs upon run .
here is the terminla log :
ubuntu@tegra-ubuntu:~/tegra_sample_code$ nvcc mul14_mod.cu -o abc
ubuntu@tegra-ubuntu:~/tegra_sample_code$ ./abc
…it hangs here no output…
What is happening here ,arrays are initialized by static allocation of memory but still some issue occurs !? Am new to CUDA programming could anyone please explain me wer am wrong ?

I am not really sure what the problem could be. I also sometimes experienced problems with NVIDIA examples on the Jetson. Probably that’s because they are made for architechtures with separated Host and Device memory.

I am just a bit surprised that the code copies a two dimensional array into a one dimensional one. I am not sure if it works just like that with memcpy, I’d assume it doesn’t. Maybe someone else can clear this.

even i had a doubt in the beginning,but it seems to be computing …still am not cleared with that doubt how 2D is converted to 1 D and computed in GPU then transfered back to 2D to CPU. But still i need to chk with larger arrays say 4000 or 5000 !! where the issue occurs…!

When I run your code it completes without error in about 25 seconds.

Some suggestions, although I don’t actually think any of these should be the problem:

  1. try compiling with the arch switch for TK1:

nvcc -arch=sm_32 mul14_mod.cu -o abc

  1. Try adding proper cuda error checking to your code (hint: google “proper cuda error checking”, take the first hit)

  2. Modify your static host allocations to use dynamic (e.g. malloc):

float array1_h = (float )malloc(WIDTHWIDTHsizeof(float));
etc.

this will affect other aspects of the code as well, so you may want to do this last.

  1. Add printf statement to identify the exact line of code in main where the hang occurs. That may give some clues.

  2. You say it hangs, but maybe it’s just taking a long time. Wait for 5 minutes to see if it ever returns.

I compiled with nvcc -arch ,tried the GPU error check, intialised with dynamic memory(malloc) ,tried printf statements also but still there the same issue exists !!

Here is the updated code !
canAnyone resolve this issue ?

//Matrix multiplication using shared and non shared kernal

#include <stdio.h>

#include <math.h>

#define TILE_WIDTH 2

#define gpuErrchk(ans) { gpuAssert((ans), FILE, LINE); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,“GPUassert: %s %s %d\n”, cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
const int WIDTH = 2000 ;
clock_t start, end;

double cpu_time_used;

/matrix multiplication kernels/

//non shared
global void
MatrixMul( float *Md , float *Nd , float *Pd , const int WIDTH )
{

       // calculate thread id

       unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;

       unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;

     for (int k = 0 ; k<WIDTH ; k++ )
     {
              Pd[row*WIDTH + col]+= Md[row * WIDTH + k ] * Nd[ k * WIDTH + col] ;
      }

}

// shared
global void
MatrixMulSh( float *Md , float *Nd , float *Pd , const int WIDTH )
{

    //Taking shared array to break the MAtrix in Tile widht and fatch them in that array per ele

      __shared__ float Mds [TILE_WIDTH][TILE_WIDTH] ;

       __shared__ float Nds [TILE_WIDTH][TILE_WIDTH] ;

     // calculate thread id
      unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
      unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;

    for (int m = 0 ; m<WIDTH/TILE_WIDTH ; m++ ) // m indicate number of phase
   {
        Mds[threadIdx.y][threadIdx.x] =  Md[row*WIDTH + (m*TILE_WIDTH + threadIdx.x)]  ;
        Nds[threadIdx.y][threadIdx.x] =  Nd[ ( m*TILE_WIDTH + threadIdx.y) * WIDTH + col] ;
     __syncthreads() ; // for syncronizeing the threads

     // Do for tile
       for ( int k = 0; k<TILE_WIDTH ; k++ )
                   Pd[row*WIDTH + col]+= Mds[threadIdx.x][k] * Nds[k][threadIdx.y] ;
     __syncthreads() ; // for syncronizeing the threads

 }

}
//int size - 4 bytes ,float - 4 byts,double -8 bytes
// main routine
int main ()
{
// printf(“DEBUG 1”);
start = clock();

//printf("%d",sizeof(float));

float array1_h = (float )malloc(WIDTHWIDTHsizeof(float));
float array2_h = (float )malloc(WIDTHWIDTHsizeof(float));
float M_result_array_h = (float )malloc(WIDTHWIDTHsizeof(float));

// static float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH], M_result_array_h[WIDTH][WIDTH] ;
// printf(“DEBUG 2”);
float *array1_d , *array2_d ,*M_result_array_d ; // device array
int i , j ,m=0,n=0 ;
//input in host array
for ( i = 0 ; i<WIDTH ; i++ )
{
for (j = 0 ; j<WIDTH ; j++ )
{
array1_h[i* WIDTH + j] =m;
// *array1_h[i][j] = m ;
//*array2_h[i][j] = n ;
m++;
//n++;
}

}

//create device array cudaMalloc ( (void **)&array_name, sizeofmatrixinbytes) ;

gpuErrchk( cudaMalloc((void **) &array1_d , WIDTHWIDTHsizeof (int) ) );

cudaMalloc((void **) &array1_d , WIDTHWIDTHsizeof (int) ) ;

cudaMalloc((void **) &array2_d , WIDTHWIDTHsizeof (int) ) ;

//copy host array to device array; cudaMemcpy ( dest , source , WIDTH , direction )

cudaMemcpy ( array1_d , array1_h , WIDTHWIDTHsizeof (int) , cudaMemcpyHostToDevice ) ;

cudaMemcpy ( array2_d , array2_h , WIDTHWIDTHsizeof (int) , cudaMemcpyHostToDevice ) ;

//allocating memory for resultent device array

//cudaMalloc((void **) &result_array_d , WIDTHWIDTHsizeof (int) ) ;

cudaMalloc((void **) &M_result_array_d , WIDTHWIDTHsizeof (int) ) ;

//calling kernal

dim3 dimGrid ( WIDTH/TILE_WIDTH , WIDTH/TILE_WIDTH ,1 ) ;

dim3 dimBlock( TILE_WIDTH, TILE_WIDTH, 1 ) ;

// Change if 0 to if 1 for running non shared code and make if 0 for shared memory code
#if 1

            MatrixMul <<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;

#endif

#if 0

           MatrixMulSh<<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;

#endif

// all gpu function blocked till kernel is working
//copy back result_array_d to result_array_h

cudaMemcpy(M_result_array_h , M_result_array_d , WIDTHWIDTHsizeof(int) ,
cudaMemcpyDeviceToHost) ;
/*
//printf the result array
for ( i = 0 ; i<WIDTH ; i++ )
{
for ( j = 0 ; j < WIDTH ; j++ )
{
printf ("%f “,M_result_array_h[i][j] ) ;
}
printf (”\n") ;
}
*/
//system(“pause”) ;
end = clock();

cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("\n CPU TIME USED IS : %.5lf seconds \n",(cpu_time_used));

return 0;
}

You haven’t implemented the error checking properly.

You should put the error checking macro around every CUDA API function call and you also need to do special steps right after the kernel call.