Hi ,
I was trying to check the performance of nvidia tegra k1 using a jetson kit.I was trying to perform a matrix multiplication using an example code . Here is the test code
:
//Matrix multiplication using shared and non shared kernal
#include <stdio.h>
#include <math.h>
#define TILE_WIDTH 2
const int WIDTH = 2000 ;
clock_t start, end;
double cpu_time_used;
/matrix multiplication kernels/
//non shared
global void
MatrixMul( float *Md , float *Nd , float *Pd , const int WIDTH )
{
// calculate thread id
unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;
for (int k = 0 ; k<WIDTH ; k++ )
{
Pd[row*WIDTH + col]+= Md[row * WIDTH + k ] * Nd[ k * WIDTH + col] ;
}
}
// shared
global void
MatrixMulSh( float *Md , float *Nd , float *Pd , const int WIDTH )
{
//Taking shared array to break the MAtrix in Tile widht and fatch them in that array per ele
__shared__ float Mds [TILE_WIDTH][TILE_WIDTH] ;
__shared__ float Nds [TILE_WIDTH][TILE_WIDTH] ;
// calculate thread id
unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;
for (int m = 0 ; m<WIDTH/TILE_WIDTH ; m++ ) // m indicate number of phase
{
Mds[threadIdx.y][threadIdx.x] = Md[row*WIDTH + (m*TILE_WIDTH + threadIdx.x)] ;
Nds[threadIdx.y][threadIdx.x] = Nd[ ( m*TILE_WIDTH + threadIdx.y) * WIDTH + col] ;
__syncthreads() ; // for syncronizeing the threads
// Do for tile
for ( int k = 0; k<TILE_WIDTH ; k++ )
Pd[row*WIDTH + col]+= Mds[threadIdx.x][k] * Nds[k][threadIdx.y] ;
__syncthreads() ; // for syncronizeing the threads
}
}
// main routine
int main ()
{
start = clock();
//printf(“%d”,sizeof(float));
static float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH], M_result_array_h[WIDTH][WIDTH] ;
printf(“DEBUG 2”);
float *array1_d , *array2_d ,*M_result_array_d ; // device array
int i , j ,m=0,n=0 ;
//input in host array
for ( i = 0 ; i<WIDTH ; i++ )
{
for (j = 0 ; j<WIDTH ; j++ )
{
array1_h[i][j] = m ;
array2_h[i][j] = n ;
m++;
n++;
}
}
//create device array cudaMalloc ( (void **)&array_name, sizeofmatrixinbytes) ;
cudaMalloc((void **) &array1_d , WIDTHWIDTHsizeof (int) ) ;
cudaMalloc((void **) &array2_d , WIDTHWIDTHsizeof (int) ) ;
//copy host array to device array; cudaMemcpy ( dest , source , WIDTH , direction )
cudaMemcpy ( array1_d , array1_h , WIDTHWIDTHsizeof (int) , cudaMemcpyHostToDevice ) ;
cudaMemcpy ( array2_d , array2_h , WIDTHWIDTHsizeof (int) , cudaMemcpyHostToDevice ) ;
//allocating memory for resultent device array
//cudaMalloc((void **) &result_array_d , WIDTHWIDTHsizeof (int) ) ;
cudaMalloc((void **) &M_result_array_d , WIDTHWIDTHsizeof (int) ) ;
//calling kernal
dim3 dimGrid ( WIDTH/TILE_WIDTH , WIDTH/TILE_WIDTH ,1 ) ;
dim3 dimBlock( TILE_WIDTH, TILE_WIDTH, 1 ) ;
// Change if 0 to if 1 for running non shared code and make if 0 for shared memory code
#if 1
MatrixMul <<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;
#if 0
MatrixMulSh<<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;
// all gpu function blocked till kernel is working
//copy back result_array_d to result_array_h
cudaMemcpy(M_result_array_h , M_result_array_d , WIDTHWIDTHsizeof(int) ,
cudaMemcpyDeviceToHost) ;
/*
//printf the result array
for ( i = 0 ; i<WIDTH ; i++ )
{
for ( j = 0 ; j < WIDTH ; j++ )
{
printf (“%f “,M_result_array_h[i][j] ) ;
}
printf (”\n”) ;
}
*/
//system(“pause”) ;
end = clock();
cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf(“\n CPU TIME USED IS : %.5lf seconds \n”,(cpu_time_used));
return 0;
}
I was able to compute the matrix with max width value of 800 .after that it showed segemntation fault ,so i made the array declaration as static .Now i used width value as 1500 .it executed sucessfully within few seconds.
But in case when i give the width value as 2000 or more ,it compiles and hangs upon run .
here is the terminla log :
ubuntu@tegra-ubuntu:~/tegra_sample_code$ nvcc mul14_mod.cu -o abc
ubuntu@tegra-ubuntu:~/tegra_sample_code$ ./abc
…it hangs here no output…
What is happening here ,arrays are initialized by static allocation of memory but still some issue occurs !? Am new to CUDA programming could anyone please explain me wer am wrong ?