Currently, to implement my asynchronous program, I initialize two arrays (h_A and h_B), each of which contains N matrices (of size MxM).
After that, I set up an asynchronous pipeline (with overlapping of transfers and computation) which:
I execute this system "M" times.
Here is my code:
// Benchmark body: issues nb_iter independent matrix products, one per CUDA
// stream (H2D copy of A and B, matrixMul launch, D2H copy of C), and returns
// the elapsed time in milliseconds measured between two CUDA events.
// Returns -1.0f if any CUDA call reported an error, so that a failed run is
// never mistaken for a valid timing.

// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
// NOTE(review): integer division - this only covers the whole matrix when N is
// a multiple of BLOCK_SIZE. Switching to a ceil-div grid requires a bounds
// check inside matrixMul, which is not visible here - confirm before changing.
dim3 grid(N / threads.x, N / threads.y);
cudaStream_t stream[nb_iter];
cudaEvent_t start;
cudaEvent_t stop;
float elapsed = 0.0f;  // defined value even if cudaEventElapsedTime fails

// Remember the first CUDA error without altering the sequence of calls.
cudaError_t status = cudaSuccess;
#define TRACK_CUDA(call)                              \
    do {                                              \
        cudaError_t trackErr_ = (call);               \
        if (status == cudaSuccess) {                  \
            status = trackErr_;                       \
        }                                             \
    } while (0)

for (int i = 0; i < nb_iter; i++) {
    TRACK_CUDA(cudaStreamCreate(&stream[i]));
}
TRACK_CUDA(cudaEventCreate(&start));
TRACK_CUDA(cudaEventCreate(&stop));

// allocate host memory
float* h_A = tabMatrixAlloc(N, nb_iter);
float* h_B = tabMatrixAlloc(N, nb_iter);
float* h_C = tabMatrixAlloc(N, nb_iter);

// allocate device memory
float *d_A, *d_B, *d_C;
TRACK_CUDA(cudaMalloc((void**) &d_A, getMatrixSize(N) * nb_iter));
TRACK_CUDA(cudaMalloc((void**) &d_B, getMatrixSize(N) * nb_iter));
TRACK_CUDA(cudaMalloc((void**) &d_C, getMatrixSize(N) * nb_iter));

tabMatrixInit(h_A, N, nb_iter);
tabMatrixInit(h_B, N, nb_iter);

// cudaMemcpyAsync can only overlap with kernels and other copies when the host
// side is page-locked. The host buffers are released with free() below, so
// they are ordinary pageable allocations: pin them in place for the duration
// of the run. Pinning is best-effort - if it fails the copies still work, just
// without overlap, exactly as before.
// Assumes each host buffer holds nb_iter matrices of getMatrixSize(N) bytes,
// like the device buffers above - TODO confirm against tabMatrixAlloc.
const size_t hostBytes = (size_t) getMatrixSize(N) * nb_iter;
const bool pinnedA = (cudaHostRegister(h_A, hostBytes, cudaHostRegisterDefault) == cudaSuccess);
const bool pinnedB = (cudaHostRegister(h_B, hostBytes, cudaHostRegisterDefault) == cudaSuccess);
const bool pinnedC = (cudaHostRegister(h_C, hostBytes, cudaHostRegisterDefault) == cudaSuccess);
// Clear the error state a failed registration may have left behind, so it is
// not misreported as a kernel launch failure further down.
cudaGetLastError();

// start/stop are recorded on the default stream. Streams obtained from
// cudaStreamCreate synchronize with the legacy default stream, so `stop`
// completes only after all work queued below has finished.
// NOTE(review): this does not hold when compiled with
// --default-stream per-thread.
TRACK_CUDA(cudaEventRecord(start));

// Stage 1: host -> device copies of A and B, one stream per matrix.
for (int var = 0; var < nb_iter; ++var) {
    TRACK_CUDA(cudaMemcpyAsync(getMatrix(d_A, N, var), getMatrix(h_A, N, var), getMatrixSize(N), cudaMemcpyHostToDevice, stream[var]));
    TRACK_CUDA(cudaMemcpyAsync(getMatrix(d_B, N, var), getMatrix(h_B, N, var), getMatrixSize(N), cudaMemcpyHostToDevice, stream[var]));
}

// Stage 2: one kernel per stream; each runs after its own stream's copies.
for (int var = 0; var < nb_iter; ++var) {
    matrixMul<<<grid, threads, 0, stream[var]>>>(getMatrix(d_C, N, var), getMatrix(d_A, N, var), getMatrix(d_B, N, var), N);
}
// Kernel launches return nothing; configuration errors surface here.
TRACK_CUDA(cudaGetLastError());

// Stage 3: device -> host copies of the results.
for (int var = 0; var < nb_iter; ++var) {
    TRACK_CUDA(cudaMemcpyAsync(getMatrix(h_C, N, var), getMatrix(d_C, N, var), getMatrixSize(N), cudaMemcpyDeviceToHost, stream[var]));
}

TRACK_CUDA(cudaEventRecord(stop));
TRACK_CUDA(cudaEventSynchronize(stop));
TRACK_CUDA(cudaEventElapsedTime(&elapsed, start, stop));

// Make sure every stream is idle before host buffers are released, regardless
// of the default-stream mode; this also surfaces asynchronous kernel faults.
TRACK_CUDA(cudaDeviceSynchronize());

// clean up memory (unpin before handing the buffers back to free)
if (pinnedA) { cudaHostUnregister(h_A); }
if (pinnedB) { cudaHostUnregister(h_B); }
if (pinnedC) { cudaHostUnregister(h_C); }
free(h_A); free(h_B); free(h_C);
cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
cudaEventDestroy(start); cudaEventDestroy(stop);

// destroy streams
for (int i = 0; i < nb_iter; i++) { cudaStreamDestroy(stream[i]); }

// reset the GPU
cudaDeviceReset();

#undef TRACK_CUDA
return (status == cudaSuccess) ? elapsed : -1.0f;
FYI:
getMatrix(): returns a pointer to one matrix (selected by its index) inside an array ("table") of matrices
getMatrixSize(): returns the size in bytes of one matrix