I am trying to implement AT*A (conjugate transpose of A times A, since the data is complex) on CUDA using shared memory.
If the matrix is small this code works properly, but for the given matrix size of 11684x48 the cudaMemcpy from device to host reports an error.
I am unable to fix this problem; please help in this regard.
Also, what should I do to reduce the kernel execution time in this program? (I have already used shared memory.)
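From what I understand, a kernel that fails to launch or dies during execution reports its error at the next CUDA API call, which in this program is the device-to-host cudaMemcpy, so the memcpy itself may not be the real problem. A minimal checking pattern I could wrap around every runtime call (CUDA_CHECK is just a helper macro I made up, not part of the CUDA API):

#define CUDA_CHECK(call)                                          \
    do {                                                          \
        cudaError_t e_ = (call);                                  \
        if (e_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error %s at %s:%d\n",           \
                    cudaGetErrorString(e_), __FILE__, __LINE__);  \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)

// after a kernel launch:
//   CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());  // errors raised during execution

Since this is on Windows (note the conio.h include), one guess is that a long-running kernel hits the WDDM watchdog timeout (TDR) and gets killed, which then surfaces as a failed memcpy.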
// (A^H * A) on the GPU using shared memory
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>
#include <complex>
#include <cuComplex.h>
#include <stdio.h>
#include <ctime>
#include <conio.h>
#include <fstream>
#include <sstream>
#define A_c (48)
#define A_r (11684)
#define C_c (A_c)
#define C_r (A_c)
#define N (A_c*A_r)
#define TILE_WIDTH (16)
using namespace std;
void display_matrix_mem (std::complex<double> *A, int row, int col);
__global__ void coalescedMultiply(cuDoubleComplex *a, cuDoubleComplex *c)
{
    // ds_M caches a tile of A that supplies the (conjugated) left factor A^H,
    // ds_N a tile of A for the right factor; each thread accumulates one
    // element of a TILE_WIDTH x TILE_WIDTH output tile of C.
    __shared__ cuDoubleComplex ds_M[TILE_WIDTH][TILE_WIDTH];
    __shared__ cuDoubleComplex ds_N[TILE_WIDTH][TILE_WIDTH];
    int ty = threadIdx.y; int by = blockIdx.y;
    int tx = threadIdx.x; int bx = blockIdx.x;
    cuDoubleComplex Pvalue = make_cuDoubleComplex(0.0, 0.0);
    int Row1 = by * TILE_WIDTH + ty;  // row of C this thread writes
    int Row  = by * TILE_WIDTH + tx;  // column of A loaded into ds_M (transposed on load, for coalescing)
    int Col  = bx * TILE_WIDTH + tx;  // column of C this thread writes
    // walk over ceil(A_r / TILE_WIDTH) tiles along the long (row) dimension of A
    for (int m = 0; m < (A_r - 1) / TILE_WIDTH + 1; ++m)
    {
        // read the matrix tiles into shared memory, zero-padding past the edges
        if (Row < A_c && m * TILE_WIDTH + ty < A_r)
            ds_M[tx][ty] = a[Row + (m * TILE_WIDTH + ty) * A_c];
        else
            ds_M[tx][ty] = make_cuDoubleComplex(0.0, 0.0);
        if (Col < A_c && m * TILE_WIDTH + ty < A_r)
            ds_N[ty][tx] = a[Col + (m * TILE_WIDTH + ty) * A_c];
        else
            ds_N[ty][tx] = make_cuDoubleComplex(0.0, 0.0);
        __syncthreads();
        for (int k = 0; k < TILE_WIDTH; ++k)
        {
            // C[Row1][Col] += conj(A[r][Row1]) * A[r][Col]
            Pvalue = cuCadd(cuCmul(cuConj(ds_M[ty][k]), ds_N[k][tx]), Pvalue);
        }
        __syncthreads();
    }
    if (Row1 < C_r && Col < C_c)
        c[Row1 * C_c + Col] = Pvalue;
}
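// For comparison, a naive one-thread-per-output-element kernel (my own sketch,
// not part of the original program): with C only 48 x 48 and A_r = 11684, each
// thread simply walks one pair of columns. Useful as a correctness and timing
// baseline for the tiled version above.
__global__ void naiveAtA(const cuDoubleComplex *a, cuDoubleComplex *c)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y; // row of C
    int col = blockIdx.x * blockDim.x + threadIdx.x; // column of C
    if (row < C_r && col < C_c)
    {
        cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);
        for (int r = 0; r < A_r; r++)
            sum = cuCadd(sum, cuCmul(cuConj(a[r * A_c + row]), a[r * A_c + col]));
        c[row * C_c + col] = sum;
    }
}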
int main(int argc, char** argv)
{
    int count = 1;
    std::complex<double> *source = (std::complex<double> *)malloc(N * sizeof(std::complex<double>));
    for (int i = 0; i < N; i++) {
        source[i] = std::complex<double>(count, count);
        count++;
    }
    for (int i = 0; i < 20; i++) {
        cout << source[i];
    }
    std::cout << "press enter to start multiplication procedure" << std::endl << std::endl;
    getchar();
    std::complex<double> *matC_colesced = (std::complex<double> *)malloc(C_r * C_c * sizeof(std::complex<double>));
    const int size_source = A_r * A_c;
    const int size_product = A_c * A_c;
    const int bytes_source = size_source * sizeof(std::complex<double>);
    const int bytes_product = size_product * sizeof(std::complex<double>);
    cout << "\n\nsize of source in bytes: " << bytes_source;
    cuDoubleComplex *d_a, *d_c;
    getchar();
    std::cout << "memory allocation has started" << std::endl;
    cudaMalloc((void**)&d_a, bytes_source);
    cudaMalloc((void**)&d_c, bytes_product);
    if (cudaMemcpy(d_a, source, bytes_source, cudaMemcpyHostToDevice) != cudaSuccess)
    {
        std::cout << "H2D copy of source failed" << std::endl;
        return 0;
    }
    // zero the result buffer on the device instead of copying the
    // uninitialized matC_colesced host buffer into it
    if (cudaMemset(d_c, 0, bytes_product) != cudaSuccess)
    {
        std::cout << "memset of result buffer failed" << std::endl;
        return 0;
    }
    float time;
    cudaError_t err;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // Kernel dimensions and call: the grid only needs to cover the
    // C_c x C_r (48 x 48) output, not the 11684-row input
    dim3 dimGrid_col_shared((C_c - 1) / TILE_WIDTH + 1, (C_r - 1) / TILE_WIDTH + 1, 1);
    dim3 dimBlock_col_shared(TILE_WIDTH, TILE_WIDTH, 1);
    printf("number of blocks_shared: %d x %d x %d \n", dimGrid_col_shared.x, dimGrid_col_shared.y, dimGrid_col_shared.z);
    printf("number of Thread_shared: %d x %d x %d \n", dimBlock_col_shared.x, dimBlock_col_shared.y, dimBlock_col_shared.z);
    getchar();
    clock_t t_gpu;
    t_gpu = clock();
    cudaEventRecord(start, 0);
    coalescedMultiply<<<dimGrid_col_shared, dimBlock_col_shared>>>(d_a, d_c);
    err = cudaGetLastError(); // catch launch errors here rather than at the later memcpy
    if (err != cudaSuccess)
    {
        std::cout << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        return 0;
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cout << " \n Cuda Time - Transposed mm: " << time << "ms\n";
    t_gpu = clock() - t_gpu;
    printf(" \n It took me %ld clicks (%f seconds) in GPU to perform mult.\n", (long)t_gpu, ((float)t_gpu) / CLOCKS_PER_SEC);
    std::cout << "kernel exited" << std::endl;
    getchar();
    std::cout << "mm with coalesced shared mem done" << std::endl;
    err = cudaMemcpy(matC_colesced, d_c, bytes_product, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        std::cout << "D2H copy failed: " << cudaGetErrorString(err) << std::endl;
        getchar();
        return 0;
    }
    std::cout << "coalesced shared Gpu output \n" << std::endl;
    getchar();
    display_matrix_mem(matC_colesced, C_r, C_c);
    cudaFree(d_a);
    cudaFree(d_c);
    free(source);
    free(matC_colesced);
    getchar();
    return 0;
}
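// Host-side reference (my own sketch, not part of the original program):
// computes C = A^H * A naively with std::complex so the GPU result can be
// spot-checked, e.g. by calling display_matrix_mem on both outputs.
void ata_host_reference(const std::complex<double> *A, std::complex<double> *C)
{
    for (int i = 0; i < C_r; i++)
        for (int j = 0; j < C_c; j++)
        {
            std::complex<double> sum(0.0, 0.0);
            for (int r = 0; r < A_r; r++)
                sum += std::conj(A[r * A_c + i]) * A[r * A_c + j];
            C[i * C_c + j] = sum;
        }
}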
void display_matrix_mem(std::complex<double> *A, int row, int col)
{
    printf("\n\n");
    for (int i = 0; i < row; i++)
    {
        std::cout << "row: " << i << std::endl;
        for (int j = 0; j < col; j++)
        {
            std::cout << *(A + i * col + j) << "\t";
        }
        printf("\n");
    }
}
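On the second question (reducing the kernel time): for a tall-skinny product like this, a library call is usually hard to beat. Below is a sketch using cuBLAS, assuming cublas_v2.h is available and the program links against cublas; the function name ata_cublas is my own. cuBLAS is column-major, so the row-major A_r x A_c buffer reads as M = A^T with leading dimension A_c; computing M * M^H with cublasZgemm then yields (A^H * A)^T in column-major storage, which is exactly A^H * A when the result buffer is read back row-major, as this program already does.

#include <cublas_v2.h>

void ata_cublas(const cuDoubleComplex *d_a, cuDoubleComplex *d_c)
{
    cublasHandle_t handle;
    cublasCreate(&handle);
    cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
    cuDoubleComplex beta  = make_cuDoubleComplex(0.0, 0.0);
    // M is A_c x A_r in cuBLAS's column-major view (ld = A_c);
    // C = M * M^H is A_c x A_c, i.e. the 48 x 48 product.
    cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_C,
                A_c, A_c, A_r,
                &alpha,
                d_a, A_c,   // M
                d_a, A_c,   // M again, conjugate-transposed by cuBLAS
                &beta,
                d_c, A_c);
    cublasDestroy(handle);
}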