matrix multiplication with its transpose in cuda(cudamemcpy from device to host not working) .

i am trying to implement AT*A on cuda using shared memory.
if the matrix size is small this code works properly. but for the given matrix size of 11684x48 . it shows error in cudamemcpy from device to host.
i am unable to fix this problem please help in this regard.
and also what should i do to reduce the kernal execution time in this program?(i have already used shared memory)

//(At*A) in GPU using shared memory

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>
#include <complex>
#include <cuComplex.h>
#include <stdio.h>
#include <ctime>

#include <conio.h>
#include <fstream>
#include <sstream> 
 
#define A_c (48)
#define A_r (11684)
#define C_c (A_c)
#define C_r (A_c)
#define N (A_c*A_r)
 
#define TILE_WIDTH (16)
using namespace std;

void display_matrix_mem  (std::complex<double> *A, int row, int col);
 
__global__ void coalescedMultiply(cuDoubleComplex *a, cuDoubleComplex *c){
    __shared__ cuDoubleComplex ds_M[TILE_WIDTH][TILE_WIDTH];
    __shared__ cuDoubleComplex ds_N[TILE_WIDTH][TILE_WIDTH];
 
    int ty= threadIdx.y; int by=blockIdx.y;
    int tx= threadIdx.x; int bx=blockIdx.x;
	//**************************************
	/*int blockNumInGrid   = blockIdx.x  + gridDim.x*blockIdx.y;
	int threadNumInBlock = threadIdx.x + blockDim.x*threadIdx.y;
	int threadsPerBlock  = blockDim.x * blockDim.y;
	int globalThreadNum = blockNumInGrid*threadsPerBlock + threadNumInBlock;*/
    //**************************************
	cuDoubleComplex Pvalue;
    Pvalue= make_cuDoubleComplex (0.0,0.0);
    // read the matrix tile into shared memory
    int Row1 = by * TILE_WIDTH + ty;
	int Row = by * TILE_WIDTH + tx;
    int  Col = bx * TILE_WIDTH + tx;
    for (int m = 0; m < (TILE_WIDTH*A_r-1)/TILE_WIDTH+1; ++m)
    {
        if ( Row < A_c  &&  ty+TILE_WIDTH*m < A_r)
        {
            ds_M[tx][ty] = a[Row + (m*TILE_WIDTH+ty)*A_c];
      // 	printf("globalid= %d, ds_M: %f,Col,%d index: %d \n",globalThreadNum,ds_M[tx][ty].x,Col, (m*TILE_WIDTH+ty));
			  	  
		}
        else
        {  
            ds_M[tx][ty].x = 0;
            ds_M[tx][ty].y = 0;
        }
 
        if (Col < A_c && m*TILE_WIDTH+ty < A_r)
        {
            ds_N[ty][tx] = a[Col + (m*TILE_WIDTH+ty)*A_c];
        }
        else
        {
            ds_N[ty][tx].x = 0;
            ds_N[ty][tx].y = 0;

        }
  __syncthreads();

for (int k = 0; k < TILE_WIDTH; ++k)
        {
            cuDoubleComplex svalue1;
            svalue1= make_cuDoubleComplex (0.0,0.0);
            svalue1 = (cuCmul(cuConj(ds_M[ty][k]), (ds_N[k][tx])));
			//printf("globalid: %d , ds_M: %f, ds_N: %f  \n",globalThreadNum,ds_M[ty][k].x,ds_N[k][tx].x);
            Pvalue = cuCadd(svalue1,Pvalue);
        }
        __syncthreads();
    }
    //}//endif for globalthreadNum
 
    if (Row1 < C_r && Col < C_c) {
        c[Row1*C_c+Col].x = Pvalue.x;
        c[Row1*C_c+Col].y = Pvalue.y;
    }
}

int main(int argc,char** argv)
{

 int count=1; 
	complex<double> *source  = (complex<double> *)malloc(N * sizeof(complex<double>));
	for (int i = 0; i < N ; i++) {
	 *(source+i) = complex<double> (count,count);
	 count++;

	 }
    
	for (int i = 0; i < 20 ; i++) {
	 cout<< *(source+i) ;
		 }

	std::cout<<"press enter to start multiplication procedure"<<std::endl<<std::endl;
    getchar();
 
    std::complex<double> *matC_colesced = (std::complex<double> *)malloc(C_r * C_c * sizeof(std::complex<double>));
 
    const int size_source = A_r*A_c;
    const int size_product = A_c*A_c;
 
	int size_Complex_double= sizeof(std::complex<double>);
	
    const int bytes_source = size_source * sizeof(std::complex<double>);
    const int bytes_product = size_product * sizeof(std::complex<double>);
 cout<<"\n \n size_src \n"<<bytes_source;
    cuDoubleComplex *d_a,*d_c;
 getchar();
    std::cout<<"memory access has stareted"<<std::endl;
 
    cudaMalloc( (void**)&d_a,bytes_source);
    cudaMalloc( (void**)&d_c,bytes_product);
 
    if(cudaMemcpy(d_a, source , bytes_source, cudaMemcpyHostToDevice)!=cudaSuccess)
    {
        std::cout<<"Nope"<<std::endl;
        return 0;
    }
 
    if(cudaMemcpy(d_c, matC_colesced, bytes_product, cudaMemcpyHostToDevice)!=cudaSuccess)
    {
        std::cout<<"Nope"<<std::endl;
        return 0;
    }
 
		float time;
	cudaError_t err;
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	

    //Kernal Dimensions and call
    dim3 dimGrid_col_shared(((A_r-1)/TILE_WIDTH+1), ((A_c-1)/TILE_WIDTH+1), 1);
    dim3 dimBlock_col_shared(TILE_WIDTH, TILE_WIDTH, 1);
 
    printf("number of blocks_shared: %d x %d x %d \n",dimGrid_col_shared.x,dimGrid_col_shared.y,dimGrid_col_shared.z);
    printf("number of Thread_shared: %d x %d x %d \n",dimBlock_col_shared.x,dimBlock_col_shared.y,dimBlock_col_shared.z);
	
	getchar();

	clock_t t_gpu;
	t_gpu = clock();
	cudaEventRecord(start, 0);

    coalescedMultiply<<<dimGrid_col_shared, dimBlock_col_shared>>>(d_a, d_c);
 
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	
	cout << " \n Cuda Time - Transposed mm: " << time << "ms\n";

	t_gpu=clock()-t_gpu;
	printf (" \n It took me %d clicks (%f seconds) in GPU to perform mult.\n",t_gpu,((float)t_gpu)/CLOCKS_PER_SEC);

    std::cout<<"kernal exited"<<std::endl;
 
    getchar();
 
    std::cout<<"mm with colesced shared mem done"<<std::endl;
    //getchar();
 
    if(cudaMemcpy(matC_colesced, d_c, bytes_product, cudaMemcpyDeviceToHost)!=cudaSuccess)
    {
        std::cout<<"Nope"<<std::endl;
		getchar();
        return 0;
    }
 
    std::cout<<"coalesced shared Gpu output \n"<<std::endl;
	getchar();
   display_matrix_mem  (matC_colesced, C_r,C_c);
 
    cudaFree(d_a);
    cudaFree(d_c);
 
    free(source);
	getchar();
    return 0;
}
void display_matrix_mem  (std::complex<double> *A, int row, int col)
{
    printf("\n\n");
    for (int i = 0; i < row; i++)
    {   std::cout<<"row: "<<i<<std::endl;
        for (int j = 0; j < col; j++)
        {
            std::cout<<*(A + i*col + j)<<"\t";
        }
        printf("\n");
    }
}

Check the argument lists for your copies against the function prototype as shown in the documentation. In particular, the source and destination pointers passed don’t seem to match the copy direction specified.

no the pointers are doing fine. as i mentioned, this whole program works good for only small matrices, only for the large matrices, it crashes i.e. the memory is not copied from device to host.

the cudaMemcpy() is likely showing an error from the previous kernel launch.

I don’t see a cudaDeviceSynchronize( )/ cudaGetLastError() call to test for possible kernel execution errors before trying to copy the result back.

Hence any kernel execution error will only manifest itself during the implicit synchronization that occurs during cudaMemcpy()

I recommend executing this program under cuda-memcheck. It is likely that it will point out some illegal memory accesses inside the kernel.

@cbuchner1 i have run it under cuda memcheck… it gives error of un-specified launch failure and illegal memory accesses, how can i fix these??

If cuda-memcheck is saying there is illegal memory access, then if you run the program again and scroll the screen up, it will show where the problem is, among the many messages.
Also check Njuffa’s macro at the end of this thread, it provides some debug information on kernel calls:
https://devtalk.nvidia.com/default/topic/1038617/understanding-and-adjusting-mark-harriss-array-reduction/

compile your program with -G (device debug) or alternatively the -lineinfo option. Then debugging tools like cuda-memcheck can pinpoint the exact line of the failure in the source code.

Afterwards, add instrumentation (like print statements) to print indices used in memory access. When you see an out of bounds index printed, try to understand how this came to be and how to fix it.