matrix multiplication with its transpose in cuda(cudamemcpy from device to host not working) .

alanclinton · October 5, 2018, 9:14am

i am trying to implement AT*A on cuda using shared memory.
if the matrix size is small this code works properly. but for the given matrix size of 11684x48 . it shows error in cudamemcpy from device to host.
i am unable to fix this problem please help in this regard.
and also what should i do to reduce the kernal execution time in this program?(i have already used shared memory)

//(At*A) in GPU using shared memory

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>
#include <complex>
#include <cuComplex.h>
#include <stdio.h>
#include <ctime>

#include <conio.h>
#include <fstream>
#include <sstream> 
 
#define A_c (48)
#define A_r (11684)
#define C_c (A_c)
#define C_r (A_c)
#define N (A_c*A_r)
 
#define TILE_WIDTH (16)
using namespace std;

void display_matrix_mem  (std::complex<double> *A, int row, int col);
 
__global__ void coalescedMultiply(cuDoubleComplex *a, cuDoubleComplex *c){
    __shared__ cuDoubleComplex ds_M[TILE_WIDTH][TILE_WIDTH];
    __shared__ cuDoubleComplex ds_N[TILE_WIDTH][TILE_WIDTH];
 
    int ty= threadIdx.y; int by=blockIdx.y;
    int tx= threadIdx.x; int bx=blockIdx.x;
	//**************************************
	/*int blockNumInGrid   = blockIdx.x  + gridDim.x*blockIdx.y;
	int threadNumInBlock = threadIdx.x + blockDim.x*threadIdx.y;
	int threadsPerBlock  = blockDim.x * blockDim.y;
	int globalThreadNum = blockNumInGrid*threadsPerBlock + threadNumInBlock;*/
    //**************************************
	cuDoubleComplex Pvalue;
    Pvalue= make_cuDoubleComplex (0.0,0.0);
    // read the matrix tile into shared memory
    int Row1 = by * TILE_WIDTH + ty;
	int Row = by * TILE_WIDTH + tx;
    int  Col = bx * TILE_WIDTH + tx;
    for (int m = 0; m < (TILE_WIDTH*A_r-1)/TILE_WIDTH+1; ++m)
    {
        if ( Row < A_c  &&  ty+TILE_WIDTH*m < A_r)
        {
            ds_M[tx][ty] = a[Row + (m*TILE_WIDTH+ty)*A_c];
      // 	printf("globalid= %d, ds_M: %f,Col,%d index: %d \n",globalThreadNum,ds_M[tx][ty].x,Col, (m*TILE_WIDTH+ty));
			  	  
		}
        else
        {  
            ds_M[tx][ty].x = 0;
            ds_M[tx][ty].y = 0;
        }
 
        if (Col < A_c && m*TILE_WIDTH+ty < A_r)
        {
            ds_N[ty][tx] = a[Col + (m*TILE_WIDTH+ty)*A_c];
        }
        else
        {
            ds_N[ty][tx].x = 0;
            ds_N[ty][tx].y = 0;

        }
  __syncthreads();

for (int k = 0; k < TILE_WIDTH; ++k)
        {
            cuDoubleComplex svalue1;
            svalue1= make_cuDoubleComplex (0.0,0.0);
            svalue1 = (cuCmul(cuConj(ds_M[ty][k]), (ds_N[k][tx])));
			//printf("globalid: %d , ds_M: %f, ds_N: %f  \n",globalThreadNum,ds_M[ty][k].x,ds_N[k][tx].x);
            Pvalue = cuCadd(svalue1,Pvalue);
        }
        __syncthreads();
    }
    //}//endif for globalthreadNum
 
    if (Row1 < C_r && Col < C_c) {
        c[Row1*C_c+Col].x = Pvalue.x;
        c[Row1*C_c+Col].y = Pvalue.y;
    }
}

int main(int argc,char** argv)
{

 int count=1; 
	complex<double> *source  = (complex<double> *)malloc(N * sizeof(complex<double>));
	for (int i = 0; i < N ; i++) {
	 *(source+i) = complex<double> (count,count);
	 count++;

	 }
    
	for (int i = 0; i < 20 ; i++) {
	 cout<< *(source+i) ;
		 }

	std::cout<<"press enter to start multiplication procedure"<<std::endl<<std::endl;
    getchar();
 
    std::complex<double> *matC_colesced = (std::complex<double> *)malloc(C_r * C_c * sizeof(std::complex<double>));
 
    const int size_source = A_r*A_c;
    const int size_product = A_c*A_c;
 
	int size_Complex_double= sizeof(std::complex<double>);
	
    const int bytes_source = size_source * sizeof(std::complex<double>);
    const int bytes_product = size_product * sizeof(std::complex<double>);
 cout<<"\n \n size_src \n"<<bytes_source;
    cuDoubleComplex *d_a,*d_c;
 getchar();
    std::cout<<"memory access has stareted"<<std::endl;
 
    cudaMalloc( (void**)&d_a,bytes_source);
    cudaMalloc( (void**)&d_c,bytes_product);
 
    if(cudaMemcpy(d_a, source , bytes_source, cudaMemcpyHostToDevice)!=cudaSuccess)
    {
        std::cout<<"Nope"<<std::endl;
        return 0;
    }
 
    if(cudaMemcpy(d_c, matC_colesced, bytes_product, cudaMemcpyHostToDevice)!=cudaSuccess)
    {
        std::cout<<"Nope"<<std::endl;
        return 0;
    }
 
		float time;
	cudaError_t err;
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	

    //Kernal Dimensions and call
    dim3 dimGrid_col_shared(((A_r-1)/TILE_WIDTH+1), ((A_c-1)/TILE_WIDTH+1), 1);
    dim3 dimBlock_col_shared(TILE_WIDTH, TILE_WIDTH, 1);
 
    printf("number of blocks_shared: %d x %d x %d \n",dimGrid_col_shared.x,dimGrid_col_shared.y,dimGrid_col_shared.z);
    printf("number of Thread_shared: %d x %d x %d \n",dimBlock_col_shared.x,dimBlock_col_shared.y,dimBlock_col_shared.z);
	
	getchar();

	clock_t t_gpu;
	t_gpu = clock();
	cudaEventRecord(start, 0);

    coalescedMultiply<<<dimGrid_col_shared, dimBlock_col_shared>>>(d_a, d_c);
 
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	
	cout << " \n Cuda Time - Transposed mm: " << time << "ms\n";

	t_gpu=clock()-t_gpu;
	printf (" \n It took me %d clicks (%f seconds) in GPU to perform mult.\n",t_gpu,((float)t_gpu)/CLOCKS_PER_SEC);

    std::cout<<"kernal exited"<<std::endl;
 
    getchar();
 
    std::cout<<"mm with colesced shared mem done"<<std::endl;
    //getchar();
 
    if(cudaMemcpy(matC_colesced, d_c, bytes_product, cudaMemcpyDeviceToHost)!=cudaSuccess)
    {
        std::cout<<"Nope"<<std::endl;
		getchar();
        return 0;
    }
 
    std::cout<<"coalesced shared Gpu output \n"<<std::endl;
	getchar();
   display_matrix_mem  (matC_colesced, C_r,C_c);
 
    cudaFree(d_a);
    cudaFree(d_c);
 
    free(source);
	getchar();
    return 0;
}
void display_matrix_mem  (std::complex<double> *A, int row, int col)
{
    printf("\n\n");
    for (int i = 0; i < row; i++)
    {   std::cout<<"row: "<<i<<std::endl;
        for (int j = 0; j < col; j++)
        {
            std::cout<<*(A + i*col + j)<<"\t";
        }
        printf("\n");
    }
}

njuffa · October 5, 2018, 9:38am

Check the argument lists for your copies against the function prototype as shown in the documentation. In particular, the source and destination pointers passed don’t seem to match the copy direction specified.

alanclinton · October 5, 2018, 9:51am

no the pointers are doing fine. as i mentioned, this whole program works good for only small matrices, only for the large matrices, it crashes i.e. the memory is not copied from device to host.

cbuchner1 · October 5, 2018, 10:21am

the cudaMemcpy() is likely showing an error from the previous kernel launch.

I don’t see a cudaDeviceSynchronize( )/ cudaGetLastError() call to test for possible kernel execution errors before trying to copy the result back.

Hence any kernel execution error will only manifest itself during the implicit synchronization that occurs during cudaMemcpy()

I recommend executing this program under cuda-memcheck. It is likely that it will point out some illegal memory accesses inside the kernel.

alanclinton · October 5, 2018, 10:56am

@cbuchner1 i have run it under cuda memcheck… it gives error of un-specified launch failure and illegal memory accesses, how can i fix these??

saulocpp · October 5, 2018, 12:16pm

If cuda-memcheck is saying there is illegal memory access, then if you run the program again and scroll the screen up, it will show where the problem is, among the many messages.
Also check Njuffa’s macro at the end of this thread, it provides some debug information on kernel calls:
[url]https://devtalk.nvidia.com/default/topic/1038617/understanding-and-adjusting-mark-harriss-array-reduction/[/url]

cbuchner1 · October 5, 2018, 12:51pm

compile your program with -G (device debug) or alternatively the -lineinfo option. Then debugging tools like cuda-memcheck can pinpoint the exact line of the failure in the source code.

Afterwards, add instrumentation (like print statements) to print indices used in memory access. When you see an out of bounds index printed, try to understand how this came to be and how to fix it.

Topic		Replies	Views
Matrix Multiplication In CUDA CUDA Programming and Performance	6	2525	May 11, 2015
cuda error out of memory how to increase the size of matrix in multiplication CUDA Programming and Performance	10	11730	October 1, 2009
CUDA Matrix Multiplication Issues threads and blocks problem CUDA Programming and Performance	2	3629	March 1, 2009
Error while running 1024*1024 matrix multiplication code for GPU. CUDA Programming and Performance	3	1359	February 2, 2013
Cuda matrix multiplication too slow CUDA Programming and Performance	5	13323	February 17, 2010
Cuda application crashes works fine for small data and crashes for big data CUDA Developer Tools	0	360	December 8, 2020
Error parameter number 10 cublasSgemmBatched GPU-Accelerated Libraries	4	1756	June 23, 2016
Problems of matrix multiplication With and without CUDA CUDA Programming and Performance	15	9993	January 18, 2012
Cuda application crashes works fine for small data and crashes for big data CUDA Programming and Performance	3	405	October 12, 2021
Maximum matrix size for matrix multiplication operation on GeForce GTX 960M CUDA Programming and Performance	12	3557	November 28, 2018

matrix multiplication with its transpose in cuda(cudamemcpy from device to host not working) .

Related topics