Using unified memory for 2-D and 3-D arrays

I would like to use cudaMallocManaged for a 2-D array and a 3-D array; here is part of my code:

// Per-thread device helper: zeroes the first ny entries of y.
// Parameters nx, a, x, and tmp are accepted but never used in this body.
__device__ void methodA(int nx, int ny, DATA_TYPE* a, DATA_TYPE* x, DATA_TYPE* y,DATA_TYPE* tmp)
{
    int i, j;  // j is declared but never used
    printf("HELLO"]);  // NOTE(review): syntax error — ']' should be ')'; this line cannot compile as written
    for (i = 0; i < ny; i++)
        y[i] = 0; 
}
// Kernel: thread i processes one problem instance, pulling its sizes and
// buffers from slot i of each pointer array.
// NOTE(review): no bounds check on i — assumes gridDim.x*blockDim.x == MaxSize
// and that every slot 0..MaxSize-1 was actually allocated by the host.
__global__ void myKernel(int* nx,int* ny,DATA_TYPE* a[MaxSize],DATA_TYPE* x[MaxSize], DATA_TYPE* y[MaxSize], DATA_TYPE* tmp[MaxSize])
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	methodA(nx[i],ny[i],a[i],x[i],y[i],tmp[i]);
}
// Host wrapper: times a single launch of myKernel with CUDA events and
// writes the elapsed time (converted to seconds) via writeFile.
// NOTE(review): no CUDA API call here is error-checked, and there is no
// cudaGetLastError() after the launch — failures pass silently.
void lunch(int* nx,int* ny,DATA_TYPE* a[MaxSize],DATA_TYPE* x[MaxSize], DATA_TYPE* y[MaxSize], DATA_TYPE* tmp[MaxSize]){

	cudaEvent_t start, stop; // events to start and stop the clock
	float time;
	int threadsPerBlock = 32;
	int blocksPerGrid = MaxSize / threadsPerBlock; // NOTE(review): floor division under-launches when MaxSize is not a multiple of threadsPerBlock
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0); // start timer
    myKernel << <blocksPerGrid, threadsPerBlock >> >(nx,ny,a,x,y,tmp); // execute the kernel
    cudaDeviceSynchronize();
        //cudaFree(dataCPU);
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
    cudaDeviceReset();
    time = time/1000;  // cudaEventElapsedTime reports milliseconds; convert to seconds
	writeFile(time);
}
int main()
{
    int X, Y, i=0;
    float *a[MaxSize];
    float *x[MaxSize];
    float *y[MaxSize];
    float *tmp[MaxSize];
    int *ny, *nx;
    cudaError_t err =cudaMallocManaged((void **)&nx,sizeof(int)* MaxSize);
    err = cudaMallocManaged((void **)&ny,sizeof(int)* MaxSize);
    int ret = fscanf(stdin, "%d %d", &X, &Y);
    while(ret != EOF && i< MaxSize){
        nx[i] = X ;
        ny[i] = Y;
        err = cudaMallocManaged((void**)&a[i], nx[i]*ny[i]*sizeof(float));
        err = cudaMallocManaged((void**)&x[i],ny[i]*sizeof(float));
        err = cudaMallocManaged((void**)&y[i],ny[i]*sizeof(float));
        err = cudaMallocManaged((void**)&tmp[i],nx[i]*sizeof(float));

        init_arrays(nx[i],ny[i],a[i],x[i]);
        for(int j=0; j<ny[i]; j++)
             y[i][j] = 0.0;
        for(int j=0; j<nx[i]; j++)
            tmp[i][j] = 0.0;
        i++;
        ret = fscanf(stdin, "%d %d", &X, &Y);
    }
    assert(err == cudaSuccess);
    lunch(nx,ny,a,x,y,tmp);
    i=0;
  
cudaFree(nx);
cudaFree(ny);
cudaFree(a);
cudaFree(x);
cudaFree(y);
cudaFree(tmp);

The code compiles correctly, but it does not execute methodA. When I add a print statement inside methodA, nothing is printed. When I move the for loop out, the statement does print.

I encourage you to use proper error checking and also run your codes with cuda-memcheck, before asking others for help. Your code will spit out errors in cuda-memcheck. Even if you don’t understand those errors, providing them to others when you ask for help (especially if you are going to provide an incomplete code, making it difficult for others to run) is advised.

You cannot create a host variable:

float *a[MaxSize];

and use that variable in device code:

myKernel << <blocksPerGrid, threadsPerBlock >> >(nx,ny,a,x,y,tmp); // excute on kernel
                                                       ^

Here, a is a host-based pointer. You may wish to review how arrays of pointers are stored, and what it means to reference an array using its name (a in this case). These are C/C++ programming concepts, not specific to CUDA. The a pointer points to a location in host memory (which is the location of the first element of the array a). Since this pointer points to a location in host memory, it cannot be used in device code. Nothing that you have done with cudaMallocManaged in any way changes what a is or the address that it points to.

(I’m mixing pointer and array here somewhat loosely. a is an array. When a is used by itself, it decays to a pointer which points to the first element of the array a.)

There are a variety of other possible problems with this code, I’m sure I have not uncovered them all:

  • One possible problem is that you are taking user-provided input from stdin. If that input does not go all the way up to MaxSize, then some of your array elements will be unallocated. The threads handling those unallocated entries will generate faults on the GPU, because they are accessing uninitialized data.

  • Another code defect, is that this is not correct:

cudaFree(a);

In your code, the a pointer was not allocated using the CUDA API, so expecting to free it using CUDA is not valid. Since a is an immediate/stack-based array, attempting to free it under any circumstance doesn’t make sense in your code.

  • The code you have shown has various other defects in it, including things that would prevent compilation, such as this:
printf("HELLO"]);

Because of the complexities of handling and passing multiply-subscripted arrays between device and host, I often recommend to beginners that they flatten such arrays. Managed memory makes the process easier, however, to be sure. Here is a relatively simple modification of your code, demonstrating a possible approach. Note that this is primarily intended to demonstrate a method to handle the first issue I identified. It does not seek to address every possible defect with this code. For example, your calculation of grid size will not work correctly in cases where MaxSize is not divisible by threadsPerBlock.

$ cat t326.cu
#include <stdio.h>
#include <assert.h>
#define MaxSize 32
#define DATA_TYPE float
// Per-thread device routine: announce the thread with a printf, then clear
// the first ny elements of y. a, x, and tmp are unused placeholders kept to
// mirror the original interface.
__device__ void methodA(int nx, int ny, DATA_TYPE* a, DATA_TYPE* x, DATA_TYPE* y,DATA_TYPE* tmp)
{
    printf("HELLO\n");
    for (int k = 0; k < ny; ++k) {
        y[k] = 0;
    }
}
// Kernel: thread i handles problem instance i, reading its sizes and buffer
// pointers from slot i of each managed pointer array.
// Grid/block layout: expects a 1-D launch; the bounds guard makes the kernel
// safe when gridDim.x*blockDim.x exceeds MaxSize (e.g. a rounded-up grid),
// so extra tail threads simply do nothing instead of faulting on
// unallocated slots.
__global__ void myKernel(int* nx,int* ny,DATA_TYPE* a[MaxSize],DATA_TYPE* x[MaxSize], DATA_TYPE* y[MaxSize], DATA_TYPE* tmp[MaxSize])
{
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < MaxSize)  // guard the grid tail
                methodA(nx[i],ny[i],a[i],x[i],y[i],tmp[i]);
}
// Host wrapper: launches myKernel once over MaxSize work items and times it
// with CUDA events. cudaEventElapsedTime reports milliseconds; the value is
// measured but not reported here.
// Error handling: the launch and the subsequent synchronization are checked
// (kernel launches do not return errors directly — launch-config errors
// surface via cudaGetLastError(), execution errors at the next sync).
void lunch(int* nx,int* ny,DATA_TYPE* a[MaxSize],DATA_TYPE* x[MaxSize], DATA_TYPE* y[MaxSize], DATA_TYPE* tmp[MaxSize]){

        cudaEvent_t start, stop; // timing events
        float time;
        int threadsPerBlock = 32;
        // ceil-division so every element gets a thread even when MaxSize is
        // not a multiple of threadsPerBlock (identical to MaxSize/threadsPerBlock
        // when it divides evenly, as with MaxSize == 32 here)
        int blocksPerGrid = (MaxSize + threadsPerBlock - 1) / threadsPerBlock;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0); // start timer
    myKernel << <blocksPerGrid, threadsPerBlock >> >(nx,ny,a,x,y,tmp); // execute the kernel
    assert(cudaGetLastError() == cudaSuccess);      // catch launch-configuration errors
    assert(cudaDeviceSynchronize() == cudaSuccess); // catch asynchronous execution errors
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
}
int main()
{
    // The top-level pointer arrays must themselves be managed allocations:
    // a host stack array of device pointers cannot be dereferenced in a kernel.
    float **a;
    float **x;
    float **y;
    float **tmp;
    // Check every allocation as it happens — the original only asserted the
    // last err, so earlier failures went unnoticed.
    cudaError_t err = cudaMallocManaged(&a, MaxSize*sizeof(float *));
    assert(err == cudaSuccess);
    err = cudaMallocManaged(&x, MaxSize*sizeof(float *));
    assert(err == cudaSuccess);
    err = cudaMallocManaged(&y, MaxSize*sizeof(float *));
    assert(err == cudaSuccess);
    err = cudaMallocManaged(&tmp, MaxSize*sizeof(float *));
    assert(err == cudaSuccess);
    int *ny, *nx;
    err = cudaMallocManaged((void **)&nx, sizeof(int)*MaxSize);
    assert(err == cudaSuccess);
    err = cudaMallocManaged((void **)&ny, sizeof(int)*MaxSize);
    assert(err == cudaSuccess);
    // Fill every slot so no thread sees an unallocated entry (fixed 4x4
    // instances, matching the original demo).
    for (int i = 0; i < MaxSize; i++) {
        nx[i] = 4;
        ny[i] = 4;
        err = cudaMallocManaged((void**)&a[i], nx[i]*ny[i]*sizeof(float));
        assert(err == cudaSuccess);
        err = cudaMallocManaged((void**)&x[i], ny[i]*sizeof(float));
        assert(err == cudaSuccess);
        err = cudaMallocManaged((void**)&y[i], ny[i]*sizeof(float));
        assert(err == cudaSuccess);
        err = cudaMallocManaged((void**)&tmp[i], nx[i]*sizeof(float));
        assert(err == cudaSuccess);

        for (int j = 0; j < ny[i]; j++)
            y[i][j] = 0.0f;
        for (int j = 0; j < nx[i]; j++)
            tmp[i][j] = 0.0f;
    }
    lunch(nx,ny,a,x,y,tmp);
    // Release everything allocated above: managed memory is freed with
    // cudaFree — per-slot buffers first, then the pointer arrays themselves.
    for (int i = 0; i < MaxSize; i++) {
        cudaFree(a[i]);
        cudaFree(x[i]);
        cudaFree(y[i]);
        cudaFree(tmp[i]);
    }
    cudaFree(a);
    cudaFree(x);
    cudaFree(y);
    cudaFree(tmp);
    cudaFree(nx);
    cudaFree(ny);
    return 0;
}
$ nvcc -o t326 t326.cu
$ cuda-memcheck ./t326
========= CUDA-MEMCHECK
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
HELLO
========= ERROR SUMMARY: 0 errors
$

Thank you so much. That works :)