How do I pass a double-pointer array to the device? I'm getting cudaErrorIllegalAddress

Hi, I have the following code that gives me code=77 (cudaErrorIllegalAddress) when I test the device array by printing a value from it. What am I doing wrong?

#include <stdio.h> 
#include <stdlib.h>  
#include <string.h>

#include <cuda_runtime.h>

#include <helper_functions.h>
#include <helper_cuda.h>

#define ROWS    118059
#define COLUMNS 66


__global__ void testing_kernel(const char** __restrict__ d_buffer00)
{
    // 2D GRID OF 1D BLOCKS
    unsigned int blockId = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int tid = blockId * blockDim.x + threadIdx.x;

    if (tid < 118058)
    {
        if (tid == 0)
            printf("number is: %s", d_buffer00[tid * COLUMNS + 4]);
    }
}


int main(int argc, char *argv[])
{
    FILE *fptr;
    unsigned int i;
    char str[ROWS];

    char** h_buffer00;
    const char** d_buffer00;

    // Open file
    fptr = fopen("TEMPLATENEWBIS.txt", "r");
    if (fptr == NULL)
    {
        printf("Cannot open file \n");
        exit(0);
    }

    /* Allocate host_buffer00 as pinned memory */
    checkCudaErrors(cudaMallocHost((void**)&h_buffer00, ROWS * COLUMNS * sizeof(const char *)));

    i = 0;
    while (fgets(str, ROWS, fptr))
    {
        int count;
        char *token = strtok(str, " ");

        h_buffer00[i] = strdup(token);

        count = 0;
        while (token != NULL)
        {
            i++;
            count++;
            if (count == 66)
                break;
            token = strtok(NULL, " ");
            h_buffer00[i] = strdup(token);
        }
    }

    /* Allocate device_buffer00 on the device global memory */
    checkCudaErrors(cudaMalloc((void**)&d_buffer00, ROWS * COLUMNS * sizeof(char *)));

    /* Transfer data to the device host_D .. dev_D */
    checkCudaErrors(cudaMemcpy(d_buffer00, h_buffer00, ROWS * COLUMNS * sizeof(char *), cudaMemcpyHostToDevice));

    /* Clean Pinned Host Memory */
    checkCudaErrors(cudaFreeHost(h_buffer00));

    // 2D GRID OF 1D BLOCKS
    dim3 grid(2000, 1, 1);
    dim3 blocks(64, 1, 1);

    testing_kernel<<<grid, blocks>>>(d_buffer00);

    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaPeekAtLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    checkCudaErrors(cudaFree(d_buffer00));

    return 0;
}

It works exactly the same as it works in standard C++. Given that CUDA is a subset of C++11, that shouldn’t come as a surprise. Note: I did not look at your code. What follows is a canned boilerplate response to “double pointer” questions.

What you need is a deep copy (look up the term if you are not familiar with it). The “double pointer” is a bit of a misnomer and potentially confusion-inducing: what you are passing is a pointer to an array of pointers to row (or column) vectors. Note: For performance reasons it is never advisable, on CPU or GPU, to use such a data type when a regular, plain old, contiguously allocated matrix would do. A special place in hell is reserved for all those who apparently teach the use of such constructs in lieu of proper matrices.

Below is an example I constructed in response to a previous, similar, question. Note that I omitted all error checking for clarity of exposition. In a real program, you would definitely want to use proper CUDA error checking.

#include <cstdio>
#include <cstdlib>
#include <complex>
#include "cuComplex.h"

#define N  (2)
#define M  (3)

typedef std::complex<float> T;

__global__ void print_device_matrix (cuComplex** mat)
{
    printf ("matrix on device:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            printf ("(%f, %f)  ", cuCrealf (mat[i][j]), cuCimagf (mat[i][j]));
        }
        printf ("\n");
    }
}

int main (void)
{
    /* allocate host "matrix" */
    T **mat = (T**)malloc (N * sizeof (mat[0]));
    for (int i = 0; i < N; i++) {
        mat[i] = (T *)malloc (M * sizeof (mat[0][0]));
    }
    
    /* fill in host "matrix" */
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            mat[i][j] = T (float(i)+1, float(j)+1);
        }
    }

    /* print host "matrix" */
    printf ("matrix on host:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            printf ("(%f, %f)  ", real(mat[i][j]), imag(mat[i][j]));
        }
        printf ("\n");
    }

    /* allocate device "matrix" */
    T **tmp = (T**)malloc (N * sizeof (tmp[0]));
    for (int i = 0; i < N; i++) {
        cudaMalloc ((void **)&tmp[i], M * sizeof (tmp[0][0]));
    }
    cuComplex **matD = 0;
    cudaMalloc ((void **)&matD, N * sizeof (matD[0]));

    /* copy "matrix" from host to device */
    cudaMemcpy (matD, tmp, N * sizeof (matD[0]), cudaMemcpyHostToDevice);
    for (int i = 0; i < N; i++) {
        cudaMemcpy (tmp[i], mat[i], M * sizeof (matD[0][0]), cudaMemcpyHostToDevice);
    }
    free (tmp);

    /* print device "matrix" */
    print_device_matrix<<<1,1>>> (matD);

    /* free host "matrix" */
    for (int i = 0; i < N; i++) {
        free (mat[i]);
    }
    free (mat);
    
    /* free device "matrix" */
    tmp = (T**)malloc (N * sizeof (tmp[0]));
    cudaMemcpy (tmp, matD, N * sizeof (matD[0]), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++) {
        cudaFree (tmp[i]);
    }
    free (tmp);
    cudaFree (matD);

    return EXIT_SUCCESS;
}
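
For comparison, here is a minimal sketch of the contiguously allocated alternative mentioned above (assuming plain float elements and row-major indexing; error checking again omitted). A single allocation and a single cudaMemcpy replace the whole deep copy:

#include <cstdio>
#include <cstdlib>

#define N  (2)
#define M  (3)

__global__ void print_device_matrix (const float* mat)
{
    printf ("matrix on device:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            printf ("%f  ", mat[i * M + j]);   /* row-major: element (i,j) */
        }
        printf ("\n");
    }
}

int main (void)
{
    /* allocate and fill host matrix as one contiguous block */
    float *mat = (float *)malloc (N * M * sizeof (mat[0]));
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            mat[i * M + j] = float(i) + float(j) / 10.0f;
        }
    }

    /* allocate device matrix and copy it with a single cudaMemcpy */
    float *matD = 0;
    cudaMalloc ((void **)&matD, N * M * sizeof (matD[0]));
    cudaMemcpy (matD, mat, N * M * sizeof (matD[0]), cudaMemcpyHostToDevice);

    print_device_matrix<<<1,1>>> (matD);
    cudaDeviceSynchronize ();

    cudaFree (matD);
    free (mat);
    return EXIT_SUCCESS;
}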

Hi, thanks

The problem is that the matrix will not allow me to store strings, as in this part of my code:

i = 0;
while (fgets(str, ROWS, fptr))
{
    int count;
    char *token = strtok(str, " ");

    h_buffer00[i] = strdup(token);

    count = 0;
    while (token != NULL)
    {
        i++;
        count++;
        if (count == 66)
            break;
        token = strtok(NULL, " ");
        h_buffer00[i] = strdup(token);
    }
}

Sorry, I don’t debug third party code. I find my own bugs annoying enough.

As I said, CUDA is a subset of C++, and things generally work the same, except that you may have to keep track in which address space (host or device) a given piece of data is at a particular time.

If your issue derives from insufficient knowledge of C++, please consult a C++ text book or reference of your choice.

Ok, but there must be a way of using double-pointer arrays. h_buffer00 works well; I do not understand why passing a double-pointer array to the kernel does not work.

It is not a debugging issue; I just need to know how to pass double-pointer arrays to the device.

If I create a function on the host and pass it the h_buffer00 array, it works; I do not understand why it does not work on the device.

I provided a worked example in my initial response. Have you had a chance to look at that? The output from the code I posted should look something like this:

C:\Users\Norbert\My Programs>array_of_pointers_to_vectors.exe
matrix on host:
(1.000000, 1.000000)  (1.000000, 2.000000)  (1.000000, 3.000000)
(2.000000, 1.000000)  (2.000000, 2.000000)  (2.000000, 3.000000)
matrix on device:
(1.000000, 1.000000)  (1.000000, 2.000000)  (1.000000, 3.000000)
(2.000000, 1.000000)  (2.000000, 2.000000)  (2.000000, 3.000000)

Yes I did, but it does not work with strings.

As I said, this is a canned response using a complex type for array elements. You will have to adapt this if your elements are chars. I am not going to provide such an adaptation.

It is my experience that working through such issues in detail on one’s own ultimately results in a deeper understanding of the underlying fundamental language semantics. My recommendation would be to spend some quality time (several hours) on this problem. Drawing the data structures and their memory layout on a sheet of paper can often be helpful. I am confident you can figure this out.

Yes indeed, however I have already spent two days on this, because I did not want to change the code. I’ll try to find a way.

Sometimes it takes longer than a few hours. That’s perfectly normal. Take your time. I am a bit of a slow thinker myself, but I know from experience that if I think long and hard enough and work through an issue step by step (or try simpler examples first), I can figure things out. I am convinced so can you.

As I said, the important thing to keep in mind is that there are two address spaces: host and device. A pointer is an address. A pointer that represents an address in host address space cannot be dereferenced in device code, and vice versa. [Full disclosure: This is not always strictly true, but scenarios where there isn’t a strict separation of host and device address spaces are an advanced topic that should not concern you for now]

My suggestion would be to use two sheets of paper for visualization, where one sheet represents data structures and memory layout in host address space, and the other represents device address space.

There are questions with valid answers on stackoverflow that demonstrate various ways of passing arrays of strings to the GPU, if you want to look for them.
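
For illustration, one common pattern (a rough sketch with made-up example strings, not code from those answers) avoids the pointer array altogether by packing all strings into one contiguous char buffer on the host, together with an array of starting offsets, so that only flat arrays ever cross the host/device boundary:

#include <cstdio>
#include <cstring>

#define NUM_STRINGS 3

__global__ void print_strings (const char *buf, const int *offsets, int n)
{
    /* each string starts at buf + offsets[i] and is NUL-terminated */
    for (int i = 0; i < n; i++) {
        printf ("string %d: %s\n", i, buf + offsets[i]);
    }
}

int main (void)
{
    const char *strings[NUM_STRINGS] = { "alpha", "bravo", "charlie" };

    /* pack strings back to back (including terminating NULs) and record offsets */
    char packed[256];
    int offsets[NUM_STRINGS];
    int pos = 0;
    for (int i = 0; i < NUM_STRINGS; i++) {
        offsets[i] = pos;
        strcpy (packed + pos, strings[i]);
        pos += (int)strlen (strings[i]) + 1;
    }

    /* two plain copies to the device; no deep copy is needed */
    char *d_packed = 0;
    int  *d_offsets = 0;
    cudaMalloc ((void **)&d_packed, pos * sizeof (char));
    cudaMalloc ((void **)&d_offsets, NUM_STRINGS * sizeof (int));
    cudaMemcpy (d_packed, packed, pos * sizeof (char), cudaMemcpyHostToDevice);
    cudaMemcpy (d_offsets, offsets, NUM_STRINGS * sizeof (int), cudaMemcpyHostToDevice);

    print_strings<<<1,1>>> (d_packed, d_offsets, NUM_STRINGS);
    cudaDeviceSynchronize ();

    cudaFree (d_packed);
    cudaFree (d_offsets);
    return 0;
}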

Hi @njuffa, I just stumbled across this. I have one question regarding the logic of the following lines of code:

/* copy "matrix" from host to device */
cudaMemcpy (matD, tmp, N * sizeof (matD[0]), cudaMemcpyHostToDevice);
for (int i = 0; i < N; i++) {
    cudaMemcpy (tmp[i], mat[i], M * sizeof (matD[0][0]), cudaMemcpyHostToDevice);
}

For the cudaMemcpy in the for loop, why do we specify that the destination is the tmp matrix, which is stored on the host? I initially did the following, which didn't work:

cudaMemcpy (matD, mat, N * sizeof (matD[0]), cudaMemcpyHostToDevice);
for (int i = 0; i < N; i++) {
    cudaMemcpy (matD[i], mat[i], M * sizeof (matD[0][0]), cudaMemcpyHostToDevice);
}

Could you possibly explain why the host data stored in mat is copied to tmp, and can then be accessed via matD?

The data structure used here to store a matrix is an array of pointers to arrays. The pointer array tmp is stored on the host side, but the elements of tmp, i.e. tmp[], contain pointers that point to device memory allocated with cudaMalloc(). The content of tmp is copied to matD on the device side as part of the deep copy operation performed here.

One needs to carefully distinguish between the memory space a pointer is stored in and the memory space that whatever it points to is stored in; then it all should be clear.
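
To make that distinction concrete, here is a stripped-down sketch of the same deep copy (my own illustration, using plain float elements rather than cuComplex and again without error checking), with comments noting where each pointer is stored and what it points to:

#include <cstdlib>

int main (void)
{
    const int N = 2, M = 3;

    /* tmp lives on the host; each tmp[i] will hold a device address */
    float **tmp = (float **)malloc (N * sizeof (tmp[0]));
    for (int i = 0; i < N; i++) {
        cudaMalloc ((void **)&tmp[i], M * sizeof (float));  /* device allocation, address stored on the host */
    }

    /* matD itself is a host variable, but it points to device memory */
    float **matD = 0;
    cudaMalloc ((void **)&matD, N * sizeof (matD[0]));

    /* copy the device addresses held in tmp[] into the device-resident pointer array matD */
    cudaMemcpy (matD, tmp, N * sizeof (matD[0]), cudaMemcpyHostToDevice);

    /* tmp[i] works as a cudaMemcpy destination because it is read on the host;
       matD[i] does not, because reading matD[i] on the host dereferences a device pointer */
    float row[M] = { 1.0f, 2.0f, 3.0f };
    cudaMemcpy (tmp[0], row, M * sizeof (float), cudaMemcpyHostToDevice);      /* OK      */
    /* cudaMemcpy (matD[0], row, M * sizeof (float), cudaMemcpyHostToDevice);     ILLEGAL */

    for (int i = 0; i < N; i++) cudaFree (tmp[i]);
    cudaFree (matD);
    free (tmp);
    return EXIT_SUCCESS;
}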
