How do I pass a double-pointer array to the device? I'm getting cudaErrorIllegalAddress

Hi, I have the following code that gives me code=77 (cudaErrorIllegalAddress) when I test the device array by printing a value from it. What am I doing wrong?

#include <stdio.h> 
#include <stdlib.h>  
#include <string.h>

#include <cuda_runtime.h>

#include <helper_functions.h>
#include <helper_cuda.h>

#define ROWS    118059
#define COLUMNS 66


__global__ void testing_kernel(const char** __restrict__ d_buffer00)
{
    // 2D GRID OF 1D BLOCKS
    unsigned int blockId = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int tid = blockId * blockDim.x + threadIdx.x;

    if (tid < 118058)
    {
        if (tid == 0)
            printf("number is: %s", d_buffer00[tid * COLUMNS + 4]);
    }
}


int main(int argc, char *argv[])
{
    FILE *fptr;
    unsigned int i;
    char str[ROWS];

    char** h_buffer00;
    const char** d_buffer00;

    // Open file
    fptr = fopen("TEMPLATENEWBIS.txt", "r");
    if (fptr == NULL)
    {
        printf("Cannot open file \n");
        exit(0);
    }

    /* Allocate host_buffer00 as pinned memory */
    checkCudaErrors(cudaMallocHost((void**)&h_buffer00, ROWS * COLUMNS * sizeof(const char *)));

    i = 0;
    while (fgets(str, ROWS, fptr))
    {
        int count;
        char *token = strtok(str, " ");

        h_buffer00[i] = strdup(token);

        count = 0;
        while (token != NULL)
        {
            i++;
            count++;
            if (count == 66)
                break;
            token = strtok(NULL, " ");
            h_buffer00[i] = strdup(token);
        }
    }

    /* Allocate device_buffer00 on the device global memory */
    checkCudaErrors(cudaMalloc((void**)&d_buffer00, ROWS * COLUMNS * sizeof(char *)));

    /* Transfer data to the device host_D .. dev_D */
    checkCudaErrors(cudaMemcpy(d_buffer00, h_buffer00, ROWS * COLUMNS * sizeof(char *), cudaMemcpyHostToDevice));

    /* Clean Pinned Host Memory */
    checkCudaErrors(cudaFreeHost(h_buffer00));

    // 2D GRID OF 1D BLOCKS
    dim3 grid(2000, 1, 1);
    dim3 blocks(64, 1, 1);

    testing_kernel<<<grid, blocks>>>(d_buffer00);

    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaPeekAtLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    checkCudaErrors(cudaFree(d_buffer00));

    return 0;
}

It works exactly the same as it works in standard C++. Given that CUDA is a subset of C++11, that shouldn’t come as a surprise. Note: I did not look at your code. What follows is a canned boilerplate response to “double pointer” questions.

What you need is a deep copy (look up the term if you are not familiar with it). The “double pointer” is a bit of a misnomer and potentially confusion-inducing: what you are passing is a pointer to an array of pointers to row (or column) vectors. Note: For performance reasons it is never advisable, on CPU or GPU, to use such a data type when a regular, plain old, contiguously allocated matrix would do. A special place in hell is reserved for all those who apparently teach the use of such constructs in lieu of proper matrices.

Below is an example I constructed in response to a previous, similar, question. Note that I omitted all error checking for clarity of exposition. In a real program, you would definitely want to use proper CUDA error checking.

#include <cstdio>
#include <cstdlib>
#include <complex>
#include "cuComplex.h"

#define N  (2)
#define M  (3)

typedef std::complex<float> T;

__global__ void print_device_matrix (cuComplex** mat)
{
    printf ("matrix on device:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            printf ("(%f, %f)  ", cuCrealf (mat[i][j]), cuCimagf (mat[i][j]));
        }
        printf ("\n");
    }
}

int main (void)
{
    /* allocate host "matrix" */
    T **mat = (T**)malloc (N * sizeof (mat[0]));
    for (int i = 0; i < N; i++) {
        mat[i] = (T *)malloc (M * sizeof (mat[0][0]));
    }
    
    /* fill in host "matrix" */
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            mat[i][j] = T (float(i)+1, float(j)+1);
        }
    }

    /* print host "matrix" */
    printf ("matrix on host:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            printf ("(%f, %f)  ", real(mat[i][j]), imag(mat[i][j]));
        }
        printf ("\n");
    }

    /* allocate device "matrix" */
    T **tmp = (T**)malloc (N * sizeof (tmp[0]));
    for (int i = 0; i < N; i++) {
        cudaMalloc ((void **)&tmp[i], M * sizeof (tmp[0][0]));
    }
    cuComplex **matD = 0;
    cudaMalloc ((void **)&matD, N * sizeof (matD[0]));

    /* copy "matrix" from host to device */
    cudaMemcpy (matD, tmp, N * sizeof (matD[0]), cudaMemcpyHostToDevice);
    for (int i = 0; i < N; i++) {
        cudaMemcpy (tmp[i], mat[i], M * sizeof (matD[0][0]), cudaMemcpyHostToDevice);
    }
    free (tmp);

    /* print device "matrix" */
    print_device_matrix<<<1,1>>> (matD);

    /* free host "matrix" */
    for (int i = 0; i < N; i++) {
        free (mat[i]);
    }
    free (mat);
    
    /* free device "matrix" */
    tmp = (T**)malloc (N * sizeof (tmp[0]));
    cudaMemcpy (tmp, matD, N * sizeof (matD[0]), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++) {
        cudaFree (tmp[i]);
    }
    free (tmp);
    cudaFree (matD);

    return EXIT_SUCCESS;
}
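
For comparison, here is a minimal sketch of the contiguously allocated alternative mentioned above (assuming plain float elements and row-major indexing; error checking again omitted). A single allocation and a single cudaMemcpy replace the whole deep copy:

#include <cstdio>
#include <cstdlib>

#define N  (2)
#define M  (3)

__global__ void print_device_matrix (const float* mat)
{
    printf ("matrix on device:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            printf ("%f  ", mat[i * M + j]);   /* row-major: element (i,j) */
        }
        printf ("\n");
    }
}

int main (void)
{
    /* allocate and fill host matrix as one contiguous block */
    float *mat = (float *)malloc (N * M * sizeof (mat[0]));
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            mat[i * M + j] = float(i) + float(j) / 10.0f;
        }
    }

    /* allocate device matrix and copy it with a single cudaMemcpy */
    float *matD = 0;
    cudaMalloc ((void **)&matD, N * M * sizeof (matD[0]));
    cudaMemcpy (matD, mat, N * M * sizeof (matD[0]), cudaMemcpyHostToDevice);

    print_device_matrix<<<1,1>>> (matD);
    cudaDeviceSynchronize ();

    cudaFree (matD);
    free (mat);
    return EXIT_SUCCESS;
}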

Hi, thanks

The problem is that the matrix will not allow me to store strings, as in this part of my code:

i = 0;
while (fgets(str, ROWS, fptr))
{
    int count;
    char *token = strtok(str, " ");

    h_buffer00[i] = strdup(token);

    count = 0;
    while (token != NULL)
    {
        i++;
        count++;
        if (count == 66)
            break;
        token = strtok(NULL, " ");
        h_buffer00[i] = strdup(token);
    }
}

Sorry, I don’t debug third party code. I find my own bugs annoying enough.

As I said, CUDA is a subset of C++, and things generally work the same, except that you may have to keep track in which address space (host or device) a given piece of data is at a particular time.

If your issue derives from insufficient knowledge of C++, please consult a C++ text book or reference of your choice.

Ok, but there must be a way of using double-pointer arrays. h_buffer00 works well; I do not understand why passing a double-pointer array to the kernel does not work.

It is not a debugging issue; I just need to know how to pass double-pointer arrays to the device.

If I create a function on the host and pass it the h_buffer00 array, it works; I do not understand why it does not work on the device.

I provided a worked example in my initial response. Have you had a chance to look at that? The output from the code I posted should look something like this:

C:\Users\Norbert\My Programs>array_of_pointers_to_vectors.exe
matrix on host:
(1.000000, 1.000000)  (1.000000, 2.000000)  (1.000000, 3.000000)
(2.000000, 1.000000)  (2.000000, 2.000000)  (2.000000, 3.000000)
matrix on device:
(1.000000, 1.000000)  (1.000000, 2.000000)  (1.000000, 3.000000)
(2.000000, 1.000000)  (2.000000, 2.000000)  (2.000000, 3.000000)

Yes I did, but it does not work with strings.

As I said, this is a canned response using a complex type for array elements. You will have to adapt this if your elements are chars. I am not going to provide such an adaptation.

It is my experience that working through such issues in detail on one’s own ultimately results in a deeper understanding of the underlying fundamental language semantics. My recommendation would be to spend some quality time (several hours) on this problem. Drawing the data structures and their memory layout on a sheet of paper can often be helpful. I am confident you can figure this out.

Yes indeed, however I have already spent two days on this, because I did not want to change the code. I’ll try to find a way.

Sometimes it takes longer than a few hours. That’s perfectly normal. Take your time. I am a bit of a slow thinker myself, but I know from experience that if I think long and hard enough and work through an issue step by step (or try simpler examples first), I can figure things out. I am convinced so can you.

As I said, the important thing to keep in mind is that there are two address spaces: host and device. A pointer is an address. A pointer that represents an address in host address space cannot be dereferenced in device code, and vice versa. [Full disclosure: This is not always strictly true, but scenarios where there isn’t a strict separation of host and device address spaces are an advanced topic that should not concern you for now]

My suggestion would be to use two sheets of paper for visualization, where one sheet represents data structures and memory layout in host address space, and the other represents device address space.

There are questions with valid answers on stackoverflow that demonstrate various ways of passing arrays of strings to the GPU, if you want to look for them.
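
For illustration, one common pattern (a rough sketch with made-up example strings, not code from those answers) avoids the pointer array altogether by packing all strings into one contiguous char buffer on the host, together with an array of starting offsets, so that only flat arrays ever cross the host/device boundary:

#include <cstdio>
#include <cstring>

#define NUM_STRINGS 3

__global__ void print_strings (const char *buf, const int *offsets, int n)
{
    /* each string starts at buf + offsets[i] and is NUL-terminated */
    for (int i = 0; i < n; i++) {
        printf ("string %d: %s\n", i, buf + offsets[i]);
    }
}

int main (void)
{
    const char *strings[NUM_STRINGS] = { "alpha", "bravo", "charlie" };

    /* pack strings back to back (including terminating NULs) and record offsets */
    char packed[256];
    int offsets[NUM_STRINGS];
    int pos = 0;
    for (int i = 0; i < NUM_STRINGS; i++) {
        offsets[i] = pos;
        strcpy (packed + pos, strings[i]);
        pos += (int)strlen (strings[i]) + 1;
    }

    /* two plain copies to the device; no deep copy is needed */
    char *d_packed = 0;
    int  *d_offsets = 0;
    cudaMalloc ((void **)&d_packed, pos * sizeof (char));
    cudaMalloc ((void **)&d_offsets, NUM_STRINGS * sizeof (int));
    cudaMemcpy (d_packed, packed, pos * sizeof (char), cudaMemcpyHostToDevice);
    cudaMemcpy (d_offsets, offsets, NUM_STRINGS * sizeof (int), cudaMemcpyHostToDevice);

    print_strings<<<1,1>>> (d_packed, d_offsets, NUM_STRINGS);
    cudaDeviceSynchronize ();

    cudaFree (d_packed);
    cudaFree (d_offsets);
    return 0;
}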

Hi @njuffa, I just stumbled across this. I have one question regarding the logic of the following lines of code:

/* copy "matrix" from host to device */
cudaMemcpy (matD, tmp, N * sizeof (matD[0]), cudaMemcpyHostToDevice);
for (int i = 0; i < N; i++) {
    cudaMemcpy (tmp[i], mat[i], M * sizeof (matD[0][0]), cudaMemcpyHostToDevice);
}

For the cudaMemcpy in the for loop, why do we specify that the destination is the tmp matrix, which is stored on the host? I initially did the following, which didn't work:

cudaMemcpy (matD, mat, N * sizeof (matD[0]), cudaMemcpyHostToDevice);
for (int i = 0; i < N; i++) {
    cudaMemcpy (matD[i], mat[i], M * sizeof (matD[0][0]), cudaMemcpyHostToDevice);
}

Could you possibly explain why the host data stored in mat is copied to tmp, and can then be accessed via matD?

The data structure used here to store a matrix is an array of pointers to arrays. The pointer array tmp is stored on the host side, but the elements of tmp, i.e. tmp[], contain pointers that point to device memory allocated with cudaMalloc(). The content of tmp is copied to matD on the device side as part of the deep copy operation performed here.

One needs to carefully distinguish between the memory space a pointer is stored in and the memory space that whatever it points to is stored in; then it all should be clear.
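
To make that distinction concrete, here is a stripped-down sketch of the same deep copy (my own illustration, using plain float elements rather than cuComplex and again without error checking), with comments noting where each pointer is stored and what it points to:

#include <cstdlib>

int main (void)
{
    const int N = 2, M = 3;

    /* tmp lives on the host; each tmp[i] will hold a device address */
    float **tmp = (float **)malloc (N * sizeof (tmp[0]));
    for (int i = 0; i < N; i++) {
        cudaMalloc ((void **)&tmp[i], M * sizeof (float));  /* device allocation, address stored on the host */
    }

    /* matD itself is a host variable, but it points to device memory */
    float **matD = 0;
    cudaMalloc ((void **)&matD, N * sizeof (matD[0]));

    /* copy the device addresses held in tmp[] into the device-resident pointer array matD */
    cudaMemcpy (matD, tmp, N * sizeof (matD[0]), cudaMemcpyHostToDevice);

    /* tmp[i] works as a cudaMemcpy destination because it is read on the host;
       matD[i] does not, because reading matD[i] on the host dereferences a device pointer */
    float row[M] = { 1.0f, 2.0f, 3.0f };
    cudaMemcpy (tmp[0], row, M * sizeof (float), cudaMemcpyHostToDevice);      /* OK      */
    /* cudaMemcpy (matD[0], row, M * sizeof (float), cudaMemcpyHostToDevice);     ILLEGAL */

    for (int i = 0; i < N; i++) cudaFree (tmp[i]);
    cudaFree (matD);
    free (tmp);
    return EXIT_SUCCESS;
}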
