Problem with thrust::host_vector<int*>

When I defined and assigned a two-dimensional array using host_vector, the first four data of each row were in error.
The following code:

__global__ void cudaThrustKernel2D(int** ptr_d_vec, const int label, const int cols, const int rows){
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  int j = blockDim.y * blockIdx.y + threadIdx.y;
  if(i < cols && j < rows){
    ptr_d_vec[i][j] += label;
    // int temp = ptr_d_vec[i][j];
    // printf("%d \n", temp);
  }
}

void thrustKernel2D(){
  printf("***********thrustKernel2D************\n");
  // initial data
  cudaError_t err = cudaSuccess;
  int rows = 28;
  int cols = 47;
  int label = 1;
  dim3 dimBlock(32, 4);
  dim3 dimGrid(ceil((cols + dimBlock.x - 1) / dimBlock.x), ceil((rows + dimBlock.y - 1) / dimBlock.y));

  // set 2D thrust vector use int*
  thrust::host_vector<int*> h_vec(cols);
  for(int i = 0; i < cols; i++){
    thrust::host_vector<int> hData(rows, 1);
    int* ptr_hData = thrust::raw_pointer_cast(&hData[0]);
    h_vec[i] = ptr_hData;
    thrust::host_vector<int>().swap(hData);
  }

  // print the result of compute
  for(int i = 0; i < cols; i++){
    for(int j = 0; j < rows; j++){
      printf("%d ", h_vec[i][j]);
    }
    printf("\n");
  }
  thrust::device_vector<int*> d_vec = h_vec;
  int** ptr_d_vec = thrust::raw_pointer_cast(&d_vec[0]);

  printf("call kernel:\n");
  // call kernel
  cudaThrustKernel2D <<<dimGrid, dimBlock>>> (ptr_d_vec, label, cols, rows);
  printf("copy data:\n");
  // copy data from device to host
  thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());

  // print the result of compute
  for(int i = 0; i < cols; i++){
    for(int j = 0; j < rows; j++){
      printf("%d ", h_vec[i][j]);
    }
    printf("\n");
  }

  //free data
  thrust::host_vector<int*>().swap(h_vec);
  thrust::device_vector<int*>().swap(d_vec);
}

the result is following:

0 0 -2099142640 85 1 1 1 1 1 1 1 1 1 1 1 ...
call kernel
copy data
teminate called after throwing an instance of 'thrust::system::system_error'
   what(): trivial_device_copy D->H failed: cudaErrorLaunchFailure: unspecified launch failure
Aborted(core dumped)

You’ve copied the array of pointers successfully, but those pointers all point to host allocations. Those host allocations are not accessible in device code. This sort of operation requires a deep copy, or use of managed memory (which effectively handles the deep copy for you.)

You can probably fix this fairly simply by making hData a device_vector instead of host vector.

Also, vector gets deallocated when it goes out of scope. Your code has this problem in the initial for-loop. The following changes seem to work for me:

$ cat t1873.cu
#include <cstdio>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

__global__ void cudaThrustKernel2D(int** ptr_d_vec, const int label, const int cols, const int rows){
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  int j = blockDim.y * blockIdx.y + threadIdx.y;
  if(i < cols && j < rows){
    ptr_d_vec[i][j] += label;
    // int temp = ptr_d_vec[i][j];
    // printf("%d \n", temp);
  }
}

void thrustKernel2D(){
  printf("***********thrustKernel2D************\n");
  // initial data
  cudaError_t err = cudaSuccess;
  int rows = 28;
  int cols = 47;
  int label = 1;
  dim3 dimBlock(32, 4);
  dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);

  // set 2D thrust vector use int*
  thrust::host_vector<int*> h_vec(cols);
  for(int i = 0; i < cols; i++){
    thrust::device_vector<int> *hData = new thrust::device_vector<int>(rows, 1);
    int* ptr_hData = thrust::raw_pointer_cast(hData->data());
    h_vec[i] = ptr_hData;
  }

  thrust::device_vector<int*> d_vec = h_vec;
  int** ptr_d_vec = thrust::raw_pointer_cast(&d_vec[0]);

  printf("call kernel:\n");
  // call kernel
  cudaThrustKernel2D <<<dimGrid, dimBlock>>> (ptr_d_vec, label, cols, rows);
  printf("copy data:\n");
  thrust::host_vector<int> r(rows);
  // print the result of compute
  for(int i = 0; i < cols; i++){
    err = cudaMemcpy(thrust::raw_pointer_cast(r.data()), h_vec[i], rows*sizeof(int), cudaMemcpyDeviceToHost);
    for(int j = 0; j < rows; j++){
      printf("%d ", r[j]);
    }
    printf("\n");
  }

}

int main(){

  thrustKernel2D();
}
$ nvcc -o t1873 t1873.cu
t1873.cu(18): warning: variable "err" was set but never used

$ cuda-memcheck ./t1873
========= CUDA-MEMCHECK
***********thrustKernel2D************
call kernel:
copy data:
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
========= ERROR SUMMARY: 0 errors
$

Hi Robert_Crovella,
Thank you for your advice! It work for me. One more questions I would like to ask you is
why you use cudaMemcpy() instead of thrust::copy().

Thanks!

Since h_vec[i] is a pointer, not a thrust vector, it seemed convenient to me to do it that way. You could probably get it to work with thrust::copy also.

Thank you very much!