save a warp's data in memory

Hello, I wrote a test program to save a warp’s data into “dstrow_out”, but the printed values are not right. Could anyone help review the code? I’m not sure it does the right thing. Thank you very much!

#include<cuda.h>
#include<iostream>

#define WARP_SIZE 32
#define BLOCKDIMY 32
#define CEIL(x, y) ((x) + (y) - 1) / (y)
// Stages `indices` through shared memory in batches of blockDim.x*blockDim.y
// ints, then has block 0's first warp row (threadIdx.y == 0) record the
// dst_row it observes into dstrow_out[threadIdx.x].
//
// Launch expectations: 2D block (WARP_SIZE x BLOCKDIMY), dynamic shared
// memory of at least blockDim.x*blockDim.y*sizeof(int) bytes, and
// dstrow_out sized to at least blockDim.x ints.
//
// NOTE(review): dst_row depends only on threadIdx.y, never on threadIdx.x,
// so every lane of the y==0 row stores the same value — presumably the
// author wanted per-lane values; confirm the intended indexing.
template
  <typename scalar_t>
__global__ void test_save_warpdata(
   int* indices,
   const int n,
   int* dstrow_out)
{

  extern __shared__ char buf[];
  int* indices_batch = (int*)(buf);

  for(int batch_start = 0; batch_start < n; batch_start += blockDim.x*blockDim.y)
  {
    int tid = threadIdx.x + threadIdx.y*blockDim.x;
    if(batch_start + tid < n)
      indices_batch[tid] = (int)indices[batch_start + tid];

    // BUG FIX: barrier so every thread sees the freshly staged batch before
    // any thread reads it — the original raced on indices_batch.
    __syncthreads();

    int batch_end = batch_start + blockDim.x*blockDim.y < n ?
                    batch_start + blockDim.x*blockDim.y : n;

    for(int chunk_start = batch_start; chunk_start < batch_end; chunk_start += blockDim.y)
    {
      int src_row = chunk_start + threadIdx.y;
      // BUG FIX: guard against src_row running past the valid range — when n
      // is not a multiple of blockDim.y (e.g. n=16, blockDim.y=32) the
      // original read uninitialized shared memory for the high threadIdx.y.
      if (src_row < batch_end)
      {
        int dst_row = indices_batch[src_row - batch_start];
        // save block0's warp dst_row
        if (blockIdx.x==0 && threadIdx.y == 0)
          dstrow_out[threadIdx.x] = dst_row;
      }
    }

    // BUG FIX: barrier before the next loop iteration overwrites the batch.
    __syncthreads();
  }

}

// Checks a CUDA runtime call (or post-launch status) and bails out of main
// with a diagnostic on failure. Use only inside main() — it `return`s.
#define CUDA_CHECK(call)                                                     \
  do {                                                                       \
    cudaError_t err_ = (call);                                               \
    if (err_ != cudaSuccess) {                                               \
      std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": "   \
                << cudaGetErrorString(err_) << std::endl;                    \
      return 1;                                                              \
    }                                                                        \
  } while (0)

// Host driver: fills a small index array, runs test_save_warpdata, and
// prints the per-lane dst_row values captured by block 0's first warp row.
int main()
{
  const int indexnum = 16;
  int* indices = new int[indexnum];

  // Fill with 0,1,0,1,... so the kernel's dst_row values are recognizable.
  auto filldata = [&](int size, int* data) {
    for (int i = 0; i < size; ++i) {
      data[i] = i % 2;
    }
  };
  filldata(indexnum, indices);

  for (int i = 0; i < indexnum; ++i) {
    std::cout << indices[i] << std::endl;
  }

  int* d_index = nullptr;
  int* dstrow  = nullptr;

  CUDA_CHECK(cudaMalloc((void**)&d_index, indexnum * sizeof(int)));
  CUDA_CHECK(cudaMemcpy(d_index, indices, indexnum * sizeof(int),
                        cudaMemcpyHostToDevice));

  // Grid sized from `stride` (unrelated to indexnum in the original repro);
  // each block is one warp wide and BLOCKDIMY rows deep.
  int stride = 100;
  dim3 grid(CEIL(stride, WARP_SIZE));
  dim3 block(WARP_SIZE, BLOCKDIMY);

  CUDA_CHECK(cudaMalloc((void**)&dstrow, WARP_SIZE * sizeof(int)));
  // BUG FIX: zero the output so stale device memory is never mistaken for
  // kernel results if a lane is not written.
  CUDA_CHECK(cudaMemset(dstrow, 0, WARP_SIZE * sizeof(int)));

  test_save_warpdata<float>
           <<<grid,
              block,
              sizeof(int)*WARP_SIZE*BLOCKDIMY
           >>>
           (d_index,
            indexnum,
            dstrow);
  // BUG FIX: the original checked neither launch-config nor execution
  // errors; a failed launch would silently leave dstrow unchanged.
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());

  int* hdstrow = new int[WARP_SIZE];
  CUDA_CHECK(cudaMemcpy(hdstrow, dstrow, sizeof(int) * WARP_SIZE,
                        cudaMemcpyDeviceToHost));
  std::cout << "+++++dstrow:" << std::endl;
  for (int i = 0; i < WARP_SIZE; ++i) {
    std::cout << hdstrow[i] << std::endl;
  }

  // BUG FIX: release host and device allocations — the original leaked all
  // four of them.
  delete[] indices;
  delete[] hdstrow;
  CUDA_CHECK(cudaFree(d_index));
  CUDA_CHECK(cudaFree(dstrow));
  return 0;
}

Your posted code doesn’t compile.

Before asking for help here, I usually also suggest running your code with cuda-memcheck (and use proper CUDA error checking)

When posting code here, please also properly format your code. When editing, select your code, then use the code button (</>)

Sorry — I forgot to sync the new code. It’s updated now; thank you very much.

The print out on the host side exactly matches what is being stored on the device side:

$ cat t8.cu
#include<cuda.h>
#include<iostream>

#define WARP_SIZE 32
#define BLOCKDIMY 32
#define CEIL(x, y) ((x) + (y) - 1) / (y)
// Stages `indices` through shared memory in batches of blockDim.x*blockDim.y
// ints, then has block 0's threadIdx.y==0 row store the dst_row it observes
// into dstrow_out[threadIdx.x], printing each store.
//
// NOTE(review): dst_row = indices_batch[src_row - batch_start] with
// src_row = chunk_start + threadIdx.y depends only on threadIdx.y. For the
// y==0 row it is always indices_batch[chunk_start - batch_start], so with
// indices = 0,1,0,1,... every lane prints dst_row = 0 — exactly the output
// below. The value never varies with threadIdx.x.
//
// NOTE(review): there is no __syncthreads() between the shared-memory write
// and the reads, and with n=16 < blockDim.y, threads with threadIdx.y >= 16
// read uninitialized shared memory — it happens not to matter for the y==0
// printout, but it is still a race/OOB read worth fixing.
template
  <typename scalar_t>
__global__ void test_save_warpdata(
   int* indices,
   const int n,
   int* dstrow_out)
{

  extern __shared__ char buf[];
  int* indices_batch = (int*)(buf);

  // One batch per blockDim.x*blockDim.y indices (a single batch when n=16).
  for(int batch_start = 0; batch_start < n; batch_start += blockDim.x*blockDim.y)
  {
    int tid = threadIdx.x + threadIdx.y*blockDim.x;
    if(batch_start + tid < n)
      indices_batch[tid] = (int)indices[batch_start + tid];

    int batch_end = batch_start + blockDim.x*blockDim.y < n ?
                    batch_start + blockDim.x*blockDim.y : n;

    for(int chunk_start = batch_start; chunk_start < batch_end; chunk_start += blockDim.y)
    {

      int src_row = chunk_start + threadIdx.y;
      int dst_row = indices_batch[src_row - batch_start];
      // save block0's warp dst_row
      if (blockIdx.x==0 && threadIdx.y == 0){
        dstrow_out[threadIdx.x] = dst_row;
        printf("threadIdx.x: %d, chunk_start: %d, dst_row: %d\n", threadIdx.x, chunk_start, dst_row);}

    }
  }

}

// Host driver for the repro: fills 16 indices with 0,1,0,1,..., launches the
// kernel, and prints the 32 ints copied back from dstrow — which match the
// device-side printf output exactly (all zeros).
int main()
{

  const int indexnum = 16;
  int * indices = new int[indexnum];

  // Alternating 0/1 pattern so dst_row values are easy to recognize.
  auto filldata = [&](int size, int* data) {
    for (int i = 0; i < size; ++i) {
      data[i] = i % 2;
    }
  };

  filldata(indexnum, indices);

  for (int i = 0 ; i < indexnum; ++i) {
          std::cout<<indices[i]<<std::endl;
  }
  int* d_index, *dstrow;

  cudaMalloc((void**)&d_index, indexnum*sizeof(int));

  cudaMemcpy(d_index, indices, indexnum*sizeof(int), cudaMemcpyHostToDevice);


  // NOTE(review): grid is sized from `stride`, not from indexnum — only
  // block 0 writes dstrow anyway, so the extra blocks are idle here.
  int stride = 100;
  dim3 grid(CEIL(stride, WARP_SIZE));
  dim3 block(WARP_SIZE, BLOCKDIMY);

  cudaMalloc((void**)&dstrow, WARP_SIZE*sizeof(int));
  // Dynamic shared memory: one int per thread in the block.
  test_save_warpdata<float>
           <<<grid,
              block,
              sizeof(int)*WARP_SIZE*BLOCKDIMY
           >>>
           (d_index,
            indexnum,
            dstrow);

  // cudaMemcpy is blocking, so it implicitly waits for the kernel to finish.
  int* hdstrow = new int[WARP_SIZE];
  cudaMemcpy(hdstrow, dstrow, sizeof(int)*WARP_SIZE, cudaMemcpyDeviceToHost);
  std::cout<<"+++++dstrow:"<<std::endl;
  for(int i = 0; i < WARP_SIZE; ++i) {
    std::cout<<hdstrow[i]<<std::endl;
  }
}
$ nvcc -std=c++14 -o t8 t8.cu
$ cuda-memcheck ./t8
========= CUDA-MEMCHECK
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
threadIdx.x: 0, chunk_start: 0, dst_row: 0
threadIdx.x: 1, chunk_start: 0, dst_row: 0
threadIdx.x: 2, chunk_start: 0, dst_row: 0
threadIdx.x: 3, chunk_start: 0, dst_row: 0
threadIdx.x: 4, chunk_start: 0, dst_row: 0
threadIdx.x: 5, chunk_start: 0, dst_row: 0
threadIdx.x: 6, chunk_start: 0, dst_row: 0
threadIdx.x: 7, chunk_start: 0, dst_row: 0
threadIdx.x: 8, chunk_start: 0, dst_row: 0
threadIdx.x: 9, chunk_start: 0, dst_row: 0
threadIdx.x: 10, chunk_start: 0, dst_row: 0
threadIdx.x: 11, chunk_start: 0, dst_row: 0
threadIdx.x: 12, chunk_start: 0, dst_row: 0
threadIdx.x: 13, chunk_start: 0, dst_row: 0
threadIdx.x: 14, chunk_start: 0, dst_row: 0
threadIdx.x: 15, chunk_start: 0, dst_row: 0
threadIdx.x: 16, chunk_start: 0, dst_row: 0
threadIdx.x: 17, chunk_start: 0, dst_row: 0
threadIdx.x: 18, chunk_start: 0, dst_row: 0
threadIdx.x: 19, chunk_start: 0, dst_row: 0
threadIdx.x: 20, chunk_start: 0, dst_row: 0
threadIdx.x: 21, chunk_start: 0, dst_row: 0
threadIdx.x: 22, chunk_start: 0, dst_row: 0
threadIdx.x: 23, chunk_start: 0, dst_row: 0
threadIdx.x: 24, chunk_start: 0, dst_row: 0
threadIdx.x: 25, chunk_start: 0, dst_row: 0
threadIdx.x: 26, chunk_start: 0, dst_row: 0
threadIdx.x: 27, chunk_start: 0, dst_row: 0
threadIdx.x: 28, chunk_start: 0, dst_row: 0
threadIdx.x: 29, chunk_start: 0, dst_row: 0
threadIdx.x: 30, chunk_start: 0, dst_row: 0
threadIdx.x: 31, chunk_start: 0, dst_row: 0
+++++dstrow:
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
========= ERROR SUMMARY: 0 errors
$

So you’ll need to debug your device code if that isn’t the printout you are expecting.

Thanks Robert. cuda-gdb is very useful — I found the bug in my code.