Hello, I wrote a test program that saves one warp’s data into “dstrow_out”, but the printed values are not what I expect. Could anyone help review it? I am not sure whether the code does the right thing. Thank you very much!
#include<cuda.h>
#include<cstdlib>
#include<iostream>
#include<vector>
// Number of threads along blockDim.x used by the launch in main().
#define WARP_SIZE 32
// Number of threads along blockDim.y used by the launch in main().
#define BLOCKDIMY 32
// Integer ceiling division for positive operands: smallest k with k * y >= x.
// The whole expansion is parenthesised so the macro can be embedded in a
// larger expression (e.g. `a / CEIL(b, c)`) without changing its meaning.
// NOTE: `y` is evaluated twice; do not pass expressions with side effects.
#define CEIL(x, y) (((x) + (y) - 1) / (y))
/**
 * Debug kernel: stages `indices` through shared memory one batch at a time and
 * records the destination row observed by the threads with threadIdx.y == 0
 * of block 0.
 *
 * Launch requirements:
 *   - 2D block (blockDim.x, blockDim.y); with blockDim.x == 32 the threads
 *     sharing one threadIdx.y form exactly one warp.
 *   - Dynamic shared memory: blockDim.x * blockDim.y * sizeof(int) bytes.
 *   - `dstrow_out` must hold at least blockDim.x ints.
 *
 * Semantics of the output:
 *   `dst_row` depends only on threadIdx.y (one source row per thread-row), so
 *   every thread with threadIdx.y == 0 stores the SAME value,
 *   indices[chunk_start]. The stores are repeated for every chunk, so after
 *   the kernel finishes all blockDim.x entries of `dstrow_out` hold the index
 *   of the last chunk's first row. Every block performs the same staging work,
 *   but only block 0 writes the output. Nothing is written when n <= 0.
 *
 * @tparam scalar_t   Unused by this test kernel.
 * @param indices     Device array of `n` row indices.
 * @param n           Number of entries in `indices`.
 * @param dstrow_out  Device output array (see above).
 */
template
<typename scalar_t>
__global__ void test_save_warpdata(
int* indices,
const int n,
int* dstrow_out)
{
    extern __shared__ char buf[];
    int* indices_batch = (int*)(buf);

    // Both values are identical for every iteration, so compute them once.
    const int batch_size = blockDim.x * blockDim.y;
    const int tid = threadIdx.x + threadIdx.y * blockDim.x;

    // The loop bounds depend only on n and blockDim, so all threads of the
    // block execute the same number of iterations and reach every barrier.
    for (int batch_start = 0; batch_start < n; batch_start += batch_size)
    {
        // Cooperative load: each thread stages at most one index.
        if (batch_start + tid < n)
            indices_batch[tid] = indices[batch_start + tid];

        // The reads below touch entries written by other threads; without
        // this barrier they race with the stores above.
        __syncthreads();

        const int batch_end = (n - batch_start < batch_size) ? n
                                                             : batch_start + batch_size;

        for (int chunk_start = batch_start; chunk_start < batch_end; chunk_start += blockDim.y)
        {
            const int src_row = chunk_start + threadIdx.y;

            // Thread-rows past the end of the batch have no staged index;
            // skip them instead of reading uninitialised shared memory.
            if (src_row >= batch_end)
                continue;

            const int dst_row = indices_batch[src_row - batch_start];

            // save block0's warp dst_row
            if (blockIdx.x == 0 && threadIdx.y == 0)
                dstrow_out[threadIdx.x] = dst_row;
        }

        // Keep the next batch's stores from overwriting entries that slower
        // threads of this block are still reading.
        __syncthreads();
    }
}
// Prints a diagnostic and terminates the program when a CUDA runtime call
// did not return cudaSuccess. `what` names the failing operation.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Host driver: fills 16 indices with the pattern 0,1,0,1,..., runs
// test_save_warpdata on them and prints the WARP_SIZE values the kernel
// stored in its output buffer.
int main()
{
    const int indexnum = 16;

    // Host input: indices[i] = i % 2.
    std::vector<int> indices(indexnum);
    for (int i = 0; i < indexnum; ++i) {
        indices[i] = i % 2;
    }
    for (int i = 0; i < indexnum; ++i) {
        std::cout<<indices[i]<<std::endl;
    }

    int* d_index = nullptr;
    int* dstrow = nullptr;
    checkCuda(cudaMalloc((void**)&d_index, indexnum*sizeof(int)), "cudaMalloc(d_index)");
    checkCuda(cudaMemcpy(d_index, indices.data(), indexnum*sizeof(int), cudaMemcpyHostToDevice),
              "cudaMemcpy(host -> d_index)");

    checkCuda(cudaMalloc((void**)&dstrow, WARP_SIZE*sizeof(int)), "cudaMalloc(dstrow)");
    // cudaMalloc does not initialise memory; zero the output so any entry the
    // kernel does not write prints as 0 rather than as leftover garbage.
    checkCuda(cudaMemset(dstrow, 0, WARP_SIZE*sizeof(int)), "cudaMemset(dstrow)");

    int stride = 100;
    dim3 grid(CEIL(stride, WARP_SIZE));
    dim3 block(WARP_SIZE, BLOCKDIMY);
    // The kernel stages one int per thread of the block in dynamic shared memory.
    const size_t shared_bytes = sizeof(int)*WARP_SIZE*BLOCKDIMY;

    test_save_warpdata<float><<<grid, block, shared_bytes>>>(d_index, indexnum, dstrow);
    // A launch returns no status itself: configuration errors surface via
    // cudaGetLastError(), faults inside the kernel at the next synchronisation.
    checkCuda(cudaGetLastError(), "test_save_warpdata launch");
    checkCuda(cudaDeviceSynchronize(), "test_save_warpdata execution");

    std::vector<int> hdstrow(WARP_SIZE);
    checkCuda(cudaMemcpy(hdstrow.data(), dstrow, sizeof(int)*WARP_SIZE, cudaMemcpyDeviceToHost),
              "cudaMemcpy(dstrow -> host)");

    std::cout<<"+++++dstrow:"<<std::endl;
    for (int i = 0; i < WARP_SIZE; ++i) {
        std::cout<<hdstrow[i]<<std::endl;
    }

    // Release the device buffers; the host vectors clean up after themselves.
    checkCuda(cudaFree(dstrow), "cudaFree(dstrow)");
    checkCuda(cudaFree(d_index), "cudaFree(d_index)");
    return 0;
}