There were several problems with the original code. The version below should have them fixed:
#include <cstdint>
#include <iostream>
// Image height: number of rows. Used as the row bound in the kernel and as the
// height argument of the pitched allocation and the 2D copy.
constexpr int IMG_ROW = 540;
// Image width: number of uint16_t elements per row. Also the row stride of the
// tightly packed 1D buffer.
constexpr int IMG_COL = 512;
// Copies a pitched 2D image into a tightly packed, row-major 1D buffer.
//
//   tiff_2D  source image; consecutive rows start Pitch2D bytes apart
//   tiff_1D  destination, written at index row * IMG_COL + column
//   Pitch2D  source row stride in BYTES (not elements)
//
// Expects a 2D launch: the x thread index selects the column and the y thread
// index selects the row, one element per thread. Threads that land outside the
// IMG_COL x IMG_ROW image do nothing.
__global__ void Trans2DTo1D(uint16_t* tiff_2D, uint16_t* tiff_1D, size_t Pitch2D)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: the launch grid is rounded up to whole blocks.
    if (col >= IMG_COL || row >= IMG_ROW)
        return;

    // The pitch is expressed in bytes, so step to the row start through a
    // byte pointer before reinterpreting it as 16-bit elements.
    const char* rowStart = reinterpret_cast<const char*>(tiff_2D) + row * Pitch2D;
    const uint16_t* srcRow = reinterpret_cast<const uint16_t*>(rowStart);

    tiff_1D[row * IMG_COL + col] = srcRow[col];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true iff `status` is cudaSuccess, so calls can be chained with &&.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Round-trip self-test for Trans2DTo1D:
//   1. fill a host buffer with a known pattern and upload it to a packed device buffer,
//   2. copy that buffer into a pitched 2D allocation,
//   3. run the kernel to flatten the pitched image back into a packed 1D buffer,
//   4. download the result and compare it element-by-element with the pattern.
// Returns 0 when every element matches; returns 1 if any CUDA call fails or the
// first mismatch is found (the mismatch is printed on stdout).
int main(){
    const size_t count    = (size_t)IMG_ROW * IMG_COL;
    const size_t bytes    = count * sizeof(uint16_t);
    const size_t rowBytes = IMG_COL * sizeof(uint16_t);

    // Device pointers start as nullptr so the cleanup below is safe on every path.
    uint16_t *tiff_1D = nullptr, *dev_res = nullptr, *tiff_2D = nullptr;
    size_t Pitch2D = 0;

    // Host reference pattern (h_res) and buffer for the downloaded result (r_res).
    uint16_t* h_res = new uint16_t[count];
    uint16_t* r_res = new uint16_t[count];
    for (int i = 0; i < IMG_ROW*IMG_COL; i++) h_res[i] = i%65535;

    // One thread per pixel; the grid is rounded up so partial tiles are covered.
    dim3 block2DTo1D(32,32);
    dim3 grid2DTo1D((IMG_COL + block2DTo1D.x - 1)/block2DTo1D.x, (IMG_ROW + block2DTo1D.y - 1)/block2DTo1D.y);

    // Short-circuit chain: the first failing call is reported and the rest are skipped.
    bool ok =
        cudaOk(cudaMalloc((void**)&tiff_1D, bytes), "cudaMalloc(tiff_1D)") &&
        cudaOk(cudaMalloc((void**)&dev_res, bytes), "cudaMalloc(dev_res)") &&
        cudaOk(cudaMallocPitch(&tiff_2D, &Pitch2D, rowBytes, IMG_ROW), "cudaMallocPitch(tiff_2D)") &&
        cudaOk(cudaMemset(tiff_1D, 0, bytes), "cudaMemset(tiff_1D)") &&
        cudaOk(cudaMemcpy(dev_res, h_res, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host -> dev_res)") &&
        // Packed source (stride rowBytes) into pitched destination (stride Pitch2D).
        cudaOk(cudaMemcpy2D(tiff_2D, Pitch2D, dev_res, rowBytes, rowBytes, IMG_ROW, cudaMemcpyDeviceToDevice),
               "cudaMemcpy2D(dev_res -> tiff_2D)");

    if (ok)
    {
        Trans2DTo1D<<<grid2DTo1D, block2DTo1D>>>(tiff_2D, tiff_1D, Pitch2D);
        // A launch does not return a status itself; query it explicitly. The
        // blocking copy that follows surfaces any error raised during execution.
        ok = cudaOk(cudaGetLastError(), "Trans2DTo1D launch") &&
             cudaOk(cudaMemcpy(r_res, tiff_1D, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(tiff_1D -> host)");
    }

    int exitCode = ok ? 0 : 1;
    if (ok)
    {
        for (int i = 0; i < IMG_ROW*IMG_COL; i++)
            if (r_res[i] != h_res[i])
            {
                std::cout << "Mismatch at: " << i << " was: " << r_res[i] << " should be: " << h_res[i] << std::endl;
                exitCode = 1;  // report the verification failure to the caller
                break;         // only the first mismatch is reported
            }
    }

    // Release everything on all paths; cudaFree accepts a null pointer.
    cudaFree(tiff_2D);
    cudaFree(dev_res);
    cudaFree(tiff_1D);
    delete[] r_res;
    delete[] h_res;
    return exitCode;
}
I don’t have a list of everything I changed, but for example this is not correct:
You are supposed to multiply the row index by the number of columns, not the number of rows.
This is also incorrect:
cudaMalloc
takes the address of the pointer to modify.
Finally, I reversed the sense of tidx
and tidy
in kernel code, to address the performance issue I mentioned.