How to convert a 2D Matrix to a 1D matrix in cuda device?

I’m a new cuda programmer. Now I want to convert a 2D Matrix to a 1D matrix in cuda device, I can’t determine the index of two-2D matrix and 1D matrix, can you please tell me how to set the index of two-2D matrix and 1D matrix to make it work?

this is my code, what’s problem in this?

__global__ void Trans2DTo1D(uint16_t* tiff_2D, uint16_t* tiff_1D, size_t Pitch2D)
{
    int tidx = blockIdx.x * blockDim.x + threadIdx.x;
    int tidy = blockIdx.y * blockDim.y + threadIdx.y;

    if ((tidx < 540) && (tidy < 512))
    {
        uint16_t* row_tiff_2D = (uint16_t*)((char*)tiff_2D + tidx * Pitch2D);
        tiff_1D[tidx * 540+ tidy] = row_tiff_2D[tidy];
        __syncthreads();

    }
}

Can you give an example what your code should do? What do you mean by “converting a 2D matrix to a 1D matrix” ?

Your code looks like it could be replaced by cudaMemcpy2D .

What I mean is to convert data from 2D arrays to 1D arrays for storage, and the conversion process needs to be done on CUDA device. This is my requirement, just as the follow code on the CPU.

int row = sizeof(a) / sizeof(a[0]);
int col = sizeof(a[0]) / sizeof(a[0][0]);

int b[row*col];
int cnt=0;
for(int i = 0; i < row; i++)
{
    for(int j = 0; j<col; j++)
    {
        b[cnt]=a[i][j];
        cnt++;
    }
}
tiff_1D[row index * num columns + column index]=tiff_2D[row index][column index]

That is what the kernel should do, where row index and column index are the thread indices.

But in your kernel, uint16_t* tiff_2D already is a 1d array, not a 2d array. You need to declare it as pointer of pointer like uint16_t** tiff_2D

I will push the main() function code. I use cudaMemcpy2D to init tiff_2D.

uint16_t* tiff_1D;
cudaMalloc((void**)tiff_1D, IMG_ROW * IMG_COL * sizeof(uint16_t));
cudaMemset(tiff_1D, 0, IMG_ROW * IMG_COL * sizeof(uint16_t));

uint16_t* tiff_2D;
size_t Pitch2D;
cudaMallocPitch(&tiff_2D, &Pitch2D, 512* sizeof(uint16_t), 540);
cudaMemcpy2D(tiff_2D, Pitch2D, dev_res, 512* sizeof(uint16_t), 512* sizeof(uint16_t), 540, cudaMemcpyDeviceToDevice);
Trans2DTo1D << <grid2DTo1D, block2DTo1D >> > (tiff_2D, tiff_1D, Pitch2D);
dev_res

is the result of the previous step

It is still not clear to me what you are trying to do.
tiff_1D, tiff_2D, and dev_res are already stored in a 1d array, i.e. contiguous memory.

Why not copy dev_res directly into tiff_1D with cudaMemcpy2D?

I don’t see any problems with the original kernel code you posted. However, from a performance perspective, using tidy for “horizontal” indexing (along a row) and tidx for “vertical” indexing (across rows) is not recommend. Switch the usage of tidx and tidy, and likewise switch the sense of the kernel launch parameters (which you have not shown the definition of).

And the __syncthreads() is unnecessary and should be removed. The usage there is arguably illegal.

the follow is the definition of the parameters

uint16_t* tiff_1D;
cudaMalloc((void**)tiff_1D, IMG_ROW * IMG_COL * sizeof(uint16_t));
cudaMemset(tiff_1D, 0, IMG_ROW * IMG_COL * sizeof(uint16_t));

uint16_t* tiff_2D;
size_t Pitch2D;
cudaMallocPitch(&tiff_2D, &Pitch2D, 512* sizeof(uint16_t), 540);
cudaMemcpy2D(tiff_2D, Pitch2D, dev_res, 512* sizeof(uint16_t), 512* sizeof(uint16_t), 540, cudaMemcpyDeviceToDevice);
Trans2DTo1D << <grid2DTo1D, block2DTo1D >> > (tiff_2D, tiff_1D, Pitch2D);
dev_res

is the result of the previous step

I just try copy dev_res into tiff_1D with cudaMemcpy2D, it did not work.
maybe I make mistake in init tiff_1D?

There were various problems. This should have the various issues fixed:

#include <cstdint>
#include <iostream>

const int IMG_ROW = 540;
const int IMG_COL = 512;

__global__ void Trans2DTo1D(uint16_t* tiff_2D, uint16_t* tiff_1D, size_t Pitch2D)
{
    int tidx = blockIdx.x * blockDim.x + threadIdx.x;
    int tidy = blockIdx.y * blockDim.y + threadIdx.y;

    if ((tidx < IMG_COL) && (tidy < IMG_ROW))
    {
        uint16_t* row_tiff_2D = (uint16_t*)((char*)tiff_2D + tidy * Pitch2D);
        tiff_1D[tidy * IMG_COL + tidx] = row_tiff_2D[tidx];
    }
}


int main(){

  uint16_t  *tiff_1D, *dev_res, *h_res, *r_res;
  cudaMalloc((void**)&tiff_1D, IMG_ROW * IMG_COL * sizeof(uint16_t));
  cudaMalloc((void**)&dev_res, IMG_ROW * IMG_COL * sizeof(uint16_t));
  cudaMemset(tiff_1D, 0, IMG_ROW * IMG_COL * sizeof(uint16_t));
  h_res = new uint16_t[IMG_ROW*IMG_COL];
  r_res = new uint16_t[IMG_ROW*IMG_COL];
  for (int i = 0; i < IMG_ROW*IMG_COL; i++) h_res[i] = i%65535;
  cudaMemcpy(dev_res, h_res, IMG_ROW*IMG_COL * sizeof(uint16_t), cudaMemcpyHostToDevice);
  dim3 block2DTo1D(32,32);
  dim3 grid2DTo1D((IMG_COL + block2DTo1D.x - 1)/block2DTo1D.x, (IMG_ROW + block2DTo1D.y - 1)/block2DTo1D.y);
  uint16_t* tiff_2D;
  size_t Pitch2D;
  cudaMallocPitch(&tiff_2D, &Pitch2D, IMG_COL* sizeof(uint16_t), IMG_ROW);
  cudaMemcpy2D(tiff_2D, Pitch2D, dev_res, IMG_COL* sizeof(uint16_t), IMG_COL* sizeof(uint16_t), IMG_ROW, cudaMemcpyDeviceToDevice);
  Trans2DTo1D << <grid2DTo1D, block2DTo1D >> > (tiff_2D, tiff_1D, Pitch2D);
  cudaMemcpy(r_res, tiff_1D, IMG_ROW*IMG_COL*sizeof(uint16_t), cudaMemcpyDeviceToHost);
  for (int i = 0; i < IMG_ROW*IMG_COL; i++)
    if (r_res[i] != h_res[i])
    {
      std::cout << "Mismatch at: " << i << " was: " << r_res[i] << " should be: " << h_res[i] << std::endl;
      return 0;
    }
  return 0;
}

I’m don’t have a list of everything I changed, but for example this is not correct:

You are supposed to multiply the row index by the number of columns, not the number of rows.

This is also incorrect:

cudaMalloc takes the address of the pointer to modify.

Finally, I reversed the sense of tidx and tidy in kernel code, to address the performance issue I mentioned.

Thanks! It works~ Thanks for your help