How to convert a 2D Matrix to a 1D matrix in cuda device?

I’m new to CUDA programming. I want to convert a 2D matrix to a 1D matrix on the CUDA device, but I can’t work out how to index the 2D matrix and the 1D matrix. Can you please tell me how to set up the indices of the 2D matrix and the 1D matrix to make it work?

This is my code. What is the problem with it?

``````__global__ void Trans2DTo1D(uint16_t* tiff_2D, uint16_t* tiff_1D, size_t Pitch2D)
{
// Copies one element per thread from the pitched buffer tiff_2D into the flat buffer tiff_1D.
// Pitch2D is applied to a char* below, i.e. it is the row stride of tiff_2D in bytes.

// Global 2D coordinates of this thread within the launch grid.
int tidx = blockIdx.x * blockDim.x + threadIdx.x;
int tidy = blockIdx.y * blockDim.y + threadIdx.y;

// Here tidx selects the row (bounded by 540) and tidy selects the element within the row (bounded by 512).
if ((tidx < 540) && (tidy < 512))
{
// Step tidx rows forward in bytes, then reinterpret the row start as uint16_t elements.
uint16_t* row_tiff_2D = (uint16_t*)((char*)tiff_2D + tidx * Pitch2D);
// NOTE(review): the flat index multiplies the row index by 540 (the row count) although each
// row holds 512 elements; the stride should be 512. As written, different (row, column) pairs
// can collide, and the largest index (539 * 540 + 511) exceeds a 540 * 512 element buffer.
tiff_1D[tidx * 540+ tidy] = row_tiff_2D[tidy];

}
}
``````

Can you give an example what your code should do? What do you mean by “converting a 2D matrix to a 1D matrix” ?

Your code looks like it could be replaced by `cudaMemcpy2D` .

What I mean is to convert data from 2D arrays to 1D arrays for storage, and the conversion needs to be done on the CUDA device. This is my requirement, just like the following code does on the CPU.

``````int row = sizeof(a) / sizeof(a[0]);
int col = sizeof(a[0]) / sizeof(a[0][0]);
// row and col are derived from the sizes of a, a[0] and a[0][0];
// a itself is declared outside this snippet.

// Flat destination holding row*col elements.
int b[row*col];
// Running write position in b; after each inner iteration it equals i*col + j + 1.
int cnt=0;
for(int i = 0; i < row; i++)
{
for(int j = 0; j<col; j++)
{
// Elements are appended row by row, so a[i][j] lands at b[i*col + j].
b[cnt]=a[i][j];
cnt++;
}
}
``````
``````tiff_1D[row index * num columns + column index]=tiff_2D[row index][column index]
``````

That is what the kernel should do, where row index and column index are the thread indices.

But in your kernel, `uint16_t* tiff_2D` already is a 1d array, not a 2d array. You need to declare it as pointer of pointer like `uint16_t** tiff_2D`

Here is the code from my main() function. I use cudaMemcpy2D to initialize tiff_2D.

``````uint16_t* tiff_1D;
// NOTE(review): (void**)tiff_1D casts the uninitialized pointer value itself; cudaMalloc needs
// the address of the pointer it should fill in, i.e. (void**)&tiff_1D.
cudaMalloc((void**)tiff_1D, IMG_ROW * IMG_COL * sizeof(uint16_t));
cudaMemset(tiff_1D, 0, IMG_ROW * IMG_COL * sizeof(uint16_t));

uint16_t* tiff_2D;
size_t Pitch2D;
// Pitched allocation: 540 rows, each 512 uint16_t elements wide; the row stride in bytes is returned in Pitch2D.
cudaMallocPitch(&tiff_2D, &Pitch2D, 512* sizeof(uint16_t), 540);
// Device-to-device copy from dev_res (read with a tightly packed row stride of 512 * sizeof(uint16_t)) into the pitched buffer.
cudaMemcpy2D(tiff_2D, Pitch2D, dev_res, 512* sizeof(uint16_t), 512* sizeof(uint16_t), 540, cudaMemcpyDeviceToDevice);
// grid2DTo1D and block2DTo1D are defined outside this snippet.
Trans2DTo1D << <grid2DTo1D, block2DTo1D >> > (tiff_2D, tiff_1D, Pitch2D);
``````
`dev_res` is the result of the previous step.

It is still not clear to me what you are trying to do.
`tiff_1D`, `tiff_2D`, and `dev_res` are already stored in a 1d array, i.e. contiguous memory.

Why not copy `dev_res` directly into `tiff_1D` with cudaMemcpy2D?

I don’t see any problems with the original kernel code you posted. However, from a performance perspective, using tidy for “horizontal” indexing (along a row) and tidx for “vertical” indexing (across rows) is not recommended. Switch the usage of tidx and tidy, and likewise switch the sense of the kernel launch parameters (which you have not shown the definition of).

And the `__syncthreads()` is unnecessary and should be removed. The usage there is arguably illegal.

The following is the definition of the parameters:

``````uint16_t* tiff_1D;
// NOTE(review): (void**)tiff_1D casts the uninitialized pointer value itself; cudaMalloc needs
// the address of the pointer it should fill in, i.e. (void**)&tiff_1D.
cudaMalloc((void**)tiff_1D, IMG_ROW * IMG_COL * sizeof(uint16_t));
cudaMemset(tiff_1D, 0, IMG_ROW * IMG_COL * sizeof(uint16_t));

uint16_t* tiff_2D;
size_t Pitch2D;
// Pitched allocation: 540 rows, each 512 uint16_t elements wide; the row stride in bytes is returned in Pitch2D.
cudaMallocPitch(&tiff_2D, &Pitch2D, 512* sizeof(uint16_t), 540);
// Device-to-device copy from dev_res (read with a tightly packed row stride of 512 * sizeof(uint16_t)) into the pitched buffer.
cudaMemcpy2D(tiff_2D, Pitch2D, dev_res, 512* sizeof(uint16_t), 512* sizeof(uint16_t), 540, cudaMemcpyDeviceToDevice);
// grid2DTo1D and block2DTo1D are defined outside this snippet.
Trans2DTo1D << <grid2DTo1D, block2DTo1D >> > (tiff_2D, tiff_1D, Pitch2D);
``````
`dev_res` is the result of the previous step.

I just tried copying dev_res into tiff_1D with cudaMemcpy2D, but it did not work.
Maybe I made a mistake when initializing tiff_1D?

There were various problems. This should have the various issues fixed:

``````#include <cstdint>
#include <iostream>

// Image height: number of rows in the 2D image.
const int IMG_ROW = 540;
// Image width: number of uint16_t elements in each row.
const int IMG_COL = 512;

// Copies a pitched 2D image into a tightly packed, row-major 1D buffer.
//
// Launch layout: 2D grid of 2D blocks; x covers columns (IMG_COL) and y covers
// rows (IMG_ROW). Each in-range thread copies exactly one element.
//
//   tiff_2D  source image whose rows are Pitch2D bytes apart
//   tiff_1D  destination holding IMG_ROW * IMG_COL elements
//   Pitch2D  source row stride in bytes
__global__ void Trans2DTo1D(uint16_t* tiff_2D, uint16_t* tiff_1D, size_t Pitch2D)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads in the grid tail that fall outside the image have nothing to do.
    if (col >= IMG_COL || row >= IMG_ROW)
        return;

    // The pitch is expressed in bytes, so advance through a byte pointer
    // before viewing the row as uint16_t elements.
    const char* rowStart = (const char*)tiff_2D + row * Pitch2D;
    const uint16_t* srcRow = (const uint16_t*)rowStart;

    // Adjacent threads in x touch adjacent elements on both the read and the write.
    tiff_1D[row * IMG_COL + col] = srcRow[col];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what)
{
    if (err == cudaSuccess) return true;
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(err) << std::endl;
    return false;
}

// Self-test for Trans2DTo1D: fills a packed device image with a known pattern,
// copies it into a pitched allocation, flattens it back with the kernel and
// compares the result with the pattern on the host.
// Returns 0 when every element matches, 1 on any CUDA error or mismatch.
int main(){
    const size_t numElems = static_cast<size_t>(IMG_ROW) * IMG_COL;
    const size_t numBytes = numElems * sizeof(uint16_t);
    const size_t rowBytes = IMG_COL * sizeof(uint16_t);

    // Device pointers start as nullptr so the cleanup below is safe on every path
    // (cudaFree accepts a null pointer).
    uint16_t *tiff_1D = nullptr, *dev_res = nullptr, *tiff_2D = nullptr;
    size_t Pitch2D = 0;

    uint16_t *h_res = new uint16_t[numElems];   // reference pattern
    uint16_t *r_res = new uint16_t[numElems];   // result read back from the device
    for (size_t i = 0; i < numElems; i++) h_res[i] = static_cast<uint16_t>(i % 65535);

    // The && chain stops at the first failing call, so later calls never run on bad state.
    bool ok =
        cudaOk(cudaMalloc((void**)&tiff_1D, numBytes), "cudaMalloc(tiff_1D)") &&
        cudaOk(cudaMalloc((void**)&dev_res, numBytes), "cudaMalloc(dev_res)") &&
        cudaOk(cudaMemset(tiff_1D, 0, numBytes), "cudaMemset(tiff_1D)") &&
        cudaOk(cudaMemcpy(dev_res, h_res, numBytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_res)") &&
        cudaOk(cudaMallocPitch(&tiff_2D, &Pitch2D, rowBytes, IMG_ROW), "cudaMallocPitch(tiff_2D)") &&
        cudaOk(cudaMemcpy2D(tiff_2D, Pitch2D, dev_res, rowBytes, rowBytes, IMG_ROW,
                            cudaMemcpyDeviceToDevice), "cudaMemcpy2D(tiff_2D)");

    if (ok)
    {
        // x covers columns and y covers rows; ceil-div so the grid covers the whole image.
        dim3 block2DTo1D(32,32);
        dim3 grid2DTo1D((IMG_COL + block2DTo1D.x - 1)/block2DTo1D.x, (IMG_ROW + block2DTo1D.y - 1)/block2DTo1D.y);
        Trans2DTo1D << <grid2DTo1D, block2DTo1D >> > (tiff_2D, tiff_1D, Pitch2D);

        // A launch does not return a status: cudaGetLastError catches a bad launch
        // configuration, and the blocking copy back surfaces in-kernel faults.
        ok = cudaOk(cudaGetLastError(), "Trans2DTo1D launch") &&
             cudaOk(cudaMemcpy(r_res, tiff_1D, numBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(r_res)");
    }

    if (ok)
    {
        for (size_t i = 0; i < numElems; i++)
            if (r_res[i] != h_res[i])
            {
                std::cout << "Mismatch at: " << i << " was: " << r_res[i] << " should be: " << h_res[i] << std::endl;
                ok = false;   // report only the first mismatch
                break;
            }
    }

    cudaFree(tiff_2D);
    cudaFree(dev_res);
    cudaFree(tiff_1D);
    delete[] r_res;
    delete[] h_res;

    return ok ? 0 : 1;
}
``````

I don’t have a list of everything I changed, but for example this is not correct: `tiff_1D[tidx * 540+ tidy] = row_tiff_2D[tidy];`

You are supposed to multiply the row index by the number of columns, not the number of rows.

This is also incorrect: `cudaMalloc((void**)tiff_1D, IMG_ROW * IMG_COL * sizeof(uint16_t));`

`cudaMalloc` takes the address of the pointer to modify.

Finally, I reversed the sense of `tidx` and `tidy` in kernel code, to address the performance issue I mentioned.

Thanks! It works~ Thanks for your help