Hey, I got 2d array in c++ and I want to do an or operation across it’s cols.
int main() {
const int height = 4, width = 3;
char grid[height][width] =
{
{1,2,1},
{1,2,2},
{1,1,1},
{1,2,2}
};
char result[width];
// the operation I want to do using cuda:
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
result[j] = result[j] | grid[i][j];
}
}
/*
result[0] = grid[0][0] | grid[1][0] | grid[2][0] | grid[3][0]
result[1] = grid[0][1] | grid[1][1] | grid[2][1] | grid[3][1]
result[2] = grid[0][2] | grid[1][2] | grid[2][2] | grid[3][2]
result[3] = grid[0][3] | grid[1][3] | grid[2][3] | grid[3][3]
*/
// instad of the nested for loop:
Kernal << <height, width >> > (grid, result);
}
// ignore the fact that im not using a sherd memory, it's only illustration of what I want to do
__global__ void Kernal(char** grid, char* result) {
char& gridCell = grid[blockIdx.x][threadIdx.x];
char& resultCell = result[threadIdx.x];
// this code dosen't work because im not using atomics.
// atomics are too slow for me and works only on int.
// how can I do this code as fast as possible?
resultCell = resultCell | gridCell;
}