or operation to a 2d array in cuda

Hey, I got 2d array in c++ and I want to do an or operation across it’s cols.

int main() {
	const int height = 4, width = 3;
	char grid[height][width] = 
	{
		{1,2,1},
		{1,2,2},
		{1,1,1},
		{1,2,2}
	};
	char result[width];

	// the operation I want to do using cuda:
	for (int i = 0; i < height; i++) {
		for (int j = 0; j < width; j++) {
			result[j] = result[j] | grid[i][j];
		}
	}
	/*
	result[0] = grid[0][0] | grid[1][0] | grid[2][0] | grid[3][0]
	result[1] = grid[0][1] | grid[1][1] | grid[2][1] | grid[3][1]
	result[2] = grid[0][2] | grid[1][2] | grid[2][2] | grid[3][2]
	result[3] = grid[0][3] | grid[1][3] | grid[2][3] | grid[3][3]
	*/

	// instad of the nested for loop:
	Kernal << <height, width >> > (grid, result);
}
// ignore the fact that im not using a sherd memory, it's only illustration of what I want to do
__global__ void Kernal(char** grid, char* result) {
	char& gridCell = grid[blockIdx.x][threadIdx.x];
	char& resultCell = result[threadIdx.x];
	// this code dosen't work because im not using atomics.
	// atomics are too slow for me and works only on int.
	// how can I do this code as fast as possible?
	resultCell = resultCell | gridCell;
}

Having stated your intentions, is there anything that prevents you from doing this?