I have a 2D array arr[N][N] = {{0,1,12,13},{0,1,2,3},{0,1,22,33},{0,1,12,13}} (here N = 4).
I copy it to the device, and now I need each thread to perform an operation on 2 rows of the
array:
thread 0 : {0,1,12,13},{0,1,2,3}
thread 1 : {0,1,22,33},{0,1,12,13}
This is the part of the code that I have written so far, but I think that it is not the best way to do it:
// Copies the two consecutive rows assigned to the calling thread into tab.
//
// A is indexed as a flattened, row-major N x N int array: thread `tid` handles
// rows 2*tid and 2*tid + 1, i.e. the 2*N contiguous ints starting at
// A[tid * 2 * N]. Only the x dimension of the launch is used to derive `tid`.
//
// NOTE(review): N and tab are declared outside this snippet. tab must hold at
// least 2*N ints; if tab is shared by all threads rather than private to each
// thread, the writes below race with each other -- confirm against its
// declaration.
__global__ void kernel(int *A){
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Only N / 2 threads have a full pair of rows to process. Any extra thread
    // in the launch would index past the N * N elements of A, so it exits early.
    // (assumes A holds exactly N * N ints; with an odd N the last row is not
    // covered by any thread -- TODO confirm N is even)
    if (tid >= N / 2) return;

    // First element of this thread's row pair. The stride between threads is
    // 2 * N (two rows), not N, so neighbouring threads never overlap.
    const int first = tid * 2 * N;

    // A single induction variable bounds both the source and the destination:
    // exactly 2 * N elements are copied, tab[0 .. 2*N-1] <- A[first .. first+2*N-1].
    for (int j = 0; j < 2 * N; ++j)
        tab[j] = A[first + j];
}