CUDA 2D Array Mapping

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>



/*
 * Fill a host buffer with an ascending sequence.
 *
 * ip   - destination array; must have room for at least `size` ints.
 * size - number of elements to write; nothing is written when size <= 0.
 *
 * After the call, ip[k] == k for every k in [0, size).
 */
void initialint(int *ip,int size)
{
    int value=0;
    int *cursor=ip;

    while(value<size){
        *cursor++=value;   /* store the element's own index, then advance */
        ++value;
    }
}

/*
 * Print a row-major integer matrix to stdout.
 *
 * C  - matrix data; element (row r, column c) is read from C[r*nx + c].
 * nx - number of columns (elements per row).
 * ny - number of rows.
 *
 * Emits a header line, then one text line per row with each value in a
 * 3-character field, followed by a trailing blank line.
 */
void printmatrix(int *C,const int nx,const int ny)
{
    printf("\n Matrix: (%d.%d) \n",nx,ny);

    for(int row=0;row<ny;++row){
        const int rowStart=row*nx;   /* flat offset of the first element in this row */
        for(int col=0;col<nx;++col)
            printf("%3d",C[rowStart+col]);
        printf("\n");
    }

    printf("\n");
}

/*
 * Debug kernel: every thread prints its thread index, block index, the 2D
 * matrix coordinate it maps to, the flat row-major offset of that coordinate,
 * and the value stored there.
 *
 * Expected launch: a 2D grid of 2D blocks whose x extent covers the nx columns
 * and whose y extent covers the ny rows. No shared memory is used.
 *
 * A  - device pointer to an nx*ny row-major int matrix.
 * nx - number of columns (row length, used as the row stride).
 * ny - number of rows.
 */
__global__ void printthreadindex(int *A,const int nx,const int ny)
{
    /* Global 2D coordinate of this thread: x runs along a row, y selects the row. */
    const int ix=threadIdx.x+blockIdx.x*blockDim.x;
    const int iy=threadIdx.y+blockIdx.y*blockDim.y;

    /* Threads of an overhanging grid fall outside the matrix; they must not read A. */
    if(ix>=nx||iy>=ny)
        return;

    /* Row-major flattening: skip iy full rows of nx elements, then ix within the row.
       Kept as int so it matches the %2d conversion below. */
    const int idx=ix+iy*nx;

    printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d,%d) global index %2d  ival %2d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,ix,iy,idx,A[idx]);

}

/* Report a failed CUDA runtime call on stderr; returns 1 on error, 0 on success. */
static int cudaFailed(cudaError_t err,const char *what)
{
    if(err==cudaSuccess)
        return 0;
    fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(err));
    return 1;
}

/*
 * Build an 8x6 int matrix on the host, copy it to the device, and launch
 * printthreadindex so each thread reports which matrix element it maps to.
 *
 * Returns 0 on success, nonzero if a host allocation or any CUDA call fails.
 */
int main()
{
    int nx=8,ny=6;
    int nxy=nx*ny;
    /* The matrix holds ints, so size the buffers with sizeof(int). */
    size_t nBytes=nxy*sizeof(int);

    int *h_A;
    h_A=(int *)malloc(nBytes);
    if(h_A==NULL){
        fprintf(stderr,"host allocation of %lu bytes failed\n",(unsigned long)nBytes);
        return 1;
    }

    initialint(h_A,nxy);
    printmatrix(h_A,nx,ny);

    int status=0;
    int *d_MatA=NULL;
    status=cudaFailed(cudaMalloc((void **)&d_MatA,nBytes),"cudaMalloc");

    if(!status)
        status=cudaFailed(cudaMemcpy(d_MatA,h_A,nBytes,cudaMemcpyHostToDevice),"cudaMemcpy");

    if(!status){
        dim3 block(4,2);
        /* Ceil-divide so the grid covers the whole matrix for any nx, ny
           (yields 2x3 blocks for the 8x6 matrix with 4x2 blocks). */
        dim3 grid((nx+block.x-1)/block.x,(ny+block.y-1)/block.y);
        printthreadindex <<<grid,block>>> (d_MatA,nx,ny);
        /* A launch returns no status itself; configuration errors surface here. */
        status=cudaFailed(cudaGetLastError(),"printthreadindex launch");
    }

    /* The launch is asynchronous and device printf output is only flushed at a
       synchronization point, so wait for the kernel before tearing down. */
    if(!status)
        status=cudaFailed(cudaDeviceSynchronize(),"cudaDeviceSynchronize");

    cudaFree(d_MatA);   /* no-op when d_MatA is still NULL */
    free(h_A);

    system("pause");    /* Windows-only: keeps the console window open */
    return status;
}

Output:

Matrix: (8.6)
0 1 2 3 4 5 6 7
8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39
40 41 42 43 44 45 46 47

thread_id (0,0) block_id (1,0) coordinate (4,0) global index 4 ival 4
thread_id (1,0) block_id (1,0) coordinate (5,0) global index 5 ival 5
thread_id (2,0) block_id (1,0) coordinate (6,0) global index 6 ival 6
thread_id (3,0) block_id (1,0) coordinate (7,0) global index 7 ival 7
thread_id (0,1) block_id (1,0) coordinate (4,1) global index 12 ival 12
thread_id (1,1) block_id (1,0) coordinate (5,1) global index 13 ival 13
thread_id (2,1) block_id (1,0) coordinate (6,1) global index 14 ival 14
thread_id (3,1) block_id (1,0) coordinate (7,1) global index 15 ival 15
thread_id (0,0) block_id (1,1) coordinate (4,2) global index 20 ival 20
thread_id (1,0) block_id (1,1) coordinate (5,2) global index 21 ival 21
thread_id (2,0) block_id (1,1) coordinate (6,2) global index 22 ival 22
thread_id (3,0) block_id (1,1) coordinate (7,2) global index 23 ival 23
thread_id (0,1) block_id (1,1) coordinate (4,3) global index 28 ival 28
thread_id (1,1) block_id (1,1) coordinate (5,3) global index 29 ival 29
thread_id (2,1) block_id (1,1) coordinate (6,3) global index 30 ival 30
thread_id (3,1) block_id (1,1) coordinate (7,3) global index 31 ival 31
thread_id (0,0) block_id (0,2) coordinate (0,4) global index 32 ival 32
thread_id (1,0) block_id (0,2) coordinate (1,4) global index 33 ival 33
thread_id (2,0) block_id (0,2) coordinate (2,4) global index 34 ival 34
thread_id (3,0) block_id (0,2) coordinate (3,4) global index 35 ival 35
thread_id (0,1) block_id (0,2) coordinate (0,5) global index 40 ival 40
thread_id (1,1) block_id (0,2) coordinate (1,5) global index 41 ival 41
thread_id (2,1) block_id (0,2) coordinate (2,5) global index 42 ival 42
thread_id (3,1) block_id (0,2) coordinate (3,5) global index 43 ival 43
thread_id (0,0) block_id (1,2) coordinate (4,4) global index 36 ival 36
thread_id (1,0) block_id (1,2) coordinate (5,4) global index 37 ival 37
thread_id (2,0) block_id (1,2) coordinate (6,4) global index 38 ival 38
thread_id (3,0) block_id (1,2) coordinate (7,4) global index 39 ival 39
thread_id (0,1) block_id (1,2) coordinate (4,5) global index 44 ival 44

Hi, the above code is an example from a CUDA book which tries to explain how a 2D array is mapped to CUDA grids and blocks and prints the matrix coordinates and offset in global memory for each thread.

I am a bit confused as to how exactly the threads get mapped, especially the statement “idx=ix+iy*nx”. I tried to interchange the values of nx and ny and then change this statement to “idx=iy+ix*ny”, but that did not seem to work.

Also, the matrix elements are mapped to the threads as

Block(0,0) -0,1,2,3,8,9,10,11 Block(1,0)-4,5,6,7,12,13,14,15 …

If I want a mapping like

Block(0,0) -0,1,2,3,4,5,6,7 Block(0,1)-8,9,10,11,12,13,14,15 …

how do I modify the parameters and launch the kernel?

P.S- I am using an i7 processor with GTX 860M with VS 2012 on Windows 8.1.

Thanks.

This seems to be answered here:

http://stackoverflow.com/questions/29858234/cuda-2d-array-mapping