Did you have a chance to look at the little example program I sent you in a PM? It shows how to allocate 2D arrays and how to address them with the help of macros. Here it is again:
#include <stdio.h>
#include <stdlib.h>
/* Matrix dimensions: the input is M rows by N columns, the output N by M. */
#define M 9
#define N 7
/*
 * 2D accessors for the flat, row-major buffers named `in` (row length N)
 * and `out` (row length M).  The arguments are parenthesised so that
 * expressions such as in(i + 1, j) index the intended element instead of
 * being re-associated by operator precedence after expansion.
 */
#define in(row,col) in[(row) * N + (col)]
#define out(row,col) out[(row) * M + (col)]
/*
 * Transposes the M x N row-major matrix `in` into the N x M row-major
 * matrix `out`, staging each element through shared memory.
 *
 * Thread mapping: threadIdx.x selects the source row and threadIdx.y the
 * source column, so the block must provide at least (M, N) threads.
 * Threads outside that range do no memory accesses but still reach both
 * barriers.
 *
 * Shared memory: two statically sized int tiles, M*N elements each.
 *
 * in  - device pointer to M*N ints (read only)
 * out - device pointer to N*M ints (written)
 */
__global__ void kernel (const int *in, int *out)
{
    __shared__ int inbuf[M][N];
    __shared__ int outbuf[N][M];

    const unsigned int row = threadIdx.x;
    const unsigned int col = threadIdx.y;
    /* Guard against blocks larger than the matrix: without it the extra
       threads would index past the shared tiles and the global buffers. */
    const bool active = (row < M) && (col < N);

    if (active) {
        inbuf[row][col] = in(row, col);
    }
    /* Barriers stay outside the conditionals so every thread reaches them. */
    __syncthreads();
    if (active) {
        outbuf[col][row] = inbuf[row][col];
    }
    __syncthreads();
    if (active) {
        out(col, row) = outbuf[col][row];
    }
}
/* Print a diagnostic and terminate if a CUDA runtime call did not succeed. */
static void check_cuda (cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf (stderr, "%s failed: %s\n", what, cudaGetErrorString (status));
        exit (EXIT_FAILURE);
    }
}

/*
 * Host driver: fills an M x N matrix with (row+1)*10 + (col+1), prints it,
 * transposes it on the device with a single block of (M, N) threads, and
 * prints the N x M result.
 *
 * Returns EXIT_SUCCESS on success; exits with EXIT_FAILURE if any CUDA
 * runtime call or the kernel launch reports an error.
 */
int main (void)
{
    dim3 blcks(1);
    dim3 thrds(M,N);
    int in_arr[M][N], out_arr[N][M];
    int *in_arr_d = 0, *out_arr_d = 0;
    const size_t bytes = M * N * sizeof (int);
    int i, j;

    /* Build and display the input matrix. */
    for (i = 0; i < M; i++) {
        for (j = 0; j < N; j++) {
            in_arr[i][j] = (i+1)*10 + (j+1);
            printf ("%2d ", in_arr[i][j]);
        }
        printf ("\n");
    }
    printf("\n");

    check_cuda (cudaMalloc ((void**)&in_arr_d, bytes), "cudaMalloc (in)");
    check_cuda (cudaMalloc ((void**)&out_arr_d, bytes), "cudaMalloc (out)");
    check_cuda (cudaMemcpy (in_arr_d, &in_arr[0], bytes,
                            cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");
    check_cuda (cudaMemset (out_arr_d, 0, bytes), "cudaMemset (out)");

    kernel<<<blcks,thrds>>>(in_arr_d, out_arr_d);
    /* A launch does not return a status itself; query it explicitly. */
    check_cuda (cudaGetLastError (), "kernel launch");

    /* The blocking copy back also reports faults raised while the kernel ran. */
    check_cuda (cudaMemcpy (&out_arr[0], out_arr_d, bytes,
                            cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");

    /* Display the transposed matrix. */
    for (i = 0; i < N; i++) {
        for (j = 0; j < M; j++) {
            printf ("%2d ", out_arr[i][j]);
        }
        printf ("\n");
    }

    /* Release the device allocations (not the host arrays, which live on
       the stack and must never be handed to cudaFree). */
    check_cuda (cudaFree (in_arr_d), "cudaFree (in)");
    check_cuda (cudaFree (out_arr_d), "cudaFree (out)");
    return EXIT_SUCCESS;
}
The output should show an original matrix and its transpose, like so:
11 12 13 14 15 16 17
21 22 23 24 25 26 27
31 32 33 34 35 36 37
41 42 43 44 45 46 47
51 52 53 54 55 56 57
61 62 63 64 65 66 67
71 72 73 74 75 76 77
81 82 83 84 85 86 87
91 92 93 94 95 96 97
11 21 31 41 51 61 71 81 91
12 22 32 42 52 62 72 82 92
13 23 33 43 53 63 73 83 93
14 24 34 44 54 64 74 84 94
15 25 35 45 55 65 75 85 95
16 26 36 46 56 66 76 86 96
17 27 37 47 57 67 77 87 97