Help to clarify usage of number of grids and number of blocks in a kernel

Hi all,
I am reading some introductory material to get a basic idea of CUDA programming: http://www.nvidia.com/content/GTC-2010/pdfs/2131_GTC2010.pdf and http://on-demand.gputechconf.com/gtc-express/2011/presentations/GTC_Express_Sarah_Tariq_June2011.pdf

Based on my understanding, the basic unit of GPU work is the thread: threads are grouped into blocks, and blocks are grouped into a grid. Each kernel launch technically runs on one grid.
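For example, this is how I currently picture the indexing (just a toy sketch of my understanding; the kernel and names are made up):

__global__ void fill_ids(int *out, int total)
{
    // Global thread index: which block this thread belongs to, times the
    // block size, plus the thread's position inside its block.
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < total)  // the last block may extend past the end of out
        out[gid] = gid;
}

// launched as, e.g.: fill_ids<<<num_blocks, threads_per_block>>>(d_out, total);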

I have a Tesla C2075 device, and I ran the following code to query its specifications:

#include <cuda.h>
#include <cstdio>   // puts, sprintf
#include <cstring>  // strcat
#include <iostream>

int InitGPUSet()
{
  char GPU[5100] = "GPU: ";
  char str[1000];
  cudaDeviceProp tCard;
  int num = 0;
  if (cudaSuccess == cudaGetDeviceCount(&num))
  {
    for (int i = 0; i < num; ++i)
    {
      cudaSetDevice(i);
      cudaGetDeviceProperties(&tCard, i);
      puts(strcat(GPU, tCard.name));  // prints "GPU: <device name>"

      // Print each property on its own line instead of appending to GPU,
      // which would reprint the whole accumulated buffer every time
      sprintf(str, "Maximum threads per block: %d", tCard.maxThreadsPerBlock);
      puts(str);
      sprintf(str, "Maximum dimension (1,2,3) of block: %d %d %d", tCard.maxThreadsDim[0], tCard.maxThreadsDim[1], tCard.maxThreadsDim[2]);
      puts(str);
      sprintf(str, "Maximum dimension (1,2,3) of grid: %d %d %d", tCard.maxGridSize[0], tCard.maxGridSize[1], tCard.maxGridSize[2]);
      puts(str);
    }
  }
  else return 0;
  return 1;
}

int main(void)
{
  if (!InitGPUSet())
  {
    puts("device is not ready!");
    std::cout << "error" << std::endl;
  }
  return 0;
}

and it prints the following:

GPU: Tesla C2075
Maximum threads per block: 1024
Maximum dimension (1,2,3) of block: 1024 1024 64
Maximum dimension (1,2,3) of grid: 65535 65535 65535
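If I read this correctly, the 1024 limit is on the total threads per block (so e.g. a 32x32 block is the largest square block), and the 24x24 = 576 threads per block I use below stay under it.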

One introductory document shows that the kernel launch is of the form

kernel<<<num_blocks, num_threads_per_block>>>(…)

but another introductory document shows the launch in the form

kernel<<<grids, blocks>>>(…)

So I wonder which one is correct. In my own code I take the first launch parameter to be the number of blocks (i.e. the grid dimensions) and the second to be the number of threads per block, so the two forms look to me like the same thing with different argument names.
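For example, I read these two launches as equivalent (a complete toy example of my understanding; the kernel and sizes are made up):

__global__ void scale(float *v, float s)  // toy kernel just for illustration
{
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    v[gid] *= s;  // no bounds check: assumes the launch exactly covers v
}

int main()
{
    float *d_v;
    cudaMalloc(&d_v, 8 * 128 * sizeof(float));  // one element per thread

    // A grid of 8 blocks, each with 128 threads, written both ways:
    scale<<<8, 128>>>(d_v, 2.0f);              // num_blocks, num_threads_per_block
    scale<<<dim3(8), dim3(128)>>>(d_v, 2.0f);  // grids, blocks

    cudaDeviceSynchronize();
    cudaFree(d_v);
    return 0;
}

In my algorithm, I have the kernel designed in the following way: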

#include <cuda.h>
#include <iostream>
#include <stdio.h>

using namespace std;

#define TPERGROUP 24 // TPERGROUP x TPERGROUP threads per block
#define N 512        // N x N blocks per grid

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
    /* Check synchronous errors, i.e. pre-launch */ \
    cudaError_t err = cudaGetLastError(); \
    if (cudaSuccess != err) { \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
    err = cudaDeviceSynchronize(); /* cudaThreadSynchronize() is deprecated */ \
    if (cudaSuccess != err) { \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

__global__ void work(const int n, const int m, int *data)
{
  double x, y, a, b;
  x = (double)blockIdx.x * 0.2;
  y = -3.14 + (double)blockIdx.y * 0.2;

  a = (double)(n * TPERGROUP + threadIdx.y) * 0.1;
  b = (double)(m * TPERGROUP + threadIdx.x) * 0.1;

  // loop counter renamed from n, which shadowed the kernel parameter n
  for (int iter = 0; iter < 10000; iter++)
  {
    x = x - a * y * y;
    y = y + b * x * x;
  }
  //if (x > y)
  //  atomicAdd(&data[(n*TPERGROUP + threadIdx.y)*N + (m*TPERGROUP + threadIdx.x)], 2);
}

int main(void)
{
  int *A, *B;
  A = (int*)malloc(N*N*sizeof(int));
  cudaMalloc(&B, N*N*sizeof(int));

  CHECK_LAUNCH_ERROR();
  for (int j=0; j<N*N; j++) A[j] = 0;
  cudaMemcpy(B, A, N*N*sizeof(int), cudaMemcpyHostToDevice);

  for (int n=0; n<N; n++)
  {
    for (int m=0; m<N; m++) 
    {
      work<<<dim3(N, N), dim3(TPERGROUP, TPERGROUP)>>>(n, m, B); // N x N blocks, TPERGROUP x TPERGROUP threads each
      CHECK_LAUNCH_ERROR();
    }
  }

  cudaMemcpy(A, B, N*N*sizeof(int), cudaMemcpyDeviceToHost);  
  cudaFree(B);
  free(A);

  return 0;
}

In this code, each kernel launch uses a grid of 512x512 blocks with 24x24 threads per block. But as shown in the device spec above, the maximum grid dimension is 65535x65535x65535, so how can I make use of such a large grid to fully parallelize my algorithm? Thanks.
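For instance, would something like the sketch below be the right direction? The idea is to fold the host loop over n into the third grid dimension, so one launch covers all n for a fixed m (work2 is a hypothetical variant of my kernel, not tested):

__global__ void work2(const int m, int *data)
{
  int n = blockIdx.z;  // what used to be the host loop index n
  double x = (double)blockIdx.x * 0.2;
  double y = -3.14 + (double)blockIdx.y * 0.2;
  double a = (double)(n * TPERGROUP + threadIdx.y) * 0.1;
  double b = (double)(m * TPERGROUP + threadIdx.x) * 0.1;
  for (int iter = 0; iter < 10000; iter++)
  {
    x = x - a * y * y;
    y = y + b * x * x;
  }
  // result write omitted, as in my original kernel
}

// One launch per m instead of one per (n, m) pair:
// for (int m = 0; m < N; m++)
//   work2<<<dim3(N, N, N), dim3(TPERGROUP, TPERGROUP)>>>(m, B);

Is this what the third grid dimension is for, or is there a better way?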