invalid configuration argument error

Dears
why does this code gives me invalid configuration argument??

#include "cuda.h"
#include "cuda_runtime.h"

#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>

#define N 128
#define shared_size 64 
#define BLOCKS (N/shared_size)
#define THREADS_PER_BLOCK shared_size
// Define this to turn on error checking
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )

//****************************************************************************************
// functions for cuda error checking
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}
#endif

	return;
}
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	cudaError err = cudaGetLastError();
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}

	// More careful checking. However, this will affect performance.
	// Comment away if needed.
	err = cudaDeviceSynchronize();
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}
#endif

	return;
}
//****************************************************************************************

__global__ void calculate_ratios(float *a)
{
	
	int tx = threadIdx.x;
	int ty = threadIdx.y;



	__shared__ float temp[shared_size][shared_size];


	//  copy from global memory to shared memory
	temp[ty][tx] = a[ty*(shared_size)+tx];
	__syncthreads();
	temp[ty][tx] = temp[ty][tx] * 2;
	__syncthreads();
	a[ty*(N)+tx] = temp[ty][tx];
	}


int main()
{

	float *a_h;
	a_h = (float *)malloc(N*N*sizeof(float)); //allocate memory on host

	for (int i = 0; i < N*N; i++){

			int num = rand() % 10;
			 a_h[i]=1;

		
	
	}



	
	float *a_d;
	float *b_d;
	cudaMalloc((void **)&a_d, N*N*sizeof(float));
	cudaMalloc((void **)&b_d, N*N*sizeof(float));

	 cudaMemcpy(a_d, a_h, N*N*sizeof(float), cudaMemcpyHostToDevice);
	 dim3 dimBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK, 1);
	 dim3 dimGrid(BLOCKS, BLOCKS, 1);
	 calculate_ratios << <dimGrid, dimBlock >> >(a_d);
	cudaDeviceSynchronize();
	cudaMemcpy(a_h, a_d, N*N*sizeof(float), cudaMemcpyDeviceToHost);

	for (int i = 0; i < N; i++){

		for (int j = 0; j < N; j++){
			printf("%.1f ", a_h[i *N + j]);

		}
		printf("\n");
	}
	// cuda error checking
	cudaError_t error = cudaGetLastError();
	if (error != cudaSuccess)
	{
		printf("CUDA Error: %s\n", cudaGetErrorString(error));

		// we can't recover from the error -- exit the program
		return 1;
	}
	
	cudaFree(a_d);
	cudaFree(b_d);
}

Because of these lines:

#define shared_size 64 
#define THREADS_PER_BLOCK shared_size
dim3 dimBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK, 1);
calculate_ratios << <dimGrid, dimBlock >> >(a_d);

You are requesting a block that has 64x64 threads, i.e. 4096 threads total. But CUDA threadblocks are limited to 1024 threads maximum.

oooooooh , thanks