shared memory problem

Dear all
When I try to compile this code for a GeForce GT 740M, it gives me the following error:
Error 3 error : Entry function '_Z10FinalPart1PiS_' uses too much shared data (0x10000 bytes, 0xc000 max)

How can I fix this problem?

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cmath>

// Problem-size configuration.
// PP scales the element count; P = PP*128; N = P*32*1024 ints in total.
// Every macro body is parenthesized so that it behaves as a single value
// inside larger expressions (e.g. `x % N`, `x / N`).
#define PP  1
#define  P (PP*128)
#define N  (P*32*1024)

// Define this to turn on error checking
#define CUDA_ERROR_CHECK
// Wrap a CUDA runtime call: reports the call site and exits if it failed.
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
// Place after a kernel launch: reports the call site and exits on a pending error.
#define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )

//****************************************************************************************
// functions for cuda error checking
// Checks the status code of a CUDA runtime call.
//   err  - status returned by the call being checked
//   file - source file of the call site (supplied by the CudaSafeCall macro)
//   line - source line of the call site (supplied by the CudaSafeCall macro)
// With CUDA_ERROR_CHECK defined, a status other than cudaSuccess is reported
// on stderr and the process exits with code -1. Without it, this is a no-op.
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	// Nothing to report when the call succeeded.
	if (err == cudaSuccess)
		return;

	fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
		file, line, cudaGetErrorString(err));
	exit(-1);
#endif
}
// Checks for a pending CUDA error, intended for use after a kernel launch.
//   file - source file of the call site (supplied by the CudaCheckError macro)
//   line - source line of the call site (supplied by the CudaCheckError macro)
// With CUDA_ERROR_CHECK defined, it performs two checks and exits with code -1
// (after printing to stderr) if either fails. Without it, this is a no-op.
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	// First check: whatever error the runtime has recorded so far.
	const cudaError lastStatus = cudaGetLastError();
	if (lastStatus != cudaSuccess)
	{
		fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(lastStatus));
		exit(-1);
	}

	// Second check: synchronize with the device so that errors raised while
	// the kernel runs are caught here too. This costs performance; remove
	// it if that matters.
	const cudaError syncStatus = cudaDeviceSynchronize();
	if (syncStatus != cudaSuccess)
	{
		fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(syncStatus));
		exit(-1);
	}
#endif
}



// Scales the first blockDim.x elements of `a` by the block index and stores
// the result into that block's own 32*1024-element segment of `a`.
//
//   a    - device array; must hold at least
//          (gridDim.x - 1) * 32 * 1024 + blockDim.x ints
//   temp - not used by this kernel (kept so existing launches still compile)
//
// Launch layout: 1-D grid, 1-D block. Uses 4 KB of static shared memory.
//
// NOTE(review): every block reads a[0 .. blockDim.x) while block 0 writes the
// same range (its segment offset is 0). There is no ordering between blocks,
// so blocks with blockIdx.x > 0 may see either the original values or the
// zeros written by block 0 - confirm whether a separate input buffer is intended.
__global__ void FinalPart1(int *a, int *temp)
{
	// One slot per thread. The array is indexed only by threadIdx.x, which
	// cannot exceed 1023 (the hardware limit is 1024 threads per block), so
	// 1024 ints (4 KB) are enough. The previous 16*1024 ints (64 KB) exceeded
	// the 48 KB per-block shared memory limit and failed to compile.
	__shared__ int part_ary[1024];
	int threadId = threadIdx.x;

	// Stage this thread's input element in shared memory.
	part_ary[threadId] = a[threadId];
	__syncthreads();
	part_ary[threadId] = part_ary[threadId] * blockIdx.x;
	__syncthreads();
	// Each block owns a segment of 32*1024 ints; only its first blockDim.x
	// entries are written.
	a[threadId + (blockIdx.x * (32 * 1024))] = part_ary[threadId];
}



//****************************************************************************************************************************************
// Host driver: fills an N-element array with 1s, copies it to the device,
// runs FinalPart1 with 32 blocks of 1024 threads while timing it with CUDA
// events, copies the array back and prints every element.
// Returns 0 on success and 1 if host allocation or the kernel launch fails;
// any other failing CUDA call terminates the process through CudaSafeCall.
int main()
{
	// Compute sizes in size_t so the byte count is not evaluated in int.
	const size_t count = N;
	const size_t bytes = count * sizeof(int);

	int *a = NULL;          // host copy of the array
	int *dev_a = NULL;      // device array the kernel reads and writes
	int *dev_temp1 = NULL;  // second device buffer passed to the kernel

	// Allocate on the host first so a failure here needs no device cleanup.
	a = (int *)malloc(bytes);
	if (a == NULL)
	{
		fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)bytes);
		return 1;
	}
	CudaSafeCall(cudaMalloc((void **)&dev_a, bytes));
	CudaSafeCall(cudaMalloc((void **)&dev_temp1, bytes));

	// Fill the host array with a constant test value.
	for (size_t i = 0; i < count; i++)
	{
		a[i] = 1;
	}

	// Transfer data from host memory to device memory (dev_a is the destination).
	CudaSafeCall(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));

	// Kernel timing, phase 1: create the events and record the start.
	cudaEvent_t start, stop;
	float elapsedMs = 0.0f;
	CudaSafeCall(cudaEventCreate(&start));
	CudaSafeCall(cudaEventCreate(&stop));
	CudaSafeCall(cudaEventRecord(start, 0));

	FinalPart1<<<32, 1024>>>(dev_a, dev_temp1);

	// A kernel launch returns no status itself; query it right away so a bad
	// launch is reported before any timing result is printed.
	cudaError_t error = cudaGetLastError();
	if (error != cudaSuccess)
	{
		printf("CUDA Error: %s\n", cudaGetErrorString(error));

		// We can't recover from the error - release everything and exit.
		cudaEventDestroy(start);
		cudaEventDestroy(stop);
		cudaFree(dev_a);
		cudaFree(dev_temp1);
		free(a);
		return 1;
	}

	// Kernel timing, phase 2: record the stop event and wait for it.
	CudaSafeCall(cudaEventRecord(stop, 0));
	CudaSafeCall(cudaEventSynchronize(stop));
	CudaSafeCall(cudaEventElapsedTime(&elapsedMs, start, stop));
	CudaSafeCall(cudaEventDestroy(start));
	CudaSafeCall(cudaEventDestroy(stop));

	printf("kernel time in ms:\t%.7f\n", elapsedMs);

	// Errors raised while the kernel was running surface at this synchronization.
	CudaSafeCall(cudaDeviceSynchronize());

	// Copy the result array back to the host.
	CudaSafeCall(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost));

	for (size_t i = 0; i < count; i++)
	{
		printf("\n\n ** \n \n");
		printf("%d ", a[i]);
	}

	printf("\n ++++++++++++++++++++++++++++++++++++++\n\n\n\n ");

	CudaSafeCall(cudaFree(dev_a));      // free memory on device
	CudaSafeCall(cudaFree(dev_temp1));
	free(a);                            // free memory on host

	return 0;
}

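Line 63 declares a 16K-element int array in shared memory. That is 16 × 1024 × 4 bytes = 64 KB (0x10000 bytes), and the per-block shared memory limit for the 740M is 48 KB (0xc000 bytes). The kernel only indexes the array with threadIdx.x, so 1024 elements are enough.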

Thanks a lot, that solved it.