Question about a kernel function call: my project cannot call the kernel function

The following is a brief version of my code.

When built as the EmuRelease or EmuDebug configuration, it runs normally,

but when built as Debug or Release, it cannot call the kernel function.

cudaMalloc and cudaMemcpy run correctly.

The graphics card is an NVIDIA 9800 GT; the toolchain is VS2008 with CUDA 2.1.

Thanks for your help!!!

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>

#include "FileIO.h"
#include "SplitWeight.h"

// includes, project
#include <cutil.h>
#include <cuda_runtime.h>

// includes, kernels
#include <NN_kernel.cu>

////////////////////////////////////////////////////////////////////////////////
// declaration, forward

// Reference (CPU) implementation, defined in another translation unit with C linkage.
// The linkage string must use plain ASCII double quotes; typographic quotes are not
// valid C++ tokens.
extern "C"
void computeGold(float*, const float*, const float*, double*, const double*, const double*,unsigned int, unsigned int, unsigned int);

#define g_cImageSize 28 // dimension
#define g_cImageCount 10000 // the num of t10k-images.idx3-ubyte

// struct of text image and its label
// One test sample: the pixel buffer handed to the network plus its class label.
typedef struct inputPattern
{
    double image[29*29];   // 29 x 29 = 841 pixel values, stored as a flat array
    int    label;          // label of the image; main() initialises it to -1
} inputPattern;

// Forward declarations. The pointer levels match how the functions are called:
// main() passes a pointer to the pattern array and its own `char** argv`, and
// StartTesting() passes the `image` array of one pattern (a double*).
int NeuralNetwork(double *Layer1_Neurons_CPU, int argc, char** argv);
void StartTesting(struct inputPattern *pInputtoLayer1, int ImageCount, int argc, char** argv);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////

/*
 * Program entry point.
 *
 * Initialises the CUDA device, allocates g_cImageCount input patterns on the
 * host, fills every pixel with -1.0 and every label with -1, runs the test
 * driver and releases the host buffer.
 *
 * Returns 0 in all cases (allocation failure is only reported on stdout).
 */
int main(int argc, char** argv)
{
    CUT_DEVICE_INIT(argc, argv);

    struct inputPattern *pInputtoLayer1 =
        (struct inputPattern*)malloc(sizeof(struct inputPattern) * g_cImageCount);   // 10000 units

    if (pInputtoLayer1 == NULL)
    {
        printf("there are not enough memory\n");
        return 0;
    }

    for (int ii = 0; ii < g_cImageCount; ii++)
    {
        for (int jj = 0; jj < 29*29; jj++)
        {
            pInputtoLayer1[ii].image[jj] = -1.0;
        }
        // One label per pattern: set it once, not once per pixel.
        pInputtoLayer1[ii].label = -1;
    }

    StartTesting(pInputtoLayer1, g_cImageCount, argc, argv);

    // The buffer came from malloc(), so it must be released with free();
    // pairing malloc() with `delete` is undefined behaviour.
    free(pInputtoLayer1);

    return 0;
}

/*
 * Print the first n values of `layer` to stdout, each formatted with "%0.3f "
 * (three decimals followed by a space). No trailing newline is written.
 *
 * layer : host array holding at least n doubles; nothing is printed if NULL.
 * n     : number of values to print; nothing is printed if n <= 0.
 */
void output(double *layer, int n)
{
    if (layer == NULL)
    {
        return;
    }

    for (int i = 0; i < n; i++)
    {
        // Read from the `layer` parameter; the original referenced an
        // undeclared identifier here.
        printf("%0.3f ", layer[i]);
    }
}

/*
 * Run the first network layer on the GPU for a single input image and print
 * the resulting layer-2 neuron values.
 *
 * Layer1_Neurons_CPU : host array of 29*29 doubles (the input image).
 * argc, argv         : forwarded from main(); not used here.
 *
 * Returns 0 on success and -1 if the kernel launch or the copy of the results
 * back to the host reports a CUDA error.
 */
int NeuralNetwork(double *Layer1_Neurons_CPU, int argc, char** argv)
{
    (void)argc;
    (void)argv;

    const size_t layer1NeuronBytes = sizeof(double) * 29 * 29;
    const size_t layer1WeightBytes = sizeof(double) * 156;
    const size_t layer2NeuronBytes = sizeof(double) * 13 * 13 * 6;   // 1014 doubles

    // Zero-initialised so the buffer uploaded to the device is well defined
    // even when the weight file is missing or short.
    double Layer1_Weights_CPU[156] = {0};
    double Layer2_Neurons_CPU[1014];

    double *Layer1_Neurons_GPU = NULL;
    double *Layer1_Weights_GPU = NULL;
    double *Layer2_Neurons_GPU = NULL;

    int status = 0;

    // initial layer 1 weight
    FILE *pFile1 = fopen("lw1.han", "rb");
    if (pFile1 != NULL)
    {
        size_t itemsRead = fread(Layer1_Weights_CPU, sizeof(double), 156, pFile1);
        fclose(pFile1);
        if (itemsRead != 156)
        {
            fprintf(stderr, "warning: lw1.han holds only %lu of 156 weights\n",
                    (unsigned long)itemsRead);
        }
    }
    else
    {
        fprintf(stderr, "warning: cannot open lw1.han, using zero weights\n");
    }

    // allocate memory on the device
    CUDA_SAFE_CALL(cudaMalloc((void**) &Layer1_Neurons_GPU, layer1NeuronBytes));
    CUDA_SAFE_CALL(cudaMalloc((void**) &Layer1_Weights_GPU, layer1WeightBytes));
    CUDA_SAFE_CALL(cudaMalloc((void**) &Layer2_Neurons_GPU, layer2NeuronBytes));

    // copy from CPU to GPU
    CUDA_SAFE_CALL(cudaMemcpy(Layer1_Neurons_GPU, Layer1_Neurons_CPU,
                              layer1NeuronBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(Layer1_Weights_GPU, Layer1_Weights_CPU,
                              layer1WeightBytes, cudaMemcpyHostToDevice));

    // One block per 13x13 output plane, one thread per output element.
    dim3 Layer1_Block(6, 1);
    dim3 Layer1_Thread(13, 13);
    executeFirstLayer<<<Layer1_Block, Layer1_Thread, 0>>>(Layer1_Neurons_GPU,
                                                          Layer1_Weights_GPU,
                                                          Layer2_Neurons_GPU);

    // A kernel launch returns no status itself; a launch that could not start
    // is only visible through cudaGetLastError().
    cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        fprintf(stderr, "executeFirstLayer launch failed: %s\n",
                cudaGetErrorString(launchStatus));
        status = -1;
    }

    if (status == 0)
    {
        // copy from GPU to CPU; errors raised while the kernel was running are
        // reported by this call, so its result is checked explicitly.
        cudaError_t copyStatus = cudaMemcpy(Layer2_Neurons_CPU, Layer2_Neurons_GPU,
                                            layer2NeuronBytes, cudaMemcpyDeviceToHost);
        if (copyStatus != cudaSuccess)
        {
            fprintf(stderr, "executeFirstLayer failed: %s\n",
                    cudaGetErrorString(copyStatus));
            status = -1;
        }
        else
        {
            output(Layer2_Neurons_CPU, 1014);
        }
    }

    // Release the device buffers on every path.
    CUDA_SAFE_CALL(cudaFree(Layer2_Neurons_GPU));
    CUDA_SAFE_CALL(cudaFree(Layer1_Weights_GPU));
    CUDA_SAFE_CALL(cudaFree(Layer1_Neurons_GPU));

    return status;
}

/*
 * Test driver: runs the network on the first input pattern only.
 *
 * pInputtoLayer1 : array of ImageCount patterns; nothing is done if NULL.
 * ImageCount     : number of patterns in the array; nothing is done if <= 0.
 * argc, argv     : forwarded unchanged to NeuralNetwork().
 */
void StartTesting(struct inputPattern *pInputtoLayer1, int ImageCount, int argc, char** argv)
{
    if (pInputtoLayer1 == NULL || ImageCount <= 0)
    {
        return;
    }

    NeuralNetwork(pInputtoLayer1[0].image, argc, argv);
}

/*
 * First-layer kernel (currently a stub): every thread writes the constant
 * 0.66666667 into its own output slot; the two input buffers are not read.
 *
 * Launch layout: 1-D grid, one block per 13x13 output plane; 2-D block whose
 * (threadIdx.x, threadIdx.y) is the (column, row) inside that plane.
 * Layer2_Neurons_GPU must hold at least gridDim.x * 13 * 13 doubles.
 *
 * NOTE(review): the buffers are double precision; this is understood to need a
 * device of compute capability 1.3+ and nvcc -arch sm_13 -- confirm for the
 * target GPU.
 */
__global__ void executeFirstLayer(double *Layer1_Neurons_GPU, double *Layer1_Weights_GPU, double *Layer2_Neurons_GPU)
{
    int blockID = blockIdx.x;
    int pixelX = threadIdx.x;
    int pixelY = threadIdx.y;

    // Rows are 13 elements wide; ignore threads outside the 13x13 plane so a
    // larger block cannot write into a neighbouring plane or past the buffer.
    if (pixelX < 13 && pixelY < 13)
    {
        Layer2_Neurons_GPU[13*13*blockID + pixelY*13 + pixelX] = 0.66666667;
    }
}

Are you sure your GPU supports doubles? And if so, are you passing -arch sm_13 to nvcc when you build?