The following is a brief version of my program.
When built as EmuRelease or EmuDebug, it runs normally,
but when built as Debug or Release, it cannot call the kernel function;
cudaMalloc and cudaMemcpy run correctly.
The graphics card is an NVIDIA 9800 GT; I am using VS2008 and CUDA 2.1.
Thanks for your help!
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include "FileIO.h"
#include "SplitWeight.h"
// includes, project
#include <cutil.h>
#include <cuda_runtime.h>
// includes, kernels
#include <NN_kernel.cu>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
// Forward declaration with C linkage; computeGold is not defined or called in
// the visible part of this file. The linkage specifier must use plain ASCII
// double quotes, typographic quotes do not compile.
extern "C"
void computeGold(float*, const float*, const float*, double*, const double*, const double*, unsigned int, unsigned int, unsigned int);
// Image dimension constant; not referenced by the code visible in this file.
// NOTE(review): the image buffer below is sized 29*29, not 28*28 - confirm
// whether the input is meant to be padded.
#define g_cImageSize 28 // dimension
// Number of input patterns allocated in main (the image count of
// t10k-images.idx3-ubyte).
#define g_cImageCount 10000 // the num of t10k-images.idx3-ubyte
// One test sample: an image together with its label.
typedef struct inputPattern
{
// 29*29 image values stored as a flat array; main initialises every entry to -1.0.
double image[29*29];
// Label of the image; main initialises it to -1.
int label;
}inputPattern;
// Runs the first layer on the GPU for one input image (a flat array of 29*29
// doubles) and prints the resulting layer-2 values. argc/argv are passed
// through from main.
int NeuralNetwork(double *Layer1_Neurons_CPU, int argc, char **argv);
// Feeds the first of ImageCount input patterns to NeuralNetwork.
// pInputtoLayer1 points to an array of patterns, so it must be a pointer.
void StartTesting(struct inputPattern *pInputtoLayer1, int ImageCount, int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
/*
 * Program entry point.
 *
 * Initialises the CUDA device, allocates g_cImageCount input patterns on the
 * host, resets every image value to -1.0 and every label to -1, and hands the
 * array to StartTesting.
 *
 * Returns 0 on success and EXIT_FAILURE when the host allocation fails.
 */
int main(int argc, char** argv)
{
    CUT_DEVICE_INIT(argc, argv);

    struct inputPattern *pInputtoLayer1 =
        (struct inputPattern*)malloc(sizeof(struct inputPattern) * g_cImageCount); // 10000 units
    if (pInputtoLayer1 == NULL)
    {
        printf("there are not enough memory\n");
        return EXIT_FAILURE;
    }

    for (int ii = 0; ii < g_cImageCount; ii++)
    {
        for (int jj = 0; jj < 29*29; jj++)
        {
            pInputtoLayer1[ii].image[jj] = -1.0;
        }
        // One label per pattern: set it once, not once per pixel.
        pInputtoLayer1[ii].label = -1;
    }

    StartTesting(pInputtoLayer1, g_cImageCount, argc, argv);

    // The buffer came from malloc, so it must be released with free; pairing
    // malloc with delete is undefined behaviour.
    free(pInputtoLayer1);
    return 0;
}
/*
 * Prints the first n values of layer to stdout, each formatted with three
 * decimals and followed by a space. No trailing newline is written.
 *
 * layer: host array holding at least n doubles.
 * n:     number of values to print; nothing is printed when n <= 0.
 */
void output(double *layer, int n)
{
    for (int i = 0; i < n; i++)
    {
        // Print from the parameter; the previous code read an undeclared
        // identifier named "final".
        printf("%0.3f ", layer[i]);
    }
}
/*
 * Runs the first network layer on the GPU for one input image and prints the
 * resulting layer-2 values.
 *
 * Layer1_Neurons_CPU: host array of 29*29 doubles (one input image).
 * argc, argv:         passed through from main; not used here.
 *
 * Steps: load 156 layer-1 weights from "lw1.han", copy the image and the
 * weights to the device, launch executeFirstLayer with 6 blocks of 13x13
 * threads, copy the 6*13*13 results back and print them with output().
 *
 * Returns 0 on success, -1 when the weight file cannot be read or when the
 * kernel launch / result copy reports a CUDA error.
 *
 * NOTE(review): all device buffers are double. GPUs below compute capability
 * 1.3 have no double-precision support and nvcc demotes double to float in
 * device code for such targets, so host and device element sizes would
 * disagree on real hardware while emulation builds still work. Confirm the
 * target GPU; if it lacks double support, switch these buffers and the kernel
 * to float together.
 */
int NeuralNetwork(double *Layer1_Neurons_CPU, int argc, char **argv)
{
    const int layer1NeuronCount = 29 * 29;
    const int layer1WeightCount = 156;
    const int layer2NeuronCount = 13 * 13 * 6; // 1014

    double Layer1_Weights_CPU[156];
    double Layer2_Neurons_CPU[1014];
    double *Layer1_Neurons_GPU = NULL;
    double *Layer1_Weights_GPU = NULL;
    double *Layer2_Neurons_GPU = NULL;

    // initial layer 1 weight
    FILE *pFile1 = fopen("lw1.han", "rb");
    if (pFile1 == NULL)
    {
        // Continuing would upload uninitialised stack memory as weights.
        fprintf(stderr, "NeuralNetwork: cannot open lw1.han\n");
        return -1;
    }
    size_t weightsRead = fread(Layer1_Weights_CPU, sizeof(double), layer1WeightCount, pFile1);
    fclose(pFile1);
    if (weightsRead != (size_t)layer1WeightCount)
    {
        fprintf(stderr, "NeuralNetwork: lw1.han is shorter than 156 weights\n");
        return -1;
    }

    // allocate memory on the device
    CUDA_SAFE_CALL(cudaMalloc((void**)&Layer1_Neurons_GPU, sizeof(double) * layer1NeuronCount));
    CUDA_SAFE_CALL(cudaMalloc((void**)&Layer1_Weights_GPU, sizeof(double) * layer1WeightCount));
    CUDA_SAFE_CALL(cudaMalloc((void**)&Layer2_Neurons_GPU, sizeof(double) * layer2NeuronCount));

    // copy from CPU to GPU
    CUDA_SAFE_CALL(cudaMemcpy(Layer1_Neurons_GPU, Layer1_Neurons_CPU,
                              sizeof(double) * layer1NeuronCount, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(Layer1_Weights_GPU, Layer1_Weights_CPU,
                              sizeof(double) * layer1WeightCount, cudaMemcpyHostToDevice));

    dim3 Layer1_Block(6, 1);
    dim3 Layer1_Thread(13, 13);
    executeFirstLayer<<<Layer1_Block, Layer1_Thread, 0>>>(Layer1_Neurons_GPU, Layer1_Weights_GPU, Layer2_Neurons_GPU);

    // A kernel launch returns no status itself, so query it explicitly. This
    // is checked by hand rather than through CUDA_SAFE_CALL because that macro
    // may not test the result in non-debug builds (TODO confirm for the cutil
    // version in use).
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        // copy from GPU to CPU; the blocking copy also reports errors raised
        // while the kernel was executing
        err = cudaMemcpy(Layer2_Neurons_CPU, Layer2_Neurons_GPU,
                         sizeof(double) * layer2NeuronCount, cudaMemcpyDeviceToHost);
    }

    int status = 0;
    if (err != cudaSuccess)
    {
        fprintf(stderr, "NeuralNetwork: executeFirstLayer failed: %s\n", cudaGetErrorString(err));
        status = -1;
    }
    else
    {
        output(Layer2_Neurons_CPU, layer2NeuronCount);
    }

    // Release the device buffers on every path so repeated calls do not leak.
    cudaFree(Layer1_Neurons_GPU);
    cudaFree(Layer1_Weights_GPU);
    cudaFree(Layer2_Neurons_GPU);

    return status;
}
/*
 * Runs the network on the first input pattern only.
 *
 * pInputtoLayer1: array of ImageCount input patterns (host memory).
 * ImageCount:     number of patterns in the array; nothing is done when it is
 *                 not positive or when the array pointer is NULL.
 * argc, argv:     forwarded unchanged to NeuralNetwork.
 */
void StartTesting(struct inputPattern *pInputtoLayer1, int ImageCount, int argc, char **argv)
{
    if (pInputtoLayer1 == NULL || ImageCount <= 0)
    {
        return;
    }
    NeuralNetwork(pInputtoLayer1[0].image, argc, argv);
}
/*
 * Kernel: writes the constant 0.66666667 into every element of the layer-2
 * buffer addressed by the launch.
 *
 * Expected launch layout: a 1-D grid where each block owns one contiguous
 * 13*13 plane of Layer2_Neurons_GPU, and a 13x13 thread block with one thread
 * per element of that plane. Layer2_Neurons_GPU must therefore hold at least
 * gridDim.x * 13 * 13 doubles. No shared memory is used.
 *
 * Layer1_Neurons_GPU and Layer1_Weights_GPU are accepted but not read.
 *
 * NOTE(review): double arithmetic in device code requires a GPU with
 * double-precision support (compute capability 1.3 or later) - confirm the
 * target device.
 */
__global__ void executeFirstLayer(double *Layer1_Neurons_GPU, double *Layer1_Weights_GPU, double *Layer2_Neurons_GPU)
{
    const int blockID = blockIdx.x;
    const int pixelX = threadIdx.x;
    const int pixelY = threadIdx.y;

    // Threads outside the 13x13 plane would write into a neighbouring plane
    // or past the buffer if the kernel were launched with a larger block.
    if (pixelX >= 13 || pixelY >= 13)
    {
        return;
    }

    Layer2_Neurons_GPU[13*13*blockID + pixelY*13 + pixelX] = 0.66666667;
}