Problem with cudaMemcpy: target is empty after cudaMemcpy

Hi,

I’m new to CUDA and have no experience with it. I am trying to parallelize the calculation of the Euclidean distance (sum((x_i - y_i)^2)) of two float vectors (called fFeature1 and fFeature2).

I split it into three tasks: subtract, square and sum_reduction:

/*
 * Element-wise squared difference:
 *     fTarget[i] = (fFeature1[i] - fFeature2[i])^2
 *
 * Launch layout: 1D grid of 1D blocks. Each thread processes the single
 * element at its global index blockIdx.x * blockDim.x + threadIdx.x.
 *
 * Precondition: the vector length is not a parameter, so there is no bounds
 * check. The total number of launched threads (gridDim.x * blockDim.x) must
 * equal the number of elements in each of the three arrays.
 *
 * All three pointers are dereferenced on the device.
 */
__global__ void SubstractAndSquare(float* fFeature1, float* fFeature2, float* fTarget) {
	// Global index. For a single-block launch this equals threadIdx.x, but it
	// also addresses the correct element when more than one block is launched.
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	// Keep the difference in a local so fTarget is stored exactly once instead
	// of being written, read back and written again.
	float fDiff = fFeature1[i] - fFeature2[i];

	fTarget[i] = fDiff * fDiff;
}

/*
 * Copies two host vectors to the GPU and computes their element-wise squared
 * differences with SubstractAndSquare.
 *
 * Parameters:
 *   fFeature1, fFeature2  host arrays, each holding iFeatureLen1 floats
 *   iFeatureLen1          number of elements; the kernel is launched as one
 *                         block of iFeatureLen1 threads, so a length above the
 *                         device's threads-per-block limit makes the launch
 *                         fail (the failure is reported through CudaSafeCall)
 *   fDist                 intended output; NOT written yet because the sum
 *                         reduction step is still a TODO
 *
 * Every CUDA call is routed through CudaSafeCall. All device buffers are
 * released before returning.
 */
void Euclid_dist_gpu(float* fFeature1, float* fFeature2, int iFeatureLen1, float* fDist) {

	// An empty or negative length would yield a zero-thread launch or an
	// enormous allocation size; there is nothing to compute in either case.
	if (iFeatureLen1 <= 0) {
		return;
	}

	const size_t nBytes = sizeof(float) * iFeatureLen1;

	// Device buffers. They live in GPU memory and cannot be dereferenced on
	// the host; to inspect them, copy back with cudaMemcpyDeviceToHost.
	float* fTarget = NULL;
	float* devFeature1 = NULL;
	float* devFeature2 = NULL;

	dim3 dimBlock(iFeatureLen1);

	cudaError rcTarget   = CudaSafeCall( cudaMalloc((void**) &fTarget, nBytes));
	cudaError rcFeature1 = CudaSafeCall( cudaMalloc((void**) &devFeature1, nBytes));
	cudaError rcFeature2 = CudaSafeCall( cudaMalloc((void**) &devFeature2, nBytes));

	// Only touch the buffers when all three allocations succeeded.
	if (cudaSuccess == rcTarget && cudaSuccess == rcFeature1 && cudaSuccess == rcFeature2) {

		CudaSafeCall( cudaMemcpy(devFeature1, fFeature1, nBytes, cudaMemcpyHostToDevice));
		CudaSafeCall( cudaMemcpy(devFeature2, fFeature2, nBytes, cudaMemcpyHostToDevice));

		SubstractAndSquare<<<1, dimBlock>>>(devFeature1, devFeature2, fTarget);

		// A kernel launch returns no status itself; launch-configuration
		// errors only become visible through cudaGetLastError().
		CudaSafeCall( cudaGetLastError());

		//sum_reduction(fTarget) with sdk example
	}

	// cudaFree(NULL) is a no-op, so this is safe after a partial allocation failure.
	CudaSafeCall( cudaFree(devFeature2));
	CudaSafeCall( cudaFree(devFeature1));
	CudaSafeCall( cudaFree(fTarget));
}

Unfortunately, I can’t test my parallelization of SubstractAndSquare because the vectors devFeature1 and devFeature2 still contain only zeros after cudaMemcpy.

What am I doing wrong??

CudaSafeCall is a function-based implementation of the CUDA_SAFE_CALL macro, and it reports no errors:

/*
 * Records a CUDA error code in the session log without synchronizing.
 *
 * If iCudaRC is not cudaSuccess, a formatted message is appended to
 * pLog->cErrorMessage of the Singleton session object.
 *
 * Note: because this is a function and not a macro, __FILE__ and __LINE__
 * expand to the location of this helper, not to the call site that failed.
 *
 * Returns iCudaRC unchanged so the call can be used inline.
 */
cudaError CudaSafeCallNoSync(cudaError iCudaRC) {

	Singleton* pExtProcSession = Singleton::getInstance();

	if( cudaSuccess != iCudaRC) {

		char cMessage[512];

		// Bounded formatting: __FILE__ may be a long path, and an unbounded
		// sprintf could run past the end of cMessage. snprintf truncates.
		snprintf(cMessage, sizeof(cMessage), "Cuda error in file '%s' in line %i : %s.\n", __FILE__, __LINE__, cudaGetErrorString( iCudaRC) );

		// NOTE(review): strcat appends without a length limit; the capacity of
		// cErrorMessage is not visible here - confirm it cannot overflow.
		strcat(pExtProcSession->pLog->cErrorMessage, cMessage);

	}

	return iCudaRC;

}

/*
 * Records a CUDA error code in the session log, then synchronizes with the
 * device and records any error reported by the synchronization as well.
 *
 * iCudaRC is logged through CudaSafeCallNoSync. The synchronization error, if
 * any, is appended to pLog->cErrorMessage of the Singleton session object.
 *
 * Note: __FILE__ and __LINE__ expand to the location of this helper, not to
 * the call site that failed.
 *
 * Returns iCudaRC when it is an error; otherwise returns the result of the
 * synchronization, so a failure detected only at the sync point is not hidden
 * from the caller.
 */
cudaError CudaSafeCall(cudaError iCudaRC) {

	Singleton* pExtProcSession = Singleton::getInstance();

	CudaSafeCallNoSync(iCudaRC);

	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
	// documented replacement.
	cudaError err = cudaDeviceSynchronize();

	if( cudaSuccess != err) {

		char cMessage[512];

		// Bounded formatting: __FILE__ may be a long path, and an unbounded
		// sprintf could run past the end of cMessage. snprintf truncates.
		snprintf(cMessage, sizeof(cMessage), "Cuda error in file '%s' in line %i : %s.\n", __FILE__, __LINE__, cudaGetErrorString( err) );

		// NOTE(review): strcat appends without a length limit; the capacity of
		// cErrorMessage is not visible here - confirm it cannot overflow.
		strcat(pExtProcSession->pLog->cErrorMessage, cMessage);

	}

	// Prefer the original error; fall back to the synchronization result.
	return (cudaSuccess != iCudaRC) ? iCudaRC : err;

}

Thanks in advance.