cudaMemcpy doesn't work

Hi guys,

I’m working on a cuda exercise in university, however I do have an issue when copying back the data from the GPU.
It seems that cudaMemcpy doesn’t work at all.
I created a function which must do results_D[idx]=idx (which is just a generic code to test our function) for i in … but when I do cudaMemcpy(results, results_D, …, …) it seems nothing is done. The cout returns random value like 6.23513e-33, and all the time same values for each execution, so it seems to be “memory” values. If i do results[0]=a before the cudaMemcpy I obtain the value a for results[0] and freeaking values for the rest…

If someone could help me !!

There is the code :

#include <curand_kernel.h>
#include
#include <sys/time.h>

#define NTHREADS 10
#define NBLOCKS 10
#define NGAUSS 1000
#define dimvect 10

using namespace std;

/*****************************************************************
SEEDS INITIALISATION
*****************************************************************/

global void iniStates (int seed, curandState * states) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x ;
curand_init( seed, idx, 0, &states[idx] ) ;
}

/*****************************************************************
GAUSSIAN VECTORS CONSTRUCTION
*****************************************************************/

global void gaussianVector(int nbgauss, int dim, curandState * global_states, float *results, float mean, float stdev){

const int idx = blockIdx.x * blockDim.x + threadIdx.x ;
const int nSimus = blockDim.x * gridDim.x ;
curandState local_state = global_states[idx] ;

int vectorIndex, vectorNumber;

__syncthreads();

for(int i = idx; i<nbgauss*dim/2 ; i+=nSimus) { //SKIP AHEAD "nSimus" EACH TIME TO GET "nbgauss" VECTORS (of size "dim") IN THE END
	vectorIndex = i %dim ;
	vectorNumber = i / dim ;

	float2 normal = curand_normal2( & local_state ) ;


	results[ vectorNumber*dim + vectorIndex ] = normal.x * stdev + mean;
	results[ (vectorNumber+nbgauss/2)*dim + vectorIndex ] = normal.y * stdev + mean;
}


__syncthreads();

}

/***********************************************************
NEIGHBOR FUNCTION
***********************************************************/

global void neighbor(int nbgauss, int dim, float *gauss, float results, float z){

const int blockPlace = blockIdx.x * blockDim.x ; //First square of the current bloc in the global vector (of size (NBLOCKS*NTHREADS))
const int idx = blockPlace + threadIdx.x ; 	//Square corresponding to the current thread of the current block in the global vector

const int nSimus = blockDim.x * gridDim.x ; 	//dimension of the grid

/************************/
//TEST FOR DATA TRANSFERT FROM DEVICE TO HOST

results[idx] = idx;
/*************************/

}

/***********************************************************
MAIN
***********************************************************/

int main() {
//VECTOR TO COMPARE WITH GAUSSIANS
//HOST
float ref = NULL;
ref = (float ) malloc(dimvectsizeof(float));
for(int i=0; i<dimvect; i++)
ref[i] = i
0.01;
//DEVICE
float * ref_D = NULL;
cudaMalloc((void**)& ref_D, dimvectsizeof(float));
//COPY
cudaMemcpy(ref_D, ref, dimvect
sizeof(float), cudaMemcpyHostToDevice) ;

//GAUSSIAN VECTOR (DEVICE)
	float* gauss_D = NULL;
	cudaMalloc((void**)& ref_D, NGAUSS*sizeof(float));

// DISTANCE + NB OF VECTOR WITHIN EACH BLOCK
	//DEVICE
		float * results_D = NULL;
    		cudaMalloc((void**)& results_D, 2*NBLOCKS*sizeof(float));
	//HOST
		float * results = NULL;
			results = (float *) malloc(2*NBLOCKS*sizeof(float));


//INITIALISATION OF SEEDS
	int initialseed = time(NULL);
	int nRNGs = NBLOCKS * NTHREADS ; //TO IMPROVE?????????
	curandState * States_D = NULL ;
	cudaMalloc((void**) & States_D, nRNGs * sizeof(curandState));	
	iniStates<<<NBLOCKS,NTHREADS>>>(initialseed, States_D) ;
	
	cudaThreadSynchronize();

//CREATION OF THE GAUSSIAN VECTOR
	float mean=0, stdev=1;
	gaussianVector<<<NBLOCKS, NTHREADS>>>(NGAUSS, dimvect, States_D, gauss_D, mean, stdev) ;

	cudaThreadSynchronize();

//LOOKING FOR THE BEST VECTOR IN EACH BLOCK
	neighbor<<<NBLOCKS, NTHREADS>>>(NGAUSS, dimvect, gauss_D, results_D, ref_D) ;	

	cudaThreadSynchronize();

//COPY RESULTS TO HOST
	cudaMemcpy(results, results_D, 2*NBLOCKS*sizeof(float), cudaMemcpyDeviceToHost) ;


//DISPLAY (ONE CLOSEST NEIGHBOR PER BLOCK)
	for(int i = 0; i < 2*NBLOCKS-1 ; i+=2) {
		cout<<"Block "<< i/2 +1 <<"\t Nb vector : "<< results[i] << "\t Distance : " << results[i+1] <<endl;
	}

cudaFree(ref_D);
free(ref);
cudaFree(States_D);
cudaFree(results_D) ;
free(results) ;
cudaFree(gauss_D) ;

return 0;

}

check the errors returned by all of the cuda* calls, as well as calling cudaGetLastError() after launching a kernel to make sure that the kernel was actually launched in the first place

Okay I will try this and tell you more. So for you it’s our function vectorGaussian which doesn’t work implying that the rest of the cuda code doesn’t do anything ? I will comment this part and relaunch it. I will also try to use cudaGetLastError() !!

If someone have any other idea, leave a message.

I’m sorry guys, It only was a mistake when allocating the gpu memory…

Thanks.