Hi guys,
I’m working on a cuda exercise in university, however I do have an issue when copying back the data from the GPU.
It seems that cudaMemcpy doesn’t work at all.
I created a function which must do results_D[idx]=idx (which is just a generic code to test our function) for i in … but when I do cudaMemcpy(results, results_D, …, …) it seems nothing is done. The cout returns random value like 6.23513e-33, and all the time same values for each execution, so it seems to be “memory” values. If i do results[0]=a before the cudaMemcpy I obtain the value a for results[0] and freeaking values for the rest…
If someone could help me !!
There is the code :
#include <curand_kernel.h>
#include
#include <sys/time.h>
#define NTHREADS 10
#define NBLOCKS 10
#define NGAUSS 1000
#define dimvect 10
using namespace std;
/*****************************************************************
SEEDS INITIALISATION
*****************************************************************/
global void iniStates (int seed, curandState * states) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x ;
curand_init( seed, idx, 0, &states[idx] ) ;
}
/*****************************************************************
GAUSSIAN VECTORS CONSTRUCTION
*****************************************************************/
global void gaussianVector(int nbgauss, int dim, curandState * global_states, float *results, float mean, float stdev){
const int idx = blockIdx.x * blockDim.x + threadIdx.x ;
const int nSimus = blockDim.x * gridDim.x ;
curandState local_state = global_states[idx] ;
int vectorIndex, vectorNumber;
__syncthreads();
for(int i = idx; i<nbgauss*dim/2 ; i+=nSimus) { //SKIP AHEAD "nSimus" EACH TIME TO GET "nbgauss" VECTORS (of size "dim") IN THE END
vectorIndex = i %dim ;
vectorNumber = i / dim ;
float2 normal = curand_normal2( & local_state ) ;
results[ vectorNumber*dim + vectorIndex ] = normal.x * stdev + mean;
results[ (vectorNumber+nbgauss/2)*dim + vectorIndex ] = normal.y * stdev + mean;
}
__syncthreads();
}
/***********************************************************
NEIGHBOR FUNCTION
***********************************************************/
global void neighbor(int nbgauss, int dim, float *gauss, float results, float z){
const int blockPlace = blockIdx.x * blockDim.x ; //First square of the current bloc in the global vector (of size (NBLOCKS*NTHREADS))
const int idx = blockPlace + threadIdx.x ; //Square corresponding to the current thread of the current block in the global vector
const int nSimus = blockDim.x * gridDim.x ; //dimension of the grid
/************************/
//TEST FOR DATA TRANSFERT FROM DEVICE TO HOST
results[idx] = idx;
/*************************/
}
/***********************************************************
MAIN
***********************************************************/
int main() {
//VECTOR TO COMPARE WITH GAUSSIANS
//HOST
float ref = NULL;
ref = (float ) malloc(dimvectsizeof(float));
for(int i=0; i<dimvect; i++)
ref[i] = i0.01;
//DEVICE
float * ref_D = NULL;
cudaMalloc((void**)& ref_D, dimvectsizeof(float));
//COPY
cudaMemcpy(ref_D, ref, dimvectsizeof(float), cudaMemcpyHostToDevice) ;
//GAUSSIAN VECTOR (DEVICE)
float* gauss_D = NULL;
cudaMalloc((void**)& ref_D, NGAUSS*sizeof(float));
// DISTANCE + NB OF VECTOR WITHIN EACH BLOCK
//DEVICE
float * results_D = NULL;
cudaMalloc((void**)& results_D, 2*NBLOCKS*sizeof(float));
//HOST
float * results = NULL;
results = (float *) malloc(2*NBLOCKS*sizeof(float));
//INITIALISATION OF SEEDS
int initialseed = time(NULL);
int nRNGs = NBLOCKS * NTHREADS ; //TO IMPROVE?????????
curandState * States_D = NULL ;
cudaMalloc((void**) & States_D, nRNGs * sizeof(curandState));
iniStates<<<NBLOCKS,NTHREADS>>>(initialseed, States_D) ;
cudaThreadSynchronize();
//CREATION OF THE GAUSSIAN VECTOR
float mean=0, stdev=1;
gaussianVector<<<NBLOCKS, NTHREADS>>>(NGAUSS, dimvect, States_D, gauss_D, mean, stdev) ;
cudaThreadSynchronize();
//LOOKING FOR THE BEST VECTOR IN EACH BLOCK
neighbor<<<NBLOCKS, NTHREADS>>>(NGAUSS, dimvect, gauss_D, results_D, ref_D) ;
cudaThreadSynchronize();
//COPY RESULTS TO HOST
cudaMemcpy(results, results_D, 2*NBLOCKS*sizeof(float), cudaMemcpyDeviceToHost) ;
//DISPLAY (ONE CLOSEST NEIGHBOR PER BLOCK)
for(int i = 0; i < 2*NBLOCKS-1 ; i+=2) {
cout<<"Block "<< i/2 +1 <<"\t Nb vector : "<< results[i] << "\t Distance : " << results[i+1] <<endl;
}
cudaFree(ref_D);
free(ref);
cudaFree(States_D);
cudaFree(results_D) ;
free(results) ;
cudaFree(gauss_D) ;
return 0;
}