Here is the code I launch:
// Host-side driver for one mainCalcCuda step.
//
// Uploads the particle array and a flattened neighbour table (a fixed stride of
// kMaxNeighbours slots per particle) to the GPU, runs the kernel with one block
// per particle, and copies both arrays back.
//
// Particle::neighbours and Neighbour::particle are raw pointers, so every one of
// them has to be rebased when the data crosses the host/device boundary:
//   host -> device:  dParticles    + (hostPtr - ps->particles)
//   device -> host:  ps->particles + (devPtr  - dParticles)
// Skipping the second rebasing leaves device addresses inside the host-side
// neighbour lists; dereferencing them on the CPU yields garbage or a crash.
const int kMaxNeighbours  = 128;   // per-particle slot count in the flattened table
const int kDebugParticle  = 1966;  // particle inspected by the debug dump
const int kDebugNeighbour = 10;    // neighbour slot inspected by the debug dump

int N = ps->getPartNum();
const size_t particleBytes  = (size_t)N * sizeof(Particle);
const size_t neighbourCount = (size_t)N * kMaxNeighbours;
const size_t neighbourBytes = neighbourCount * sizeof(Neighbour);

// "\\" is a literal backslash; the original "c:\exec.txt" contained the
// non-standard escape sequence "\e" instead of a path separator.
FILE *ofp = fopen("c:\\exec.txt", "w");
if (ofp == NULL)
{
    fprintf(stderr, "could not open c:\\exec.txt for writing\n");
}
// Only dump the probe value when the file is open and both indices are in range.
if (ofp != NULL && N > kDebugParticle &&
    ps->particles[kDebugParticle].num_of_neighbours > kDebugNeighbour)
{
    fprintf(ofp, "neighb 1966 10 = %f \n",
            ps->particles[kDebugParticle].neighbours[kDebugNeighbour].particle->density);
}

Particle*  dParticles  = NULL;
Neighbour* dNeighbours = NULL;
cudaError_t err;

err = cudaMalloc((void **) &dParticles, particleBytes);
if (err != cudaSuccess)
    printf("cudaMalloc(dParticles): %s \n", cudaGetErrorString(err));
err = cudaMalloc((void **) &dNeighbours, neighbourBytes);
if (err != cudaSuccess)
    printf("cudaMalloc(dNeighbours): %s \n", cudaGetErrorString(err));

// Host staging copies whose embedded pointers are rewritten to device addresses.
Particle*  pParticles_temp  = new Particle[N];
Neighbour* pNeighbours_temp = new Neighbour[neighbourCount]();  // value-init unused slots
memcpy(pParticles_temp, ps->particles, particleBytes);
for (int i = 0; i < N; i++)
{
    pParticles_temp[i].neighbours = dNeighbours + (size_t)i * kMaxNeighbours;

    // Never write past this particle's kMaxNeighbours slots.
    int used = pParticles_temp[i].num_of_neighbours;
    if (used > kMaxNeighbours)
        used = kMaxNeighbours;

    for (int j = 0; j < used; j++)
    {
        Neighbour& slot = pNeighbours_temp[(size_t)i * kMaxNeighbours + j];
        // Copy the whole record so fields other than .particle also reach the device...
        slot = ps->particles[i].neighbours[j];
        // ...then rebase the particle pointer from the host array to the device array.
        slot.particle = dParticles + (ps->particles[i].neighbours[j].particle - ps->particles);
    }
}
cudaMemcpy(dParticles, pParticles_temp, particleBytes, cudaMemcpyHostToDevice);
cudaMemcpy(dNeighbours, pNeighbours_temp, neighbourBytes, cudaMemcpyHostToDevice);
delete[] pParticles_temp;
delete[] pNeighbours_temp;

cudaPrintfInit();
mainCalcCuda <<< N, 1 >>> (dParticles, framenum);
// Launch-configuration errors are reported by cudaGetLastError right after the launch;
// faults raised while the kernel runs are reported by the synchronising call.
err = cudaGetLastError();
if (err != cudaSuccess)
    printf("mainCalcCuda launch: %s \n", cudaGetErrorString(err));
err = cudaThreadSynchronize();
if (err != cudaSuccess)
    printf("mainCalcCuda execution: %s \n", cudaGetErrorString(err));

cudaMemcpy(ps->particles, dParticles, particleBytes, cudaMemcpyDeviceToHost);

// neighbours from device to host: one bulk copy, then per-particle fix-up
Neighbour* hNeighbours = new Neighbour[neighbourCount];
cudaMemcpy(hNeighbours, dNeighbours, neighbourBytes, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++)
{
    // After the copy above ps->particles[i].neighbours holds a device address,
    // so it is replaced with a fresh host array.
    // NOTE(review): the host array this particle owned before the upload is not
    // released here because its allocation is not visible in this code - confirm ownership.
    Neighbour* row = new Neighbour[kMaxNeighbours];
    for (int j = 0; j < kMaxNeighbours; j++)
        row[j] = hNeighbours[(size_t)i * kMaxNeighbours + j];

    int used = ps->particles[i].num_of_neighbours;
    if (used > kMaxNeighbours)
        used = kMaxNeighbours;

    // Rebase device particle pointers back into the host particle array.
    for (int j = 0; j < used; j++)
        row[j].particle = ps->particles + (row[j].particle - dParticles);

    ps->particles[i].neighbours = row;
}
delete[] hNeighbours;

printf("%s \n", cudaGetErrorString(cudaGetLastError()));

// The neighbour pointers are host addresses again, so this dereference is valid.
if (ofp != NULL && N > kDebugParticle &&
    ps->particles[kDebugParticle].num_of_neighbours > kDebugNeighbour)
{
    fprintf(ofp, "after neighb 1966 10 = %f \n",
            ps->particles[kDebugParticle].neighbours[kDebugNeighbour].particle->density);
}
if (ofp != NULL)
    fclose(ofp);

cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();

// Release the device buffers allocated above.
cudaFree(dNeighbours);
cudaFree(dParticles);
Before passing the data to the GPU I check
ps->particles[1966].neighbours[10].particle->density
and everything seems OK (the value is zero).
But after receiving the data back from the GPU, something goes wrong:
ps->particles[1966].neighbours[10].particle->density
becomes 1.#QNAN0 and an error occurs.
Here is the code where I first see 1.#QNAN (inside the mainCalcCuda kernel):
// Excerpt from inside the mainCalcCuda kernel; `p` and `idx` are defined outside this excerpt.
// Sums c->mass * w over the first p.num_of_neighbours entries of p.neighbours into `summ`.
Neighbour* neighbours = p.neighbours;
float summ = 0;
for(int k = 0; k < p.num_of_neighbours; k++){
// The neighbour's particle pointer is dereferenced in device code below, so it has to be
// an address inside the device particle array (the host code builds it as dParticles + offset).
Particle* c = neighbours[k].particle;
// Debug trace of the neighbour's density, tagged with idx and the neighbour slot k.
cuPrintf("c.density %d %d = %f;\n", idx, k, c->density); // the NaN is first observed here
// w comes from cudaW, given the cudaNorm of the prevPosition difference and the neighbour itself
// (cudaW and cudaNorm are defined elsewhere).
float w = cudaW(cudaNorm(c->prevPosition - p.prevPosition), c);
summ += c->mass * w;
}
All other neighbours are OK. There are 1970 particles in the system.
Am I passing and receiving the data in the right way? What do you think is wrong?