Ok, so I made a minimal sample that does not run on the Tesla cards, but on all others. I deleted most of the stuff that isn’t necessary, it’s still long…
[codebox]
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <float.h>
#include <fenv.h>
// includes, cuda
#include <cutil.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cutil_inline.h>
#define blocksize 128
#define number_of_gridpoints_per_cell 4
#define number_of_cells 10
#define max_number_of_particles_per_cell 10000
global void presort_neighbors(float4* Particles, int* Number_of_rejected_particles, int* Number_of_particles_in_cell, int* GPU_left_escape, int* GPU_right_escape, int charge)
{
shared int shared_index[4];
int tid = threadIdx.x;
int cell = blockIdx.x;
int particle_offset = blockDim.x;
if(tid==0) shared_index[0] = Number_of_particles_in_cell[cell]; //current number in this cell
if(tid==1) //get particles from left
{
if(cell>0) shared_index[1] = Number_of_rejected_particles[cell-1]; //left cell
else
{
shared_index[1] = 0; //counter for escape in first (left) cell
}
}
if(tid==2) //get particles from right
{
if(cell<number_of_cells-1) shared_index[2] = Number_of_rejected_particles[cell+1]; //right cell
else
{
shared_index[2] = 0; //counter for escape in last (right) cell
}
}
if(tid==3)
{
if((cell==0)||(cell==number_of_cells-1))
{
shared_index[3] = Number_of_rejected_particles[cell];
}
else
{
shared_index[3] = 0;
}
}
__syncthreads();
if(tid < particle_offset/2)
{
if(cell>0) //get particles from left
{
while(tid < shared_index[1]) //get particles with half of threads from the left until all rejected ones from the left are done
{
float4 particle = Particles[(cell-1)*max_number_of_particles_per_cell + max_number_of_particles_per_cell - tid -1];
int new_particle_cell = __float2int_rd(particle.w / ((float)(number_of_gridpoints_per_cell-1)));
if(new_particle_cell==cell)
{
int new_particle_index_in_current_cell = atomicAdd(&shared_index[0],1);
Particles[cell*max_number_of_particles_per_cell + new_particle_index_in_current_cell] = particle;
}
tid += particle_offset/2;
}
}
else //if cell == 0 → count left escapes
{
while(tid < shared_index[3]) //get particles with half of threads from the left until all rejected ones from the left are done
{
// if(tid==0) printf(“[0].w = %f \n”,Particles[max_number_of_particles_per_cell - 1].w); //works
if(tid==0) printf("[0].w = %f \n",Particles[max_number_of_particles_per_cell - tid - 1].w); //Doesn't work
if(tid>0)
{
float4 particle = Particles[max_number_of_particles_per_cell -1 -tid];
printf(".w = %f \n",particle.w);
//new:
int new_particle_cell = __float2int_rd(particle.w / ((float)(number_of_gridpoints_per_cell-1)));
if(new_particle_cell < 0)
{
atomicAdd(&shared_index[1],1);
}
}
tid += (int)particle_offset/2;
}
}
}
else
{
//not needed here
}
__syncthreads();
//
//not needed here
//
}
int main(int argc, char* argv)
{
cudaError_t error = cudaSetDevice(0);
if(error == cudaSuccess) printf(“device set!\n”);
else
{
printf("device not found!\n");
exit(1);
}
srand48(time(0)); //initialize RNG
int cell,particle, gridpoint; //counter
int total_number_of_particles = 50000;
int init_number_of_particles = total_number_of_particles / number_of_cells; //per cell
printf(“%d particles per cell \n”, init_number_of_particles);
float4* GPU_particle_array;
CUDA_SAFE_CALL(cudaMalloc((void**)&GPU_particle_array, max_number_of_particles_per_cell * number_of_cells * sizeof(float4)));
float4* CPU_particle_array;
CPU_particle_array = (float4*) malloc(max_number_of_particles_per_cell * number_of_cells * sizeof(float4));
int* GPU_number_of_rejected_particles;
CUDA_SAFE_CALL(cudaMalloc((void**)&GPU_number_of_rejected_particles, number_of_cells * sizeof(int)));
int* GPU_number_of_particles_per_cell;
CUDA_SAFE_CALL(cudaMalloc((void**)&GPU_number_of_particles_per_cell, number_of_cells * sizeof(int)));
int* CPU_number_of_rejected_particles;
CPU_number_of_rejected_particles = (int*) malloc(number_of_cells*sizeof(int));
int* CPU_number_of_particles_per_cell;
CPU_number_of_particles_per_cell = (int*) malloc(number_of_cells*sizeof(int));
int *GPU_left_particle_escape, *GPU_right_particle_escape;
CUDA_SAFE_CALL(cudaMalloc((void**)&GPU_left_particle_escape, sizeof(int)));
CUDA_SAFE_CALL(cudaMalloc((void**)&GPU_right_particle_escape, sizeof(int)));
int CPU_left_particle_escape, CPU_right_particle_escape;
for(cell=0; cell < number_of_cells; cell++)
{
for(particle=0; particle<max_number_of_particles_per_cell; particle++)
{
CPU_particle_array[cell*max_number_of_particles_per_cell + particle].x = drand48()-0.5f;
CPU_particle_array[cell*max_number_of_particles_per_cell + particle].y = drand48()-0.5f;
CPU_particle_array[cell*max_number_of_particles_per_cell + particle].z = drand48()-0.5f;
CPU_particle_array[cell*max_number_of_particles_per_cell + particle].w = (float)cell*(number_of_gridpoints_per_cell-1) + drand48()*(number_of_gridpoints_per_cell-1);
}
CPU_number_of_particles_per_cell[cell] = init_number_of_particles;
}
//put 10 “wrong” particles to the top of each cell:
for(cell=0; cell < number_of_cells; cell++)
{
for(particle=max_number_of_particles_per_cell-1; particle>max_number_of_particles_per_cell-11; particle--)
{
CPU_particle_array[cell*max_number_of_particles_per_cell + particle].w -= 1.0f;
}
CPU_number_of_rejected_particles[cell] = 10;
}
CUDA_SAFE_CALL(cudaMemcpy(GPU_number_of_particles_per_cell, CPU_number_of_particles_per_cell, number_of_cells * sizeof(int), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(GPU_number_of_rejected_particles, CPU_number_of_rejected_particles, number_of_cells * sizeof(int), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(GPU_particle_array, CPU_particle_array, max_number_of_particles_per_cell * number_of_cells * sizeof(float4), cudaMemcpyHostToDevice));
int zero = 0;
CUDA_SAFE_CALL(cudaMemcpy(GPU_left_particle_escape, &zero, sizeof(int), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(GPU_right_particle_escape, &zero, sizeof(int), cudaMemcpyHostToDevice));
presort_neighbors <<< number_of_cells,blocksize >>> (GPU_particle_array, GPU_number_of_rejected_particles, GPU_number_of_particles_per_cell, GPU_left_particle_escape, GPU_right_particle_escape, 0);
cutilSafeCall( cudaThreadSynchronize() );
printf(“Program finished.\n”);
return 0;
}
[/codebox]