I’ve been struggling with a strange CUDA behavior that overwrites all the values in my structure with zero (at least as printed with printf("%e")). I have narrowed the cause down to an IF statement that should have no impact on the results, but I cannot figure out why this is happening.
The rough structure of my __global__ function is shown below. struct Particle has an int member ‘lost’, which is set to zero for every particle, so I expected the commented IF statement to have no effect.
__global__ void Kernel_test(int n_particle, Particle *particle_list ) {
    int index_i, index_j;
    Particle particle_i, particle_j;
    __shared__ double sdata[128][27];
    ...
    /* 'sdata' initialization */
    ...
    __syncthreads();
    ...
    // COMMENTING this IF statement on and off has an impact on the results
    // (it should have none, since 'lost' is zero for all particles)
    // if (particle_j.lost == 0) { // only if the neighboring particle is not lost
        // put some arbitrary values
        for (int i_mat = 0; i_mat < 3; i_mat++) {
            for (int j_mat = 0; j_mat < 3; j_mat++) {
                sdata[threadIdx.x][ptr_matrix0 + 3*i_mat + j_mat] = 10.;
            }
        }
    // }
    __syncthreads();
}
But it turns out it does: if I enable the IF statement, the calculated ‘stress’ values become zero, when they should be ‘50’. Since ‘lost’ is initialized to zero for all particles, the presence of the IF statement should in principle have no effect.
The behavior of this reduced test is a bit different from my real problem (where all the other members of the particles also appear to be overwritten with zeros), but it is already strange and interesting as it is.
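For reference, the expected value of 50 comes from the reduction over sdata (a quick sketch using the numbers in the code below: 5 neighbor threads each write 10., and the remaining 123 rows of sdata stay 0.):

    stress[i_mat][j_mat] = sum over the 128 rows of sdata[i_row][3*i_mat + j_mat]
                         = 5 * 10. + 123 * 0.
                         = 50.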
My full reduced code is below.
#include <stdio.h>

typedef struct Particle {
    double rho;
    double vel[3];
    double stress[3][3];
    int n_neighbor;
    int *neighbor_list;
    int lost;
} Particle;

__global__ void Kernel_test(int n_particle, Particle *particle_list ) {
    int index_i, index_j;
    Particle particle_i, particle_j;
    __shared__ double sdata[128][27]; // n_thread is fixed to '128', and each thread will calculate three 3x3 matrices
                                      // '128' is to accommodate a large number of neighbors, although only 5 are used in this test
    int ptr_matrix0 = 0;
    int ptr_matrix1 = 9;  // dummy, not actually used in this example
    int ptr_matrix2 = 18; // dummy, not actually used in this example
    for (int i_mat = 0; i_mat < 3; i_mat++) {
        for (int j_mat = 0; j_mat < 3; j_mat++) {
            sdata[threadIdx.x][ptr_matrix0 + 3*i_mat + j_mat] = 0.;
            sdata[threadIdx.x][ptr_matrix1 + 3*i_mat + j_mat] = 0.;
            sdata[threadIdx.x][ptr_matrix2 + 3*i_mat + j_mat] = 0.;
        }
    }
    __syncthreads();
    // each block is assigned to one particle
    index_i = blockIdx.x;
    if (index_i < n_particle) { // safeguard
        particle_i = particle_list[index_i];
        if (threadIdx.x < particle_i.n_neighbor) { // n_neighbor = 5 in this test
            index_j = particle_i.neighbor_list[threadIdx.x];
            particle_j = particle_list[index_j];
            // COMMENTING this IF statement on and off has an impact on the results
            // if (particle_j.lost == 0) { // only if the neighboring particle is not lost
                // put some arbitrary values
                for (int i_mat = 0; i_mat < 3; i_mat++) {
                    for (int j_mat = 0; j_mat < 3; j_mat++) {
                        sdata[threadIdx.x][ptr_matrix0 + 3*i_mat + j_mat] = 10.;
                    }
                }
            // } // particle_j.lost
        }
    }
    __syncthreads();
    // then the 0th thread collects the results and pushes them back to 'particle_list'
    if (threadIdx.x == 0) {
        if (index_i < n_particle) { // safeguard
            // clean up the previously filled values
            for (int i_mat = 0; i_mat < 3; i_mat++) {
                for (int j_mat = 0; j_mat < 3; j_mat++) {
                    particle_list[index_i].stress[i_mat][j_mat] = 0.;
                }
            }
            // reduce 'sdata' row by row
            for (int i_row = 0; i_row < 128; i_row++) {
                for (int i_mat = 0; i_mat < 3; i_mat++) {
                    for (int j_mat = 0; j_mat < 3; j_mat++) {
                        particle_list[index_i].stress[i_mat][j_mat] += sdata[i_row][3*i_mat + j_mat];
                    }
                }
            }
        }
    }
}
int main() {
    Particle *particles, *dev_particles;
    int n_particle = 1000;
    particles = (Particle *)malloc( n_particle * sizeof(Particle) );
    for (int i = 0; i < n_particle; i++) {
        particles[i].rho = (double)i * 10.;
        particles[i].vel[0] = 0.;
        particles[i].vel[1] = (double)i * 0.1;
        particles[i].vel[2] = 0.;
        for (int i_mat = 0; i_mat < 3; i_mat++) {
            for (int j_mat = 0; j_mat < 3; j_mat++) {
                particles[i].stress[i_mat][j_mat] = 0.;
            }
        }
        particles[i].n_neighbor = 5;
        particles[i].neighbor_list = (int *)malloc(5 * sizeof(int));
        particles[i].neighbor_list[0] = 100;
        particles[i].neighbor_list[1] = 101;
        particles[i].neighbor_list[2] = 102;
        particles[i].neighbor_list[3] = 103;
        particles[i].neighbor_list[4] = 104;
        particles[i].lost = 0;
    }
    // print an arbitrary part of the particles
    printf("before Kernel_test \n");
    for (int i = 500; i < 510; i++) {
        printf("%d th: rho %e, vel[0] %e, vel[1] %e, stress[0][0] %e, stress[1][0] %e \n",
               i, particles[i].rho, particles[i].vel[0], particles[i].vel[1], particles[i].stress[0][0], particles[i].stress[1][0] );
    }
    cudaMalloc( (void **)&dev_particles, n_particle * sizeof(Particle) );
    cudaMemcpy( dev_particles, particles, n_particle * sizeof(Particle), cudaMemcpyHostToDevice );
    // each block is assigned one particle; each thread calculates the contribution of one neighbor
    int n_thread = 128;
    int n_block = n_particle;
    //Kernel_printNeighbor <<<n_block, n_thread>>> ( n_particle, dev_particles );
    Kernel_test <<<n_block, n_thread>>> ( n_particle, dev_particles );
    cudaMemcpy( particles, dev_particles, n_particle * sizeof(Particle), cudaMemcpyDeviceToHost );
    // print an arbitrary part of the particles
    printf("after Kernel_test \n");
    for (int i = 500; i < 510; i++) {
        printf("%d th: rho %e, vel[0] %e, vel[1] %e, stress[0][0] %e, stress[1][0] %e \n",
               i, particles[i].rho, particles[i].vel[0], particles[i].vel[1], particles[i].stress[0][0], particles[i].stress[1][0] );
    }
    return 0;
}
This code is based on a particle scheme in which the evolution of the i-th particle is determined by the contributions from its neighbors. In principle, the neighbors would be found via dedicated functions; here the neighbors of every particle are simply set to the 100th, 101st, 102nd, 103rd, and 104th particles, so each particle has 5 neighbors.
Each block is assigned to one particle, and the threads within the block calculate the contributions from its neighbors. Although only 5 threads are needed in this example, 128 threads are hard-coded to accommodate more neighbors in the future.
Each thread stores its contribution in shared memory, sdata[threadIdx.x][:], which is then reduced and pushed back to the original particle by the threadIdx.x == 0 thread.
There are 1000 particles in this test, and only particles 500 to 509 are printed to keep the output short.
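Side note: the reduced code above launches the kernel without any error checking. For completeness, a minimal sketch of what that could look like around the launch, using the standard cudaGetLastError / cudaDeviceSynchronize runtime calls (this is not part of my original run):

Kernel_test <<<n_block, n_thread>>> ( n_particle, dev_particles );
// cudaGetLastError() reports launch-configuration failures
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("launch error: %s \n", cudaGetErrorString(err));
// the code returned by cudaDeviceSynchronize() reports errors that occur while the kernel runs
err = cudaDeviceSynchronize();
if (err != cudaSuccess) printf("execution error: %s \n", cudaGetErrorString(err));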