What could have caused the code to work only on one block but not for more than one?
Device Code:
/* ALG XIV.H: For each of the process’s people, each process spawns
* threads to do the following */
__global__ void cuda_susceptible(char *states_dev, int *x_locations_dev,
int *y_locations_dev, int *infected_x_locations_dev,
int *infected_y_locations_dev, int *num_infected_dev,
int *num_susceptible_dev, int *num_infection_attempts_dev,
int *num_infections_dev, curandState *cuda_states, int global_num_infected,
int infection_radius, int contagiousness_factor, char SUSCEPTIBLE, char INFECTED,
int numThread)
{
// set up shared memory
int *num_infected = (int*)array;
int *num_susceptible = (int*)&num_infected[numThread];
int *num_infection_attempts = (int*)&num_susceptible[numThread];
int *num_infections = (int*)&num_infection_attempts[numThread];
int id = threadIdx.x + blockIdx.x * blockDim.x;
int i, num_infected_nearby;
// clear the shared memory
num_infected[id] = 0;
num_susceptible[id] = 0;
num_infection_attempts[id] = 0;
num_infections[id] = 0;
/* ALG XIV.H.1: If the person is susceptible, then */
if(states_dev[id] == SUSCEPTIBLE)
{
/* ALG XIV.H.1.a: For each of the infected people (received
* earlier from all processes) or until the number of infected
* people nearby is 1, the thread does the following */
num_infected_nearby = 0;
for(i=0; i<=global_num_infected-1 && num_infected_nearby<1; i++)
{
/* ALG XIV.H.1.a.i: If person 1 is within the infection
* radius, then */
if( (x_locations_dev[id] > infected_x_locations_dev[i] - infection_radius) &&
(x_locations_dev[id] < infected_x_locations_dev[i] + infection_radius) &&
(y_locations_dev[id] > infected_y_locations_dev[i] - infection_radius) &&
(y_locations_dev[id] < infected_y_locations_dev[i] + infection_radius) )
{
/* ALG XIV.H.1.a.i.1: The thread increments the number
* of infected people nearby */
num_infected_nearby ++;
}
}
if(num_infected_nearby >= 1){
num_infection_attempts[threadIdx.x]++;
}
/* ALG XIV.H.1.b: If there is at least one infected person
* nearby, and a random number less than 100 is less than or
* equal to the contagiousness factor, then */
// generate a random number between 0 and 100
int rand_num = (int)(curand_uniform(&cuda_states[id])*100);
if(num_infected_nearby >= 1 && rand_num <= contagiousness_factor)
{
/* ALG XIV.H.1.b.i: The thread changes person1’s state to
* infected */
states_dev[id] = INFECTED;
/* ALG XIV.H.1.b.ii: The thread updates the counters */
num_infected[threadIdx.x]++;
num_susceptible[threadIdx.x]--;
num_infections[threadIdx.x]++;
}
}
__syncthreads();
// use atomicAdd function to add up results
if( threadIdx.x == 0 ) {
int num_infected_sum = 0;
int num_susceptible_sum = 0;
int num_infection_attempts_sum = 0;
int num_infections_sum = 0;
for(i=0; i<numThread; i++){
num_infected_sum += num_infected[i];
num_susceptible_sum += num_susceptible[i];
num_infection_attempts_sum += num_infection_attempts[i];
num_infections_sum += num_infections[i];
}
atomicAdd(num_infected_dev, num_infected_sum);
atomicAdd(num_susceptible_dev, num_susceptible_sum);
atomicAdd(num_infection_attempts_dev, num_infection_attempts_sum);
atomicAdd(num_infections_dev, num_infections_sum);
}
}
Host Code
void cuda_run(struct global_t *global, struct stats_t *stats,
struct const_t *constant, struct cuda_t *cuda)
{
// copy host info to device
cudaMemcpy(cuda->infected_x_locations_dev, global->infected_x_locations, cuda->people_size, cudaMemcpyHostToDevice);
cudaMemcpy(cuda->infected_y_locations_dev, global->infected_y_locations, cuda->people_size, cudaMemcpyHostToDevice);
if(global->current_day == 0){
// cast stats data to int
cuda->num_infections_int = (int)stats->num_infections;
cuda->num_infection_attempts_int = (int)stats->num_infection_attempts;
cuda->num_deaths_int = (int)stats->num_deaths;
cuda->num_recovery_attempts_int = (int)stats->num_recovery_attempts;
cudaMemcpy(cuda->num_infections_dev, &cuda->num_infections_int, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(cuda->num_infection_attempts_dev, &cuda->num_infection_attempts_int, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(cuda->num_deaths_dev, &cuda->num_deaths_int, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(cuda->num_recovery_attempts_dev, &cuda->num_recovery_attempts_int, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(cuda->x_locations_dev, global->x_locations, cuda->people_size, cudaMemcpyHostToDevice);
cudaMemcpy(cuda->y_locations_dev, global->y_locations, cuda->people_size, cudaMemcpyHostToDevice);
cudaMemcpy(cuda->states_dev, global->states, cuda->states_size, cudaMemcpyHostToDevice);
cudaMemcpy(cuda->num_days_infected_dev, global->num_days_infected, cuda->people_size, cudaMemcpyHostToDevice);
cudaMemcpy(cuda->num_susceptible_dev, &global->num_susceptible, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(cuda->num_immune_dev, &global->num_immune, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(cuda->num_dead_dev, &global->num_dead, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(cuda->num_infected_dev, &global->num_infected, sizeof(int), cudaMemcpyHostToDevice);
}
/* set up cuda Random Number Generator */
// set up curand states
curandState *cuda_states;
cudaMalloc(&cuda_states, cuda->numThread * cuda->numBlock);
// obatin current time as seed
time_t current_time;
time(¤t_time);
// initialize rand_kernel function
rand_kernel<<<cuda->numBlock, cuda->numThread>>>(cuda_states, (unsigned long)current_time);
// execute device code on susceptible people
int infection_radius = constant->infection_radius;
int contagiousness_factor = constant->contagiousness_factor;
int num_infected = global->num_infected;
cuda_susceptible<<<cuda->numBlock, cuda->numThread, 4*cuda->numThread*sizeof(int)>>>(
cuda->states_dev, cuda->x_locations_dev, cuda->y_locations_dev,
cuda->infected_x_locations_dev, cuda->infected_y_locations_dev,
cuda->num_infected_dev, cuda->num_susceptible_dev,
cuda->num_infection_attempts_dev, cuda->num_infections_dev,
cuda_states, num_infected, infection_radius,
contagiousness_factor, SUSCEPTIBLE, INFECTED, cuda->numThread);
// Sync
cudaThreadSynchronize();
cudaMemcpy(global->x_locations, cuda->x_locations_dev, cuda->people_size, cudaMemcpyDeviceToHost);
cudaMemcpy(global->y_locations, cuda->y_locations_dev, cuda->people_size, cudaMemcpyDeviceToHost);
cudaMemcpy(global->states, cuda->states_dev, cuda->states_size, cudaMemcpyDeviceToHost);
cudaMemcpy(&global->num_infected, cuda->num_infected_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(cuda_states);
printf("x location %d y location %d state %c \n", global->x_locations[10],
global->y_locations[10], global->states[10]);
/*int i;
for(i=0; i<global->number_of_people; i++){
printf("x location %d y location %d \n", global->x_locations[i], global->y_locations[i]);
}*/
printf("infected number %d \n", global->num_infected);
if(global->current_day == constant->total_number_of_days){
cudaMemcpy(&global->num_susceptible, cuda->num_susceptible_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&global->num_immune, cuda->num_immune_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&global->num_dead, cuda->num_dead_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&cuda->num_infections_int, cuda->num_infections_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&cuda->num_infection_attempts_int, cuda->num_infection_attempts_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&cuda->num_deaths_int, cuda->num_deaths_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&cuda->num_recovery_attempts_int, cuda->num_recovery_attempts_dev, sizeof(int), cudaMemcpyDeviceToHost);
stats->num_infections = (double)cuda->num_infections_int;
stats->num_infection_attempts = (double)cuda->num_infection_attempts_int;
stats->num_deaths = (double)cuda->num_deaths_int;
stats->num_recovery_attempts = (double)cuda->num_recovery_attempts_int;
}
}