Erroneous results when launching more than 32 threads

include <cmath>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"


using namespace std;

typedef struct {
  float record[8];
  float dis;
  int t_class;
}node;

/* Distance metric */
float Dist (float * a, float *b, int width){

  float rst = 0;

  for (int i =0; i < width; i++){
    rst += (a[i] - b[i])*(a[i] - b[i]);
  }

  return sqrt(rst);

}

void init(node * &in_data, int &num_record, int &num_feature, float * &new_point, int &k){

  const char* filename = "Data/abalone_4177_8.csv";
  in_data = CSV_Read(filename);


  num_record = 4177;
  num_feature = 8;

  new_point = (float*)malloc(num_feature*sizeof(float));

  new_point[0] = 2;
  new_point[1] = 0.18;
  new_point[2] = 0.11;
  new_point[3] = 0.02;
  new_point[4] = 0.445;
  new_point[5] = 0.078;
  new_point[6] = 0.111;
  new_point[7] = 0.245;


  k = 1;

}

#pragma acc routine seq
void knn (node * in_data, int num_record, int num_feature, float * new_point, int k, node * rst,int id){

  /* Exhaustive Search */
  for (int i=0; i < num_record; i++){
    in_data[i].dis = Dist(in_data[i].record,new_point,num_feature);
  }

  node tmp;
  for (int i=0; i< num_record-1;i++)
    {
    for (int j =0; j<num_record-i-1; j++)
      {
        if (in_data[j].dis > in_data[j+1].dis)
          {
            tmp = in_data[j];
            in_data[j] = in_data[j+1];
            in_data[j+1] = tmp;
          }
      }
      }

  /* K nearest result */
  for (int i=0;i<k;i++)
    rst[id*k+i] = in_data[i];


}
int main(){

  node * in_data;
  int num_record;
  int num_feature;
  float * new_point;
  int k;

  init(in_data, num_record, num_feature, new_point, k);
  int threads = 33;

  node * rst = (node*)malloc(threads*k*sizeof(node));

  node * in_data_copy = (node*)malloc(4177*sizeof(node));

  float seed;
  float rand;
  float z0 = 0;
  float z1 =0;
  bool generate = false;

  StartTimer();

#pragma acc parallel loop gang worker vector pcopyin(in_data[0:num_record],new_point[0:num_feature]) pcopyout(rst[0:threads*k]\
)
    for (int i=0; i<threads; i++){

      /*                                                                                                                       
      for (int j = 0; j< 4177; j++){                                                                                           
        in_data_copy[j] = in_data[j];                                                                                          
        }*/

      knn(in_data,num_record,num_feature,new_point,k,rst,i);

    }


  double runtime = GetTimer();
  printf(" total: %f s\n", runtime / 1000.f);

  for (int i=0; i<threads; i++){
    for (int ii=0; ii<k; ii++){
      for (int j = 0; j<8;j++)
        std::cout << rst[i*k+ii].record[j] << " ";
      std::cout<< " | ";
    }
    std::cout << std::endl;
  }



}

When launching l<= 32 threads, the results are correct.
Yet when more than 32 threads, results are erroneous.

Hi EvanzzzZ,

Since there’s missing files I can’t replicate your results. But in looking at your code I see the following in knn:

 
#pragma acc routine seq 
void knn (node * in_data, int num_record, int num_feature, float * new_point, int k, node * rst,int id){ 

  /* Exhaustive Search */ 
  for (int i=0; i < num_record; i++){ 
    in_data[i].dis = Dist(in_data[i].record,new_point,num_feature); 
  } 

  node tmp; 
  for (int i=0; i< num_record-1;i++) 
    { 
    for (int j =0; j<num_record-i-1; j++) 
      { 
        if (in_data[j].dis > in_data[j+1].dis) 
          { 
            tmp = in_data[j]; 
            in_data[j] = in_data[j+1]; 
            in_data[j+1] = tmp; 
          } 
      } 
      } 

  /* K nearest result */ 
  for (int i=0;i<k;i++) 
    rst[id*k+i] = in_data[i]; 


}

This is a race condition. It most likely works with 32 threads since this would be a single warp which are executing in SIMT. But once you move to multiple warps, some threads could be performing the swap at the same time others are executing the “Exhaustive Search” loop.

I think this is why you’ve been asking about “private”. So, yes you could create a private copy of “in_data” for every thread and initialize the private copies, but that would just mean you’re having every thread perform redundant computation. Better to rethink how you can change you algorithm to be parallelizable.

  • Mat

This is a race condition. It most likely works with 32 threads since this would be a single warp which are executing in SIMT. But once you move to multiple warps, some threads could be performing the swap at the same time others are executing the “Exhaustive Search” loop.

Yes. But all threads are dealing with the same data set. Even some threads are doing swapping, the other threads are trying to update the distance into the same state as the swapping threads. So the data race should not cause an error here. And the same code would be correct if used as OpenCL kernel.

Sorry EvanzzzZ, I not convinced. Maybe if you have each loop be it’s own kernel. (The CUDA kNN version I looked at does it this way).

The other issue is that you have every thread doing redundant work. I’d recommend you investigate how you can change the algorithm so that you can decompose the problem across thread.

  • Mat