Multi-device problem Only first CPU thread to launch GPU component finishes.

I am trying to do a very simple distance matrix calculation but I need to use multiple GPUs as a single GPU cannot hold all the data, so more is ideal. The problem I am having is that the only code that runs is the very first GPU call, while the rest are left as zeros. Has anyone run in to a similar problem?

Code and sample run for examples.

[codebox]#include <cuda.h>

#include

using namespace std;

#include <boost/thread.hpp>

//Compile with -lboost_thread

bool testing=true;

int numPoints, elements, numThreads, numGPUs;

int blocks, threadsPer;

float *host_array, *host_matrix;

size_t hostArraySize, hostMatrixSize;

bool badValue=false;

boost::mutex io_mutex;

global void genDist(float *array, float *distMatrix, int start, int end, int totalPoints, int totalElements){

int threadNum=blockIdx.x*blockDim.x+threadIdx.x;

int totalThreads=gridDim.x*blockDim.x;

int selfStart=start+((end-start)*threadNum)/totalThreads;

int selfEnd=start+((end-start)*(threadNum+1))/totalThreads;

//For all i in [selfStart, selfEnd)

for(int i=selfStart; i<selfEnd; i++){

//For all j

for(int j=0; j<totalPoints; j++){

  //Sum all squares of the elements of point i and j

  float dist=0;

  for(int e=0; e<totalElements; e++)

dist+=(array[i+e*totalPoints]-array[j+e*totalPoints])*(array[i+e*totalPoints]-array[j+e*totalPoints]);

  dist=sqrtf(dist);

  distMatrix[i*totalPoints+j]=dist;

}

}

}

void setConstants();

void setNumThreads();

struct gpuLaunchThread{

int threadNum;

gpuLaunchThread(int tNum):threadNum(tNum){

if (testing){

  boost::mutex::scoped_lock lock(io_mutex);

  cout<<"Thread "<<threadNum<<" launched."<<endl;

}

}

void operator()(){

cudaSetDevice(threadNum%numGPUs);

int start=(numPoints*threadNum)/numThreads;

int end=(numPoints*(threadNum+1))/numThreads;

size_t matrixSize=(end-start)*numPoints*sizeof(float);

float *matrixStart=host_matrix+start*numPoints;

if (testing){

  boost::mutex::scoped_lock lock(io_mutex);

  cout<<"Selecting device "<<threadNum%numGPUs<<"."<<endl;	

  cout<<"Indexed from ["<<start<<", "<<end<<")"<<endl;

  cout<<"Matrix starts from "<<matrixStart<<endl;

  cout<<threadNum<<" matrixSize="<<matrixSize<<endl;

}

float *distanceMatrix, *array;

cudaMalloc((void**) &distanceMatrix, matrixSize);

cudaMemset(distanceMatrix, 0, matrixSize);

cudaMalloc((void**) &array, hostArraySize);

cudaMemset(array, 0, hostArraySize);

cudaMemcpy(array, host_array, hostArraySize, cudaMemcpyHostToDevice);

genDist <<<blocks, threadsPer>>> (array, distanceMatrix, start, end, numPoints, elements);

cudaMemcpy(matrixStart, distanceMatrix, matrixSize, cudaMemcpyDeviceToHost);

cudaFree(distanceMatrix);

cudaFree(array);

}

};

int main(void){

setConstants();

if (badValue)

return 0;

boost::thread_group *threads;

threads=new boost::thread_group;

if (testing){

boost::mutex::scoped_lock lock(io_mutex);

cout<<"Total matrixSize="<<hostMatrixSize<<endl;

}

for(int i=0; i<numThreads; i++){

gpuLaunchThread tmp(i);

boost::thread* temp=new boost::thread(tmp);

threads->add_thread(temp);

if (threads->size()==numGPUs){

  threads->join_all();

  free(threads);

  threads=new boost::thread_group;

}

}

if (threads->size()!=0)

threads->join_all();

int index1, index2;

do{

cout<<"Enter two points [0, "<<numPoints<<") <Out of range to exit>: ";

cin>>index1>>index2;

if(index1>=0&&index2>=0&&index1<numPoints&&index2<numPoints){

  cout<<"===Point "<<index1<<"==="<<endl;

  for(int e=0; e<elements; e++)

cout<<host_array[index1+e*numPoints]<<" ";

  cout<<endl;

  cout<<"===Point "<<index2<<"==="<<endl;

  for(int e=0; e<elements; e++)

cout<<host_array[index2+e*numPoints]<<" ";

  cout<<endl;

  cout<<"Distance(1,2): "<<host_matrix[index1*numPoints+index2]<<endl;

  cout<<"Distance(2,1): "<<host_matrix[index2*numPoints+index1]<<endl;

}

} while(index1>=0&&index2>=0&&index1<numPoints&&index2<numPoints);

free(threads);

free(host_array);

free(host_matrix);

}

//Function to set the number of threads.

void setNumThreads(){

cudaGetDeviceCount(&numGPUs);

if (numGPUs<1){

cout<<"No CUDA capable GPUs on this machine"<<endl;

badValue=true;

return;

}

cout<<numGPUs<<" devices detected…"<<endl;

cout<<"Enter number of threads to use: ";

cin>>numThreads;

if (numThreads<1){

cout<<"Incorrect input for thread count"<<endl;

badValue=true;

return;

}

cout<<"Enter number of blocks: ";

cin>>blocks;

cout<<"Enter threads per block: ";

cin>>threadsPer;

if (blocks<1||threadsPer<1){

cout<<"Bad thread constraints."<<endl;

badValue=true;

return;

}

}

//Function to set constant values (Pointers, point data)

void setConstants(){

cout<<"Enter number of points to generate: ";

cin>>numPoints;

cout<<"Enter dimensionality of points: ";

cin>>elements;

if (numPoints<1||elements<1){

cout<<"Bad point values."<<endl;

badValue=true;

return;

}

hostArraySize=numPointselementssizeof(float);

host_array=(float*)malloc(hostArraySize);

hostMatrixSize=numPointsnumPointssizeof(float);

host_matrix=(float*)malloc(hostMatrixSize);

cout<<“Generating “<<numPoints<<” random points with “<<elements<<” elements.”<<endl;

for(int i=0; i<elements; i++)

for(int j=0; j<numPoints; j++)

  host_array[j*elements+i]=(float)(rand()%10000)/1000;

setNumThreads();

}[/codebox]

[codebox]Enter number of points to generate: 1500

Enter dimensionality of points: 5

Generating 1500 random points with 5 elements.

4 devices detected…

Enter number of threads to use: 6

Enter number of blocks: 8

Enter threads per block: 256

Total matrixSize=9000000

Thread 0 launched.

Selecting device 0.

Indexed from [0, 250)

Matrix starts from 0x2b94b2f10010

0 matrixSize=1500000

Thread 1 launched.

Thread 2 launched.

Thread 3 launched.

Selecting device 1.

Indexed from [250, 500)

Matrix starts from 0x2b94b307e370

1 matrixSize=1500000

Selecting device 2.

Indexed from [500, 750)

Matrix starts from 0x2b94b31ec6d0

2 matrixSize=1500000

Selecting device 3.

Indexed from [750, 1000)

Matrix starts from 0x2b94b335aa30

3 matrixSize=1500000

Thread 4 launched.

Selecting device 0.

Indexed from [1000, 1250)

Matrix starts from 0x2b94b34c8d90

4 matrixSize=1500000

Thread 5 launched.

Selecting device 1.

Indexed from [1250, 1500)

Matrix starts from 0x2b94b36370f0

5 matrixSize=1500000

Enter two points [0, 1500) : 0 1

===Point 0===

9.383 2.09 0.27 9.932 9.613

===Point 1===

2.853 3.923 4.684 1.246 5.739

Distance(1,2): 12.4875

Distance(2,1): 12.4875

Enter two points [0, 1500) : 0 0

===Point 0===

9.383 2.09 0.27 9.932 9.613

===Point 0===

9.383 2.09 0.27 9.932 9.613

Distance(1,2): 0

Distance(2,1): 0

Enter two points [0, 1500) : 249 248

===Point 249===

6.229 2.482 5.915 3.207 4.232

===Point 248===

7.877 9.524 9.474 7.973 5.776

Distance(1,2): 9.49057

Distance(2,1): 9.49057

Enter two points [0, 1500) : 249 250

===Point 249===

6.229 2.482 5.915 3.207 4.232

===Point 250===

6.413 7.672 1.536 6.651 5.354

Distance(1,2): 7.69842 (This first one is correct because the point i,j is computed twice. Once for i,j and once for j,i Since 249 is in the first thread [249,250] was computed but since 250 isn’t, [250,249] was not computed.)

Distance(2,1): 0

Enter two points [0, 1500) : 1000 25

===Point 1000===

8.776 2.261 6.272 8.468 7.603

===Point 25===

8.335 9.107 8.363 8.86 7.667

Distance(1,2): 0

Distance(2,1): 7.18277 (This is the same as above. [25,1000] was computed, [1000,25] was not)

Enter two points [0, 1500) : 1000 1001

===Point 1000===

8.776 2.261 6.272 8.468 7.603

===Point 1001===

7.457 4.536 1.786 7.744 6.052

Distance(1,2): 0

Distance(2,1): 0

Enter two points [0, 1500) : -1 -1[/codebox]

There was an indexing error here: distMatrix[i*totalPoints+j]=dist; should be distMatrix[(i-start)*totalPoints+j]=dist; as the entire matrix is not on each device. Moderators can feel free to remove this topic, as I am not finding the option.

There was an indexing error here: distMatrix[i*totalPoints+j]=dist; should be distMatrix[(i-start)*totalPoints+j]=dist; as the entire matrix is not on each device. Moderators can feel free to remove this topic, as I am not finding the option.