I am trying to do a very simple distance matrix calculation but I need to use multiple GPUs as a single GPU cannot hold all the data, so more is ideal. The problem I am having is that the only code that runs is the very first GPU call, while the rest are left as zeros. Has anyone run in to a similar problem?
Code and sample run for examples.
[codebox]#include <cuda.h>
using namespace std;
#include <boost/thread.hpp>
//Compile with -lboost_thread
bool testing=true;
int numPoints, elements, numThreads, numGPUs;
int blocks, threadsPer;
float *host_array, *host_matrix;
size_t hostArraySize, hostMatrixSize;
bool badValue=false;
boost::mutex io_mutex;
global void genDist(float *array, float *distMatrix, int start, int end, int totalPoints, int totalElements){
int threadNum=blockIdx.x*blockDim.x+threadIdx.x;
int totalThreads=gridDim.x*blockDim.x;
int selfStart=start+((end-start)*threadNum)/totalThreads;
int selfEnd=start+((end-start)*(threadNum+1))/totalThreads;
//For all i in [selfStart, selfEnd)
for(int i=selfStart; i<selfEnd; i++){
//For all j
for(int j=0; j<totalPoints; j++){
//Sum all squares of the elements of point i and j
float dist=0;
for(int e=0; e<totalElements; e++)
dist+=(array[i+e*totalPoints]-array[j+e*totalPoints])*(array[i+e*totalPoints]-array[j+e*totalPoints]);
dist=sqrtf(dist);
distMatrix[i*totalPoints+j]=dist;
}
}
}
void setConstants();
void setNumThreads();
struct gpuLaunchThread{
int threadNum;
gpuLaunchThread(int tNum):threadNum(tNum){
if (testing){
boost::mutex::scoped_lock lock(io_mutex);
cout<<"Thread "<<threadNum<<" launched."<<endl;
}
}
void operator()(){
cudaSetDevice(threadNum%numGPUs);
int start=(numPoints*threadNum)/numThreads;
int end=(numPoints*(threadNum+1))/numThreads;
size_t matrixSize=(end-start)*numPoints*sizeof(float);
float *matrixStart=host_matrix+start*numPoints;
if (testing){
boost::mutex::scoped_lock lock(io_mutex);
cout<<"Selecting device "<<threadNum%numGPUs<<"."<<endl;
cout<<"Indexed from ["<<start<<", "<<end<<")"<<endl;
cout<<"Matrix starts from "<<matrixStart<<endl;
cout<<threadNum<<" matrixSize="<<matrixSize<<endl;
}
float *distanceMatrix, *array;
cudaMalloc((void**) &distanceMatrix, matrixSize);
cudaMemset(distanceMatrix, 0, matrixSize);
cudaMalloc((void**) &array, hostArraySize);
cudaMemset(array, 0, hostArraySize);
cudaMemcpy(array, host_array, hostArraySize, cudaMemcpyHostToDevice);
genDist <<<blocks, threadsPer>>> (array, distanceMatrix, start, end, numPoints, elements);
cudaMemcpy(matrixStart, distanceMatrix, matrixSize, cudaMemcpyDeviceToHost);
cudaFree(distanceMatrix);
cudaFree(array);
}
};
int main(void){
setConstants();
if (badValue)
return 0;
boost::thread_group *threads;
threads=new boost::thread_group;
if (testing){
boost::mutex::scoped_lock lock(io_mutex);
cout<<"Total matrixSize="<<hostMatrixSize<<endl;
}
for(int i=0; i<numThreads; i++){
gpuLaunchThread tmp(i);
boost::thread* temp=new boost::thread(tmp);
threads->add_thread(temp);
if (threads->size()==numGPUs){
threads->join_all();
free(threads);
threads=new boost::thread_group;
}
}
if (threads->size()!=0)
threads->join_all();
int index1, index2;
do{
cout<<"Enter two points [0, "<<numPoints<<") <Out of range to exit>: ";
cin>>index1>>index2;
if(index1>=0&&index2>=0&&index1<numPoints&&index2<numPoints){
cout<<"===Point "<<index1<<"==="<<endl;
for(int e=0; e<elements; e++)
cout<<host_array[index1+e*numPoints]<<" ";
cout<<endl;
cout<<"===Point "<<index2<<"==="<<endl;
for(int e=0; e<elements; e++)
cout<<host_array[index2+e*numPoints]<<" ";
cout<<endl;
cout<<"Distance(1,2): "<<host_matrix[index1*numPoints+index2]<<endl;
cout<<"Distance(2,1): "<<host_matrix[index2*numPoints+index1]<<endl;
}
} while(index1>=0&&index2>=0&&index1<numPoints&&index2<numPoints);
free(threads);
free(host_array);
free(host_matrix);
}
//Function to set the number of threads.
void setNumThreads(){
cudaGetDeviceCount(&numGPUs);
if (numGPUs<1){
cout<<"No CUDA capable GPUs on this machine"<<endl;
badValue=true;
return;
}
cout<<numGPUs<<" devices detected…"<<endl;
cout<<"Enter number of threads to use: ";
cin>>numThreads;
if (numThreads<1){
cout<<"Incorrect input for thread count"<<endl;
badValue=true;
return;
}
cout<<"Enter number of blocks: ";
cin>>blocks;
cout<<"Enter threads per block: ";
cin>>threadsPer;
if (blocks<1||threadsPer<1){
cout<<"Bad thread constraints."<<endl;
badValue=true;
return;
}
}
//Function to set constant values (Pointers, point data)
void setConstants(){
cout<<"Enter number of points to generate: ";
cin>>numPoints;
cout<<"Enter dimensionality of points: ";
cin>>elements;
if (numPoints<1||elements<1){
cout<<"Bad point values."<<endl;
badValue=true;
return;
}
hostArraySize=numPointselementssizeof(float);
host_array=(float*)malloc(hostArraySize);
hostMatrixSize=numPointsnumPointssizeof(float);
host_matrix=(float*)malloc(hostMatrixSize);
cout<<“Generating “<<numPoints<<” random points with “<<elements<<” elements.”<<endl;
for(int i=0; i<elements; i++)
for(int j=0; j<numPoints; j++)
host_array[j*elements+i]=(float)(rand()%10000)/1000;
setNumThreads();
}[/codebox]
[codebox]Enter number of points to generate: 1500
Enter dimensionality of points: 5
Generating 1500 random points with 5 elements.
4 devices detected…
Enter number of threads to use: 6
Enter number of blocks: 8
Enter threads per block: 256
Total matrixSize=9000000
Thread 0 launched.
Selecting device 0.
Indexed from [0, 250)
Matrix starts from 0x2b94b2f10010
0 matrixSize=1500000
Thread 1 launched.
Thread 2 launched.
Thread 3 launched.
Selecting device 1.
Indexed from [250, 500)
Matrix starts from 0x2b94b307e370
1 matrixSize=1500000
Selecting device 2.
Indexed from [500, 750)
Matrix starts from 0x2b94b31ec6d0
2 matrixSize=1500000
Selecting device 3.
Indexed from [750, 1000)
Matrix starts from 0x2b94b335aa30
3 matrixSize=1500000
Thread 4 launched.
Selecting device 0.
Indexed from [1000, 1250)
Matrix starts from 0x2b94b34c8d90
4 matrixSize=1500000
Thread 5 launched.
Selecting device 1.
Indexed from [1250, 1500)
Matrix starts from 0x2b94b36370f0
5 matrixSize=1500000
Enter two points [0, 1500) : 0 1
===Point 0===
9.383 2.09 0.27 9.932 9.613
===Point 1===
2.853 3.923 4.684 1.246 5.739
Distance(1,2): 12.4875
Distance(2,1): 12.4875
Enter two points [0, 1500) : 0 0
===Point 0===
9.383 2.09 0.27 9.932 9.613
===Point 0===
9.383 2.09 0.27 9.932 9.613
Distance(1,2): 0
Distance(2,1): 0
Enter two points [0, 1500) : 249 248
===Point 249===
6.229 2.482 5.915 3.207 4.232
===Point 248===
7.877 9.524 9.474 7.973 5.776
Distance(1,2): 9.49057
Distance(2,1): 9.49057
Enter two points [0, 1500) : 249 250
===Point 249===
6.229 2.482 5.915 3.207 4.232
===Point 250===
6.413 7.672 1.536 6.651 5.354
Distance(1,2): 7.69842 (This first one is correct because the point i,j is computed twice. Once for i,j and once for j,i Since 249 is in the first thread [249,250] was computed but since 250 isn’t, [250,249] was not computed.)
Distance(2,1): 0
Enter two points [0, 1500) : 1000 25
===Point 1000===
8.776 2.261 6.272 8.468 7.603
===Point 25===
8.335 9.107 8.363 8.86 7.667
Distance(1,2): 0
Distance(2,1): 7.18277 (This is the same as above. [25,1000] was computed, [1000,25] was not)
Enter two points [0, 1500) : 1000 1001
===Point 1000===
8.776 2.261 6.272 8.468 7.603
===Point 1001===
7.457 4.536 1.786 7.744 6.052
Distance(1,2): 0
Distance(2,1): 0
Enter two points [0, 1500) : -1 -1[/codebox]