Well Jamie K, thanks a lot for remarking that there was no race condition when it seem it was. About coping the data to the weight matrix, I made a test in debug mode printing all the values that I previously created before, randomly just for testing purpose, and they were all there. I am pasting some of my code now here, but I will upload a test project that I am right now translating from Spanish so you can all test it.
This is what I am trying to do: the secuencial Floyd algorithm stores the length of the paths of a directed graph in a matrix where weightMatrix[v][w] contains the path length of going from vertex v to vertex w. The algorithms has a triple nested loop testing for all the paths which vertex is the best intermediary, in other words if is better going straight from v to w that going from v to u and from u to w, in the latest case he update the path length and the intermediary vertex, code is like this:
[codebox]for (int u=0;u < vertexCount;u++)
for (int v=0;v < vertexCount;v++)
for (int w=0;w < vertexCount;w++)
{
if(weightMatrix[v][w]>(weightMatrix[v][u] + weightMatrix[u][w]))
{
weightMatrix[v][w] = weightMatrix[v][u] + weightMatrix[u][w];
intermediaryMatrix[v][w] = u;
}
}[/codebox]
I only create a matrix in 1D and do all the calculations in parallel by changing the intermediary from one kernel call to another, almost like Floyd, so:
Creating random matrix for testing purpose so I can check if I am doing the correct calculations having several distinct cases:
[codebox]float initialValue = 1;
float *matrix = new float[n];
for (int i=0; i < n; i++)
matrix[i] = new float[n];
for(int i=0; i<n; i++)
for(int j=0; j<n; j++)
{
if(i==j)
matrix[i][j]= 0;
else
matrix[i][j]= initialValue;
}
for(int i = 0; i < n; i++)
for(int j = 0; j < n; j++)
{
if(matrix[i][j]==initialValue && matrix[i][j] != 0)
{
float value = (float)rand()/n;
if(value > 0)
matrix[i][j] = value;
else
matrix[i][j] = (float)(i + j);
}
}
[/codebox]
Creating 1D matrix, coping to device and executing the kernel:
[codebox]//N: number of vertices in the graph
float h_weightMatrix1D = new float[NN];
int h_intermMatrix1D = new int[NN];
for (int i=0; i < N; i++)
for (int j=0; j < N; j++)
{
h_weightMatrix1D[i*N + j] = matrix[i][j];
//since all paths are now direct
//they have no intermediary
h_intermMatrix1D[i*N + j] = 0;
}
//both sizes may be the same
//but I have them apart any way
unsigned int size_Mem_int = sizeof(int) * N * N;
unsigned int size_Mem_float = sizeof(float) * N * N;
int *d_intermediaries = NULL;
float *d_weight = NULL;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_intermediaries, size_Mem_int));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_weight, size_Mem_float));
CUDA_SAFE_CALL(cudaMemcpy(d_intermediaries, h_intermMatrix1D, size_Mem_int, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_weight, h_weightMatrix1D, size_Mem_float, cudaMemcpyHostToDevice));
dimBlock.x = min(16,N);
dimBlock.y = dimBlock.x;
dimGrid.x = (int)ceil((float)N/dimBlock.x);
dimGrid.y = dimGrid.x;
for(int u = 0; u < N ; u++)
{
floyd<<< dimGrid, dimBlock >>>(d_weight, d_intermediaries, N,u);
checkCUDAError(“Error executing kernel\n”);
}
unsigned int timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));
//right now I only care about the best
//path not the weight of it, so I just
//keep the intermediarie matrix
CUDA_SAFE_CALL(cudaMemcpy(h_intermMatrix1D, d_intermediaries, size_Mem_int, cudaMemcpyDeviceToHost));
CUT_SAFE_CALL(cutStopTimer(timer));
printf(“GPU processing time: %f (ms) \n”, cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));
//create 2D matrix for third party user
int** intermMatrix = new int*[N];
for (unsigned int i=0; i < N; i++)
{
intermMatrix[i] = new int[N];
for(unsigned int j = 0; j < N; j++)
{
intermMatrix[i][j] = intermMatrix1D[i*N + j];
}
}
CUDA_SAFE_CALL(cudaFree(d_intermediaries));
CUDA_SAFE_CALL(cudaFree(d_weight));
return intermMatrix;
[/codebox]
I will compile this in other test project for you to run it.
Hope you can light me up! thanks a lot everybody!
