Well Jamie K, thanks a lot for remarking that there was no race condition when it seem it was. About coping the data to the weight matrix, I made a test in debug mode printing all the values that I previously created before, randomly just for testing purpose, and they were all there. I am pasting some of my code now here, but I will upload a test project that I am right now translating from Spanish so you can all test it.

This is what I am trying to do: the secuencial Floyd algorithm stores the length of the paths of a directed graph in a matrix where weightMatrix[v][w] contains the path length of going from vertex v to vertex w. The algorithms has a triple nested loop testing for all the paths which vertex is the best intermediary, in other words if is better going straight from v to w that going from v to u and from u to w, in the latest case he update the path length and the intermediary vertex, code is like this:

[codebox]for (int u=0;u < vertexCount;u++)

for (int v=0;v < vertexCount;v++)

```
for (int w=0;w < vertexCount;w++)
{
if(weightMatrix[v][w]>(weightMatrix[v][u] + weightMatrix[u][w]))
{
weightMatrix[v][w] = weightMatrix[v][u] + weightMatrix[u][w];
intermediaryMatrix[v][w] = u;
}
}[/codebox]
```

I only create a matrix in 1D and do all the calculations in parallel by changing the intermediary from one kernel call to another, almost like Floyd, so:

Creating random matrix for testing purpose so I can check if I am doing the correct calculations having several distinct cases:

[codebox]float initialValue = 1;

float **matrix = new float*[n];

for (int i=0; i < n; i++)

matrix[i] = new float[n];

for(int i=0; i<n; i++)

for(int j=0; j<n; j++)

{

if(i==j)

matrix[i][j]= 0;

else

matrix[i][j]= initialValue;

}

for(int i = 0; i < n; i++)

for(int j = 0; j < n; j++)

{

if(matrix[i][j]==initialValue && matrix[i][j] != 0)

{

float value = (float)rand()/n;

if(value > 0)

```
matrix[i][j] = value;
```

else

```
matrix[i][j] = (float)(i + j);
```

}

}

[/codebox]

Creating 1D matrix, coping to device and executing the kernel:

[codebox]//N: number of vertices in the graph

float *h_weightMatrix1D = new float[N*N];

int *h_intermMatrix1D = new int[N*N];

for (int i=0; i < N; i++)

for (int j=0; j < N; j++)

{

h_weightMatrix1D[i*N + j] = matrix[i][j];

//since all paths are now direct

//they have no intermediary

h_intermMatrix1D[i*N + j] = 0;

}

//both sizes may be the same

//but I have them apart any way

unsigned int size_Mem_int = sizeof(int) * N * N;

unsigned int size_Mem_float = sizeof(float) * N * N;

int *d_intermediaries = NULL;

float *d_weight = NULL;

CUDA_SAFE_CALL(cudaMalloc((void**) &d_intermediaries, size_Mem_int));

CUDA_SAFE_CALL(cudaMalloc((void**) &d_weight, size_Mem_float));

CUDA_SAFE_CALL(cudaMemcpy(d_intermediaries, h_intermMatrix1D, size_Mem_int, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(d_weight, h_weightMatrix1D, size_Mem_float, cudaMemcpyHostToDevice));

dimBlock.x = min(16,N);

dimBlock.y = dimBlock.x;

dimGrid.x = (int)ceil((float)N/dimBlock.x);

dimGrid.y = dimGrid.x;

for(int u = 0; u < N ; u++)

{

floyd<<< dimGrid, dimBlock >>>(d_weight, d_intermediaries, N,u);

checkCUDAError(“Error executing kernel\n”);

}

unsigned int timer = 0;

CUT_SAFE_CALL(cutCreateTimer(&timer));

CUT_SAFE_CALL(cutStartTimer(timer));

//right now I only care about the best

//path not the weight of it, so I just

//keep the intermediarie matrix

CUDA_SAFE_CALL(cudaMemcpy(h_intermMatrix1D, d_intermediaries, size_Mem_int, cudaMemcpyDeviceToHost));

CUT_SAFE_CALL(cutStopTimer(timer));

printf(“GPU processing time: %f (ms) \n”, cutGetTimerValue(timer));

CUT_SAFE_CALL(cutDeleteTimer(timer));

//create 2D matrix for third party user

int** intermMatrix = new int*[N];

for (unsigned int i=0; i < N; i++)

{

intermMatrix[i] = new int[N];

for(unsigned int j = 0; j < N; j++)

{

intermMatrix[i][j] = intermMatrix1D[i*N + j];

}

}

CUDA_SAFE_CALL(cudaFree(d_intermediaries));

CUDA_SAFE_CALL(cudaFree(d_weight));

return intermMatrix;

[/codebox]

I will compile this in other test project for you to run it.

Hope you can light me up! thanks a lot everybody!