Fixed. The problem is obvious now that I think about it, but the fix reduces the speed of the algorithm -_- It is still MUCH faster than the CPU version, though.

PREVIOUS:

```
// PREVIOUS (broken) version: initialization and all |V| - 1 relaxation rounds
// run inside a single kernel launch.
//   V, E  - number of vertices / number of edges
//   edges - edge array; entries 0..E-1 are read (fields src, dest, weight)
//   dist  - distance array; entries 0..V-1 are set to INT_MAX, then relaxed
//   src   - index of the source vertex, whose distance is set to 0
// Why it is broken: the only barrier used is __syncthreads(), which
// synchronizes the threads of ONE block. Nothing orders the steps below
// between different blocks of the grid.
__global__ void BellmanFord(int V, int E, struct Edge* edges, int *dist, int src) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Step 1: Initialize distances from src to all other vertices
// as INFINITE. While a little wasteful, V will always be smaller than E
// (so some threads have no vertex to initialize).
for (int i = index; i < V; i += stride)
dist[i] = INT_MAX;
// Every thread performs this store. There is no barrier between Step 1 and
// Step 2, so a thread can start relaxing edges while another thread is still
// writing INT_MAX into dist, overwriting an already relaxed distance.
dist[src] = 0;
// Step 2: Relax all edges |V| - 1 times. A simple shortest
// path from src to any other vertex can have at-most |V| - 1
// edges
for (int i = 1; i <= V - 1; i++) {
// Each thread relaxes edges index, index + stride, index + 2 * stride, ...
for (int j = index; j < E; j += stride) {
int u = edges[j].src;
int v = edges[j].dest;
int weight = edges[j].weight;
// NOTE: The issue here is that two threads might try to write to the
// same memory location (at the same time). To avoid this we should use atomics.
if (dist[u] != INT_MAX && dist[u] + weight < dist[v])
atomicMin(&dist[v], dist[u] + weight);
}
// Block-level barrier only: threads in other blocks may be in a different
// round (or may not have started), so rounds are not separated grid-wide.
__syncthreads();
}
return;
}
```

NEW:

```
// Initialize the distance array on the device. The third argument is
// presumably the source vertex index (SetupDist is defined elsewhere).
SetupDist<<<numBlocksV, blockSize>>>(V, d_dist, 0);
// One BellmanFord launch per relaxation round. No stream argument is given, so
// every launch goes to the default stream, and kernels in the same stream
// execute in issue order: round k finishes before round k + 1 starts. That
// already gives the grid-wide ordering between rounds, so a host-side
// cudaDeviceSynchronize() after each launch is not needed and only adds
// per-round latency.
for (int round = 1; round <= V - 1; round++) {
    BellmanFord<<<numBlocksE, blockSize>>>(V, E, d_edges, d_dist);
}
// Single synchronization point: wait for all rounds before d_dist is used.
// Its cudaError_t return value also reports any asynchronous kernel failure
// and should be checked with whatever error handling the program uses.
cudaDeviceSynchronize();
// Performs ONE relaxation pass of Bellman-Ford over all E edges.
// The host launches this kernel |V| - 1 times; each launch completes before the
// next one starts, which separates the passes across the whole grid (something
// __syncthreads() inside a single kernel cannot do, as it is block-scoped).
//   V     - number of vertices (not used by this kernel; kept in the signature
//           so existing launches keep compiling)
//   E     - number of edges
//   edges - edge array; entries 0..E-1 are read (fields src, dest, weight)
//   dist  - tentative distances indexed by vertex; INT_MAX is treated as
//           "not reached yet". Must be initialized before the first pass
//           (presumably by SetupDist).
// Any grid size works: each thread handles edges first, first + step, ...
__global__ void BellmanFord(int V, int E, struct Edge* edges, int *dist) {
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int step = blockDim.x * gridDim.x;

    for (int j = first; j < E; j += step) {
        const int u = edges[j].src;
        const int v = edges[j].dest;
        const int weight = edges[j].weight;

        // Read dist[u] exactly once, so the value that passes the INT_MAX test
        // is the same value used to build the candidate distance.
        const int du = dist[u];
        if (du == INT_MAX)
            continue;  // u not reached yet; du + weight would be meaningless

        const int candidate = du + weight;

        // Several threads may relax edges into the same vertex v at the same
        // time, so the update itself must be atomic. The plain comparison is
        // only a cheap filter that skips the atomic when there is no improvement.
        if (candidate < dist[v])
            atomicMin(&dist[v], candidate);
    }
}
```

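Just like you said for step 1, step 2 has the same issue: I was iterating sequentially over the |V| - 1 rounds inside device code, where `__syncthreads()` only synchronizes the threads of a single block, not the whole grid.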