Hi, i´ám new at programming in CUDA

The following code using 2 gpu can´t provide a better speedup than with one gpu:

```
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <cuda.h>
#include <omp.h>
#define BLOCK_SIZE 32
__global__ void populationMatrix2D(double *a, int rows, int jsta, int jend) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < rows && j >= jsta - 1 && j < jend)
a[i + j * rows] = (i + j + 2) * 1.;
}
__global__ void kernel(double *a, double *c, int m, int n, int jsta2, int jend2, int dx, int dz) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
double sx, sz;
if (j >= jsta2 - 1 && j < jend2 && i >= 1 && i < (m - 1)) {
sx = a[(i - 1) + j * n] + a[(i + 1) + j * n] + 2 * a[i + j * n];
sz = a[i + (j - 1) * n] + a[i + (j + 1) * n] + 2 * a[i + j * n];
c[i + j * n] = (sx / (dx * dx)) + (sz / (dz * dz));
}
}
void showMatrix(double *a, int n) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
printf("%1.2f\t", a[i + j * n]);
}
printf("\n");
}
printf("\n");
}
void PARA_RANGE(int n1,int n2, int nprocs, int myid, int jsta, int jend, int *vector_return){
int iwork1 = (n2 - n1 + 1) / nprocs;
int iwork2 = (n2 - n1 + 1) % nprocs;
jsta = (myid * iwork1) + n1 + fmin((double)myid, (double)iwork2);
jend = jsta + iwork1 - 1;
if (iwork2 > myid)
jend = jend + 1;
vector_return[0] = jsta;
vector_return[1] = jend;
}
int main(int argc, char *argv[]) {
int rows = atoi(argv[1]);
int columns = rows;
int myid = 0;
int nprocs = 4; // Changed to 4 for 4 GPUs
double dx = 1, dz = 1;
int jsta = 1, jend = 1, jsta2, jend2;
int *vector_return = (int *) calloc (2, sizeof(int));
double t1, t2;
jsta2 = 2;
jend2 = columns - 1;
PARA_RANGE(1, rows, nprocs, myid, jsta, jend, vector_return);
jsta = vector_return[0];
jend = vector_return[1];
double *a, *c;
t1 = omp_get_wtime();
cudaMallocManaged(&a, rows * columns * sizeof(double));
cudaMallocManaged(&c, rows * columns * sizeof(double));
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid((rows + dimBlock.x - 1) / dimBlock.x, (columns + dimBlock.y - 1) / dimBlock.y);
// Loop over GPUs
for (int i = 0; i < nprocs; i++) {
cudaSetDevice(i);
int jsta_gpu = 0, jend_gpu = 0; // Initialize variables
PARA_RANGE(1, rows, nprocs, i, jsta_gpu, jend_gpu, vector_return);
jsta_gpu = vector_return[0];
jend_gpu = vector_return[1];
populationMatrix2D<<<dimGrid, dimBlock>>>(a, rows, jsta_gpu, jend_gpu);
kernel<<<dimGrid, dimBlock>>>(a, c, rows, columns, jsta2, jend2, dx, dz);
}
cudaDeviceSynchronize();
t2 = omp_get_wtime();
printf("%d x %d \t%1.3f\n",rows, columns, t2-t1);
cudaFree(a);
cudaFree(c);
return 0;
}
```

Can anyone help?