Hello, I am new again in this and I would like you to help me with a query, in my code I am taking a matrix to a function called “solver” that applies Gauss-Seidel for iterations, now I am at the point of passing it to the format of cuda, when I enter a matrix value of 2 it seems to work fine but when I enter a greater number than 3 I no longer print anything and the execution time of the kernel is 0.

I do not know where it is failing or what I am doing wrong, this is the total code that I have implemented so far, if someone can help me as I must modify the solver function to work correctly I would appreciate it.

```
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define MAX_ITER 1000000
#define MAX 100 //maximum value of the matrix element
#define TOL 0.000001
// Generate a random float number with the maximum value of max
float rand_float(int max){
return ((float)rand()/(float)(RAND_MAX)) * max;
}
// Allocate 2D matrix
void allocate_init_2Dmatrix(float ***mat, int n, int m){
int i, j;
*mat = (float **) malloc(n * sizeof(float *));
for(i = 0; i < n; i++) {
(*mat)[i] = (float *)malloc(m * sizeof(float));
for (j = 0; j < m; j++)
(*mat)[i][j] = rand_float(MAX);
}
}
// solver
__global__ void solver(float **matd, int n, int m){
float diff = 0, temp;
int done = 0, cnt_iter = 0;
int j= blockIdx.y*blockDim.y + threadIdx.y;
int i= blockIdx.x*blockDim.x + threadIdx.x;
while (!done && (cnt_iter < MAX_ITER)){
diff = 0;
if( i > 0 && j > 0 && (i=1) < (n-1) && (j=1) <(m-1)){
temp = (matd)[i][j];
printf("temp[%d][%d]: %f\n",i,j,temp);
(matd)[i][j] = 0.2 * ((matd)[i][j] + (matd)[i][j-1 ] + (matd)[i-1 ][j] + (matd)[i][j + 1] + (matd)[i + 1][j]);
diff += abs((matd)[i][j] - temp);
// printf("diff:%f\n",diff);
}
if (diff/n/n < TOL)
done = 1;
cnt_iter ++;
}
if (done)
printf("Solver converged after %d iterations\n", cnt_iter);
else
printf("Solver not converged after %d iterations\n", cnt_iter);
}
int main(int argc, char *argv[]) {
int n;
float **a,**ad;
struct timeval start, end,start1, end1;
double mtime, seconds, useconds,x,mtime1, seconds1, useconds1,y;
gettimeofday(&start, NULL);
dim3 DimGrid(2,2);
dim3 DimBlock(2,2);
cudaEvent_t startcuda;
cudaEvent_t stopcuda;
if (argc < 2) {
printf("Call this program with two parameters: matrix_size communication \n");
printf("\t matrix_size: Add 2 to a power of 2 (e.g. : 18, 1026)\n");
exit(1);
}
n = atoi(argv[1]);
// float **temp;
float **tem;
printf("Matrix size = %d\n", n);
allocate_init_2Dmatrix(&a, n, n);
// Allocate 2D array in Device
cudaMalloc(&ad, n*sizeof(float *));
for(int i=0;i<n;i++){
cudaMalloc(&tem[i],n*sizeof(float));
cudaMemcpy(tem[i],a[i],n*sizeof(float),cudaMemcpyHostToDevice);
}
cudaMemcpy(ad,tem,n*n*sizeof(float),cudaMemcpyHostToDevice);
udaEventCreate(&startcuda);
cudaEventCreate(&stopcuda);
gettimeofday(&start1, NULL);
cudaEventRecord(startcuda,0);
solver<<<DimGrid, DimBlock>>>(ad, n, n);
cudaEventRecord(stopcuda,0);
/* for (int i = 0; i < n; i++){
cudaMemcpy(a[i],temp[i],n*sizeof(float),cudaMemcpyHostToDevice);
}*/
cudaFree(ad);
cudaEventSynchronize(stopcuda);
float tiempocuda;
cudaEventElapsedTime(&tiempocuda,startcuda,stopcuda);
printf("valor tiempo kernel:%f milisegundos\n",tiempocuda);
gettimeofday(&end1, NULL);
seconds1 = end1.tv_sec - start1.tv_sec;
useconds1 = end1.tv_usec - start1.tv_usec;
mtime1 = ((seconds1)*1000+ useconds1/1000);
y=mtime1/1000;
printf("\nTiempo calculo de funcion solver es: %g segundos", y);
gettimeofday(&end, NULL);
seconds = end.tv_sec - start.tv_sec;
useconds = end.tv_usec - start.tv_usec;
mtime = ((seconds)*1000+ useconds/1000);
x=mtime/1000;
printf("\nTiempo total de programa: %g segundos\n", x);
return 0;
}
```