CUDA error 700 ?? cudaDeviceSynchronize returned error code 700

Hello

Im new programing in cuda and one of my first proyects give´s me this error:

cudaDeviceSynchronize returned error code 700 after launching mult! (mult is my global function´s name)

i´ve searched in some foros and it tells that is drivers problem but i´m not sure, if anyone could help me i really dont know how to solve it
Here is my code:
At the moment just works with square matrices

#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

global void mult(int **matriz2d1, int **matriz2d2, int **matriz2d3, int nfil, int ncol, int size) { // Realiza la multiplicacion asignando valores a la mariz 3
int temp = 0, c2;
int f = blockDim.y * blockIdx.y + threadIdx.y; //for (int f = 0; f <= fil - 1; f += 1) equivalencia en for
int c = blockDim.x * blockIdx.x + threadIdx.x; //for (int c = 0; c <= col - 1; c += 1)
if (f<nfil && c<ncol) {
//if (f*c < size){
for (c2 = 0; c2 <= ncol - 1; c2 += 1) {
temp += matriz2d1[f][c2] * matriz2d2[c2][c];
}
matriz2d3[f][c] = temp;
}
}

void impresion(int** matrizn, int nfil, int ncol) {//imprime la matriz
for (int f = 0; f <= nfil - 1; f += 1) {
for (int c = 0; c <= ncol - 1; c += 1) {
printf(“%d “, matrizn[f][c]);
}
printf(”\n”);
}
}

cudaError_t cargar(int **matriz2d1, int **matriz2d2, int **matriz2d3, int nfil, int ncol, int size) {// sube las matrices a los buses de los gpu
int **dev_a = nullptr;
int **dev_b = nullptr;
int **dev_c = nullptr;
dim3 num_fil(nfil / nfil);
dim3 num_col(nfil, ncol);
cudaError_t cudaStatus;

cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
	goto Error;
}

cudaStatus = cudaMalloc((void **)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMalloc failed!");
	goto Error;
}
cudaStatus = cudaMalloc((void **)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMalloc failed!");
	goto Error;
}
cudaStatus = cudaMalloc((void **)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMalloc failed!");
	goto Error;
}

cudaStatus = cudaMemcpy(dev_a, matriz2d1, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");
	goto Error;
}
cudaStatus = cudaMemcpy(dev_b, matriz2d2, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");
	goto Error;
}

mult << < num_fil, num_col >> >(dev_a, dev_b, dev_c, nfil, ncol, size);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
	goto Error;
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "\ncudaDeviceSynchronize returned error code %d after launching mult!\n\n", cudaStatus);
	goto Error;
}

cudaStatus = cudaMemcpy(matriz2d3, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");
	goto Error;
}

Error:
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return cudaStatus;
}

int main() {
int matriz2d1, matriz2d2, matriz2d3;
int nfil, ncol;
srand(time(NULL));
printf(“ingrese el numero de filas: “);
scanf(”%d”, &nfil);
printf(“ingrese el numero de columnas: “);
scanf(”%d”, &ncol);
while (nfil <= 0) {
printf(“el numero ingresado no puede ser menor o igual a cero ingrese otro: “);
scanf(”%d”, &nfil);
}
while (ncol <= 0) {
printf(“el numero ingresado no puede ser menor o igual a cero ingrese otro: “);
scanf(”%d”, &ncol);
}
int size = nfil * ncol;
//llenado de la matriz 1
matriz2d1 = (int
)malloc(nfil * sizeof(int
));
for (int i = 0; i <= nfil - 1; i++) {
matriz2d1[i] = (int
)malloc(nfil * sizeof(int));
}
for (int f = 0; f <= nfil - 1; f += 1) {
for (int c = 0; c <= ncol - 1; c += 1) {
matriz2d1[f][c] = 1 + rand() % (11 - 1);
}
}
printf(“Datos Matriz 1: \n”); //imprime matriz1
impresion(matriz2d1, nfil, ncol);
//////llenado de la matriz 2//////////////////////////////////
matriz2d2 = (int
)malloc(nfil * sizeof(int*));
for (int i = 0; i <= nfil - 1; i++) {
matriz2d2[i] = (int*)malloc(nfil * sizeof(int));
}
for (int f = 0; f <= nfil - 1; f += 1) {
for (int c = 0; c <= ncol - 1; c += 1) {
matriz2d2[f][c] = 1 + rand() % (11 - 1);
}
}
printf(“\nDatos Matriz 2: \n”); //imprime matriz2
impresion(matriz2d2, nfil, ncol);
//////inicializando 3/////////////////////////////////////////
matriz2d3 = (int**)malloc(nfil * sizeof(int*));
for (int i = 0; i <= nfil - 1; i++) {
matriz2d3[i] = (int*)malloc(nfil * sizeof(int));
}
for (int f = 0; f <= nfil - 1; f += 1) {
for (int c = 0; c <= ncol - 1; c += 1) {
matriz2d2[f][c] = 1 + rand() % (11 - 1);
}
}
printf(“\nDatos Matriz 3: \n”); //imprime matriz3
impresion(matriz2d3, nfil, ncol);
/////Llamada al kernel////////////////////////////////////////
cudaError_t cudaStatus = cargar(matriz2d1, matriz2d2, matriz2d3, nfil, ncol, size);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, “mult failed!\n\n”);
return 1;
}
//cargar(matriz2d1, matriz2d2, matriz2d3, nfil, ncol, size); /Carga de archivos llenado de matriz 3/
printf(“\nDatos Matriz 3: \n”);
impresion(matriz2d3, nfil, ncol); //imprime la multiplicación
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, “cudaDeviceReset failed!”);
return 1;
}
free(matriz2d1);
free(matriz2d2);
free(matriz2d3);
return 0;
}

Error 700 is cudaErrorIllegalAddress

(you can discover this yourself by passing the error code through cudaGetErrorString())

It means your kernel is making an illegal, out-of-bounds access.

That is a defect in your code and needs to be debugged. It is not a problem with your GPU, driver, or CUDA setup.

If your kernel is taking a long time, the first thing you want to do is rule out the possibility of a WDDM TDR timeout (google that).

Once you have ruled that out, then debug your code to find out why you are making an illegal access. The method described here:

[url]cuda - Unspecified launch failure on Memcpy - Stack Overflow

may be a good starting point.

Your attempt to use double pointer (int **matriz2d1) is broken. To pick just one example:

  • taking a double pointer:

int **dev_a = nullptr;

  • and the address of it, creates a triple-pointer. Then casting that triple-pointer to a double-pointer:

cudaStatus = cudaMalloc((void **)&dev_a

is not going to give you the correct behavior as a C/C++ programmer. That is a broken programming strategy, not just in CUDA, but in any usage of C/C++.

Sorting this out is fairly complicated.

If you are a beginner, the general recommendation would be to flatten your arrays, and reference them using a single pointer.

You might wish to study the matrix multiply example given in the programming guide:

[url]Programming Guide :: CUDA Toolkit Documentation

You will see there that the matrix pointers are “flattened” and so access looks like this:

C.elements[row * C.width + col] = Cvalue;

instead of like this:

matriz2d3[f][c] = temp;