Error 719 on BigInteger calculation

doscar.sole · August 17, 2020, 6:25pm

Hello.
I’m trying to port a Big Integer architecture I wrote on C to CUDA, and I’m getting a 719 error on “random” situations.

I may assume it’s an issue with memory, because when testing, I get the error sooner or later depending on the integer length.

To sum up, my main test code is the following

//BITesting.cu//
int main() {
  /*************************** TESTING v2 INI ***************************/
  struct BigInteger* a = (struct BigInteger*)malloc(sizeof(struct BigInteger));
  //char* s = "1111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000";
  char* s = "1";
  //char* s = "123";
  int i = 0;

  newBI_testing(a, s, 0);

  printf("<");

  for (; i <= a->count; i++)
    printf("%i", a->n[i]);

  printf(">\n");

  printf("Ending...\n");
  /*************************** TESTING v2 END ***************************/

  return 0;
}

Then, the main code will run the test:

/*
 * newBI_testing
 *
 * Preámbulo para pruebas
 */
void newBI_testing(void* dst, char* s, int sig) {
  //variables para CUDA
  struct BigInteger* cu_dst;
  struct BigInteger* cu_b;
  char* cu_s;
  int* cu_sig;

  //allocamos memoria
  cudaMalloc((void**)& cu_dst, sizeof(struct BigInteger));
  cudaMalloc((void**)& cu_b, sizeof(struct BigInteger));
  cudaMalloc((void**)& cu_s, sizeof(char) * strlen(s));
  cudaMalloc((void**)& cu_sig, sizeof(int));

  //copiamos s a cu_s y sig a cu_sig
  cudaMemcpy(cu_s, s, sizeof(char)* strlen(s), cudaMemcpyHostToDevice);
  cudaMemcpy(cu_sig, &sig, sizeof(char)* strlen(s), cudaMemcpyHostToDevice);

  printf("START: %d\n", clock());
  //llamamos a CUDA
  _newBI_testing<<<1, 1>>>(cu_dst, cu_s, cu_sig, cu_b);

  //printf("Last Error: %i\n", cudaGetLastError());

  //sincronizamos
  cudaDeviceSynchronize();
  
  printf("END: %d\n", clock());
  printf("Last Error: %i\n", cudaGetLastError());
  //printf("Last String Error: %s\n", cudaGetErrorString(cudaGetLastError()));

  //copiamos de vuelta
  cudaMemcpy(dst, cu_dst, sizeof(struct BigInteger), cudaMemcpyDeviceToHost);
} 

/*
 * _newBI_testing
 *
 * Lanzadera para _newBI
 */
__global__ static void _newBI_testing(void* dst, char* s, int* sig, void* b) {
  _newBI(dst, s, sig);
  _newBI(b, s, sig);

  int i = 0;
  
  for (; i < 1000; i++) {
    printf("I: %i\n", i);
    _add(dst, b);
  }
}

It calls “_newBI” function twice (a function that creates a BigInteger structure, basically, an integer-array representation for the given string, then make “_add” (that just makes “dst = dst + b”) a couple of times.

With this setting, I got the 719 error when “i = 354”.

BigInteger struct:

//struct
struct BigInteger {
  int count;
  int n[4096];
};

_newBI function

__device__ int BLOCK_SIZE = 256;
__device__ int NUM_BLOCKS = 16; /*(CU_MAX_LENGTH + BLOCK_SIZE - 1) / BLOCK_SIZE;*/

/*
 * Función _newBI.
 *
 * Genera un nuevo dato BI a partir del string que recibe como entrada.
 * Se cargan en orden inverso para permitir el crecimiento de manera sencilla.
 */
__device__ void _newBI(void* dst, char* s, int* sig){
  //longitud del string
  int i = cu_strlen(s) - 1;

  //limpiamos el array
  _clean<<<NUM_BLOCKS, BLOCK_SIZE>>>(dst);

  //validamos que  no sobrepase el límite establecido
  if (i > CU_MAX_LENGTH) {
    _showError(1);

    return;
  }

  //semáforo para clean.
  //mientras tanto, se puede ir validando la longitud (no afecta a ret)
  cudaDeviceSynchronize();

  //recorremos el string y lo guardamos en integers
  newBI_fill<<<NUM_BLOCKS, BLOCK_SIZE >>>(dst, s, i);

  //ajustamos la longitud
  ((struct BigInteger*)dst)->count = i;

  //semáforo para newBI_fill
  //mientras tanto, vamos ajustando la longitud de ret
  cudaDeviceSynchronize();

  //¿ha habido algún error en newBI_fill?
  if (CU_RET > 0)
    return;

  //validamos signo
  if (*sig == -1)
    ((struct BigInteger*)dst)->n[((struct BigInteger*)dst)->count] *= -1;
}

/*
 * Función _newBI_fill.
 *
 * Rellena a.n de manera paralela a partir del string s de manera inversa
 * usando len como punto medio
 */
__global__ static void newBI_fill(void* va, char* s, int len) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  int i = 0;
  char c;

  for (i = index; i <= len; i += stride) {
    if (i > len) 
      //puede que len no sea divisible por 16 y por tanto debemos controlar el final
      return;

    //vamos capturando los caracteres de manera inversa
    c = (int)(s[len - i] - 48);

    if (c >= 0 && c <= 9) 
      ((struct BigInteger*)va)->n[i] = c;
    else {
      _showError(2);

      return;
    }
  }
}

/*
 * Función cu_strlen.
 *
 * Sinónimo de strlen C
 */
__device__ static int cu_strlen(char* s) {
  int ret = 0;

  while (*s++ != '\0') 
    ++ret;

  return ret;
}

/*
 * Función _clean.
 *
 * Limpia la estructura
 */
__global__ static void _clean(void* va){
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  int i = 0;

  for (i = index; i < CU_MAX_LENGTH; i += stride) 
    ((struct BigInteger*)va)->n[i] = 0;
}

So, as I explained, it moves (on backwards order) each char to an int representation on the struct, using paralelysm.

I had to “copy” strlen function as it does not work on kernel, but not big deal.

Then, on the “_add” function:

/*
 * Función _add. Usar para sumar dos números.
 *
 * Realiza la operación de suma, teniendo en cuenta los signos de los números.
 *
 * Si los signos son iguales, hace una suma, sino, una resta.
 */
__device__ void _add(void* va, void* vb) {
  //validamos los datos antes de tratarlos
  _validateBI<<<1, 1>>>(va);
  _validateBI<<<1, 1>>>(vb);

  //semáforo
  cudaDeviceSynchronize();

  //¿Ha habido algún error?
  if (CU_RET > 0)
    return;

  //delegamos en la función estática
  _pAdd(va, vb);
}

/*
 * Función _pAdd. Usar para sumar dos números.
 *
 * Realiza la operación de suma, teniendo en cuenta los signos de los números.
 *
 * Si los signos son iguales, hace una suma, sino, una resta.
 */
__device__ static void _pAdd(void* va, void* vb) {
  //calcumamos signum
  int sig = _signum(((struct BigInteger*)va)->n[((struct BigInteger*)va)->count], 
           ((struct BigInteger*)vb)->n[((struct BigInteger*)vb)->count]);

  //normalizamos los operandos
  if (sig == 10)
    //a negativo, b positivo. Cambiamos el signo de "a" y hacemos suma
    ((struct BigInteger*)va)->n[((struct BigInteger*)va)->count] *= -1;
  else if (sig == 1)
    //b negativo, a positivo. Cambiamos el signo de "b" y hacemos suma
    ((struct BigInteger*)vb)->n[((struct BigInteger*)vb)->count] *= -1;
  else if (sig == 11) {
    //a negativo, b negativo. Cambiamos signos y hacemos suma
    ((struct BigInteger*)va)->n[((struct BigInteger*)va)->count] *= -1;
    ((struct BigInteger*)vb)->n[((struct BigInteger*)vb)->count] *= -1;
  }

  //si ambos signos son iguales, se suma, sino, se resta
  if (sig == 0 || sig == 11)
    _addition(va, vb);
  else
    _pSub(va, vb);

  if (sig == 10 || sig == 11)
    //en estos casos, siempre se le va la vuelta al signo
    ((struct BigInteger*)va)->n[((struct BigInteger*)va)->count] *= -1;
}

/*
 * Función _addition.
 *
 * Simula la operación a = a + b
 */
__device__ static void _addition(void* va, void* vb) {
  int limit;
  int min;
  int swap;
  int move;

  //asumimos que a tiene la mayor longitud
  limit = ((struct BigInteger*)va)->count;

  //asumimos que b tiene la menor longitud
  min = ((struct BigInteger*)vb)->count;

  //indicador de necesidad de arrastre
  move = 0;

  //si no es así, rectificamos
  if (((struct BigInteger*)vb)->count > limit) {
    //intercambiamos limit y min
    swap = limit;
    limit = min;
    min = swap;

    move = 1;
  }

  //sumamos los dígitos que coinciden
  _addition_merge<<<NUM_BLOCKS, BLOCK_SIZE>>>(va, vb, min);

  //los dígitos que no coinciden los traspasamos
  if (move == 1) {
    _addition_move <<<NUM_BLOCKS, BLOCK_SIZE >>>(va, vb, min + 1, limit);

    ((struct BigInteger*)va)->count = limit;
  }

  //Esperamos que las dos funciones terminen
  cudaDeviceSynchronize();

  //gestionamos el acarreo
  /*
   * Si a->count = b->count --> min = a->count (Se tratarán todos los dígitos)
   * Si a->count > b->count --> min = b->count (Se hará la gestión en dos partes)
   * Si a->count < b->count --> min = a->count (Se hará la gestión en dos partes)
   */
  _carryAdd(va, 1, min);
}

/*
 * Función _addition_merge.
 *
 * Simula la operación a = a + b de un dígito
 */
__global__ static void _addition_merge(void* va, void* vb, int min) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  int i = 0;

  for (i = index; i <= min; i += stride) {
    if (i > min) 
      //puede que len no sea divisible por 16 y por tanto debemos controlar el final
      return;
    else
      ((struct BigInteger*)va)->n[i] += ((struct BigInteger*)vb)->n[i];
  }
}

/*
 * Función _addition_move.
 *
 * Mueve un dígito que existe en "b" pero no en "a"
 */
__global__ static void _addition_move(void* va, void* vb, int offset, int min) {
  int index = blockIdx.x * blockDim.x + threadIdx.x + offset;
  int stride = blockDim.x * gridDim.x;
  int i = 0;

  for (i = index; i <= min; i += stride)
    if (i > min)
      //puede que len no sea divisible por 16 y por tanto debemos controlar el final
      return;
    else
      ((struct BigInteger*)va)->n[i] = ((struct BigInteger*)vb)->n[i];
}

/*
 * Función _carryAdd.
 *
 * Gestiona el acarreo de la suma. Si hay movimiento de datos, se mueve el valor 1 a ret.
 * De esta manera, podemos llamar hasta que no haya cambios en el acarreo.
 */
__device__ static void _carryAdd(void* va, int move, int min) {
  int i = 0;
  int acc;
  int limit;

  acc = 0;

  //move == 1 --> sabemos que hay una parte no común. En min está el límite de la parte común
  if (move == 1)
    limit = min;
  else
    limit = ((struct BigInteger*)va)->count;

  //recorremos a y vamos sumando el acarreo de la parte común
  for (; i <= limit; i++) {
    //sumamos acarreo
    ((struct BigInteger*)va)->n[i] += acc;

    //como acc es int, podemos dividir entre 10 y sacar el acarreo
    acc = ((struct BigInteger*)va)->n[i] / 10;

    if (acc > 0)
      //normalizamos el número
      ((struct BigInteger*)va)->n[i] = ((struct BigInteger*)va)->n[i] % 10;
  }

  if (move == 1) {
    //queda parte no común. Acarreamos hatsa que acc sea 0, ya que la parte no común ya está normalizada
    while (acc > 0 && i <= ((struct BigInteger*)va)->count) {
      //sumamos acarreo
      ((struct BigInteger*)va)->n[i] += acc;

      //como acc es int, podemos dividir entre 10 y sacar el acarreo
      acc = ((struct BigInteger*)va)->n[i] / 10;

      if (acc > 0)
        //normalizamos el número
        ((struct BigInteger*)va)->n[i] = ((struct BigInteger*)va)->n[i] % 10;

      i++;
    }
  }

  //si ha quedado acarreo, lo guardamos al final;
  if (acc > 0) {
    if (((struct BigInteger*)va)->count == CU_MAX_LENGTH)
      _showError(1);
    else
      ((struct BigInteger*)va)->n[++(((struct BigInteger*)va)->count)] = acc;
  }
}


/*
 * Función _signum.
 *
 * Devuelve la cantidad de datos negativos que hay en la operación.
 */
__device__ static int _signum(int a, int b) {
  int ret = 0;

  if (a < 0)
    ret = 10;

  if (b < 0)
    ++ret;

  return ret;
}

/*
 * Función _validateBI
 *
 * Valida que todos los datos del BI sean coherentes.
 * Se crea como __globlal__ para poder llamarse en paralelo.
 */
__global__ void _validateBI(void* a) {
  int* t = (int*)(malloc(sizeof(int)));

  if (t == NULL || a == NULL) {
    _showError(98);

    return;
  }

  memcpy(t, (int*)a, sizeof(int));

  //validamos la variable de longitud
  if (*t < 0 || *t > CU_MAX_LENGTH) {
    _showError(99);
    free(t);

    return;
  }

  memcpy(t, (int*)a + 1, sizeof(int));

  //validamos el resto de dígitos, que pueden ser positivos o negativos
  validateBI_loop <<<NUM_BLOCKS, BLOCK_SIZE>>>(a);

  //liberamos memoria
  free(t);

  //sincronizamos antes de terminar
  //la memoria de "t" se puede eliminar sin problemas
  cudaDeviceSynchronize();
}

/*
 * Función loop_validateBI
 *
 * Valida BigInteger.n de manera paralela
 */
__global__ void validateBI_loop(void* a) {
  int* t = (int*)(malloc(sizeof(int)));
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  int i = 0;

  if (t == NULL)
    _showError(98);

  memcpy(t, (int*)a + (index + 1), sizeof(int));

  for (i = index; i < CU_MAX_LENGTH; i += stride) {
    //si no está en el rango [-9, +9], damos error
    if (*t < -9 || *t > 9) {
      _showError(99);
      free(t);

      return;
    }

    memcpy(t, (int*)a + (i + 1), sizeof(int));
  }

  free(t);
}

It validates the format of both parameteres, then add each correlative array common position, while at the same time moves the even positions from the second array to the first; then checks the result, it works fine (or should).

So, maybe it’s a simple error… I’m a bit new on CUDA programming, or it’s a memory flaw, or a void-BigInteger casting issue…

Can anyone help me, please?

I also did a CUDA_MEMCHECK on the program, so it may help out_CUDA_MEMCHECK.txt (8.1 KB)

I’m not asking for some magic “change this line” or “do not do this part here”… I want to understand the root cause, so please ask me if there’s any info I can provide.

I’m working on Windows 10, using Visual Studio plugin for CUDA

(PS: I don’t want to make spam, but the full code is on GitHub, so if necessary, please ask and I provide the link so you can review full source code.

Thanks a lot

doscar.sole · August 22, 2020, 2:40pm

Just find it. It was a WDDM TDR timeout issue.