Larger than 64-bit integer: I want to count with very big integers, larger than 64 bits

batmanzsolti · April 7, 2011, 12:06pm

Hi!

Is there any way in CUDA to count with integers larger than 64 bits?
If it's possible, can you give me instructions on how I can do this?

njuffa · April 7, 2011, 5:05pm

(unsigned) long long int provides 64-bits. There is no built-in non-vector integer type that is wider than 64 bits. However, you could easily build your own 128-bit integer type. For example:

/* Unsigned 128-bit integer kept as two 64-bit halves. */
typedef struct {
  unsigned long long lo;  /* low-order half  */
  unsigned long long hi;  /* high-order half */
} my_uint128;

/*
 * Return a + b as a 128-bit value. A carry out of the high half is
 * discarded, so the result wraps around like ordinary unsigned arithmetic.
 */
my_uint128 add_uint128 (my_uint128 a, my_uint128 b)
{
  unsigned long long int low_sum = a.lo + b.lo;
  /* Unsigned addition wraps: a sum below one of its operands means a carry. */
  unsigned long long int carry = (low_sum < a.lo) ? 1 : 0;
  my_uint128 sum;

  sum.hi = a.hi + b.hi + carry;
  sum.lo = low_sum;
  return sum;
}

If a higher performance solution is desired, consider mapping a 128-bit integer to a uint4 and using inline PTX for more efficient handling of the carries between the four 32-bit chunks.

batmanzsolti · April 7, 2011, 10:01pm

(unsigned) long long int provides 64-bits. There is no built-in non-vector integer type that is wider than 64 bits. However, you could easily build your own 128-bit integer type. For example:
/* Unsigned 128-bit integer kept as two 64-bit halves. */
typedef struct {
  unsigned long long lo;  /* low-order half  */
  unsigned long long hi;  /* high-order half */
} my_uint128;

/*
 * Return a + b as a 128-bit value. A carry out of the high half is
 * discarded, so the result wraps around like ordinary unsigned arithmetic.
 */
my_uint128 add_uint128 (my_uint128 a, my_uint128 b)
{
  unsigned long long int low_sum = a.lo + b.lo;
  /* Unsigned addition wraps: a sum below one of its operands means a carry. */
  unsigned long long int carry = (low_sum < a.lo) ? 1 : 0;
  my_uint128 sum;

  sum.hi = a.hi + b.hi + carry;
  sum.lo = low_sum;
  return sum;
}
If a higher performance solution is desired, consider mapping a 128-bit integer to a uint4 and using inline PTX for more efficient handling of the carries between the four 32-bit chunks.

That's exactly what I want. Thank you very much!

More questions: :-)

1. How can I display it in the decimal number system?

If I want to allocate an N-element my_uint128 array on my device, do I have to use this code?

cudaMalloc( (void**)&deviceArray, (N * sizeof(my_uint128)) )

njuffa · April 8, 2011, 8:54am

Your malloc() call looks OK. Here is some code that does conversions. This is rather simplistic and will not win any speed records, and it does not provide industrial-strength robustness (for example, conversion from my_uint128 is limited to numbers < 10**38 instead of the full range). But it should be sufficient as a starting point for your own work.

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

/* Number of random round-trip conversions performed by main(). */
#define TRIALS 1000000

/*
 * Unsigned 128-bit integer kept as two 64-bit halves. The field order
 * (lo first, hi second) is relied upon by positional initializers.
 */
typedef struct {
  unsigned long long lo;  /* low-order half  */
  unsigned long long hi;  /* high-order half */
} my_uint128;

/*
 * Return a + b as a 128-bit value. A carry out of the high half is
 * discarded, so the result wraps around like ordinary unsigned arithmetic.
 */
my_uint128 add_my_uint128 (my_uint128 a, my_uint128 b)
{
  unsigned long long int low_sum = a.lo + b.lo;
  /* Unsigned addition wraps: a sum below one of its operands means a carry. */
  unsigned long long int carry = (low_sum < a.lo) ? 1 : 0;
  my_uint128 sum;

  sum.hi = a.hi + b.hi + carry;
  sum.lo = low_sum;
  return sum;
}

/*
 * Return a - b as a 128-bit value. When b is larger than a the result
 * wraps around, as with ordinary unsigned arithmetic.
 */
my_uint128 sub_my_uint128 (my_uint128 a, my_uint128 b)
{
  unsigned long long int low_diff = a.lo - b.lo;
  /* Unsigned subtraction wraps: a difference above the minuend means a borrow. */
  unsigned long long int borrow = (low_diff > a.lo) ? 1 : 0;
  my_uint128 diff;

  diff.hi = a.hi - b.hi - borrow;
  diff.lo = low_diff;
  return diff;
}

/*
 * Shift a 128-bit value left by s bit positions and return the result.
 *
 *   s <= 0          the value is returned unchanged
 *   0 < s < 64      bits shifted out of lo are carried into hi
 *   64 <= s < 128   lo moves entirely into hi, lo becomes zero
 *   s >= 128        the result is zero
 *
 * Each case is handled separately because shifting a 64-bit operand by 64
 * or more (or by a negative count) is undefined behavior in C.
 * Like the rest of this code, this assumes unsigned long long is 64 bits wide.
 */
my_uint128 shl_my_uint128 (my_uint128 a, int s)
{
  if (s <= 0) {
    return a;
  }
  if (s >= 128) {
    a.hi = 0;
    a.lo = 0;
  } else if (s >= 64) {
    a.hi = a.lo << (s - 64);
    a.lo = 0;
  } else {
    /* 0 < s < 64, so both s and 64 - s are valid shift counts. */
    a.hi = (a.hi << s) | (a.lo >> (64 - s));
    a.lo = a.lo << s;
  }
  return a;
}

/*
 * Return 10 * a, computed as (a << 3) + (a << 1), i.e. 8*a + 2*a.
 * Bits shifted or carried out of the 128-bit value are lost.
 */
my_uint128 mul10_my_uint128 (my_uint128 a)
{
  return add_my_uint128 (shl_my_uint128 (a, 3), shl_my_uint128 (a, 1));
}

/*
 * Table of powers of ten: pwrten[i] holds 10**i for i = 0 through 37.
 * Each initializer lists the low 64-bit word first and the high 64-bit
 * word second, following the field order of my_uint128.
 */
static const my_uint128 pwrten [] =

{

  {0x0000000000000001, 0x0000000000000000}, /* 10**0  */

  {0x000000000000000a, 0x0000000000000000}, /* 10**1  */

  {0x0000000000000064, 0x0000000000000000}, /* 10**2  */

  {0x00000000000003e8, 0x0000000000000000}, /* 10**3  */

  {0x0000000000002710, 0x0000000000000000}, /* 10**4  */

  {0x00000000000186a0, 0x0000000000000000}, /* 10**5  */

  {0x00000000000f4240, 0x0000000000000000}, /* 10**6  */

  {0x0000000000989680, 0x0000000000000000}, /* 10**7  */

  {0x0000000005f5e100, 0x0000000000000000}, /* 10**8  */

  {0x000000003b9aca00, 0x0000000000000000}, /* 10**9  */

  {0x00000002540be400, 0x0000000000000000}, /* 10**10 */

  {0x000000174876e800, 0x0000000000000000}, /* 10**11 */

  {0x000000e8d4a51000, 0x0000000000000000}, /* 10**12 */

  {0x000009184e72a000, 0x0000000000000000}, /* 10**13 */

  {0x00005af3107a4000, 0x0000000000000000}, /* 10**14 */

  {0x00038d7ea4c68000, 0x0000000000000000}, /* 10**15 */

  {0x002386f26fc10000, 0x0000000000000000}, /* 10**16 */

  {0x016345785d8a0000, 0x0000000000000000}, /* 10**17 */

  {0x0de0b6b3a7640000, 0x0000000000000000}, /* 10**18 */

  {0x8ac7230489e80000, 0x0000000000000000}, /* 10**19 */

  {0x6bc75e2d63100000, 0x0000000000000005}, /* 10**20 */

  {0x35c9adc5dea00000, 0x0000000000000036}, /* 10**21 */

  {0x19e0c9bab2400000, 0x000000000000021e}, /* 10**22 */

  {0x02c7e14af6800000, 0x000000000000152d}, /* 10**23 */

  {0x1bcecceda1000000, 0x000000000000d3c2}, /* 10**24 */

  {0x161401484a000000, 0x0000000000084595}, /* 10**25 */

  {0xdcc80cd2e4000000, 0x000000000052b7d2}, /* 10**26 */

  {0x9fd0803ce8000000, 0x00000000033b2e3c}, /* 10**27 */

  {0x3e25026110000000, 0x00000000204fce5e}, /* 10**28 */

  {0x6d7217caa0000000, 0x00000001431e0fae}, /* 10**29 */

  {0x4674edea40000000, 0x0000000c9f2c9cd0}, /* 10**30 */

  {0xc0914b2680000000, 0x0000007e37be2022}, /* 10**31 */

  {0x85acef8100000000, 0x000004ee2d6d415b}, /* 10**32 */

  {0x38c15b0a00000000, 0x0000314dc6448d93}, /* 10**33 */

  {0x378d8e6400000000, 0x0001ed09bead87c0}, /* 10**34 */

  {0x2b878fe800000000, 0x0013426172c74d82}, /* 10**35 */

  {0xb34b9f1000000000, 0x00c097ce7bc90715}, /* 10**36 */

  {0x00f436a000000000, 0x0785ee10d5da46d9}, /* 10**37 */

};

/* Index of the last entry in pwrten, i.e. the largest exponent in the table. */
#define MAX_PWR ((int)(sizeof(pwrten)/sizeof(pwrten[0]))-1)

/* Number of entries in pwrten; main() uses it as the length of its test strings. */
#define DIGITS  (MAX_PWR+1)

/* Return non-zero when a >= b, comparing both as unsigned 128-bit values. */
static int ge_my_uint128 (my_uint128 a, my_uint128 b)
{
  return (a.hi > b.hi) || ((a.hi == b.hi) && (a.lo >= b.lo));
}

/*
 * Convert a to a NUL-terminated decimal string without leading zeros
 * (a value of zero yields "0").
 *
 *   a   value to convert; the full 128-bit range is accepted
 *   cp  destination buffer. Values below 10**38 produce at most 38 digits
 *       plus the terminator; larger values produce 39 digits, so a buffer
 *       of at least 40 bytes is needed to cover every possible input.
 *
 * Each digit is found by trial subtraction of 8, 4, 2 and 1 times the
 * matching power of ten. The trial uses an unsigned comparison rather
 * than subtracting and testing the sign bit, so it does not depend on the
 * top bit of the value being clear.
 */
void cvt_my_uint128_to_str (my_uint128 a, char *cp)
{
  /*
   * Ten times the last table entry. This assumes the table ends at 10**37,
   * so that the product (10**38) still fits in 128 bits.
   */
  const my_uint128 top = mul10_my_uint128 (pwrten[MAX_PWR]);
  my_uint128 t;
  int pwr, bit, digit;
  int non_zero = 0;

  /* Strip multiples of 10**38 first; at most three fit in 128 bits. */
  digit = 0;
  while (ge_my_uint128 (a, top)) {
    a = sub_my_uint128 (a, top);
    digit++;
  }
  if (digit) {
    *cp++ = (char)('0' + digit);
    non_zero = 1;
  }

  /* Here a < 10**38, so every remaining digit is in 0..9 and fits in 4 bits. */
  for (pwr = MAX_PWR; pwr >= 0; pwr--) {
    digit = 0;
    for (bit = 3; bit >= 0; bit--) {
      t = shl_my_uint128 (pwrten[pwr], bit);
      digit <<= 1;
      if (ge_my_uint128 (a, t)) {
        a = sub_my_uint128 (a, t);
        digit |= 1;
      }
    }
    /* Suppress leading zeros, but always emit the units digit. */
    non_zero |= digit;
    if (non_zero || pwr == 0) {
      *cp++ = (char)('0' + digit);
    }
  }
  *cp = 0;
}

/*
 * Convert the leading run of decimal digits in the string cp to a 128-bit
 * value and return it.
 *
 * Parsing stops at the first character that is not '0'..'9' (including the
 * terminating NUL), so an empty string or one that does not start with a
 * digit yields zero. Stray characters are no longer folded into the result.
 * No overflow check is performed: a number too large for 128 bits loses
 * its high-order bits.
 */
my_uint128 cvt_str_to_my_uint128 (char *cp)
{
  my_uint128 a = {0, 0};
  my_uint128 t = {0, 0};

  while (*cp >= '0' && *cp <= '9') {
    a = mul10_my_uint128 (a);
    t.lo = (unsigned long long int)(*cp++ - '0');
    a = add_my_uint128 (a, t);
  }
  return a;
}

/*
 * Fill cp with a random decimal number of the requested length, followed
 * by a terminating NUL. The first digit is drawn from 1..9 so the number
 * has no leading zero; all later digits are drawn from 0..9.
 * cp must have room for digits + 1 characters.
 */
void rand_num_str (char *cp, int digits)
{
  int pos = 0;

  while (pos < digits) {
    int d;

    if (pos == 0) {
      d = rand() % 9 + 1;
    } else {
      d = rand() % 10;
    }
    cp[pos] = '0' + d;
    pos++;
  }
  cp[pos] = 0;
}

/*
 * Round-trip self test: generate TRIALS random decimal strings of DIGITS
 * digits, convert each one to my_uint128 and back, and report every
 * mismatch. Returns EXIT_FAILURE when any trial fails so that the exit
 * status agrees with the printed PASSED/FAILED verdict.
 */
int main (void)
{
  my_uint128 a;
  int i;
  char in_str[DIGITS+1];
  char out_str[DIGITS+1];
  int errors = 0;

  printf ("^^^^ DIGITS=%d\n", DIGITS);

  for (i = 0; i < TRIALS; i++) {
    rand_num_str (in_str, DIGITS);
    a = cvt_str_to_my_uint128 (in_str);
    cvt_my_uint128_to_str (a, out_str);
    if (strcmp (in_str, out_str) != 0) {
      printf ("!!!! error: in_str = %s  out_str=%s\n", in_str, out_str);
      errors++;
    }
  }

  printf ("@@@@ %d errors in %d trials\n", errors, TRIALS);
  printf ("&&&& my_uint128 conversion test %s\n", errors ? "FAILED":"PASSED");

  return errors ? EXIT_FAILURE : EXIT_SUCCESS;
}

batmanzsolti · April 8, 2011, 12:45pm

Works fine! :D
You helped me a lot.
Thank you very much!

wanderine · April 8, 2011, 2:18pm

Have you considered using logarithms?