Here 's code on C/CUDA
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <windows.h>
#include <ctype.h>
#include <stdint.h>
#define m_VALUES 16
#define M_VALUE 1
#define T_EXP_VALUE 3
#define SIZE_BIT 32
#define G_VALUES (int64_t)pow((double)2, (double)22)
#define NUM_ITERATIONS (int64_t)pow((double)2, (double)14)
#define T_SIZE 8
//typedef unsigned long long int64_t;
int64_t MontgExp(int64_t base, int64_t exp, int64_t mod);
__host__ __device__ int64_t decimal_to_binary(int64_t);
void extended_euclid(int64_t a, int64_t b, int64_t *x, int64_t *y, int64_t *d);
__host__ __device__ int64_t contains(int64_t num, int64_t *arr, int64_t size);
#define THREADS_PER_BLOCK 64
#define BLOCKS_IN_GRID 1024
#define gpuErrchk(ans) {gpuAssert((ans), __FILE__, __LINE__);}
#define MAX_TMP_ARR_SIZE 1024
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
//if (abort) exit(code);
}
}
__constant__ int64_t const_bArr[m_VALUES];
__constant__ int64_t *const_bigPrimeNumber;
__global__ void searchElements(int64_t *gArr, int64_t *cArr, int64_t *rArr)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int j = 0;
int tmp = 0;
if (idx < MAX_TMP_ARR_SIZE) {
tmp = ( const_bArr[ decimal_to_binary( gArr[ idx ] ) ] * gArr[idx]) % (*const_bigPrimeNumber);
if (contains(tmp, cArr, T_SIZE) == 1)
rArr[idx]++;
else {
for (j = 1; j < NUM_ITERATIONS; j++) {
tmp = (const_bArr[decimal_to_binary(tmp)] * tmp) % (*const_bigPrimeNumber);
if (contains(tmp, cArr, T_SIZE) == 1) {
rArr[idx]++;
break;
}
}
}
}
}
int main()
{
int64_t bigPrimeNumber = 7;
int64_t i = 0;
int64_t bArr[m_VALUES];
int64_t T = (int64_t)pow((double)2, (double)T_EXP_VALUE);
int64_t *dArr, *cArr;
int64_t M = 0;
int64_t *gArr;
int64_t R = 0;
int64_t tmp = 0;
int k = 0;
int64_t *rArr;
int64_t *gArr_tmp, *rArr_tmp;
int64_t *gArr_tmp_dev, *rArr_tmp_dev;
int flagExit = 0, old_i = 0, steps = 0, diff = 0;
int64_t *cArr_dev;
cudaEvent_t start, stop;
float gpuTime = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
srand(time(NULL));
memset(bArr, 0, m_VALUES * sizeof(int64_t));
dArr = (int64_t*)malloc(T * sizeof(int64_t));
memset(dArr, 0, T * sizeof(int64_t));
cArr = (int64_t*)malloc(T * sizeof(int64_t));
memset(cArr, 0, T * sizeof(int64_t));
gArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
memset(gArr, 0, G_VALUES * sizeof(int64_t));
rArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
memset(rArr, 0, G_VALUES * sizeof(int64_t));
for (i = 0; i < m_VALUES; i++)
bArr[i] = MontgExp(2, rand(), bigPrimeNumber);
for (i = 0; i < T; i++) {
dArr[i] = MontgExp(2, rand(), bigPrimeNumber);
cArr[i] = bArr[i];
}
for (i = 0; i < T; i++) {
M = 0;
while (M != M_VALUE) {
cArr[i] = (bArr[decimal_to_binary(cArr[i])] * cArr[i]) % bigPrimeNumber;
M++;
}
}
srand(time(NULL));
for (i = 0; i < G_VALUES; i++) {
tmp = rand();
if (tmp != 0) {
gArr[i] = rand();
}
}
gpuErrchk( cudaMemcpyToSymbol(const_bArr, bArr, m_VALUES * sizeof(int64_t), 0, cudaMemcpyHostToDevice));
gpuErrchk( cudaMemcpyToSymbol(const_bigPrimeNumber, &bigPrimeNumber, sizeof(int64_t), 0, cudaMemcpyHostToDevice));
cudaMalloc ( (void**)&cArr_dev, T * sizeof(int64_t));
gpuErrchk ( cudaMemcpy(cArr_dev, cArr, T * sizeof(int64_t), cudaMemcpyHostToDevice));
while (true) {
diff = G_VALUES - steps * old_i;
if (diff < MAX_TMP_ARR_SIZE) {
flagExit = 1;
break;
}
gArr_tmp = (int64_t*)malloc(MAX_TMP_ARR_SIZE * sizeof(int64_t));
rArr_tmp = (int64_t*)malloc(MAX_TMP_ARR_SIZE * sizeof(int64_t));
for (i = old_i, k = 0; i < MAX_TMP_ARR_SIZE; i++, k++) {
gArr_tmp[k] = gArr[i];
rArr_tmp[k] = rArr[i];
}
old_i = i;
cudaMalloc ( (void**)&gArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t));
cudaMalloc ( (void**)&rArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t));
gpuErrchk ( cudaMemcpy(gArr_tmp_dev, gArr_tmp, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyHostToDevice));
gpuErrchk ( cudaMemcpy(rArr_tmp_dev, rArr_tmp, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyHostToDevice));
dim3 threads = dim3(THREADS_PER_BLOCK);
dim3 blocks = dim3(BLOCKS_IN_GRID);
cudaEventRecord(start, 0);
searchElements<<<blocks, threads>>>(gArr_tmp_dev, cArr_dev, rArr_tmp_dev);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
cudaEventRecord(stop, 0);
gpuErrchk( cudaMemcpy(rArr_tmp, rArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaEventSynchronize(stop));
for (k = 0; k < MAX_TMP_ARR_SIZE; k++)
R += rArr_tmp[k];
printf("%.2f milliseconds\n", gpuTime);
cudaFree(gArr_tmp_dev);
cudaFree(rArr_tmp_dev);
steps++;
}
printf("%.2f milliseconds\n", gpuTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("Press any key...");
getchar();
return 0;
}
int64_t MontgExp(int64_t base, int64_t exp, int64_t mod)
{
int64_t z = 0;
if (exp == 0) return 1;
z = MontgExp(base, exp / 2, mod);
if (exp % 2 == 0)
return (z*z) % mod;
else
return (base*z*z) % mod;
}
__host__ __device__ int64_t decimal_to_binary(int64_t n)
{
int c = 0, d = 0, count;
char *pointer, charLSB[5];
int64_t LSB = 0, i = 0;
count = 0;
pointer = (char*)malloc(SIZE_BIT+1);
memset(pointer, 0, (SIZE_BIT + 1) * sizeof(char));
//if ( pointer == NULL )
// exit(EXIT_FAILURE);
memset(charLSB, 0, 5 * sizeof(char));
for ( c = SIZE_BIT - 1 ; c >= 0 ; c-- ) {
d = n >> c;
if ( d & 1 )
*(pointer+count) = 1 + '0';
else
*(pointer+count) = 0 + '0';
count++;
}
*(pointer+count) = '
#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <windows.h>
#include <ctype.h>
#include <stdint.h>
#define m_VALUES 16
#define M_VALUE 1
#define T_EXP_VALUE 3
#define SIZE_BIT 32
#define G_VALUES (int64_t)pow((double)2, (double)22)
#define NUM_ITERATIONS (int64_t)pow((double)2, (double)14)
#define T_SIZE 8
//typedef unsigned long long int64_t;
int64_t MontgExp(int64_t base, int64_t exp, int64_t mod);
host device int64_t decimal_to_binary(int64_t);
void extended_euclid(int64_t a, int64_t b, int64_t *x, int64_t *y, int64_t *d);
host device int64_t contains(int64_t num, int64_t *arr, int64_t size);
#define THREADS_PER_BLOCK 64
#define BLOCKS_IN_GRID 1024
#define gpuErrchk(ans) {gpuAssert((ans), FILE, LINE);}
#define MAX_TMP_ARR_SIZE 1024
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess) {
fprintf(stderr, “GPUassert: %s %s %d\n”, cudaGetErrorString(code), file, line);
//if (abort) exit(code);
}
}
constant int64_t const_bArr[m_VALUES];
constant int64_t *const_bigPrimeNumber;
global void searchElements(int64_t *gArr, int64_t *cArr, int64_t *rArr)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int j = 0;
int tmp = 0;
if (idx < MAX_TMP_ARR_SIZE) {
tmp = ( const_bArr[ decimal_to_binary( gArr[ idx ] ) ] * gArr[idx]) % (*const_bigPrimeNumber);
if (contains(tmp, cArr, T_SIZE) == 1)
rArr[idx]++;
else {
for (j = 1; j < NUM_ITERATIONS; j++) {
tmp = (const_bArr[decimal_to_binary(tmp)] * tmp) % (*const_bigPrimeNumber);
if (contains(tmp, cArr, T_SIZE) == 1) {
rArr[idx]++;
break;
}
}
}
}
}
int main()
{
int64_t bigPrimeNumber = 7;
int64_t i = 0;
int64_t bArr[m_VALUES];
int64_t T = (int64_t)pow((double)2, (double)T_EXP_VALUE);
int64_t *dArr, *cArr;
int64_t M = 0;
int64_t *gArr;
int64_t R = 0;
int64_t tmp = 0;
int k = 0;
int64_t *rArr;
int64_t *gArr_tmp, *rArr_tmp;
int64_t *gArr_tmp_dev, *rArr_tmp_dev;
int flagExit = 0, old_i = 0, steps = 0, diff = 0;
int64_t *cArr_dev;
cudaEvent_t start, stop;
float gpuTime = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
srand(time(NULL));
memset(bArr, 0, m_VALUES * sizeof(int64_t));
dArr = (int64_t*)malloc(T * sizeof(int64_t));
memset(dArr, 0, T * sizeof(int64_t));
cArr = (int64_t*)malloc(T * sizeof(int64_t));
memset(cArr, 0, T * sizeof(int64_t));
gArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
memset(gArr, 0, G_VALUES * sizeof(int64_t));
rArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
memset(rArr, 0, G_VALUES * sizeof(int64_t));
for (i = 0; i < m_VALUES; i++)
bArr[i] = MontgExp(2, rand(), bigPrimeNumber);
for (i = 0; i < T; i++) {
dArr[i] = MontgExp(2, rand(), bigPrimeNumber);
cArr[i] = bArr[i];
}
for (i = 0; i < T; i++) {
M = 0;
while (M != M_VALUE) {
cArr[i] = (bArr[decimal_to_binary(cArr[i])] * cArr[i]) % bigPrimeNumber;
M++;
}
}
srand(time(NULL));
for (i = 0; i < G_VALUES; i++) {
tmp = rand();
if (tmp != 0) {
gArr[i] = rand();
}
}
gpuErrchk( cudaMemcpyToSymbol(const_bArr, bArr, m_VALUES * sizeof(int64_t), 0, cudaMemcpyHostToDevice));
gpuErrchk( cudaMemcpyToSymbol(const_bigPrimeNumber, &bigPrimeNumber, sizeof(int64_t), 0, cudaMemcpyHostToDevice));
cudaMalloc ( (void**)&cArr_dev, T * sizeof(int64_t));
gpuErrchk ( cudaMemcpy(cArr_dev, cArr, T * sizeof(int64_t), cudaMemcpyHostToDevice));
while (true) {
diff = G_VALUES - steps * old_i;
if (diff < MAX_TMP_ARR_SIZE) {
flagExit = 1;
break;
}
gArr_tmp = (int64_t*)malloc(MAX_TMP_ARR_SIZE * sizeof(int64_t));
rArr_tmp = (int64_t*)malloc(MAX_TMP_ARR_SIZE * sizeof(int64_t));
for (i = old_i, k = 0; i < MAX_TMP_ARR_SIZE; i++, k++) {
gArr_tmp[k] = gArr[i];
rArr_tmp[k] = rArr[i];
}
old_i = i;
cudaMalloc ( (void**)&gArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t));
cudaMalloc ( (void**)&rArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t));
gpuErrchk ( cudaMemcpy(gArr_tmp_dev, gArr_tmp, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyHostToDevice));
gpuErrchk ( cudaMemcpy(rArr_tmp_dev, rArr_tmp, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyHostToDevice));
dim3 threads = dim3(THREADS_PER_BLOCK);
dim3 blocks = dim3(BLOCKS_IN_GRID);
cudaEventRecord(start, 0);
searchElements<<<blocks, threads>>>(gArr_tmp_dev, cArr_dev, rArr_tmp_dev);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
cudaEventRecord(stop, 0);
gpuErrchk( cudaMemcpy(rArr_tmp, rArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaEventSynchronize(stop));
for (k = 0; k < MAX_TMP_ARR_SIZE; k++)
R += rArr_tmp[k];
printf("%.2f milliseconds\n", gpuTime);
cudaFree(gArr_tmp_dev);
cudaFree(rArr_tmp_dev);
steps++;
}
printf("%.2f milliseconds\n", gpuTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("Press any key...");
getchar();
return 0;
}
int64_t MontgExp(int64_t base, int64_t exp, int64_t mod)
{
int64_t z = 0;
if (exp == 0) return 1;
z = MontgExp(base, exp / 2, mod);
if (exp % 2 == 0)
return (zz) % mod;
else
return (basez*z) % mod;
}
host device int64_t decimal_to_binary(int64_t n)
{
int c = 0, d = 0, count;
char *pointer, charLSB[5];
int64_t LSB = 0, i = 0;
count = 0;
pointer = (char*)malloc(SIZE_BIT+1);
memset(pointer, 0, (SIZE_BIT + 1) * sizeof(char));
//if ( pointer == NULL )
// exit(EXIT_FAILURE);
memset(charLSB, 0, 5 * sizeof(char));
for ( c = SIZE_BIT - 1 ; c >= 0 ; c-- ) {
d = n >> c;
if ( d & 1 )
*(pointer+count) = 1 + '0';
else
*(pointer+count) = 0 + '0';
count++;
}
*(pointer+count) = ‘\0’;
//strncpy(charLSB, pointer + (32 - 4), 4);
for (i = 0 ; i < 4; i++)
charLSB[i] = pointer[(32 - 4) + i];
charLSB[4] = ‘\0’;
for (i = 0; i < 5; i++) {
if (charLSB[i] == ‘0’)
LSB *= 2;
if (charLSB[i] == ‘1’)
LSB = 2 * LSB + 1;
}
free(pointer);
return LSB;
}
void extended_euclid(int64_t a, int64_t b, int64_t *x, int64_t *y, int64_t *d)
{
int64_t q = 0, r = 0, x1 = 0, x2 = 0, y1 = 0, y2 = 0;
if (b == 0) {
*d = a, *x = 1, *y = 0;
return;
}
x2 = 1, x1 = 0, y2 = 0, y1 = 1;
while (b > 0) {
q = a / b, r = a - q * b;
*x = x2 - q * x1, *y = y2 - q * y1;
a = b, b = r;
x2 = x1, x1 = *x, y2 = y1, y1 = *y;
}
*d = a, *x = x2, *y = y2;
}
host device int64_t contains(int64_t num, int64_t arr, int64_t size)
{
int64_t i = 0, result = 0;
for (i = 0; i < size; i++)
if ((arr + i) == num) {
result = 1;
break;
}
return result;
}
';
//strncpy(charLSB, pointer + (32 - 4), 4);
for (i = 0 ; i < 4; i++)
charLSB[i] = pointer[(32 - 4) + i];
charLSB[4] = '
#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <windows.h>
#include <ctype.h>
#include <stdint.h>
#define m_VALUES 16
#define M_VALUE 1
#define T_EXP_VALUE 3
#define SIZE_BIT 32
#define G_VALUES (int64_t)pow((double)2, (double)22)
#define NUM_ITERATIONS (int64_t)pow((double)2, (double)14)
#define T_SIZE 8
//typedef unsigned long long int64_t;
int64_t MontgExp(int64_t base, int64_t exp, int64_t mod);
host device int64_t decimal_to_binary(int64_t);
void extended_euclid(int64_t a, int64_t b, int64_t *x, int64_t *y, int64_t *d);
host device int64_t contains(int64_t num, int64_t *arr, int64_t size);
#define THREADS_PER_BLOCK 64
#define BLOCKS_IN_GRID 1024
#define gpuErrchk(ans) {gpuAssert((ans), FILE, LINE);}
#define MAX_TMP_ARR_SIZE 1024
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess) {
fprintf(stderr, “GPUassert: %s %s %d\n”, cudaGetErrorString(code), file, line);
//if (abort) exit(code);
}
}
constant int64_t const_bArr[m_VALUES];
constant int64_t *const_bigPrimeNumber;
global void searchElements(int64_t *gArr, int64_t *cArr, int64_t *rArr)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int j = 0;
int tmp = 0;
if (idx < MAX_TMP_ARR_SIZE) {
tmp = ( const_bArr[ decimal_to_binary( gArr[ idx ] ) ] * gArr[idx]) % (*const_bigPrimeNumber);
if (contains(tmp, cArr, T_SIZE) == 1)
rArr[idx]++;
else {
for (j = 1; j < NUM_ITERATIONS; j++) {
tmp = (const_bArr[decimal_to_binary(tmp)] * tmp) % (*const_bigPrimeNumber);
if (contains(tmp, cArr, T_SIZE) == 1) {
rArr[idx]++;
break;
}
}
}
}
}
int main()
{
int64_t bigPrimeNumber = 7;
int64_t i = 0;
int64_t bArr[m_VALUES];
int64_t T = (int64_t)pow((double)2, (double)T_EXP_VALUE);
int64_t *dArr, *cArr;
int64_t M = 0;
int64_t *gArr;
int64_t R = 0;
int64_t tmp = 0;
int k = 0;
int64_t *rArr;
int64_t *gArr_tmp, *rArr_tmp;
int64_t *gArr_tmp_dev, *rArr_tmp_dev;
int flagExit = 0, old_i = 0, steps = 0, diff = 0;
int64_t *cArr_dev;
cudaEvent_t start, stop;
float gpuTime = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
srand(time(NULL));
memset(bArr, 0, m_VALUES * sizeof(int64_t));
dArr = (int64_t*)malloc(T * sizeof(int64_t));
memset(dArr, 0, T * sizeof(int64_t));
cArr = (int64_t*)malloc(T * sizeof(int64_t));
memset(cArr, 0, T * sizeof(int64_t));
gArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
memset(gArr, 0, G_VALUES * sizeof(int64_t));
rArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
memset(rArr, 0, G_VALUES * sizeof(int64_t));
for (i = 0; i < m_VALUES; i++)
bArr[i] = MontgExp(2, rand(), bigPrimeNumber);
for (i = 0; i < T; i++) {
dArr[i] = MontgExp(2, rand(), bigPrimeNumber);
cArr[i] = bArr[i];
}
for (i = 0; i < T; i++) {
M = 0;
while (M != M_VALUE) {
cArr[i] = (bArr[decimal_to_binary(cArr[i])] * cArr[i]) % bigPrimeNumber;
M++;
}
}
srand(time(NULL));
for (i = 0; i < G_VALUES; i++) {
tmp = rand();
if (tmp != 0) {
gArr[i] = rand();
}
}
gpuErrchk( cudaMemcpyToSymbol(const_bArr, bArr, m_VALUES * sizeof(int64_t), 0, cudaMemcpyHostToDevice));
gpuErrchk( cudaMemcpyToSymbol(const_bigPrimeNumber, &bigPrimeNumber, sizeof(int64_t), 0, cudaMemcpyHostToDevice));
cudaMalloc ( (void**)&cArr_dev, T * sizeof(int64_t));
gpuErrchk ( cudaMemcpy(cArr_dev, cArr, T * sizeof(int64_t), cudaMemcpyHostToDevice));
while (true) {
diff = G_VALUES - steps * old_i;
if (diff < MAX_TMP_ARR_SIZE) {
flagExit = 1;
break;
}
gArr_tmp = (int64_t*)malloc(MAX_TMP_ARR_SIZE * sizeof(int64_t));
rArr_tmp = (int64_t*)malloc(MAX_TMP_ARR_SIZE * sizeof(int64_t));
for (i = old_i, k = 0; i < MAX_TMP_ARR_SIZE; i++, k++) {
gArr_tmp[k] = gArr[i];
rArr_tmp[k] = rArr[i];
}
old_i = i;
cudaMalloc ( (void**)&gArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t));
cudaMalloc ( (void**)&rArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t));
gpuErrchk ( cudaMemcpy(gArr_tmp_dev, gArr_tmp, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyHostToDevice));
gpuErrchk ( cudaMemcpy(rArr_tmp_dev, rArr_tmp, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyHostToDevice));
dim3 threads = dim3(THREADS_PER_BLOCK);
dim3 blocks = dim3(BLOCKS_IN_GRID);
cudaEventRecord(start, 0);
searchElements<<<blocks, threads>>>(gArr_tmp_dev, cArr_dev, rArr_tmp_dev);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
cudaEventRecord(stop, 0);
gpuErrchk( cudaMemcpy(rArr_tmp, rArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyDeviceToHost) );
gpuErrchk( cudaEventSynchronize(stop));
for (k = 0; k < MAX_TMP_ARR_SIZE; k++)
R += rArr_tmp[k];
printf("%.2f milliseconds\n", gpuTime);
cudaFree(gArr_tmp_dev);
cudaFree(rArr_tmp_dev);
steps++;
}
printf("%.2f milliseconds\n", gpuTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("Press any key...");
getchar();
return 0;
}
int64_t MontgExp(int64_t base, int64_t exp, int64_t mod)
{
int64_t z = 0;
if (exp == 0) return 1;
z = MontgExp(base, exp / 2, mod);
if (exp % 2 == 0)
return (zz) % mod;
else
return (basez*z) % mod;
}
host device int64_t decimal_to_binary(int64_t n)
{
int c = 0, d = 0, count;
char *pointer, charLSB[5];
int64_t LSB = 0, i = 0;
count = 0;
pointer = (char*)malloc(SIZE_BIT+1);
memset(pointer, 0, (SIZE_BIT + 1) * sizeof(char));
//if ( pointer == NULL )
// exit(EXIT_FAILURE);
memset(charLSB, 0, 5 * sizeof(char));
for ( c = SIZE_BIT - 1 ; c >= 0 ; c-- ) {
d = n >> c;
if ( d & 1 )
*(pointer+count) = 1 + '0';
else
*(pointer+count) = 0 + '0';
count++;
}
*(pointer+count) = ‘\0’;
//strncpy(charLSB, pointer + (32 - 4), 4);
for (i = 0 ; i < 4; i++)
charLSB[i] = pointer[(32 - 4) + i];
charLSB[4] = ‘\0’;
for (i = 0; i < 5; i++) {
if (charLSB[i] == ‘0’)
LSB *= 2;
if (charLSB[i] == ‘1’)
LSB = 2 * LSB + 1;
}
free(pointer);
return LSB;
}
void extended_euclid(int64_t a, int64_t b, int64_t *x, int64_t *y, int64_t *d)
{
int64_t q = 0, r = 0, x1 = 0, x2 = 0, y1 = 0, y2 = 0;
if (b == 0) {
*d = a, *x = 1, *y = 0;
return;
}
x2 = 1, x1 = 0, y2 = 0, y1 = 1;
while (b > 0) {
q = a / b, r = a - q * b;
*x = x2 - q * x1, *y = y2 - q * y1;
a = b, b = r;
x2 = x1, x1 = *x, y2 = y1, y1 = *y;
}
*d = a, *x = x2, *y = y2;
}
host device int64_t contains(int64_t num, int64_t arr, int64_t size)
{
int64_t i = 0, result = 0;
for (i = 0; i < size; i++)
if ((arr + i) == num) {
result = 1;
break;
}
return result;
}
';
for (i = 0; i < 5; i++) {
if (charLSB[i] == '0')
LSB *= 2;
if (charLSB[i] == '1')
LSB = 2 * LSB + 1;
}
free(pointer);
return LSB;
}
void extended_euclid(int64_t a, int64_t b, int64_t *x, int64_t *y, int64_t *d)
{
int64_t q = 0, r = 0, x1 = 0, x2 = 0, y1 = 0, y2 = 0;
if (b == 0) {
*d = a, *x = 1, *y = 0;
return;
}
x2 = 1, x1 = 0, y2 = 0, y1 = 1;
while (b > 0) {
q = a / b, r = a - q * b;
*x = x2 - q * x1, *y = y2 - q * y1;
a = b, b = r;
x2 = x1, x1 = *x, y2 = y1, y1 = *y;
}
*d = a, *x = x2, *y = y2;
}
__host__ __device__ int64_t contains(int64_t num, int64_t *arr, int64_t size)
{
int64_t i = 0, result = 0;
for (i = 0; i < size; i++)
if (*(arr + i) == num) {
result = 1;
break;
}
return result;
}
I compile and debug in VS2012 Professional, CUDA Toolkit 9.1. Info about my device
GeForce GT730M
Compute Capability 3.5
Global Memory 2 GB
Shared Memory Per Block 48 KB
Max Threads Per Block 1024
Number of multiprocessors 2
Max Threads Dim 1024 : 1024 : 64
Max Grid Dim 2*(10 ^ 9) : 65535 : 65535
well, the problem is that after 1 itteration of while i have in console after calling kernel following errors:
GPUassert: misaligned address kernel.cu 163
GPUassert: misaligned address kernel.cu 165
GPUassert: misaligned address kernel.cu 166
Can’t understand how to fix them.