Accelerating an easy program

Here's some code.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <windows.h>
#include <ctype.h>
#include <stdint.h>

#define m_VALUES 16        /* size of the multiplier table (indexed by the low 4 bits) */
#define M_VALUE 1          /* steps used to advance each distinguished point */
#define T_EXP_VALUE 3      /* T = 2^T_EXP_VALUE distinguished points */
#define SIZE_BIT 32        /* bit width used when decomposing values */
#define G_VALUES 4194304   /* number of starting values = 2^22 = total GPU threads */
/* 2^14 iterations.  Compile-time constant instead of the original
 * (int64_t)pow((double)2, (double)14), which was a runtime call
 * re-evaluated at every use -- including once per loop-condition
 * test inside the kernel. */
#define NUM_ITERATIONS ((int64_t)1 << 14)
#define T_SIZE 8           /* must equal 2^T_EXP_VALUE */

/* Modular exponentiation by repeated squaring (host-side setup only). */
int64_t MontgExp(int64_t base, int64_t exp, int64_t mod);
/* Returns the value of the low 4 bits of its argument (see definition below). */
 __host__ __device__ int64_t decimal_to_binary(int64_t); 
/* Linear search: returns 1 if num occurs in arr[0..size-1], 0 otherwise. */
__host__ __device__ int64_t contains(int64_t num, int64_t *arr, int64_t size); 

/* 1024 threads/block * 4096 blocks = 4194304 threads = G_VALUES (one per element). */
#define THREADS_PER_BLOCK 1024
#define BLOCKS_IN_GRID 4096
/* Wraps a CUDA API call; forwards the error code plus call site to gpuAssert. */
#define gpuErrchk(ans) {gpuAssert((ans), __FILE__, __LINE__);}
#define MAX_TMP_ARR_SIZE 1024

/*
 * Reports a CUDA runtime error together with the file/line of the check.
 * When abort is true (the default) the process exits with the error code:
 * CUDA errors are sticky, so continuing would make every later API call
 * fail mysteriously.  The original had the exit commented out, which left
 * the abort parameter silently ignored.
 */
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
	if (code != cudaSuccess) {
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}
/* Multiplier table, indexed by the low 4 bits of the current value. */
__constant__ int64_t const_bArr[m_VALUES];
/* Modulus for the iteration: 4294967279 = 2^32 - 17 (named "big prime" by the author). */
__constant__ int64_t const_bigPrimeNumber = 4294967279;
/* The T_SIZE distinguished points every thread searches for. */
__constant__ int64_t const_cArr[T_SIZE]; 
/* Per-thread starting values, filled by the host before launch. */
__device__ int64_t global_gArr[G_VALUES]; 
/* Per-thread hit counters, incremented when a distinguished point is reached. */
__device__ int64_t global_rArr[G_VALUES];
/*
 * One thread per entry of global_gArr.  Each thread iterates
 *   x -> (const_bArr[x & 0xF] * x) % const_bigPrimeNumber
 * and increments global_rArr[idx] if the walk reaches one of the T_SIZE
 * values in const_cArr within NUM_ITERATIONS steps.
 *
 * Launch: 1-D grid, BLOCKS_IN_GRID x THREADS_PER_BLOCK covering G_VALUES.
 */
__global__ void searchElements()
{
	int idx = blockDim.x * blockIdx.x + threadIdx.x;

	if (idx < G_VALUES) {
		/* tmp must be 64-bit: values are reduced modulo a ~2^32 prime and
		 * do not fit in the original 32-bit int.  Products are formed in
		 * unsigned 64-bit arithmetic because two factors each below 2^32
		 * can exceed INT64_MAX before the reduction.  The table index is a
		 * plain mask of the low 4 bits -- equivalent to the original
		 * decimal_to_binary(), which malloc'd/freed a binary string in
		 * device code on every call. */
		int64_t g = global_gArr[idx];
		int64_t tmp = (int64_t)(((uint64_t)const_bArr[g & 0xF] * (uint64_t)g)
		                        % (uint64_t)const_bigPrimeNumber);
		if (contains(tmp, const_cArr, T_SIZE) == 1) {
			global_rArr[idx]++;
		} else {
			for (int64_t j = 1; j < NUM_ITERATIONS; j++) {
				tmp = (int64_t)(((uint64_t)const_bArr[tmp & 0xF] * (uint64_t)tmp)
				                % (uint64_t)const_bigPrimeNumber);
				if (contains(tmp, const_cArr, T_SIZE) == 1) {
					global_rArr[idx]++;
					break;
				}
			}
		}
	}
}
int main()
{
	int64_t bigPrimeNumber = 4294967279;
	int64_t i = 0;
	int64_t bArr[m_VALUES];	/* multiplier table, mirrored into const_bArr */
	int64_t T = (int64_t)pow((double)2, (double)T_EXP_VALUE);
	int64_t *dArr, *cArr;	/* cArr: distinguished points, mirrored into const_cArr */
	int64_t M = 0;
	int64_t *gArr;	/* starting values, one per GPU thread */
	int64_t tmp = 0;
	int64_t *rArr;	/* per-thread hit counters */

	cudaEvent_t start, stop;
	float gpuTime = 0.0f;

	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	srand(time(NULL));
	memset(bArr, 0, m_VALUES * sizeof(int64_t));

	dArr = (int64_t*)malloc(T * sizeof(int64_t));
	cArr = (int64_t*)malloc(T * sizeof(int64_t));
	gArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
	rArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
	/* bail out instead of memset-ing through a NULL pointer */
	if (dArr == NULL || cArr == NULL || gArr == NULL || rArr == NULL) {
		fprintf(stderr, "host allocation failed\n");
		return 1;
	}
	memset(dArr, 0, T * sizeof(int64_t));
	memset(cArr, 0, T * sizeof(int64_t));
	memset(gArr, 0, G_VALUES * sizeof(int64_t));
	memset(rArr, 0, G_VALUES * sizeof(int64_t));

	for (i = 0; i < m_VALUES; i++) 
		bArr[i] = MontgExp(2, rand(), bigPrimeNumber);

	for (i = 0; i < T; i++) {
		dArr[i] = MontgExp(2, rand(), bigPrimeNumber);
		cArr[i] = bArr[i]; 
	}
	/* advance every distinguished point M_VALUE steps along the iteration map */
	for (i = 0; i < T; i++) {
		M = 0;
		while (M != M_VALUE) {
			cArr[i] = (bArr[decimal_to_binary(cArr[i])] * cArr[i]) % bigPrimeNumber;
			M++;
		}
	}
	srand(time(NULL));
	for (i = 0; i < G_VALUES; i++) {
		tmp = rand();
		if (tmp != 0) {
			gArr[i] = rand();
		}
	}
	gpuErrchk( cudaMemcpyToSymbol(const_bArr, bArr, m_VALUES * sizeof(int64_t), 0, cudaMemcpyHostToDevice));
	gpuErrchk( cudaMemcpyToSymbol(const_cArr, cArr, T_SIZE * sizeof(int64_t), 0, cudaMemcpyHostToDevice));

	gpuErrchk( cudaMemcpyToSymbol(global_gArr, gArr, G_VALUES * sizeof(int64_t), 0, cudaMemcpyHostToDevice));
	gpuErrchk( cudaMemcpyToSymbol(global_rArr, rArr, G_VALUES * sizeof(int64_t), 0, cudaMemcpyHostToDevice));

	dim3 threads = dim3(THREADS_PER_BLOCK);
	dim3 blocks = dim3(BLOCKS_IN_GRID);

	/*
	 * Timing pattern: record start, launch, record stop, then synchronize
	 * on the stop event.  Kernel launches are asynchronous, so without
	 * cudaEventSynchronize(stop) the original called cudaEventElapsedTime
	 * while the kernel could still be running and got no valid time
	 * (cudaErrorNotReady).
	 */
	cudaEventRecord(start, 0);
	searchElements<<<blocks, threads>>> ();
	cudaEventRecord(stop, 0);
	gpuErrchk( cudaPeekAtLastError() );	/* catches launch-configuration errors */
	gpuErrchk( cudaEventSynchronize(stop) );	/* waits until the kernel has finished */
	cudaEventElapsedTime(&gpuTime, start, stop);
	printf("%.2f milliseconds\n", gpuTime);

	/* copy the per-thread counters back so the host can actually use the results */
	gpuErrchk( cudaMemcpyFromSymbol(rArr, global_rArr, G_VALUES * sizeof(int64_t), 0, cudaMemcpyDeviceToHost));

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	free(dArr);
	free(cArr);
	free(gArr);
	free(rArr);
	printf("Press any key...");
	getchar();
	return 0;
}
/*
 * Modular exponentiation: returns (base^exp) % mod for exp >= 0, by
 * recursive repeated squaring.
 *
 * All products are formed in unsigned 64-bit arithmetic: every reduced
 * operand is < mod, so for any mod up to 2^32 each product fits in a
 * uint64_t.  The original computed z*z (and base*z*z) in signed int64_t,
 * which overflows for the ~2^32 prime modulus used in this program.
 */
int64_t MontgExp(int64_t base, int64_t exp, int64_t mod)
{
    if (exp == 0) return 1;
    uint64_t z = (uint64_t)MontgExp(base, exp / 2, mod);
    uint64_t zz = (z * z) % (uint64_t)mod;
    if (exp % 2 == 0)
        return (int64_t)zz;
    /* reduce base first so the remaining product stays below 2^64 */
    uint64_t b = (uint64_t)(((base % mod) + mod) % mod);
    return (int64_t)((b * zz) % (uint64_t)mod);
}
/*
 * Returns the value of the low 4 bits of n (0..15).
 *
 * The original rendered n as a 32-character binary string on the heap,
 * copied the last four characters and parsed them back into an integer --
 * a malloc/free pair per call, which is especially costly inside device
 * code where every kernel thread hits the device heap.  The result is
 * exactly n & 0xF, computed here directly (identical for negative n too,
 * since the original likewise took the two's-complement low bits).
 */
__host__ __device__ int64_t decimal_to_binary(int64_t n)
{
   return n & 0xF;
}
/* Linear scan: returns 1 if num occurs anywhere in arr[0..size-1], else 0. */
__host__ __device__ int64_t contains(int64_t num, int64_t *arr, int64_t size)
{
	for (int64_t k = 0; k < size; k++) {
		if (arr[k] == num)
			return 1;	/* found -- no need to look further */
	}
	return 0;
}

Info about my device:

GeForce GT730M
Compute Capability 3.5
Global Memory 2 GB
Shared Memory Per Block 48 KB
Max Threads Per Block 1024
Number of multiprocessors 2
Max Threads Dim 1024 : 1024 : 64
Max Grid Dim 2*(10 ^ 9) : 65535 : 65535

The TDR (Timeout Detection and Recovery) delay in the registry is set to 60 seconds.

I can’t understand two things:

  1. How to measure the time my kernel needs to execute (i.e., the time until all of its work completes).
  2. How to accelerate my code.