error in aligning memory in easy code

Here 's code on C/CUDA

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <windows.h>
#include <ctype.h>
#include <stdint.h>

#define m_VALUES 16  
#define M_VALUE 1 
#define T_EXP_VALUE 3  
#define SIZE_BIT 32  
#define G_VALUES (int64_t)pow((double)2, (double)22) 
#define NUM_ITERATIONS (int64_t)pow((double)2, (double)14)  
#define T_SIZE 8

//typedef unsigned long long int64_t; 

int64_t MontgExp(int64_t base, int64_t exp, int64_t mod); 
 __host__ __device__ int64_t decimal_to_binary(int64_t); 
void extended_euclid(int64_t a, int64_t b, int64_t *x, int64_t *y, int64_t *d); 
__host__ __device__ int64_t contains(int64_t num, int64_t *arr, int64_t size); 


 
#define THREADS_PER_BLOCK 64
#define BLOCKS_IN_GRID 1024
#define gpuErrchk(ans) {gpuAssert((ans), __FILE__, __LINE__);}
#define MAX_TMP_ARR_SIZE 1024

void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
	if (code != cudaSuccess) {
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		//if (abort) exit(code);
	}
}
__constant__ int64_t const_bArr[m_VALUES];
__constant__ int64_t *const_bigPrimeNumber;
__global__ void searchElements(int64_t *gArr, int64_t *cArr, int64_t *rArr)
{
	int idx = blockDim.x * blockIdx.x + threadIdx.x;  
	int j = 0;
	int tmp = 0;

	if (idx < MAX_TMP_ARR_SIZE) {
		tmp = ( const_bArr[ decimal_to_binary( gArr[ idx ] ) ] * gArr[idx]) % (*const_bigPrimeNumber);
		if (contains(tmp, cArr, T_SIZE) == 1) 
			rArr[idx]++;
		else {
			for (j = 1; j < NUM_ITERATIONS; j++) {
				tmp = (const_bArr[decimal_to_binary(tmp)] * tmp) % (*const_bigPrimeNumber);
				if (contains(tmp, cArr, T_SIZE) == 1) {
					rArr[idx]++;
					break;
				}
			}
		}
	}
}
int main()
{
	int64_t bigPrimeNumber = 7;
	int64_t i = 0;
	int64_t bArr[m_VALUES]; 
	int64_t T = (int64_t)pow((double)2, (double)T_EXP_VALUE);
	int64_t *dArr, *cArr;  
	int64_t M = 0;
	int64_t *gArr;  
	int64_t R = 0;  
	int64_t tmp = 0;
	int k = 0;
	int64_t *rArr;
	 
	int64_t *gArr_tmp, *rArr_tmp;
	int64_t *gArr_tmp_dev, *rArr_tmp_dev;
	int flagExit = 0, old_i = 0, steps = 0, diff = 0;
	 
	int64_t *cArr_dev;
	cudaEvent_t start, stop;
	float gpuTime = 0.0f;

	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	srand(time(NULL));
	memset(bArr, 0, m_VALUES * sizeof(int64_t));

	dArr = (int64_t*)malloc(T * sizeof(int64_t));
	memset(dArr, 0, T * sizeof(int64_t));

	cArr = (int64_t*)malloc(T * sizeof(int64_t));
	memset(cArr, 0, T * sizeof(int64_t));

	gArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
	memset(gArr, 0, G_VALUES * sizeof(int64_t));

	rArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
	memset(rArr, 0, G_VALUES * sizeof(int64_t));
	 
	for (i = 0; i < m_VALUES; i++) 
		bArr[i] = MontgExp(2, rand(), bigPrimeNumber);
	
	 
	for (i = 0; i < T; i++) {
		dArr[i] = MontgExp(2, rand(), bigPrimeNumber);
		cArr[i] = bArr[i];  
	}
	for (i = 0; i < T; i++) {
		M = 0;
		while (M != M_VALUE) {
			cArr[i] = (bArr[decimal_to_binary(cArr[i])] * cArr[i]) % bigPrimeNumber;
			M++;
		}
	}
	 
	srand(time(NULL));
	 
	for (i = 0; i < G_VALUES; i++) {
		tmp = rand();
		if (tmp != 0) {
			gArr[i] = rand();
		}
	}
	 
	gpuErrchk( cudaMemcpyToSymbol(const_bArr, bArr, m_VALUES * sizeof(int64_t), 0, cudaMemcpyHostToDevice));
	gpuErrchk( cudaMemcpyToSymbol(const_bigPrimeNumber, &bigPrimeNumber, sizeof(int64_t), 0, cudaMemcpyHostToDevice));

	cudaMalloc ( (void**)&cArr_dev, T * sizeof(int64_t));
	gpuErrchk ( cudaMemcpy(cArr_dev, cArr, T * sizeof(int64_t), cudaMemcpyHostToDevice));

	while (true) {
		diff = G_VALUES - steps * old_i;
		if (diff < MAX_TMP_ARR_SIZE) {
			flagExit = 1;
			break;
		}
		gArr_tmp = (int64_t*)malloc(MAX_TMP_ARR_SIZE * sizeof(int64_t));
		rArr_tmp = (int64_t*)malloc(MAX_TMP_ARR_SIZE * sizeof(int64_t));

		for (i = old_i, k = 0; i < MAX_TMP_ARR_SIZE; i++, k++) {
			gArr_tmp[k] = gArr[i];
			rArr_tmp[k] = rArr[i];
		}
		old_i = i;
		cudaMalloc ( (void**)&gArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t));
		cudaMalloc ( (void**)&rArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t));

		gpuErrchk ( cudaMemcpy(gArr_tmp_dev, gArr_tmp, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyHostToDevice));
		gpuErrchk ( cudaMemcpy(rArr_tmp_dev, rArr_tmp, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyHostToDevice));

		dim3 threads = dim3(THREADS_PER_BLOCK);
		dim3 blocks = dim3(BLOCKS_IN_GRID);

		cudaEventRecord(start, 0);
		searchElements<<<blocks, threads>>>(gArr_tmp_dev, cArr_dev, rArr_tmp_dev);
		gpuErrchk( cudaPeekAtLastError() );
		gpuErrchk( cudaDeviceSynchronize() );
		cudaEventRecord(stop, 0);
		gpuErrchk( cudaMemcpy(rArr_tmp, rArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyDeviceToHost) );
		gpuErrchk( cudaEventSynchronize(stop));

		for (k = 0; k < MAX_TMP_ARR_SIZE; k++)
			R += rArr_tmp[k];
		printf("%.2f milliseconds\n", gpuTime);

		cudaFree(gArr_tmp_dev);
		cudaFree(rArr_tmp_dev); 
		steps++;
	}
	printf("%.2f milliseconds\n", gpuTime);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	printf("Press any key...");
	getchar();
	return 0;

}
int64_t MontgExp(int64_t base, int64_t exp, int64_t mod)
{
    int64_t z = 0;
    if (exp == 0) return 1;
    z = MontgExp(base, exp / 2, mod);
    if (exp % 2 == 0)
        return (z*z) % mod;
    else
        return (base*z*z) % mod;
}
__host__ __device__ int64_t decimal_to_binary(int64_t n)
{
   int c = 0, d = 0, count;
   char *pointer, charLSB[5];
   int64_t LSB = 0, i = 0;


   count = 0;
   pointer = (char*)malloc(SIZE_BIT+1);
   memset(pointer, 0, (SIZE_BIT + 1) * sizeof(char));
   //if ( pointer == NULL )
   //   exit(EXIT_FAILURE);
   
   memset(charLSB, 0, 5 * sizeof(char));
   for ( c = SIZE_BIT - 1 ; c >= 0 ; c-- ) {
      d = n >> c;
 
      if ( d & 1 )
         *(pointer+count) = 1 + '0';
      else
         *(pointer+count) = 0 + '0';
 
      count++;
   }
   *(pointer+count) = '

#include “cuda_runtime.h”
#include “device_launch_parameters.h”

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <windows.h>
#include <ctype.h>
#include <stdint.h>

#define m_VALUES 16
#define M_VALUE 1
#define T_EXP_VALUE 3
#define SIZE_BIT 32
#define G_VALUES (int64_t)pow((double)2, (double)22)
#define NUM_ITERATIONS (int64_t)pow((double)2, (double)14)
#define T_SIZE 8

//typedef unsigned long long int64_t;

int64_t MontgExp(int64_t base, int64_t exp, int64_t mod);
host device int64_t decimal_to_binary(int64_t);
void extended_euclid(int64_t a, int64_t b, int64_t *x, int64_t *y, int64_t *d);
host device int64_t contains(int64_t num, int64_t *arr, int64_t size);

#define THREADS_PER_BLOCK 64
#define BLOCKS_IN_GRID 1024
#define gpuErrchk(ans) {gpuAssert((ans), FILE, LINE);}
#define MAX_TMP_ARR_SIZE 1024

void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess) {
fprintf(stderr, “GPUassert: %s %s %d\n”, cudaGetErrorString(code), file, line);
//if (abort) exit(code);
}
}
constant int64_t const_bArr[m_VALUES];
constant int64_t *const_bigPrimeNumber;
global void searchElements(int64_t *gArr, int64_t *cArr, int64_t *rArr)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int j = 0;
int tmp = 0;

if (idx < MAX_TMP_ARR_SIZE) {
	tmp = ( const_bArr[ decimal_to_binary( gArr[ idx ] ) ] * gArr[idx]) % (*const_bigPrimeNumber);
	if (contains(tmp, cArr, T_SIZE) == 1) 
		rArr[idx]++;
	else {
		for (j = 1; j < NUM_ITERATIONS; j++) {
			tmp = (const_bArr[decimal_to_binary(tmp)] * tmp) % (*const_bigPrimeNumber);
			if (contains(tmp, cArr, T_SIZE) == 1) {
				rArr[idx]++;
				break;
			}
		}
	}
}

}
int main()
{
int64_t bigPrimeNumber = 7;
int64_t i = 0;
int64_t bArr[m_VALUES];
int64_t T = (int64_t)pow((double)2, (double)T_EXP_VALUE);
int64_t *dArr, *cArr;
int64_t M = 0;
int64_t *gArr;
int64_t R = 0;
int64_t tmp = 0;
int k = 0;
int64_t *rArr;

int64_t *gArr_tmp, *rArr_tmp;
int64_t *gArr_tmp_dev, *rArr_tmp_dev;
int flagExit = 0, old_i = 0, steps = 0, diff = 0;
 
int64_t *cArr_dev;
cudaEvent_t start, stop;
float gpuTime = 0.0f;

cudaEventCreate(&start);
cudaEventCreate(&stop);
srand(time(NULL));
memset(bArr, 0, m_VALUES * sizeof(int64_t));

dArr = (int64_t*)malloc(T * sizeof(int64_t));
memset(dArr, 0, T * sizeof(int64_t));

cArr = (int64_t*)malloc(T * sizeof(int64_t));
memset(cArr, 0, T * sizeof(int64_t));

gArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
memset(gArr, 0, G_VALUES * sizeof(int64_t));

rArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
memset(rArr, 0, G_VALUES * sizeof(int64_t));
 
for (i = 0; i < m_VALUES; i++) 
	bArr[i] = MontgExp(2, rand(), bigPrimeNumber);

 
for (i = 0; i < T; i++) {
	dArr[i] = MontgExp(2, rand(), bigPrimeNumber);
	cArr[i] = bArr[i];  
}
for (i = 0; i < T; i++) {
	M = 0;
	while (M != M_VALUE) {
		cArr[i] = (bArr[decimal_to_binary(cArr[i])] * cArr[i]) % bigPrimeNumber;
		M++;
	}
}
 
srand(time(NULL));
 
for (i = 0; i < G_VALUES; i++) {
	tmp = rand();
	if (tmp != 0) {
		gArr[i] = rand();
	}
}
 
gpuErrchk( cudaMemcpyToSymbol(const_bArr, bArr, m_VALUES * sizeof(int64_t), 0, cudaMemcpyHostToDevice));
gpuErrchk( cudaMemcpyToSymbol(const_bigPrimeNumber, &bigPrimeNumber, sizeof(int64_t), 0, cudaMemcpyHostToDevice));

cudaMalloc ( (void**)&cArr_dev, T * sizeof(int64_t));
gpuErrchk ( cudaMemcpy(cArr_dev, cArr, T * sizeof(int64_t), cudaMemcpyHostToDevice));

while (true) {
	diff = G_VALUES - steps * old_i;
	if (diff < MAX_TMP_ARR_SIZE) {
		flagExit = 1;
		break;
	}
	gArr_tmp = (int64_t*)malloc(MAX_TMP_ARR_SIZE * sizeof(int64_t));
	rArr_tmp = (int64_t*)malloc(MAX_TMP_ARR_SIZE * sizeof(int64_t));

	for (i = old_i, k = 0; i < MAX_TMP_ARR_SIZE; i++, k++) {
		gArr_tmp[k] = gArr[i];
		rArr_tmp[k] = rArr[i];
	}
	old_i = i;
	cudaMalloc ( (void**)&gArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t));
	cudaMalloc ( (void**)&rArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t));

	gpuErrchk ( cudaMemcpy(gArr_tmp_dev, gArr_tmp, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyHostToDevice));
	gpuErrchk ( cudaMemcpy(rArr_tmp_dev, rArr_tmp, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyHostToDevice));

	dim3 threads = dim3(THREADS_PER_BLOCK);
	dim3 blocks = dim3(BLOCKS_IN_GRID);

	cudaEventRecord(start, 0);
	searchElements<<<blocks, threads>>>(gArr_tmp_dev, cArr_dev, rArr_tmp_dev);
	gpuErrchk( cudaPeekAtLastError() );
	gpuErrchk( cudaDeviceSynchronize() );
	cudaEventRecord(stop, 0);
	gpuErrchk( cudaMemcpy(rArr_tmp, rArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyDeviceToHost) );
	gpuErrchk( cudaEventSynchronize(stop));

	for (k = 0; k < MAX_TMP_ARR_SIZE; k++)
		R += rArr_tmp[k];
	printf("%.2f milliseconds\n", gpuTime);

	cudaFree(gArr_tmp_dev);
	cudaFree(rArr_tmp_dev); 
	steps++;
}
printf("%.2f milliseconds\n", gpuTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("Press any key...");
getchar();
return 0;

}
int64_t MontgExp(int64_t base, int64_t exp, int64_t mod)
{
int64_t z = 0;
if (exp == 0) return 1;
z = MontgExp(base, exp / 2, mod);
if (exp % 2 == 0)
return (zz) % mod;
else
return (base
z*z) % mod;
}
host device int64_t decimal_to_binary(int64_t n)
{
int c = 0, d = 0, count;
char *pointer, charLSB[5];
int64_t LSB = 0, i = 0;

count = 0;
pointer = (char*)malloc(SIZE_BIT+1);
memset(pointer, 0, (SIZE_BIT + 1) * sizeof(char));
//if ( pointer == NULL )
// exit(EXIT_FAILURE);

memset(charLSB, 0, 5 * sizeof(char));
for ( c = SIZE_BIT - 1 ; c >= 0 ; c-- ) {
d = n >> c;

  if ( d & 1 )
     *(pointer+count) = 1 + '0';
  else
     *(pointer+count) = 0 + '0';

  count++;

}
*(pointer+count) = ‘\0’;
//strncpy(charLSB, pointer + (32 - 4), 4);
for (i = 0 ; i < 4; i++)
charLSB[i] = pointer[(32 - 4) + i];
charLSB[4] = ‘\0’;
for (i = 0; i < 5; i++) {
if (charLSB[i] == ‘0’)
LSB *= 2;
if (charLSB[i] == ‘1’)
LSB = 2 * LSB + 1;
}
free(pointer);
return LSB;
}
void extended_euclid(int64_t a, int64_t b, int64_t *x, int64_t *y, int64_t *d)
{

int64_t q = 0, r = 0, x1 = 0, x2 = 0, y1 = 0, y2 = 0;

if (b == 0) {
*d = a, *x = 1, *y = 0;
return;
}

x2 = 1, x1 = 0, y2 = 0, y1 = 1;

while (b > 0) {
q = a / b, r = a - q * b;
*x = x2 - q * x1, *y = y2 - q * y1;
a = b, b = r;
x2 = x1, x1 = *x, y2 = y1, y1 = *y;
}

*d = a, *x = x2, *y = y2;
}
host device int64_t contains(int64_t num, int64_t arr, int64_t size)
{
int64_t i = 0, result = 0;
for (i = 0; i < size; i++)
if (
(arr + i) == num) {
result = 1;
break;
}
return result;
}

';
   //strncpy(charLSB, pointer + (32 - 4), 4);
   for (i = 0 ; i < 4; i++)
	   charLSB[i] = pointer[(32 - 4) + i];
   charLSB[4] = '

#include “cuda_runtime.h”
#include “device_launch_parameters.h”

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <windows.h>
#include <ctype.h>
#include <stdint.h>

#define m_VALUES 16
#define M_VALUE 1
#define T_EXP_VALUE 3
#define SIZE_BIT 32
#define G_VALUES (int64_t)pow((double)2, (double)22)
#define NUM_ITERATIONS (int64_t)pow((double)2, (double)14)
#define T_SIZE 8

//typedef unsigned long long int64_t;

int64_t MontgExp(int64_t base, int64_t exp, int64_t mod);
host device int64_t decimal_to_binary(int64_t);
void extended_euclid(int64_t a, int64_t b, int64_t *x, int64_t *y, int64_t *d);
host device int64_t contains(int64_t num, int64_t *arr, int64_t size);

#define THREADS_PER_BLOCK 64
#define BLOCKS_IN_GRID 1024
#define gpuErrchk(ans) {gpuAssert((ans), FILE, LINE);}
#define MAX_TMP_ARR_SIZE 1024

void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess) {
fprintf(stderr, “GPUassert: %s %s %d\n”, cudaGetErrorString(code), file, line);
//if (abort) exit(code);
}
}
constant int64_t const_bArr[m_VALUES];
constant int64_t *const_bigPrimeNumber;
global void searchElements(int64_t *gArr, int64_t *cArr, int64_t *rArr)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int j = 0;
int tmp = 0;

if (idx < MAX_TMP_ARR_SIZE) {
	tmp = ( const_bArr[ decimal_to_binary( gArr[ idx ] ) ] * gArr[idx]) % (*const_bigPrimeNumber);
	if (contains(tmp, cArr, T_SIZE) == 1) 
		rArr[idx]++;
	else {
		for (j = 1; j < NUM_ITERATIONS; j++) {
			tmp = (const_bArr[decimal_to_binary(tmp)] * tmp) % (*const_bigPrimeNumber);
			if (contains(tmp, cArr, T_SIZE) == 1) {
				rArr[idx]++;
				break;
			}
		}
	}
}

}
int main()
{
int64_t bigPrimeNumber = 7;
int64_t i = 0;
int64_t bArr[m_VALUES];
int64_t T = (int64_t)pow((double)2, (double)T_EXP_VALUE);
int64_t *dArr, *cArr;
int64_t M = 0;
int64_t *gArr;
int64_t R = 0;
int64_t tmp = 0;
int k = 0;
int64_t *rArr;

int64_t *gArr_tmp, *rArr_tmp;
int64_t *gArr_tmp_dev, *rArr_tmp_dev;
int flagExit = 0, old_i = 0, steps = 0, diff = 0;
 
int64_t *cArr_dev;
cudaEvent_t start, stop;
float gpuTime = 0.0f;

cudaEventCreate(&start);
cudaEventCreate(&stop);
srand(time(NULL));
memset(bArr, 0, m_VALUES * sizeof(int64_t));

dArr = (int64_t*)malloc(T * sizeof(int64_t));
memset(dArr, 0, T * sizeof(int64_t));

cArr = (int64_t*)malloc(T * sizeof(int64_t));
memset(cArr, 0, T * sizeof(int64_t));

gArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
memset(gArr, 0, G_VALUES * sizeof(int64_t));

rArr = (int64_t*)malloc(G_VALUES * sizeof(int64_t));
memset(rArr, 0, G_VALUES * sizeof(int64_t));
 
for (i = 0; i < m_VALUES; i++) 
	bArr[i] = MontgExp(2, rand(), bigPrimeNumber);

 
for (i = 0; i < T; i++) {
	dArr[i] = MontgExp(2, rand(), bigPrimeNumber);
	cArr[i] = bArr[i];  
}
for (i = 0; i < T; i++) {
	M = 0;
	while (M != M_VALUE) {
		cArr[i] = (bArr[decimal_to_binary(cArr[i])] * cArr[i]) % bigPrimeNumber;
		M++;
	}
}
 
srand(time(NULL));
 
for (i = 0; i < G_VALUES; i++) {
	tmp = rand();
	if (tmp != 0) {
		gArr[i] = rand();
	}
}
 
gpuErrchk( cudaMemcpyToSymbol(const_bArr, bArr, m_VALUES * sizeof(int64_t), 0, cudaMemcpyHostToDevice));
gpuErrchk( cudaMemcpyToSymbol(const_bigPrimeNumber, &bigPrimeNumber, sizeof(int64_t), 0, cudaMemcpyHostToDevice));

cudaMalloc ( (void**)&cArr_dev, T * sizeof(int64_t));
gpuErrchk ( cudaMemcpy(cArr_dev, cArr, T * sizeof(int64_t), cudaMemcpyHostToDevice));

while (true) {
	diff = G_VALUES - steps * old_i;
	if (diff < MAX_TMP_ARR_SIZE) {
		flagExit = 1;
		break;
	}
	gArr_tmp = (int64_t*)malloc(MAX_TMP_ARR_SIZE * sizeof(int64_t));
	rArr_tmp = (int64_t*)malloc(MAX_TMP_ARR_SIZE * sizeof(int64_t));

	for (i = old_i, k = 0; i < MAX_TMP_ARR_SIZE; i++, k++) {
		gArr_tmp[k] = gArr[i];
		rArr_tmp[k] = rArr[i];
	}
	old_i = i;
	cudaMalloc ( (void**)&gArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t));
	cudaMalloc ( (void**)&rArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t));

	gpuErrchk ( cudaMemcpy(gArr_tmp_dev, gArr_tmp, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyHostToDevice));
	gpuErrchk ( cudaMemcpy(rArr_tmp_dev, rArr_tmp, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyHostToDevice));

	dim3 threads = dim3(THREADS_PER_BLOCK);
	dim3 blocks = dim3(BLOCKS_IN_GRID);

	cudaEventRecord(start, 0);
	searchElements<<<blocks, threads>>>(gArr_tmp_dev, cArr_dev, rArr_tmp_dev);
	gpuErrchk( cudaPeekAtLastError() );
	gpuErrchk( cudaDeviceSynchronize() );
	cudaEventRecord(stop, 0);
	gpuErrchk( cudaMemcpy(rArr_tmp, rArr_tmp_dev, MAX_TMP_ARR_SIZE * sizeof(int64_t), cudaMemcpyDeviceToHost) );
	gpuErrchk( cudaEventSynchronize(stop));

	for (k = 0; k < MAX_TMP_ARR_SIZE; k++)
		R += rArr_tmp[k];
	printf("%.2f milliseconds\n", gpuTime);

	cudaFree(gArr_tmp_dev);
	cudaFree(rArr_tmp_dev); 
	steps++;
}
printf("%.2f milliseconds\n", gpuTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("Press any key...");
getchar();
return 0;

}
int64_t MontgExp(int64_t base, int64_t exp, int64_t mod)
{
int64_t z = 0;
if (exp == 0) return 1;
z = MontgExp(base, exp / 2, mod);
if (exp % 2 == 0)
return (zz) % mod;
else
return (base
z*z) % mod;
}
host device int64_t decimal_to_binary(int64_t n)
{
int c = 0, d = 0, count;
char *pointer, charLSB[5];
int64_t LSB = 0, i = 0;

count = 0;
pointer = (char*)malloc(SIZE_BIT+1);
memset(pointer, 0, (SIZE_BIT + 1) * sizeof(char));
//if ( pointer == NULL )
// exit(EXIT_FAILURE);

memset(charLSB, 0, 5 * sizeof(char));
for ( c = SIZE_BIT - 1 ; c >= 0 ; c-- ) {
d = n >> c;

  if ( d & 1 )
     *(pointer+count) = 1 + '0';
  else
     *(pointer+count) = 0 + '0';

  count++;

}
*(pointer+count) = ‘\0’;
//strncpy(charLSB, pointer + (32 - 4), 4);
for (i = 0 ; i < 4; i++)
charLSB[i] = pointer[(32 - 4) + i];
charLSB[4] = ‘\0’;
for (i = 0; i < 5; i++) {
if (charLSB[i] == ‘0’)
LSB *= 2;
if (charLSB[i] == ‘1’)
LSB = 2 * LSB + 1;
}
free(pointer);
return LSB;
}
void extended_euclid(int64_t a, int64_t b, int64_t *x, int64_t *y, int64_t *d)
{

int64_t q = 0, r = 0, x1 = 0, x2 = 0, y1 = 0, y2 = 0;

if (b == 0) {
*d = a, *x = 1, *y = 0;
return;
}

x2 = 1, x1 = 0, y2 = 0, y1 = 1;

while (b > 0) {
q = a / b, r = a - q * b;
*x = x2 - q * x1, *y = y2 - q * y1;
a = b, b = r;
x2 = x1, x1 = *x, y2 = y1, y1 = *y;
}

*d = a, *x = x2, *y = y2;
}
host device int64_t contains(int64_t num, int64_t arr, int64_t size)
{
int64_t i = 0, result = 0;
for (i = 0; i < size; i++)
if (
(arr + i) == num) {
result = 1;
break;
}
return result;
}

';
   for (i = 0; i < 5; i++) {
	   if (charLSB[i] == '0')
		   LSB *= 2;
	   if (charLSB[i] == '1')
		   LSB = 2 * LSB + 1;
   }
   free(pointer);
   return LSB;
}
void extended_euclid(int64_t a, int64_t b, int64_t *x, int64_t *y, int64_t *d)
{

  int64_t q = 0, r = 0, x1 = 0, x2 = 0, y1 = 0, y2 = 0;

  if (b == 0) {
    *d = a, *x = 1, *y = 0;
    return;
  }

  x2 = 1, x1 = 0, y2 = 0, y1 = 1;

  while (b > 0) {
    q = a / b, r = a - q * b;
    *x = x2 - q * x1, *y = y2 - q * y1;
    a = b, b = r;
    x2 = x1, x1 = *x, y2 = y1, y1 = *y;
  }

  *d = a, *x = x2, *y = y2;
}
__host__ __device__ int64_t contains(int64_t num, int64_t *arr, int64_t size)
{
	int64_t i = 0, result = 0;
	for (i = 0; i < size; i++) 
		if (*(arr + i) == num) {
			result = 1;
			break;
		}
	return result;
}

I compile and debug in VS2012 Professional, CUDA Toolkit 9.1. Info about my device

GeForce GT730M
Compute Capability 3.5
Global Memory 2 GB
Shared Memory Per Block 48 KB
Max Threads Per Block 1024
Number of multiprocessors 2
Max Threads Dim 1024 : 1024 : 64
Max Grid Dim 2*(10 ^ 9) : 65535 : 65535

well, the problem is that after 1 itteration of while i have in console after calling kernel following errors:

GPUassert: misaligned address kernel.cu 163
GPUassert: misaligned address kernel.cu 165
GPUassert: misaligned address kernel.cu 166

Can’t understand how to fix them.

If you compile with -lineinfo and use cuda-memcheck, you can localize such errors to a single line of kernel code that is producing them.

The problem is in your use of this:

constant int64_t *const_bigPrimeNumber;

not sure what you’re trying to accomplish, but your usage there is broken. It’s taking the value of 7 that you copied there, interpreting it as a pointer (which is what you defined it as), then using that pointer address of 7 to dereference into GPU memory. and that is broken.

If you’re simply trying to pass the value of 7 to the kernel code, then define it as:

constant int64_t const_bigPrimeNumber;

and use it directly. (I’m not suggesting that one single change will magically fix your code. You’ll have to also modify your kernel code where you are using that number.)

If you actually intended it to be a pointer, and somehow you thought that the value of 7 was the right way to initialize that pointer, than your thought process is broken.

Oh, thank you very much! It is my fault and inattention.

I changed it and my kernel function code, now everything works fine.

Can you say, how i use cuda-memcheck or something like this useful tool in VS, without linux?

compile with -lineinfo

That is a project option in a VS CUDA project. Look around at the various project settings. or you could just build a debug version of the project, which will also include line info.

Once you’ve compiled the project, you’ll need to run it from a windows command prompt like this:

cuda-memcheck my_app.exe

From there, you can follow this example for interpreting the output:

https://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218#27278218