CUDA Prime factorization

Hello! I’m trying to implement a simple program that computes a prime factorization using CUDA (I hope this is the right section). I’m starting with a simple algorithm: trial division driven by a Sieve of Eratosthenes (the sieve runs on the CPU and the trial division on the GPU). The sieve gives me an array in which the entries at prime indices are 1 and all other entries are 0.
I only recently learned to use CUDA, so I’m having a hard time. When I enter a number, the output window closes with an error code, and from what I can see in the debugger, execution never even enters the “trial” kernel.
Also, what launch configuration (number of blocks and threads per block) should I put in the <<< >>> notation? Consider that I have a GTX 1050 GPU.
Here’s my code, sorry for the bad mistakes (as I said I’m still a novice) and thank you in advance!

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
#include <time.h>
#define SIZE 512

/*
 * Trial-division kernel: tests every sieve-marked candidate divisor of
 * *number and prints one factor pair for each candidate that divides it.
 *
 * Parameters (all must be device pointers):
 *   prime  - sieve flags; a non-zero prime[i] marks i as a candidate divisor.
 *            Must hold at least *length + 1 entries, because index *length
 *            itself is read.
 *   number - the value to factor.
 *   length - largest candidate to test, inclusive (floor(sqrt(number))).
 *
 * Launch: any 1-D grid/block shape is valid. Candidates are distributed with
 * a grid-stride loop, so <<<1, 1>>> works (serially) and larger launches
 * split the range without testing a candidate twice. No shared memory used.
 *
 * Output is produced with device-side printf; the host has to synchronize
 * with the device (e.g. cudaDeviceSynchronize) before exiting to see it.
 */
__global__ void trial(long int *prime, long int *number, long int *length) {
	// Read the scalar inputs once instead of dereferencing on every test.
	const long int n = *number;
	const long int limit = *length;

	// Flat global index and total thread count, widened before multiplying.
	const long int first = (long int)blockIdx.x * blockDim.x + threadIdx.x;
	const long int stride = (long int)gridDim.x * blockDim.x;

	// The bound is inclusive: when n is a perfect square p*p, limit == p and
	// p itself has to be tested.
	for (long int val = first; val <= limit; val += stride) {
		// 0 and 1 are never useful divisors (and n % 0 is undefined).
		if (val < 2 || !prime[val])
			continue;
		if (n % val != 0)
			continue;

		// Comparing the cofactor avoids the overflow-prone val * val and
		// guarantees a perfect square is reported exactly once.
		const long int cofactor = n / val;
		if (cofactor == val)
			printf("\nPrime factors are %ld and %ld\n", val, val);
		else
			// Single printf so concurrent threads cannot interleave the two
			// halves of one result line.
			printf("\nPrime factors are %ld and %ld.\n", val, cofactor);
	}
}


void main()
	clock_t start, end;
	double tempo;
	start = clock();

	long int *d_length;
	long int *prime;
	long int *d_prime;
	int n = 2;
	long int elim;
	long int number;
	long int *d_number;

	printf("Enter number to factorize: ");
	scanf("%d", &number);

	long int length = floor(sqrt(number));

	prime = (long int *)malloc(SIZE * sizeof(long int));
	cudaMalloc((void**) &d_prime, SIZE * sizeof(long int));
	cudaMalloc((void**) &d_number, sizeof(long int));
	cudaMalloc((void**) &d_length, sizeof(long int));

        //sieve of Eratosthenes
	for (int i = 0; i < length; i++)
		prime[i] = 1;

	while (n <= length)
		if (prime[n] == 1)
			elim = n + n;
			while (elim <= length)
				prime[elim] = 0;
				elim += n;

	cudaMemcpy(d_prime, &prime, SIZE * sizeof(long int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_number, &number, sizeof(long int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_length, &length, sizeof(long int), cudaMemcpyHostToDevice);

	trial << <1, 1 >> > (d_prime, d_number, d_length);