GTX1060 code to Jetson tx2

Hello,

I'm a newcomer to Jetson TX2 programming and I have some questions.

Why can't I run a kernel with 1024 threads per block (for example `__global__ void napl_index`)?

Why is my function that computes the distance between two points literally 1000 times slower than on a GTX 1060 3GB?

Also, if you know a better algorithm for computing the distances between every pair of points lying on the hyperbolas, please let me know.

Thank you very much for all ideas

// my code for 1060
// CUDA runtime and launch-configuration declarations.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// NOTE(review): defining __CUDACC_RTC__ / __CUDACC__ by hand is only an IDE
// IntelliSense workaround so __syncthreads()/atomicAdd resolve in the editor;
// nvcc defines these itself, and the underscores were stripped by the forum
// paste.  Guarded so the real compiler's definitions are never overridden.
#ifndef __CUDACC_RTC__
#define __CUDACC_RTC__
#endif // !__CUDACC_RTC__
#ifndef __CUDACC__
#define __CUDACC__
#endif // !__CUDACC__
#include <device_functions.h>   // duplicated include removed

#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <chrono>    // high_resolution_clock (header name was lost in the paste)
#include <iostream>  // presumably the second lost include -- TODO confirm

using namespace std::chrono;
using namespace std;

#define pi 3.1415926535897932
#define THREADS_PER_BLOCK 1024
#define THREDS_PER_BLOCK_XY 32
// duplicated using/#define lines from the paste removed
// Output file handle ("soubor" is Czech for "file"); opened in main().
FILE* soubor;
// A heap-allocated sequence of parameter values with its length.
struct Sseq
{
int n;       // number of elements in seq
double *seq; // array of n values; ownership passes to the caller of fillt()
};

// One hyperbola branch as parallel coordinate arrays: (x[i], y[i]) is a point.
struct Shyp
{
double *x;
double *y;
};

// Intersection candidate: a point and the minimal pair distance found there.
struct Sprus
{
double x;
double y;
double prus; // distance value at the reported point
};

// Three focus points (f1, f2, f3) and the target distance differences
// between the point pairs; these define the two hyperbola branches.
double f1x = 50;
double f1y = 50;
double f2x = 150;
double f2y = 30;
double f3x = 300;
double f3y = 70;
double d12 = 40;
double d23 = 60;

// Half focal distance of each hyperbola (the '*' after 0.5 was eaten by the
// forum's markdown; restored here).
double ca = 0.5 * sqrt(pow(f2y - f1y, 2) + pow(f2x - f1x, 2));
double cb = 0.5 * sqrt(pow(f3y - f2y, 2) + pow(f3x - f2x, 2));

// Semi-major (a) and semi-minor (b) axes of the two hyperbolas.
double aa = d12 / 2;
double ab = d23 / 2;
double ba = sqrt(pow(ca, 2) - pow(aa, 2));
double bb = sqrt(pow(cb, 2) - pow(ab, 2));

// Rotation cos/sin terms, filled in main() via rot_fill().
double rotc1a = 0;
double rots1a = 0;
double rotc1b = 0;
double rots1b = 0;

// Sampling resolution of the parameter sequence.
double rozlis = 0.01;

// Parameter sequence produced by fillt(), shared by both branches.
double* t1a;

// Debug helper: prints the n entries of seq to stdout, one per line.
// (The printf format string had smart quotes from the forum paste; restored
// to plain '"' so the code compiles.)
void vypis_seq(double * seq, int n)
{
	printf("\n");
	for (int i = 0; i < n; i++) {
		printf("seq%d= %.9f \n", i, seq[i]);
	}
	printf("\n");
}
// Builds the parameter sequence for the hyperbola branches:
//   pi/2 + k*rozlis (k = 1, 2, ...) up to and including the first value
//   >= pi - rozlis, followed by
//   pi + k*rozlis up to and including the first value >= 3*pi/2 - 2*rozlis.
// The caller owns ret.seq and must free() it.
// The original grew the array one element at a time with a malloc/copy/free
// round-trip per element (O(n^2) work); this version uses an amortized
// doubling buffer while producing bit-identical values in the same order.
struct Sseq fillt(double rozlis) {
	Sseq ret;
	int count = 0;
	int capacity = 16;
	double* sekvence = (double*)malloc(capacity * sizeof(double));
	double prvek;
	double prirustek;

	// First branch of parameter values.
	prirustek = rozlis;
	do {
		prvek = pi / 2 + prirustek;
		if (count == capacity) {
			capacity *= 2;
			sekvence = (double*)realloc(sekvence, capacity * sizeof(double));
		}
		sekvence[count++] = prvek;
		prirustek = prirustek + rozlis;
	} while (prvek < pi - rozlis);

	// Second branch, restarting the increment.
	prirustek = rozlis;
	do {
		prvek = pi + prirustek;
		if (count == capacity) {
			capacity *= 2;
			sekvence = (double*)realloc(sekvence, capacity * sizeof(double));
		}
		sekvence[count++] = prvek;
		prirustek += rozlis;
	} while (prvek < (3 * pi / 2) - 2 * rozlis);

	ret.n = count;
	ret.seq = sekvence;
	return ret;
}
// For each parameter value seq[i], returns a freshly malloc'ed array with
// x[i] = y / cos(seq[i]) (the secant form of the hyperbola's x coordinate).
// Caller owns the returned buffer.
double* hyp_br_x(double *seq, double y, int n)
{
	double *out = (double*)malloc(n * sizeof(double));

	int i = 0;
	while (i < n)
	{
		out[i] = y / cos(seq[i]);
		++i;
	}
	return out;
}

// For each parameter value seq[i], returns a freshly malloc'ed array with
// out[i] = y * tan(seq[i]) (the tangent form of the hyperbola's y coordinate).
// Caller owns the returned buffer.
double* hyp_br_y(double *seq, double y, int n)
{
	double *out = (double*)malloc(n * sizeof(double));

	for (int k = 0; k < n; ++k)
	{
		out[k] = y * tan(seq[k]);
	}
	return out;
}

// Rotation coefficient for aligning a hyperbola with its focal axis:
// (f1 - f2) / (2 * c), where c is the half focal distance.
double rot_fill(double f1, double f2, double c)
{
	return (f1 - f2) / (2.0 * c);
}

// Rotates and translates the canonical hyperbola x coordinates into world
// space: result[i] = midpoint(f1, f2) + X[i]*rotc - Y[i]*rots.
// Returns a freshly malloc'ed array of n values; caller owns it.
double * x_hyp(double* X, double* Y, double f1, double f2, int n, double rotc, double rots)
{
	double * out = (double*)malloc(n * sizeof(double));
	const double mid = (f1 + f2) / 2.0;

	for (int k = 0; k < n; ++k)
	{
		out[k] = mid + X[k] * rotc - Y[k] * rots;
	}

	return out;
}

// Rotates and translates the canonical hyperbola y coordinates into world
// space: result[i] = midpoint(f1, f2) + X[i]*rots + Y[i]*rotc.
// Returns a freshly malloc'ed array of n values; caller owns it.
double * y_hyp(double* X, double* Y, double f1, double f2, int n, double rotc, double rots)
{
	double * out = (double*)malloc(n * sizeof(double));
	const double mid = (f1 + f2) / 2.0;

	for (int k = 0; k < n; ++k)
	{
		out[k] = mid + X[k] * rots + Y[k] * rotc;
	}

	return out;
}

// Fills index[0..n-1] with its own flat thread index (identity permutation),
// used later to track which matrix element a reduced minimum came from.
// Expects a 1-D launch with at least n threads total.
__global__ void napl_index(int* index, int n)
{
	int i = threadIdx.x + blockIdx.x * blockDim.x;
	// Guard must be >=, not >: the original wrote index[n], one past the end.
	if (i >= n) return;
	index[i] = i;
}

// Computes all pairwise distances between a point on hyperbola B and a point
// on hyperbola A: ret[sloupec*n + radek] = |B[sloupec] - A[radek]|.
// Expects a 2-D launch with at least n x n threads (grid may be rounded up).
// NOTE(review): the original looped i over n inside each thread and wrote
// ret[sloupec*n + radek + i] -- overlapping, racy writes and n-times
// redundant work (a likely cause of the reported slowdown).  One thread now
// produces exactly one element, matching how main() decodes the winning
// flat index (index / size_seq -> index into hyperbola B).
__global__ void vzdal_vyp_kernel(double * ret, double* Ax, double * Bx, double* Ay, double * By, int n)
{
	int sloupec = threadIdx.x + blockIdx.x * blockDim.x; // column -> B point
	int radek   = threadIdx.y + blockIdx.y * blockDim.y; // row    -> A point

	// The grid is rounded up, so guard the tail in both dimensions.
	if (sloupec >= n || radek >= n) return;

	double dx = Bx[sloupec] - Ax[radek];
	double dy = By[sloupec] - Ay[radek];
	ret[sloupec * n + radek] = sqrt(dx * dx + dy * dy);
}
// Block-wise min-reduction: for each block, writes the smallest distance in
// its slice of data to vysledek[blockIdx.x] and the flat index of that
// element to index[blockIdx.x].  Expects a 1-D launch with
// blockDim.x == THREADS_PER_BLOCK (1024) and index prefilled by napl_index.
// Fixes vs. the original: the 'index' parameter must be int* (the body does
// pointer arithmetic on it and the call site passes d_indexy); out-of-range
// threads no longer return before __syncthreads() (a divergent barrier is
// undefined behaviour); the reduction runs in shared memory instead of
// global memory; and tail lanes are padded with a huge sentinel instead of
// reading past 'size'.
__global__ void paraller_reduction(double* data, double* vysledek, int* index, int size)
{
	// One slot per thread; 1024 must match blockDim.x at the launch site.
	__shared__ double sval[1024];
	__shared__ int    sidx[1024];

	int tid = threadIdx.x;
	int gid = blockIdx.x * blockDim.x + threadIdx.x;

	// Sentinel (larger than any real distance here) so padding lanes never
	// win the min; every thread reaches the barriers below.
	sval[tid] = (gid < size) ? data[gid] : 1e300;
	sidx[tid] = (gid < size) ? index[gid] : -1;
	__syncthreads();

	// Sequential-addressing min reduction (non-divergent, conflict-free).
	for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
		if (tid < stride && sval[tid + stride] < sval[tid]) {
			sval[tid] = sval[tid + stride];
			sidx[tid] = sidx[tid + stride];
		}
		__syncthreads();
	}

	if (tid == 0) {
		vysledek[blockIdx.x] = sval[0];
		// NOTE(review): reusing the input index array for per-block output
		// (as the original did) can race with other blocks that have not yet
		// loaded their slice; kept for interface compatibility with main(),
		// but a separate output array would be safer -- TODO confirm.
		index[blockIdx.x] = sidx[0];
	}
}

// Host driver: samples two hyperbolas, computes all pairwise distances on
// the GPU, reduces to the minimum, and prints the closest-approach point.
int main()
{
	// Force lazy CUDA context creation up front so it is not billed to the
	// first timed kernel launch.
	cudaFree(0);

	// NOTE(review): fopen_s is MSVC-only; plain fopen keeps this portable
	// to the Jetson (Linux) toolchain.  Smart quotes from the paste fixed.
	soubor = fopen("vysledekspravny.txt", "wt");

	Sprus prusecik;
	Shyp hypA;
	Shyp hypB;
	// X1/Y1 lost their '*' in the forum paste; they hold malloc'ed arrays.
	double* X1;
	double* Y1;

	// Shared parameter sequence for both hyperbola branches.
	Sseq store_seq = fillt(rozlis);
	t1a = store_seq.seq;
	int size_seq = store_seq.n;

	rotc1a = rot_fill(f1x, f2x, ca);
	rots1a = rot_fill(f1y, f2y, ca);

	high_resolution_clock::time_point start_time = high_resolution_clock::now();

	// hyp_br_x/hyp_br_y allocate their own result arrays; the original
	// malloc'ed X1/Y1 first and leaked those buffers on reassignment.
	X1 = hyp_br_x(t1a, aa, size_seq);
	Y1 = hyp_br_y(t1a, ba, size_seq);

	hypA.x = x_hyp(X1, Y1, f1x, f2x, size_seq, rotc1a, rots1a);
	hypA.y = y_hyp(X1, Y1, f1y, f2y, size_seq, rotc1a, rots1a);

	free(X1);
	free(Y1);

	rotc1b = rot_fill(f3x, f2x, cb);
	rots1b = rot_fill(f3y, f2y, cb);

	X1 = hyp_br_x(t1a, ab, size_seq);
	Y1 = hyp_br_y(t1a, bb, size_seq);

	hypB.x = x_hyp(X1, Y1, f3x, f2x, size_seq, rotc1b, rots1b);
	hypB.y = y_hyp(X1, Y1, f3y, f2y, size_seq, rotc1b, rots1b);

	free(X1);
	free(Y1);

	// Device buffers: the two hyperbolas plus the size_seq x size_seq
	// distance matrix and its matching index array.  (The unused host
	// buffers h_vys / d_data and the unused 'minimum' of the original
	// were dropped -- they only leaked memory.)
	double* d_Ax;
	double* d_Bx;
	double* d_Ay;
	double* d_By;
	double* d_vys;
	int* d_indexy;
	cudaMalloc((void**)&d_Ax, size_seq * sizeof(double));
	cudaMalloc((void**)&d_Bx, size_seq * sizeof(double));
	cudaMalloc((void**)&d_Ay, size_seq * sizeof(double));
	cudaMalloc((void**)&d_By, size_seq * sizeof(double));
	cudaMalloc((void**)&d_vys, (size_t)size_seq * size_seq * sizeof(double));
	cudaMalloc((void**)&d_indexy, (size_t)size_seq * size_seq * sizeof(int));

	// Copy input data to the GPU.
	cudaMemcpy(d_Ax, hypA.x, size_seq * sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(d_Bx, hypB.x, size_seq * sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(d_Ay, hypA.y, size_seq * sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(d_By, hypB.y, size_seq * sizeof(double), cudaMemcpyHostToDevice);
	cudaDeviceSynchronize();

	// All pairwise distances, one thread per (B point, A point) pair.
	dim3 block_size(THREDS_PER_BLOCK_XY, THREDS_PER_BLOCK_XY);
	dim3 grid_size(size_seq / block_size.x + 1, size_seq / block_size.y + 1);
	vzdal_vyp_kernel <<<grid_size, block_size>>> (d_vys, d_Ax, d_Bx, d_Ay, d_By, size_seq);
	// Launch-configuration errors (e.g. "too many resources requested")
	// only surface through cudaGetLastError -- check after every launch.
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
		printf("vzdal_vyp_kernel launch failed: %s\n", cudaGetErrorString(err));
	cudaDeviceSynchronize();

	// Flat element count is integral; the original stored it in a double.
	int big_array_size = size_seq * size_seq;
	dim3 BLOCK(THREADS_PER_BLOCK, 1);
	dim3 GRID((big_array_size + BLOCK.x - 1) / BLOCK.x, 1);

	// Per-block minima and their indices come back in these arrays
	// (only GRID.x entries are ever read, so allocate just that much).
	double* vysledeky = (double*)malloc(GRID.x * sizeof(double));
	int* h_indexy = (int*)malloc(GRID.x * sizeof(int));
	double* d_vysledeky;
	cudaMalloc((void**)&d_vysledeky, GRID.x * sizeof(double));

	napl_index <<<GRID, BLOCK>>> (d_indexy, big_array_size);
	err = cudaGetLastError();
	if (err != cudaSuccess)
		printf("napl_index launch failed: %s\n", cudaGetErrorString(err));
	cudaDeviceSynchronize();

	paraller_reduction <<<GRID, BLOCK>>> (d_vys, d_vysledeky, d_indexy, big_array_size);
	err = cudaGetLastError();
	if (err != cudaSuccess)
		printf("paraller_reduction launch failed: %s\n", cudaGetErrorString(err));
	cudaDeviceSynchronize();

	// Copy per-block results back to the host.
	cudaMemcpy(vysledeky, d_vysledeky, GRID.x * sizeof(double), cudaMemcpyDeviceToHost);
	cudaMemcpy(h_indexy, d_indexy, GRID.x * sizeof(int), cudaMemcpyDeviceToHost);

	// Final min over per-block results on the host; distances below 1e-6
	// are treated as degenerate (same point) and skipped.
	double temp = 50;
	int index_temp = 0;
	for (int i = 0; i < GRID.x; i++)
	{
		if (vysledeky[i] < temp && vysledeky[i] > 0.000001)
		{
			temp = vysledeky[i];
			index_temp = h_indexy[i];
		}
	}
	prusecik.prus = temp;
	// Flat index -> column of the distance matrix = index into hyperbola B.
	index_temp /= size_seq;
	index_temp--;                        // original off-by-one adjustment kept
	if (index_temp < 0) index_temp = 0;  // guard: adjustment can go negative
	prusecik.x = hypB.x[index_temp];
	prusecik.y = hypB.y[index_temp];
	printf("Cil_x: %f \nCil_y: %f \nprusecik= %f\n\n", prusecik.x, prusecik.y, prusecik.prus);

	high_resolution_clock::time_point stop_time = high_resolution_clock::now();
	duration<double> elapsed_time = duration_cast<duration<double>>(stop_time - start_time);

	printf("Vypocet trval: %f s\n", elapsed_time.count());

	// Release everything (the original leaked all host and device buffers).
	free(vysledeky);
	free(h_indexy);
	free(hypA.x);
	free(hypA.y);
	free(hypB.x);
	free(hypB.y);
	free(t1a);
	cudaFree(d_Ax);
	cudaFree(d_Bx);
	cudaFree(d_Ay);
	cudaFree(d_By);
	cudaFree(d_vys);
	cudaFree(d_indexy);
	cudaFree(d_vysledeky);

	if (soubor) fclose(soubor);
	return 0;
}

Hi,

1) Could you share what kind of error you encounter when launching 1024 threads?

2) It’s recommended to check our CUDA sample as a start point.
You can find it at /usr/local/cuda/samples

3) Have you maximized the device performance?

sudo ./jetson_clocks.sh

Too many resources requested for launch

  1. I put my code into sample. (modified kernel)

  2. No I haven’t… maybe that is my problem

An additional question: my code also runs slowly on the GTX 1060 when I do not call cudaFree(0) first — is that normal?

Hi,

This error means that the number of registers available on the multiprocessor is being exceeded.
Reduce the number of threads per block to solve the problem.

Thanks.