Hello,
I’m newcomer in Jetson tx2 programing and I’ve some questions.
why i can’t run afunction with 1024 threads (for example global void napl_index).
why is my distance of two points computing function 1000 (literally) time slower than on 1060 3GB.
Also if you know some better algorythm, how to compute distances between each two points which lies on the hyperbole let me know, please
Thank you very much for all ideas
// my code for 1060
#include “cuda_runtime.h”
#include “device_launch_parameters.h”
//for __syncthreads()
#ifndef CUDACC_RTC
#define CUDACC_RTC
#endif // !(CUDACC_RTC)
//for atomicAdd
#ifndef CUDACC
#define CUDACC
#endif // !CUDACC
#include <device_functions.h>
#include <device_functions.h>
#include"stdlib.h"
#include “math.h”
#include <stdio.h>
#include
#include
using namespace std::chrono;
using namespace std;
#define pi 3.1415926535897932
#define THREADS_PER_BLOCK 1024
#define THREDS_PER_BLOCK_XY 32
using namespace std::chrono;
using namespace std;
#define pi 3.1415926535897932
FILE* soubor;
struct Sseq
{
int n;
double *seq;
};
struct Shyp
{
double *x;
double *y;
};
struct Sprus
{
double x;
double y;
double prus;
};
double f1x = 50;
double f1y = 50;
double f2x = 150;
double f2y = 30;
double f3x = 300;
double f3y = 70;
double d12 = 40;
double d23 = 60;
double ca = 0.5sqrt((pow((f2y - f1y), 2) + pow((f2x - f1x), 2)));
double cb = 0.5sqrt((pow(f3y - f2y, 2) + pow(f3x - f2x, 2)));
double aa = d12 / 2;
double ab = d23 / 2;
double ba = sqrt(pow(ca, 2) - pow(aa, 2));
double bb = sqrt(pow(cb, 2) - pow(ab, 2));
double rotc1a = 0;
double rots1a = 0;
double rotc1b = 0;
double rots1b = 0;
double rozlis = 0.01;
double* t1a;
void vypis_seq(double * seq, int n)
{
printf(“\n”);
for (int i = 0; i <n; i++) {
printf(“seq%d= %.9f \n”, i, seq[i]);
}
printf(“\n”);
}
struct Sseq fillt(double rozlis) {
Sseq ret;
int pocet_pred = 0;
int pocet_po = pocet_pred + 1;
double* sekvence = (double*)malloc(sizeof(double));
double prvek;
double prirustek = rozlis;
do {
double * nova_sek = (double*)malloc(pocet_po * sizeof(double));
for (int i = 0; i < pocet_pred; i++)
{
nova_sek[i] = sekvence[i];
if (i == pocet_pred - 1) {
free(sekvence);
sekvence = (double*)malloc(pocet_po * sizeof(double));
}
}
prvek = pi / 2 + prirustek;
nova_sek[pocet_pred] = prvek;
for (int i = 0; i < pocet_po; i++)
{
sekvence[i] = nova_sek[i];
}
pocet_pred++;
pocet_po++;
prirustek = prirustek + rozlis;
} while (prvek < pi - rozlis);
prirustek = rozlis;
do {
double * nova_sek = (double*)malloc(pocet_po * sizeof(double));
for (int i = 0; i < pocet_pred; i++)
{
nova_sek[i] = sekvence[i];
if (i == pocet_pred - 1) {
free(sekvence);
sekvence = (double*)malloc(pocet_po * sizeof(double));
}
}
prvek = pi + prirustek;
nova_sek[pocet_pred] = prvek;
for (int i = 0; i < pocet_po; i++)
{
sekvence[i] = nova_sek[i];
}
pocet_pred++;
pocet_po++;
prirustek += rozlis;
} while (prvek < (3 * pi / 2) - 2 * rozlis);
ret.n = pocet_pred;
ret.seq = sekvence;
return ret;
}
double* hyp_br_x(double *seq, double y, int n)
{
double *x;
x = (double*)malloc(n * sizeof(double));
for (int i = 0; i<n; i++)
{
x[i] = y / cos(seq[i]);
}
return x;
}
double* hyp_br_y(double *seq, double y, int n)
{
double *x;
x = (double*)malloc(n * sizeof(double));
for (int i = 0; i<n; i++)
{
x[i] = y * tan(seq[i]);
}
return x;
}
double rot_fill(double f1, double f2, double c)
{
double x;
x = (f1 - f2) / (2.0 * c);
return x;
}
double * x_hyp(double* X, double* Y, double f1, double f2, int n, double rotc, double rots)
{
double * vysledek;
vysledek = (double*)malloc(n * sizeof(double));
for (int i = 0; i < n; i++)
{
vysledek[i] = ((f1 + f2) / 2.0) + (X[i] * rotc) - (Y[i] * rots);
}
return vysledek;
}
double * y_hyp(double* X, double* Y, double f1, double f2, int n, double rotc, double rots)
{
double * vysledek;
vysledek = (double*)malloc(n * sizeof(double));
for (int i = 0; i < n; i++)
{
vysledek[i] = ((f1 + f2) / 2.0) + (X[i] * rots) + (Y[i] * rotc);
}
return vysledek;
}
global void napl_index(int* index,int n)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i > n)return;
index[i] = i;
}
// computing of distance of two points (each point is on a hyperbolic function
global void vzdal_vyp_kernel(double * ret, double* Ax, double * Bx, double* Ay, double * By, int n)
{
int sloupec = threadIdx.x + blockIdx.x * blockDim.x; //column
int radek = threadIdx.y + blockIdx.y * blockDim.y; //row
double min = 10;
int k = 0;
for (int i = 0; i < n; i++)
{
ret[sloupec*n + radek + i] = sqrt(pow((Bx[sloupec] - Ax[i]), 2) + pow((By[sloupec] - Ay[i]), 2));
}
}
global void paraller_reduction(double* data, double* vysledek,int index, int size)
{
int thread_id = threadIdx.x;
int linear_id = blockIdx.xblockDim.x + threadIdx.x;
if (linear_id >= size) return;
double* local_data_of_block = data + blockIdx.x*blockDim.x;
int* local_data_index = index + blockIdx.x*blockDim.x;
for (int stride = 1; stride < blockDim.x; stride = stride * 2)
{
if ((thread_id % (2 * stride)) == 0)
{
if (local_data_of_block[thread_id] > local_data_of_block[thread_id + stride])
{
local_data_of_block[thread_id] = local_data_of_block[thread_id + stride];
local_data_index[thread_id] = local_data_index[thread_id + stride];
}
}
__syncthreads();
}
if (thread_id == 0) {
vysledek[blockIdx.x] = local_data_of_block[0];
index[blockIdx.x] = local_data_index[0];
}
}
int main()
{
cudaFree(0);
fopen_s(&soubor, “vysledekspravny.txt”, “wt”);
Sprus prusecik;
Shyp hypA;
Shyp hypB;
double X1;
double Y1;
double minimum;
minimum = (double)malloc(sizeof(double));
Sseq store_seq;
int size_seq;
store_seq = fillt(rozlis);
t1a = store_seq.seq;
size_seq = store_seq.n;
//printf(“%d \n”,size_seq);
//vypis_seq(t1a, size_seq);
rotc1a = rot_fill(f1x, f2x, ca);
rots1a = rot_fill(f1y, f2y, ca);
high_resolution_clock::time_point start_time = high_resolution_clock::now();
X1 = (double)malloc(size_seq * sizeof(double));
X1 = hyp_br_x(t1a, aa, size_seq);
//vypis_seq(X1, size_seq);
Y1 = (double)malloc(size_seq * sizeof(double));
Y1 = hyp_br_y(t1a, ba, size_seq);
//vypis_seq(Y1, size_seq);
hypA.x = x_hyp(X1, Y1, f1x, f2x, size_seq, rotc1a, rots1a);
//vypis_seq(hypA.x, size_seq);
hypA.y = y_hyp(X1, Y1, f1y, f2y, size_seq, rotc1a, rots1a);
//vypis_seq(hypA.y, size_seq);
free(X1);
free(Y1);
rotc1b = rot_fill(f3x, f2x, cb);
rots1b = rot_fill(f3y, f2y, cb);
X1 = (double*)malloc(size_seq * sizeof(double));
X1 = hyp_br_x(t1a, ab, size_seq);
//vypis_seq(X1, size_seq);
Y1 = (double*)malloc(size_seq * sizeof(double));
Y1 = hyp_br_y(t1a, bb, size_seq);
//vypis_seq(Y1, size_seq);
hypB.x = x_hyp(X1, Y1, f3x, f2x, size_seq, rotc1b, rots1b);
hypB.y = y_hyp(X1, Y1, f3y, f2y, size_seq, rotc1b, rots1b);
/*
for (int i = 0; i < size_seq; i++)
{
fprintf(soubor, "%f\t%f\n", hypB.x[i], hypB.y[i]);
}
*/
/*printf("HypB.x\n");
vypis_seq(hypB.x, size_seq);
printf("HypB.y\n");
vypis_seq(hypB.y, size_seq);
*/
double* d_Ax;
double* d_Bx;
double* d_Ay;
double* d_By;
double* d_vys;
double* h_vys;
int* d_indexy;
int* h_indexy;
h_vys = (double*)malloc(size_seq*size_seq * sizeof(double));
h_indexy = (int*)malloc(size_seq*size_seq * sizeof(int));
//Alokování device
cudaMalloc((void**)&d_Ax, size_seq * sizeof(double));
cudaMalloc((void**)&d_Bx, size_seq * sizeof(double));
cudaMalloc((void**)&d_Ay, size_seq * sizeof(double));
cudaMalloc((void**)&d_By, size_seq * sizeof(double));
cudaMalloc((void**)&d_vys, size_seq * size_seq * sizeof(double));
cudaMalloc((void**)&d_indexy, size_seq*size_seq * sizeof(int));
//kopírování dat do GPU
cudaMemcpy(d_Ax, hypA.x, size_seq * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_Bx, hypB.x, size_seq * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_Ay, hypA.y, size_seq * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_By, hypB.y, size_seq * sizeof(double), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
dim3 block_size(THREDS_PER_BLOCK_XY, THREDS_PER_BLOCK_XY);
dim3 grid_size(size_seq / block_size.x + 1, size_seq / block_size.y + 1);
vzdal_vyp_kernel << <grid_size, block_size >> > (d_vys, d_Ax, d_Bx, d_Ay, d_By, size_seq);
// čeká na dokončení kernelu
cudaDeviceSynchronize();
double big_array_size=size_seq*size_seq;
dim3 BLOCK(THREADS_PER_BLOCK, 1);
dim3 GRID(( big_array_size+ BLOCK.x - 1) / BLOCK.x, 1);
size_t double_bytes = big_array_size* sizeof(double);
double* d_data;
double* d_vysledeky;
//alokování host
double* vysledeky = (double*)malloc(GRID.x * sizeof(double));
//Alokování device
cudaMalloc((void**)&d_vysledeky, GRID.x * sizeof(double));
cudaDeviceSynchronize();
napl_index <<< GRID, BLOCK >> > (d_indexy, big_array_size);
cudaDeviceSynchronize();
paraller_reduction << <GRID, BLOCK >> > (d_vys, d_vysledeky, d_indexy, big_array_size);
// čeká na dokončení kernelu
cudaDeviceSynchronize();
//kopírování výsledků zpět do paměti
cudaMemcpy(vysledeky, d_vysledeky, GRID.x * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(h_indexy, d_indexy, GRID.x * sizeof(int), cudaMemcpyDeviceToHost);
//vypis_seq(vysledeky, GRID.x);
double temp = 50;
int index_temp = 0;
for (int i = 0; i < GRID.x; i++)
{
if (vysledeky[i] < temp&&vysledeky[i]>0.000001)
{
temp = vysledeky[i];
index_temp = h_indexy[i];
}
}
prusecik.prus = temp;
index_temp /= size_seq;
index_temp--;
prusecik.x = hypB.x[index_temp];
prusecik.y = hypB.y[index_temp];
printf("Cil_x: %f \nCil_y: %f \nprusecik= %f\n\n", prusecik.x, prusecik.y, prusecik.prus);
high_resolution_clock::time_point stop_time = high_resolution_clock::now();
duration<double> elapsed_time = duration_cast<duration<double>>(stop_time - start_time);
printf("Vypocet trval: %f s\n", elapsed_time.count());
fclose(soubor);
return 0;
}