Ok so i’m coming back,
I do my check errors with a define, for the mem-check i don’t really know how to do it on VS2013 (windows).
I code a program with kernels and if i just check the index of the kernels (see if i get all the index and not some) it work (so i guess i didn’t reach the launch limit). But if i do something in the kernels like the calculation i must do, then the program crash on the cudaDeviceSytynchronize in the main. With the error :
“unknow error”
Here is my code that work :
__global__ void secondKernel(int realTid, Cartesian Sat, Cartesian Base, Cartesian Geo)
{
int tid;
if ((tid = threadIdx.x + blockIdx.x * blockDim.x) < 209440)
{
realTid < 1 && tid > 209438 ? printf("Sat %.3f, %.3f, %.3f\nBase %.3f %.3f %.3f\nGeo %.3f %.3f %.3f\n", Sat.m_X, Sat.m_Y, Sat.m_Z, Base.m_X, Base.m_Y, Base.m_Z, Geo.m_X, Geo.m_Y, Geo.m_Z) : 0;
}
}
__global__ void kernelStream(double mult, Propagator *sat)
{
int tid;
Cartesian sat_position;
Cartesian geo_position;
Cartesian base_position(1597885.53777688, 1253552.16551859, 6046164.27311665);
tid = (threadIdx.x + blockIdx.x * blockDim.x) + mult;
if (tid < 798132)
{
sat_position = sat[0].evaluate(tid * STEP, SIMULATION_DURATION, 0);
geo_position = findStartGeo(base_position);
secondKernel <<< 205, 1024 >>> (tid, sat_position, base_position, geo_position);
gpucheckError(cudaGetLastError());
gpucheckError(cudaDeviceSynchronize());
}
}
int main(void)
{
cudaStream_t stream[NB_STREAM];
cudaEvent_t start, stop;
dim3 nb_threads(98, 1, 1);
dim3 nb_blocks(1, 1, 1);
float elapsedTime;
Propagator *sat, *cuda_sat;
cpucheckError(cudaHostAlloc((void **)&sat, sizeof(Propagator) * NB_SAT, cudaHostAllocDefault));
cpucheckError(cudaMalloc((void **)&cuda_sat, sizeof(Propagator) * NB_SAT));
sat[0].propagator("Sat 1", 7847.3, 53, 0, 18, 0, 67.5, true, 5, true, 3.4000000596279278E-05);
cpucheckError(cudaMemcpy(cuda_sat, sat, sizeof(Propagator) * NB_SAT, cudaMemcpyHostToDevice));
for (int i = 0; i < NB_STREAM; i++)
cpucheckError(cudaStreamCreate(&stream[i]));
cpucheckError(cudaEventCreate(&start));
cpucheckError(cudaEventCreate(&stop));
cpucheckError(cudaEventRecord(start, 0));
for (int i = 0; i < NB_STREAM; i++)
{
kernelStream << < nb_blocks, nb_threads, 0, stream[i] >> > (i * nb_threads.x * nb_blocks.x, cuda_sat);
cpucheckError(cudaGetLastError());
}
cpucheckError(cudaDeviceSynchronize());
cpucheckError(cudaEventRecord(stop, 0));
cpucheckError(cudaEventSynchronize(stop));
cpucheckError(cudaEventElapsedTime(&elapsedTime, start, stop));
for (int i = 0; i < NB_STREAM; i++)
cpucheckError(cudaStreamDestroy(stream[i]));
cpucheckError(cudaEventDestroy(start));
cpucheckError(cudaEventDestroy(stop));
printf("time : %f ms\n", elapsedTime);
return (0);
}
Here is my defines for check :
# define cpucheckError(value) { cpuAssert((value), __FILE__, __LINE__); }
__host__ inline void cpuAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
printf("%s %s %d\n", cudaGetErrorString(code), file, line);
exit(0);
}
}
# define gpucheckError(value) { gpuAssert((value), __FILE__, __LINE__); }
__device__ inline void gpuAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
printf("%s %s %d\n", cudaGetErrorString(code), file, line);
return;
}
}
Here is the code that didn’t work
__constant__ double CUDA_result[BLOCKS_ANGLE];
__constant__ int CUDA_tid[BLOCKS_ANGLE];
__global__ void secondKernel(int realTid, Cartesian Sat, Cartesian Base, Cartesian Geo)
{
int tid;
if ((tid = threadIdx.x + blockIdx.x * blockDim.x) < 209440)
{
__shared__ double tmp[THREADS_ANGLE][2];
Cartesian new_pos, vecU, vecV;
Global global;
int tid, idx, i;
idx = threadIdx.x;
if ((tid = threadIdx.x + blockIdx.x * blockDim.x) < IT_ANGLE)
{
new_pos = global.rotationZAxis(Geo, STEP_ANGLE * tid);
vecU.set(Sat.m_X - Base.m_X, Sat.m_Y - Base.m_Y, Sat.m_Z - Base.m_Z);
vecV.set(new_pos.m_X - Base.m_X, new_pos.m_Y - Base.m_Y, new_pos.m_Z - Base.m_Z);
tmp[idx][0] = global.dotProduct(vecU, vecV);
tmp[idx][1] = (double)tid;
}
__syncthreads();
i = THREADS_ANGLE / 2;
while (i != 0)
{
(idx < i && tmp[idx][0] > tmp[idx + i][0]) ? (tmp[idx][0] = tmp[idx + i][0], tmp[idx][1] = tmp[idx + i][1]) : (0);
__syncthreads();
i /= 2;
}
(idx == 0) ? (CUDA_result[blockIdx.x] = tmp[0][0], CUDA_tid[blockIdx.x] = (int)tmp[0][1]) : (0);
}
}
__global__ void kernelStream(double mult, Propagator *sat)
{
int tid, index;
Cartesian sat_position;
Cartesian geo_position;
Cartesian base_position(1597885.53777688, 1253552.16551859, 6046164.27311665);
Cartesian output;
tid = (threadIdx.x + blockIdx.x * blockDim.x) + mult;
if (tid < 798132)
{
sat_position = sat[0].evaluate(tid * STEP, SIMULATION_DURATION, 0);
geo_position = findStartGeo(base_position);
secondKernel <<< 205, 1024 >>> (tid, sat_position, base_position, geo_position);
gpucheckError(cudaGetLastError());
gpucheckError(cudaDeviceSynchronize());
index = getIndex(CUDA_result, CUDA_tid);
output.set(geo_position.m_X * cos(STEP * index) + geo_position.m_Y * -sin(STEP * index), geo_position.m_X * sin(STEP * index) + geo_position.m_Y * cos(STEP * index), geo_position.m_Z);
}
}
int main(void)
{
cudaStream_t stream[NB_STREAM];
cudaEvent_t start, stop;
dim3 nb_threads(98, 1, 1);
dim3 nb_blocks(1, 1, 1);
float elapsedTime;
Propagator *sat, *cuda_sat;
cpucheckError(cudaHostAlloc((void **)&sat, sizeof(Propagator) * NB_SAT, cudaHostAllocDefault));
cpucheckError(cudaMalloc((void **)&cuda_sat, sizeof(Propagator) * NB_SAT));
sat[0].propagator("Sat 1", 7847.3, 53, 0, 18, 0, 67.5, true, 5, true, 3.4000000596279278E-05);
cpucheckError(cudaMemcpy(cuda_sat, sat, sizeof(Propagator) * NB_SAT, cudaMemcpyHostToDevice));
for (int i = 0; i < NB_STREAM; i++)
cpucheckError(cudaStreamCreate(&stream[i]));
cpucheckError(cudaEventCreate(&start));
cpucheckError(cudaEventCreate(&stop));
cpucheckError(cudaEventRecord(start, 0));
for (int i = 0; i < NB_STREAM; i++)
{
kernelStream << < nb_blocks, nb_threads, 0, stream[i] >> > (i * nb_threads.x * nb_blocks.x, cuda_sat);
cpucheckError(cudaGetLastError());
}
cpucheckError(cudaDeviceSynchronize());
cpucheckError(cudaEventRecord(stop, 0));
cpucheckError(cudaEventSynchronize(stop));
cpucheckError(cudaEventElapsedTime(&elapsedTime, start, stop));
for (int i = 0; i < NB_STREAM; i++)
cpucheckError(cudaStreamDestroy(stream[i]));
cpucheckError(cudaEventDestroy(start));
cpucheckError(cudaEventDestroy(stop));
printf("time : %f ms\n", elapsedTime);
return (0);
}
I set the number of stream to : 12471