The CUDA 5-second execution-time limit: finding a way to work around the GDI timeout

I’m finding it pretty debilitating that our desktop supercomputers are limited to a mere 5 seconds of kernel execution time. I’m used to hours, days or even weeks of processing on certain kinds of scientific/engineering problems. Has anybody managed to work around this rather severe limitation?

I’ve been reading in about half a dozen forums that the limitation boils down to some trivial Windows display-driver timeout. The semi-solution of buying a second GPU for dedicated processing is hardly a solution for most people - not least because most systems (including mine) have but one PCIe x16 slot. I also find it very wasteful to have half the resources sitting there doing nothing except painting windows. Linux is not an option either: some of the software I need to hook this up to only runs on Windows.

And it’s a real shame that I can’t at least have that tiny on-board ATI GPU drive the display while dedicating my new Nvidia GPU to the stuff that matters!

I also read that Windows XP SP1 has a means of altering this time-out by setting a large BreakPointDelay value in the registry key:


However, this does not seem to have any effect on my WinXP SP3 installation.
Any ideas? Has nobody tried hacking and butchering the Windows GDI yet? There’s got to be a way around this pesky timeout!

I also tried breaking up the kernel code into multiple shorter pieces, but this is not working either, since it appears that the TOTAL execution time is what matters in the end. A series of 20 separate 1-second kernel calls (with different parameters) is merely treated as if it were one 20-second call :-/

I spent all day hacking at this issue and I’m at a loss… and I’d hate to have to give it up so quickly.

Anyone has any ideas? Experiences? Solutions?

If you absolutely can’t find a way to break up your kernels (perhaps try some different things, and ask around here on the forums if you want some feedback), the best solution is really to just run Linux. If you run without a window manager on Linux (i.e. from the command line), you can run kernels for as long as you like.

You can disable the watchdog timer on Windows if you like, but basically, due to the underlying driver structure (of Windows, not CUDA), after a few seconds the display will freeze until the kernel has finished executing (however long that takes). If you really need to run Windows and you can have a dedicated compute box, perhaps that is a possible solution for you. If you’re planning to distribute your software for commercial purposes (sell it), you should know that Microsoft basically forbids anyone from disabling the watchdog in commercial software.

Most kernels that are able to be accelerated by CUDA are also able to be broken up into smaller bits; think about Folding@Home, Badaboom, and so forth…even though their tasks might take minutes or hours, they have still found ways to break their kernels up into individual bits so as not to run afoul of the watchdog timer. Like I said, if you’re new to CUDA, you should ask around the forum for some advice – sometimes there are ways to optimize GPU code that may not be obvious unless you have had to figure them out before ;)
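As an illustrative sketch of what that breaking-up can look like (the names `processChunk` and `runInChunks`, and the chunk size, are hypothetical, not taken from any of those applications): instead of looping over all the work inside one launch, pass a slice of the iteration range to each launch and synchronize in between, so no single launch approaches the watchdog limit.

```cuda
#include <cuda_runtime.h>

// Hypothetical kernel: performs only iterations [start, end) of the total work.
__global__ void processChunk(const int* in, int* out, int n, int start, int end)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    int acc = out[idx];                  // assumes out was zero-initialized
    for (int i = start; i < end; ++i)    // only a slice of the full loop
        acc += in[idx];
    out[idx] = acc;
}

// Host side: issue many short launches instead of one long one.
void runInChunks(const int* in_d, int* out_d, int n, int total, int chunk)
{
    dim3 block(256);
    dim3 grid((n + block.x - 1) / block.x);
    for (int start = 0; start < total; start += chunk) {
        int end = (start + chunk < total) ? (start + chunk) : total;
        processChunk<<<grid, block>>>(in_d, out_d, n, start, end);
        cudaThreadSynchronize();   // each launch completes well under the watchdog limit
    }
}
```

The key design point is that the host waits for each chunk to finish before enqueuing the next one, so the driver sees a sequence of short, completed kernels rather than one long-running one.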

This statement is incorrect. You can run as many executions of the kernel as you like, as long as each kernel call takes less than 5 seconds.



If only it were as simple as you say!

Now try the following code example:

[codebox]#include <stdio.h>
#include <cutil_inline.h>

// BreakPointDelay set to 40960 seconds = 11.37 hours

#define BLOCK_SIZE 256   // Thread block size
#define REPEATS 300000   // Loop iterations per kernel call

__global__ void myKernel(int* INPUTpointer, int* OUTPUTpointer, int N, int Cin)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;	// Calculate a linear index
	for (int count = 0; count < REPEATS; count++)		// Spending a long time here
	{
		if (idx < N) OUTPUTpointer[idx] = INPUTpointer[idx] + Cin;
	}
}

struct GPUTimingsSTRUCT
{
	float TotalTime;
	float MemcopyCPU2GPU;
	float MemcopyGPU2CPU;
	float GPU_KERNEL_1;
	float GPU_KERNEL_ALL;
};

int main()
{
	int N = 65536;
	int* DATA1_d;	// input vector of integers in device memory
	int* DATA2_d;	// output vector of integers in device memory
	int* DATA1_h;	// input vector of integers in host memory
	int* DATA2_h;	// output vector of integers in host memory
	GPUTimingsSTRUCT GPUTimings;

	// Define execution configuration
	dim3 dimBlock(BLOCK_SIZE, 1, 1);
	dim3 dimGrid(256, 1, 1);
	size_t dynShared = 0;

	cudaEvent_t Event1, Event2, Event3, Event4, Event5;
	cudaEventCreate(&Event1); cudaEventCreate(&Event2); cudaEventCreate(&Event3);
	cudaEventCreate(&Event4); cudaEventCreate(&Event5);

	// Allocate vectors in device memory
	cutilSafeCallNoSync(cudaMalloc((void**)&DATA1_d, N*sizeof(int)));
	cutilSafeCallNoSync(cudaMalloc((void**)&DATA2_d, N*sizeof(int)));

	// Allocate vectors in host memory
	DATA1_h = (int*)malloc(N*sizeof(int));
	DATA2_h = (int*)malloc(N*sizeof(int));

	// Copy vectors from host memory to device memory
	cudaEventRecord(Event1, 0); cudaEventSynchronize(Event1);
	cutilSafeCallNoSync(cudaMemcpy(DATA1_d, DATA1_h, N*sizeof(int), cudaMemcpyHostToDevice));
	cudaEventRecord(Event2, 0); cudaEventSynchronize(Event2);

	// Invoke broken-up kernels (note: the launch configuration is <<<grid, block>>>)
	myKernel<<<dimGrid, dimBlock, dynShared, 0>>>(DATA1_d, DATA2_d, N, 0);
	cudaEventRecord(Event3, 0); cudaEventSynchronize(Event3);
	printf("STEP 1 ok!\n");
	myKernel<<<dimGrid, dimBlock, dynShared, 0>>>(DATA1_d, DATA2_d, N, 1);
	printf("STEP 2 ok!\n");
	myKernel<<<dimGrid, dimBlock, dynShared, 0>>>(DATA1_d, DATA2_d, N, 2);
	printf("STEP 3 ok!\n");
	// Uncomment more of these to push the total past the watchdog limit:
	//myKernel<<<dimGrid, dimBlock, dynShared, 0>>>(DATA1_d, DATA2_d, N, 3);
	//printf("STEP 4 ok!\n");
	// ... (STEP 5 through STEP 19 are identical except for the Cin argument) ...
	//myKernel<<<dimGrid, dimBlock, dynShared, 0>>>(DATA1_d, DATA2_d, N, 19);
	//printf("STEP 20 ok!\n");
	cudaEventRecord(Event4, 0); cudaEventSynchronize(Event4);

	// Copy vectors from device memory to host memory
	cutilSafeCallNoSync(cudaMemcpy(DATA2_h, DATA2_d, N*sizeof(int), cudaMemcpyDeviceToHost));
	cudaEventRecord(Event5, 0); cudaEventSynchronize(Event5);

	cudaEventElapsedTime(&GPUTimings.TotalTime, Event1, Event5);
	cudaEventElapsedTime(&GPUTimings.MemcopyCPU2GPU, Event1, Event2);
	cudaEventElapsedTime(&GPUTimings.GPU_KERNEL_1, Event2, Event3);
	cudaEventElapsedTime(&GPUTimings.GPU_KERNEL_ALL, Event3, Event4);
	cudaEventElapsedTime(&GPUTimings.MemcopyGPU2CPU, Event4, Event5);

	printf("GPUTimings.TotalTime        = %f\n", GPUTimings.TotalTime);
	printf("GPUTimings.Memcopy CPU2GPU  = %f\n", GPUTimings.MemcopyCPU2GPU);
	printf("GPUTimings.GPU_KERNEL_1     = %f\n", GPUTimings.GPU_KERNEL_1);
	printf("GPUTimings.GPU_KERNEL_ALL   = %f\n", GPUTimings.GPU_KERNEL_ALL);
	printf("GPUTimings.Memcopy GPU2CPU  = %f\n", GPUTimings.MemcopyGPU2CPU);
	printf("GPU Additions /Sec = %f\n", (1000.0*(float)REPEATS*(float)N)/(GPUTimings.GPU_KERNEL_1));

	// cleanup
	cudaEventDestroy(Event1); cudaEventDestroy(Event2); cudaEventDestroy(Event3);
	cudaEventDestroy(Event4); cudaEventDestroy(Event5);
	cudaFree(DATA1_d); cudaFree(DATA2_d);
	free(DATA1_h); free(DATA2_h);

	return 0;
}[/codebox]

It works fine as I pasted it… However, try uncommenting more of the kernel calls, and soon enough, you will run into problems!

And it actually still stops in about 5 seconds no matter how many kernel calls you ask it to do.

This is the output I get:


STEP 1 ok!

STEP 2 ok!

STEP 3 ok!

STEP 4 ok!

STEP 5 ok!

STEP 6 ok!

STEP 7 ok!

STEP 8 ok!

STEP 9 ok!

STEP 10 ok!

STEP 11 ok!

STEP 12 ok!

STEP 13 ok!

STEP 14 ok!

STEP 15 ok!

STEP 16 ok!

STEP 17 ok!

STEP 18 ok!

STEP 19 ok!

STEP 20 ok!

cudaSafeCallNoSync() Runtime API error in file ‘’ in line 131 : the launch timed out and was terminated. …"

The error is exactly the same as when I try to run a 100 second Kernel!

So again I ask… What is the problem here?

As much as I would love to, unfortunately, Linux is not on my list due to other considerations (other software packages that don’t run on linux).

Although it’s ugly, I can live with the screen freezing during computation runs. At the moment, getting this thing to work is way more important than distributing my software. So if there IS a way of killing that GDI watchdog, I’d be the first to want to know how to murder it :-P… Never mind the display or Microsoft’s paranoia. So the question remains: has anybody ever managed to do this before?

I agree with you, and I’m sure it’s a solved problem; otherwise everybody would have packed up and left the GPGPU business!

However, I’m fairly new, and I’ve now got to the point where I want to stop playing with toy examples and begin doing some serious stuff with it. But I fail to understand how it works for larger problems, because when I try it, CUDA beats me by lumping together all my 1-second kernel calls…

Have a look at the code example I posted in the other fork of this thread!

I would be glad if anybody had a few hints…

@marcnet: AFAIK, multiple kernels, each running within the 5-second watchdog, should be fine. I note that you are experiencing problems with this; I will try the code that you have posted and get back to you. But FYI, my app can run multiple kernels which together run for more than 5 seconds.

At a quick glance, try putting a cudaThreadSynchronize() or cudaGetLastError() after each kernel call.
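For instance, here is a minimal sketch of that per-launch check, applied to the myKernel calls from the code posted above (the error-handling style is just one option):

```cuda
// Launch, then force completion and inspect the error state before moving on.
myKernel<<<dimGrid, dimBlock, dynShared, 0>>>(DATA1_d, DATA2_d, N, 0);
cudaThreadSynchronize();                 // block until this kernel has finished
cudaError_t err = cudaGetLastError();    // pick up any launch/execution failure
if (err != cudaSuccess)
    printf("STEP 1 failed: %s\n", cudaGetErrorString(err));
else
    printf("STEP 1 ok!\n");
```

Because the check runs after an explicit synchronize, a reported error can be attributed to this specific kernel rather than to some earlier asynchronous launch.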

As long as you are running XP, you can use the Tesla cards, which should not have a watchdog limitation since they are not ‘display’ cards. That is my solution to the problem. But note that under Vista, even the Tesla bumps into the watchdog timer; in fact, Vista has a 2-second watchdog timer. Isn’t that cool?

Hopefully that problem will be fixed under Windows 7. It’s got a newer (obviously) display driver model than Vista, which allows each card in the machine to have its own device. So technically, you could turn off the watchdog for a Tesla and not have it affect your other display cards. Or, you should be able to run an nVidia card and an ATI card in the same machine, which would be awesome for testing stuff like OpenCL.

What happens if you lower your “REPEATS” value to something like 1000? Having it set to 300000 is probably way too much work to be doing in one kernel, since you’re reading from global memory each time (which is fairly slow, compared to the other memory available on the device).

The example given requires an effective device bandwidth of at least 32GB/s to have a fighting chance of finishing a single call within 5 seconds. Do you have that?

Otherwise CUDA will just asynchronously queue up as many kernel calls as you’d like (hence the printf statements you see) but still time out on the first call after 5 seconds.

Too much work? What do you mean? The only reason for doing GPGPU computing is to do a lot of work, no?

After all, I’m not reading new data in the simple test case. I’m just reading the same data over and over again. So if it works for one cycle, it should work for any other number, because nothing at all is changing from one cycle to the next!

Moreover, all the calls to global memory are uniform and properly coalesced and in total I am managing an average of about 5.6Gflops of sustained calculations including transfers (when it works!).

To tell you the whole truth, I was hoping to run several trillion cycles (and more), let alone 300,000 perfectly identical (incredibly simple) calls.

In fact I already lowered the REPEATS value from 3,000,000 to 300,000 to fall within the 5 second limit and then to compensate I increased the number of calls 10 fold.

So in any case, if I lower the REPEATS value further to 1000, I would have to compensate by increasing the number of calls 300-fold, which is most inefficient and still leaves me no better off, as I have already explained in my previous post.

Launches are asynchronous, so you’re probably timing out on the first kernel you launch. Eventually it fails (long after you enqueue the rest of your kernels), so it only seems like the last kernel is failing.
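To see that asynchrony directly, here is a tiny sketch (reusing the kernel and launch configuration from the code posted above); the comments describe the expected behaviour rather than guaranteed timings:

```cuda
// A kernel launch returns control to the host almost immediately;
// the host only blocks when it explicitly synchronizes.
myKernel<<<dimGrid, dimBlock, dynShared, 0>>>(DATA1_d, DATA2_d, N, 0);
printf("launch returned\n");   // prints right away, while the kernel may still be running
cudaThreadSynchronize();       // the host waits here until the kernel has finished
printf("kernel actually finished\n");
```

This is why all 20 "STEP n ok!" messages can appear before the first kernel has even hit the watchdog: the printf only proves the launch was enqueued, not that it completed.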

Yes I understand that CUDA is simply asynchronously queuing up as many kernel calls as I issue but the first calls work well!

So much so that a single call (300,000 loops long) works just fine and takes 3458.93 ms — well within the 5 seconds!

In fact BandwidthTest.exe quotes a device bandwidth of 41,493.5MB per second… which as you say, is more than enough!

Yes I understand that CUDA launches are asynchronous. This is my problem in fact and my code snippet demonstrates it.

This is why 1 huge multi-minute call and lots of small sub-5second calls are equivalent in terms of the 5 second limit.

A single call (300,000 loops long) works great and takes 3458.93 ms (on my GPU) — which is well within the 5 seconds!

It’s a sequence of such calls that fails miserably!

So how on earth can we get around this vexing problem?

Well that offers a ray of hope… but in the meantime it leaves us mortals (running WinXP and Vista) out in the cold :unsure:

Yes, I know why we want to use CUDA. I was just asking you to try a lower number so that you could rule out the possibility that the kernel was taking too long (for some reason) and not some other error.

Preface: I know exactly what I’m talking about with regards to watchdog timers because I broke them many times and found out what the hardware and driver are actually doing to implement them.

The watchdog timer only affects you on RM-based OSes (which is to say, not Vista and OS X) if you have a single kernel invocation that takes more than 5s of execution time and there is a display connected*. There is no “cumulative watchdog timer” or however you want to describe it because of how the GPU performs context switching. Once a kernel call has been completed, the GPU is free to respond to the driver which will reset the watchdog timer countdown. As a result, there’s no upper bound on the total amount of time spent in CUDA kernels per process.

All your code snippet demonstrates is that some kernel you launch fails; without a cudaThreadSynchronize() after each, there is no way to determine which one fails. Because launches are asynchronous, they will be enqueued, so there is not really any meaningful penalty to doing 3000 kernel launches** versus 300 versus 30 versus 3***.

* if you boot directly to a console in Linux without ever running X, even the GPU used for console output will not have a watchdog timer

** except for the cost of a kernel launch, which on RM platforms is ~8 µs; time spent in the kernel will dominate for any meaningful amount of work

*** it is possible to fill the queue, which will implicitly cause a cudaThreadSynchronize(), so keep that in mind if you’re trying to launch a bazillion kernels and then do CPU-side work

Yes, I understand your point, and I appreciate the importance of trying it out. So I actually tried it, and the short answer is that the problem persists.

One short call works well in isolation. But then it fails when it is stacked up with other calls that collectively exceed 5 seconds.

So what happens if you run, say, the nbody sample in the SDK and leave it running for twenty minutes?

Kudos to you, my friend (no pun intended!). You’ve just solved the problem…

A cudaThreadSynchronize() between each call solves the problem beautifully!

I guess working in 5-second batches, although a little inefficient in terms of overheads, is not too bad after all.

So Thanks a Million!

…Pity this crucial trick was never mentioned anywhere in the documentation…

Many thanks for the detailed expert answer - much appreciated!

I just tried inserting a cudaThreadSynchronize() between each call as you say, and voilà! It solves the problem beautifully (apparently!)

Each kernel call proceeds one after another and they all finish off gracefully giving correct results.

Only this time, instead of shooting all the printf outputs at once and timing out…

… I get one printf output 3.5 seconds apart (as would be expected due to the 3.5s intervals of kernel processing)

So total processing time = 3.5*20 seconds. (at last!)

But as you see, none of them was actually hanging. They were all valid kernel calls, so it appears to me that it was just the timeout getting in the way.

So my questions now turn to…

  • Is my interpretation correct?

  • What is the cudaThreadSynchronize() doing in this case?

  • Is it problematic to use a cudaThreadSynchronize between every call?

  • Anything else I should worry about?