Hello,
I want to use the GPU of the TK1 board to calculate the absolute values of complex numbers. When I run the kernel in a loop, the average execution time is about 300 us (pretty slow anyway…). But sometimes the GPU needs several milliseconds. The worst case I saw was 100 ms for one run.
I am using CUDA toolkit 6.5 (cross compiling)
NVCC version is
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2014 NVIDIA Corporation
Built on Wed_Aug_27_10:36:36_CDT_2014
Cuda compilation tools, release 6.5, V6.5.16
Compiling is done by hand:
/usr/local/cuda/bin/nvcc -I/usr/local/cuda/include -g -O3 -ccbin arm-linux-gnueabihf-g++ -gencode arch=compute_32,code=sm_32 --target-cpu-architecture ARM -m32 -M -o "main.d" "main.cu"
/usr/local/cuda/bin/nvcc -I/usr/local/cuda/include -g -O3 -ccbin arm-linux-gnueabihf-g++ -gencode arch=compute_32,code=sm_32 --compile --relocatable-device-code=false --target-cpu-architecture ARM -m32 -x cu -o "main.o" "main.cu"
/usr/local/cuda/bin/nvcc -I/usr/local/cuda/include -L/opt/cuda-6.5-linaro/lib --cudart shared -Xlinker --unresolved-symbols=ignore-in-object-files --relocatable-device-code=false -gencode arch=compute_32,code=sm_32 --target-cpu-architecture ARM -m32 -ccbin arm-linux-gnueabihf-g++ -link -o "CUDAHang" main.o
This is the test code main.cu:
#include <stdio.h>
#include <sys/time.h>
#include <time.h>
#define ARRAY_SIZE 4088
#define NUM_RUNS 2000
// Return the smaller of two integers (used to cap the grid size below).
int imin(int a, int b)
{
    if (b < a)
        return b;
    return a;
}
// Fixed launch configuration: 1024 threads per block, and enough blocks to
// cover the ARRAY_SIZE/2 output elements (ceil-division), capped at 32.
// With ARRAY_SIZE = 4088 this evaluates to imin(32, 2) = 2 blocks.
const int _threadsPerBlock = 1024;
const int _blocksPerGrid = imin(32, (ARRAY_SIZE/2+_threadsPerBlock-1)/_threadsPerBlock);
typedef unsigned long long timestamp_t;
// Wall-clock time in microseconds since the Unix epoch.
// Fix: the original multiplied tv_sec by the double literal 1000000.0,
// routing the value through floating point before truncating back to an
// unsigned 64-bit integer; use pure integer arithmetic instead.
inline timestamp_t get_timestamp() {
    struct timeval now;
    gettimeofday(&now, NULL);
    return (timestamp_t) now.tv_usec + (timestamp_t) now.tv_sec * 1000000ULL;
}
// Calculate the absolute values (magnitudes) of the complex numbers in d_in.
// d_in holds ARRAY_SIZE floats as interleaved component pairs; d_out receives
// the ARRAY_SIZE/2 magnitudes. Launch with at least ARRAY_SIZE/2 total threads.
// NOTE(review): the even-indexed element is treated as the imaginary part --
// confirm the interleaving order against whatever produces d_in.
__global__ void absolute_values(float * d_out, float * d_in){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// Guard: the grid may contain more threads than there are complex values.
if(idx < ARRAY_SIZE/2)
{
float img = d_in[idx*2];
float real = d_in[idx*2+1];
// Fix: the original stored img*img + real*real, i.e. the SQUARED
// magnitude; sqrtf completes the absolute value the comment promises.
d_out[idx] = sqrtf(img*img + real*real);
}
}
int main(int argc, char ** argv)
{
// Report-and-abort wrapper: every CUDA runtime call returns a cudaError_t,
// and an earlier unnoticed failure makes all later calls fail mysteriously.
// Kernel launches return no status directly; they are checked below via
// cudaGetLastError().
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) \
        { \
            printf("Cuda Error: %d - %s\n", err_, cudaGetErrorString(err_)); \
            return -1; \
        } \
    } while (0)

// Assume there is only one device
CUDA_CHECK(cudaSetDevice(0));
CUDA_CHECK(cudaDeviceReset());
// Generate host input and output array
float h_in[ARRAY_SIZE]; // Array with complex numbers
float h_out[ARRAY_SIZE/2]; // Array with absolute values (half the size)
for (int i = 0; i < ARRAY_SIZE; i++)
h_in[i] = float(i);
// Declare GPU memory pointers
float * d_in;
float * d_out;
// Allocate GPU memory once, outside the timing loop.
CUDA_CHECK(cudaMalloc((void**) &d_in, ARRAY_SIZE * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**) &d_out, ARRAY_SIZE/2 * sizeof(float)));
// Transfer input array to GPU
CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice));
timestamp_t before;
int current = 0;
int minimum = 0;
int maximum = 0;
long long total = 0; // sum of all per-run times in us; 64-bit so it cannot overflow
// Launch the kernel with threads only
printf("Blocks: %d a %d threads for %d values.\n", _blocksPerGrid, _threadsPerBlock, ARRAY_SIZE);
for(int i = 0; i < NUM_RUNS; i++)
{
before = get_timestamp();
absolute_values<<<_blocksPerGrid, _threadsPerBlock>>>(d_out, d_in);
// A launch-configuration failure is only visible here, not as a return value.
CUDA_CHECK(cudaGetLastError());
// The blocking memcpy also synchronizes with the kernel, so the measured
// interval covers launch + execution + copy-back.
CUDA_CHECK(cudaMemcpy(h_out, d_out, ARRAY_SIZE/2 * sizeof(float), cudaMemcpyDeviceToHost));
// Some statistics (run 0 always prints, since minimum starts at 0,
// which establishes the baseline)
current = (int)(get_timestamp() - before);
if(current > minimum*3)
{
if(i != 0) printf(">>> ");
printf("Run %d took %d us (three times minimum).\n", i, current);
}
if(current < minimum || i == 0) minimum = current;
if(current > maximum || i == 0) maximum = current;
total += current;
}
// Copy back the result array to the CPU
CUDA_CHECK(cudaMemcpy(h_out, d_out, ARRAY_SIZE/2 * sizeof(float), cudaMemcpyDeviceToHost));
printf("Minimum: %d us, Maximum: %d us, Average: %d us.\n", minimum, maximum, (int)(total / NUM_RUNS));
CUDA_CHECK(cudaFree(d_in));
CUDA_CHECK(cudaFree(d_out));
return 0;
}
Thanks for help!