Hello, I wrote simple bench for CUDA vs CPU but
CUDA time 4539 ms for 1 mio cycles
CPU time 1580ms
Why CUDA is slower?
May be it is incorrect to run kernel multiple times?
global void
kernel(float *ex, float *v, float *hi, float *low, float *cl, int m)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j;
if (i < m)
{
j = ex[i];
v[i] = low[j];
}
}
int main()
{
int numBars = loadBars();
int numExtr = defineExtremums(numBars);
cout << “GPU kernel start:” << “\n”;
int N = numBars;
int M = numExtr;
float *ex, *z, *h, *l, *c;
float *d_x, *d_xx, *d_y, *d_z, *d_w;
int size_m = M * sizeof(float);
int size_n = N * sizeof(float);
ex = (float*)malloc(size_m);
z = (float*)malloc(size_m);
h = (float*)malloc(size_n);
l = (float*)malloc(size_n);
c = (float*)malloc(size_n);
cudaMalloc(&d_x, size_m);
cudaMalloc(&d_xx, size_m);
cudaMalloc(&d_y, size_n);
cudaMalloc(&d_z, size_n);
cudaMalloc(&d_w, size_n);
for (int i = 0; i < M; i++)
{
ex[i] = extr[i];
}
for (int i = 0; i < N; i++)
{
h[i] = high[i];
l[i] = low[i];
c[i] = close[i];
}
cudaMemcpy(d_x, ex, size_m, cudaMemcpyHostToDevice);
cudaMemcpy(d_y, h, size_n, cudaMemcpyHostToDevice);
cudaMemcpy(d_z, l, size_n, cudaMemcpyHostToDevice);
cudaMemcpy(d_w, c, size_n, cudaMemcpyHostToDevice);
dim3 threads = dim3(1024,1);
dim3 blocks = dim3(N/threads.x,1);
int numbench = 100000;
clock_t t1 = clock();
for (int u = 0; u < numbench; u++)
{
kernel<<<threads,blocks>>>(d_x, d_xx, d_y, d_z, d_w, M);
cudaMemcpy(z, d_xx, size_m, cudaMemcpyDeviceToHost);
}
clock_t t2 = clock();
clock_t t3 = t2-t1;
cout << "GPU: " << t3 << “\n”;
clock_t t4 = clock();
for (int u = 0; u < numbench; u++)
{
for(int i = 0; i < M; i++)
{
int j = ex[i];
z[i] = low[j];
}
}
clock_t t5 = clock();
clock_t t6 = t5-t4;
cout << "CPU: " << t6 << “\n”;
cudaFree(d_x);
cudaFree(d_xx);
cudaFree(d_y);
cudaFree(d_z);
cudaFree(d_w);
free(ex);
free(z);
free(h);
free(l);
free(c);
}