Hi all.
I am new to CUDA development so I apologise if my questions sound dumb.
In the past I have been working on algorithms that run on CPU-based distributed computing; in short, they perform a more or less complex graph evolution. Looking for a performance improvement, I am now working on porting these algorithms to run on a GPU.
However, my first preliminary tests are giving me unexpected results. I have put together an over-simplified version of the algorithm that consists of reading values from one vector and storing the sum of the values read in another vector. I have two versions of the algorithm, one for the CPU and another for the GPU:
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <conio.h>
#include <iostream>
using namespace std;
__host__ void procesaGrafo(
    double *h_vector,
    double *h_resultado,
    int size)
{
    for (int x = 0; x < size; x++)
    {
        h_resultado[x] = 0;
        for (int y = x - 100; y < x + 100; y++)
            if (y >= 0 && y < size)
                h_resultado[x] += h_vector[y];
    }
}
__global__ void procesaGrafoKernel(
    double *d_vector,
    double *d_resultado,
    int size)
{
    // Grid-stride loop: each thread handles elements index, index + stride, index + 2*stride, ...
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    if (index == 0)
        printf("Block %d, thread %d, grid dimension %d, block dimension %d, index %d, stride %d\n", blockIdx.x, threadIdx.x, gridDim.x, blockDim.x, index, stride);
    for (int x = index; x < size; x += stride)
    {
        d_resultado[x] = 0;
        for (int y = x - 100; y < x + 100; y++)
            if (y >= 0 && y < size)
                d_resultado[x] += d_vector[y];
    }
}
Then I allocate memory for both the host and device arrays, fill the data vectors with some values, run both versions, and measure the time taken by the CPU and then by the GPU:
#define VECSIZE 100000
cudaDeviceProp verifica()
{
    int devicesCount;
    cudaGetDeviceCount(&devicesCount);
    // devicesCount has the number of GPUs
    cudaDeviceProp device_prop;
    int dev_id = findCudaDevice(0, 0);
    checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
    if (!device_prop.managedMemory) {
        fprintf(stderr, "Unified Memory not supported on this device\n");
        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }
    if (device_prop.computeMode == cudaComputeModeExclusive || device_prop.computeMode == cudaComputeModeProhibited)
    {
        fprintf(stderr, "This sample requires a device in either default or process exclusive mode\n");
        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }
    return device_prop;
}
int main(void)
{
    double *d_vector;
    double *d_resultado;
    double *h_vector;
    double *h_resultado;
    cudaDeviceProp dispositivo = verifica();
    // Managed (unified) memory for the GPU version, plain heap memory for the CPU version
    cudaError_t e1 = cudaMallocManaged(&d_vector, VECSIZE * sizeof(double));
    cudaError_t e2 = cudaMallocManaged(&d_resultado, VECSIZE * sizeof(double));
    h_vector = new double[VECSIZE];
    h_resultado = new double[VECSIZE];
    if (e1 != cudaSuccess || e2 != cudaSuccess)
    {
        cout << "ERROR cudaMallocManaged";
        getch();
        exit(0);
    }
    for (int x = 0; x < VECSIZE; x++)
    {
        d_vector[x] = x;
        d_resultado[x] = 0;
        h_vector[x] = x;
        h_resultado[x] = 0;
    }
    cudaError_t result;
    cudaEvent_t start1, stop1, start2, stop2;
    float time1 = -1, time2 = -1;
    // Time the CPU version
    result = cudaEventCreate(&start1);
    result = cudaEventCreate(&stop1);
    result = cudaEventRecord(start1, 0);
    procesaGrafo(h_vector, h_resultado, VECSIZE);
    result = cudaEventRecord(stop1, 0);
    result = cudaEventSynchronize(stop1);
    result = cudaEventElapsedTime(&time1, start1, stop1);
    // Time the GPU version
    result = cudaEventCreate(&start2);
    result = cudaEventCreate(&stop2);
    result = cudaEventRecord(start2, 0);
    procesaGrafoKernel<<<512, 1024>>>(d_vector, d_resultado, VECSIZE);
    result = cudaDeviceSynchronize();
    result = cudaEventRecord(stop2, 0);
    result = cudaEventSynchronize(stop2);
    result = cudaEventElapsedTime(&time2, start2, stop2);
    // Check that both versions produce the same result
    bool iguales = true;
    for (int x = 0; x < VECSIZE && iguales == true; x++)
    {
        if (d_resultado[x] != h_resultado[x])
            iguales = false;
    }
    cout << "Host time: " << time1 << " ms. GPU time: " << time2 << " ms. Equivalent result: " << iguales << ". Last status = " << result << endl;
    delete[] h_resultado;
    delete[] h_vector;
    cudaFree(d_resultado);
    cudaFree(d_vector);
    getch();
    return 0;
}
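Writing this up, I also notice that I check the return values of cudaMallocManaged but not the kernel launch itself. In case it matters for the discussion, the extra check I would add right after the launch (just the usual cudaGetLastError() / cudaDeviceSynchronize() pair, reusing the names from the listing above; not part of my current code) is something like:
procesaGrafoKernel<<<512, 1024>>>(d_vector, d_resultado, VECSIZE);
cudaError_t launchErr = cudaGetLastError();     // reports an invalid launch configuration
cudaError_t syncErr = cudaDeviceSynchronize();  // reports errors raised while the kernel runs
if (launchErr != cudaSuccess || syncErr != cudaSuccess)
    cout << "Kernel error: " << cudaGetErrorString(launchErr != cudaSuccess ? launchErr : syncErr) << endl;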
As a baseline for comparison, with a vector size of 100000 elements the processing time is less than 1 ms on the CPU (a Xeon X5650 @ 2.67 GHz) and 30 seconds on a GTX 670 using a single block with just one thread. As I increase the number of blocks and threads, performance improves as follows:
<<<1,1>>> 30 s
<<<512,1>>> 1400 ms
<<<512,512>>> 66 ms
<<<512,1024>>> 89 ms
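For reference, those configurations were picked by hand. An alternative I have seen suggested (not what the code above does, and the block size of 256 is just an untuned guess on my part) is to derive the grid size from the problem size, for example:
int threadsPerBlock = 256;                                       // assumed block size, not tuned
int blocks = (VECSIZE + threadsPerBlock - 1) / threadsPerBlock;  // enough blocks to cover the whole vector
procesaGrafoKernel<<<blocks, threadsPerBlock>>>(d_vector, d_resultado, VECSIZE);
With the grid-stride loop inside the kernel this should still touch every element, even if the grid ends up smaller than the vector.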
With these values in front of me, my first question is: how can the CPU be outperforming the GPU even when I use 512x512 threads to process the array values in parallel? My second question is: why do I get worse times when I increase the block size further? I guess the answer to the second question is overhead, but I would still expect considerably better numbers.
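In case my way of timing the host call with CUDA events is itself misleading, I also intend to cross-check the CPU figure with std::chrono; a minimal sketch of that check (not part of the code above, meant to drop into main() in place of the event-based host timing) would be:
#include <chrono>
// ... inside main(), around the same host call as above:
auto t0 = std::chrono::steady_clock::now();
procesaGrafo(h_vector, h_resultado, VECSIZE);
auto t1 = std::chrono::steady_clock::now();
double cpuMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
cout << "CPU time (chrono): " << cpuMs << " ms" << endl;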
I would appreciate any advice or feedback on this topic.
Regards.