I need help. I wrote a simple CUDA program for multiplying two vectors and I also wrote the same code in OpenCL but I got huge differences in measuring time of host memora allocation, device memory allocation and copying results from host to device and vice versa.
Can anyone help me? what could be the reason? I’m using Nvidia Geforce 9800 GT
CUDA
/*
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
// includes, kernels
#include <VecAdd_kernel.cu>
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(void)
{
float *ulazd_A, *ulazd_B, *izlazd_C;
float *ulazh_A, *ulazh_B, *izlazh_C;
int velicinaBloka =512;
int N = 65536;
printf("Velicina vektora je: %d\n\n", N);
unsigned int brojac = 0;
float H_ALL_vrijeme = 0.0f;
float D_ALL_vrijeme = 0.0f;
float H_to_D_vrijeme = 0.0f;
float D_to_H_vrijeme = 0.0f;
float D_vrijeme = 0.0f;
cudaEvent_t start1, start2, start3, start4, start5, start6;
cudaEvent_t stop1, stop2, stop3, stop4, stop5, stop6;
cudaEventCreate( &start1 ) ;
cudaEventCreate( &start2 ) ;
cudaEventCreate( &start3 ) ;
cudaEventCreate( &start4 ) ;
cudaEventCreate( &start5 ) ;
cudaEventCreate( &start6 ) ;
cudaEventCreate( &stop1 ) ;
cudaEventCreate( &stop2 ) ;
cudaEventCreate( &stop3 ) ;
cudaEventCreate( &stop4 ) ;
cudaEventCreate( &stop5 ) ;
cudaEventCreate( &stop6 ) ;
cutCreateTimer(&brojac);
cutStartTimer( brojac);
//dodijeli memoriju domacina
cudaEventRecord( start1, 0 );
ulazh_A = (float*)malloc(sizeof(float)*N);
ulazh_B = (float*)malloc(sizeof(float)*N);
izlazh_C = (float*)malloc(sizeof(float)*N);
cudaEventRecord( stop1, 0 );
cudaEventSynchronize(stop1);
cudaEventElapsedTime(&H_ALL_vrijeme,start1, stop1);
//ispuni vektore nasumiÄnim realnim brojevima
for (int i=0;i<N;i++)
{
ulazh_A[i] = (float)rand()/(float)RAND_MAX;
ulazh_B[i] = (float)rand()/(float)9;
}
//dodijeli memoriju uređaja
cudaEventRecord( start2, 0 );
cudaMalloc((void **)&ulazd_A, N*sizeof(float));
cudaMalloc((void **)&ulazd_B, N*sizeof(float));
cudaMalloc((void **)&izlazd_C, N*sizeof(float));
cudaEventRecord( stop2, 0 );
cudaEventSynchronize(stop2);
cudaEventElapsedTime(&D_ALL_vrijeme,start2, stop2);
//xxxx
dim3 dimBlock(velicinaBloka);
dim3 dimGrid (ceil(float(N)/float(dimBlock.x)));
//kopiraj memoriju s domacina na uredjaj
cudaEventRecord( start3, 0 );
cudaMemcpy(ulazd_A, ulazh_A, sizeof(float)*N, cudaMemcpyHostToDevice);
cudaMemcpy(ulazd_B, ulazh_B, sizeof(float)*N, cudaMemcpyHostToDevice);
cudaEventRecord( stop3, 0 );
cudaEventSynchronize(stop3);
cudaEventElapsedTime(&H_to_D_vrijeme,start3, stop3);
// izvrsi jezgru
cudaEventRecord( start4, 0 );
VecAdd<<< dimGrid,dimBlock >>>(ulazd_A,ulazd_B,izlazd_C);
//cudaThreadSynchronize();
cudaEventRecord( stop4, 0 );
cudaEventSynchronize(stop4);
cudaEventElapsedTime(&D_vrijeme,start4, stop4);
//kopiraj rezultate s uređaja na domacina
cudaEventRecord( start6, 0 );
cudaMemcpy(izlazh_C, izlazd_C, N*sizeof(float), cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
cudaEventRecord( stop6, 0 );
cudaEventSynchronize(stop6);
cudaEventElapsedTime(&D_to_H_vrijeme,start6, stop6);
cudaThreadSynchronize();
//zaustavi brojac
cutStopTimer( brojac);
printf("GPU Vrijeme procesiranja: %f ms\n", cutGetTimerValue(brojac));
//ispisi rezultat
printf(“Vrijeme alociranja memorije domacina: %f ms\n”, H_ALL_vrijeme);
printf("Vrijeme alociranja memorije uredjaja: %f ms\n", D_ALL_vrijeme);
printf("Vrijeme kopiranja podataka s CPU na GPU: %f ms\n", H_to_D_vrijeme);
printf("Vrijeme izvodenja jezgre: %f ms\n", D_vrijeme);
printf("Vrijeme kopiranja podataka s GPU-a na RAM: %f ms\n", D_to_H_vrijeme);
system (“PAUSE”);
// oslobodi memoriju
free( ulazh_A);
free( ulazh_B);
free( izlazh_C);
cudaFree(ulazd_A);
cudaFree(ulazd_B);
cudaFree(izlazd_C);
cudaEventDestroy(start1);
cudaEventDestroy(stop1);
cudaEventDestroy(start2);
cudaEventDestroy(stop2);
cudaEventDestroy(start3);
cudaEventDestroy(stop3);
cudaEventDestroy(start4);
cudaEventDestroy(stop4);
cudaEventDestroy(start5);
cudaEventDestroy(stop5);
cudaEventDestroy(start6);
cudaEventDestroy(stop6);
cutDeleteTimer(brojac);
cudaThreadExit();
cutDeleteTimer(brojac);
}
OPENCL
/*
*Code
*/
#include <stdio.h>
#include <shrQATest.h>
#include <oclUtils.h>
#include
// OpenCL application to compute a simple vector multiplication
// computation between 2 arrays on the GPU
// ******************************************************************
#include <stdio.h>
#include <stdlib.h>
#include
#include <string.h>
#include
#include <assert.h>
#include <time.h>
#include <Windows.h>
#include “Timer.h”
// OpenCL izvorni kod
const char* OpenCLSource = {
“__kernel void VectorAdd(__global float* GPUVector1, __global float* GPUVector2,__global float* GPUOutputVector)”,
“{”,
" // Indeks elemenata koji se trebaju pomnožiti \n",
" unsigned int n = get_global_id(0);",
" // pomnoži elemente vektora a i b i spremi u c \n",
" GPUVector1[n] = GPUVector2[n] + GPUOutputVector[n];",
“}”
};
// broj elemenata vektora
#define SIZE 10000
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
printf("Velicina vektora je: %d \n\n", SIZE);
// Alociraj memoriju domaćina
CPerfCounter t;
t.Reset();
t.Start();
float* HostVector1 = (float *)malloc(sizeof(float) * SIZE);
float* HostVector2 = (float *)malloc(sizeof(float) * SIZE);
float* HostOutputVector = (float *)malloc(sizeof(float) * SIZE);
t.Stop();
printf(“Vrijeme potrebno za alokaciju memorije domacina: %2f ms\n\n”, t.GetElapsedTime());
t.Reset();
//kreiraj događaje za GPU
cl_event events [3];
//cl_event CPUtoGPU;
//cl_event CPUtoGPU2;
// Ispuni vektore domaćina s nasumiÄnim realnim brojevima
for(int c = 0; c < SIZE; c++)
{
HostVector1[c] = (float)rand()/(float)RAND_MAX;
HostVector2[c] = (float)rand()/(float)9;
}
//OpenCL varijable
cl_int status;
cl_platform_id platform;
cl_context GPUContext;
size_t ParmDataBytes;
cl_device_id* GPUDevices;
cl_command_queue GPUCommandQueue;
cl_mem GPUVector1;
cl_mem GPUVector2;
cl_mem GPUOutputVector;
cl_program OpenCLProgram;
cl_kernel OpenCLVectorAdd;
// platforma
status = clGetPlatformIDs (1, &platform, NULL);
if (status != CL_SUCCESS) {
printf( "Greska: Nema dostupnih OpenCL platformi!");
return 0;
}
// postavi kontekst
cl_context_properties props[3];
props[0] = (cl_context_properties)CL_CONTEXT_PLATFORM;
props[1] = (cl_context_properties)platform;
props[2] = (cl_context_properties)0;
// Kreiraj OpenCL kontekst
GPUContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU,NULL, NULL, &status);
if (status != CL_SUCCESS) {
printf( "Greska u kreiranju konteksta " );
return 0;
}
// Dohvati listu GPU uređaja koji su u kontekstu
clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);
// Kreiraj commandqueuq za GPU uređaj
GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], CL_QUEUE_PROFILING_ENABLE, &status);
if (status != CL_SUCCESS) {
printf( "Greska: Neuspjesno kreiranje memorije sa slijednim pristupom naredbama!" );
return 0;
}
// Alociraj memoriju uređaja
t.Reset();
t.Start();
GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE, sizeof(float) * SIZE, NULL, &status);
if (status != CL_SUCCESS) {
printf( "Greska u kreiranju memorijskog meduspremnika GPUVector1: " );
return 0;
}
GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE, sizeof(float) * SIZE, NULL, &status);
if (status != CL_SUCCESS) {
printf( "Greska u kreiranju memorijskog meduspremnika GPUVector2: " );
return 0;
}
// AlocirajGPU memoriju za rezultantni
GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,sizeof(float) * SIZE, NULL, NULL);
if (status != CL_SUCCESS) {
printf( "Greska u kreiranju memorijskog meduspremnika GPUOutputVector: " );
return 0;
}
t.Stop();
printf( “Alociranje memorije ureÄ‘aja: %2f ms\n\n”, t.GetElapsedTime());
t.Reset();
// Kopiraj podatke s CPU na GPU
t.Reset();
t.Start();
clEnqueueWriteBuffer(GPUCommandQueue, GPUVector1, CL_FALSE, 0, SIZE * sizeof(float), HostVector1, 0, NULL, &events[3]);
status = clWaitForEvents(1, &events[3]);
if(status != CL_SUCCESS) {
printf( "Greska u cekanju na kopiranje podataka s CPU na GPU!" );
return 0;
}
/* Calculate performance */
cl_ulong startTime2;
cl_ulong endTime2;
/* Get kernel profiling info */
status = clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime2, 0);
if(status != CL_SUCCESS) {
printf( "Greska u proracunu vremena jedan! %d", status );
return 0;
}
status = clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime2, 0);
if(status != CL_SUCCESS) {
printf( "Greska u proracunu vremena CPU na GPU!" );
return 0;
}
double vrijeme1 = (endTime2 - startTime2) * 1.0e-6f;
printf( “Vrijeme kopiranja podataka s CPU na GPU HostVector1: %lf ms\n”, vrijeme1);
clEnqueueWriteBuffer(GPUCommandQueue, GPUVector2, CL_TRUE, 0, SIZE * sizeof(float), HostVector2, 0, NULL, &events[2]);
t.Stop();
printf( “Vrijeme kopiranja podataka s CPU na CGU: %2f ms\n\n”, t.GetElapsedTime());
t.Reset();
status = clWaitForEvents(1, &events[2]);
if(status != CL_SUCCESS) {
printf( "Greska u cekanju na kopiranje podataka s CPU na GPU!" );
return 0;
}
/* Calculate performance */
cl_ulong startTime3;
cl_ulong endTime3;
/* Get kernel profiling info */
status = clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime3, 0);
if(status != CL_SUCCESS) {
printf( "Greska u proracunu vremena jedan! %d", status );
return 0;
}
status = clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime3, 0);
if(status != CL_SUCCESS) {
printf( "Greska u proracunu vremena CPU na GPU!" );
return 0;
}
double vrijeme2 = (endTime3 - startTime3) * 1.0e-6f;
//printf( “Vrijeme kopiranja podataka s CPU na CGU: %2f ms\n”, vrijeme2);
printf( “Ukupno vrijeme kopiranja podataka s CPU na GPU event: %2f ms\n”, vrijeme2+vrijeme1);
clReleaseEvent(events[2]);
clReleaseEvent(events[3]);
// Create OpenCL program with source code
OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, OpenCLSource, NULL, &status);
if (status != CL_SUCCESS) {
printf( "Greska u kreiranju programa " );
return 0;
}
// Build the program (OpenCL JIT compilation)
clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
if (status != CL_SUCCESS) {
printf( "Greska u izgradnji programa" );
return 0;
}
// Create a handle to the compiled OpenCL function (Kernel)
OpenCLVectorAdd = clCreateKernel(OpenCLProgram, “VectorAdd”, &status);
if (status != CL_SUCCESS) {
printf( "Greska u kreiranju jezgre " );
return 0;
}
// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector);
clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2);
// Launch the Kernel on the GPU
size_t WorkSize[1] = {SIZE}; // one dimensional Range
t.Reset();
t.Start();
status = clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, &events[0]);
if (status != CL_SUCCESS) {
printf( "Greska u pokretanju jezgre!" );
return 0;
}
t.Stop();
printf( “Vrijeme izvodjenja jezgree: %lf ms\n”, t.GetElapsedTime());
t.Reset();
status = clWaitForEvents(1, &events[0]);
if(status != CL_SUCCESS) {
printf( "Greska u cekanju na dogadjaj!" );
return 0;
}
/* Calculate performance */
cl_ulong startTime;
cl_ulong endTime;
/* Get kernel profiling info */
status = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL);
if(status != CL_SUCCESS) {
printf( "Greska u proracunu vremena jedan! %d", status );
return 0;
}
status = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
if(status != CL_SUCCESS) {
printf( "Greska u proracunu vremena dva!" );
return 0;
}
double KernelExecutionTime = (endTime - startTime) * 1.0e-6f;
printf( “Vrijeme izvodjenja jezgre event: %2f ms\n”, KernelExecutionTime);
clReleaseEvent(events[0]);
// Copy the output in GPU memory back to CPU memory
t.Reset();
t.Start();
clEnqueueReadBuffer(GPUCommandQueue, GPUOutputVector, CL_FALSE, 0, SIZE * sizeof(float), HostOutputVector, 0, NULL, &events[1]);
t.Stop();
printf( “Vrijeme kopiranja podataka s GPU na CPU: %lf ms\n”, t.GetElapsedTime());
t.Reset();
status = clWaitForEvents(1, &events[1]);
if(status != CL_SUCCESS) {
printf( "Greska u cekanju na kopiranje podataka s GPU na CPU!" );
return 0;
}
/* Calculate performance */
cl_ulong startTime1;
cl_ulong endTime1;
/* Get kernel profiling info */
status = clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime1, NULL);
if(status != CL_SUCCESS) {
printf( "Greska u proracunu vremena jedan! %d", status );
return 0;
}
status = clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime1, NULL);
if(status != CL_SUCCESS) {
printf( "Greska u proracunu vremena dva!" );
return 0;
}
double vrijeme = (endTime1 - startTime1) * 1.0e-6f;
printf( “Vrijeme kopiranja podataka s GPU na CPU pomocu event: %lf ms\n”, vrijeme);
clReleaseEvent(events[1]);
// Cleanup
free(GPUDevices);
free (HostVector1);
free (HostVector2);
free (HostOutputVector);
clReleaseKernel(OpenCLVectorAdd);
clReleaseProgram(OpenCLProgram);
clReleaseCommandQueue(GPUCommandQueue);
clReleaseContext(GPUContext);
clReleaseMemObject(GPUVector1);
clReleaseMemObject(GPUVector2);
clReleaseMemObject(GPUOutputVector);
}