Hi everyone,
I’m currently working on an OpenCL project and need to optimize my program to make it run as fast as possible.
To do that, I am using Nsight (Eclipse Edition) to profile my project.
The problem is that no timeline is generated when I click on the “Profile” button.
But with a CUDA sample, it works great.
Have I missed something in the Nsight settings?
It would be useful if someone had an idea that could help me out.
Thanks.
My configuration:
Ubuntu 14
Nvidia GTX 960 (OpenCL 1.2)
NSight 7.5
Here is an OpenCL sample I am trying to profile; the timeline is still empty:
#include <iostream>
#include <CL/cl.hpp>
using namespace std;
int main() {
//get all platforms (drivers)
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if(all_platforms.size()==0){
std::cout<<" No platforms found. Check OpenCL installation!\n";
exit(1);
}
cl::Platform default_platform=all_platforms[0];
std::cout << "Using platform: "<<default_platform.getInfo<CL_PLATFORM_NAME>()<<"\n";
//get default device of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if(all_devices.size()==0){
std::cout<<" No devices found. Check OpenCL installation!\n";
exit(1);
}
cl::Device default_device=all_devices[0];
std::cout<< "Using device: "<<default_device.getInfo<CL_DEVICE_NAME>()<<"\n";
cl::Context context(CL_DEVICE_TYPE_GPU);
cl::Program::Sources sources;
// kernel calculates for each element C=A+B
std::string kernel_code=
" void kernel simple_add(global const int* A, global const int* B, global int* C){ "
" C[get_global_id(0)]=A[get_global_id(0)]+B[get_global_id(0)]; "
" } ";
cl::Program program(context, kernel_code, true);
// create buffers on the device
cl::Buffer buffer_A(context,CL_MEM_READ_WRITE,sizeof(int)*10);
cl::Buffer buffer_B(context,CL_MEM_READ_WRITE,sizeof(int)*10);
cl::Buffer buffer_C(context,CL_MEM_READ_WRITE,sizeof(int)*10);
int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
int B[] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0};
//create queue to which we will push commands for the device.
cl::CommandQueue queue(context,default_device, CL_QUEUE_PROFILING_ENABLE);
//write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,sizeof(int)*10,A);
queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,sizeof(int)*10,B);
//run the kernel
auto vadd = cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer>(program, "simple_add");
vadd(cl::EnqueueArgs(queue, cl::NDRange(10)), buffer_A, buffer_B, buffer_C);
int C[10];
//read result C from the device to array C
queue.enqueueReadBuffer(buffer_C,CL_TRUE,0,sizeof(int)*10,C);
queue.finish();
queue.flush();
std::cout<<" result: \n";
for(int i=0;i<10;i++){
std::cout<<C[i]<<" ";
}
return 0;
}