Hello everyone!
I’m very glad that I can use Nvidia’s fairly new OpenCL implementation, and that there are so many code examples and docs to help someone like me, without any prior knowledge of OpenCL or CUDA, understand this topic. Many thanks for that!
So I’ve read the OpenCL_GettingStartedLinux.pdf from the NVDeveloper homepage and installed this stuff on my OpenSUSE 11.0 system:
[font=“Courier New”]cudatoolkit_3.1_linux_64_suse11.2.run
devdriver_3.1_linux_64_258.19_opencl1.1.run
gpucomputingsdk_1_1_beta_linux.run[/font]
After compilation I could run oclDeviceQuery and the output was correct.
Then I compiled the nice example from OpenCL_GettingStartedLinux.pdf (appended below), but the output was “empty”. So I decided to print the values of HostOutputVector as integers, and they are all zeros instead of the sums of the two input vectors:
[font=“Courier New”] > ./vectoradd
CL_DEVICE_NAME: GeForce GTX 260
CL_DRIVER_VERSION: 258.19
00000000000000000000 00000000000000000000 00000000000000000000 00000000000000000000 00000000000000000000
[…]
The End[/font]
I only added the output about the driver and the device to see if it’s right and it is.
Am I doing something wrong? Many thanks for your help!
Best regards,
Lukas
That’s the code. I couldn’t attach it as a file…
[codebox]//************************************************************
// Demo OpenCL application to compute a simple vector addition
// computation between 2 arrays on the GPU
// ************************************************************
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
// OpenCL C source of the VectorAdd kernel, split into 7 strings that are
// handed to clCreateProgramWithSource as an array (hence the [] -- a plain
// `const char*` can hold only one string). Each work-item adds one pair of
// elements: c[n] = a[n] + b[n], where n is its global id in dimension 0.
// Every fragment ends in '\n' so that the "//" comments inside the kernel
// cannot swallow the code that follows them once the strings are joined.
const char* OpenCLSource[] = {
"__kernel void VectorAdd(__global int* c, __global int* a, __global int* b)\n",
"{\n",
" // Index of the elements to add\n",
" unsigned int n = get_global_id(0);\n",
" // Sum the n-th element of vectors a and b and store in c\n",
" c[n] = a[n] + b[n];\n",
"}\n"
};
// Seed patterns for the two input vectors (20 values each). The host code
// repeats them cyclically to fill SIZE elements. Added element by element
// they give the ASCII codes of "Hello OpenCL World! " (37+35 = 72 = 'H', ...).
int InitialData1[20] = {
    37, 50, 54, 50, 56,   0, 43, 43, 74, 71,
    32, 36, 16, 43, 56, 100, 50, 25, 15, 17
};
int InitialData2[20] = {
    35, 51, 54, 58, 55, 32, 36, 69, 27, 39,
    35, 40, 16, 44, 55, 14, 58, 75, 18, 15
};
// Length, in elements, of each vector that gets added
#define SIZE 2048
// Main function
// ************************************************************
int main(int argc, char **argv)
{
// Two integer source vectors in Host memory
int HostVector1, HostVector2;
// Initialize with some interesting repeating data
for(int c = 0; c < SIZE; c++)
{
HostVector1[c] = InitialData1[c%20];
HostVector2[c] = InitialData2[c%20];
}
//Get an OpenCL platform
cl_platform_id cpPlatform;
clGetPlatformIDs(1, &cpPlatform, NULL);
// Get a GPU device
cl_device_id cdDevice;
clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
char cBuffer[1024];
clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
printf(“CL_DEVICE_NAME: %s\n”, cBuffer);
clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL);
printf(“CL_DRIVER_VERSION: %s\n\n”, cBuffer);
// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
// Create a command-queue on the GPU device
cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, 0, NULL);
// Allocate GPU memory for source vectors AND initialize from CPU memory
cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector1, NULL);
cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector2, NULL);
// Allocate output memory on GPU
cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
sizeof(int) * SIZE, NULL, NULL);
// Create OpenCL program with source code
cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 7,
OpenCLSource, NULL, NULL);
// Build the program (OpenCL JIT compilation)
clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
// Create a handle to the compiled OpenCL function (Kernel)
cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, “VectorAdd”, NULL);
// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector);
clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2);
// Launch the Kernel on the GPU
size_t WorkSize[1] = {SIZE}; // one dimensional Range
clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorAdd, 1, NULL,
WorkSize, NULL, 0, NULL, NULL);
// Copy the output in GPU memory back to CPU memory
int HostOutputVector;
clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0,
SIZE * sizeof(int), HostOutputVector, 0, NULL, NULL);
// Cleanup
clReleaseKernel(OpenCLVectorAdd);
clReleaseProgram(OpenCLProgram);
clReleaseCommandQueue(cqCommandQueue);
clReleaseContext(GPUContext);
clReleaseMemObject(GPUVector1);
clReleaseMemObject(GPUVector2);
clReleaseMemObject(GPUOutputVector);
// Print out the results
for (int Rows = 0; Rows < (SIZE/20); Rows++, printf(“\t”)){
for(int c = 0; c <20; c++){
printf("%d",HostOutputVector[Rows * 20 + c]);
}
}
printf(“\n\nThe End\n\n”);
return 0;
}[/codebox]