You have a typo in your code here:
The character at the end of that statement, just before the comment, should be a semicolon (;), not a comma (,).
In C or C++, you cannot use a function name (“innerproduct”) before you declare it:
user9725:
enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,^ { innerproduct(&inp, &ppnum, &out); });//CLK_ENQUEUE_FLAGS_WAIT_KERNEL表示父級的所有工作項內核必須完成執行並且所有立即在入隊之前發生的副作用子內核可以開始執行。
I suggest updating to the latest driver for your GPU
You are passing arguments incorrectly to the nested kernel call; you should pass the arguments directly, not by taking their address:
That would be the minimum set of changes I would recommend to get past the clBuildProgram error:
$ cat t8.cpp
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <vector>
#include <string>
#include <cstring>
#include <stddef.h>
#include <CL/cl.h>
using namespace std;
// testpp stringifies its arguments, so the OpenCL C program below can be
// written inline without quoting. The preprocessor removes comments before
// stringifying, so the comments inside the macro call never reach the OpenCL
// compiler.
#define testpp(...)#__VA_ARGS__
// pp holds the OpenCL C source for two kernels:
//   innerproduct - element-wise product: out[id] = inp1[id] * inp2[id].
//   ppp          - parent kernel that enqueues innerproduct from the device.
const char* pp = testpp(
    // Child kernel: each work-item multiplies one pair of elements.
    kernel void innerproduct(
        global double* inp1,
        global double* inp2,
        global double* out)
    {
        int id = get_global_id(0);
        out[id] = inp1[id] * inp2[id];
    }

    // Parent kernel: launches innerproduct as a child kernel on the default
    // device queue, forwarding its three buffers unchanged.
    __kernel void ppp(
        __global double* inp,
        __global double* ppnum,
        __global double* out)
    {
        // Every child launch covers the same 5 elements, so a single launch is
        // enough; without this guard each parent work-item would enqueue its
        // own identical copy of the child kernel.
        if (get_global_id(0) == 0)
        {
            // ndrange_1D(global_work_size, local_work_size): size of the child launch.
            ndrange_t ndrange = ndrange_1D(5, 1);
            // CLK_ENQUEUE_FLAGS_WAIT_KERNEL: the child may start only after all
            // work-items of the parent kernel have finished executing.
            enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,
                           ^{ innerproduct(inp, ppnum, out); });
        }
    }
);
// Host-side data for the demo: two 5-element input vectors and the
// 5-element array that receives the result read back from the device.
double targetinp[5] = { 5.0, 7.0, 0.0, 4.0, 6.0 };
double tarpp[5]     = { 1.0, 5.0, 0.0, 0.0, 3.0 };
double oup[5]       = { 0.0 };
int main()
{
cl_int err = 0;
cl_uint numPlatforms;
cl_platform_id platform = NULL;
int ans;
err = clGetPlatformIDs(0, NULL, &numPlatforms);
if (err != CL_SUCCESS)
{
printf("Error: Getting Platforms\n");
return EXIT_FAILURE;
}
if (numPlatforms > 0)
{
cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));// 建立平台列表陣列空間
err = clGetPlatformIDs(numPlatforms, platforms, NULL);// 收集/查詢平台ID
if (err != CL_SUCCESS)
{
printf("Error: Getting Platform Ids.(clGetPlatformIDs)\n");
system("pause");
return -1;
}
for (unsigned int i = 0; i < numPlatforms; ++i)
{
char pbuff[100];
err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);//獲取平台供應商
//platform = platforms[i];
cout << i << ":" << pbuff << "\n";
}
cout << "select platform: ";
cin >> ans;
platform = platforms[ans];
free(platforms);
}
else
{
cout << "沒有找到任何平台";
return EXIT_FAILURE;
}
cl_uint num_devices = 0;
cl_device_id* devices = NULL;
cl_device_id device = NULL;
//建立主要設備
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, devices, &num_devices);//查詢設備數量
if (num_devices == 0) //no GPU available.
{
cout << "No GPU device available." << endl;
return EXIT_FAILURE;
}
else
{
devices = (cl_device_id*)malloc(num_devices * sizeof(cl_device_id));
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);//查詢設備ID
cout << num_devices << "\n";
for (unsigned int i = 0; i < num_devices; ++i)
{
char pbuff[100];
err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);//獲取設備訊息
cout << i << ":" << pbuff << "\n";
}
cout << "select device: ";
cin >> ans;
device = devices[ans];
free(devices);
}
//建立上下文
cl_context context = nullptr;
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
// 創建主要命令隊列
cl_command_queue commandQueue = nullptr;
commandQueue = clCreateCommandQueue(context, device, 0, &err);
//建立程式物件
size_t ppsize[] = { strlen(pp) }; //求程式字串長度
//cout << context;
cl_program pprog = clCreateProgramWithSource(context, 1, &pp, ppsize, &err);
if (err != CL_SUCCESS)
{
printf("Error: Loading Binary into cl_program (clCreateProgramWithBinary)\n");
return EXIT_FAILURE;
}
//cout << pprog;
//編譯device程式
err = clBuildProgram(pprog, 1, &device, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
if (err == CL_BUILD_PROGRAM_FAILURE) {
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(pprog, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char* log = (char*)malloc(log_size);
// Get the log
clGetProgramBuildInfo(pprog, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
}
cout << err;
printf("Error: Building Program (clBuildingProgram)\n");
return EXIT_FAILURE;
}
// 建立內核
cl_kernel testkernel = clCreateKernel(pprog, "ppp", &err);
if (err != CL_SUCCESS)
{
size_t log_size;
clGetProgramBuildInfo(pprog, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char* log = (char*)malloc(log_size);
// Get the log
clGetProgramBuildInfo(pprog, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
printf("Error: Creating Kernel from program.(clCreateKernel)\n");
return EXIT_FAILURE;
}
// 建立記憶體物件
cl_mem inply = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(double) * 5, &targetinp, &err);
cl_mem preg = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(double) * 5, &tarpp, &err);
cl_mem ouply = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(double) * 5, &oup, &err);
// 在設備上分發記憶體物件
err = clSetKernelArg(testkernel, 0, sizeof(cl_mem), (void*)&inply);
err = clSetKernelArg(testkernel, 1, sizeof(cl_mem), (void*)&preg);
err = clSetKernelArg(testkernel, 2, sizeof(cl_mem), (void*)&ouply);
// 設備執行
size_t globalThreads = 5; //設定每個維度上work_items總數量
size_t localThreads = 1; //設定每個工作組中work_items的數量,對應於local_size
err = clEnqueueNDRangeKernel(commandQueue, testkernel, 1, NULL, &globalThreads, &localThreads, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Enqueueing kernel\n");
return EXIT_FAILURE;
}
// 確認 command queue 中所有命令都執行完畢
err = clFinish(commandQueue);
if (err != CL_SUCCESS)
{
printf("Error: Finish command queue\n");
return EXIT_FAILURE;
}
// 取回計算值
err = clEnqueueReadBuffer(commandQueue, ouply, CL_TRUE, 0, sizeof(double) * 5, &oup, 0, NULL, NULL);
for (int i = 0; i < 5; i++)
{
cout << oup[i] << "\n";
}
}
$ g++ t8.cpp -o t8 -L/usr/local/cuda/lib64 -lOpenCL -I/usr/local/cuda/include
$ ./t8
0:NVIDIA Corporation
select platform: 0
4
0:Tesla V100-PCIE-32GB
1:Tesla K20Xm
2:Tesla K20Xm
3:Tesla K20Xm
select device: 0
5
35
0
0
18
$
Other changes may also be necessary (for example, the global/local thread choices for the parent and child kernels may not make sense in other settings). However, your program appears to print sensible results for this particular case.