cxGPUContext = clCreateContext(0, 1, &device_id, NULL, NULL, &ciErrNum);
////////////////////////////// kernel1: crate command queue, kernel, prepare buffer for kernel
commandQueue = clCreateCommandQueue(cxGPUContext, device_id, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
cl_program cpProgram = clCreateProgramWithSource(cxGPUContext,1, (const char **) &ProgramSource, NULL, &ciErrNum);
ciErrNum = clBuildProgram(cpProgram, 0, NULL, “-cl-fast-relaxed-math”, NULL, NULL);
kernel = clCreateKernel(cpProgram, “hello”, &ciErrNum);
input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, DATA_SIZE * sizeof(double), NULL,NULL);
output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, DATA_SIZE * sizeof(double), NULL,NULL);
clEnqueueWriteBuffer(commandQueue, input, CL_FALSE, 0, sizeof(double) * DATA_SIZE, inputData, 0, NULL, NULL);
clFlush(commandQueue);
////////////////////////////// kernel2: crate command queue, kernel, prepare buffer for kernel
commandQueue2 = clCreateCommandQueue(cxGPUContext, device_id, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
cl_program cpProgram2 = clCreateProgramWithSource(cxGPUContext,1, (const char **) &ProgramSource2, NULL, &ciErrNum);
ciErrNum = clBuildProgram(cpProgram2, 0, NULL, “-cl-fast-relaxed-math”, NULL, NULL);
kernel2 = clCreateKernel(cpProgram2, “hello”, &ciErrNum);
input2 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, DATA_SIZE * sizeof(double), NULL,NULL);
output2 = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, DATA_SIZE * sizeof(double), NULL,NULL);
clEnqueueWriteBuffer(commandQueue2, input2, CL_FALSE, 0, sizeof(double) * DATA_SIZE, inputData, 0, NULL, NULL);
clFlush(commandQueue2);
////////////////////////////// kernel1: run & read buffer
// Bind the device buffers to the kernel's two arguments (index 0 = input,
// index 1 = output). The two bindings are independent of each other.
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);

// Launch a 1-D range of `global` work-items; the local size is left to the
// runtime (NULL). event3 tracks the kernel execution.
clEnqueueNDRangeKernel(commandQueue, kernel,
                       1,        /* work_dim */
                       NULL,     /* global offset */
                       &global,  /* global size */
                       NULL,     /* local size */
                       0, NULL,  /* no wait list */
                       &event3);

// Non-blocking read-back (CL_FALSE) of DATA_SIZE doubles into `results`;
// event1 tracks the copy. `results` is not valid until that copy completes.
clEnqueueReadBuffer(commandQueue, output,
                    CL_FALSE, 0, DATA_SIZE * sizeof(double),
                    results,
                    0, NULL,
                    &event1);

// Submit the queued commands without waiting for them to finish.
clFlush(commandQueue);
/////////////////////////////// kernel2: run & read buffer
// Bind the second kernel's buffers (index 0 = input2, index 1 = output2).
clSetKernelArg(kernel2, 0, sizeof(cl_mem), &input2);
clSetKernelArg(kernel2, 1, sizeof(cl_mem), &output2);
// Launch a 1-D range of `global` work-items on the second queue; event4 tracks
// the kernel execution.
clEnqueueNDRangeKernel(commandQueue2, kernel2, 1, NULL, &global, NULL, 0, NULL, &event4);
// Non-blocking read-back (CL_FALSE) into results2; event2 tracks the copy.
clEnqueueReadBuffer(commandQueue2, output2, CL_FALSE, 0, DATA_SIZE * sizeof(double), results2, 0, NULL, &event2);
// Submit queue 2's commands before blocking on queue 1, so both queues have
// been handed to the device by the time the host starts waiting.
clFlush(commandQueue2);
// Block until every command on each queue has completed. Both read-backs are
// non-blocking, so results/results2 are only safe to read after their queue
// has finished. A second clFlush on queue 2 would submit but not wait.
clFinish(commandQueue);
clFinish(commandQueue2);