// ********************************************************************* // oclVectorSub Notes: // // A simple OpenCL API demo application that implements // element by element vector subtraction between 2 float arrays. // ********************************************************************* #include #include #include #include const char* cSourceFile = "VectorSub.cl"; void *srcA, *srcB, *dst; // Host buffers for OpenCL test // OpenCL Vars cl_context cxGPUContext; // OpenCL context cl_command_queue cqCommandQueue;// OpenCL command que cl_platform_id cpPlatform; // OpenCL platform cl_device_id cdDevice; // OpenCL device cl_program cpProgram; // OpenCL program cl_kernel ckKernel; // OpenCL kernel cl_kernel ckKernel_rp; // OpenCL kernel cl_kernel ckKernel_re; // OpenCL kernel cl_kernel ckKernel_rz; // OpenCL kernel cl_kernel ckKernel_rn; // OpenCL kernel cl_mem cmDevSrcA; // OpenCL device source buffer A cl_mem cmDevSrcB; // OpenCL device source buffer B cl_mem cmDevDst; // OpenCL device destination buffer size_t szGlobalWorkSize; // 1D var for Total # of work items size_t szLocalWorkSize; // 1D var for # of work items in the work group size_t szParmDataBytes; // Byte size of context information size_t szKernelLength; // Byte size of kernel code cl_int ciErr1, ciErr2; // Error code var char* cPathAndName = NULL; // var for full paths to data, src, etc. char* cSourceCL = NULL; // Buffer to hold source for compilation int iNumElements = 20; void VectorSubHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements); void Cleanup (int iExitCode); static unsigned floatAsUInt( float x ); int verifyResults( float* h_A, float* h_B, float* h_C, int N ); int main(int argc, char **argv) { int i; // start logs shrSetLogFileName ("oclVectorSub.txt"); shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = 256; // rounded up to the nearest multiple of the LocalWorkSize szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n" "# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); shrFillArray((float*)srcA, iNumElements); shrFillArray((float*)srcB, iNumElements); //Get an OpenCL platform ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL); shrLog("clGetPlatformID...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } //Get the devices ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); shrLog("clGetDeviceIDs...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1); shrLog("clCreateContext...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1); shrLog("clCreateCommandQueue...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Allocate the OpenCL buffer memory objects for source and result on the device GMEM cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; shrLog("clCreateBuffer...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); shrLog("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); shrLog("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Create the kernels shrLog("clCreateKernel (VectorSub)...\n"); ckKernel = clCreateKernel(cpProgram, "VectorSub", &ciErr1); shrLog("clCreateKernel (VectorSub_re)...\n"); ckKernel_rp = clCreateKernel(cpProgram, "VectorSub_re", &ciErr2); ciErr1 |= ciErr2; shrLog("clCreateKernel (VectorSub_rz)...\n"); ckKernel_re = clCreateKernel(cpProgram, "VectorSub_rz", &ciErr2); ciErr1 |= ciErr2; shrLog("clCreateKernel (VectorSub_rp)...\n"); ckKernel_rz = clCreateKernel(cpProgram, "VectorSub_rp", &ciErr2); ciErr1 |= ciErr2; shrLog("clCreateKernel (VectorSub_rn)...\n"); ckKernel_rn = clCreateKernel(cpProgram, "VectorSub_rn", &ciErr2); ciErr1 |= ciErr2; if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Set the Argument values ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); ciErr1 |= clSetKernelArg(ckKernel_re, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErr1 |= clSetKernelArg(ckKernel_re, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErr1 |= clSetKernelArg(ckKernel_re, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErr1 |= clSetKernelArg(ckKernel_re, 3, sizeof(cl_int), (void*)&iNumElements); ciErr1 |= clSetKernelArg(ckKernel_rz, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErr1 |= clSetKernelArg(ckKernel_rz, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErr1 |= clSetKernelArg(ckKernel_rz, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErr1 |= clSetKernelArg(ckKernel_rz, 3, sizeof(cl_int), (void*)&iNumElements); ciErr1 |= clSetKernelArg(ckKernel_rp, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErr1 |= clSetKernelArg(ckKernel_rp, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErr1 |= clSetKernelArg(ckKernel_rp, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErr1 |= clSetKernelArg(ckKernel_rp, 3, sizeof(cl_int), (void*)&iNumElements); ciErr1 |= clSetKernelArg(ckKernel_rn, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErr1 |= clSetKernelArg(ckKernel_rn, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErr1 |= clSetKernelArg(ckKernel_rn, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErr1 |= clSetKernelArg(ckKernel_rn, 3, sizeof(cl_int), (void*)&iNumElements); shrLog("clSetKernelArg 0 - 3...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Asynchronous write of data to GPU device ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL); ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL); shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // -------------------------------------------------------- // In turn launch each kernel, read the output and verify the output // Launch original kernel shrLog( "Testing VectorSub (default)\n" ); ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); shrLog("clEnqueueNDRangeKernel (VectorSub)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); shrLog("clEnqueueReadBuffer (Dst)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } i = verifyResults( (float*)srcA, (float*)srcB, (float*)dst, iNumElements ); shrLog( "%s\n\n", (i==iNumElements) ? "PASSED" : "FAILED" ); // Launch kernel_re - round to nearest even shrLog( "Testing VectorSub_re\n" ); ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_re, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); shrLog("clEnqueueNDRangeKernel (VectorSub_re)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); shrLog("clEnqueueReadBuffer (Dst)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } i = verifyResults( (float*)srcA, (float*)srcB, (float*)dst, iNumElements ); shrLog( "%s\n\n", (i==iNumElements) ? "PASSED" : "FAILED" ); // Launch kernel_rz - round to zero shrLog( "Testing VectorSub_rz\n" ); ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_rz, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); shrLog("clEnqueueNDRangeKernel (VectorSub_rz)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); shrLog("clEnqueueReadBuffer (Dst)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } i = verifyResults( (float*)srcA, (float*)srcB, (float*)dst, iNumElements ); shrLog( "%s\n\n", (i==iNumElements) ? "PASSED" : "FAILED" ); // Launch kernel_rp - round to positive inf shrLog( "Testing VectorSub_rp\n" ); ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_rp, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); shrLog("clEnqueueNDRangeKernel (VectorSub_rp)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); shrLog("clEnqueueReadBuffer (Dst)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } i = verifyResults( (float*)srcA, (float*)srcB, (float*)dst, iNumElements ); shrLog( "%s\n\n", (i==iNumElements) ? "PASSED" : "FAILED" ); // Launch kernel_rn - round to neg inf shrLog( "Testing VectorSub_rn\n" ); ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_rn, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); shrLog("clEnqueueNDRangeKernel (VectorSub_rn)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); shrLog("clEnqueueReadBuffer (Dst)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } i = verifyResults( (float*)srcA, (float*)srcB, (float*)dst, iNumElements ); shrLog( "%s\n\n", (i==iNumElements) ? "PASSED" : "FAILED" ); //-------------------------------------------------------- // Cleanup and leave Cleanup (EXIT_SUCCESS); } static unsigned floatAsUInt( float x ) { volatile union { float f; unsigned i; } xx; xx.f = x; return xx.i; } int verifyResults( float* h_A, float* h_B, float* h_C, int N ) { int i; // FE_TONEAREST matches GPU default // fesetround( FE_TONEAREST ); // fesetround( FE_TOWARDZERO ); // fesetround( FE_DOWNWARD ); fesetround( FE_UPWARD ); for( i=0; i 0 ) int int_diff = floatAsUInt( h_C[i] ) - floatAsUInt( sum ); std::cout << std::setprecision( 10 ) << h_A[i] << " - " << h_B[i] << " = " << h_C[i] << " vs " << sum << " on CPU (" << floatAsUInt( h_C[i] ) << " vs " << floatAsUInt( sum ) << " for i=" << i << ")" << std::endl; if( fabs( int_diff ) > 0 ) { break; } } return i; } void Cleanup (int iExitCode) { // Cleanup allocated objects shrLog("Starting Cleanup...\n\n"); if(cPathAndName)free(cPathAndName); if(cSourceCL)free(cSourceCL); if(ckKernel)clReleaseKernel(ckKernel); if(ckKernel_re)clReleaseKernel(ckKernel_re); if(ckKernel_rz)clReleaseKernel(ckKernel_rz); if(ckKernel_rp)clReleaseKernel(ckKernel_rp); if(ckKernel_rn)clReleaseKernel(ckKernel_rn); if(cpProgram)clReleaseProgram(cpProgram); if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue); if(cxGPUContext)clReleaseContext(cxGPUContext); if(cmDevSrcA)clReleaseMemObject(cmDevSrcA); if(cmDevSrcB)clReleaseMemObject(cmDevSrcB); if(cmDevDst)clReleaseMemObject(cmDevDst); // Free host memory free(srcA); free(srcB); free (dst); exit (iExitCode); }