Hi all!
Currently, I am working on Java-bindings for the driver-layer of CUDA, which are pretty complete. As a test I tried to port the [font=“Courier New”]matrixMulDrv[/font] project of the CUDA-SDK. The entire setup works well:
[*]initialization of CUDA
[*]obtaining the device
[*]creating a context
[*]memory copies between device and host, even for page-locked memory
[*]loading the [font=“Courier New”]cubin[/font] file
[*]selecting and configuring the function [font=“Courier New”]matrixMul[/font]
Unfortunately the kernel refuses to launch with the error: [font=“Courier New”]CUDA_ERROR_LAUNCH_FAILED[/font].
This is the Java code:
[font=“Courier New”]
// Port of the CUDA SDK matrixMulDrv sample to the Java driver-API bindings.
// Sets up the context, loads the cubin, runs matrixMul on the device and
// compares the result against a host-side reference implementation.
CUDADriver.cuInit(0);
CUdevice cuDevice = new CUdevice();
CUDADriver.cuDeviceGet(cuDevice, 0);
CUcontext cuContext = new CUcontext();
CUDADriver.cuCtxCreate(cuContext, 0, cuDevice);
CUmodule cuModule = new CUmodule();
CUDADriver.cuModuleLoad(cuModule, "matrixMul_kernel.cubin");
CUfunction matrixMul = new CUfunction();
CUDADriver.cuModuleGetFunction(matrixMul, cuModule, "matrixMul");
// allocate host memory for matrices A and B
// (4 bytes per element: Java float == 32-bit IEEE single)
int size_A = WA * HA;
int mem_size_A = 4 * size_A;
float[] h_A = new float[size_A]; // was: (float*) malloc(mem_size_A);
int size_B = WB * HB;
int mem_size_B = 4 * size_B;
float[] h_B = new float[size_B]; // was: (float*) malloc(mem_size_B);
// initialize host memory
randomInit(h_A);
randomInit(h_B);
// allocate device memory
CUdeviceptr d_A = new CUdeviceptr();
CUDADriver.cuMemAlloc(d_A, mem_size_A);
CUDADriver.cuCtxSynchronize();
CUdeviceptr d_B = new CUdeviceptr();
CUDADriver.cuMemAlloc(d_B, mem_size_B);
CUDADriver.cuCtxSynchronize();
// copy host memory to device
CUDADriver.cuMemcpyHtoD(d_A, h_A, 0, h_A.length);
CUDADriver.cuCtxSynchronize();
CUDADriver.cuMemcpyHtoD(d_B, h_B, 0, h_B.length);
CUDADriver.cuCtxSynchronize();
// allocate device memory for result
int size_C = WC * HC;
int mem_size_C = 4 * size_C;
CUdeviceptr d_C = new CUdeviceptr();
CUDADriver.cuMemAlloc(d_C, mem_size_C);
CUDADriver.cuCtxSynchronize();
// setup execution parameters
CUDADriver.cuFuncSetBlockShape(matrixMul, BLOCK_SIZE, BLOCK_SIZE, 1);
CUDADriver.cuCtxSynchronize();
// dynamic shared memory: two BLOCK_SIZE x BLOCK_SIZE float tiles
CUDADriver.cuFuncSetSharedSize(matrixMul, 2*BLOCK_SIZE*BLOCK_SIZE*4);
CUDADriver.cuCtxSynchronize();
// NOTE(review): passing CUdeviceptr handles via cuParamSeti at 4-byte
// offsets is only correct when device pointers are 32 bits wide. On a
// 64-bit platform the kernel reads garbage pointers, which is a classic
// cause of CUDA_ERROR_LAUNCH_FAILED — confirm pointer size, and prefer
// cuParamSetv with pointer-sized, pointer-aligned slots if 64-bit.
CUDADriver.cuParamSeti(matrixMul, 0, d_C);
CUDADriver.cuCtxSynchronize();
CUDADriver.cuParamSeti(matrixMul, 4, d_A);
CUDADriver.cuCtxSynchronize();
CUDADriver.cuParamSeti(matrixMul, 8, d_B);
CUDADriver.cuCtxSynchronize();
CUDADriver.cuParamSeti(matrixMul, 12, WA);
CUDADriver.cuCtxSynchronize();
CUDADriver.cuParamSeti(matrixMul, 16, WB);
CUDADriver.cuCtxSynchronize();
CUDADriver.cuParamSetSize(matrixMul, 20);
CUDADriver.cuCtxSynchronize();
CUDADriver.cuLaunchGrid(matrixMul, WC / BLOCK_SIZE, HC / BLOCK_SIZE);
CUDADriver.cuCtxSynchronize();
// allocate mem for the result on host side
float[] h_C = new float[size_C]; // was: (float*) malloc(mem_size_C);
// copy result from device to host
CUDADriver.cuMemcpyDtoH(h_C, 0, h_C.length, d_C);
CUDADriver.cuCtxSynchronize();
// compute reference solution
float[] reference = new float[size_C]; // was: (float*) malloc(mem_size_C);
computeGold(reference, h_A, h_B, HA, WA, WB);
// check result (L2 relative error against the CPU reference)
boolean res = cutCompareL2fe(reference, h_C, size_C, 1e-6f);
System.out.format("Test %s\n", res ? "PASSED" : "FAILED");
// clean up memory
CUDADriver.cuMemFree(d_A);
CUDADriver.cuCtxSynchronize();
CUDADriver.cuMemFree(d_B);
CUDADriver.cuCtxSynchronize();
CUDADriver.cuMemFree(d_C);
CUDADriver.cuCtxSynchronize();
CUDADriver.cuCtxDetach(cuContext);
[/font]
The JNI-bindings are straight-forward:
[font=“Courier New”]
/*
 * Class:     edu_sit_jacuzzi_CUDADriver
 * Method:    cuLaunchGrid
 * Signature: (Ledu/sit/jacuzzi/CUfunction;II)V
 *
 * JNI binding for cuLaunchGrid: reads the native CUfunction handle stored
 * in the Java CUfunction object's long field and launches the kernel on a
 * grid_width x grid_height grid of thread blocks.
 */
JNIEXPORT void JNICALL Java_edu_sit_jacuzzi_CUDADriver_cuLaunchGrid
(JNIEnv* env, jclass cls, jobject f, jint grid_width, jint grid_height)
{
/* jacu_cufunction_handle is a cached jfieldID (declared elsewhere). */
jlong fn = (*env)->GetLongField(env, f, jacu_cufunction_handle);
/* Reinterprets the first sizeof(CUfunction) bytes of the jlong as the
 * handle. NOTE(review): assumes sizeof(CUfunction) <= sizeof(jlong) and
 * little-endian layout when they differ — a plain value cast such as
 * (CUfunction)(uintptr_t)fn would be safer; verify on 64-bit builds. */
CUresult res = cuLaunchGrid(*(CUfunction*)&fn, grid_width, grid_height);
/* TEST_THROW (project macro) converts a non-success CUresult into a
 * Java exception. */
TEST_THROW(res);
}
[/font]
Ideas anybody?
PS: The code is open-source and I will mail copies on demand.