How can I report something which I strongly believe could be a bug in both cuEventRecord and cudaEventRecord?


I have tried a lot of possible scenarios with both cuEventRecord and cudaEventRecord on a stream. No matter which specific stream I pass as the second argument, it always gives the same result as if I were passing 0 (the whole context). I’m pretty convinced it could be a bug in the API. This API has broad applicability
wherever developers want to use stream functionality. Please suggest how I can report this issue to the CUDA support group. Or is there any way I can get help from the developers working in this area?


It will likely get noticed here, but otherwise you can send a PM to an NVIDIA employee (e.g. tmurray). This might even be a known bug (I remember reading something like this somewhere before).

The best way is probably to post a minimalistic repro app here.

Thank you for your response! Here are the repro steps.

Platform: Linux 2.6.18-92.1.13.el5 #1 SMP Wed Sep 24 19:32:05 EDT 2008 x86_64 x86_64 x86_64 GNU/Linux

Device : 9800GT

A simple example using the low-level driver API: modify the sample matrixMulDrv example by replacing “runTest” with the following. Please note the comments marked with ******.


// Repro for the event-timing question: multiplies two matrices with the
// driver API, launching the kernel asynchronously on streams[1] while the
// start/stop events are both recorded on streams[0], then prints the elapsed
// time between the two events and validates the result against a CPU
// reference.
//
// argc/argv are forwarded unchanged to initCUDA().
//
// NOTE(review): this excerpt has lost its braces and the tail of the function
// (resource release); treat it as a fragment, not a compilable unit.
runTest(int argc, char** argv)


// initialize CUDA

CUfunction matrixMul = NULL;

CU_SAFE_CALL(initCUDA(argc, argv, &matrixMul ));

    int nstreams=2;

    // allocate and initialize an array of stream handles
    // (the malloc result is not checked for NULL before use)

    CUstream *streams = (CUstream*) malloc(nstreams * sizeof(CUstream));

    for(int i = 0; i < nstreams; i++)

            CU_SAFE_CALL( cuStreamCreate(&(streams[i]),0) );

// create CUDA event handles

float elapsed_time=0;

CUevent start_event, stop_event;

CU_SAFE_CALL( cuEventCreate(&start_event,0) );

CU_SAFE_CALL( cuEventCreate(&stop_event,0) );

// set seed for rand()
// NOTE(review): no seeding call is present in this excerpt.


// allocate host memory for matrices A and B

unsigned int size_A = WA * HA;

unsigned int mem_size_A = sizeof(float) * size_A;

float* h_A = (float*) malloc(mem_size_A);

unsigned int size_B = WB * HB;

unsigned int mem_size_B = sizeof(float) * size_B;

float* h_B = (float*) malloc(mem_size_B);

// initialize host memory

randomInit(h_A, size_A);

randomInit(h_B, size_B);

// allocate device memory

CUdeviceptr d_A;

CU_SAFE_CALL(cuMemAlloc( &d_A, mem_size_A ));

CUdeviceptr d_B;

CU_SAFE_CALL(cuMemAlloc( &d_B, mem_size_B ));

// copy host memory to device

CU_SAFE_CALL(cuMemcpyHtoD( d_A, h_A, mem_size_A ));

CU_SAFE_CALL(cuMemcpyHtoD( d_B, h_B, mem_size_B ));

// allocate device memory for result

unsigned int size_C = WC * HC;

unsigned int mem_size_C = sizeof(float) * size_C;

CUdeviceptr d_C;

CU_SAFE_CALL(cuMemAlloc(&d_C, mem_size_C));

// create and start timer
// NOTE(review): the handle is only set to 0 here; no timer create/start call
// is visible in this excerpt, so the first value printed below may be
// meaningless — confirm against the full file.

unsigned int timer = 0;


// setup execution parameters
// Thread block is BLOCK_SIZE x BLOCK_SIZE x 1.

CU_SAFE_CALL(cuFuncSetBlockShape( matrixMul, BLOCK_SIZE, BLOCK_SIZE, 1 ));

// presumably room for two BLOCK_SIZE x BLOCK_SIZE float tiles — verify against the kernel

CU_SAFE_CALL(cuFuncSetSharedSize( matrixMul, 2*BLOCK_SIZE*BLOCK_SIZE*sizeof(float) ) );

// Kernel arguments are packed at 4-byte offsets in the order
// d_C, d_A, d_B, WA, WB, for a total parameter size of 20 bytes.
// NOTE(review): this layout assumes each device pointer occupies 4 bytes — confirm.

CU_SAFE_CALL(cuParamSeti( matrixMul, 0,  d_C ));

CU_SAFE_CALL(cuParamSeti( matrixMul, 4,  d_A ));

CU_SAFE_CALL(cuParamSeti( matrixMul, 8,  d_B ));

CU_SAFE_CALL(cuParamSeti( matrixMul, 12, WA ));

CU_SAFE_CALL(cuParamSeti( matrixMul, 16, WB ));

CU_SAFE_CALL(cuParamSetSize( matrixMul, 20 ));

// Synchronous launch kept for reference; the asynchronous launch below replaces it.
//CU_SAFE_CALL(cuLaunchGrid( matrixMul, WC / BLOCK_SIZE, HC / BLOCK_SIZE ));


 // Start event goes on streams[0]; the kernel itself is launched on streams[1].
 // Unlike the calls above, these three calls are not wrapped in CU_SAFE_CALL,
 // so their return codes are ignored.
 cuEventRecord(start_event, streams[0]);

[b] //cuEventRecord(stop_event, streams[0]); // 1)*************** First disable this and enable 2)

 cuLaunchGridAsync( matrixMul, WC / BLOCK_SIZE, HC / BLOCK_SIZE , streams[1]);

 cuEventRecord(stop_event, streams[0]);  // 2)********* second time disable this and enable 1)




// NOTE(review): nothing visible here waits for stop_event (e.g. an event or
// stream synchronize) before querying the elapsed time; the query may fail or
// report a misleading interval if the event has not completed — confirm
// against the driver API documentation.
CU_SAFE_CALL( cuEventElapsedTime(&elapsed_time, start_event, stop_event) );

// stop and destroy timer


[b]//***** Observe the second value printed here: we are supposed to see almost the same value whether we choose 1) or 2), because there is no task on streams[0].

printf("Processing time: %f (ms)##%f (ms) \n", cutGetTimerValue(timer),elapsed_time);[/b]


// allocate mem for the result on host side

float* h_C = (float*) malloc(mem_size_C);

// copy result from device to host

CU_SAFE_CALL(cuMemcpyDtoH((void *) h_C, d_C, mem_size_C) );

// compute reference solution
// (computeGold is presumably the host-side reference multiply — verify)

float* reference = (float*) malloc(mem_size_C);

computeGold(reference, h_A, h_B, HA, WA, WB);

// check result
// Compare GPU output with the reference using a 1e-6f tolerance.

CUTBoolean res = cutCompareL2fe(reference, h_C, size_C, 1e-6f);

printf("Test %s\n", (1 == res) ? "PASSED" : "FAILED");

// clean up memory
// NOTE(review): no free/cuMemFree calls are present in this excerpt.









// release resources
// NOTE(review): the loop body is missing from this excerpt; presumably it
// destroys each stream — confirm against the full file.

    for(int i = 0; i < nstreams; i++)



Also, please make sure to increase the size of the computation by changing the values in matrixMul.h:

#define WA (30 * BLOCK_SIZE) // Matrix A width

#define HA (50 * BLOCK_SIZE) // Matrix A height

#define WB (80 * BLOCK_SIZE) // Matrix B width



Thank you for the valuable contact information. I’ll send him a message soon if I don’t get an answer here.


Any news on this?
I can reproduce the same behavior on a Tesla C870 card (compute capability 1.0).