Cuda + omp = big slowdown

I’m using OpenMP to parallelise my 8-GPU code, and it’s actually resulting in a very big slowdown:

af_multi7.cpp(300):Thread : 0 / 8, Gpu : 0 - GeForce GTX TITAN, time taken : 0.066377
af_multi7.cpp(300):Thread : 7 / 8, Gpu : 7 - GeForce GTX TITAN, time taken : 34.620190
af_multi7.cpp(300):Thread : 5 / 8, Gpu : 5 - GeForce GTX TITAN, time taken : 35.356863
af_multi7.cpp(300):Thread : 4 / 8, Gpu : 4 - GeForce GTX TITAN, time taken : 37.317620
af_multi7.cpp(300):Thread : 2 / 8, Gpu : 2 - GeForce GTX TITAN, time taken : 39.163119
af_multi7.cpp(300):Thread : 6 / 8, Gpu : 6 - Tesla K20Xm, time taken : 40.834065
af_multi7.cpp(300):Thread : 3 / 8, Gpu : 3 - GeForce GTX TITAN, time taken : 42.025256
af_multi7.cpp(300):Thread : 1 / 8, Gpu : 1 - GeForce GTX TITAN, time taken : 43.021797
Elapsed time is 84.573489 seconds.

Timings are obviously all measured in parallel, but typically one serial run should take about 3 seconds. Here each one is taking around 40 seconds, or roughly a 2× slowdown. Every GPU is performing an equivalent calculation (same size, same values).

Code being benchmarked is as below:

Code to create a sparse matrix :

// Benchmark: time the construction of one sparse darray from the host-side
// row-index / column-index / value arrays.
timer::start();
	// The darray constructor performs the device allocations, host-to-device
	// copies and the COO -> CSR conversion that are being timed here.
	darray out(h_row, h_col, h_val, nnz, len, par);
	// Log the calling OpenMP thread, the GPU count, the device index and name,
	// and the value returned by timer::stop() (printed under "time taken").
	MSG("Thread : %d / %d, Gpu : %d - %s, time taken : %6f",omp_get_thread_num(),ngpu,device,prop.name,timer::stop());

Relevant class constructor:

/**
 * Construct a sparse darray on the current device from host-side COO triplets.
 *
 * The row indices, column indices and values are copied to freshly allocated
 * device buffers, and the row indices are then compressed into a CSR row
 * pointer array with cusparseXcoo2csr.
 *
 * @param rowind  nnz row indices (zero-based, as required by
 *                CUSPARSE_INDEX_BASE_ZERO).
 * @param colind  nnz column indices.
 * @param val     nnz single-precision values.
 * @param nnz     number of stored entries.
 * @param r       number of rows (stored in dims0).
 * @param c       number of columns (stored in dims1).
 *
 * NOTE(review): rowPtr, colInd and Val are not declared locally, so they are
 * presumably class members owned by the object - confirm that the destructor
 * releases them.
 */
darray::darray(int *rowind, int *colind, float *val, int nnz, int r, int c) : sparse(true),  nnz(nnz), dims0(r), dims1(c), data(NULL)
{
	int *rowInd = NULL;	// temporary device copy of the COO row indices
	cusparseHandle_t handle;
	CUSPARSE(cusparseCreate(&handle));

	// CSR needs r+1 row pointers; the COO arrays hold one entry per non-zero.
	CUDA(cudaMalloc(&rowPtr,(r+1)*sizeof(int)));
	CUDA(cudaMalloc(&colInd,nnz*sizeof(int)));
	CUDA(cudaMalloc(&Val,nnz*sizeof(float)));
	CUDA(cudaMalloc(&rowInd,nnz*sizeof(int)));

	// cudaMemcpyDefault lets the runtime infer the copy direction from the pointers.
	CUDA(cudaMemcpyAsync(rowInd,rowind,nnz*sizeof(int), cudaMemcpyDefault));
	CUDA(cudaMemcpyAsync(colInd,colind,nnz*sizeof(int), cudaMemcpyDefault));
	CUDA(cudaMemcpyAsync(Val,val,nnz*sizeof(float), cudaMemcpyDefault));

	// Wait for the copies and surface any asynchronous failure here; the
	// result of this call was previously ignored, so a failed copy would have
	// gone unnoticed until some later, unrelated CUDA call.
	CUDA(cudaDeviceSynchronize());

	CUSPARSE(cusparseXcoo2csr(handle, rowInd, nnz, dims0, rowPtr, CUSPARSE_INDEX_BASE_ZERO));   //coo to csr conversion
	CUSPARSE(cusparseDestroy(handle));

	// The uncompressed row indices are no longer needed once rowPtr is built.
	CUDA(cudaFree(rowInd));
}

Update:

Now using pure cuda from within the same source file:

float *mkd(int *rowind, int *colind, float *val, int nnz, int r, int c)
{
	int *rowInd;
	int *rowPtr;
	int *colInd;
	float *Val;
	cusparseHandle_t handle;

	CUSPARSE(cusparseCreate(&handle));

	CUDA(cudaMalloc(&rowPtr,(r+1)*sizeof(int)));
	CUDA(cudaMalloc(&colInd,nnz*sizeof(int)));
	CUDA(cudaMalloc(&Val,nnz*sizeof(float)));
	CUDA(cudaMalloc(&rowInd,nnz*sizeof(int)));

	CUDA(cudaMemcpyAsync(rowInd,rowind,nnz*sizeof(int), cudaMemcpyDefault));
	CUDA(cudaMemcpyAsync(colInd,colind,nnz*sizeof(int), cudaMemcpyDefault));	
	CUDA(cudaMemcpyAsync(Val,val,nnz*sizeof(float), cudaMemcpyDefault));
	
	cudaDeviceSynchronize();

	CUSPARSE(cusparseXcoo2csr(handle, rowInd, nnz, r, rowPtr, CUSPARSE_INDEX_BASE_ZERO));   //coo to csr conversion

	CUSPARSE(cusparseDestroy(handle));

	CUDA(cudaFree(rowInd));
	return Val;
}

//darray get_sparse(const mxArray *m)
void get_sparse(const mxArray *m)
{
    mwSize *dims = (mwSize *)mxGetDimensions(m);
    int nnz = dims[0];
	int par = 1;
	int len = 1;

    double *row = mxGetPr(m);
    double *col = row + nnz;
    double *val = col + nnz;

    int * h_row = (int *)malloc(nnz * sizeof(int));
    int * h_col = (int *)malloc(nnz * sizeof(int));
    float * h_val = (float *)malloc(nnz * sizeof(float));

    for (int i = 0; i < nnz; i++) {
		if ((row[i]+1)>len) len=row[i]+1;
		if ((col[i]+1)>par) par=col[i]+1;
        h_row[i] = (int) row[i];
        h_col[i] = (int) col[i];
        h_val[i] = (float) val[i];
    }
	timer::start();
	//darray out;
	cudaDeviceSynchronize();MSG("%d, %6f",omp_get_thread_num(),timer::stop());timer::start();
	//darray out(h_row, h_col, h_val, nnz, len, par);
	float *o1 = mkd(h_row, h_col, h_val, nnz, len, par);
	cudaDeviceSynchronize();MSG("%d, %6f",omp_get_thread_num(),timer::stop());timer::start();

	free(h_row);
	free(h_col);
	free(h_val);
	//return out;
}

Produces:

af_multi7.cpp(327):0, 0.000023
af_multi7.cpp(330):0, 0.073611
af_multi7.cpp(327):5, 33.527801
af_multi7.cpp(330):5, 0.055800
af_multi7.cpp(327):4, 0.508664
af_multi7.cpp(330):4, 0.059002
af_multi7.cpp(327):1, 3.159862
af_multi7.cpp(330):1, 0.043828
af_multi7.cpp(327):2, 2.073359
af_multi7.cpp(330):2, 0.064704
af_multi7.cpp(327):6, 0.399756
af_multi7.cpp(330):6, 0.045866
af_multi7.cpp(327):7, 1.814776
af_multi7.cpp(330):7, 0.051561
af_multi7.cpp(327):3, 0.190012
af_multi7.cpp(330):3, 0.038962

Note: These seem to be serialising… though I’m not sure why.
Elapsed time is 82.768771 seconds.

And partial nvidia-smi…

GPU 0000:85:00.0
    Product Name                    : GeForce GTX TITAN
    Display Mode                    : N/A
    Display Active                  : N/A
    Persistence Mode                : Enabled
    Accounting Mode                 : N/A
    Accounting Mode Buffer Size     : N/A
    Driver Model
        Current                     : N/A
        Pending                     : N/A
    Serial Number                   : N/A
    GPU UUID                        : GPU-4f680afb-8dcc-04d5-710b-67e8db1bc37d
    VBIOS Version                   : 80.10.2C.00.06
    Inforom Version
        Image Version               : N/A
        OEM Object                  : N/A
        ECC Object                  : N/A
        Power Management Object     : N/A
    GPU Operation Mode
        Current                     : N/A
        Pending                     : N/A
    PCI
        Bus                         : 0x85
        Device                      : 0x00
        Domain                      : 0x0000
        Device Id                   : 0x100510DE
        Bus Id                      : 0000:85:00.0
        Sub System Id               : 0x84511043
        GPU Link Info
            PCIe Generation
                Max                 : N/A
                Current             : N/A
            Link Width
                Max                 : N/A
                Current             : N/A
    Fan Speed                       : 34 %
    Performance State               : N/A
    Clocks Throttle Reasons         : N/A
    Memory Usage
        Total                       : 6143 MB
        Used                        : 92 MB
        Free                        : 6051 MB
    Compute Mode                    : Default
    Utilization
        Gpu                         : N/A
        Memory                      : N/A
    Ecc Mode
        Current                     : N/A
        Pending                     : N/A
    ECC Errors
        Volatile
            Single Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
            Double Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
        Aggregate
            Single Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
            Double Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
    Retired Pages
        Single Bit ECC              : N/A
        Double Bit ECC              : N/A
        Pending                     : N/A
    Temperature
        Gpu                         : 50 C
    Power Readings
        Power Management            : N/A
        Power Draw                  : N/A
        Power Limit                 : N/A
        Default Power Limit         : N/A
        Enforced Power Limit        : N/A
        Min Power Limit             : N/A
        Max Power Limit             : N/A
    Clocks
        Graphics                    : N/A
        SM                          : N/A
        Memory                      : N/A
    Applications Clocks
        Graphics                    : N/A
        Memory                      : N/A
    Default Applications Clocks
        Graphics                    : N/A
        Memory                      : N/A
    Max Clocks
        Graphics                    : N/A
        SM                          : N/A
        Memory                      : N/A
    Compute Processes               : N/A

GPU 0000:86:00.0
    Product Name                    : GeForce GTX TITAN
    Display Mode                    : N/A
    Display Active                  : N/A
    Persistence Mode                : Enabled
    Accounting Mode                 : N/A
    Accounting Mode Buffer Size     : N/A
    Driver Model
        Current                     : N/A
        Pending                     : N/A
    Serial Number                   : N/A
    GPU UUID                        : GPU-39325139-a5b1-083b-0252-b3e4dc505f84
    VBIOS Version                   : 80.10.2C.00.06
    Inforom Version
        Image Version               : N/A
        OEM Object                  : N/A
        ECC Object                  : N/A
        Power Management Object     : N/A
    GPU Operation Mode
        Current                     : N/A
        Pending                     : N/A
    PCI
        Bus                         : 0x86
        Device                      : 0x00
        Domain                      : 0x0000
        Device Id                   : 0x100510DE
        Bus Id                      : 0000:86:00.0
        Sub System Id               : 0x84511043
        GPU Link Info
            PCIe Generation
                Max                 : N/A
                Current             : N/A
            Link Width
                Max                 : N/A
                Current             : N/A
    Fan Speed                       : 34 %
    Performance State               : N/A
    Clocks Throttle Reasons         : N/A
    Memory Usage
        Total                       : 6143 MB
        Used                        : 113 MB
        Free                        : 6030 MB
    Compute Mode                    : Default
    Utilization
        Gpu                         : N/A
        Memory                      : N/A
    Ecc Mode
        Current                     : N/A
        Pending                     : N/A
    ECC Errors
        Volatile
            Single Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
            Double Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
        Aggregate
            Single Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
            Double Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
    Retired Pages
        Single Bit ECC              : N/A
        Double Bit ECC              : N/A
        Pending                     : N/A
    Temperature
        Gpu                         : 49 C
    Power Readings
        Power Management            : N/A
        Power Draw                  : N/A
        Power Limit                 : N/A
        Default Power Limit         : N/A
        Enforced Power Limit        : N/A
        Min Power Limit             : N/A
        Max Power Limit             : N/A
    Clocks
        Graphics                    : N/A
        SM                          : N/A
        Memory                      : N/A
    Applications Clocks
        Graphics                    : N/A
        Memory                      : N/A
    Default Applications Clocks
        Graphics                    : N/A
        Memory                      : N/A
    Max Clocks
        Graphics                    : N/A
        SM                          : N/A
        Memory                      : N/A
    Compute Processes               : N/A

GPU 0000:89:00.0
    Product Name                    : Tesla K20Xm
    Display Mode                    : Disabled
    Display Active                  : Disabled
    Persistence Mode                : Enabled
    Accounting Mode                 : Disabled
    Accounting Mode Buffer Size     : 128
    Driver Model
        Current                     : N/A
        Pending                     : N/A
    Serial Number                   : 0324912021556
    GPU UUID                        : GPU-98348cf4-cdf7-7271-7db6-764c224a6716
    VBIOS Version                   : 80.10.17.00.02
    Inforom Version
        Image Version               : 2081.0200.01.09
        OEM Object                  : 1.1
        ECC Object                  : 3.0
        Power Management Object     : N/A
    GPU Operation Mode
        Current                     : Compute
        Pending                     : Compute
    PCI
        Bus                         : 0x89
        Device                      : 0x00
        Domain                      : 0x0000
        Device Id                   : 0x102110DE
        Bus Id                      : 0000:89:00.0
        Sub System Id               : 0x097D10DE
        GPU Link Info
            PCIe Generation
                Max                 : 2
                Current             : 2
            Link Width
                Max                 : 16x
                Current             : 16x
    Fan Speed                       : N/A
    Performance State               : P0
    Clocks Throttle Reasons
        Idle                        : Not Active
        Applications Clocks Setting : Active
        SW Power Cap                : Not Active
        HW Slowdown                 : Not Active
        Unknown                     : Not Active
    Memory Usage
        Total                       : 6143 MB
        Used                        : 103 MB
        Free                        : 6040 MB
    Compute Mode                    : Default
    Utilization
        Gpu                         : 0 %
        Memory                      : 0 %
    Ecc Mode
        Current                     : Disabled
        Pending                     : Disabled
    ECC Errors
        Volatile
            Single Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
            Double Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
        Aggregate
            Single Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
            Double Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
    Retired Pages
        Single Bit ECC              : 0
        Double Bit ECC              : 0
        Pending                     : No
    Temperature
        Gpu                         : 38 C
    Power Readings
        Power Management            : Supported
        Power Draw                  : 56.68 W
        Power Limit                 : 235.00 W
        Default Power Limit         : 235.00 W
        Enforced Power Limit        : 235.00 W
        Min Power Limit             : 150.00 W
        Max Power Limit             : 235.00 W
    Clocks
        Graphics                    : 732 MHz
        SM                          : 732 MHz
        Memory                      : 2600 MHz
    Applications Clocks
        Graphics                    : 732 MHz
        Memory                      : 2600 MHz
    Default Applications Clocks
        Graphics                    : 732 MHz
        Memory                      : 2600 MHz
    Max Clocks
        Graphics                    : 784 MHz
        SM                          : 784 MHz
        Memory                      : 2600 MHz
    Compute Processes
        Process ID                  : 16537
            Name                    : /usr/local/MATLAB/R2013a/bin/glnxa64/MATLAB
            Used GPU Memory         : 814 MB

GPU 0000:8A:00.0
    Product Name                    : GeForce GTX TITAN
    Display Mode                    : N/A
    Display Active                  : N/A
    Persistence Mode                : Enabled
    Accounting Mode                 : N/A
    Accounting Mode Buffer Size     : N/A
    Driver Model
        Current                     : N/A
        Pending                     : N/A
    Serial Number                   : N/A
    GPU UUID                        : GPU-f3b68287-90b3-579b-9c87-2d37a62d0eff
    VBIOS Version                   : 80.10.2C.00.06
    Inforom Version
        Image Version               : N/A
        OEM Object                  : N/A
        ECC Object                  : N/A
        Power Management Object     : N/A
    GPU Operation Mode
        Current                     : N/A
        Pending                     : N/A
    PCI
        Bus                         : 0x8A
        Device                      : 0x00
        Domain                      : 0x0000
        Device Id                   : 0x100510DE
        Bus Id                      : 0000:8A:00.0
        Sub System Id               : 0x84511043
        GPU Link Info
            PCIe Generation
                Max                 : N/A
                Current             : N/A
            Link Width
                Max                 : N/A
                Current             : N/A
    Fan Speed                       : 34 %
    Performance State               : N/A
    Clocks Throttle Reasons         : N/A
    Memory Usage
        Total                       : 6143 MB
        Used                        : 114 MB
        Free                        : 6029 MB
    Compute Mode                    : Default
    Utilization
        Gpu                         : N/A
        Memory                      : N/A
    Ecc Mode
        Current                     : N/A
        Pending                     : N/A
    ECC Errors
        Volatile
            Single Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
            Double Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
        Aggregate
            Single Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
            Double Bit            
                Device Memory       : N/A
                Register File       : N/A
                L1 Cache            : N/A
                L2 Cache            : N/A
                Texture Memory      : N/A
                Total               : N/A
    Retired Pages
        Single Bit ECC              : N/A
        Double Bit ECC              : N/A
        Pending                     : N/A
    Temperature
        Gpu                         : 51 C
    Power Readings
        Power Management            : N/A
        Power Draw                  : N/A
        Power Limit                 : N/A
        Default Power Limit         : N/A
        Enforced Power Limit        : N/A
        Min Power Limit             : N/A
        Max Power Limit             : N/A
    Clocks
        Graphics                    : N/A
        SM                          : N/A
        Memory                      : N/A
    Applications Clocks
        Graphics                    : N/A
        Memory                      : N/A
    Default Applications Clocks
        Graphics                    : N/A
        Memory                      : N/A
    Max Clocks
        Graphics                    : N/A
        SM                          : N/A
        Memory                      : N/A
    Compute Processes               : N/A

You have to set the OMP_NUM_THREADS environment variable to use the required number of threads. You can do this in the Linux bash shell with the following command:

export OMP_NUM_THREADS=n

where n is the number of threads you need.

Hey, thanks.
I resolved the issue. It actually turned out to be a really slow context initialisation time (the first CUDA call was taking forever). Fixed it by calling cudaDeviceReset before any other CUDA calls.