How can I accelerate this code?

Hi everyone!

I need to accelerate this code:

// Applies numOperations iterations of temp += temp*temp to every matrix
// value in the half-open index range [firstInd, nextColInd).
//
// Launch: 1-D grid and block; the caller sizes the grid so that
// firstInd + gridDim.x*blockDim.x covers nextColInd. Surplus threads
// (warp/block round-up padding) exit via the bounds check.
//
// Fixes vs. the original:
//  - dvalues[vi] was loaded BEFORE the bounds check, an out-of-bounds
//    global read for the padding threads past nextColInd.
//  - the accumulator was hard-coded to float; it now matches the element
//    type TYPEOFOPERAND so a double build is not silently truncated.
__global__ void kernelAdd(TYPEOFOPERAND *dvalues, int numOperations,
                          int firstInd, int nextColInd)
{
  int vi = firstInd + blockIdx.x * blockDim.x + threadIdx.x;

  if (vi < nextColInd) {
    TYPEOFOPERAND temp = dvalues[vi];  // load only after the guard
    for (int j = 0; j < numOperations; ++j) {
      temp += temp * temp;
    }
    dvalues[vi] = temp;
  }
}

// Toy benchmark driver:
//   1. reads a Harwell-Boeing sparse matrix into CSC arrays,
//   2. copies the values to the device,
//   3. launches one kernelAdd per column, each on its own CUDA stream,
//   4. copies the results back, writes the output matrix, and reports
//      operational intensity and achieved GFLOPS.
// Usage: ./mytoy <deviceID> <numOperationsPerValue> <inputMatrixFile> <outputMatrixFile>
int main(int argc, char **argv)
{
  //=========================================================================
  // Variables.
  // CUDA.
  cudaDeviceProp deviceProp;
  cudaStream_t *stream;      // one stream per matrix column
  cudaEvent_t start, stop;   // reused for every timed section

  // Matrix.
  // Harwell-Boeing format.
  int nrow, ncol, nnzero;
  // Compressed Sparse Column format.
  int *colptr, *rowind;
  TYPEOFOPERAND *values;     // working copy in the operand precision
  double *values64;          // values as read from / written to disk

  // To measure time elapsed and performance achieved.
  float msecMemHst, msecMemDvc, msecCompStr, msecCompKrn;
  float numOperationsPerValue, numFloatingPointOperations, opIntensity;
  double flops, gigaFlops;

  // Misc.
  int devID;
  int *blocks;               // grid size for each column's launch
  int *threads;              // block size for each column's launch
  TYPEOFOPERAND *dvalues;    // device copy of the matrix values

  //=========================================================================
  // Check command line arguments.
  if (argc < 5) {
    fprintf(stderr, "ERROR: Wrong number of arguments: %d\n", argc - 1);
    fprintf(stderr, "Use: ./mytoy <deviceID> <numOperationsPer");
    fprintf(stderr, "Value> <inputMatrixFile> <outputMatrixFile>\n");
    exit(EXIT_FAILURE);
  }

  //-------------------------------------------------------------------------
  devID = atoi(argv[1]);

  numOperationsPerValue = atoi(argv[2]);
  if (numOperationsPerValue <= 0) {
    fprintf(stderr, "ERROR: The second argument is wrong: %s.\n", argv[2]);
    fprintf(stderr, "The number of operations performed on each nonzero ");
    fprintf(stderr, "must be greater than 0.\n");
    exit(EXIT_FAILURE);
  }

  //=========================================================================
  // Get properties of the chosen device.
  getDeviceProperties(devID, &deviceProp);

  //-------------------------------------------------------------------------
  // Create CUDA events for timing (error-checked like every other call;
  // the original left these two unchecked).
  cudaErrorHandler(cudaEventCreate(&start), __LINE__);
  cudaErrorHandler(cudaEventCreate(&stop), __LINE__);

  //=========================================================================
  // Read the input matrix (CSC layout, values in double precision).
  readInputMatrix(argv[3], &nrow, &ncol, &nnzero, &colptr, &rowind, &values64);

  // Convert the values to the operand type used on the device.
  values = (TYPEOFOPERAND*)malloc(nnzero * sizeof(TYPEOFOPERAND));
  for (int i=0; i<nnzero; ++i) {
    values[i] = (TYPEOFOPERAND) values64[i];
  }

  // Maximum number of threads per block and warp size.
  int blockSize = 512;
  int warpSize = 32;

  // Calculate the number of blocks and threads required for each column:
  // round the column's nonzero count up to a whole number of warps, then
  // split into blocks of at most blockSize threads.
  // MU3: Here you get the grid size to deploy parallelism on CUDA kernel launches (see MU5)
  blocks = (int*)malloc(ncol * sizeof(int));
  threads = (int*)malloc(ncol * sizeof(int));
  for (int i=0; i<ncol; ++i) {
    threads[i] = (((colptr[i+1] - colptr[i]) / warpSize) + 1) * warpSize;
    if (threads[i] <= blockSize) {
      blocks[i] = 1;
    } else {
      blocks[i] = threads[i] / blockSize;
      if (threads[i] % blockSize > 0) { blocks[i]++; }
      threads[i] = blockSize;
    }
  }

  //=========================================================================

  //-------------------------------------------------------------------------
  // Copy matrix values from host memory to device memory (timed).
  int valuesSize = nnzero * sizeof(TYPEOFOPERAND);

  cudaErrorHandler(cudaEventRecord(start, NULL), __LINE__);

  cudaErrorHandler(cudaMalloc((void**)&dvalues, valuesSize), __LINE__);

  cudaErrorHandler(cudaMemcpy(dvalues, values, valuesSize,
                              cudaMemcpyHostToDevice), __LINE__);

  cudaErrorHandler(cudaEventRecord(stop, NULL), __LINE__);
  cudaErrorHandler(cudaEventSynchronize(stop), __LINE__);
  cudaErrorHandler(cudaEventElapsedTime(&msecMemHst, start, stop), __LINE__);

  //-------------------------------------------------------------------------
  // Create one stream per column (timed).
  cudaErrorHandler(cudaEventRecord(start, NULL), __LINE__);

  stream = (cudaStream_t*)malloc(ncol * sizeof(cudaStream_t));
  for (int i=0; i<ncol; ++i) {
    cudaErrorHandler(cudaStreamCreate(&stream[i]), __LINE__);
  }

  cudaErrorHandler(cudaEventRecord(stop, NULL), __LINE__);
  cudaErrorHandler(cudaEventSynchronize(stop), __LINE__);
  cudaErrorHandler(cudaEventElapsedTime(&msecCompStr, start, stop),__LINE__);

  //-------------------------------------------------------------------------
  // Launch one kernel per column, each on its own stream (timed).
  // NOTE: the original had "#pragma unroll" here; that pragma only applies
  // to device code, so it is meaningless on this host loop and was removed.
  cudaErrorHandler(cudaEventRecord(start, NULL), __LINE__);

  for (int i=0; i<ncol; ++i) {
    kernelAdd<<<blocks[i], threads[i], 0, stream[i]>>>
             (dvalues, numOperationsPerValue, colptr[i], colptr[i+1]);
  }
  // Kernel launches return no status directly; pick up any
  // launch-configuration failure explicitly.
  cudaErrorHandler(cudaGetLastError(), __LINE__);

  cudaErrorHandler(cudaEventRecord(stop, NULL), __LINE__);
  cudaErrorHandler(cudaEventSynchronize(stop), __LINE__);
  cudaErrorHandler(cudaEventElapsedTime(&msecCompKrn, start, stop),__LINE__);

  // Make sure all streams have finished before reading results back.
  cudaErrorHandler(cudaDeviceSynchronize(), __LINE__);

  //-------------------------------------------------------------------------
  // Copy matrix values back to host memory (timed).
  cudaErrorHandler(cudaEventRecord(start, NULL), __LINE__);

  cudaErrorHandler(cudaMemcpy(values, dvalues, valuesSize,
                              cudaMemcpyDeviceToHost), __LINE__);

  cudaErrorHandler(cudaEventRecord(stop, NULL), __LINE__);
  cudaErrorHandler(cudaEventSynchronize(stop), __LINE__);
  cudaErrorHandler(cudaEventElapsedTime(&msecMemDvc, start, stop), __LINE__);

  //=========================================================================
  // Write the output matrix (convert back to double precision).
  for (int i=0; i<nnzero; ++i) {
    values64[i] = (double)values[i];
  }

  writeOutputMatrix(argv[4], nrow, ncol, nnzero,
                    colptr, rowind, values64);

  //=========================================================================
  // Report operational intensity and achieved performance.
  float msecComp = msecCompStr + msecCompKrn;

  opIntensity = numOperationsPerValue / sizeof(TYPEOFOPERAND);
  fprintf(stdout, "Operational intensity: %.4f FLOP/byte.\n", opIntensity);

  numFloatingPointOperations = nnzero * numOperationsPerValue;
  flops = numFloatingPointOperations / (msecComp / 1000.0f);
  gigaFlops = flops * 1.0e-9f;
  fprintf(stdout, "Performance: %.4f GFLOPS.\n\n", gigaFlops);

  //=========================================================================
  // Destroy the streams and free the device buffer explicitly (the
  // original leaked both, relying on cudaDeviceReset alone).
  for (int i=0; i<ncol; ++i) {
    cudaErrorHandler(cudaStreamDestroy(stream[i]), __LINE__);
  }
  cudaErrorHandler(cudaFree(dvalues), __LINE__);

  // Free host memory (values64 and stream were leaked in the original).
  free(colptr); free(rowind); free(values); free(values64);
  free(blocks); free(threads); free(stream);

  // Release all remaining device resources.
  cudaErrorHandler(cudaDeviceReset(), __LINE__);

  return EXIT_SUCCESS;
}

The goal is simple: obtain the max Gflops of the device.
The code reads a sparse matrix and performs operations on each element.
You can choose the number of operations and type.

I have a GTX480, 1340 Gflops performance, Fermi architecture and CUDA 4.0.
The actual performance of this code is 270 Gflops.

I only need some ideas, or a pointer to where I can find them!

Can anyone help me?

Thank you very much!

It’s a homework assignment:

http://www.ac.uma.es/~ujaldon/descargas/the-kepler-web-site/mytoy.cu

http://cms.ac.uma.es/ujaldon/index.php/es

This doesn’t jibe with this (emphasis mine):

Operations on sparse matrices are typically memory bound, so the GFLOPS achieved will be far below the theoretical limit of the device.

If the goal is really creating a benchmark that approximates the maximum achievable throughput of floating-point operations, use a dense vector, assign each vector element to a thread, then compute two independent chains of FMAs (say, 200 terms each) per thread in a straight-line piece of code whose sum is assigned to the vector element at the end. That gives you 400 single-precision FMAs (= 800 floating-point operations) per 4-byte result, so performance will be limited by compute throughput.

Yes, it is. It’s a problem of a CUDA course.

Thank you very much, njuffa. I will try it!