How can I accelerate this code?

Hi everyone!

I need to accelerate this code:

// Applies numOperations iterations of temp += temp*temp to every matrix
// value in the half-open index range [firstInd, nextColInd).
//
// Launch: 1-D grid and block; the caller sizes the grid so that
// firstInd + gridDim.x*blockDim.x covers nextColInd. Surplus threads
// (warp/block round-up padding) exit via the bounds check.
//
// Fixes vs. the original:
//  - dvalues[vi] was loaded BEFORE the bounds check, an out-of-bounds
//    global read for the padding threads past nextColInd.
//  - the accumulator was hard-coded to float; it now matches the element
//    type TYPEOFOPERAND so a double build is not silently truncated.
__global__ void kernelAdd(TYPEOFOPERAND *dvalues, int numOperations,
                          int firstInd, int nextColInd)
{
  int vi = firstInd + blockIdx.x * blockDim.x + threadIdx.x;

  if (vi < nextColInd) {
    TYPEOFOPERAND temp = dvalues[vi];  // load only after the guard
    for (int j = 0; j < numOperations; ++j) {
      temp += temp * temp;
    }
    dvalues[vi] = temp;
  }
}

// Toy benchmark driver:
//   1. reads a Harwell-Boeing sparse matrix into CSC arrays,
//   2. copies the values to the device,
//   3. launches one kernelAdd per column, each on its own CUDA stream,
//   4. copies the results back, writes the output matrix, and reports
//      operational intensity and achieved GFLOPS.
// Usage: ./mytoy <deviceID> <numOperationsPerValue> <inputMatrixFile> <outputMatrixFile>
int main(int argc, char **argv)
{
  //=========================================================================
  // Variables.
  // CUDA.
  cudaDeviceProp deviceProp;
  cudaStream_t *stream;      // one stream per matrix column
  cudaEvent_t start, stop;   // reused for every timed section

  // Matrix.
  // Harwell-Boeing format.
  int nrow, ncol, nnzero;
  // Compressed Sparse Column format.
  int *colptr, *rowind;
  TYPEOFOPERAND *values;     // working copy in the operand precision
  double *values64;          // values as read from / written to disk

  // To measure time elapsed and performance achieved.
  float msecMemHst, msecMemDvc, msecCompStr, msecCompKrn;
  float numOperationsPerValue, numFloatingPointOperations, opIntensity;
  double flops, gigaFlops;

  // Misc.
  int devID;
  int *blocks;               // grid size for each column's launch
  int *threads;              // block size for each column's launch
  TYPEOFOPERAND *dvalues;    // device copy of the matrix values

  //=========================================================================
  // Check command line arguments.
  if (argc < 5) {
    fprintf(stderr, "ERROR: Wrong number of arguments: %d\n", argc - 1);
    fprintf(stderr, "Use: ./mytoy <deviceID> <numOperationsPer");
    fprintf(stderr, "Value> <inputMatrixFile> <outputMatrixFile>\n");
    exit(EXIT_FAILURE);
  }

  //-------------------------------------------------------------------------
  devID = atoi(argv[1]);

  numOperationsPerValue = atoi(argv[2]);
  if (numOperationsPerValue <= 0) {
    fprintf(stderr, "ERROR: The second argument is wrong: %s.\n", argv[2]);
    fprintf(stderr, "The number of operations performed on each nonzero ");
    fprintf(stderr, "must be greater than 0.\n");
    exit(EXIT_FAILURE);
  }

  //=========================================================================
  // Get properties of the chosen device.
  getDeviceProperties(devID, &deviceProp);

  //-------------------------------------------------------------------------
  // Create CUDA events for timing (error-checked like every other call;
  // the original left these two unchecked).
  cudaErrorHandler(cudaEventCreate(&start), __LINE__);
  cudaErrorHandler(cudaEventCreate(&stop), __LINE__);

  //=========================================================================
  // Read the input matrix (CSC layout, values in double precision).
  readInputMatrix(argv[3], &nrow, &ncol, &nnzero, &colptr, &rowind, &values64);

  // Convert the values to the operand type used on the device.
  values = (TYPEOFOPERAND*)malloc(nnzero * sizeof(TYPEOFOPERAND));
  for (int i=0; i<nnzero; ++i) {
    values[i] = (TYPEOFOPERAND) values64[i];
  }

  // Maximum number of threads per block and warp size.
  int blockSize = 512;
  int warpSize = 32;

  // Calculate the number of blocks and threads required for each column:
  // round the column's nonzero count up to a whole number of warps, then
  // split into blocks of at most blockSize threads.
  // MU3: Here you get the grid size to deploy parallelism on CUDA kernel launches (see MU5)
  blocks = (int*)malloc(ncol * sizeof(int));
  threads = (int*)malloc(ncol * sizeof(int));
  for (int i=0; i<ncol; ++i) {
    threads[i] = (((colptr[i+1] - colptr[i]) / warpSize) + 1) * warpSize;
    if (threads[i] <= blockSize) {
      blocks[i] = 1;
    } else {
      blocks[i] = threads[i] / blockSize;
      if (threads[i] % blockSize > 0) { blocks[i]++; }
      threads[i] = blockSize;
    }
  }

  //=========================================================================

  //-------------------------------------------------------------------------
  // Copy matrix values from host memory to device memory (timed).
  int valuesSize = nnzero * sizeof(TYPEOFOPERAND);

  cudaErrorHandler(cudaEventRecord(start, NULL), __LINE__);

  cudaErrorHandler(cudaMalloc((void**)&dvalues, valuesSize), __LINE__);

  cudaErrorHandler(cudaMemcpy(dvalues, values, valuesSize,
                              cudaMemcpyHostToDevice), __LINE__);

  cudaErrorHandler(cudaEventRecord(stop, NULL), __LINE__);
  cudaErrorHandler(cudaEventSynchronize(stop), __LINE__);
  cudaErrorHandler(cudaEventElapsedTime(&msecMemHst, start, stop), __LINE__);

  //-------------------------------------------------------------------------
  // Create one stream per column (timed).
  cudaErrorHandler(cudaEventRecord(start, NULL), __LINE__);

  stream = (cudaStream_t*)malloc(ncol * sizeof(cudaStream_t));
  for (int i=0; i<ncol; ++i) {
    cudaErrorHandler(cudaStreamCreate(&stream[i]), __LINE__);
  }

  cudaErrorHandler(cudaEventRecord(stop, NULL), __LINE__);
  cudaErrorHandler(cudaEventSynchronize(stop), __LINE__);
  cudaErrorHandler(cudaEventElapsedTime(&msecCompStr, start, stop),__LINE__);

  //-------------------------------------------------------------------------
  // Launch one kernel per column, each on its own stream (timed).
  // NOTE: the original had "#pragma unroll" here; that pragma only applies
  // to device code, so it is meaningless on this host loop and was removed.
  cudaErrorHandler(cudaEventRecord(start, NULL), __LINE__);

  for (int i=0; i<ncol; ++i) {
    kernelAdd<<<blocks[i], threads[i], 0, stream[i]>>>
             (dvalues, numOperationsPerValue, colptr[i], colptr[i+1]);
  }
  // Kernel launches return no status directly; pick up any
  // launch-configuration failure explicitly.
  cudaErrorHandler(cudaGetLastError(), __LINE__);

  cudaErrorHandler(cudaEventRecord(stop, NULL), __LINE__);
  cudaErrorHandler(cudaEventSynchronize(stop), __LINE__);
  cudaErrorHandler(cudaEventElapsedTime(&msecCompKrn, start, stop),__LINE__);

  // Make sure all streams have finished before reading results back.
  cudaErrorHandler(cudaDeviceSynchronize(), __LINE__);

  //-------------------------------------------------------------------------
  // Copy matrix values back to host memory (timed).
  cudaErrorHandler(cudaEventRecord(start, NULL), __LINE__);

  cudaErrorHandler(cudaMemcpy(values, dvalues, valuesSize,
                              cudaMemcpyDeviceToHost), __LINE__);

  cudaErrorHandler(cudaEventRecord(stop, NULL), __LINE__);
  cudaErrorHandler(cudaEventSynchronize(stop), __LINE__);
  cudaErrorHandler(cudaEventElapsedTime(&msecMemDvc, start, stop), __LINE__);

  //=========================================================================
  // Write the output matrix (convert back to double precision).
  for (int i=0; i<nnzero; ++i) {
    values64[i] = (double)values[i];
  }

  writeOutputMatrix(argv[4], nrow, ncol, nnzero,
                    colptr, rowind, values64);

  //=========================================================================
  // Report operational intensity and achieved performance.
  float msecComp = msecCompStr + msecCompKrn;

  opIntensity = numOperationsPerValue / sizeof(TYPEOFOPERAND);
  fprintf(stdout, "Operational intensity: %.4f FLOP/byte.\n", opIntensity);

  numFloatingPointOperations = nnzero * numOperationsPerValue;
  flops = numFloatingPointOperations / (msecComp / 1000.0f);
  gigaFlops = flops * 1.0e-9f;
  fprintf(stdout, "Performance: %.4f GFLOPS.\n\n", gigaFlops);

  //=========================================================================
  // Destroy the streams and free the device buffer explicitly (the
  // original leaked both, relying on cudaDeviceReset alone).
  for (int i=0; i<ncol; ++i) {
    cudaErrorHandler(cudaStreamDestroy(stream[i]), __LINE__);
  }
  cudaErrorHandler(cudaFree(dvalues), __LINE__);

  // Free host memory (values64 and stream were leaked in the original).
  free(colptr); free(rowind); free(values); free(values64);
  free(blocks); free(threads); free(stream);

  // Release all remaining device resources.
  cudaErrorHandler(cudaDeviceReset(), __LINE__);

  return EXIT_SUCCESS;
}

The goal is simple: obtain the max Gflops of the device.
The code reads a sparse matrix and performs operations on each element.
You can choose the number of operations and type.

I have a GTX480, 1340 Gflops performance, Fermi architecture and CUDA 4.0.
The actual performance of this code is 270 Gflops.

I only need some ideas, or a pointer to where I can find them!

Can anyone help me?

Thank you very much!

It’s a homework assignment:

http://www.ac.uma.es/~ujaldon/descargas/the-kepler-web-site/mytoy.cu

http://cms.ac.uma.es/ujaldon/index.php/es

This doesn’t jibe with this (emphasis mine):

Operations on sparse matrices are typically memory bound, so the GFLOPS achieved will be far below the theoretical limit of the device.

If the goal is really creating a benchmark that approximates the maximum achievable throughput of floating-point operations, use a dense vector, assign each vector element to a thread, then compute two independent chains of FMAs (say, 200 terms each) per thread in a straight-line piece of code whose sum is assigned to the vector element at the end. That gives you 400 single-precision FMAs (= 800 floating-point operations) per 4-byte result, so performance will be limited by compute throughput.

Yes, it is. It’s a problem of a CUDA course.

Thank you very much, njuffa. I will try it!