Hi everyone!
I need to accelerate this code:
// Applies numOperations iterations of v += v*v to every matrix value in the
// half-open index range [firstInd, nextColInd) of dvalues.
// Grid layout: 1-D grid, 1-D blocks; the caller over-provisions threads up to
// a warp/block multiple, so tail threads must exit via the bounds check.
__global__ void kernelAdd(TYPEOFOPERAND *dvalues, int numOperations,
                          int firstInd, int nextColInd)
{
    int vi = firstInd + blockIdx.x * blockDim.x + threadIdx.x;
    if (vi < nextColInd) {
        // Load moved INSIDE the guard: the original read dvalues[vi] before
        // checking the bound, an out-of-bounds global read for tail threads.
        // Also use TYPEOFOPERAND, not float: a float temp silently narrowed
        // the data whenever TYPEOFOPERAND is double.
        TYPEOFOPERAND temp = dvalues[vi];
        for (int j = 0; j < numOperations; ++j) {
            temp += temp * temp;   // 1 mul + 1 add = 2 FLOPs per iteration
        }
        dvalues[vi] = temp;
    }
}
// Reads a Harwell-Boeing sparse matrix (CSC), copies its values to the GPU,
// runs kernelAdd once per column (one stream per column), copies the result
// back, writes the output matrix, and reports timings and GFLOPS.
// Usage: ./mytoy <deviceID> <numOperationsPerValue> <inputMatrixFile> <outputMatrixFile>
int main(int argc, char **argv)
{
    //=========================================================================
    // Variables.
    // CUDA.
    cudaDeviceProp deviceProp;
    cudaStream_t *stream;
    cudaEvent_t start, stop;
    // Matrix.
    // Harwell-Boeing format.
    int nrow, ncol, nnzero;
    // Compressed Sparse Column format.
    int *colptr, *rowind;
    TYPEOFOPERAND *values;
    double *values64;
    // To measure time elapsed and performance achieved.
    float msecMemHst, msecMemDvc, msecCompStr, msecCompKrn;
    float numFloatingPointOperations, opIntensity;
    double flops, gigaFlops;
    // Misc.
    int devID;
    // Loop-trip count per nonzero. Declared int (was float): atoi returns int
    // and the kernel's numOperations parameter is int.
    int numOperationsPerValue;
    int *blocks;
    int *threads;
    TYPEOFOPERAND *dvalues;
    //=========================================================================
    // Check command line arguments.
    if (argc < 5) {
        fprintf(stderr, "ERROR: Wrong number of arguments: %d\n", argc - 1);
        fprintf(stderr, "Use: ./mytoy <deviceID> <numOperationsPer");
        fprintf(stderr, "Value> <inputMatrixFile> <outputMatrixFile>\n");
        exit(EXIT_FAILURE);
    }
    //-------------------------------------------------------------------------
    devID = atoi(argv[1]);
    numOperationsPerValue = atoi(argv[2]);
    if (numOperationsPerValue <= 0) {
        fprintf(stderr, "ERROR: The second argument is wrong: %s.\n", argv[2]);
        fprintf(stderr, "The number of operations perfomed on each nonzero ");
        fprintf(stderr, "must be greater than 0.\n");
        exit(EXIT_FAILURE);
    }
    //=========================================================================
    // Get properties of the chosen device.
    getDeviceProperties(devID, &deviceProp);
    //-------------------------------------------------------------------------
    // Create CUDA events for timing (checked, like every other CUDA call).
    cudaErrorHandler(cudaEventCreate(&start), __LINE__);
    cudaErrorHandler(cudaEventCreate(&stop), __LINE__);
    //=========================================================================
    // Read the input matrix and narrow the values to the operand type.
    readInputMatrix(argv[3], &nrow, &ncol, &nnzero, &colptr, &rowind, &values64);
    values = (TYPEOFOPERAND*)malloc(nnzero * sizeof(TYPEOFOPERAND));
    if (values == NULL) {
        fprintf(stderr, "ERROR: Out of host memory.\n");
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < nnzero; ++i) {
        values[i] = (TYPEOFOPERAND) values64[i];
    }
    // Maximum number of threads per block and warp size.
    int blockSize = 512;
    int warpSize = 32;
    // Calculate the number of blocks and threads required for each column.
    // MU3: Here you get the grid size to deploy parallelism on CUDA kernel launches (see MU5)
    blocks = (int*)malloc(ncol * sizeof(int));
    threads = (int*)malloc(ncol * sizeof(int));
    for (int i = 0; i < ncol; ++i) {
        int colNnz = colptr[i + 1] - colptr[i];
        // Round the column size up to a whole number of warps (minimum one
        // warp, so empty columns still get a valid launch configuration).
        // The original "(n/32 + 1)*32" wasted a full extra warp whenever the
        // column size was already a multiple of the warp size.
        int rounded = ((colNnz + warpSize - 1) / warpSize) * warpSize;
        if (rounded < warpSize) { rounded = warpSize; }
        if (rounded <= blockSize) {
            blocks[i] = 1;
            threads[i] = rounded;
        } else {
            blocks[i] = (rounded + blockSize - 1) / blockSize;  // ceil-div
            threads[i] = blockSize;
        }
    }
    //=========================================================================
    //-------------------------------------------------------------------------
    // Copy matrix values from host memory to device memory.
    int valuesSize = nnzero * sizeof(TYPEOFOPERAND);
    cudaErrorHandler(cudaEventRecord(start, NULL), __LINE__);
    cudaErrorHandler(cudaMalloc((void**)&dvalues, valuesSize), __LINE__);
    cudaErrorHandler(cudaMemcpy(dvalues, values, valuesSize,
                                cudaMemcpyHostToDevice), __LINE__);
    cudaErrorHandler(cudaEventRecord(stop, NULL), __LINE__);
    cudaErrorHandler(cudaEventSynchronize(stop), __LINE__);
    cudaErrorHandler(cudaEventElapsedTime(&msecMemHst, start, stop), __LINE__);
    //-------------------------------------------------------------------------
    // Create one stream per column.
    cudaErrorHandler(cudaEventRecord(start, NULL), __LINE__);
    stream = (cudaStream_t*)malloc(ncol * sizeof(cudaStream_t));
    for (int i = 0; i < ncol; ++i) {
        cudaErrorHandler(cudaStreamCreate(&stream[i]), __LINE__);
    }
    cudaErrorHandler(cudaEventRecord(stop, NULL), __LINE__);
    cudaErrorHandler(cudaEventSynchronize(stop), __LINE__);
    cudaErrorHandler(cudaEventElapsedTime(&msecCompStr, start, stop), __LINE__);
    //-------------------------------------------------------------------------
    // Launch one kernel per column on its own stream.
    // NOTE: the original had "#pragma unroll" here; that pragma only applies
    // to device code, not host loops, so it was removed.
    cudaErrorHandler(cudaEventRecord(start, NULL), __LINE__);
    for (int i = 0; i < ncol; ++i) {
        kernelAdd<<<blocks[i], threads[i], 0, stream[i]>>>
            (dvalues, numOperationsPerValue, colptr[i], colptr[i + 1]);
    }
    // Kernel launches do not return a status; check for launch errors here.
    cudaErrorHandler(cudaGetLastError(), __LINE__);
    cudaErrorHandler(cudaEventRecord(stop, NULL), __LINE__);
    cudaErrorHandler(cudaEventSynchronize(stop), __LINE__);
    cudaErrorHandler(cudaEventElapsedTime(&msecCompKrn, start, stop), __LINE__);
    cudaErrorHandler(cudaDeviceSynchronize(), __LINE__);
    //-------------------------------------------------------------------------
    // Copy matrix values back to host memory.
    cudaErrorHandler(cudaEventRecord(start, NULL), __LINE__);
    cudaErrorHandler(cudaMemcpy(values, dvalues, valuesSize,
                                cudaMemcpyDeviceToHost), __LINE__);
    cudaErrorHandler(cudaEventRecord(stop, NULL), __LINE__);
    cudaErrorHandler(cudaEventSynchronize(stop), __LINE__);
    cudaErrorHandler(cudaEventElapsedTime(&msecMemDvc, start, stop), __LINE__);
    //=========================================================================
    // Write the output matrix (widen back to double for the writer).
    for (int i = 0; i < nnzero; ++i) {
        values64[i] = (double)values[i];
    }
    writeOutputMatrix(argv[4], nrow, ncol, nnzero,
                      colptr, rowind, values64);
    //-------------------------------------------------------------------------
    // Report operational intensity and performance. Each loop iteration in
    // kernelAdd is temp += temp*temp: one multiply plus one add = 2 FLOPs
    // (the original under-counted by half, omitting this factor).
    float msecComp = msecCompStr + msecCompKrn;
    opIntensity = (2.0f * (float)numOperationsPerValue) / sizeof(TYPEOFOPERAND);
    fprintf(stdout, "Operational intensity: %.4f FLOP/byte.\n", opIntensity);
    numFloatingPointOperations = 2.0f * (float)nnzero * (float)numOperationsPerValue;
    flops = numFloatingPointOperations / (msecComp / 1000.0f);
    gigaFlops = flops * 1.0e-9f;
    fprintf(stdout, "Performance: %.4f GFLOPS.\n\n", gigaFlops);
    //=========================================================================
    // Destroy streams and events, release device memory.
    for (int i = 0; i < ncol; ++i) {
        cudaErrorHandler(cudaStreamDestroy(stream[i]), __LINE__);
    }
    cudaErrorHandler(cudaFree(dvalues), __LINE__);
    cudaErrorHandler(cudaEventDestroy(start), __LINE__);
    cudaErrorHandler(cudaEventDestroy(stop), __LINE__);
    // Free host memory (values64 presumably malloc'd by readInputMatrix —
    // TODO confirm against its implementation).
    free(colptr); free(rowind); free(values); free(values64);
    free(blocks); free(threads); free(stream);
    // Free all remaining device resources.
    cudaErrorHandler(cudaDeviceReset(), __LINE__);
    return EXIT_SUCCESS;
}
The goal is simple: obtain the max Gflops of the device.
The code reads a sparse matrix and performs operations on each element.
You can choose the number of operations and the operand type.
I have a GTX480, 1340 Gflops performance, Fermi architecture and CUDA 4.0.
The actual performance of this code is 270 Gflops.
I only need some ideas, or pointers to where I can find them!
Can anyone help me?
Thank you very much!