#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdafx.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define MAX_LEN 5000 //Size in chars of the buffer that holds one row (line) of an input file at a time.
#define MAX_ELEMS 300000 //Number of elements reserved for the host array that stores every value of one file serially (row after row)
//NOTE(review): the earlier comment here said float precision was insufficient and double was used
//(at twice the memory), yet the macro below still selects float - confirm which type is intended.
#define REAL float //Element type used for the matrix data on both host and device
FILE *streamlog; //Stream for the log file
FILE *streamread; //Stream for the input file currently being read
FILE *streamwrite; //Stream for the output file currently being written
/// Transposes a row-major matrix on the device.
///
/// idata is read as `height` rows of `width` elements; odata receives the
/// transpose, i.e. `width` rows of `height` elements. Both buffers must hold
/// at least width * height elements and must not overlap.
///
/// Launch layout: any 1D or 2D grid/block shape is valid. The x dimension is
/// spread over input columns and the y dimension over input rows; both loops
/// stride by the total thread count of their dimension, so threads beyond the
/// matrix do nothing and a small grid still covers the whole matrix. With a
/// launch that has a single thread in y (for example <<<width, 1>>>) every
/// thread copies one complete input column into one output row.
/// No shared memory is used.
__global__ void kernel(REAL *odata, REAL *idata, unsigned int width, unsigned int height)
{
    const unsigned int xStride = gridDim.x * blockDim.x; //Total number of threads along x
    const unsigned int yStride = gridDim.y * blockDim.y; //Total number of threads along y
    for (unsigned int col = blockIdx.x * blockDim.x + threadIdx.x; col < width; col += xStride)
    {
        for (unsigned int row = blockIdx.y * blockDim.y + threadIdx.y; row < height; row += yStride)
        {
            //Element (row, col) of the input becomes element (col, row) of the output.
            //size_t keeps the flat index from wrapping for large matrices.
            odata[(size_t) col * height + row] = idata[(size_t) row * width + col];
        }
    }
}
/// Writes the message of a failed CUDA runtime call to the log file.
/// Returns true when err signals a failure and false when it is cudaSuccess.
static bool logCudaFailure(cudaError_t err)
{
    if (err == cudaSuccess)
        return false;
    fprintf( streamlog, "Error: %s\n", cudaGetErrorString(err) );
    return true;
}

/// Transposes the tab-delimited files "raw data <n>.xls" (n = 0 .. totalfiles - 1)
/// from the current directory into "trns-raw data <n>.xls" using the GPU.
///
/// Every file is parsed into one flat host array, copied to the device,
/// transposed by `kernel`, copied back and written out tab-delimited.
/// Progress and errors go to "Logging.txt"; a file that cannot be opened,
/// parsed or transposed is skipped and recorded in the failure flags.
/// Returns 0 after all files were attempted; exits with status 1 when the
/// log file cannot be opened or the host staging buffer cannot be allocated.
int main(void)
{
    char log_filename[30] = "Logging.txt"; //Holds log filename
    char in_filename[30] = ""; //Holds input filename
    char out_filename[30] = ""; //Holds output filename
    errno_t openerrcode; //Error code returned by fopen_s
    unsigned int totalfiles = 560; //Enter total number of files to transpose
    bool reading_failedflag = false; //Set when any input file could not be opened
    bool writing_failedflag = false; //Set when any output file could not be opened
    bool global_failedflag = false; //Set when any error occurred during the whole run
    unsigned int filecount = 0; //Total count of files successfully processed

    ///Opens a log stream to log all messages into log file for later review.
    //The assignment is parenthesised so the real error code is stored, not the result of "!= 0".
    if( (openerrcode = fopen_s( &streamlog, log_filename, "w" )) != 0 )
    {
        printf( "The log file %s was not opened for writing. Error code: %d\n", log_filename, openerrcode );
        getchar();
        exit(1); //Exits the program
    }

    ///Host staging buffer for the parsed elements; allocated once and reused for every file.
    REAL *allElems = (REAL *) malloc(sizeof(REAL) * MAX_ELEMS);
    if( allElems == NULL )
    {
        printf( "Could not allocate host memory for %d elements.\n", MAX_ELEMS );
        fprintf( streamlog, "Could not allocate host memory for %d elements.\n", MAX_ELEMS );
        fclose( streamlog );
        getchar();
        exit(1); //Exits the program
    }

    ///Initiate time-monitored transpose process.
    time_t start, end;
    time (&start);
    printf("Initial time: %s", ctime(&start));
    fprintf( streamlog, "Initial time: %s", ctime(&start));
    printf("Transposing file(s) started...\n");
    fprintf( streamlog, "Transposing file(s) started...\n");

    ///Loop through each input file.
    for (unsigned int i = 0; i < totalfiles; ++i)
    {
        char s[MAX_LEN]; //Holds one row at a time
        unsigned int numelems = 0; //Total number of elements read from the file
        unsigned int numrows = 0; //Total number of non-empty rows read from the file
        char delims[] = "\t\r\n"; //Tab-delimited cells; line terminators are stripped from the last cell
        char *result; //Current token (single element) of the row
        bool too_large = false; //Set when the file holds more than MAX_ELEMS elements

        sprintf_s(in_filename, "%s%u%s", "raw data ", i, ".xls"); //Input file. Format is "raw data <number>.xls"
        sprintf_s(out_filename, "%s%s", "trns-", in_filename); //Output file. Format is "trns-raw data <number>.xls"

        ///Opens reading stream (for input file) and writing stream (for output file).
        if( (openerrcode = fopen_s( &streamread, in_filename, "r" )) != 0 )
        {
            fprintf( streamlog, "The file %s was not opened for reading. Error code: %d.\n", in_filename, openerrcode );
            reading_failedflag = true; //Recording error
            global_failedflag = true; //Recording error
            continue; //Continues to next file
        }
        if( (openerrcode = fopen_s( &streamwrite, out_filename, "w" )) != 0 )
        {
            fprintf( streamlog, "The file %s was not opened for writing. Error code: %d.\n", out_filename, openerrcode );
            writing_failedflag = true; //Recording error
            global_failedflag = true; //Recording error
            fclose( streamread ); //The input stream was already open; do not leak it
            continue; //Continues to next file
        }

        ///Reads rows until fgets reports end of file (or a read error).
        while( !too_large && fgets(s, MAX_LEN, streamread) != NULL )
        {
            if (strlen(s) <= 1) continue; //Bypass empty rows
            unsigned int rowelems = 0; //Elements found in this row
            ///Stores every element of the row sequentially into the flat array.
            for (result = strtok( s, delims ); result != NULL; result = strtok( NULL, delims ))
            {
                if (numelems >= MAX_ELEMS)
                {
                    too_large = true; //Never write past the end of allElems
                    break;
                }
                allElems[numelems] = (REAL) atof(result); //Element is converted to numeric format
                numelems += 1;
                rowelems += 1;
            }
            if (rowelems > 0)
                numrows += 1; //Only rows that produced data count towards the matrix height
        }

        ///Rejects inputs that cannot form a rectangular matrix instead of dividing by zero or transposing garbage.
        bool failed = false;
        if (too_large)
        {
            fprintf( streamlog, "The file %s holds more than %d elements and was skipped.\n", in_filename, MAX_ELEMS );
            failed = true;
        }
        else if (numrows == 0)
        {
            fprintf( streamlog, "The file %s contains no data and was skipped.\n", in_filename );
            failed = true;
        }
        else if (numelems % numrows != 0)
        {
            fprintf( streamlog, "The file %s has rows of different lengths and was skipped.\n", in_filename );
            failed = true;
        }

        REAL *h_odata = NULL; //Transposed result as received from device
        REAL *d_idata = NULL; //Holds the entire matrix on the device
        REAL *d_odata = NULL; //Holds the resulting transposed matrix on the device

        if (!failed)
        {
            const unsigned int width = numelems / numrows; //Number of columns of the input matrix
            const size_t mem_size = (size_t) numelems * sizeof(REAL); //Bytes needed for one copy of the matrix

            h_odata = (REAL *) malloc(mem_size);
            if (h_odata == NULL)
            {
                fprintf( streamlog, "Could not allocate host memory for the result of file %s.\n", in_filename );
                failed = true;
            }

            ///Allocate device memory and copy the matrix to the device. Each step runs only if the previous one succeeded.
            if (!failed) failed = logCudaFailure( cudaMalloc( (void**) &d_idata, mem_size ) );
            if (!failed) failed = logCudaFailure( cudaMalloc( (void**) &d_odata, mem_size ) );
            if (!failed) failed = logCudaFailure( cudaMemcpy(d_idata, allElems, mem_size, cudaMemcpyHostToDevice) );

            ///Perform transpose operation and return the transposed result to host.
            if (!failed)
            {
                //One single-thread block per input column; each thread copies one column into one output row.
                kernel<<<width, 1>>>(d_odata, d_idata, width, numrows);
                //The launch is asynchronous: cudaGetLastError reports launch-time errors,
                //the synchronize call reports errors raised while the kernel was running.
                failed = logCudaFailure( cudaGetLastError() );
            }
            if (!failed) failed = logCudaFailure( cudaDeviceSynchronize() );
            if (!failed) failed = logCudaFailure( cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost) );

            ///Write the transposed output to output stream: numrows values per line.
            if (!failed)
            {
                unsigned int traceline = 0; //Column counter within the current output line
                for (unsigned int j = 0; j < numelems; ++j)
                {
                    ++traceline;
                    if (traceline == numrows)
                    {
                        fprintf(streamwrite, "%.10f\n", h_odata[j]); //Last value of the line
                        traceline = 0; //Reset column counter
                    }
                    else
                        fprintf(streamwrite, "%.10f\t", h_odata[j]); //End of line not reached
                }
            }
        }

        ///Close both streams and release per-file memory on every path.
        fclose( streamwrite );
        fclose( streamread );
        free( h_odata ); //free(NULL) is a no-op
        cudaFree( d_idata ); //cudaFree(NULL) is a no-op
        cudaFree( d_odata );

        if (failed)
        {
            global_failedflag = true; //Recording error
            fprintf( streamlog, "The file %s was not transposed.\n", in_filename );
            continue; //Continues to next file
        }
        ++filecount;
        printf( "The file %s was transposed successfully.\n", in_filename );
        fprintf( streamlog, "The file %s was transposed successfully.\n", in_filename );
    }
    free( allElems );

    printf("%u / %u file(s) transposed. Transposing completed.\n", filecount, totalfiles);
    fprintf( streamlog, "%u / %u file(s) transposed. Transposing completed.\n", filecount, totalfiles);

    ///Display completion time.
    time (&end);
    printf("Final time: %s", ctime(&end));
    fprintf( streamlog, "Final time: %s", ctime(&end));
    printf("Time taken: %.2lf seconds\n", difftime(end, start));
    fprintf( streamlog, "Time taken: %.2lf seconds\n", difftime(end, start));

    //Report error if any file failed read/write
    if (reading_failedflag)
    {
        printf("Failed to open some file(s) for reading.");
        fprintf( streamlog, "Failed to open some file(s) for reading.");
    }
    if (writing_failedflag)
    {
        printf("Failed to open some file(s) for writing.");
        fprintf( streamlog, "Failed to open some file(s) for writing.");
    }
    if (global_failedflag)
        printf("\nProgram didn't execute normally. Please view the file %s for log information.", log_filename);
    else
        printf("\nProgram executed normally.");

    fclose( streamlog ); //Flush and close the log before waiting for the user
    getchar();
    return 0;
}
Here’s my code to transpose .xls files from a directory using CUDA on the GPU. However, it is taking much more time than it should. I found out that this may be due to a limitation of the CPU rather than the GPU: the CPU takes a long time to read the data from each .xls file and then send it to the GPU. Can you suggest some ways in which I can reduce the run time, either by using CPU multi-threading (reading multiple files simultaneously and sending them to the GPU) or in any other way? Please help me out…