Multi-threading in host(CPU)

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdafx.h>

#include <stdio.h>

#include <stdlib.h>

/* Capacity, in characters, of the buffer that receives one row of an input file. */
#define MAX_LEN 5000

/* Capacity, in elements, of the array that stores a whole file (all rows and columns, serially). */
#define MAX_ELEMS 300000

/*
 * Element type used on both host and device. It is float here; the original
 * note records that float precision was found insufficient and double was
 * tried instead, at twice the memory. Change this single definition to switch.
 */
#define REAL float

/* File streams shared by the program: log file, current input file, current output file. */
FILE *streamlog, *streamread, *streamwrite;

/*
 * Transposes a matrix on the device.
 *
 * idata is read as a row-major matrix with `height` rows and `width` columns;
 * odata receives the row-major transpose (`width` rows, `height` columns):
 *     odata[col * height + row] = idata[row * width + col]
 *
 * Parameters:
 *   odata  - device buffer of at least width * height elements (written)
 *   idata  - device buffer of at least width * height elements (read)
 *   width  - number of columns of the input matrix
 *   height - number of rows of the input matrix
 *
 * Launch layout: any 1D or 2D grid/block shape is valid. Threads stride over
 * columns along x and over rows along y, so every element is covered and no
 * thread touches memory outside the matrix even when the grid does not match
 * the matrix size. A launch of `width` single-thread blocks along x (the
 * configuration used by main) makes each thread copy exactly one column.
 * No shared memory is used. odata and idata must not overlap.
 */
__global__ void kernel(REAL *odata, REAL *idata, unsigned int width, unsigned int height)
{
	//Total number of threads along each axis = distance between two items handled by the same thread
	const size_t colStride = (size_t) gridDim.x * blockDim.x;
	const size_t rowStride = (size_t) gridDim.y * blockDim.y;

	//First column / row handled by this thread
	const size_t firstCol = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
	const size_t firstRow = (size_t) blockIdx.y * blockDim.y + threadIdx.y;

	//The loop bounds double as the out-of-range guard for surplus threads
	for (size_t col = firstCol; col < width; col += colStride)
		for (size_t row = firstRow; row < height; row += rowStride)
			odata[col * height + row] = idata[row * width + col];
}

/*
 * Writes the message for a failed CUDA call to the log stream.
 * Returns true when err is cudaSuccess, false otherwise.
 */
static bool cudaSucceeded(cudaError_t err)
{
	if (err == cudaSuccess)
		return true;

	fprintf( streamlog, "Error: %s\n", cudaGetErrorString(err) );
	return false;
}

/*
 * Transposes the tab-delimited files "raw data <n>.xls" (n = 0 .. totalfiles-1)
 * into "trns-raw data <n>.xls", using the device for the transpose itself.
 *
 * For every file: all rows are parsed into one host array, copied to the
 * device, transposed by `kernel`, copied back and written out. Progress and
 * errors go to "Logging.txt"; a file is counted and reported as transposed
 * only when every step succeeded. Host and device buffers are allocated once
 * and reused for all files.
 *
 * Returns 0 after processing all files (individual failures are reported via
 * the log), 1 if the working buffers cannot be allocated; exits with status 1
 * if the log file cannot be opened.
 */
int main(void)
{
	char log_filename[30] = "Logging.txt"; //Holds log filename
	char in_filename[30] = ""; //Holds input filename
	char out_filename[30] = ""; //Holds output filename
	errno_t openerrcode; //Error code returned for errors occurring during file opening
	unsigned int totalfiles = 560; //Enter total number of files to transpose
	bool reading_failedflag = false; //Flag to remember if any file reading caused error
	bool writing_failedflag = false; //Flag to remember if any file writing caused error
	bool global_failedflag = false; //Flag to remember if any error occurred during whole program execution
	unsigned int filecount = 0; //Total count of files successfully processed

	///Opens a log stream to log all messages into log file for later review.
	//The call is kept separate from the test so that openerrcode holds the real error code, not the result of "!= 0"
	openerrcode = fopen_s( &streamlog, log_filename, "w" );
	if( openerrcode != 0 )
	{
		printf( "The log file %s was not opened for writing. Error code: %d\n", log_filename, openerrcode );
		getchar();
		exit(1); //Exits the program
	}

	///Initiate time-monitored transpose process.
	time_t start, end;
	time (&start);
	printf("Initial time: %s", ctime(&start));
	fprintf( streamlog, "Initial time: %s", ctime(&start));
	printf("Transposing file(s) started...\n");
	fprintf( streamlog, "Transposing file(s) started...\n");

	///Allocate the working buffers once; each can hold the largest accepted file (MAX_ELEMS elements).
	const size_t buf_bytes = sizeof(REAL) * MAX_ELEMS;
	REAL *allElems = (REAL *) malloc(buf_bytes); //Holds all elements of the file, one file at a time
	REAL *h_odata = (REAL *) malloc(buf_bytes); //Holds the transposed result as received from device
	REAL *d_idata = NULL; //Holds the entire matrix on the device
	REAL *d_odata = NULL; //Holds the resulting transposed matrix on the device

	bool buffers_ok = (allElems != NULL) && (h_odata != NULL);
	if (!buffers_ok)
		fprintf( streamlog, "Error: not enough host memory for the working buffers.\n");
	buffers_ok = buffers_ok
		&& cudaSucceeded( cudaMalloc( (void**) &d_idata, buf_bytes) )
		&& cudaSucceeded( cudaMalloc( (void**) &d_odata, buf_bytes) );

	if (!buffers_ok)
	{
		printf("Could not allocate the working buffers. Please view the file %s for log information.\n", log_filename);
		cudaFree( d_idata ); //cudaFree and free accept NULL
		cudaFree( d_odata );
		free( h_odata );
		free( allElems );
		fclose( streamlog );
		getchar();
		return 1;
	}

	///Loop through each input files.
	for (unsigned int i = 0; i < totalfiles; ++i)
	{
		char s[MAX_LEN]; //Holds one row at a time
		unsigned int numelems = 0; //Counter to hold the total number of elements in a file
		unsigned int numrows = 0; //Counter to hold the total number of rows in a file
		char delims[] = "\t"; //File is tab-delimited. Change to comma or pipe accordingly if file-format changes
		char *result; //The tokenized result (single element) of the current row
		bool file_ok = true; //Cleared as soon as any step fails for this file

		sprintf_s(in_filename, "%s%u%s", "raw data ", i, ".xls"); //Holds the input file. Format is "raw data <number>.xls"
		sprintf_s(out_filename, "%s%s", "trns-", in_filename); //Holds the output file. Format is "trns-raw data <number>.xls"

		///Opens reading stream (for input file) and writing stream (for output file).
		openerrcode = fopen_s( &streamread, in_filename, "r" );
		if( openerrcode != 0 )
		{
			fprintf( streamlog, "The file %s was not opened for reading. Error code: %d.\n", in_filename, openerrcode );
			reading_failedflag = true; //Recording error
			global_failedflag = true; //Recording error
			continue; //Continues to next file
		}

		openerrcode = fopen_s( &streamwrite, out_filename, "w" );
		if( openerrcode != 0 )
		{
			fprintf( streamlog, "The file %s was not opened for writing. Error code: %d.\n", out_filename, openerrcode );
			writing_failedflag = true; //Recording error
			global_failedflag = true; //Recording error
			fclose( streamread ); //The input stream is already open; do not leak it
			continue; //Continues to next file
		}

		///Reads till the end of the input file/stream.
		while( file_ok && fgets(s, MAX_LEN, streamread) != NULL )
		{
			size_t len = strlen(s);

			if (len <= 1) continue; //Bypass empty rows

			//Buffer completely filled without a newline: the row did not fit and would be split into two rows
			if (len == MAX_LEN - 1 && s[len - 1] != '\n')
			{
				fprintf( streamlog, "The file %s has a row longer than %d characters.\n", in_filename, MAX_LEN - 1 );
				file_ok = false;
				break;
			}

			///Reads each element and stores all data sequentially into an array (single column, multiple rows).
			for (result = strtok( s, delims ); result != NULL; result = strtok( NULL, delims ))
			{
				if (numelems >= MAX_ELEMS) //allElems is full; refuse to write past its end
				{
					fprintf( streamlog, "The file %s has more than %d elements.\n", in_filename, MAX_ELEMS );
					file_ok = false;
					break;
				}

				allElems[numelems] = (REAL) atof(result); //Element is converted to numeric format and stored in array
				numelems += 1; //Recording total number of elements in the file
			}

			numrows += 1; //Recording total number of rows in the file
		}

		if (file_ok && ferror( streamread ))
		{
			fprintf( streamlog, "The file %s could not be read completely.\n", in_filename );
			file_ok = false;
		}

		//The column count is numelems / numrows, so the file must be non-empty and rectangular
		if (file_ok && (numrows == 0 || numelems % numrows != 0))
		{
			fprintf( streamlog, "The file %s is empty or its rows do not all have the same number of elements.\n", in_filename );
			file_ok = false;
		}

		if (file_ok)
		{
			const size_t mem_size = (size_t) numelems * sizeof(REAL); //Bytes occupied by this file's matrix
			const unsigned int numcols = numelems / numrows; //Number of columns in the input matrix

			///Copy in-memory data into device.
			file_ok = cudaSucceeded( cudaMemcpy(d_idata, allElems, mem_size, cudaMemcpyHostToDevice) );

			///Perform transpose operation and return the transposed result to host.
			if (file_ok)
			{
				//One single-thread block per input column, so that every xIndex maps to a valid column
				kernel<<<numcols, 1>>>(d_odata, d_idata, numcols, numrows);

				//The launch is asynchronous: cudaGetLastError catches launch-time errors,
				//the synchronize call catches errors raised while the kernel runs
				file_ok = cudaSucceeded( cudaGetLastError() )
					&& cudaSucceeded( cudaDeviceSynchronize() );
			}

			if (file_ok)
				file_ok = cudaSucceeded( cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost) );
		}

		///Write the transposed output to output stream.
		if (file_ok)
		{
			unsigned int traceline = 0; //Column counter flag

			for (unsigned int j = 0; j < numelems; ++j)
			{
				++traceline;

				if (traceline == numrows)
				{
					//Last element of an output row (each output row has numrows elements): end the line
					fprintf(streamwrite, "%.10f\n", h_odata[j]);
					traceline = 0; //Reset column counter flag
				}
				else
					fprintf(streamwrite, "%.10f\t", h_odata[j]); //End of line not reached; proceed with writing in the same line
			}

			if (ferror( streamwrite ))
			{
				fprintf( streamlog, "The file %s could not be written completely.\n", out_filename );
				writing_failedflag = true; //Recording error
				file_ok = false;
			}
		}

		//Close both reading and writing streams
		fclose( streamwrite );
		fclose( streamread );

		if (file_ok)
		{
			++filecount;
			printf( "The file %s was transposed successfully.\n", in_filename );
			fprintf( streamlog, "The file %s was transposed successfully.\n", in_filename );
		}
		else
			global_failedflag = true; //Recording error
	}

	//Release the buffers shared by all files
	cudaFree( d_idata );
	cudaFree( d_odata );
	free( h_odata );
	free( allElems );

	printf("%u / %u file(s) transposed. Transposing completed.\n", filecount, totalfiles);
	fprintf( streamlog, "%u / %u file(s) transposed. Transposing completed.\n", filecount, totalfiles);

	///Display completion time.
	time (&end);
	printf("Final time: %s", ctime(&end));
	fprintf( streamlog, "Final time: %s", ctime(&end));
	printf("Time taken: %.2lf seconds\n", difftime(end, start));
	fprintf( streamlog, "Time taken: %.2lf seconds\n", difftime(end, start));

	//Report error if any file failed read/write
	if (reading_failedflag)
	{
		printf("Failed to open some file(s) for reading.");
		fprintf( streamlog, "Failed to open some file(s) for reading.");
	}

	if (writing_failedflag)
	{
		printf("Failed to open some file(s) for writing.");
		fprintf( streamlog, "Failed to open some file(s) for writing.");
	}

	if (global_failedflag)
		printf("\nProgram didn't execute normally. Please view the file %s for log information.", log_filename);
	else
		printf("\nProgram executed normally.");

	fclose( streamlog ); //Flush and close the log before waiting for the key press

	getchar();
	return 0;
}

Here’s my code to transpose .xls files from a directory using CUDA on the GPU. However, it is taking much more time than it should. I found that this may be due to a limitation of the CPU rather than the GPU: the CPU takes a long time to read the data from each .xls file and send it to the GPU. Can you suggest some ways to reduce the execution time, either by using CPU multi-threading (reading multiple files simultaneously and sending them to the GPU) or in any other way? Please help me out…

Your kernel is unnecessarily sequential. You probably don’t even need a for loop there.
You’re also launching a large number of blocks of 1 thread each.
Something tells me you need a better understanding of CUDA first.

I am new to CUDA, so I am not able to optimize the code. Can you please tell me how many threads I should define to make it run faster? Please help…