8-16 fold speed up on reading AsciiGrid files by using the GPU to do the Ascii to float conversions.
Hi,
I was reading in some AsciiGrid files the other day and noticed that it was taking minutes to run and seemed CPU bound.
As the GPU could process the data in a couple of seconds the file IO was a real bottleneck.
I thought about how to use the GPU to speed it up and have a working solution, this is only a prototype and I’m sure can be improved in many ways.
However a large file that was taking about 5 minutes to convert on the CPU is taking ~30 seconds when the task is shared between CPU and GPU.
Reading a similar sized binary file takes about 9-10 seconds so the prototype code is already ~1/3rd the speed of doing a simple binary read.
Design
Each block is responsible for a 1024 byte ‘chunk’ of the file.
The host reads 240*1024 bytes using a single fread and this data is copied to device (GPU) memory.
A kernel with 240 blocks is fired to process it. Each block copies its 1024 ‘chunk’ from global device RAM to its shared cache then works on it.
As numbers may fall partly in two blocks I am using a rule that the number ‘belongs to’ the block it ends in.
Example 1
501.3333 is split between blocks A and B, so it is processed by block B
… 497.92 501.3333 504 …
- block A -----><— block B –
For simplicity each block also copies into its shared cache an overlap area of the last 32 bytes belonging to the previous block.
I have chosen a 32 byte overlap because floats vary greatly in their textual representation, 16 byte overlap may not be large enough for some double precision numbers but 32 bytes should be.
So each block converts all numbers that end in its 1024 bytes from ascii to float and puts each result into the appropriate element of the result array.
However many applications trim trailing zeros from a number and even the decimal point e.g. the number 504.00000 would be output as just “504”
As example 2 shows the field size of a value in the same file can vary considerably e.g. one byte for the value “0” to nine bytes for 501.33951
Example 2
… -9999 0 497.92 501.33951 504 …
This variation makes it impossible to put a result into the right element of an array without knowing how many array elements every preceding block found.
The only way to guarantee this is to use 2 Kernel calls with a cudaThreadSynchronize() between them, so the approach becomes:
-
Host reads 240k of file
-
the data is copied to the device memory (GPU)
-
blocks of the 1st Kernel count the number of values that fall in their 1k ‘chunks’ of the file and save these counts in device memory
-
call cudaThreadSynchronize()
-
2nd Kernel does actual conversion and uses the counts made by 1st Kernel to store its output in the correct elements of the array.
-
wait until 2nd kernel has completed
repeat
Anyone who wishes to test the attached prototype code is welcome to do so.
Note it is an unfinished prototype with no guarantees
I am sure that there are many ways that it can be improved on and I hope that others will do so.
Some areas for improvement are:
-
accuracy ( e.g. text value of “537.5552” is read by fscanf as 537.555176 but by the prototype as 537.555237 )
-
range of formats it supports. at present only supports single precision and does not support scientific ‘E’ notation
-
speed
I would suggest that speed improvements be done after accuracy and extra functionality.
[codebox]
// Cuda code
// Aim: parse ~200k of a file at a time
// EndFieldKernel: each block finds the positions of the end char of any fields in its 'chunk'
// toFloatKernel: each thread does ascii to float (atof) conversion on one field in a 'chunk', may need some chars from previous chunk
// Keep shared memory usage small so 3 blocks can run concurrently ( < 16k/3 bytes each)
// best if SHAPOSNS is SEARCHTHREADS*MAXPERSEARCHTHREAD and SHALINE is (2*SHAPOSNS)+OVERLAP; MAXCHARS should then be just under MAXBLOCKS*2*SHAPOSNS
// (NOTE(review): the '*' operators above were stripped by the forum paste - reconstructed from the values below: 512=32*16, 1056=2*512+32, 240000 < 240*2*512)
// NB need to make some changes if you make MAXBLOCKS > THREADSPERBLOCK, or if you change THREADSPERBLOCK
// NB SEARCHTHREADS must always be a multiple of Packing Threads
// CHUNKSIZE should be 2x THREADSPERBLOCK, SHAPOSNS equal to THREADSPERBLOCK, SHALINE = CHUNKSIZE+OVERLAP, MAXCHARS <= MAXBLOCKS*CHUNKSIZE
#define THREADSPERBLOCK 512
#define SEARCHTHREADS 32
#define MAXPERSEARCHTHREAD 16
#define SHAPOSNS 512
#define CHUNKSIZE 1024
#define OVERLAP 32
#define SHALINE 1056
#define MAXCHARS 240000
#define MAXBLOCKS 240
// Shared-memory buffers used by both kernels (the "__shared__" qualifiers were
// lost in the forum paste - restored here).
__shared__ char shLine[SHALINE];         // this block's CHUNKSIZE bytes of text, preceded by OVERLAP bytes of the previous chunk
__shared__ short shPosns[SHAPOSNS];      // end-of-field positions within shLine (toFloatKernel briefly reuses it for per-block counts)
__shared__ char shCounts[SEARCHTHREADS]; // number of fields each search thread found
// Copy this block's CHUNKSIZE bytes of d_line into shared shLine, preceded by
// the OVERLAP bytes that come before the chunk, so that a number split across
// a chunk boundary can still be parsed in full by this block.
// d_line    - device buffer holding the latest fread of the file
// len       - number of bytes of d_line actually read from disk (normally MAXCHARS)
// d_unProcd - last OVERLAP bytes of the PREVIOUS fread (consumed only by block 0)
// NB caller must __syncthreads() before reading shLine.
__device__ void populateShLine(char* d_line, long len, char* d_unProcd)
{
    long loc = (blockIdx.x * CHUNKSIZE + threadIdx.x);
    // The first OVERLAP bytes of shLine hold the last OVERLAP chars preceding this chunk
    if ( threadIdx.x < OVERLAP)
    {
        if ( blockIdx.x == 0 )
            shLine[threadIdx.x] = d_unProcd[threadIdx.x]; // tail of the previous 'read' of the file
        else
            shLine[threadIdx.x] = d_line[loc-OVERLAP];    // tail of the previous block's portion of d_line
    }
    // All threads now copy this block's own bytes from d_line into shLine
    int cell = threadIdx.x + OVERLAP;
    long src = loc;
    for ( int ii = 0; ii < CHUNKSIZE/THREADSPERBLOCK; ii++ )
    {
        if ( src < len)
            shLine[cell] = d_line[src];
        else
            shLine[cell] = 255; // pad beyond end-of-data with a byte that is neither digit, '.' nor separator
        cell = cell + THREADSPERBLOCK;
        src = src + THREADSPERBLOCK;
    }
}
// Each of the first SEARCHTHREADS threads scans its own CHUNKSIZE/SEARCHTHREADS
// byte slice of shLine for characters that END a numeric field: a digit or '.'
// immediately followed by whitespace/NUL.  End positions are stored in this
// thread's MAXPERSEARCHTHREAD-slot slice of shPosns, the count in shCounts.
// NB at most MAXPERSEARCHTHREAD fields fit per slice (smallest field, e.g. "0 ",
// is 2 bytes, so a 32-byte slice holds at most 16 field ends).
__device__ void searchSeparators()
{
    // ASCII codes.  BUG FIX: horizontal tab is 9; the original value 11 is
    // vertical tab, so genuinely tab-separated fields were never detected.
    const int space = 32; const int dot = 46; const int zero = 48; const int nine = 57; const int tab = 9; const int LF = 10; const int CR = 13;
    int count = -1;
    int myPart = CHUNKSIZE/SEARCHTHREADS;       // bytes scanned by each search thread
    int offset = OVERLAP + threadIdx.x*myPart;  // first candidate separator position
    for (int ii = 0; ii < myPart; ii++)
    {
        // A field ends on a digit or a '.' (trailing '.' allowed, e.g. "-9999.")
        if ( (shLine[offset-1] >= zero && shLine[offset-1] <= nine) || shLine[offset-1] == dot)
        {
            if ( shLine[offset] <= space ) // cheap pre-test: every separator is <= ' '
            {
                if ((shLine[offset] == space || shLine[offset] == tab || shLine[offset] == 0 || shLine[offset]==LF || shLine[offset]==CR) )
                {
                    count++;
                    shPosns[ threadIdx.x*MAXPERSEARCHTHREAD + count] = offset-1;
                }
            }
        }
        offset++;
    }
    shCounts[threadIdx.x] = count+1; // number of field ends this thread found
}
// Convert the text field of shLine that ENDS at index endchar into a float.
// Scans right-to-left so it never needs to know where the field starts.
// Supports an optional sign and decimal point, up to ~9 digits; no exponential
// 'E' notation.  Forms such as "-9999" and "-9999." are accepted (data written
// by Arc and other packages); as a side effect "--9999" and "+9999" also pass.
// Returns NaN if the field contains an invalid character.
__device__ float asciiToFloat( int endchar )
{
    // BUG FIX: horizontal tab is ASCII 9 (11 is vertical tab)
    const int space = 32; const int minus = 45; const int dot = 46; const int zero = 48; const int tab = 9; const int plus = 43; const int LF = 10; const int CR = 13;
    int ii = endchar;
    long num = 0;      // all digits accumulated as an integer, ignoring the dot
    int count = 1;     // place value of the next digit (powers of 10)
    int digit = 0;
    int error = 0;
    int sign = 0;
    int divisor = -99; // becomes 10^(fraction digits) when a '.' is seen
    float result;
    // Work from the rightmost character leftwards, stopping at whitespace or a sign
    while ( ii >= 0)
    {
        digit = shLine[ii];
        if ( digit-zero >= 0 && digit-zero <= 9 )
        {
            num = num + count * (digit-zero);
            count = count*10;
        }
        else
        {
            if ( digit == dot )
            {
                if (divisor < 0 )
                    divisor = count;
                else
                    error = 1; // don't allow ".."
            }
            else
            {
                ii = -1; // any other character terminates the scan
                if ( digit == minus )
                    sign = 1;
                else
                {
                    if ( digit != plus && digit != space && digit != tab && digit != LF && digit != CR )
                        error = 1;
                }
            }
        }
        ii--;
    }
    if ( error == 1)
        // BUG FIX: the original assigned the INT 0x7fffffff, which converts to
        // 2147483648.0f, not NaN.  Reinterpreting the bit pattern gives a quiet NaN.
        result = __int_as_float(0x7fffffff);
    else
    {
        if ( divisor < 0 )
            result = (float) num; // no decimal point seen: integer value
        else
            // Divide in double for accuracy (e.g. "537.5552" parsed poorly in
            // single precision).  NB requires compute capability 1.3+ for
            // double support; revert to float division on older devices.
            result = (float)( (double)num / (double)divisor );
        if ( sign != 0)
            result = -result;
    }
    return result;
}
// Kernel pass 1: each block finds, within its CHUNKSIZE-byte chunk of d_line,
// the position of the last character of every numeric field (a digit or '.'
// followed by whitespace/NUL).  Launch with THREADSPERBLOCK threads per block,
// one block per CHUNKSIZE bytes of data.
// d_line    - device copy of the latest fread of the file
// len       - bytes of d_line actually read
// d_Posns   - out: packed end positions, SHAPOSNS slots per block
// d_Counts  - out: number of fields found by each block
// d_unProcd - last OVERLAP bytes of the previous fread (consumed by block 0)
__global__ void EndFieldKernel( char* d_line, long len, int* d_Posns, int* d_Counts, char* d_unProcd)
{
    // 1: Setup. All threads copy characters from the global array into shared shLine
    populateShLine(d_line, len, d_unProcd);
    __syncthreads();
    // 2: Search for separators (only the first SEARCHTHREADS threads take part)
    if ( threadIdx.x < SEARCHTHREADS )
        searchSeparators();
    __syncthreads();
    // 3: Pack the SEARCHTHREADS per-thread position lists into one contiguous list.
    // Safe in-place because 'used' <= tt*MAXPERSEARCHTHREAD at iteration tt, so
    // writes never reach the slots the next iteration will read.
    int val;
    int used = 0;
    for ( int tt = 1; tt < SEARCHTHREADS; tt++)
    {
        used = used + shCounts[tt-1];
        if ( threadIdx.x < MAXPERSEARCHTHREAD )
            val = shPosns[tt*MAXPERSEARCHTHREAD + threadIdx.x];
        __syncthreads(); // essential: all reads must complete before other data is moved into their slots
        if ( threadIdx.x < MAXPERSEARCHTHREAD )
            shPosns[used + threadIdx.x ] = val;
        __syncthreads();
    }
    used = used + shCounts[SEARCHTHREADS-1];
    if (threadIdx.x == 0)
        d_Counts[blockIdx.x] = used; // total number of fields found by this block
    // 4: Save the packed positions to the global array
    __syncthreads();
    if ( threadIdx.x < used )
        d_Posns[ blockIdx.x*SHAPOSNS + threadIdx.x ] = shPosns[threadIdx.x];
}
// Kernel pass 2: converts every field found by EndFieldKernel into a float and
// stores it at its correct slot in d_Data.  Must run after EndFieldKernel has
// completed (host synchronises between the launches) so every block's count is
// available for the prefix sum below.
// offset - number of result elements written by previous host-loop iterations
// d_resultArrayElementsDone - out: running total of elements written so far
// NB the count-gathering below requires blockIdx.x < THREADSPERBLOCK,
// i.e. MAXBLOCKS <= THREADSPERBLOCK (see header comment at top of file).
__global__ void toFloatKernel( char* d_line, float* d_Data, long len, int* d_Posns, int* d_Counts, char* d_unProcd, long offset, long* d_resultArrayElementsDone )
{
    // 1: Setup - copy this block's CHUNKSIZE bytes (plus overlap) into shLine
    populateShLine(d_line, len, d_unProcd);
    // d_Counts holds the field count of every block; each block sums the counts
    // of all blocks before it to find where its own results start in d_Data.
    // (shPosns is briefly pre-used to hold the counts)
    if (threadIdx.x <= blockIdx.x )
        shPosns[threadIdx.x] = d_Counts[threadIdx.x];
    __syncthreads();
    long oset = offset;
    for ( int ii = 0; ii < blockIdx.x; ii++) // NB blockIdx.x is unsigned: never loop down via blockIdx.x-1
        oset = oset + shPosns[ii];
    int used = shPosns[blockIdx.x];
    // Read this block's packed end positions back from the global array
    __syncthreads();
    if ( threadIdx.x < used )
        shPosns[threadIdx.x] = d_Posns[blockIdx.x*SHAPOSNS + threadIdx.x ];
    __syncthreads();
    // 2: One thread per field converts its string to float
    if ( threadIdx.x < used )
        d_Data[oset+threadIdx.x] = asciiToFloat( shPosns[threadIdx.x] );
    // 3: The last block records the total fields written by all kernel calls so far
    if ( blockIdx.x == gridDim.x-1 && threadIdx.x == 0 )
        d_resultArrayElementsDone[0] = oset + used;
    // Save the last OVERLAP bytes of d_line; block 0 of the next kernel call needs them
    if ( blockIdx.x == 0 && threadIdx.x < OVERLAP && (len-OVERLAP+threadIdx.x) >= 0)
        d_unProcd[ threadIdx.x ] = d_line[len-OVERLAP+threadIdx.x];
}
[/codebox]
[codebox]
// Host code
//#define COLUMNS 18010
//#define ROWS 22970
#define ROWS 4800
#define COLUMNS 6000
// Metadata for one grid file.  NOTE(review): field semantics inferred from the
// AsciiGrid context; confirm against readAsciiHeader (not shown in this file).
typedef struct
{
size_t sizeT;           // presumably the byte size of the data block - TODO confirm
int rows;               // number of grid rows
int columns;            // number of grid columns
float north;            // presumably grid bounding-box edges from the header - TODO confirm
float south;
float west;
float east;
float cellsize;         // cell size from the AsciiGrid header - units not shown here
float noDataValue;      // sentinel marking missing cells (e.g. -9999 in example 2 above)
char storedAs[8];
char element[20];
char elementSubType[20];
long dataForDate;       // NOTE(review): looks like a date stamp - confirm encoding
} gridMeta_t;
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// read a line or portion of the file
// Read an AsciiGrid file in MAXCHARS-sized pieces, ship each piece to the GPU,
// and let EndFieldKernel / toFloatKernel convert the text into h_Data floats.
int main( int argc, char** argv)
{
    clock_t startm, stopm; // used by the START/STOP timing macros (defined elsewhere)
    // === Sizes of memory to be allocated ===
    size_t sizeT_Data = ROWS*COLUMNS * sizeof(float);
    size_t sizeT_line = MAXCHARS * sizeof(char);
    size_t sizeT_Posns = MAXBLOCKS * SHAPOSNS * sizeof(int);
    size_t sizeT_Counts = MAXBLOCKS * sizeof(int);
    size_t sizeT_unProcd = OVERLAP * sizeof(char); // carries the unprocessed tail of one 'chunk' between passes
    // BUG FIX: this buffer holds a long; sizeof(float) under-allocated it on
    // LP64 platforms and the kernel's 8-byte write overran the device buffer.
    size_t sizeT_resultArrayElementsDone = sizeof(long);
    // === pointers to host and device memory for arrays ===
    char *h_line, *d_line;
    float *h_Data, *d_Data;
    char *h_unProcd, *d_unProcd;
    int *d_Posns, *h_Posns;
    int *d_Counts, *h_Counts;
    long *h_resultArrayElementsDone, *d_resultArrayElementsDone;
    h_Data = (float *) malloc( sizeT_Data);
    h_line = (char *) malloc( sizeT_line);
    h_Counts = (int *) malloc( sizeT_Counts);
    h_Posns = (int *) malloc( sizeT_Posns);
    h_unProcd = (char *) malloc( sizeT_unProcd);
    h_resultArrayElementsDone = (long *) malloc( sizeT_resultArrayElementsDone );
    cudaMalloc( (void**) &d_Data, sizeT_Data );
    cudaMalloc( (void**) &d_line, sizeT_line );
    cudaMalloc( (void**) &d_unProcd, sizeT_unProcd );
    cudaMalloc( (void**) &d_Posns, sizeT_Posns );
    cudaMalloc( (void**) &d_Counts, sizeT_Counts );
    cudaMalloc( (void**) &d_resultArrayElementsDone, sizeT_resultArrayElementsDone );
    for ( int ii = 0; ii < ROWS*COLUMNS; ii++)
        h_Data[ii] = 0;
    for ( int ii = 0; ii < MAXBLOCKS; ii++)
        h_Counts[ii] = 0;
    for ( int ii = 0; ii < MAXBLOCKS*SHAPOSNS; ii++)
        h_Posns[ii] = 0; // BUG FIX: was copied to the device uninitialised
    for (int ii = 0; ii < OVERLAP; ii++)
        h_unProcd[ii] = 0;
    // BUG FIX: plain double quotes (the post had smart quotes), and the
    // backslash must be escaped - "\t" inside "D:\testDem.asc" was a TAB byte.
    char *demFile = "D:\\testDem.asc";
    int OK;
    gridMeta_t meta;
    START;
    size_t count;
    dim3 dimBlock( THREADSPERBLOCK );
    OK = 0;
    FILE *fp;
    fp = fopen( demFile, "r" );
    if (fp != NULL)
    {
        OK = readAsciiHeader( fp, &meta); // consumes the Ascii header lines so fread starts at the data
        cudaMemcpy( d_Counts, h_Counts, sizeT_Counts, cudaMemcpyHostToDevice );
        checkCUDAError("cudaMemcpy C");
        cudaMemcpy( d_Posns, h_Posns, sizeT_Posns, cudaMemcpyHostToDevice );
        checkCUDAError("cudaMemcpy P");
        cudaMemcpy( d_unProcd, h_unProcd, sizeT_unProcd, cudaMemcpyHostToDevice );
        checkCUDAError("cudaMemcpy U");
        cudaMemcpy( d_Data, h_Data, sizeT_Data, cudaMemcpyHostToDevice );
        checkCUDAError("cudaMemcpy D");
        long inArrayOffset = 0; // result elements filled by previous loop passes
        count = fread( h_line, sizeof(char), MAXCHARS, fp);
        while ( count > 0 )
        {
            cudaMemcpy( d_line, h_line, count, cudaMemcpyHostToDevice );
            checkCUDAError("cudaMemcpy 1");
            cudaThreadSynchronize();
            checkCUDAError("Synch");
            // One block per CHUNKSIZE bytes, rounded up for a partial final chunk
            int blocksNeeded = (int)count/(int)CHUNKSIZE;
            if ( (int)count%(int)CHUNKSIZE > 0 )
                blocksNeeded++;
            dim3 dimGrid( blocksNeeded );
            // Pass 1: count the fields in every chunk
            EndFieldKernel<<< dimGrid , dimBlock >>> ( d_line, (long)count, d_Posns, d_Counts, d_unProcd);
            checkCUDAError("kernel 1 execution");
            cudaThreadSynchronize(); // pass 2 needs every block's counts complete
            checkCUDAError("Synch");
            // Pass 2: convert and store each field in its correct array element
            toFloatKernel<<< dimGrid , dimBlock >>> ( d_line, d_Data, count, d_Posns, d_Counts, d_unProcd, inArrayOffset, d_resultArrayElementsDone);
            checkCUDAError("kernel 2 execution");
            cudaThreadSynchronize();
            checkCUDAError("Synch");
            cudaMemcpy( h_resultArrayElementsDone, d_resultArrayElementsDone, sizeT_resultArrayElementsDone, cudaMemcpyDeviceToHost );
            checkCUDAError("cudaMemcpy offset back");
            inArrayOffset = h_resultArrayElementsDone[0];
            count = fread( h_line, sizeof(char), MAXCHARS, fp);
        }
        fclose(fp);
        cudaMemcpy( h_Data, d_Data, sizeT_Data, cudaMemcpyDeviceToHost );
        checkCUDAError("cudaMemcpy results back");
    }
    // do something with the results !! (or optimiser might optimise everything away !
    // Release device and host memory (previously leaked)
    cudaFree( d_Data ); cudaFree( d_line ); cudaFree( d_unProcd );
    cudaFree( d_Posns ); cudaFree( d_Counts ); cudaFree( d_resultArrayElementsDone );
    free( h_Data ); free( h_line ); free( h_Counts );
    free( h_Posns ); free( h_unProcd ); free( h_resultArrayElementsDone );
    return 0;
}
// Abort the program with a diagnostic if the CUDA runtime has recorded an
// error since the last check.  msg identifies the call site in the message.
void checkCUDAError(const char *msg)
{
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        return;
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(-1);
}
[/codebox]