Need help on big data file

I am dealing with a big data file whose size is up to several terabytes. Allocating that much memory with malloc is impossible, so I can’t cache the whole file in host memory.

What should I do to maximize the performance? Can anybody give me some code samples or hints?

Below is my attempt at this. Any suggestions?

[codebox]// Streams the key file "1.dat" through the GPU in fixed-size chunks and
// reports the global index of the first key for which FindKey stores a hit.
//
// Two CUDA streams are used as a double buffer: while the kernel for one
// chunk runs on one stream, the host reads the next chunk from disk into the
// other stream's pinned buffer.
//
// Each stream owns its own result slot (device_result[s] / host_result[s]).
// The slot is copied back on the same stream right after the kernel, so once
// that stream has been synchronized the host value is known to belong to
// exactly the chunk recorded in chunkOnStream[s].
// NOTE(review): this assumes FindKey only writes through the result pointer
// it is given (the original allocated exactly one DWORD for it) and leaves
// it at 0xFFFFFFFF when nothing matches -- confirm against the kernel.
int main(int argc, char** argv)
{
    if (!InitCUDA()) {
        return 0;
    }

    //----------------------------------------------------------------------
    // Copy cipher text to constant memory
    DWORD cipherText[32] =
    {
        0x48236065, 0x13DE2D16,
        0x9152846E, 0x9F686492,
        0xBAED17F2, 0xE8958B0C,
        0x478623B1, 0xAC4BF0AA,
        0x23F2DD3C, 0x7E2F6F5B,
        0x084FEDEB, 0x02D216EE,
        0x0CAC6E9B, 0x437E47C0,
        0x431146B4, 0x6BECB0D7,
        0x5916460A, 0xB83A4951,
        0x73F62882, 0x33CFCF62,
        0xBD21E797, 0x1D49C596,
        0xB200228A, 0x9B4E3DBE,
        0xDCD06586, 0x65A73A2E,
        0xB0D6E10A, 0xF4B34A77,
        0xB43D1316, 0xFC683C8B,
        0, 0
    };
    CUDA_SAFE_CALL( cudaMemcpyToSymbol(CipherText, cipherText, sizeof(cipherText)) );

    //----------------------------------------------------------------------
    // define CUDA blocks and threads
    const int numBlocks = 32768;
    const int numThreads = 128;

    // One key per thread, 16 bytes per key (the chunk size was originally
    // computed as "threads << 4" and the key count as "bytes >> 4").
    const size_t keysPerChunk = (size_t)numBlocks * numThreads;
    const size_t memsize = keysPerChunk << 4;

    //----------------------------------------------------------------------
    // malloc space for input: one device buffer and one pinned host buffer
    // per stream
    ulong4 *device_keys[2];
    char *host_keys[2];
    for (int i = 0; i < 2; ++i)
    {
        CUDA_SAFE_CALL( cudaMalloc((void**) &device_keys[i], memsize) );
        CUDA_SAFE_CALL( cudaMallocHost((void**) &host_keys[i], memsize) );
    }

    // malloc space for output: one DWORD result slot per stream, preset to
    // the "not found" marker 0xFFFFFFFF on both device and host
    DWORD *device_result, *host_result;
    CUDA_SAFE_CALL( cudaMalloc((void**) &device_result, 2 * sizeof(DWORD)) );
    CUDA_SAFE_CALL( cudaMemset(device_result, 0xFF, 2 * sizeof(DWORD)) );
    CUDA_SAFE_CALL( cudaMallocHost((void**) &host_result, 2 * sizeof(DWORD)) );
    host_result[0] = 0xFFFFFFFF;
    host_result[1] = 0xFFFFFFFF;

    // create work streams
    cudaStream_t stream[2];
    CUDA_SAFE_CALL( cudaStreamCreate(&stream[0]) );
    CUDA_SAFE_CALL( cudaStreamCreate(&stream[1]) );

    // open file for read
    FILE *fp = fopen("1.dat", "rb");
    if (fp == NULL)
    {
        printf("Error: open 1.dat");
        CUT_EXIT(argc, argv);
    }

    // make timer
    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    long long chunkOnStream[2] = { -1, -1 };  // chunk last submitted to each stream, -1 = none
    long long nextChunk = 0;                  // index of the chunk about to be read
    long long foundChunk = -1;                // chunk containing the hit, -1 = no hit yet
    DWORD foundIndex = 0xFFFFFFFF;            // value FindKey stored for that chunk
    int streamId = 0;
    bool moreData = true;

    while (moreData)
    {
        // Wait for the chunk previously submitted on this stream. After this
        // its host result slot is valid and both of its buffers are free.
        CUDA_SAFE_CALL( cudaStreamSynchronize(stream[streamId]) );
        if (chunkOnStream[streamId] >= 0 && host_result[streamId] != 0xFFFFFFFF)
        {
            foundChunk = chunkOnStream[streamId];
            foundIndex = host_result[streamId];
            break;
        }

        // read file (overlaps with the work still queued on the other stream)
        size_t readLen = fread(host_keys[streamId], 1, memsize, fp);
        if (readLen < memsize)
        {
            // Short read: end of file, or an I/O error. Either way this is
            // the last chunk.
            if (ferror(fp))
                printf("Error: read 1.dat\n");
            moreData = false;
        }

        // Only whole 16-byte keys are processed; a trailing fragment is ignored.
        DWORD realNum = (DWORD)(readLen >> 4);
        if (realNum == 0)
            break;

        // upload keys, search them, then fetch this stream's result slot;
        // all three are ordered by the stream, so no host sync is needed here
        CUDA_SAFE_CALL( cudaMemcpyAsync(device_keys[streamId], host_keys[streamId], readLen, cudaMemcpyHostToDevice, stream[streamId]) );

        FindKey<<<numBlocks, numThreads, 0, stream[streamId]>>>(device_keys[streamId], device_result + streamId, realNum);
        CUT_CHECK_ERROR("Kernel execution failed\n");

        CUDA_SAFE_CALL( cudaMemcpyAsync(host_result + streamId, device_result + streamId, sizeof(DWORD), cudaMemcpyDeviceToHost, stream[streamId]) );

        chunkOnStream[streamId] = nextChunk++;

        // switch stream
        streamId ^= 1;
    }

    // stop timer (after all queued work has finished)
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));

    // get result: harvest the chunks that were still in flight when the
    // loop ended. streamId always designates the stream holding the older
    // chunk, so checking it first reports the earliest hit.
    for (int i = 0; i < 2 && foundChunk < 0; ++i)
    {
        int s = streamId ^ i;
        if (chunkOnStream[s] >= 0 && host_result[s] != 0xFFFFFFFF)
        {
            foundChunk = chunkOnStream[s];
            foundIndex = host_result[s];
        }
    }

    if (foundChunk < 0)
        printf("no result found!");
    else
        // 64-bit arithmetic: a multi-terabyte file holds more than 2^32 keys
        printf("result = %llu\n", (unsigned long long)foundChunk * keysPerChunk + foundIndex);

    // free all
    fclose(fp);

    // cudaStreamDestroy returns cudaSuccess (0), so it must be wrapped with
    // CUDA_SAFE_CALL. CUT_SAFE_CALL expects CUTTrue and therefore reported
    // an error on a successful destroy.
    CUDA_SAFE_CALL( cudaStreamDestroy(stream[0]) );
    CUDA_SAFE_CALL( cudaStreamDestroy(stream[1]) );

    CUDA_SAFE_CALL( cudaFree(device_result));
    CUDA_SAFE_CALL( cudaFreeHost(host_result));
    for (int i = 0; i < 2; ++i)
    {
        CUDA_SAFE_CALL( cudaFree(device_keys[i]));
        CUDA_SAFE_CALL( cudaFreeHost(host_keys[i]) );
    }

    //CUT_EXIT(argc, argv);
    return 0;
}[/codebox]

If you are working with a proprietary data format, then on a POSIX system, I would suggest using mmap(). The Windows API doesn’t have mmap(), but it does have MapViewOfFile(), which might make the file I/O side of things a little easier. You will probably want to overlap host-side disk I/O and GPU-side computation to improve performance.