Hi All!
The test task is simple:
-
create 2 buffers in host memory (pinned)
-
create 2 buffers in device global memory (cudaMalloc)
-
load text file (~50Mb) to host in_buffer_host
-
transfer it to the device: in_buffer_host → in_buffer_device
-
transfer inside device (kernel function, 1 block 1 thread): in_buffer_device → out_buffer_device
-
transfer back to the host: out_buffer_device → out_buffer_host
-
write out_buffer_host to another file
But if I copy 50Mb - output file contains zeroes :(
If I copy ~10Mb or ~15Mb - test passes!
Are there some global memory access limitations?
Could you please explain what’s wrong with the following code?
Full source code here:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <windows.h>
// includes, project
#include <cutil.h>
#include <cuda.h>
#define NUM_BLOCKS 1
#define NUM_THREADS 1
__global__ void Copy(char *from, char *to, int n)
{
for(int i=0; i < n; i++) to[i] = from[i];
}
int main(int argc, char** argv)
{
unsigned int memSize = 50000000; // ~50Mb
FILE *F;
// host vars
char *h_idata = NULL;
char *h_odata = NULL;
// device vars
char *d_idata = NULL;
char *d_odata = NULL;
CUT_DEVICE_INIT(argc, argv);
//pinned memory mode - use special function to get OS-pinned memory
CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_idata, memSize ) );
CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_odata, memSize ) );
//allocate device memory
CUDA_SAFE_CALL(cudaMalloc((void**)&d_idata, memSize));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_odata, memSize));
//load file
F = fopen("in.txt", "rb");
fread(h_idata, 1, memSize, F);
fclose(F);
unsigned int timer = 0;
float elapsedTimeInMs = 0.0f;
CUT_SAFE_CALL( cutCreateTimer( &timer ) );
CUT_SAFE_CALL( cutStartTimer( timer));
// copy host to device
CUDA_SAFE_CALL(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
// copy inside device
Copy<<<NUM_BLOCKS, NUM_THREADS>>>(d_idata, d_odata, memSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
// copy device to host
CUDA_SAFE_CALL(cudaMemcpy(h_odata, d_odata, memSize, cudaMemcpyDeviceToHost));
CUT_SAFE_CALL( cutStopTimer( timer));
elapsedTimeInMs = cutGetTimerValue( timer);
printf("Elapsed: %f seconds\n", elapsedTimeInMs / (float)1000);
// write to file
F = fopen("out.txt", "wb");
fwrite(h_odata, 1, memSize, F);
fclose(F);
//clean up memory
CUDA_SAFE_CALL(cudaFreeHost(h_idata));
CUDA_SAFE_CALL(cudaFreeHost(h_odata));
CUDA_SAFE_CALL(cudaFree(d_idata));
CUDA_SAFE_CALL(cudaFree(d_odata));
CUT_EXIT(argc, argv);
}
Any suggestions?