8800GTX - strange global memory problem... :( memory copy host->device->device->host

Hi All!

The test task is simple:

  1. create 2 buffers in host memory (pinned)

  2. create 2 buffers in device global memory (cudaMalloc)

  3. load text file (~50Mb) to host in_buffer_host

  4. transfer it to the device: in_buffer_host -> in_buffer_device

  5. transfer inside device (kernel function, 1 block 1 thread): in_buffer_device -> out_buffer_device

  6. transfer back to the host: out_buffer_device -> out_buffer_host

  7. write out_buffer_host to another file

But if I copy 50Mb - output file contains zeroes :(

If I copy ~10Mb or ~15Mb - test passes!

Are there some global memory access limitations?

Could you please explain what’s wrong with the following code?

Full source code here:

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <time.h>

#include <windows.h>

// includes, project

#include <cutil.h>

#include <cuda.h>

#define NUM_BLOCKS 1

#define NUM_THREADS 1

__global__ void Copy(char *from, char *to, int n)

{

     for(int i=0; i < n; i++) to[i] = from[i];

}

int main(int argc, char** argv)

{

unsigned int memSize = 50000000; // ~50Mb

FILE *F;

// host vars

char *h_idata = NULL;

char *h_odata = NULL;

// device vars

char *d_idata = NULL;

char *d_odata = NULL;

CUT_DEVICE_INIT(argc, argv);

//pinned memory mode - use special function to get OS-pinned memory

CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_idata, memSize ) );

CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_odata, memSize ) );

//allocate device memory

CUDA_SAFE_CALL(cudaMalloc((void**)&d_idata, memSize));

CUDA_SAFE_CALL(cudaMalloc((void**)&d_odata, memSize));

//load file

F = fopen("in.txt", "rb");

fread(h_idata, 1, memSize, F);

fclose(F);

unsigned int timer = 0;

float elapsedTimeInMs = 0.0f;

CUT_SAFE_CALL( cutCreateTimer( &timer ) );

CUT_SAFE_CALL( cutStartTimer( timer));

// copy host to device

CUDA_SAFE_CALL(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

// copy inside device

Copy<<<NUM_BLOCKS, NUM_THREADS>>>(d_idata, d_odata, memSize);

CUDA_SAFE_CALL(cudaThreadSynchronize());

// copy device to host

CUDA_SAFE_CALL(cudaMemcpy(h_odata, d_odata, memSize, cudaMemcpyDeviceToHost));

CUT_SAFE_CALL( cutStopTimer( timer));

elapsedTimeInMs = cutGetTimerValue( timer);

printf("Elapsed: %f seconds\n", elapsedTimeInMs / (float)1000);

// write to file

F = fopen("out.txt", "wb");

fwrite(h_odata, 1, memSize, F);

fclose(F);

//clean up memory

CUDA_SAFE_CALL(cudaFreeHost(h_idata));

CUDA_SAFE_CALL(cudaFreeHost(h_odata));

CUDA_SAFE_CALL(cudaFree(d_idata));

CUDA_SAFE_CALL(cudaFree(d_odata));

CUT_EXIT(argc, argv);

}

Any suggestions?

While OS are you on? That kernel looks like it would take quite a while to me - you may be hitting the watchdog.

Why are you using only one thread and block? Are you testing for some particular behaviour?

How long does it take to process 10-15MB?

WinXP Pro.

Could you please provide watchdog timeout value here?

As far as I understood, the kernel fails due to some timeout?

But when I do the following:

__global__ void Copy(char *from, char *to, int n)

{

for(int i=0; i < 10000000; i++) to[i] = from[i];

for(int i=10000000; i < 20000000; i++) to[i] = from[i];

for(int i=20000000; i < 30000000; i++) to[i] = from[i];

for(int i=30000000; i < 40000000; i++) to[i] = from[i];

for(int i=40000000; i < n; i++) to[i] = from[i];

}

It starts to work OK.

This means that watchdog can’t be a root cause of this problem, isn’t it?

On 8800GTX there is 780Mb of RAM, but some amount of it is consumed for desktop.

Does it mean that I can’t use for example 512Mb or 256Mb at a time?

When I use 16 threads - it doesn’t work even with 10Mb :(

When I decrease a memory per-kernel to 1Mb - it starts to work.

So, it looks like some memory limitation…

10-15 Mb - ~3 sec.

50Mb (fails) - ~7 sec.

Watchdog on XP is 5 seconds for a primary card.

Aha - this is a different kernel. In this case we know a lot more at compile time (as the 50,000,000 is not known at compile time), and so it probably runs in less time.

If this is a problem you are acutally facing I would suggest upping the threads to at least 50,000. This should get you a bit more pace.

I change the code to the following:

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <time.h>

#include <windows.h>

// includes, project

#include <cutil.h>

#include <cuda.h>

#define NUM_BLOCKS  1

#define NUM_THREADS 50000

__global__ void Copy(char *from, char *to, int n)

{

	const int start = threadIdx.x * 1000; // 0 <= threadIdx.x <= 49999

	const int end   = start + 1000;

    for(int i=start; i < end; i++) to[i] = from[i];

}

int main(int argc, char** argv)

{

unsigned int memSize = 50000000; // ~50Mb

FILE *F;

// host vars

char *h_idata = NULL;

char *h_odata = NULL;

// device vars

char *d_idata = NULL;

char *d_odata = NULL;

CUT_DEVICE_INIT(argc, argv);

// alloc memory

CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_idata, memSize ) );

memset(h_idata, 0x11, memSize);

CUDA_SAFE_CALL(cudaMalloc((void**)&d_idata, memSize));

CUDA_SAFE_CALL(cudaMemset(d_idata, 0x22, memSize));

CUDA_SAFE_CALL(cudaMalloc((void**)&d_odata, memSize));

CUDA_SAFE_CALL(cudaMemset(d_odata, 0x33, memSize));

CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_odata, memSize ) );

memset(h_odata, 0x44, memSize);

//load file

F = fopen("in.txt", "rb");

fread(h_idata, 1, memSize, F);

fclose(F);

unsigned int timer = 0;

float elapsedTimeInMs = 0.0f;

CUT_SAFE_CALL( cutCreateTimer( &timer ) );

CUT_SAFE_CALL( cutStartTimer( timer));

// copy host to device

CUDA_SAFE_CALL(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

// copy inside device

Copy<<<NUM_BLOCKS, NUM_THREADS>>>(d_idata, d_odata, memSize);

CUDA_SAFE_CALL(cudaThreadSynchronize());

// copy device to host

CUDA_SAFE_CALL(cudaMemcpy(h_odata, d_odata, memSize, cudaMemcpyDeviceToHost));

CUT_SAFE_CALL( cutStopTimer( timer));

elapsedTimeInMs = cutGetTimerValue( timer);

printf("Elapsed: %f seconds\n", elapsedTimeInMs / (float)1000);

// write to file

F = fopen("out.txt", "wb");

fwrite(h_odata, 1, memSize, F);

fclose(F);

//clean up memory

CUDA_SAFE_CALL(cudaFreeHost(h_idata));

CUDA_SAFE_CALL(cudaFreeHost(h_odata));

CUDA_SAFE_CALL(cudaFree(d_idata));

CUDA_SAFE_CALL(cudaFree(d_odata));

CUT_EXIT(argc, argv);

}

But I see out.txt full of 0x33 value (see cudaMemset(d_odata, 0x33, memSize))…

It works 0,03 sec - but result is nothing :(((

Are you compiling with _DEBUG? You should be getting all sorts of errors.

I would read through the programming guide again. It seems like you’ve missed a few things. The maximum number of threads per block is 512 (though it’s probably inefficient to go over 256 with a 8800). You’re trying to run 50,000 threads in one block.

I’m compiling RELEASE.

May I somehow:

  1. Switch off videocard in Windows;

  2. Do the calculations;

  3. Switch on videocard;

to prevent watchdog occur?

You mean that I should run 200 blocks with 250 threads ?

P.S. The final task will be: search a thousands of words in big text arrays.

I expect to search 1 word in 1 big document per thread rather than 1 word in 1 big document in multiple threads…

P.P.S. As far as I understood, one MP executes 1 block at a time, 8800GTX has 16MPs, so my optimal config would be: 16 blocks each with 256 threads?

or I may do 16X blocks each with 256Y threads?

I would compile with debug until it works then compile with release. CUDA_SAFE_CALL does nothing in release mode, but checks for errors in debug mode.

There is currently no way around the watchdog on windows (that I know of). I think there is meant to be a fix soon which will allow you to use your CUDA GPU as a secondary card (ie. not with display attached) which gets around the watchdog

200 blocks with 250 threads each sounds good. 1000 iterations per thread is still high, but it should be a lot faster than before. Probably well under a second in the kernel.

But in the real (not this test) task I expect to load much more huge array - near 512 megabytes. May I just increase blocks (divisible by 16) and threads (divisible by 256) or it will affect total performance and watchdog occurs?

This works stable on 50Mb:

#define NUM_BLOCKS  200

#define NUM_THREADS 250

__global__ void Copy(char *from, char *to, int n)

{

	const int sid   = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;

	const int start = sid * 1000;

	const int end   = start + 1000;

    for(int i=start; i <= end; i++) to[i] = from[i];

}

and it takes 0.14 seconds.

Thanks a lot!

Now I suggest to increase a) file b) blocks & threads.

and publish results here.

Why don’t you build your kernel like this:

__global__ void Copy(char *from, char *to, int n)

{ 

 Â int tid=threadId.x+blockId.x*blockDim.x;

 Â if(tid<n) //just to be sure, there should be no divergence

 Â  Â  to[tid]=from[tid]

}

and launch it with

256 threads per block and 204800 blocks. That gives memsize=52 428 800 bytes (exactly 50MB).

Your previous code with 50000 threads and one block couldn’t possibly work, you can have 512 threads per block max. You should go for many blocks of threads.

Having a single thread in a single block iterating over the whole thing might have triggered the watchdog. It is also serial programming, you weren’t exploiting massive parallelism at all.

Or maybe I’m missing something? You’re trying to do a simple parallel copy from one array to another, right?

Also, my example above might not be optimized. I think it may have uncoalesced reads (coalesced means each threads reads/writes a word, double word or quad word - one byte isn’t even a whole word). It’ll still most likely be much better than what you have there.