Cuda Error

I am getting the following error in my code. I thing i’ve made a bit of a balls up but just in case it isn’t a code mistake I thought I would make a post. Please find the error code below

ptxas error  : Entry function 'code' uses too much local data (0x1e000 bytes, 0x4000 max)

The error comes from somewhere in the following code (i guess), this is my first CUDA program that isn’t from the examples so it’s almost certainly really wrong.

I’m afraid i can only post the variable declarations etc…

[CODE]

main(int iargc, char **argv)

{

int nop=9;//

int nsamps=4; //

int ntrcin=6;

float stime=0.0;

int nsidelen;

float starttime; float sampleinterval; /* start time and sample interval */

int noPrefilteredInputTraceVersions; /* number of prefiltered input trace versions */

float fn; float rdfs;

int lengthInputTraces; /* length of input traces */

short noOutSamples; /* no. of o/p samples */

int noOutImageLocs; /* number of o/p image locations */

int noInputTraces; /* no. of input traces */

float d2p;

float *din;

int handt;

float out[MAX_NSAMPS_MAX_NOP]; /* o/p image buffer */

float velocityInfoBufferAtImageAddr; / offset to velocity info buffer at image address */

int ox[MAX_NOP]; /* output location coordinates */

int oy[MAX_NOP];

int rx[MAX_NTRACE]; /* receiver coordinates */

int ry[MAX_NTRACE];

int sx[MAX_NTRACE]; /* source coordinates */

int sy[MAX_NTRACE];

int kintvi2;

float dtrin[MAX_NTRACE]; /* topographic term */

float dtsin[MAX_NTRACE];

int nf=2;//

int i, j, k, nsre;

unsigned int hTimer;

int noversample=2;	/* switch off */

float si=4.0;

float sire;

char a;

int OPT_SZ;

float *vi2, *out_emu, *out_dimec;

int iop;

int   *iox, *ioy;

int   irx[MAX_NTRACE], iry[MAX_NTRACE], isx[MAX_NTRACE], isy[MAX_NTRACE];

clock_t startTime;

clock_t endTime;

float executionTime;

CUT_CHECK_DEVICE();

	j=0;

nsidelen = (int) sqrt((double) nop);

nop = nsidelen * nsidelen;

for (i=0;i<ntrcin;i++) {

  irx[i]=(i*25)+250;

  iry[i]=0;

  if (i/10 > j) j++;

  isx[i]=(j*50);

  isy[i]=0;

  dtrin[i]=0.0;

  dtsin[i]=0.0;

}

sire=si;

nsre=nsamps;

    if ( noversample > 0 ) {

  for (i=0;i<noversample;i++) {

    sire *= 0.50;

    nsre *= 2;

  }

}

    fn=500.0/si;

    rdfs=(float) nf/fn;

vi2 = (float *) malloc(nsamps * nop * sizeof(float));

din = (float *) malloc(ntrcin * nsre * nf * sizeof(float));

out_emu = (float *) malloc(nsamps * nop * sizeof(float));

for (k=0;k<nsamps;k++) {

  vi2[k]=1500.0 + ((float) (k) * 5.0);

  vi2[k]*=vi2[k];

  vi2[k]=1.0/vi2[k];

  din[k]=(float) cos((double) (stime + k*si));

}

for (k=1;k<ntrcin;k++) {

  memcpy(&(din[k*nsamps]), din, nsamps*sizeof(float));

}

iox=(int *) malloc(sizeof(int) * nop);

ioy=(int *) malloc(sizeof(int) * nop);

    iop=0;

for (i=0;i<nsidelen;i++) {

  for (j=0;j<nsidelen;j++) {

    if (iop>0) {

      memcpy(&(vi2[iop*nsamps]), vi2, nsamps*sizeof(float));

    }

    iox[iop] = i*25;

    iox[iop] = j*25;

    out_emu[iop] = 0.0;

    iop++;

  }

}

d2p=25.0;

handt=0;

OPT_SZ = sizeof(float);

startTime = clock();



  //  CUDA_SAFE_CALL( cudaMalloc((void **)&starttime, OPT_SZ)  );

    CUDA_SAFE_CALL( cudaMalloc((void **)&sampleinterval,  OPT_SZ)  );

	CUDA_SAFE_CALL( cudaMalloc((void **)&fn,   OPT_SZ) );

    CUDA_SAFE_CALL( cudaMalloc((void **)&rdfs,  OPT_SZ) );

    CUDA_SAFE_CALL( cudaMalloc((void **)&d2p,   OPT_SZ) );

	CUDA_SAFE_CALL( cudaMalloc((void **)&din,   OPT_SZ) );

    CUDA_SAFE_CALL( cudaMalloc((void **)&out,  OPT_SZ) );

    CUDA_SAFE_CALL( cudaMalloc((void **)&velocityInfoBufferAtImageAddr, OPT_SZ) );

    CUDA_SAFE_CALL( cudaMalloc((void **)&dtrin,   (sizeof(float)*MAX_NTRACE)) );

    CUDA_SAFE_CALL( cudaMalloc((void **)&dtsin,  (sizeof(float)*MAX_NTRACE)) );

printf("...copying input data to GPU mem.\n");

    //Copy options data to GPU memory for further processing

/* CUDA_SAFE_CALL( cudaMemcpy(starttime, stime, sizeof(float), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(sampleinterval, si,  OPT_SZ, cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(noPrefilteredInputTraceVersions,  nf,   sizeof(int), cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(fn,  fn,   OPT_SZ, cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(rdfs, rdfs,  OPT_SZ, cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(lengthInputTraces,  nsamps,   sizeof(int), cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(noOutSamples,  nsamps,   sizeof(short), cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(noOutImageLocs,  nop,   sizeof(int), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(noInputTraces, ntrcin,  sizeof(int), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(d2p,  d2p,   OPT_SZ, cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(din[MAX_NF_MAX_NSAMPS],  din,   OPT_SZ, cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(out[MAX_NSMAP_MAX_NOP], out_emu,  OPT_SZ, cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(*velocityInfoBufferAtImageAddr,  vi2,   OPT_SZ, cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(ox[MAX_NOP],  iox,   (sizeof(int)*MAX_NOP), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(oy[MAX_NOP], ioy,  (sizeof(int)*MAX_NOP), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(rx[MAX_NTRACE],  irx,  (sizeof(int)*MAX_NTRACE), cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(ry[MAX_NTRACE],  iry,   (sizeof(int)*MAX_NTRACE), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(sx[MAX_NTRACE], isx,  (sizeof(int)*MAX_NTRACE), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(sy[MAX_NTRACE],  isy,   (sizeof(int)*MAX_NTRACE), cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(dtrin[MAX_NTRACE],  dtrin,   (sizeof(float)*MAX_NTRACE), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(dtsin[MAX_NTRACE], dtsin,  (sizeof(float)*MAX_NTRACE), cudaMemcpyHostToDevice) );

*/

printf(“Data init done.\n”);

printf(“Executing GPU kernel…\n”);

    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    CUT_SAFE_CALL( cutResetTimer(hTimer) );

    CUT_SAFE_CALL( cutStartTimer(hTimer) );

    code<<<512, 512>>>(

   stime, si,                 /* start time and sample interval */

nf, /* number of prefiltered input trace versions */

fn, rdfs,

nsamps, /* length of input traces */

nsamps, /* no. of o/p samples */

nop, /* number of o/p image locations */

ntrcin, /* no. of input traces */

d2p,

din, /* input data */

out_emu, /* o/p image buffer */

vi2, /* velocity info buffer at image */

iox, /* output location coordinates */

ioy,

irx, /* receiver coordinates */

iry,

isx, /* source coordinates */

isy,

dtrin, /* topographic term */

dtsin); /* topographic term */

    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    CUT_SAFE_CALL( cutStopTimer(hTimer) );

What are the values of MAX_NOP, MAX_NTRACE, MAX_NSAMPS_MAX_NOP etc? Since it is these that determine the amount of local data that is allocated, it would be helpful if they were provided in the code snapshot. Also, are you certain that they are fully defined and not assigned some random garbage?

Sorry, my bad. Does this help?

#define MAX_NTRACE (256)

#define MAX_NSAMPS (4096)

#define MAX_NOP    (100000)

#define MAX_NF     (20)

#define MAX_NF_MAX_NSAMPS (MAX_NF*MAX_NSAMPS)

#define MAX_NSAMPS_MAX_NOP  (MAX_NSAMPS*MAX_NOP)

Well, if we work back from the error and deduce that CUDA imposes some limit (0x4000) on the amount of local data that you can have in a function, then you are clearly going to have problems with these numbers.

For starters, you declare three int arrays of size 100,000 elements (based on MAX_NOP) plus a float array of size 409,600,000 elements (based on MAX_NSAMPS_MAX_NOP). - These are bound to blow those limits!

I’m sorry that my news is basically negative - I’m sure there must be a way of working around this rather low limit, but I don’t really know what it might be (allocate on the heap, probably).

Doh! I just re-read your code - you are allocating on (the CUDA) heap; you’re just not doing it correctly! External Image

The declaration: int foobar[12345] explicitly allocates 12345*sizeof(int) bytes of local memory, which is what is causing your problems.

Later on though, you are attempting to allocate the memory for foobar on the cuda device, using cudaMalloc. However, you need to note two points:

  • The amount of memory you need to allocate will typically be 12345*sizeof(float) (NOT just sizeof(float) as you are doing)

    • The variable foobar just needs to be declared as a float* (or an unsized array, of the form float foobar - which is effectively equivalent, albeit a bit clearer to the reader).

Those two changes ought to solve most of your problems.

Incidentally, on a minor coding issue, I find your comments a bit redundant, e.g.:

int lengthInputTraces; /* length of input traces */

If the variable names are meaningful (and they ought to be) then the comments add nothing but noise! ;)

Thanks I’ll have a look at these suggestions.

Part of the problem is i’ve tried to port an FPGA code into a CUDA code without really paying attention to what I was doing.

As for my idiotic comment that sbecause the original variables had helpful names such as lit and as I read through the original code I commented them so I could understand what previous developers had done. Then I decided to change the variable names and must have forgot to remove some of the ocmments!

int lengthInputTraces; /* length of input traces */

Thanks for the help,

Chris

Hmmmm - FPGA to CUDA: Fun! :P or Magic? :magic:

Sadly there isn’t a pictre of a guy yanking his hair out of his head so this will have to do External Media