Cuda Error

cmorrison · July 23, 2007, 4:02pm

I am getting the following error in my code. I thing i’ve made a bit of a balls up but just in case it isn’t a code mistake I thought I would make a post. Please find the error code below

ptxas error  : Entry function 'code' uses too much local data (0x1e000 bytes, 0x4000 max)

The error comes from somewhere in the following code (i guess), this is my first CUDA program that isn’t from the examples so it’s almost certainly really wrong.

I’m afraid i can only post the variable declarations etc…

[CODE]

main(int iargc, char **argv)

{

int nop=9;//

int nsamps=4; //

int ntrcin=6;

float stime=0.0;

int nsidelen;

float starttime; float sampleinterval; /* start time and sample interval */

int noPrefilteredInputTraceVersions; /* number of prefiltered input trace versions */

float fn; float rdfs;

int lengthInputTraces; /* length of input traces */

short noOutSamples; /* no. of o/p samples */

int noOutImageLocs; /* number of o/p image locations */

int noInputTraces; /* no. of input traces */

float d2p;

float *din;

int handt;

float out[MAX_NSAMPS_MAX_NOP]; /* o/p image buffer */

float velocityInfoBufferAtImageAddr; / offset to velocity info buffer at image address */

int ox[MAX_NOP]; /* output location coordinates */

int oy[MAX_NOP];

int rx[MAX_NTRACE]; /* receiver coordinates */

int ry[MAX_NTRACE];

int sx[MAX_NTRACE]; /* source coordinates */

int sy[MAX_NTRACE];

int kintvi2;

float dtrin[MAX_NTRACE]; /* topographic term */

float dtsin[MAX_NTRACE];

int nf=2;//

int i, j, k, nsre;

unsigned int hTimer;

int noversample=2;	/* switch off */

float si=4.0;

float sire;

char a;

int OPT_SZ;

float *vi2, *out_emu, *out_dimec;

int iop;

int   *iox, *ioy;

int   irx[MAX_NTRACE], iry[MAX_NTRACE], isx[MAX_NTRACE], isy[MAX_NTRACE];

clock_t startTime;

clock_t endTime;

float executionTime;

CUT_CHECK_DEVICE();

	j=0;

nsidelen = (int) sqrt((double) nop);

nop = nsidelen * nsidelen;

for (i=0;i<ntrcin;i++) {

  irx[i]=(i*25)+250;

  iry[i]=0;

  if (i/10 > j) j++;

  isx[i]=(j*50);

  isy[i]=0;

  dtrin[i]=0.0;

  dtsin[i]=0.0;

}

sire=si;

nsre=nsamps;

    if ( noversample > 0 ) {

  for (i=0;i<noversample;i++) {

    sire *= 0.50;

    nsre *= 2;

  }

}

    fn=500.0/si;

    rdfs=(float) nf/fn;

vi2 = (float *) malloc(nsamps * nop * sizeof(float));

din = (float *) malloc(ntrcin * nsre * nf * sizeof(float));

out_emu = (float *) malloc(nsamps * nop * sizeof(float));

for (k=0;k<nsamps;k++) {

  vi2[k]=1500.0 + ((float) (k) * 5.0);

  vi2[k]*=vi2[k];

  vi2[k]=1.0/vi2[k];

  din[k]=(float) cos((double) (stime + k*si));

}

for (k=1;k<ntrcin;k++) {

  memcpy(&(din[k*nsamps]), din, nsamps*sizeof(float));

}

iox=(int *) malloc(sizeof(int) * nop);

ioy=(int *) malloc(sizeof(int) * nop);

    iop=0;

for (i=0;i<nsidelen;i++) {

  for (j=0;j<nsidelen;j++) {

    if (iop>0) {

      memcpy(&(vi2[iop*nsamps]), vi2, nsamps*sizeof(float));

    }

    iox[iop] = i*25;

    iox[iop] = j*25;

    out_emu[iop] = 0.0;

    iop++;

  }

}

d2p=25.0;

handt=0;

OPT_SZ = sizeof(float);

startTime = clock();



  //  CUDA_SAFE_CALL( cudaMalloc((void **)&starttime, OPT_SZ)  );

    CUDA_SAFE_CALL( cudaMalloc((void **)&sampleinterval,  OPT_SZ)  );

	CUDA_SAFE_CALL( cudaMalloc((void **)&fn,   OPT_SZ) );

    CUDA_SAFE_CALL( cudaMalloc((void **)&rdfs,  OPT_SZ) );

    CUDA_SAFE_CALL( cudaMalloc((void **)&d2p,   OPT_SZ) );

	CUDA_SAFE_CALL( cudaMalloc((void **)&din,   OPT_SZ) );

    CUDA_SAFE_CALL( cudaMalloc((void **)&out,  OPT_SZ) );

    CUDA_SAFE_CALL( cudaMalloc((void **)&velocityInfoBufferAtImageAddr, OPT_SZ) );

    CUDA_SAFE_CALL( cudaMalloc((void **)&dtrin,   (sizeof(float)*MAX_NTRACE)) );

    CUDA_SAFE_CALL( cudaMalloc((void **)&dtsin,  (sizeof(float)*MAX_NTRACE)) );

printf("...copying input data to GPU mem.\n");

    //Copy options data to GPU memory for further processing

/* CUDA_SAFE_CALL( cudaMemcpy(starttime, stime, sizeof(float), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(sampleinterval, si,  OPT_SZ, cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(noPrefilteredInputTraceVersions,  nf,   sizeof(int), cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(fn,  fn,   OPT_SZ, cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(rdfs, rdfs,  OPT_SZ, cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(lengthInputTraces,  nsamps,   sizeof(int), cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(noOutSamples,  nsamps,   sizeof(short), cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(noOutImageLocs,  nop,   sizeof(int), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(noInputTraces, ntrcin,  sizeof(int), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(d2p,  d2p,   OPT_SZ, cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(din[MAX_NF_MAX_NSAMPS],  din,   OPT_SZ, cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(out[MAX_NSMAP_MAX_NOP], out_emu,  OPT_SZ, cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(*velocityInfoBufferAtImageAddr,  vi2,   OPT_SZ, cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(ox[MAX_NOP],  iox,   (sizeof(int)*MAX_NOP), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(oy[MAX_NOP], ioy,  (sizeof(int)*MAX_NOP), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(rx[MAX_NTRACE],  irx,  (sizeof(int)*MAX_NTRACE), cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(ry[MAX_NTRACE],  iry,   (sizeof(int)*MAX_NTRACE), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(sx[MAX_NTRACE], isx,  (sizeof(int)*MAX_NTRACE), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(sy[MAX_NTRACE],  isy,   (sizeof(int)*MAX_NTRACE), cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(dtrin[MAX_NTRACE],  dtrin,   (sizeof(float)*MAX_NTRACE), cudaMemcpyHostToDevice) );

    CUDA_SAFE_CALL( cudaMemcpy(dtsin[MAX_NTRACE], dtsin,  (sizeof(float)*MAX_NTRACE), cudaMemcpyHostToDevice) );

*/

printf(“Data init done.\n”);

printf(“Executing GPU kernel…\n”);

    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    CUT_SAFE_CALL( cutResetTimer(hTimer) );

    CUT_SAFE_CALL( cutStartTimer(hTimer) );

    code<<<512, 512>>>(

   stime, si,                 /* start time and sample interval */

nf, /* number of prefiltered input trace versions */

fn, rdfs,

nsamps, /* length of input traces */

nsamps, /* no. of o/p samples */

nop, /* number of o/p image locations */

ntrcin, /* no. of input traces */

d2p,

din, /* input data */

out_emu, /* o/p image buffer */

vi2, /* velocity info buffer at image */

iox, /* output location coordinates */

ioy,

irx, /* receiver coordinates */

iry,

isx, /* source coordinates */

isy,

dtrin, /* topographic term */

dtsin); /* topographic term */

    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    CUT_SAFE_CALL( cutStopTimer(hTimer) );

e.ping · July 24, 2007, 9:59pm

What are the values of MAX_NOP, MAX_NTRACE, MAX_NSAMPS_MAX_NOP etc? Since it is these that determine the amount of local data that is allocated, it would be helpful if they were provided in the code snapshot. Also, are you certain that they are fully defined and not assigned some random garbage?

cmorrison · July 25, 2007, 9:06am

Sorry, my bad. Does this help?

#define MAX_NTRACE (256)

#define MAX_NSAMPS (4096)

#define MAX_NOP    (100000)

#define MAX_NF     (20)

#define MAX_NF_MAX_NSAMPS (MAX_NF*MAX_NSAMPS)

#define MAX_NSAMPS_MAX_NOP  (MAX_NSAMPS*MAX_NOP)

e.ping · July 25, 2007, 12:17pm

Well, if we work back from the error and deduce that CUDA imposes some limit (0x4000) on the amount of local data that you can have in a function, then you are clearly going to have problems with these numbers.

For starters, you declare three int arrays of size 100,000 elements (based on MAX_NOP) plus a float array of size 409,600,000 elements (based on MAX_NSAMPS_MAX_NOP). - These are bound to blow those limits!

I’m sorry that my news is basically negative - I’m sure there must be a way of working around this rather low limit, but I don’t really know what it might be (allocate on the heap, probably).

e.ping · July 25, 2007, 12:30pm

Doh! I just re-read your code - you are allocating on (the CUDA) heap; you’re just not doing it correctly! External Image

The declaration: int foobar[12345] explicitly allocates 12345*sizeof(int) bytes of local memory, which is what is causing your problems.

Later on though, you are attempting to allocate the memory for foobar on the cuda device, using cudaMalloc. However, you need to note two points:

The amount of memory you need to allocate will typically be 12345*sizeof(float) (NOT just sizeof(float) as you are doing)
- The variable foobar just needs to be declared as a float* (or an unsized array, of the form float foobar - which is effectively equivalent, albeit a bit clearer to the reader).

Those two changes ought to solve most of your problems.

Incidentally, on a minor coding issue, I find your comments a bit redundant, e.g.:

int lengthInputTraces; /* length of input traces */

If the variable names are meaningful (and they ought to be) then the comments add nothing but noise! ;)

cmorrison · July 25, 2007, 1:41pm

Thanks I’ll have a look at these suggestions.

Part of the problem is i’ve tried to port an FPGA code into a CUDA code without really paying attention to what I was doing.

As for my idiotic comment that sbecause the original variables had helpful names such as lit and as I read through the original code I commented them so I could understand what previous developers had done. Then I decided to change the variable names and must have forgot to remove some of the ocmments!

int lengthInputTraces; /* length of input traces */

Thanks for the help,

Chris

e.ping · July 25, 2007, 3:12pm

Hmmmm - FPGA to CUDA: Fun! :P or Magic? :magic:

cmorrison · July 25, 2007, 3:20pm

Sadly there isn’t a pictre of a guy yanking his hair out of his head so this will have to do External Media

Topic		Replies	Views
Cuda code performance CUDA Programming and Performance	14	3129	December 16, 2014
Deep subroutine use causes cudaError_enum? CUDA Programming and Performance	14	14384	February 6, 2010
Calling CUDA C from fortran CUDA Programming and Performance	4	906	December 4, 2021
Inconsistent Cuda Errors CUDA Programming and Performance	10	1058	July 7, 2017
Error running simple CUDA Fortran program Legacy PGI Compilers	9	21313	February 26, 2010
Cuda unknown error on the shaded line.can anyone help? CUDA Programming and Performance	4	1264	March 12, 2015
Problem with CUDA code,overflow? CUDA Programming and Performance	9	1194	September 10, 2020
GPU Transfer problems GPU won't correctly read data out from Device to Host CUDA Programming and Performance	15	2633	August 2, 2010
Internal Profiling error - insufficient kernel bounds data CUDA Programming and Performance	8	4633	May 9, 2016
cuda memory issues Legacy PGI Compilers	13	9510	March 19, 2012

Cuda Error

Related topics