I am getting the following error in my code. I thing i’ve made a bit of a balls up but just in case it isn’t a code mistake I thought I would make a post. Please find the error code below
ptxas error : Entry function 'code' uses too much local data (0x1e000 bytes, 0x4000 max)
The error comes from somewhere in the following code (i guess), this is my first CUDA program that isn’t from the examples so it’s almost certainly really wrong.
I’m afraid i can only post the variable declarations etc…
[CODE]
main(int iargc, char **argv)
{
int nop=9;//
int nsamps=4; //
int ntrcin=6;
float stime=0.0;
int nsidelen;
float starttime; float sampleinterval; /* start time and sample interval */
int noPrefilteredInputTraceVersions; /* number of prefiltered input trace versions */
float fn; float rdfs;
int lengthInputTraces; /* length of input traces */
short noOutSamples; /* no. of o/p samples */
int noOutImageLocs; /* number of o/p image locations */
int noInputTraces; /* no. of input traces */
float d2p;
float *din;
int handt;
float out[MAX_NSAMPS_MAX_NOP]; /* o/p image buffer */
float velocityInfoBufferAtImageAddr; / offset to velocity info buffer at image address */
int ox[MAX_NOP]; /* output location coordinates */
int oy[MAX_NOP];
int rx[MAX_NTRACE]; /* receiver coordinates */
int ry[MAX_NTRACE];
int sx[MAX_NTRACE]; /* source coordinates */
int sy[MAX_NTRACE];
int kintvi2;
float dtrin[MAX_NTRACE]; /* topographic term */
float dtsin[MAX_NTRACE];
int nf=2;//
int i, j, k, nsre;
unsigned int hTimer;
int noversample=2; /* switch off */
float si=4.0;
float sire;
char a;
int OPT_SZ;
float *vi2, *out_emu, *out_dimec;
int iop;
int *iox, *ioy;
int irx[MAX_NTRACE], iry[MAX_NTRACE], isx[MAX_NTRACE], isy[MAX_NTRACE];
clock_t startTime;
clock_t endTime;
float executionTime;
CUT_CHECK_DEVICE();
j=0;
nsidelen = (int) sqrt((double) nop);
nop = nsidelen * nsidelen;
for (i=0;i<ntrcin;i++) {
irx[i]=(i*25)+250;
iry[i]=0;
if (i/10 > j) j++;
isx[i]=(j*50);
isy[i]=0;
dtrin[i]=0.0;
dtsin[i]=0.0;
}
sire=si;
nsre=nsamps;
if ( noversample > 0 ) {
for (i=0;i<noversample;i++) {
sire *= 0.50;
nsre *= 2;
}
}
fn=500.0/si;
rdfs=(float) nf/fn;
vi2 = (float *) malloc(nsamps * nop * sizeof(float));
din = (float *) malloc(ntrcin * nsre * nf * sizeof(float));
out_emu = (float *) malloc(nsamps * nop * sizeof(float));
for (k=0;k<nsamps;k++) {
vi2[k]=1500.0 + ((float) (k) * 5.0);
vi2[k]*=vi2[k];
vi2[k]=1.0/vi2[k];
din[k]=(float) cos((double) (stime + k*si));
}
for (k=1;k<ntrcin;k++) {
memcpy(&(din[k*nsamps]), din, nsamps*sizeof(float));
}
iox=(int *) malloc(sizeof(int) * nop);
ioy=(int *) malloc(sizeof(int) * nop);
iop=0;
for (i=0;i<nsidelen;i++) {
for (j=0;j<nsidelen;j++) {
if (iop>0) {
memcpy(&(vi2[iop*nsamps]), vi2, nsamps*sizeof(float));
}
iox[iop] = i*25;
iox[iop] = j*25;
out_emu[iop] = 0.0;
iop++;
}
}
d2p=25.0;
handt=0;
OPT_SZ = sizeof(float);
startTime = clock();
// CUDA_SAFE_CALL( cudaMalloc((void **)&starttime, OPT_SZ) );
CUDA_SAFE_CALL( cudaMalloc((void **)&sampleinterval, OPT_SZ) );
CUDA_SAFE_CALL( cudaMalloc((void **)&fn, OPT_SZ) );
CUDA_SAFE_CALL( cudaMalloc((void **)&rdfs, OPT_SZ) );
CUDA_SAFE_CALL( cudaMalloc((void **)&d2p, OPT_SZ) );
CUDA_SAFE_CALL( cudaMalloc((void **)&din, OPT_SZ) );
CUDA_SAFE_CALL( cudaMalloc((void **)&out, OPT_SZ) );
CUDA_SAFE_CALL( cudaMalloc((void **)&velocityInfoBufferAtImageAddr, OPT_SZ) );
CUDA_SAFE_CALL( cudaMalloc((void **)&dtrin, (sizeof(float)*MAX_NTRACE)) );
CUDA_SAFE_CALL( cudaMalloc((void **)&dtsin, (sizeof(float)*MAX_NTRACE)) );
printf("...copying input data to GPU mem.\n");
//Copy options data to GPU memory for further processing
/* CUDA_SAFE_CALL( cudaMemcpy(starttime, stime, sizeof(float), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(sampleinterval, si, OPT_SZ, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(noPrefilteredInputTraceVersions, nf, sizeof(int), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(fn, fn, OPT_SZ, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(rdfs, rdfs, OPT_SZ, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(lengthInputTraces, nsamps, sizeof(int), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(noOutSamples, nsamps, sizeof(short), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(noOutImageLocs, nop, sizeof(int), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(noInputTraces, ntrcin, sizeof(int), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(d2p, d2p, OPT_SZ, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(din[MAX_NF_MAX_NSAMPS], din, OPT_SZ, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(out[MAX_NSMAP_MAX_NOP], out_emu, OPT_SZ, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(*velocityInfoBufferAtImageAddr, vi2, OPT_SZ, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(ox[MAX_NOP], iox, (sizeof(int)*MAX_NOP), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(oy[MAX_NOP], ioy, (sizeof(int)*MAX_NOP), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(rx[MAX_NTRACE], irx, (sizeof(int)*MAX_NTRACE), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(ry[MAX_NTRACE], iry, (sizeof(int)*MAX_NTRACE), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(sx[MAX_NTRACE], isx, (sizeof(int)*MAX_NTRACE), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(sy[MAX_NTRACE], isy, (sizeof(int)*MAX_NTRACE), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(dtrin[MAX_NTRACE], dtrin, (sizeof(float)*MAX_NTRACE), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(dtsin[MAX_NTRACE], dtsin, (sizeof(float)*MAX_NTRACE), cudaMemcpyHostToDevice) );
*/
printf(“Data init done.\n”);
printf(“Executing GPU kernel…\n”);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_SAFE_CALL( cutStartTimer(hTimer) );
code<<<512, 512>>>(
stime, si, /* start time and sample interval */
nf, /* number of prefiltered input trace versions */
fn, rdfs,
nsamps, /* length of input traces */
nsamps, /* no. of o/p samples */
nop, /* number of o/p image locations */
ntrcin, /* no. of input traces */
d2p,
din, /* input data */
out_emu, /* o/p image buffer */
vi2, /* velocity info buffer at image */
iox, /* output location coordinates */
ioy,
irx, /* receiver coordinates */
iry,
isx, /* source coordinates */
isy,
dtrin, /* topographic term */
dtsin); /* topographic term */
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutStopTimer(hTimer) );