As the title says, I wrote a CUDA program like the following:
/*
 * Stage the host input arrays on the device, launch pstmkernel, and stop
 * with a descriptive message if any CUDA runtime call or the kernel fails.
 *
 * Every cudaMemcpy return code is checked: an unchecked failed copy leaves
 * the device buffer with stale contents and the problem only shows up later
 * as a confusing kernel failure.
 */
#define PSTM_CUDA_CHECK(call)                                               \
  do                                                                        \
    {                                                                       \
      cudaError_t pstm_status_ = (call);                                    \
      if (pstm_status_ != cudaSuccess)                                      \
        {                                                                   \
          printf ("CUDA error at %s:%d: %s\n", __FILE__, __LINE__,          \
                  cudaGetErrorString (pstm_status_));                       \
          exit (-1);                                                        \
        }                                                                   \
    }                                                                       \
  while (0)

/* Host -> device copies of the kernel inputs (element counts are in floats). */
PSTM_CUDA_CHECK (cudaMemcpy (cudavels, vels, nvels * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudaoffs, offs, nfold * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudacdpx, cdpx, outtrace * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudacdpy, cdpy, outtrace * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudamtime, mtime, nmtime * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudaan, an, nan * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudatnow, tnow, ntnow * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudafpa, fpa, 17 * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudaapert, apert, napert * sizeof (float),
                             cudaMemcpyHostToDevice));
/* The original copied apwork twice with identical arguments; one copy is
   enough.  NOTE(review): if the second copy was meant for a different
   buffer (e.g. cudaitmp, which is never filled here), add it explicitly. */
PSTM_CUDA_CHECK (cudaMemcpy (cudaapwork, apwork, napwork * sizeof (float),
                             cudaMemcpyHostToDevice));

/* Launch with a single block of a single thread.
   NOTE(review): cudaapert is passed for two consecutive parameters below;
   confirm against the pstmkernel signature that this is intended and not a
   copy/paste slip for another device buffer. */
pstmkernel <<< 1, 1 >>> (cudamods, &cudadatain[j * ndatain],
                         cudavels, cudaoffs, cudaitmp,
                         cudacdpx, cudacdpy, cudamtime,
                         cudaan, cudatnow, cudafpa, sl, xl,
                         shape, ntabs, cudaapert, cudaapert,
                         cudaapwork, subap, crsap, sunout,
                         ntnew, nband1, submin, submax, crsmin,
                         crsmax, subinc, crsinc, sunout3, ndcrs,
                         live, noutoff, moff, gathran[0],
                         imgsum, myid, resamp, weight);

/* cudaGetLastError() only reports launch-time problems (bad configuration,
   too many resources requested, ...).  Faults raised while the kernel runs
   are reported by the next synchronizing call, so wait for completion here
   to attribute them to this launch. */
cudaError_t err = cudaGetLastError ();
if (err == cudaSuccess)
  err = cudaDeviceSynchronize ();
if (err != cudaSuccess)
  {
    /* Include the runtime's description so the cause is visible. */
    printf ("error execute pstmkernel on GPU: %s\n",
            cudaGetErrorString (err));
    exit (-1);
  }

#undef PSTM_CUDA_CHECK
Does anyone have any ideas?