Runs fine in emulation mode but fails on a real device?

As the title says, I wrote a CUDA program like the following:

/* Stop with the CUDA runtime's own error text as soon as an API call fails,
   so a failure is reported at the call that caused it instead of surfacing
   later at an unrelated call. */
#define PSTM_CUDA_CHECK(call)                                              \
  do                                                                       \
    {                                                                      \
      cudaError_t pstm_check_err = (call);                                 \
      if (pstm_check_err != cudaSuccess)                                   \
        {                                                                  \
          printf ("CUDA error at %s:%d: %s\n", __FILE__, __LINE__,         \
                  cudaGetErrorString (pstm_check_err));                    \
          exit (-1);                                                       \
        }                                                                  \
    }                                                                      \
  while (0)

/* Upload the host-side input arrays to their device buffers.
   NOTE(review): every byte count uses sizeof (float); confirm that each
   buffer really holds 4-byte elements and that the element counts match
   the sizes the device buffers were allocated with. */
PSTM_CUDA_CHECK (cudaMemcpy (cudavels, vels, nvels * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudaoffs, offs, nfold * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudacdpx, cdpx, outtrace * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudacdpy, cdpy, outtrace * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudamtime, mtime, nmtime * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudaan, an, nan * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudatnow, tnow, ntnow * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudafpa, fpa, 17 * sizeof (float),
                             cudaMemcpyHostToDevice));
PSTM_CUDA_CHECK (cudaMemcpy (cudaapert, apert, napert * sizeof (float),
                             cudaMemcpyHostToDevice));
/* apwork needs to be uploaded only once. */
PSTM_CUDA_CHECK (cudaMemcpy (cudaapwork, apwork, napwork * sizeof (float),
                             cudaMemcpyHostToDevice));

          /* Launch a single thread in a single block.
             NOTE(review): cudaapert is passed for two consecutive
             parameters -- confirm that this is intended.  */
          pstmkernel <<< 1, 1 >>> (cudamods, &cudadatain[j * ndatain],
                                   cudavels, cudaoffs, cudaitmp,
                                   cudacdpx, cudacdpy, cudamtime,
                                   cudaan, cudatnow, cudafpa, sl, xl,
                                   shape, ntabs, cudaapert, cudaapert,
                                   cudaapwork, subap, crsap, sunout,
                                   ntnew, nband1, submin, submax, crsmin,
                                   crsmax, subinc, crsinc, sunout3, ndcrs,
                                   live, noutoff, moff, gathran[0],
                                   imgsum, myid, resamp, weight);
          /* cudaGetLastError () right after the launch only reports
             launch-time problems (bad configuration, too many arguments).  */
          cudaError_t err = cudaGetLastError ();
          /* The kernel runs asynchronously, so a fault inside it (for
             example an out-of-bounds access) is only reported by a
             synchronizing call.  cudaThreadSynchronize () is the call
             available in old toolkits; newer ones name it
             cudaDeviceSynchronize ().  */
          if (err == cudaSuccess)
            err = cudaThreadSynchronize ();
          if (err != cudaSuccess)
            {
              printf ("error execute pstmkernel on GPU: %s\n",
                      cudaGetErrorString (err));
              exit (-1);
            }

Does anyone have any ideas?

What’s the error output you get?

Have you tried running the emulated version in Valgrind? Some memory errors can go unnoticed in emulation mode, but crash the real version – and some are the other way around.

The error message is "unspecified launch failure".

Too many parameters?

Note that there’s a 256-byte limit on the kernel parameter list, and even if your code exceeds that limit, nvcc can still compile it into PTX and cubin without any warning.

I don’t think that is the problem: there are 14 float* parameters and 25 int parameters.

An int is 4 bytes and a float* is 8 bytes, so

the total is 14 × 8 + 25 × 4 = 212 bytes.

I tried using Valgrind to find the problem and got the following messages:

==20660== 2,416 bytes in 1 blocks are still reachable in loss record 102 of 111

==20660== at 0x4A1A649: operator new(unsigned long) (vg_replace_malloc.c:230)

==20660== by 0x4CDBA4E: std::__default_alloc_template<true, 0>::_S_chunk_alloc(unsigned long, int&) (in /usr/lib64/libstdc++.so.5.0.3)

==20660== by 0x4CDB959: std::__default_alloc_template<true, 0>::_S_refill(unsigned long) (in /usr/lib64/libstdc++.so.5.0.3)

==20660== by 0x4CDB4FA: std::__default_alloc_template<true, 0>::allocate(unsigned long) (in /usr/lib64/libstdc++.so.5.0.3)

==20660== by 0x4B3C09E: (within /usr/local/cuda/lib/libcudart.so.1.1)

==20660== by 0x4B3665E: (within /usr/local/cuda/lib/libcudart.so.1.1)

==20660== by 0x4B3D408: (within /usr/local/cuda/lib/libcudart.so.1.1)

==20660== by 0x4B2FA91: (within /usr/local/cuda/lib/libcudart.so.1.1)

==20660== by 0x4B2895A: cudaLaunch (in /usr/local/cuda/lib/libcudart.so.1.1)

==20660== by 0x44AADD: __device_stub__Z10pstmkernelPfS_S_S_S_S_S_PiS_S_S_iiiiS_S_S_iiiiiiiiiiiiiiiiiiiii (tmpxft_000050a7_00000000-0.stub.c:25)

==20660== by 0x439456: main (kqmpiapstm.cu:2656)

blabla