PTX Assembly Instruction timing project (comments and questions) trying to calculate instruction timings

Notes:
-I am just a beginner with cuda/ptx.
-Below I ran each f32 type instruction and timed it.
-I played around with the code by adding padding to try and minimize the times.
-I subtracted 52 clock cycles because this is the base time (the time measured when no test instruction is present).

//////////////////////Specs///////////////////////////
-GeForce 9600 GSO
-run 1x1 grid

//////////////////////Instructions///////////////////////////
Ticks: 6 for add.rn.f32 _df32, _sf32, _sf32;
Ticks: 14 for add.rn.sat.f32 _df32, _sf32, _sf32;
Ticks: 6 for add.rz.f32 _df32, _sf32, _sf32;
Ticks: 14 for add.rz.sat.f32 _df32, _sf32, _sf32;
Ticks: 6 for sub.rn.f32 _df32, _sf32, _sf32;
Ticks: 14 for sub.rn.sat.f32 _df32, _sf32, _sf32;
Ticks: 6 for sub.rz.f32 _df32, _sf32, _sf32;
Ticks: 14 for sub.rz.sat.f32 _df32, _sf32, _sf32;
Ticks: 6 for mul.rn.f32 _df32, _sf32, _sf32;
Ticks: 22 for mul.rn.sat.f32 _df32, _sf32, _sf32;
Ticks: 6 for mul.rz.f32 _df32, _sf32, _sf32;
Ticks: 22 for mul.rz.sat.f32 _df32, _sf32, _sf32;
Ticks: 6 for mad.f32 _df32, _sf32, _sf32, _sf32;
Ticks: 6 for mad.sat.f32 _df32, _sf32, _sf32, _sf32;
Ticks: 34 for div.f32 _df32, _sf32, _sf32;
Ticks: 32 for div.sat.f32 _df32, _sf32, _sf32;
Ticks: 6 for abs.f32 _df32, _sf32;
Ticks: 6 for neg.f32 _df32, _sf32;
Ticks: 6 for max.f32 _df32, _sf32, _sf32;
Ticks: 6 for min.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.eq.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.ne.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.lt.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.le.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.gt.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.ge.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.equ.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.neu.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.ltu.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.leu.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.gtu.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.geu.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.num.f32.f32 _df32, _sf32, _sf32;
Ticks: 14 for set.nan.f32.f32 _df32, _sf32, _sf32;
Ticks: 844 for set.eq.and.f32.f32 _df32, _sf32, _sf32, !_sPdt; <—accessing off-chip memory?
Ticks: 66 for set.eq.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 828 for set.eq.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.eq.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 814 for set.eq.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.eq.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 932 for set.ne.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.ne.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 804 for set.ne.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.ne.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 826 for set.ne.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.ne.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 824 for set.lt.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.lt.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 826 for set.lt.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.lt.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 832 for set.lt.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.lt.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 826 for set.le.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.le.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 830 for set.le.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.le.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 830 for set.le.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.le.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 846 for set.gt.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.gt.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 846 for set.gt.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.gt.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 830 for set.gt.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.gt.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 808 for set.ge.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.ge.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 802 for set.ge.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.ge.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 806 for set.ge.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.ge.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 822 for set.equ.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.equ.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 846 for set.equ.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.equ.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 806 for set.equ.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.equ.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 846 for set.neu.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.neu.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 804 for set.neu.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.neu.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 828 for set.neu.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.neu.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 846 for set.ltu.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.ltu.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 808 for set.ltu.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.ltu.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 852 for set.ltu.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.ltu.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 842 for set.leu.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.leu.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 844 for set.leu.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.leu.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 856 for set.leu.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.leu.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 824 for set.gtu.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.gtu.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 828 for set.gtu.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.gtu.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 846 for set.gtu.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.gtu.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 842 for set.geu.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.geu.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 850 for set.geu.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.geu.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 852 for set.geu.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.geu.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 828 for set.num.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.num.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 830 for set.num.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.num.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 874 for set.num.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.num.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 844 for set.nan.and.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.nan.and.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 822 for set.nan.or.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.nan.or.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 846 for set.nan.xor.f32.f32 _df32, _sf32, _sf32, !_sPdt;
Ticks: 66 for set.nan.xor.f32.f32 _df32, _sf32, _sf32, _sPdt;
Ticks: 6 for setp.eq.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.ne.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.lt.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.le.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.gt.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.ge.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.equ.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.neu.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.ltu.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.leu.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.gtu.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.geu.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.num.f32 _dPdt, _sf32, _sf32;
Ticks: 6 for setp.nan.f32 _dPdt, _sf32, _sf32;
Ticks: 836 for setp.eq.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 796 for setp.eq.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.eq.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 820 for setp.eq.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.eq.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 844 for setp.eq.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.ne.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.ne.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 820 for setp.ne.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 824 for setp.ne.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.ne.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.ne.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 838 for setp.lt.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.lt.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 802 for setp.lt.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 818 for setp.lt.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.lt.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.lt.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 838 for setp.le.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 798 for setp.le.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.le.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 798 for setp.le.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.le.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 828 for setp.le.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.gt.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.gt.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 798 for setp.gt.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 800 for setp.gt.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.gt.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.gt.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 806 for setp.ge.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 796 for setp.ge.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.ge.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 800 for setp.ge.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.ge.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 808 for setp.ge.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.equ.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.equ.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 838 for setp.equ.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 796 for setp.equ.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.equ.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 800 for setp.equ.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.neu.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.neu.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 822 for setp.neu.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 822 for setp.neu.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.neu.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.neu.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 816 for setp.ltu.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.ltu.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 816 for setp.ltu.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 840 for setp.ltu.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.ltu.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.ltu.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 818 for setp.leu.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 836 for setp.leu.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.leu.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 818 for setp.leu.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.leu.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 824 for setp.leu.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.gtu.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.gtu.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 822 for setp.gtu.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 840 for setp.gtu.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.gtu.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.gtu.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 840 for setp.geu.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 796 for setp.geu.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.geu.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 840 for setp.geu.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.geu.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 822 for setp.geu.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.num.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.num.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 804 for setp.num.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 798 for setp.num.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.num.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 844 for setp.num.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.nan.and.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.nan.and.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 828 for setp.nan.or.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 820 for setp.nan.or.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 58 for setp.nan.xor.f32 _dPdt, _sf32, _sf32, !_sPdt;
Ticks: 58 for setp.nan.xor.f32 _dPdt, _sf32, _sf32, _sPdt;
Ticks: 0 for selp.f32 _df32, _sf32, _sf32, _sPdt; <-- 0 because compiler removes this line (makes sense)
Ticks: 0 for slct.f32.f32 _df32, _sf32, _sf32, _sf32;<-- 0 because compiler removes this line (makes sense)
Ticks: 6 for cvt.sat.f32.f32 _df32, _sf32;
Ticks: 14 for rcp.f32 _df32, _sf32;
Ticks: 32 for sqrt.f32 _df32, _sf32;
Ticks: 14 for rsqrt.f32 _df32, _sf32;
Ticks: 26 for sin.f32 _df32, _sf32;
Ticks: 26 for cos.f32 _df32, _sf32;
Ticks: 14 for lg2.f32 _df32, _sf32;
Ticks: 26 for ex2.f32 _df32, _sf32;

//////////////////////The Test Code////////////////////////
.version 1.3
.target sm_10

// Single-instruction timing harness.
//
// The kernel samples %clock, executes the instruction pasted at the marked
// spot, samples %clock again, and writes both samples to the buffer whose
// address is passed in __parm_timer:
//   [addr+0]  start clock sample
//   [addr+4]  end clock sample minus the 52-cycle empty-harness baseline
//   [addr+48] _df32   (written only when _dPdt is true)
//   [addr+56] _filler
// The last two stores exist only so that the results of the timed
// instructions are consumed.

.reg .u32 startTime;    // %clock sample taken before the instruction under test
.reg .u32 endTime;      // %clock sample taken after the instruction under test
.reg .u32 _filler;      // scratch register for the padding adds
.reg .u32 parmOffset;   // output address loaded from __parm_timer

.entry timedReduction
{
    // PTX ISA versions before 1.4 declare kernel parameters in the body.
    .param .u32 __parm_timer;
    .reg .u32 _su32;
    .reg .f32 _sf32;    // source operand; it was used below without being declared
    .reg .f32 _df32;
    .reg .pred _sPdt;
    .reg .pred _dPdt;

    // Operand setup, all of it outside the timed window.
    mov.u32 _su32, 3;
    cvt.rn.f32.u32 _sf32, _su32;            // _sf32 = 3.0

    // Give every register that is read later a defined value. Previously
    // _filler, _sPdt and _df32 were read without ever being written.
    mov.u32 _filler, 1;
    mov.f32 _df32, 0f00000000;              // 0.0
    setp.eq.u32 _sPdt, _su32, _su32;        // _sPdt = true

    add.u32 _filler, _filler, _filler;      // padding - avoids a read after write
    add.u32 _filler, _filler, _filler;      // padding - avoids a read after write

    // ---- timed window starts here ----
    mov.s32 startTime, %clock;
    add.u32 _filler, _filler, _filler;      // padding - avoids a read after write

    // ---------> put each instruction here <------------------

    add.u32 _filler, _filler, _filler;      // padding - avoids a read after write
    // NOTE(review): this setp executes inside the timed window for every
    // measurement. It guarantees that _dPdt is written before the predicated
    // store below, but it may also be a leftover test instruction - confirm
    // that the 52-cycle baseline was measured with it in place.
    setp.lt.or.f32 _dPdt, _sf32, _sf32, _sPdt;
    mov.s32 endTime, %clock;
    // ---- timed window ends here ----

    ld.param.u32 parmOffset, [__parm_timer];
    st.global.s32 [parmOffset+0], startTime;    // save start time
    sub.u32 endTime, endTime, 52;               // subtract the cost measured with no test instruction
    st.global.s32 [parmOffset+4], endTime;      // save baseline-corrected end time

    @_dPdt st.global.f32 [parmOffset+48], _df32;    // make sure _df32 and _dPdt are read
    st.global.u32 [parmOffset+56], _filler;         // make sure _filler is read

    exit;
}

//////////////////////comments////////////////////////
Why do some instructions take around 800 ticks? My guess is that these instructions internally need to store something in global memory. If so, running this with a larger grid size would work better, because the chip can work on other threads while it is waiting for global memory. I’m guessing that in the real world we would see a number much lower than 800 ticks because we are running more than one thread. However, I do not see why global memory access would be needed (if that is what is happening). Does anyone have any ideas on what is going on? Does my test code look okay?