// Bitwise select: for each bit, takes y where x is 1 and z where x is 0
// (this is the form of the round-1 auxiliary function F in RFC 1321 / MD5).
// Every parameter is parenthesized, including the one under `~`, so that
// compound arguments such as `a | b` expand with the intended precedence.
#define F(x, y, z) (((x) & (y)) | ((~(x)) & (z)))
// Step wrapper: forwards its first eight arguments to DO with F as the
// mixing function. The additive constant `k` is passed through directly
// instead of being looked up as md5_const[const_idx]; the last two
// parameters (const_idx, unused_id) are accepted but never expanded.
#define FF2(a, b, c, d, x, s, k, const_idx, unused_id) \
    { DO(F, a, b, c, d, x, s, k /* md5_const[const_idx] */); }
�   const int ix = blockDim.x * blockIdx.x + threadIdx.x;
int data[4];
for(int i=0;i<4;i++)data[i]=sample_data[i];//
int a,b,c,d;
const int len = code_len[0];
for(int loop=0;loop<20000;loop++){
a=0x67452301-500+ix;b=0xefcdab89+loop;c=0x98badcfe;d=0x10325476;
FF2 (a, b, c, d, data[0], S11, 0xd76aa478,0,0); /* 1, 63 that lines more */
�  �  �  }
In this case the code reads sample_data and code_len[0], which are in constant memory, and it runs at full speed at grid=64, thread=128.
Code with data_d->sample_data[i] runs at full speed at grid=128, thread=128; data_d resides in device (global) memory.
Code with data[i]=123; (an immediate value) runs as slowly as the version using constant memory.
Execution results are the same; the only difference is speed.
Here are the benchmark results; the numbers are millions of hashes checked per second:
grid=128, thread=128
global: 188.317368
constant: 167.413727
immediate value: 168.691376
grid=64, thread=128
global: 188.188232
constant: 188.084717
immediate value: 189.572815
Moving the line “const int len = code_len[0];” to constants does not affect performance.