I really need to optimize this kernel

Hi there. My efforts to optimize my kernel have reached an impasse. This is the core of the most resource-intensive part of my program. Can anyone suggest how it can be optimized for speed?
I have posted the whole kernel code:

.version 3.0
.target sm_30
.address_size 64

// Memory map of the parameter block (byte offsets):
//0-31 		arrayoffset
//32-63 	header(32)
//64-71		nonce(8)
//72-103	hashes-per-thread for SHA-3 (32) - loop counter
//104-135	hashes-per-thread for FNV (32)
//136-167	dagpointer(32)
//168-199	dagsize(32)
//200-231 	target

// _fnv128: per-thread FNV mixing loop over a DAG array (Ethash-style mix stage).
// Each group of 16 threads cooperates on one 200-byte mix buffer in global
// memory; each thread owns one 64-bit word of a 128-byte row in shared memory
// and repeatedly folds gathered DAG words into it using the 32-bit FNV-1
// prime (0x01000193), applied independently to the low and high 32-bit halves.
// NOTE(review): .address_size is 64, but every pointer here lives in a 32-bit
// register (mixpointer, dagpointer, Ppointer, ...) — this only works while all
// buffers sit in the low 4 GB of the address space; confirm on the host side.
.visible .entry _fnv128  (
	.param .u32 _fnv128_param_0
)
{
	
	.reg .b64 	%rt<3>, %rA, %rM;	
	.shared .align 64 .u64 A[1];      // size is dynamic, set from the host
	
	.reg .b32  threadId32,temp,round,mixpointer,mixzero,threadGid,Apointer,Apointerzero, counter,adradd, n_c,rS, Ppointer, dagpointer;
	.reg .b32 threadId4,threadGidM,temp2;
	.reg .pred p;
	.reg .b16  threadIdx, threadDim, blockIdx, blockDim;	

	ld.param.u32 	mixpointer, [_fnv128_param_0];
	mov.u32  Apointer,A;
	mov.u16  blockIdx,%ctaid.x;
	mov.u16  blockDim,%nctaid.x;	// NOTE(review): despite the name, %nctaid.x is the grid size (number of blocks)
	mov.u16  threadDim,%ntid.x;	// threads per block
	mov.u16  threadIdx,%tid.x;

	
	bar.sync 	0;
	prefetch.global.L1  [mixpointer+104];
	ld.global.u32 	counter, [mixpointer+104];      // load the loop count (hashes per thread)	
	ld.global.u32 	dagpointer, [mixpointer+136];	// load the pointer to the DAG array	
	ld.global.u32 	n_c, [mixpointer+168];		// load the DAG array size
	
	
	ld.global.u32 	temp, [mixpointer];		// load the array offset	
	add.u32 mixpointer,mixpointer,temp;	
	
			
	cvt.u32.u16  threadId32,threadIdx;	
	
	mul.wide.u16 temp,blockDim,threadDim;	
	shr.b32  temp,temp,4;			// total threads / 16 = number of 16-thread hash groups
	mul.lo.u32 adradd,temp,200;			// adradd - offset to the next batch of hashes (200 bytes per group)
	
	mul.wide.u16 temp,blockIdx,threadDim;
	add.u32 threadGid,temp,threadId32;	// global thread id
	shr.b32  mixzero,threadGid,4;		// hash-group index (16 threads per hash)
	
	shr.b32  temp,threadId32,4;	
	shl.b32  temp,temp,7;			// group index within the block * 128 bytes
	add.u32  Apointerzero,Apointer,temp;	        // Apointerzero - start of this thread group's row in shared memory
	
	shl.b32  temp,threadId32,3;		
	add.u32  Apointer,Apointer,temp;		// Apointer - this thread's 64-bit word in shared memory
	
	mul.lo.u32  mixzero,mixzero,200;		
	add.u32  mixzero,mixpointer,mixzero;	// mixzero - start of this thread group's 200-byte mix array
	and.b32 threadGid,threadId32,0x0f;	// lane index within the 16-thread group
	shl.b32  threadId32,threadGid,3;		// threadId32 - now holds (threadGid % 16) * 8
	add.u32     mixpointer,mixzero,threadId32;	// mixpointer - this thread's 64-bit word inside its 200-byte array
	
	and.b32 temp,threadGid,0x07;
	shl.b32  threadId4,temp,3;		// (lane % 8) * 8 - byte offset used when reading %rM below
	
	shl.b32  temp,threadGid,2;
	add.u32  threadGidM,temp,64;		// NOTE(review): threadGidM is computed but never read afterwards - dead code?
	
	//-------------------------	
	add.u32  dagpointer,dagpointer,threadId32;		// fold this thread's byte offset into the DAG pointer up front
	shr.b32  	n_c, n_c,7;							// fullsize / #HASH_BYTES(64)/mixhashes(#MIX_BYTES(128) / #HASH_BYTES(64))
	
	$LLBtemp1:	// outer loop: one iteration per hash
	
	
	
	bar.sync 	0;	
	ld.global.u32 	 rS,[mixzero];		// rS = *s, the first 32-bit word of the group's mix array
	// round 0, unrolled: for i == 0, ValueL(*mix + 0) == rS itself,
	// so the shared array does not need to be read yet.
	
	mul.lo.u32	temp,rS,0x01000193;	// 0x01000193 is the 32-bit FNV-1 prime
	xor.b32		temp,temp,rS;			// fnv(i ^ ValueL(*s), ValueL(*mix + i % w))
	
	rem.u32  temp, temp,n_c;			// % (n / mixhashes)  NOTE(review): rem.u32 is slow and data-dependent
	shl.b32  temp,temp,7;				// * mixhashes(#MIX_BYTES(128) / #HASH_BYTES(64)) * #HASH_BYTES(64)	
	add.u32  Ppointer,dagpointer,temp;	// Ppointer - DAG pointer at offset P plus this thread's offset
	
	
	add.u32  temp,mixzero,threadId4;	
	
	
	ld.global.u64 	 %rA,[Ppointer];	// gather from the DAG; P is effectively random per group
	
	ld.global.u64 	 %rM,[temp];	
	
	// Treat the 64-bit word as two independent 32-bit FNV lanes:
	// multiply low and high halves by the prime separately, then XOR in the DAG word.
	mul.lo.u64 		%rt0,%rM,0x01000193;													
	and.b64		%rt0,%rt0,0xffffffff;	
	shr.b64  	%rt1,%rM, 32;
	mul.lo.u64 	%rt1,%rt1,0x01000193;
	shl.b64  	%rt1,%rt1, 32;
	xor.b64		%rM,%rt0,%rt1;	
	xor.b64		%rM,%rM,%rA;
	
	st.shared.u64 	 [Apointer],%rM;

	mov.u32  	round,0x01;
	$LLBfnv1:	// inner loop: rounds 1..63
	// compute P for this round
	
	and.b32 temp,round,0x1f;			// i % w(32)
	shl.b32  temp,temp,2;
	add.u32  temp,Apointerzero,temp;
	bar.sync 	0;
	ld.shared.u32 	 temp2,[temp];		        // ValueL(*mix + i % w)	
	xor.b32		temp,rS,round;			// i ^ ValueL(*s)
	mul.lo.u32	temp,temp,0x01000193;
	xor.b32		temp,temp,temp2;		// fnv(i ^ ValueL(*s), ValueL(*mix + i % w))
	
	rem.u32  temp, temp,n_c;			// % (n / mixhashes)
	shl.b32  temp,temp,7;				// * mixhashes(#MIX_BYTES(128) / #HASH_BYTES(64)) * #HASH_BYTES(64)	
	add.u32  Ppointer,dagpointer,temp;	// Ppointer - DAG pointer at offset P plus this thread's offset

	
	ld.global.u64 	 %rA,[Ppointer];
	
	
	// same two-lane 32-bit FNV fold as in round 0
	mul.lo.u64 		%rt0,%rM,0x01000193;													
	and.b64		%rt0,%rt0,0xffffffff;	
	shr.b64  	%rt1,%rM, 32;
	mul.lo.u64 	%rt1,%rt1,0x01000193;
	shl.b64  	%rt1,%rt1, 32;
	xor.b64		%rM,%rt0,%rt1;	
	xor.b64		%rM,%rM,%rA;
	
	
	st.shared.u64 	 [Apointer],%rM;
	
	add.u32     round,round,1;
	setp.lo.u32 p,round,64;  	// 64 rounds in total
	
	@p bra.uni $LLBfnv1;
	//---------------------------------	
	
	
	st.global.u64 	 [mixpointer],%rM;	// write this thread's final mix word back to global memory
	
	add.u32     mixpointer,mixpointer,adradd;	// advance both pointers to the next batch of hashes
	add.u32     mixzero,mixzero,adradd;
	
	sub.u32     counter,counter,1;
	setp.ne.u32 p,counter,0;               
	@p bra.uni $LLBtemp1;
	
	ret;
}

I have tried different ways of writing the kernel: with 32-bit computation and one thread per hash (serially), with 32 threads per hash, and everything in between. The results are weak.
If the data at the pointer “mixpointer” is initially zero, the kernel runs relatively quickly. But as soon as real values appear there, the speed drops immediately. I understand that the kernel uses “rem” instructions and other operations whose run time depends on the data. But I know it should be possible to achieve faster results.

Did you start by writing your kernel in PTX? Do you have a CUDA C source version?

Yes, I started by writing the kernel in PTX; I don't have a C/C++ source.

Seems to me like that’s a rather premature optimization (or likely the opposite).

It’s also much easier for everyone else to help if you have CUDA C version, it will be much easier to understand what you are trying to do.

I would recommend posting a complete sample file that calls and runs your kernel, I think your chances of getting help will be much higher this way.

Unfortunately, I don't know C/C++ well enough to write the source code in it.
Okay, I’ll try to write at least a simplified version.

P.S. No, I probably won't write it in C/C++. I’ll wait for someone who knows PTX…

In general, I found the reason. The array referenced by “dagpointer” is a very big piece of memory, 1.5 GB. And “Ppointer” is a pointer to a specific location in that array, with a different value for each group of 16 threads. Most likely, the fact that every 16 threads try to fetch data from a different piece of memory is what causes the extra delay.
How can I get rid of it? Are there any restrictions on access to global memory besides alignment? All of my arrays are already aligned.