Optimize kernel

Hello,

I am using CUDA 3.1 and a GTX 480.

This kernel is very expensive because it copies more than 1,000,000 elements. Is it possible to optimize it?

// Per-thread update with two independent parts, both driven by pos_flag:
//   1. rewrite m_Indices[i] from the pos_flag entries of the vertex it names;
//   2. for every i whose pos_flag entry differs from the next one, copy
//      active-vertex record i (and its vertex data) to slot pos_flag[i] of
//      the *NEW arrays.
//
// The parameter list is elided ("...") in this listing, so pos_flag, the m_*
// arrays and the m_* lengths used below are not declared in this block.
// NOTE(review): presumably they are kernel parameters or device-side globals — confirm.
// NOTE(review): pos_flag[vid+1] and pos_flag[i+1] are read, so pos_flag appears
// to need one entry past the largest index used — confirm against the caller.
__global__ void kernelVertexUpdate(...)
{
	// Flat thread index: 1D blocks (threadIdx.x only) arranged in a 2D grid.
	const int tid = threadIdx.x + (blockDim.x*blockIdx.x) + (blockIdx.y*blockDim.x*gridDim.x);

	// ---- Part 1: remap one entry of m_Indices --------------------------------
	if (tid < m_IndicesLength) {
		const int vertexId = m_Indices[tid];
		const int here     = pos_flag[vertexId];
		const int after    = pos_flag[vertexId + 1];
		// Equal neighbouring entries select the previous slot (here - 1).
		m_Indices[tid] = (after != here) ? here : here - 1;
	}

	// ---- Part 2: copy one active-vertex record into the *NEW arrays ----------
	if (tid < m_ActiveVerticesLength) {
		const int dstBase = pos_flag[tid];
		const int dstNext = pos_flag[tid + 1];

		if (dstNext != dstBase) {
			// Read the whole source record before any store.
			CPMVertexCache     srcCache    = m_ActiveVerticesCache[tid];
			const unsigned int srcIndex    = m_ActiveVertices_m_Index[tid];
			const unsigned int srcSplit    = m_ActiveVertices_m_NextSplit[tid];
			const unsigned int srcCollapse = m_ActiveVertices_m_Collapse[tid];

			// The record goes to slot dstBase, and additionally to dstBase+1
			// when the next pos_flag entry lies beyond dstBase+1.
			const int slotCount = (dstNext > dstBase + 1) ? 2 : 1;
			for (int s = 0; s < slotCount; ++s) {
				const int slot = dstBase + s;
				m_ActiveVerticesCacheNEW[slot]        = srcCache;
				m_ActiveVertices_m_IndexNEW[slot]     = srcIndex;
				m_ActiveVertices_m_NextSplitNEW[slot] = srcSplit;
				m_ActiveVertices_m_CollapseNEW[slot]  = srcCollapse;
			}

			// Copy m_StrideDiv4 consecutive elements of vertex data from
			// record tid to record dstBase (only the first slot receives it).
			// NOTE(review): neighbouring threads touch addresses m_StrideDiv4
			// elements apart here; if these arrays live in global memory this
			// is likely poorly coalesced and a prime candidate for a separate
			// one-thread-per-element copy kernel — confirm with a profiler.
			const int dstOffset = dstBase*m_StrideDiv4;
			const int srcOffset = tid*m_StrideDiv4;
			#pragma unroll 6
			for (int k = 0; k < m_StrideDiv4; ++k) {
				const float value = m_Vertices[srcOffset + k];
				m_VerticesNEW[dstOffset + k] = value;
			}
		}
	}
}