Hello,
I am using CUDA 3.1 and GTX480.
This kernel is very expensive because it copies more than 1,000,000 elements. Is it possible to optimize it?
// Rewrites m_Indices in place and copies the per-vertex arrays into their
// *NEW counterparts, using pos_flag to look up each element's destination.
// Thread `tid` handles element `tid` of m_Indices and element `tid` of the
// active-vertex arrays; the two passes are guarded independently.
// The index math reads blockIdx.x, blockIdx.y, gridDim.x and blockDim.x only,
// so it presumably expects a 2D grid of 1D blocks -- confirm at the launch site.
// NOTE(review): pos_flag looks like a running destination offset per vertex
// (slot count = pos_flag[v+1] - pos_flag[v]) -- verify against the producer.
__global__ void kernelVertexUpdate(...)
{
    // Flatten (blockIdx.y, blockIdx.x, threadIdx.x) into one element index.
    const unsigned int linearBlock = blockIdx.y * gridDim.x + blockIdx.x;
    const int tid = linearBlock * blockDim.x + threadIdx.x;

    // Pass 1: remap one entry of the index buffer.
    if (tid < m_IndicesLength) {
        const int oldVertex = m_Indices[tid];
        const int mapped = pos_flag[oldVertex];
        const int mappedNext = pos_flag[oldVertex + 1];
        // When the two consecutive pos_flag entries are equal, the entry is
        // pointed at the slot just before `mapped` instead.
        m_Indices[tid] = (mappedNext != mapped) ? mapped : (mapped - 1);
    }

    // Pass 2: copy one active vertex. Threads past the end are done.
    if (!(tid < m_ActiveVerticesLength))
        return;

    const int dstSlot = pos_flag[tid];
    const int dstSlotAfter = pos_flag[tid + 1];

    // Equal consecutive entries: nothing is written for this vertex.
    if (dstSlotAfter == dstSlot)
        return;

    // Read every per-vertex attribute before any store is issued.
    CPMVertexCache cachedEntry = m_ActiveVerticesCache[tid];
    const unsigned int vertexIndex = m_ActiveVertices_m_Index[tid];
    const unsigned int splitLink = m_ActiveVertices_m_NextSplit[tid];
    const unsigned int collapseLink = m_ActiveVertices_m_Collapse[tid];

    m_ActiveVerticesCacheNEW[dstSlot] = cachedEntry;
    m_ActiveVertices_m_IndexNEW[dstSlot] = vertexIndex;
    m_ActiveVertices_m_NextSplitNEW[dstSlot] = splitLink;
    m_ActiveVertices_m_CollapseNEW[dstSlot] = collapseLink;

    // A gap of more than one slot means the attributes are duplicated into
    // the following slot as well.
    const int secondSlot = dstSlot + 1;
    if (dstSlotAfter > secondSlot) {
        m_ActiveVerticesCacheNEW[secondSlot] = cachedEntry;
        m_ActiveVertices_m_IndexNEW[secondSlot] = vertexIndex;
        m_ActiveVertices_m_NextSplitNEW[secondSlot] = splitLink;
        m_ActiveVertices_m_CollapseNEW[secondSlot] = collapseLink;
    }

    // Copy this vertex's m_StrideDiv4 consecutive floats to its first slot.
    // NOTE(review): neighbouring threads read addresses m_StrideDiv4 elements
    // apart here, so these accesses are strided across a warp -- likely the
    // main cost of this kernel; worth profiling.
    const int dstBase = dstSlot * m_StrideDiv4;
    const int srcBase = tid * m_StrideDiv4;
#pragma unroll 6
    for (int k = 0; k < m_StrideDiv4; ++k) {
        const float value = m_Vertices[srcBase + k];
        m_VerticesNEW[dstBase + k] = value;
    }
}