Okay, so I’m trying to do some calculations. I got a good tip here about allocating private data for the sake of global memory access. Sounds like a good idea to me so I tried with a 2d array (a douple float pointer) in some of my code and I did achieve 100% load efficiency.
I was very happy.
But now I’m trying to do the same thing but with only a 1d array.
I’m storing tetrahedron data so the way I arranged it is or at least tried to arrange it is :
x0
y0
z0
x1
y1
z1
x2
y2
z2
x3
y3
z3
,
where each row is defined by the number of tetrahedra or some other parameter, n. So it’s a 12 x n matrix and each coordinate is of course a float.
Points are stored in a similar order,
x0
y0
z0
,
so this time we have a 3 x n matrix.
I’m not sure if I’m just messing this up really badly or not but I have a load efficiency of like 51% but my store efficiency is as it should be, 100%.
Compile code with :
nvcc -gencode arch=compute_50,code=sm_50 -O3 -o test test.cu
Profile with :
nvprof --metrics gld_efficiency,gst_efficiency ./test
test.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include "predicates.h"
// This code is to be used as a time test
// for various calculations used by regulus
const int tpb = 64,
bpg = 512,
gs = tpb * bpg;
__global__
void orientation_tests
(
const int num_points,
const float * __restrict__ tet_data,
const float * __restrict__ pt_data,
const float * __restrict__ predConsts,
int * __restrict__ la,
int * __restrict__ fs
)
{
const int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
// for every point...
for (int tid = thread_id; tid < num_points; tid += gs)
{
const float x0 = tet_data[0 * num_points + tid];
const float y0 = tet_data[1 * num_points + tid];
const float z0 = tet_data[2 * num_points + tid];
const float x1 = tet_data[3 * num_points + tid];
const float y1 = tet_data[4 * num_points + tid];
const float z1 = tet_data[5 * num_points + tid];
const float x2 = tet_data[6 * num_points + tid];
const float y2 = tet_data[7 * num_points + tid];
const float z2 = tet_data[8 * num_points + tid];
const float x3 = tet_data[9 * num_points + tid];
const float y3 = tet_data[10 * num_points + tid];
const float z3 = tet_data[11 * num_points + tid];
const float px = pt_data[0 * num_points + tid];
const float py = pt_data[1 * num_points + tid];
const float pz = pt_data[2 * num_points + tid];
//printf("%.00f %.00f %.00f\n%.00f %.00f %.00f\n%.00f %.00f %.00f\n%.00f %.00f %.00f\nis being tested against\n%.00f %.00f %.00f\n\n", x0, y0, z0, x1, y1, z1, x2, y2, z2, x3, y3, z3, px, py, pz);
const float a[3] = { x0, y0, z0 };
const float b[3] = { x1, y1, z1 };
const float c[3] = { x2, y2, z2 };
const float d[3] = { x3, y3, z3 };
const float p[3] = { px, py, pz };
// orienation of p vs every face
const int ort0 = orientation(predConsts, d, c, b, p); // 321
const int ort1 = orientation(predConsts, a, c, d, p); // 023
const int ort2 = orientation(predConsts, a, d, b, p); // 031
const int ort3 = orientation(predConsts, a, b, c, p); // 012
assert(ort0 != -1 && ort1 != -1 && ort2 != -1 && ort3 != -1);
// write location association
int x = 0;
x |= (ort0 << 0);
x |= (ort1 << 1);
x |= (ort2 << 2);
x |= (ort3 << 3);
la[tid] = x;
// fracture size = sum of orientations
// 4 for 1-to-4, 3 for 1-to-3, 2 for 1-to-2
fs[tid] = ort0 + ort1 + ort2 + ort3;
}
}
int main(void)
{
// number of points we are testing
const int bl = 4;
const int rl = (bl - 1) * 3;
const int num_points = bl * bl * bl;
// Allocate data for tetrahedra
float *tet_data = 0;
cudaMallocHost(&tet_data, num_points * 4 * 3 * sizeof(*tet_data));
// Allocate data for the points
float *pt_data = 0;
cudaMallocHost(&pt_data, num_points * 3 * sizeof(*pt_data));
// For simpler bookkeeping
int rs[12] = { 0 };
for (int i = 0; i < 12; ++i)
rs[i] = num_points * i;
// Write tetrahedra
for (int i = 0; i < num_points; ++i)
{
tet_data[rs[0] + i] = 0; // x
tet_data[rs[1] + i] = 0; // y
tet_data[rs[2] + i] = 0; // z
tet_data[rs[3] + i] = rl;
tet_data[rs[4] + i] = 0;
tet_data[rs[5] + i] = 0;
tet_data[rs[6] + i] = 0;
tet_data[rs[7] + i] = rl;
tet_data[rs[8] + i] = 0;
tet_data[rs[9] + i] = 0;
tet_data[rs[10] + i] = 0;
tet_data[rs[11] + i] = rl;
}
// write Cartesian points
for (int i = 0; i < num_points; ++i)
{
const float x = (float ) (i / (bl * bl)),
y = (float ) ((i / bl) % bl),
z = (float ) (i % bl);
pt_data[0 * num_points + i] = x;
pt_data[1 * num_points + i] = y;
pt_data[2 * num_points + i] = z;
}
// copy to device
thrust::device_vector<float> d_tet_data(num_points * 4 * 3),
d_pt_data(num_points * 3);
thrust::copy(tet_data, tet_data + 12 * num_points, d_tet_data.begin());
thrust::copy(pt_data, pt_data + 3 * num_points, d_pt_data.begin());
// Build predicate data
PredicateInfo preds;
initPredicate(preds);
// allocate storage to write to
thrust::device_vector<int> fs(num_points, -1),
la(num_points, -1);
// test routine
orientation_tests<<<bpg, tpb>>>
(num_points,
thrust::raw_pointer_cast(d_tet_data.data()),
thrust::raw_pointer_cast(d_pt_data.data()),
preds._consts,
thrust::raw_pointer_cast(la.data()),
thrust::raw_pointer_cast(fs.data()));
cudaDeviceSynchronize();
// visual confirmation
const bool print0 = true;
if (print0)
for (int i = 0; i < num_points; ++i)
std::cout << "fs : " << fs[i] << ", la : " << la[i] << std::endl;
cudaFreeHost(pt_data);
cudaFreeHost(tet_data);
return 0;
}
predicates.h
/*
Author: Ashwin Nanjappa and Cao Thanh Tung
Filename: GDelShewchukDevice.h
===============================================================================
Copyright (c) 2013, School of Computing, National University of Singapore.
All rights reserved.
Project homepage: http://www.comp.nus.edu.sg/~tants/gdel3d.html
If you use gStar4D and you like it or have comments on its usefulness etc., we
would love to hear from you at <tants@comp.nus.edu.sg>. You may share with us
your experience and any possibilities that we may improve the work/code.
===============================================================================
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of
conditions and the following disclaimer. Redistributions in binary form must reproduce
the above copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the distribution.
Neither the name of the National University of University nor the names of its contributors
may be used to endorse or promote products derived from this software without specific
prior written permission from the National University of Singapore.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
*/
#pragma once
#include "GDelShewchukDevice.h"
enum Orient
{
OrientNeg = -1,
OrientZero = +0,
OrientPos = +1
};
__device__ Orient detToOrient( float det )
{
return ( det > 0 ) ? OrientPos : ( ( det < 0 ) ? OrientNeg : OrientZero );
}
int PredThreadNum = 32 * 32;
template< typename T >
T* cuNew( int num )
{
T* loc = NULL;
const size_t space = num * sizeof( T );
cudaMalloc( &loc, space );
return loc;
}
void initPredicate(PredicateInfo &DPredicateInfo)
{
DPredicateInfo.init();
// Predicate constants
DPredicateInfo._consts = cuNew< float >( DPredicateBoundNum );
// Predicate arrays
DPredicateInfo._data = cuNew< float >( PredicateTotalSize * PredThreadNum );
// Set predicate constants
kerInitPredicate<<< 1, 1 >>>( DPredicateInfo._consts );
return;
}
__device__ Orient orientation
(
const float* predConsts,
const float* p0,
const float* p1,
const float* p2,
const float* p3
)
{
float det = orient3dfast( predConsts, p0, p1, p2, p3 );
//printf("det = %f\n", det);
// Need exact check
if ( det == FLT_MAX )
{ //printf("Calling exact routine...\n");
det = orient3dexact( predConsts, p0, p1, p2, p3 );
}
//printf("%.00f\n", det);
return detToOrient( det );
}
GDelShewchukDevice.h is quite large and must be taken from : regulus_v1.5/GDelShewchukDevice.h at master · cmazakas/regulus_v1.5 · GitHub