Here is a condensed version of what I’m working with to illustrate the point. The problem is explained in the comments in kernel.cu. Based on moving one line of code, it can slow down the execution speed by 350%. I’ve diffed the ptx files for the slow and fast builds, and there are many more differences than one would expect for such a small change. It seems like the compiler is trying to be smart with some inling, although I don’t really know how to read ptx. Can you guys try the same and see what you get?

I am running VS2010, sm_20, 32-bit build, on Windows 7 x64. I’d prefer not to post a ton of code, but it won’t let me attach a project file as a zip, so here are all the files one by one. The main file of interest is kernel.cu. The core of the problem is:

Based on how we increment our count of variations in CuFlameXform::AddVariation(), it changes the execution speed of CuFlameXform::Apply(). I’m utterly stumped, so any help would be much appreciated. Thanks.

Timing.h

```
#pragma once
#include <Windows.h>
#include <stdio.h>
class Timing
{
public:
Timing();
void Tic();
double Toc();
double Toc(char* str);
double BeginTime();
double EndTime();
static double Freq();
static int NumOfProcessors();
private:
static inline void Init();
LARGE_INTEGER m_BeginTime;
LARGE_INTEGER m_EndTime;
static bool m_TimingInit;
static int m_NumOfProcessors;
static LARGE_INTEGER m_Freq;
};
```

Timing.cpp:

```
#include "Timing.h"
bool Timing::m_TimingInit = false;
int Timing::m_NumOfProcessors;
LARGE_INTEGER Timing::m_Freq;
Timing::Timing()
{
Init();
Tic();
}
void Timing::Tic()
{
QueryPerformanceCounter(&m_BeginTime);
}
double Timing::Toc()
{
return Toc(NULL);
}
double Timing::Toc(char* str)
{
QueryPerformanceCounter(&m_EndTime);
double calcTime = double(m_EndTime.QuadPart - m_BeginTime.QuadPart) * 1000.0 / double(m_Freq.QuadPart);
if (str != NULL)
printf("%s processing time: %f ms
", str, calcTime);
return calcTime;
}
double Timing::BeginTime() { return (double)m_BeginTime.QuadPart; }
double Timing::EndTime() { return (double)m_EndTime.QuadPart; }
double Timing::Freq()
{
Init();
return (double)m_Freq.QuadPart;
}
int Timing::NumOfProcessors()
{
Init();
return m_NumOfProcessors;
}
void Timing::Init()
{
if (!m_TimingInit)
{
SYSTEM_INFO sysinfo;
QueryPerformanceFrequency(&m_Freq);
GetSystemInfo(&sysinfo);
m_NumOfProcessors = sysinfo.dwNumberOfProcessors;
m_TimingInit = true;
}
}
```

kernel.cu:

```
#include <stdio.h>
#include <stdint.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define EPS (1e-10)
struct CuFlamePoint
{
float m_X;
float m_Y;
float m_ColorX;
};
struct CuFlameIteratorHelper
{
float m_TransX, m_TransY;
float m_PrecalcSumSquares;
float m_PrecalcSqrtSumSquares;
float m_PrecalcSina;
float m_PrecalcCosa;
float m_PrecalcAtanxy;
float m_PrecalcAtanyx;
};
typedef void (*VariationFunc)(float weight, CuFlameIteratorHelper& helper, CuFlamePoint* outPoint);
__device__ void Linear(float weight, CuFlameIteratorHelper& helper, CuFlamePoint* outPoint)
{
outPoint->m_X += weight * helper.m_TransX;
outPoint->m_Y += weight * helper.m_TransY;
}
__device__ void Sinusoidal(float weight, CuFlameIteratorHelper& helper, CuFlamePoint* outPoint)
{
outPoint->m_X += weight * sin(helper.m_TransX);
outPoint->m_Y += weight * sin(helper.m_TransY);
}
__device__ void Spherical(float weight, CuFlameIteratorHelper& helper, CuFlamePoint* outPoint)
{
float r2 = weight / (helper.m_PrecalcSumSquares + EPS);
outPoint->m_X += r2 * helper.m_TransX;
outPoint->m_Y += r2 * helper.m_TransY;
}
#define MAX_VARS_PER_XFORM 8
class CuFlameXform
{
public:
__device__ CuFlameXform()
{
Init();
}
__device__ CuFlameXform(float colorX, float a, float b, float c, float d, float e, float f)
{
Init();
m_ColorX = colorX;
m_A = a;
m_B = b;
m_C = c;
m_D = d;
m_E = e;
m_F = f;
m_ColorSpeed = 0.5;
m_ColorSpeedCache = m_ColorSpeed * m_ColorX;
m_OneMinusColorCache = float(1.0) - m_ColorSpeed;
}
__device__ ~CuFlameXform()
{
Init();
}
__device__ void Init()
{
m_VariationCount = 0;
memset(m_Variations, 0, MAX_VARS_PER_XFORM * sizeof(VariationFunc));
memset(m_VariationWeights, 0, MAX_VARS_PER_XFORM * sizeof(float));
}
__device__ void Apply(CuFlamePoint* inPoint, CuFlamePoint* outPoint)
{
CuFlameIteratorHelper iterHelper;
outPoint->m_ColorX = m_ColorSpeedCache + (m_OneMinusColorCache * inPoint->m_ColorX);
iterHelper.m_TransX = (m_A * inPoint->m_X) + (m_B * inPoint->m_Y) + m_C;
iterHelper.m_TransY = (m_D * inPoint->m_X) + (m_E * inPoint->m_Y) + m_F;
Precalc(iterHelper);
outPoint->m_X = outPoint->m_Y = 0;
//We know we added 3 variations for this test, so just hard code calls to the first three just for demo.
//for (unsigned int i = 0; i < m_VariationCount; i++)//Or, doing this also makes it 3.5 times slower, regardles of where we increment m_VariationCount below.
for (unsigned int i = 0; i < 3; i++)//This will be 3.5 times faster if we increment m_VariationCount outside of the if block below.
m_Variations[i](m_VariationWeights[i], iterHelper, outPoint);
}
__device__ void AddVariation(VariationFunc variation, float weight)
{
if (m_VariationCount < MAX_VARS_PER_XFORM)
{
m_Variations[m_VariationCount] = variation;
m_VariationWeights[m_VariationCount] = weight;
//m_VariationCount++;//Putting this here makes the call of the function pointers 3.5 times slower.
}
//For some reason, pulling this out of the above conditional makes calling a func ptr roughly the speed as calling a regular function.
//Should make no difference since we will always execute this line since we're only adding 3 variations in our test which is always
//below the limit of 8.
m_VariationCount++;
}
__device__ void ClearVariations()
{
m_VariationCount = 0;
memset(m_Variations, 0, MAX_VARS_PER_XFORM * sizeof(VariationFunc));
memset(m_VariationWeights, 0, MAX_VARS_PER_XFORM * sizeof(float));
}
float m_A, m_B, m_C, m_D, m_E, m_F;
float m_ColorX;
float m_ColorSpeed;
float m_ColorSpeedCache;
float m_OneMinusColorCache;
private:
__device__ void Precalc(CuFlameIteratorHelper& iteratorHelper)
{
iteratorHelper.m_PrecalcSumSquares = (iteratorHelper.m_TransX * iteratorHelper.m_TransX) + (iteratorHelper.m_TransY * iteratorHelper.m_TransY);
iteratorHelper.m_PrecalcSqrtSumSquares = sqrt(iteratorHelper.m_PrecalcSumSquares);
iteratorHelper.m_PrecalcSina = iteratorHelper.m_TransX / iteratorHelper.m_PrecalcSqrtSumSquares;
iteratorHelper.m_PrecalcCosa = iteratorHelper.m_TransY / iteratorHelper.m_PrecalcSqrtSumSquares;
iteratorHelper.m_PrecalcAtanxy = atan2(iteratorHelper.m_TransX, iteratorHelper.m_TransY);
iteratorHelper.m_PrecalcAtanyx = atan2(iteratorHelper.m_TransY, iteratorHelper.m_TransX);
}
VariationFunc m_Variations[MAX_VARS_PER_XFORM];
float m_VariationWeights[MAX_VARS_PER_XFORM];
unsigned int m_VariationCount;
};
__global__ void RunIters(unsigned int count)
{
CuFlamePoint firstPoint, secondPoint;
CuFlameXform theXform(1, -0.681206f, -0.0779465f, 0.20769f, 0.755065f, -0.0416126f, -0.262334f);
theXform.AddVariation(Linear, 0.2f);
theXform.AddVariation(Sinusoidal, 0.3f);
theXform.AddVariation(Spherical, 0.5f);
firstPoint.m_X = 0.3;
firstPoint.m_Y = -0.8;
firstPoint.m_ColorX = 0.44;
for (unsigned int round = 0; round < count; round++)
{
theXform.Apply(&firstPoint, &secondPoint);
firstPoint = secondPoint;
}
}
extern "C"
void CudaCallWrapper(unsigned int i)
{
RunIters<<<16, 16>>>(i);
cudaDeviceSynchronize();
}
```

Main.cpp:

```
// includes, system
#include <iostream>
#include <stdlib.h>
// Required to include CUDA vector types
#include <cuda_runtime.h>
#include <vector_types.h>
#include "Timing.h"
extern "C" void CudaCallWrapper(unsigned int i);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
Timing t;
CudaCallWrapper(5);
t.Toc("CudaCallWrapper(5)");
t.Tic();
CudaCallWrapper(100);
t.Toc("CudaCallWrapper(100)");
t.Tic();
CudaCallWrapper(500);
t.Toc("CudaCallWrapper(500)");
t.Tic();
CudaCallWrapper(100000);
t.Toc("CudaCallWrapper(100000)");
return 0;
}
```