Cuda compiler loop unroll bug?

Hi,

I’m working on some fairly large kernel, and have noticed a weird behavior exhibited by cuda-9.0 nvcc compiler. This behavior results in a very significant slow-down of my code, caused by load and store spills. This has not been the case a few major versions of cuda ago, but I cannot tell for sure when this behavior started. I managed to reduce the bug to a small test case, which I’m attaching below.

The logic of the code is as follows. The kernel maintains a small matrix of floats per thread. Kernel threads, of which I’m launching just one for this example, run a few iterations over that matrix. Each iteration copies a row of that matrix into shared memory. The kernel doesn’t produce any output, besides the change in the shared memory. My goal is to keep that small matrix in the registers, in order to achieve optimal performance, and that’s what the compiler is failing to do. In my example code, I’m launching just one block and one thread, hence there should be more than enough registers to store a 6x6 matrix.

I compiled the code using the command line I supply with the code, and I do see ld.local instructions in the generated ptx file (see an excerpt from that ptx file attached below the code). Clearly the code sample I’m supplying here is very small, and the overall operation of the program is NoOp, but those local memory spills aren’t optimized out, as indicated by the ptx. I observe the same behavior in my large kernel.

Once I apply some very basic changes to the code (see comments therein), I see the spills disappear, as indicated by the lack of the warning message from the compiler as well as the lack of ld.local instructions in the ptx file.

Would love to hear some opinions about this situation. Thanks!


Save this code as broken-unrolling.cu to use my command line.

// This program demonstrates what seems to be a loop unrolling bug. The bug
// causes loops to not be unrolled in a fairly straightforward code.

// To build and run (linux):
// $ nvcc -keep -O3 -g -lineinfo -gencode arch=compute_50,code=sm_50 -Xcompiler "-Wall -Wextra -Wno-unused -Werror -fno-strict-aliasing" -Xptxas "-v-warn-lmem-usage -warn-spills" -o broken-unrolling.bin broken-unrolling.cu && ./broken-unrolling.bin

// Optput of the above command (Note warning message in the first line of the
// output):
// ptxas warning : Local memory used for function '_Z4kernv'
// ptxas info    : 0 bytes gmem
// ptxas info    : Compiling entry function '_Z4kernv' for 'sm_50'
// ptxas info    : Function properties for _Z4kernv
//     144 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
// ptxas info    : Used 9 registers, 320 bytes cmem[0], 24 bytes cmem[2]
//   Kernel launch success, status = 0
//   Thread sync success, status = 0

// Toolset version:
// $ nvcc --version
// nvcc: NVIDIA (R) Cuda compiler driver
// Copyright (c) 2005-2017 NVIDIA Corporation
// Built on Fri_Sep__1_21:08:03_CDT_2017
// Cuda compilation tools, release 9.0, V9.0.176

#include <stdio.h>

extern __shared__ int sharedMem[];

const int ITERATIONS = 10;

__global__ void kern(void) {

  // Constant 6 below results in local storage, while constant 5 doesn't.
  const int DIM = 6;

  float arr[DIM][DIM];

  for (int iter = 0; iter < ITERATIONS; ++ iter) {

    // Copy one row number iter from matrix arr into sharedMem.
#pragma unroll
    for (int i = 0; i < DIM; ++ i)
#pragma unroll
      for (int j = 0; j < DIM; ++ j)
	// Logical operator == below results in local storage, while != doesn't.
	if (i == iter)
	  sharedMem[j] = arr[i][j];

  }
}

int main(int argc, char** argv) {

  // Launch with some shared memory.
  kern<<<1, 1, 100>>>();

  cudaError_t status = cudaGetLastError();
  
  if (status == cudaSuccess)
    printf("Kernel launch success, status = %i\n", status);
  else
    printf("Kernel launch failure, status = %i, error: %s\n",
	   status, cudaGetErrorString(status) );

  status = cudaThreadSynchronize();

  if (status == cudaSuccess)
    printf("Thread sync success, status = %i\n", status);
  else
    printf("Thread sync failure, status = %i, error: %s\n",
	   status, cudaGetErrorString(status) );

  return 0;
}

Here’s an excerpt from my ptx file:

BB0_13:
        .loc 1 47 4
        ld.local.v2.f32         {%f61, %f62}, [%rd6];
        cvt.rzi.s32.f32 %r34, %f61;
        ld.local.v2.f32         {%f64, %f65}, [%rd6+8];
        ld.local.v2.f32         {%f66, %f67}, [%rd6+16];
        st.shared.u32   [sharedMem], %r34;
        cvt.rzi.s32.f32 %r35, %f62;
        st.shared.u32   [sharedMem+4], %r35;
        cvt.rzi.s32.f32 %r36, %f64;
        st.shared.u32   [sharedMem+8], %r36;
        cvt.rzi.s32.f32 %r37, %f65;
        st.shared.u32   [sharedMem+12], %r37;
        cvt.rzi.s32.f32 %r38, %f66;
        st.shared.u32   [sharedMem+16], %r38;
        cvt.rzi.s32.f32 %r39, %f67;
        st.shared.u32   [sharedMem+20], %r39;
        bra.uni         BB0_14;

A few things you could try (standalone and in combination)

Does the behavior change if you replace const int DIM = 6; with #define DIM 6 ?

Is there any improvement by explicitly stating #pragma unroll 6 in the loops over DIM?
Also possibly by using #pragma unroll 10 over the iterations loop.

Christian

Hi Christian,

No, same behavior.

No, same behavior.

Interesting. This actually did the job. However, in my real kernel ITERATIONS is not a constant, so it won’t work for me in real life.

Also, I tried the initialization of the shared memory through yet another 1D temp array tmp, as follows (within the kernel). Unfortunately, it didn’t prevent the spilling.

float arr[DIM][DIM];
  float tmp[DIM];

  for (int iter = 0; iter < ITERATIONS; ++ iter) {

    // Copy one row number iter from matrix arr into sharedMem.                    
#pragma unroll
    for (int i = 0; i < DIM; ++ i)
#pragma unroll
      for (int j = 0; j < DIM; ++ j)
        // Logical operator == below results in local storage, while != doesn't.   
	if (i == iter)
          tmp[j] = arr[i][j];

#pragma unroll
    for (int i = 0; i < DIM; ++ i)
      sharedMem[i] = tmp[i];

  }

You mentioned that it’s the use of the == operator that f…s things up.

Have you tried the != operator instead … and negating the if condition? ;-)

Christian

duplicate posting - removed

duplicate posting - removed

Just tried the following change, it hasn’t helped. The compiler still generates the local memory warning.

if (! (i != iter))
  sharedMem[j] = arr[i][j];

I’ve filed a bug internally at NVIDIA to track this. I don’t have further information at this time.

One more observation regarding this issue. In my original code, the type of the shared memory array element (int) differed from the type of the matrix element (float). The type conversion done during the copy of the matrix element into the array was causing the issue in that code. Once the types of both arrays match (float and float), the register spill disappears.

However, the same issue can be reproduced by sending ITERATIONS parameter into the kernel, rather than defining it as a global constant, even when the two types mentioned above match. Below is the modified code. This code now requires a command line argument, in order to make sure that the ITERATIONS parameter (renamed to “n” in the new code) isn’t known a-priori.

Hope this helps with the investigation into this issue.

// This program demonstrates what seems to be a loop unrolling bug. The bug
// causes loops to not be unrolled in a fairly straightforward code.

// To build (linux):
// $ nvcc -keep -O3 -g -lineinfo -gencode arch=compute_50,code=sm_50 -Xcompiler "-Wall -Wextra -Wno-unused -Werror -fno-strict-aliasing" -Xptxas "-v-warn-lmem-usage -warn-spills" -o broken-unrolling.bin broken-unrolling.cu && ./broken-unrolling.bin 2

// Optput of the above command (Note warning message in the first line of the
// output):
// ptxas warning : Local memory used for function '_Z4kernv'
// ptxas info    : 0 bytes gmem
// ptxas info    : Compiling entry function '_Z4kernv' for 'sm_50'
// ptxas info    : Function properties for _Z4kernv
//     144 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
// ptxas info    : Used 9 registers, 320 bytes cmem[0], 24 bytes cmem[2]
//   Kernel launch success, status = 0
//   Thread sync success, status = 0

// Toolset version:
// $ nvcc --version
// nvcc: NVIDIA (R) Cuda compiler driver
// Copyright (c) 2005-2017 NVIDIA Corporation
// Built on Fri_Sep__1_21:08:03_CDT_2017
// Cuda compilation tools, release 9.0, V9.0.176

#include <stdio.h>

extern __shared__ float sharedMem[];

__global__ void kern(int n) {

  // Constant 6 below results in local storage, while constant 5 doesn't.
  const int DIM = 6;

  float arr[DIM][DIM];

  for (int iter = 0; iter < n; ++ iter) {

    // Copy one row number iter from matrix arr into sharedMem.
#pragma unroll
    for (int i = 0; i < DIM; ++ i)
#pragma unroll
      for (int j = 0; j < DIM; ++ j)
	// Logical operator == below results in local storage, while != doesn't.
	if (i == iter) {
	  sharedMem[j] = arr[i][j];
	}

  }
}

int main(int argc, char** argv) {

  if (argc != 2) {
    printf("Expected one command line argument, got %i\n", argc);
    exit(1);
  }

  int n;

  if (sscanf(argv[1], "%i", &n) != 1) {
    printf("Expected integer command line argument, got %s\n", argv[1]);
    exit(1);
  }

  // Launch with some shared memory.
  kern<<<1, 1, 100>>>(n);

  cudaError_t status = cudaGetLastError();
  
  if (status == cudaSuccess)
    printf("Kernel launch success, status = %i\n", status);
  else
    printf("Kernel launch failure, status = %i, error: %s\n",
	   status, cudaGetErrorString(status) );

  status = cudaThreadSynchronize();

  if (status == cudaSuccess)
    printf("Thread sync success, status = %i\n", status);
  else
    printf("Thread sync failure, status = %i, error: %s\n",
	   status, cudaGetErrorString(status) );

  return 0;
}

To completely unroll a loop (which is what you seem to desire), the compiler must know the trip count at compile time. The best the compiler could possibly do when the trip count is known only at run time is to partially unroll the loop, but given that the loop controlled by kernel argument ‘n’ is an outer loop, it makes sense that it won’t even do that.

You may get partial unrolling on that outer loop if you specifically suggest it with ‘#pragma unroll 4’ or the like.

Hi njuffa, thanks for your thoughts.

I wonder why does the compiler not unroll the inner loops? That’s the unrolling I desire. The trip counts of both inner loops are known at compile time, since those are defined as DIM constant expression. I do not desire that the outermost loop (over variable iter) is unrolled, which you seem to make a point about.

In fact that inner loop unrolling worked in an older version of cuda (about 5.x maybe?). I can get this unrolling to work even in 9.0 version of cuda, by modifying the condition in the “if” statement in the innermost loop only slightly, as explained in my comment in the code sample. That modification does not change the logic around the “n” parameter at all.

Following your suggestion, I tried adding “#pragma unroll 4” just above the outermost loop over iter variable (which I don’t expect to be unrolled otherwise), and I’m still seeing local memory spill.

Does this comment make sense?

I found the question (or complex of questions) discussed in this thread confusing.

It seems the question is conflating two issues that are handled by different parts of the compiler and thus linked only indirectly: unrolling can have an impact on register pressure, which in turn can have an impact on spilling behavior.

Typically unrolling is something that happens at an early stage of the compiler (most likely in the nvvm portion, although the PTXAS backend also contains an unroller). Spilling is a potential consequence of register allocation, which happens at a very late stage of the compiler. Both unrolling and register allocation are driven by heuristics, and these heuristics act independently of each other, just by looking at the intermediate code representation in front of them.

Beyond minimum requirements (e.g. trip count known at compile time?) an unrolling heuristic will consider other issues such as magnitude of trip count (e.g. large trip count -> large code -> instruction cache misses) or presence of if-statements inside the loop. Similarly, the register allocation heuristic tries to find a balance between register use (too many registers used can mean fewer threads get to run, reswulting in reduced performance) and spilling (minimal spilling is often harmless, buffered by caches; large amount of spilling causes code to become memory bound).

The use of heuristics never results in optimal machine code for every possible source code presented to a compiler. Instead, 90% or 95% of possible source codes will see good results, while the remainder may see undesirable results (slow execution). A change to heuristics (by design or through bugs) may cause a specific source code (such as your code here) to switch from one category into the other.

So, if you can demonstrate (by comparison with older CUDA versions) that a performance regression occurred, I would suggest filing a bug. That way you will learn whether the difference in observed behavior is by design, or whether it is considered a bug by the compiler folks.

In general I would recommend the use of #pragma unroll with the specific unroll factor desired, e.g. #pragma unroll DIM.

Hi njuffa,

I agree with most of what you’re saying in the above comment.

nVidia recommends storing thread-local, register-based arrays as one of primary performance optimization methods. I believed I’ve seen examples of using this method many times in nVidia’s training materials, in code as simple as matrix multiplication.

In my specific example, I put quite a bit of effort optimizing my kernel using that method a couple of years ago. The method worked perfectly well under an older version of cuda, but it’s not working any more in cuda-9.0. I recently spent several hours trying to work around this change in the compiler’s behavior. As part of this effort, I reduced my code to a 20-line kernel, which I cannot get to perform. I think my kernel is trivial: all it does is copies a single row of a small matrix into shared memory.

Thus, this basic code optimization method seems to be fragile w.r.t. to unspecified changes in the compiler’s behavior. I think this is not a very user-friendly behavior of the compiler.

Digging a bit deeper, I have to say, I don’t understand semantic of #pragma unroll, as implemented by nvcc. I would expect the compiler to choose the compilation strategy that provides the best result that the compiler can think of, even without any #pragmas in the code. In the presence of #pragmas, the compiler should do its best to strictly obey the #pragmas, whenever it’s at all possible, even at risk of generating a slower code. This is because if a developer cares to implant a #pragma into his code, then he knows something about the code, that the compiler cannot or failed to deduce by itself. In the case of my example, I know that without loop unrolling, I cannot have my small matrix allocated within registers, and I also know for sure that such allocation is very important for the performance of my code. Instead of strictly following my #pragma, the compiler seems to think “OK, the developer put #pragma above the loop, but I suspect that following this #pragma in this case will make the loop less performing, hence I would disregard it”. The possibility of a quiet disregard of the #pragma, in such a simple case, I believe, is the fundamental problem here.

I think the compiler’s behavior w.r.t #pragma unroll should be as follows:

  1. Try to unroll the loop, no matter what the performance penalty is. If there’s a presumed penalty, produce a warning.
  2. If the unrolling is not possible (e.g. loop count is unknown), then terminate with an error message. This would clearly indicate that #pragma unroll is incompatible with the loop structure and that is a bug in the code.
  3. If it’s provably possible and not that hard to unroll a loop, but the compiler fails to do so, then this behavior is a bug (or lack of sophistication) in the compiler.

Maybe what I am trying to say is that a more user-friendly behavior of #pragma unroll would be to consider that an order, rather than a suggestion. If that’s too strict for this pragma, then I’d like to see another pragma (maybe force_unroll) that behaves this way, or a command-line option that makes the compiler treat #pragma unroll that way.

Does this make sense?

txbob wrote above that he has filed the bug.

Note that this means that any responses from the compiler team will be visible to txbob, not you. My standing recommendation is: CUDA programmers should file their own bugs if they are interested in being appraised of the progress on the matter reported. The first thing that will happen is that someone is going to try and reproduce the issue.

Since it sounds like this is reported as a performance regression, the minimum amount of information required is “I compiled attached code with CUDA 9, using such and such compiler switches, and with CUDA X, using identical compiler switches. The performance difference observed when run on GPU Y is Z percent.”

In general, the semantics of #pragmas are tool-chain specific. In my experience, using many different tool chains over the years, #pragmas typically do not come with any kind of guarantee. They are simply an expression of the programmer’s desire which compilers will usually honor, but not always. Generally, #pragma use is not required for loop unrolling. Modern compilers, including the CUDA compiler, will unroll loops by themselves when a high optimization level is chosen, if they can be reasonably sure that this will actually increase performance. Unrolling isn’t always beneficial. As I stated, to my knowledge the CUDA compiler includes at least two automatic unrollers, one each in NVVM and PTXAS.

Because they incorporate dozen of heuristics, compilers will rarely emit the code that a skilled observed thinks is optimal. In some cases the skilled observer is wrong (as proven by running actual performance tests), in many other cases the skilled observer is right and the code produced is sub-optimal. Unfortunately for programmers, compilers do yet provide a DWIM (do what I mean) compilation switch, and the typical method of last resort is to manually program the desired code sequence at assembly level. Which isn’t really a realistic option for GPUs as NVIDIA doesn’t make a SASS assembler available (to avoid confusion: PTXAS, which translates PTX into SASS, is an optimizing compiler)

NVIDIA welcomes well-reasoned requests for enhancements regarding their products. That doesn’t mean all requests will be implemented or within a specific timeframe. You can file RFEs (enhancements request) for CUDA through the bug reporting form linked from the registered developer website. As for reports of performance bugs, they are best accompanied by measured performance data demonstrating that there is an issue. Performance regressions caused by compilers are not uncommon. For example, I am aware of a product team that skipped three generations of MSVC because of such a regression affecting important parts of their code.

Hi, njuffa,

I worked around the issue which I reported earlier in this thread. I include the workaround code in the bottom of this message. You will notice from the comments in the code that it is very fragile. If I change it just a little bit, in a way which seems very logical and actually makes the code simpler, then the registers are spilled. Altogether, my workaround doesn’t look like a succinct way of expressing operation “take this row from the matrix, copy it into shared memory”. This workaround, in my opinion, makes my code harder to understand. I think my workaround is a hack.

It was a real pain to go through my kernels and remove all if() statements from every loop that involves my register array, replacing the if statement with weird constructs like the one in my workaround.

I agree that typically #pragmas do not come with a guarantee. Let’s however look at the situation with register spilling from the developer’s (i.e. customer) perspective.

We invest a fair amount of time to write optimal kernels. We try to follow nvidia’s guidance in using best-practice constructs in our code. In this case, however, I don’t see a way to more closely follow nvidia’s guidance and still be at least reasonably sure that my code will survive yet another change in nvcc. Indeed, what if compiler heuristics change in say v.10 and then my weird workaround stops working? Do I need to go through my kernel once again (spending several days on it), trying to figure out which other illogical hack will make the compiler happy?

I think the above observation uncovers a product management issue, as opposed to just a compiler regression bug. I think nvidia should come with some expression or pattern that actually instructs (rather than advises) the compiler to keep certain variables in registers. I’m no compiler expert, but I actually don’t understand why that would be so hard to implement. If we had that option, then we, developers, who have a good idea of which variables are heavily used by the code and which are not, could influence the compiler. This would be a much simpler approach for us, than hand-editing ptx code.

From what I understand variables which are defined shared cannot be spilled out to memory, or at least the compiler makes every effort not to do that. What’s so different about variables which we’d like to define as register? Why cannot nvidia postulate essentially the same thing about register variables, specifically that the compiler must keep those in the registers, unless it runs out of registers?

There’s a separate issue that I don’t understand about this example. When I keep the code I’m attaching, I’m seeing the following compiler message:

ptxas info : Used 13 registers, 324 bytes cmem[0]

Then I’m looking at the ptx file and I do see what seems to be my matrix in the registers:

BB0_2:
	setp.eq.s32	%p2, %r5, 0;
	selp.f32	%f23, %f21, %f19, %p2;
	add.f32 	%f24, %f64, %f23;
	add.f32 	%f25, %f63, %f23;
	add.f32 	%f26, %f62, %f23;
	add.f32 	%f27, %f61, %f23;
	add.f32 	%f28, %f60, %f23;
	add.f32 	%f29, %f59, %f23;
	setp.eq.s32	%p3, %r5, 1;
	selp.f32	%f30, %f21, %f19, %p3;
	add.f32 	%f31, %f24, %f30;
	add.f32 	%f32, %f25, %f30;
	add.f32 	%f33, %f26, %f30;
	add.f32 	%f34, %f27, %f30;
	add.f32 	%f35, %f28, %f30;
	add.f32 	%f36, %f29, %f30;
	setp.eq.s32	%p4, %r5, 2;
	selp.f32	%f37, %f21, %f19, %p4;
	add.f32 	%f38, %f31, %f37;
	add.f32 	%f39, %f32, %f37;
	add.f32 	%f40, %f33, %f37;
	add.f32 	%f41, %f34, %f37;
	add.f32 	%f42, %f35, %f37;
	add.f32 	%f43, %f36, %f37;
	setp.eq.s32	%p5, %r5, 3;
	selp.f32	%f44, %f21, %f19, %p5;
	add.f32 	%f45, %f38, %f44;
	add.f32 	%f46, %f39, %f44;
	add.f32 	%f47, %f40, %f44;
	add.f32 	%f48, %f41, %f44;
	add.f32 	%f49, %f42, %f44;
	add.f32 	%f50, %f43, %f44;
	setp.eq.s32	%p6, %r5, 4;
	selp.f32	%f51, %f21, %f19, %p6;
	add.f32 	%f52, %f45, %f51;
	add.f32 	%f53, %f46, %f51;
	add.f32 	%f54, %f47, %f51;
	add.f32 	%f55, %f48, %f51;
	add.f32 	%f56, %f49, %f51;
	add.f32 	%f57, %f50, %f51;
	setp.eq.s32	%p7, %r5, 5;
	selp.f32	%f58, %f21, %f19, %p7;
	add.f32 	%f64, %f52, %f58;
	add.f32 	%f63, %f53, %f58;
	add.f32 	%f62, %f54, %f58;
	add.f32 	%f61, %f55, %f58;
	add.f32 	%f60, %f56, %f58;
	add.f32 	%f59, %f57, %f58;
	.loc 1 38 32
	add.s32 	%r5, %r5, 1;
	.loc 1 38 3
	setp.lt.s32	%p8, %r5, %r3;
	@%p8 bra 	BB0_2;

	.loc 1 51 2
	st.shared.f32 	[sharedMem], %f64;
	st.shared.f32 	[sharedMem+4], %f63;
	st.shared.f32 	[sharedMem+8], %f62;
	st.shared.f32 	[sharedMem+12], %f61;
	st.shared.f32 	[sharedMem+16], %f60;
	st.shared.f32 	[sharedMem+20], %f59;

My matrix is 6x6 in size, i.e. it occupies 36 registers. How come ptxas reports the usage of only 13 of those?

I don’t mind filing a bug report, but I couldn’t find a link. Do you mind providing me with the one?

Please find the workaround hack below.

Thanks!

// This program demonstrates what seems to be a loop unrolling bug. The bug
// causes loops to not be unrolled in a fairly straightforward code.

// To build (linux):
// $ nvcc -keep -O3 -g -lineinfo -gencode arch=compute_50,code=sm_50 -Xcompiler "-Wall -Wextra -Wno-unused -Werror -fno-strict-aliasing" -Xptxas "-v-warn-lmem-usage -warn-spills" -o broken-unrolling.bin broken-unrolling.cu && ./broken-unrolling.bin 2

// Optput of the above command (Note warning message in the first line of the
// output):
// ptxas warning : Local memory used for function '_Z4kernv'
// ptxas info    : 0 bytes gmem
// ptxas info    : Compiling entry function '_Z4kernv' for 'sm_50'
// ptxas info    : Function properties for _Z4kernv
//     144 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
// ptxas info    : Used 9 registers, 320 bytes cmem[0], 24 bytes cmem[2]
//   Kernel launch success, status = 0
//   Thread sync success, status = 0

// Toolset version:
// $ nvcc --version
// nvcc: NVIDIA (R) Cuda compiler driver
// Copyright (c) 2005-2017 NVIDIA Corporation
// Built on Fri_Sep__1_21:08:03_CDT_2017
// Cuda compilation tools, release 9.0, V9.0.176

#include <stdio.h>

extern __shared__ char sharedMem[];

__global__ void kern(int n) {

  // float* f_sharedMem = (float*)sharedMem;
  
  // Constant 6 below results in local storage, while constant 5 doesn't.
  const int DIM = 6;

  float arr[DIM][DIM];

  for (int iter = 0; iter < n; ++ iter) {

    // Copy one row number iter from matrix arr into sharedMem.
#pragma unroll
    for (int i = 0; i < DIM; ++ i)
#pragma unroll
      for (int j = 0; j < DIM; ++ j)
	// Logical operator == below results in local storage, while != doesn't.
	// if (i == iter)
	//   ((float*)sharedMem)[j] = arr[i][j];

	// This rewrite of the above line is roughly equivalent, when arr doesn't
	// contain NaNs, and works around the issue. I consider this a hack.
	((float*)sharedMem)[j] += (i == iter ? 1 : 0) * arr[i][j];

	// If the above is rewritten as follows, then there's register spill.
	// ((float*)sharedMem)[j] += (i == iter ? arr[i][j] : 0);
  }

}

int main(int argc, char** argv) {

  if (argc != 2) {
    printf("Expected one command line argument, got %i\n", argc);
    exit(1);
  }

  int n;

  if (sscanf(argv[1], "%i", &n) != 1) {
    printf("Expected integer command line argument, got %s\n", argv[1]);
    exit(1);
  }

  // Launch with some shared memory.
  kern<<<1, 1, 100>>>(n);

  cudaError_t status = cudaGetLastError();
  
  if (status == cudaSuccess)
    printf("Kernel launch success, status = %i\n", status);
  else
    printf("Kernel launch failure, status = %i, error: %s\n",
	   status, cudaGetErrorString(status) );

  status = cudaThreadSynchronize();

  if (status == cudaSuccess)
    printf("Thread sync success, status = %i\n", status);
  else
    printf("Thread sync failure, status = %i, error: %s\n",
	   status, cudaGetErrorString(status) );

  return 0;
}