Unless I misunderstand the desired functionality, this seems to work as intended with the CUDA 3.2 toolchain. Below is my small test app, which wraps the code that was posted.
#include <stdio.h>
#include <stdlib.h>
// Accumulates Ky products-sums per thread and applies them to W.
//
// For every (row, col) in an My x Mx sweep, the scalar d[row*stride_d + col]
// is multiplied with Ky samples of prev_y spaced stride_prev_y apart, and each
// product is added to its own accumulator. Afterwards W[k] -= eta * acc[k]
// for k in [0, Ky).
//
// Ky is a template parameter, so every loop over the accumulator array has a
// compile-time trip count and is marked with #pragma unroll.
//
// NOTE(review): W is not offset by the thread index inside this function, so
// every thread that calls it updates the same Ky elements. Presumably the
// caller passes a per-thread pointer or the launch shape makes this safe;
// confirm against the call site.
template <int Ky>
__device__ void func (float eta, int Sxp1, int Sy, int My, int Mx, const float *d, float *W, const float *prev_y, int stride_d, int stride_prev_y)
{
    float acc[Ky];

    // Flat global thread index along x; used as the base offset into prev_y.
    const int gid = blockDim.x * blockIdx.x + threadIdx.x;

#pragma unroll
    for (int k = 0; k < Ky; ++k) {
        acc[k] = 0.0f;
    }

    for (int row = 0; row < My; ++row) {
        // Starting offset into prev_y for this row; advanced by Sxp1 per column.
        int base = (Sy + 1) * row * stride_prev_y + gid;

        for (int col = 0; col < Mx; ++col, base += Sxp1) {
            const float scale = d[row * stride_d + col];

#pragma unroll
            for (int k = 0; k < Ky; ++k) {
                acc[k] += prev_y[base + k * stride_prev_y] * scale;
            }
        }
    }

    // Apply the scaled accumulators to the first Ky elements of W.
#pragma unroll
    for (int k = 0; k < Ky; ++k) {
        W[k] -= eta * acc[k];
    }
}
// Kernel entry point used by this test file: instantiates func with Ky = 10
// and forwards every argument to it unchanged. The kernel adds no indexing,
// bounds checks, or synchronization of its own.
__global__ void kernel (float eta, int Sxp1, int Sy, int My, int Mx, const float *d, float *W, const float *prev_y, int stride_d, int stride_prev_y)
{
func<10>(eta, Sxp1, Sy, My, Mx, d, W, prev_y, stride_d, stride_prev_y);
}
// Host entry point. It makes no CUDA API calls and never launches the kernel;
// it only returns EXIT_SUCCESS.
// NOTE(review): presumably this file exists only to be compiled so the
// generated code can be inspected; confirm before relying on it at runtime.
int main (void)
{
return EXIT_SUCCESS;
}
I compiled the code above as follows:
nvcc -o kernel --keep -Xptxas -v -arch={sm_20|sm_10} kernel.cu
I don’t see any local memory references in either the sm_10 or sm_20 version of the code (I checked the PTX as well as the machine code; registered developers can download cuobjdump to disassemble sm_1x machine code in .cubin files). PTXAS reports:
ptxas info : Compiling entry function '_Z6kernelfiiiiPKfPfS0_ii' for 'sm_10'
ptxas info : Used 37 registers, 56+16 bytes smem
ptxas info : Compiling entry function '_Z6kernelfiiiiPKfPfS0_ii' for 'sm_20'
ptxas info : Used 42 registers, 88 bytes cmem[0]