3D texture interpolation broken for small textures

I want to use 3D textures to implement a 3D display LUT. As a test, I wrote a program that operates as follows. I synthesize a 256x256 image that contains a left-to-right monochrome gradient, where each pixel holds its x coordinate in each of the RGB components, and load it into a 2D texture. I then create a 3D texture that is a 2x2x2 identity LUT. I draw the pixels, applying the LUT in the fragment shader, and then read the pixels back from the framebuffer. I expected to get back the same pixels that I sent up in the image texture, and this is true for the R and B channels. But, very strangely, the G channel does NOT match, and the error is non-linear!

The number of errors varies with the size of the identity LUT: the larger I make the LUT, the fewer errors there are. When I get to a 9x9x9 identity LUT, the errors disappear. So I guess that, to get the desired precision in this case, I will always have to use a 16x16x16 or larger texture!

Here is the program:

#include <iostream>
#include <iomanip>
#include <GL/glew.h>
#include <GLFW/glfw3.h>
#include "common/checkglerror.hpp"
// Handle of the GLFW window; assigned in main().
GLFWwindow* window;

// Size, in pixels, of the synthetic test image. main() also creates the
// window at this size.
const unsigned imageWidth = 256u;
const unsigned imageHeight = 256u;

// 8-bit RGB test image, indexed as [row][column][channel].
GLubyte imageData[imageHeight][imageWidth][3];

// GLSL source of the pass-through vertex shader.
// Inputs:  position (2D vertex position) and texCoord (2D texture coordinate).
// Outputs: fragTexCoord, an unmodified copy of texCoord, and gl_Position,
//          built from position with z = 0.0 and w = 1.0.
const GLchar* vertexSource = R"glsl(
    #version 450 core
    in vec2 position;
    in vec2 texCoord;
    out vec2 fragTexCoord;
    void main()
    {
        fragTexCoord = texCoord;
        gl_Position = vec4(position, 0.0, 1.0);
    }
)glsl";

// GLSL source of the fragment shader that applies the 3D LUT.
// For each fragment it samples the source image (imageTextureUnit) at
// fragTexCoord, remaps the sampled RGB with
//     lutIn = rgb * lutInterpScale + lutInterpOffset
// and uses lutIn as the 3D coordinate for a lookup in lutTextureUnit. The
// result of that lookup is written to outColor.
// NOTE(review): the lookup relies on the GPU's built-in linear filtering.
// Hardware filtering may evaluate interpolation weights in low-precision
// fixed point (NVIDIA's CUDA documentation describes 8 fractional bits), which
// is presumably why a very small LUT shows +/-1 errors - confirm. If exact
// results are required, consider fetching the neighbouring texels with
// texelFetch() and interpolating in full float precision in the shader.
const GLchar* fragmentSource = R"glsl(
    #version 450 core
    in vec2 fragTexCoord;
    out vec4 outColor;
    uniform sampler2D imageTextureUnit;
    uniform sampler3D lutTextureUnit;
    uniform float lutInterpScale;
    uniform float lutInterpOffset;
    void main()
    {
        vec4 srcColor = texture(imageTextureUnit, fragTexCoord);
        vec3 lutIn = vec3(srcColor.rgb * lutInterpScale + lutInterpOffset);
        outColor = texture(lutTextureUnit, lutIn);
    }
)glsl";

int main(void)
{
    glfwInit();
    glfwWindowHint(GLFW_SAMPLES, 4);
    glfwWindowHint(GLFW_RESIZABLE, GL_FALSE);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
    glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); // To make MacOS happy; should not be needed
    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
    window = glfwCreateWindow(imageWidth, imageHeight, "3D Lut Test", NULL, NULL);
    glfwMakeContextCurrent(window);

    glewExperimental = true; // Needed for core profile
    glewInit();

    GLuint vertexArrayID, vertexBufferID, elementBufferID;
    glGenVertexArrays(1, &vertexArrayID);
    glBindVertexArray(vertexArrayID);

    static GLfloat vertexBufferData[] =
    {
        //  VERTEX POS    TEXTURE COORD
        -1.0f,  1.0f,    0.0f,  1.0f,     //  top left
         1.0f,  1.0f,    1.0f,  1.0f,     //  top right
         1.0f, -1.0f,    1.0f,  0.0f,     //  bottom right
        -1.0f, -1.0f,    0.0f,  0.0f,     //  bottom left
    };
    glGenBuffers(1, &vertexBufferID);
    glBindBuffer(GL_ARRAY_BUFFER, vertexBufferID);
    glBufferData(GL_ARRAY_BUFFER, sizeof(vertexBufferData), vertexBufferData, GL_STATIC_DRAW);

    static unsigned int triangleIndices[2][3] =
    {
        {0,  1,  2}, // first image triangle
        {2,  3,  0}, // second image triangle
    };
    glGenBuffers(1, &elementBufferID);
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, elementBufferID);
    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(triangleIndices), triangleIndices, GL_STATIC_DRAW);

    // Create an 8-bit RGB test image - each row is a monochrome 0-256 gradient.
    for (auto row = 0; row < imageHeight; ++row)
    {
        for (auto col = 0; col < imageWidth; ++col)
        {
            imageData[row][col][0] = col;  // r
            imageData[row][col][1] = col;  // g
            imageData[row][col][2] = col;  // b
        }
    }

    // Image texture
    GLuint imageTextureID;
    glGenTextures(1, &imageTextureID);
    glActiveTexture(GL_TEXTURE0);
    glBindTexture(GL_TEXTURE_2D, imageTextureID);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
    glPixelStorei(GL_UNPACK_ROW_LENGTH, imageWidth);
    check_gl_error();
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, imageWidth, imageHeight, 0, GL_RGB, GL_UNSIGNED_BYTE, nullptr);
    check_gl_error();

    // Set up NxNxN identity LUT 3D texture
    const int lutSize = 2;
    static GLfloat lutValues[lutSize*lutSize*lutSize][3];
    for (int b = 0; b < lutSize; ++b)
    {
        for (int g = 0; g < lutSize; ++g)
        {
            for (int r = 0; r < lutSize; ++r)
            {
                int position = b * lutSize * lutSize + g * lutSize + r;
                lutValues[position][0] = r / GLfloat(lutSize - 1);
                lutValues[position][1] = g / GLfloat(lutSize - 1);
                lutValues[position][2] = b / GLfloat(lutSize - 1);
            }
        }
    }

    GLuint lutTextureId;
    glGenTextures(1, &lutTextureId);
    glActiveTexture(GL_TEXTURE1);
    glBindTexture(GL_TEXTURE_3D, lutTextureId);
    glTexParameterf(GL_TEXTURE_3D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
    glTexParameterf(GL_TEXTURE_3D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
    glTexParameterf(GL_TEXTURE_3D, GL_TEXTURE_WRAP_R, GL_CLAMP_TO_EDGE);
    glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);

    glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
    glPixelStorei(GL_UNPACK_ROW_LENGTH, lutSize);
    glTexImage3D(GL_TEXTURE_3D, 0, GL_RGB, lutSize, lutSize, lutSize, 0, GL_RGB, GL_FLOAT, lutValues);

    // Create the shader program
    GLuint vertexShader = glCreateShader(GL_VERTEX_SHADER);
    glShaderSource(vertexShader, 1, &vertexSource, NULL);
    glCompileShader(vertexShader);
    GLuint fragmentShader = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(fragmentShader, 1, &fragmentSource, NULL);
    glCompileShader(fragmentShader);
    GLuint shaderProgram = glCreateProgram();
    glAttachShader(shaderProgram, vertexShader);
    glAttachShader(shaderProgram, fragmentShader);
    glLinkProgram(shaderProgram);
    glUseProgram(shaderProgram);

    // Set up attributes, uniforms, and fragment output.
    GLint posAttrib = glGetAttribLocation(shaderProgram, "position");
    glEnableVertexAttribArray(posAttrib);
    glVertexAttribPointer(posAttrib, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(GLfloat), 0);
    GLint texAttrib = glGetAttribLocation(shaderProgram, "texCoord");
    glEnableVertexAttribArray(texAttrib);
    glVertexAttribPointer(texAttrib, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(GLfloat), (void*)(2 * sizeof(GLfloat)));
    GLuint imageTextureUniform = glGetUniformLocation(shaderProgram, "imageTextureUnit");
    glUniform1i(imageTextureUniform, 0);  // image --> texture unit 0
    GLuint lutTextureUniform = glGetUniformLocation(shaderProgram, "lutTextureUnit");
    glUniform1i(lutTextureUniform, 1);    // lut --> texture unit 1
    GLuint lutInterpScaleUniform = glGetUniformLocation(shaderProgram, "lutInterpScale");
    glUniform1f(lutInterpScaleUniform, (lutSize - 1.0F) / lutSize);
    GLuint lutInterpOffsetUniform = glGetUniformLocation(shaderProgram, "lutInterpOffset");
    glUniform1f(lutInterpOffsetUniform, 1.0F / (2.0F * lutSize));
    glBindFragDataLocation(shaderProgram, 0, "outColor");

    glActiveTexture(GL_TEXTURE0);
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
    glPixelStorei(GL_UNPACK_ROW_LENGTH, imageWidth);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imageWidth, imageHeight, GL_RGB, GL_UNSIGNED_BYTE, imageData);
    check_gl_error();

    // Draw
    glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0);
    glfwSwapBuffers(window);

    // Read back one row of the frame buffer image and compare with orig image
    unsigned char readBuffer[256][3];
    glReadPixels(0, 0, 256, 1, GL_RGB, GL_UNSIGNED_BYTE, readBuffer);
    std::cout << std::endl << " IN  dR dG dB";
    for (auto i = 0; i < 256; ++i)
    {
        std::cout << std::endl << std::setw(3) << i << ":";
        for (auto j = 0; j < 3; ++j)
        {
            int diff = int(readBuffer[i][j]) - int(imageData[0][i][j]);
            std::cout << " " << std::setw(2) << diff;
        }
    }
    std::cout << std::endl;

    while (glfwWindowShouldClose(window) == 0)
    {
        glfwWaitEvents();
    }

    glfwTerminate();
    return 0;
}

And here is the output:

IN  dR dG dB
  0:  0  0  0
  1:  0  0  0
  2:  0  0  0
  3:  0  0  0
  4:  0  0  0
  5:  0  0  0
  6:  0  0  0
  7:  0  0  0
  8:  0 -1  0
  9:  0 -1  0
 10:  0 -1  0
 11:  0 -1  0
 12:  0  1  0
 13:  0  1  0
 14:  0  1  0
 15:  0  0  0
 16:  0  0  0
 17:  0  0  0
 18:  0  0  0
 19:  0 -1  0
 20:  0 -1  0
 21:  0  1  0
 22:  0  0  0
 23:  0  0  0
 24:  0  0  0
 25:  0 -1  0
 26:  0 -1  0
 27:  0  1  0
 28:  0  0  0
 29:  0  0  0
 30:  0 -1  0
 31:  0 -1  0
 32:  0  0  0
 33:  0  1  0
 34:  0  1  0
 35:  0  0  0
 36:  0  0  0
 37:  0  1  0
 38:  0  1  0
 39:  0  0  0
 40:  0 -1  0
 41:  0 -1  0
 42:  0  0  0
 43:  0  0  0
 44:  0 -1  0
 45:  0  1  0
 46:  0  0  0
 47:  0  0  0
 48:  0  0  0
 49:  0  1  0
 50:  0  0  0
 51:  0  0  0
 52:  0 -1  0
 53:  0  0  0
 54:  0  0  0
 55:  0  0  0
 56:  0  1  0
 57:  0  1  0
 58:  0  0  0
 59:  0 -1  0
 60:  0  0  0
 61:  0  1  0
 62:  0  0  0
 63:  0  1  0
 64:  0  0  0
 65:  0 -1  0
 66:  0  0  0
 67:  0  1  0
 68:  0  0  0
 69:  0 -1  0
 70:  0  0  0
 71:  0  0  0
 72:  0  1  0
 73:  0  0  0
 74:  0 -1  0
 75:  0  0  0
 76:  0  1  0
 77:  0  0  0
 78:  0 -1  0
 79:  0  0  0
 80:  0  0  0
 81:  0 -1  0
 82:  0  0  0
 83:  0  0  0
 84:  0 -1  0
 85:  0  0  0
 86:  0  0  0
 87:  0 -1  0
 88:  0  0  0
 89:  0  0  0
 90:  0 -1  0
 91:  0  0  0
 92:  0  0  0
 93:  0 -1  0
 94:  0  1  0
 95:  0 -1  0
 96:  0  0  0
 97:  0  1  0
 98:  0  1  0
 99:  0  1  0
100:  0  0  0
101:  0  0  0
102:  0 -1  0
103:  0  0  0
104:  0  0  0
105:  0  0  0
106:  0  0  0
107:  0  0  0
108:  0 -1  0
109:  0  1  0
110:  0  0  0
111:  0  0  0
112:  0  0  0
113:  0  0  0
114:  0  0  0
115:  0 -1  0
116:  0  1  0
117:  0 -1  0
118:  0  0  0
119:  0  1  0
120:  0  0  0
121:  0  0  0
122:  0  0  0
123:  0  0  0
124:  0  0  0
125:  0  0  0
126:  0  0  0
127:  0  0  0
128:  0  0  0
129:  0  0  0
130:  0  0  0
131:  0  0  0
132:  0  0  0
133:  0  0  0
134:  0  0  0
135:  0  0  0
136:  0 -1  0
137:  0  0  0
138:  0  1  0
139:  0 -1  0
140:  0  1  0
141:  0  0  0
142:  0  0  0
143:  0  0  0
144:  0  0  0
145:  0  0  0
146:  0 -1  0
147:  0  1  0
148:  0  0  0
149:  0  0  0
150:  0  0  0
151:  0  0  0
152:  0  0  0
153:  0  1  0
154:  0  0  0
155:  0  0  0
156:  0 -1  0
157:  0 -1  0
158:  0 -1  0
159:  0  0  0
160:  0  1  0
161:  0 -1  0
162:  0  1  0
163:  0  0  0
164:  0  0  0
165:  0  1  0
166:  0  0  0
167:  0  0  0
168:  0  1  0
169:  0  0  0
170:  0  0  0
171:  0  1  0
172:  0  0  0
173:  0  0  0
174:  0  1  0
175:  0  0  0
176:  0  0  0
177:  0  1  0
178:  0  0  0
179:  0 -1  0
180:  0  0  0
181:  0  1  0
182:  0  0  0
183:  0 -1  0
184:  0  0  0
185:  0  0  0
186:  0  1  0
187:  0  0  0
188:  0 -1  0
189:  0  0  0
190:  0  1  0
191:  0  0  0
192:  0 -1  0
193:  0  0  0
194:  0 -1  0
195:  0  0  0
196:  0  1  0
197:  0  0  0
198:  0 -1  0
199:  0 -1  0
200:  0  0  0
201:  0  0  0
202:  0  0  0
203:  0  1  0
204:  0  0  0
205:  0  0  0
206:  0 -1  0
207:  0  0  0
208:  0  0  0
209:  0  0  0
210:  0 -1  0
211:  0  1  0
212:  0  0  0
213:  0  0  0
214:  0  1  0
215:  0  1  0
216:  0  0  0
217:  0 -1  0
218:  0 -1  0
219:  0  0  0
220:  0  0  0
221:  0 -1  0
222:  0 -1  0
223:  0  0  0
224:  0  1  0
225:  0  1  0
226:  0  0  0
227:  0  0  0
228:  0 -1  0
229:  0  1  0
230:  0  1  0
231:  0  0  0
232:  0  0  0
233:  0  0  0
234:  0 -1  0
235:  0  1  0
236:  0  1  0
237:  0  0  0
238:  0  0  0
239:  0  0  0
240:  0  0  0
241:  0 -1  0
242:  0 -1  0
243:  0 -1  0
244:  0  1  0
245:  0  1  0
246:  0  1  0
247:  0  1  0
248:  0  0  0
249:  0  0  0
250:  0  0  0
251:  0  0  0
252:  0  0  0
253:  0  0  0
254:  0  0  0
255:  0  0  0

I forgot to mention that this was with driver 442.19 on a GTX 970, but I saw similar results on a 1080 Ti with the Studio driver and on an Alienware laptop with an RTX 2080.