GL_ARB_gpu_shader_int64 compiler breaks code logic.

Hello.
I’ve spent a lot of time trying to work around this issue and make my code produce correct results, but the GLSL compiler breaks everything. Here is a test application that reproduces the bug. I’ve removed 99% of the code from it to make it cleaner.

The bug depends on the exact shape of the code.
I can’t safely assign to a variable inside a loop.
Result should be:
2101479989,1917184202,4191109983,1779160146
But it is:
2101479989,0,4191109983,0

In practice the shaders are huge and array indices can be selected randomly, based on previous values, so the loops can’t be unrolled manually: everything becomes very slow and I reach the instruction limit. Each value can depend on the previous one, etc.
main.cpp

//-----------------------------------------------------------------------------
// Dependencies: glfw, glew
// Output must be:
// OUTPUT: 2101479989,1917184202,4191109983,1779160146
//-----------------------------------------------------------------------------

#include <GL/glew.h>
#include <GLFW/glfw3.h> // window, opengl context creation
#include <stdint.h> // uint32_t, uint16_t
#include <stdio.h> // FILE, fopen, fseek, ftell, fread, fclose
#include <stdlib.h> // exit
#include <string.h>
#include <iostream>
#include <string>
#include <vector>

//-----------------------------------------------------------------------------
// GLSL functions
//-----------------------------------------------------------------------------
// Checks the compile status of `shader`. On success this is a no-op; on
// failure it prints the driver's info log to stderr and terminates the
// process with exit code 1.
void printShaderInfoLog(GLuint& shader)
{
    GLint compiled;
    glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled);
    if (compiled == GL_TRUE)
        return;

    std::cerr << "Failed to compile shader!" << std::endl;

    // Fixed-size scratch buffer; longer logs are truncated by the driver.
    const int kLogCapacity = 10000;
    GLchar infoLog[kLogCapacity];
    glGetShaderInfoLog(shader, kLogCapacity, NULL, infoLog);
    std::cerr << infoLog << std::endl;

    exit(1);
}
//-----------------------------------------------------------------------------
// Checks the link status of `program`. On success this is a no-op; on
// failure it prints the driver's info log to stderr, deletes the program
// object and terminates the process with exit code 1.
void printProgramInfoLog(GLuint& program)
{
    GLint linked;
    glGetProgramiv(program, GL_LINK_STATUS, &linked);
    if (linked == GL_TRUE)
        return;

    std::cerr << "Failed to link program!" << std::endl;

    // Fixed-size scratch buffer; longer logs are truncated by the driver.
    const int kLogCapacity = 10000;
    GLchar infoLog[kLogCapacity];
    glGetProgramInfoLog(program, kLogCapacity, NULL, infoLog);
    std::cerr << infoLog << std::endl;

    glDeleteProgram(program);
    exit(1);
}
//-----------------------------------------------------------------------------
// Loads the GLSL source stored in `file_name`, compiles it as a shader of
// type `shader_type` (e.g. GL_VERTEX_SHADER) and returns the shader object.
//
// Any failure (file cannot be opened, cannot be read completely, or does not
// compile) is reported and terminates the process with exit code 1, so the
// returned handle always refers to a successfully compiled shader.
GLuint createShaderFromFile(const char* file_name, GLenum shader_type)
{
    // load from file...
    FILE* file = fopen(file_name, "rb");
    if (!file)
    {
        std::cout << "Failed to load shader: " << file_name << std::endl;
        exit(1);
    }

    // Determine the file size by seeking to the end; ftell() returns long,
    // and -1 on error.
    long length = -1;
    if (fseek(file, 0, SEEK_END) == 0)
        length = ftell(file);
    if (length < 0 || fseek(file, 0, SEEK_SET) != 0)
    {
        std::cout << "Failed to read shader file: " << file_name << std::endl;
        fclose(file);
        exit(1);
    }

    std::string source;
    source.resize(static_cast<size_t>(length));

    // Skip fread() for an empty file so that no element of an empty string
    // is ever addressed.
    size_t readLength = 0;
    if (!source.empty())
        readLength = fread(&source[0], 1, source.size(), file);
    fclose(file);
    if (readLength != source.size())
    {
        std::cout << "Failed to read shader file: " << file_name << std::endl;
        exit(1);
    }

    // glShaderSource expects an array of C-string pointers. The address of
    // the std::string object itself is NOT such a pointer, so hand over the
    // character data explicitly together with its length.
    const GLchar* sourceText = source.c_str();
    const GLint sourceLength = static_cast<GLint>(source.size());

    GLuint shader = glCreateShader(shader_type);
    glShaderSource(shader, 1, &sourceText, &sourceLength);
    glCompileShader(shader);
    printShaderInfoLog(shader);

    return shader;
}
//-----------------------------------------------------------------------------
// Builds a program object from a single compiled shader stage and links it.
// Link failures are handled by printProgramInfoLog(), which does not return.
GLuint linkProgram(GLuint shader)
{
    GLuint linked = glCreateProgram();

    glAttachShader(linked, shader);
    glLinkProgram(linked);

    printProgramInfoLog(linked);
    return linked;
}
//-----------------------------------------------------------------------------
// Builds a program object from two compiled shader stages (attached in the
// order given) and links it. Link failures are handled by
// printProgramInfoLog(), which does not return.
GLuint linkProgram(GLuint shader1, GLuint shader2)
{
    GLuint linked = glCreateProgram();

    glAttachShader(linked, shader1);
    glAttachShader(linked, shader2);
    glLinkProgram(linked);

    printProgramInfoLog(linked);
    return linked;
}
//-----------------------------------------------------------------------------

// One texel of the result image as it is read back to the CPU: four 32-bit
// unsigned components, matching the GL_RGBA_INTEGER / GL_UNSIGNED_INT
// readback format used in main().
struct pixResult
{
    uint32_t v[4]; // components in readback order
};

int main(int argc, char** argv)
{
//-----------------------------------------------------------------------------
// p 1. Initialize GLFW.
    if (!glfwInit())
        return -1;
    //  Create a windowed mode window and its OpenGL context
    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5);
    glfwWindowHint(GLFW_OPENGL_DEBUG_CONTEXT, GLFW_TRUE);
    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
    GLFWwindow* window = glfwCreateWindow(640, 480, "Loop Unroll", NULL, NULL);
    if (!window)
    {
        glfwTerminate();
        return -1;
    }

    // Make the window's context current
    glfwMakeContextCurrent(window);

    glewInit();

//-----------------------------------------------------------------------------
// p 2. Init a full-screen quad buffers.
    GLuint fsQuadVAO;
    GLuint fsQuadBuffers[2];

    const uint32_t numVertices = 4;
    float vertices[numVertices][2];
    vertices[0][0] = -1.0f; vertices[0][1] = -1.0f; vertices[1][0] = 1.0f; vertices[1][1] = -1.0f;
    vertices[2][0] = 1.0f; vertices[2][1] = 1.0f; vertices[3][0] = -1.0f; vertices[3][1] = 1.0f;

    const uint32_t numElements = 6;
    const uint16_t elements[numElements] = {0, 1, 2, 2, 3, 0};

    glGenVertexArrays(1, &fsQuadVAO);
    glGenBuffers(2, fsQuadBuffers);

    glBindVertexArray(fsQuadVAO);
    glBindBuffer(GL_ARRAY_BUFFER, fsQuadBuffers[0]);
    glBufferData(GL_ARRAY_BUFFER, 8*numVertices, vertices, GL_STATIC_DRAW);
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, fsQuadBuffers[1]);
    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(unsigned short)*numElements, elements, GL_STATIC_DRAW);
    glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, 0);
    glEnableVertexAttribArray(0);
    glBindVertexArray(0);

//-----------------------------------------------------------------------------
// p 3. Create texture and pbo.
    const GLsizei imgRes = 8;
    const GLenum internalFormat = GL_RGBA32UI;
    const GLsizei numComponents = 4;
    const GLsizei sizeOfDataType = sizeof(uint32_t)*numComponents;
    // 3.1. texture
    GLuint texDynamic;
    glGenTextures(1, &texDynamic);
    glBindTexture(GL_TEXTURE_2D, texDynamic);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
    glTexStorage2D(GL_TEXTURE_2D, 1, internalFormat, imgRes, imgRes);
    glBindTexture(GL_TEXTURE_2D, 0);
    // 3.2. pbo
    GLuint pbo;
    glGenBuffers(1, &pbo);
    glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo);
    glBufferData(GL_PIXEL_PACK_BUFFER, imgRes * imgRes * sizeOfDataType, 0, GL_STREAM_READ);
    glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
    // 3.3 pixel data on the CPU
    pixResult* pixels = new pixResult[imgRes*imgRes];

//-----------------------------------------------------------------------------
// p 4. Load shaders.
    GLuint vsQuad = createShaderFromFile("quad.vert", GL_VERTEX_SHADER);
    GLuint fsCompute = createShaderFromFile("unroll.frag", GL_FRAGMENT_SHADER);
    GLuint glslTest = linkProgram(vsQuad, fsCompute);
    glDeleteShader(vsQuad);
    glDeleteShader(fsCompute);

//-----------------------------------------------------------------------------
// p.5 Computations.

    glViewport(0, 0, imgRes, imgRes);
    //-------------------------------------
    // p.5.1 Do some computations.
    glBindImageTexture(0, texDynamic, 0, GL_FALSE, 0, GL_WRITE_ONLY, internalFormat);
    // bind a shader program
    glUseProgram(glslTest);
    // draw a full-screen quad
    glBindVertexArray(fsQuadVAO);
    glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_SHORT, 0);
    glBindVertexArray(0);
    // unbind
    glUseProgram(0);
    glBindImageTexture(0, 0, 0, GL_FALSE, 0, GL_WRITE_ONLY, internalFormat);

    glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
    //-------------------------------------
    // p.5.2. Transfer image to the PBO
    // bind pbo and texture
    glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo);
    glBindTexture(GL_TEXTURE_2D, texDynamic);
    glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA_INTEGER, GL_UNSIGNED_INT, (GLvoid*)0);
    //-------------------------------------
    // p.5.3. Map PBO and copy to the CPU memory
    uint32_t* ptr = (uint32_t*)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
    // copy pbo data to the cpu memory
    memcpy(pixels, ptr, imgRes * imgRes * sizeOfDataType); 
    // unmap buffer
    glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
    //-------------------------------------
    // p 5.4 Swap front and back buffers.
    glfwSwapBuffers(window);
    //-------------------------------------
    // p.5.5. Print result for a selected pixel
    // Must be:
    // OUTPUT: 2101479989,1917184202,4191109983,1779160146
    size_t p = 0;
    std::cout << "OUTPUT: " << pixels[p].v[0] << "," << pixels[p].v[1] << "," << pixels[p].v[2] << "," << pixels[p].v[3] << std::endl;

//-----------------------------------------------------------------------------
// p 6. Destroy texture, pbo and glsl program
    glDeleteProgram(glslTest);
    glDeleteTextures(1, &texDynamic);
    glDeleteBuffers(1, &pbo);
    // delete cpu-side pixel data
    delete[] pixels;

//-----------------------------------------------------------------------------
// p 7. Destroy FSQuad
    glDeleteVertexArrays(1, &fsQuadVAO);
    glDeleteBuffers(2, fsQuadBuffers);

    glfwTerminate();

    return 0;
}

quad.vert

#version 450 core

// Pass-through vertex stage for the full-screen quad: the incoming 2D
// position is already in normalized device coordinates.
layout(location = 0) in vec2 vertexPositionNDC;

void main()
{
    gl_Position = vec4(vertexPositionNDC.x, vertexPositionNDC.y, 0.0, 1.0);
}

unroll.frag

#version 450 core

// Minimal reproduction of a miscompiled loop that stores 64-bit vectors into
// a local array. Every fragment performs the same computation and writes the
// four 32-bit words of m[0] to its texel of `dynamicImage`.
//
// NOTE: the statement shape below is the test case itself. Do not restructure
// it; the wrong result only shows up for particular loop counts / offsets.

#extension GL_ARB_gpu_shader_int64: enable

// Result image; the host binds an RGBA32UI texture to image unit 0.
layout(binding = 0, rgba32ui) writeonly uniform uimage2D dynamicImage;
 
void main()
{
// OUTPUT should be:
// OUTPUT: 2101479989,1917184202,4191109983,1779160146

    // Eight arbitrary 32-bit input words.
    uint b[8];
    b[0] = 2101479989U;
    b[1] = 1917184202U;
    b[2] = 4191109983U;
    b[3] = 1779160146U;
    b[4] = 2746074432U;
    b[5] = 2613207917U;
    b[6] = 4045743315U;
    b[7] = 4014514165U;

    // Pack the words pairwise into 64-bit values; the first component of each
    // uvec2 becomes the low 32 bits. st[0] therefore holds b[0..3] and is the
    // value expected in the output.
    u64vec2 st[3];
    st[0].x = packUint2x32(uvec2(b[0], b[1]));
    st[0].y = packUint2x32(uvec2(b[2], b[3]));
    st[1].x = packUint2x32(uvec2(b[4], b[5]));
    st[1].y = packUint2x32(uvec2(b[6], b[7]));
    st[2] = st[0];

    // Destination array; only some elements are written. m[0] is the one
    // that is read back at the end.
    u64vec2 m[48];

// v1. DOESN'T WORK!
// Produces wrong result (the high 32 bits of both 64-bit values are lost):
// OUTPUT: 2101479989,0,4191109983,0

// Should be:
// OUTPUT: 2101479989,1917184202,4191109983,1779160146
//-----------------
    // It may work OK depending on the iteration count and "io" variable.
    // io takes the values 42, 36, ..., 6, 0, so the last iteration stores
    // st[0] into m[0].
    int io = 42;
    for (int i = 0; i < 8; ++i)
    {
        m[io+0] = st[0];
        m[io+1] = st[1];
        m[io+2] = st[2]; // Works OK if comment this line
        io -= 6;
    }
//-----------------
// v2. WORKS!
    // copy manually (the same eight iterations, unrolled by hand)
    /*int io = 42;
    m[io+0] = st[0];
    m[io+1] = st[1];
    m[io+2] = st[2];
    io -= 6;
    m[io+0] = st[0];
    m[io+1] = st[1];
    m[io+2] = st[2];
    io -= 6;
    m[io+0] = st[0];
    m[io+1] = st[1];
    m[io+2] = st[2];
    io -= 6;
    m[io+0] = st[0];
    m[io+1] = st[1];
    m[io+2] = st[2];
    io -= 6;
    m[io+0] = st[0];
    m[io+1] = st[1];
    m[io+2] = st[2];
    io -= 6;
    m[io+0] = st[0];
    m[io+1] = st[1];
    m[io+2] = st[2];
    io -= 6;
    m[io+0] = st[0];
    m[io+1] = st[1];
    m[io+2] = st[2];
    io -= 6;
    m[io+0] = st[0];
    m[io+1] = st[1];
    m[io+2] = st[2];
    io -= 6;*/

/*
//-----------------
// v3. WORKS!
// NOTE(review): as written, io is -6 after the 7-iteration loop below, so the
// three trailing stores index m[-6..-4]. Presumably io was meant to start at
// 42 here; confirm before re-enabling this variant.
    int io = 36;
    for (int i = 0; i < 7; ++i)
    {
        m[io+0] = st[0];
        m[io+1] = st[1];
        m[io+2] = st[2];
        io -= 6;
    }
    m[io+0] = st[0];
    m[io+1] = st[1];
    m[io+2] = st[2];
    io -= 6;
*/

    // Split m[0] back into four 32-bit words (low word first) and store them
    // in this fragment's texel.
    uvec2 v01 = unpackUint2x32(m[0].x);
    uvec2 v23 = unpackUint2x32(m[0].y);
    imageStore(dynamicImage, ivec2(gl_FragCoord.xy), uvec4(v01.x, v01.y, v23.x, v23.y));
}

GPU: Nvidia GeForce GTX 690.
Driver Version: 384.98

Do you share the same or a similar codebase for this OpenGL extension with the open-source RadeonSI driver?
I’ve had this issue with AMD Radeon HD 7850 in the past and it was fixed.