Can I use a union inside a __global__ function and access its array members by index?

For example:

/* Overlays two 32-bit words and eight bytes on the same storage. */
union psw_64
	{
		uint32_t	p32[2];	/* word view */
		uint8_t		p[8];	/* byte view */
	};

	/* Instance that the loop below fills byte by byte. */
	union psw_64 psw;

and use:

for (int i = characters - 1; i >= 0; i--) {
			psw.p[i] = charset[counters[i]];
indata[0] = psw.p32[0];

What did you observe when you tried it?

The following assignment does not seem to store any data:

psw.p[i] = charset[counters[i]];

See this thread.

I extended your snippet into a complete program, and the union seems to work just fine. The program prints the following output:

indata = 6e6a6662 7a787672

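This corresponds to the letters n, j, f, b and z, x, v, r respectively, as selected from alphabet by the index values stored in the counters array. The letters appear in reverse index order within each word because the GPU is little-endian: p[0] ('b', 0x62) is the least significant byte of p32[0], and p[4] ('r', 0x72) is the least significant byte of p32[1].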

Generally speaking, the use of unions to reinterpret a byte string between different data types is not advised, as it can run into issues with aliasing rules. The canonical C++ way to perform any kind of reinterpretation is to use memcpy().

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Eight bytes of storage viewable either as two 32-bit words or as eight
   individual bytes; both members overlay the same memory. */
union psw_64 {
    uint32_t p32[2];   /* word view: two 32-bit values */
    uint8_t p[8];      /* byte view: eight 8-bit values */
};

/*
 * Packs up to eight characters, selected from charset by the indices in
 * counters, into the byte view of a psw_64 union and stores the two 32-bit
 * words of that union in indata[0] and indata[1].
 *
 * charset    - device-accessible character table
 * counters   - device-accessible indices into charset; entries
 *              0 .. characters-1 are read (at most eight)
 * indata     - device-accessible output, at least two uint32_t
 * characters - number of byte positions to fill; values larger than the
 *              union's byte capacity are clamped, values <= 0 fill nothing
 *
 * No thread indexing is used, so every launched thread performs the same
 * reads and writes.
 *
 * Reading p32 after writing p is union type punning; ISO C++ does not
 * guarantee it, and memcpy() is the portable alternative.
 */
__global__ void kernel (const char *charset, const uint32_t *counters, 
                        uint32_t *indata, int characters)
{
    union psw_64 psw;
    /* Zero the whole union so bytes that are not filled below are defined. */
    psw.p32[0] = 0;
    psw.p32[1] = 0;

    /* Never index past the end of psw.p, whatever the caller passes. */
    const int capacity = (int) sizeof (psw.p);
    const int count = (characters < capacity) ? characters : capacity;

    for (int i = count - 1; i >= 0; i--) {
        psw.p[i] = charset[counters[i]];
    }
    indata[0] = psw.p32[0];
    indata[1] = psw.p32[1];
}

/* Terminates the program with a diagnostic naming the failed step when a
   CUDA runtime call reports anything other than cudaSuccess. */
static void checkCuda (cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf (stderr, "CUDA error during %s: %s\n", what,
                 cudaGetErrorString (status));
        exit (EXIT_FAILURE);
    }
}

/*
 * Host driver: uploads a 26-letter alphabet and eight index values, runs
 * kernel() with a single thread, copies the two packed words back and
 * prints them in hexadecimal.
 *
 * Returns EXIT_SUCCESS; any CUDA failure exits with EXIT_FAILURE.
 */
int main (void)
{
    const char alphabet[] = "abcdefghijklmnopqrstuvwxyz";
    const uint32_t counters[8] = {1, 5, 9, 13, 17, 21, 23, 25};
    const size_t alphabet_bytes = sizeof (alphabet) - 1;  /* exclude the NUL */
    const int characters = (int)(sizeof (counters) / sizeof (counters[0]));
    uint32_t indata[2] = {0, 0};

    char *charset_d = 0;
    uint32_t *counters_d = 0;
    uint32_t *indata_d = 0;

    checkCuda (cudaMalloc ((void**)&charset_d, alphabet_bytes),
               "cudaMalloc (charset)");
    checkCuda (cudaMalloc ((void**)&counters_d, sizeof (counters)),
               "cudaMalloc (counters)");
    checkCuda (cudaMalloc ((void**)&indata_d, sizeof (indata)),
               "cudaMalloc (indata)");

    checkCuda (cudaMemcpy (charset_d, alphabet, alphabet_bytes,
                           cudaMemcpyHostToDevice),
               "cudaMemcpy (charset)");
    checkCuda (cudaMemcpy (counters_d, counters, sizeof (counters),
                           cudaMemcpyHostToDevice),
               "cudaMemcpy (counters)");

    kernel<<<1,1>>>(charset_d, counters_d, indata_d, characters);
    /* A launch returns no status itself: query the launch error first, then
       synchronize so that faults raised while the kernel runs are reported
       before the result is read. */
    checkCuda (cudaGetLastError (), "kernel launch");
    checkCuda (cudaDeviceSynchronize (), "kernel execution");

    checkCuda (cudaMemcpy (indata, indata_d, sizeof (indata),
                           cudaMemcpyDeviceToHost),
               "cudaMemcpy (indata)");
    printf ("indata = %08x %08x\n", indata[0], indata[1]);

    checkCuda (cudaFree (charset_d), "cudaFree (charset)");
    checkCuda (cudaFree (counters_d), "cudaFree (counters)");
    checkCuda (cudaFree (indata_d), "cudaFree (indata)");
    return EXIT_SUCCESS;
}

Thank you for your help.