syncthreads doesn't stop a half of threads.

void calculate(int N, volatile float * out, struct com * dev, struct com *  in)
{
	int th = threadIdx.x + blockDim.x * blockIdx.x;

	if(th < N)
	{
		if(th < N/2){
	
			a = in[th];
			in[th] = complexaddition(in[th], in[th + k]);
			in[th + k] = complexsubtraction(a, in[th + k]);
			in[th + k] = complexmult(in[th + k], first[((th - m) * N / k / 2)]);

				printf(" %i:\t%f\n",th,in[th].real);  //correct result, first 4 values of "in" array changed
				printf(" %i:\t%f\n",th,in[th+k].real); //correct result, last 4 values of "in" array changed
		}
	        __syncthreads();
	printf("\thelp %i:\t%f\n",th,in[th].real);  // writes that only last 4 values of "in" array changed, another half, which is printed the earliest, prints without any changes
}

current configurations: 2 blocks, 4 threads per each block

last printf function writes in[4].real, in[5].real, in[6].real and in[7].real correctly, with changed values.
While in[0].real, in[1].real, in[2].real and in[3].real are printed before the nested “if” is permormed.

what is N ?

number of elements, equals to 8
and k = 4.

I would expect some strangeness in behavior, but I would expect that possibly the last 4 values get printed before the changes.

Can you provide a complete code and the exact printout for this?
What GPU are you running on?

if (th < N)  // divergent control flow
{
    [...]
    __syncthreads();
    [...]
}

Unless I am missing something the posted code includes the use of __syncthreads() in a divergent control flow, giving rise to undefined behavior.

N is 8. th is less than 8 always, assuming the OP’s statements are correct (two blocks, 4 threads each)

GeForce 710M
( 2) Multiprocessors, ( 48) CUDA Cores/MP
CUDA Capability Major/Minor version number: 2.1

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define _USE_MATH_DEFINES
#include <math.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <iostream>
#include <vector>
#include <fstream>
#include <string>
#include <algorithm>

using namespace std;

#define CHECK(call)														   \
{																		   \
	const cudaError_t error = call;										   \
	if (error != cudaSuccess)										       \
{																	   \
	printf("Error: %s:%d, ", __FILE__, __LINE__);					   \
	printf("Code:%d, Reason: %s\n", error, cudaGetErrorString(error)); \
	getchar();														   \
	exit(1);														   \
}																	   \
}

struct com
{
	float real;
	float imaginary;
};

#define BLOCK_SIZE (1 << 2)
static int N = (1 << 3);

__device__ com complexmult(com a, com b)
{
	com dev;
	dev.real = a.real * b.real - a.imaginary * b.imaginary;
	dev.imaginary = a.real * b.imaginary + a.imaginary * b.real;
	return dev;
}
__device__ com complexaddition(com a, com b)
{
	com dev;
	dev.real = a.real + b.real;
	dev.imaginary = a.imaginary + b.imaginary;
	return dev;
}
__device__ com complexsubtraction(com a, com b)
{
	com dev;
	dev.real = a.real - b.real;
	dev.imaginary = a.imaginary - b.imaginary;			
	return dev;
}

__global__
	void calculate(int N, struct com * dev, struct com *  in)
{
	extern __shared__ struct com help[]; 
	int k, m;
	com a;
	struct com * first = &dev[0], * second= &dev[N];  // вспомогательный для перестановки
	struct com * third = &second[N], * fourth= &third[N], * fifth = &fourth[N];  // после перестановки
	struct com * sixth = &fifth[blockDim.x];// вспомогательный для перестановки

	int th = threadIdx.x + blockDim.x * blockIdx.x;
        
        printf("\tinput array  %i:\t%f\n",th,in[th].real);

	if(th < N/2){
		first[th].real = cospif(2.0 * (float)(th) / (float)(N));
		first[th].imaginary = sinpif(2.0 * (float)(th) / (float)(N));
	}
	__syncthreads();

	if(th < N/2){
		k = (N / 2); //4
		m = N - 2 * k;  //0

		a = in[th];
		in[th] = complexaddition(in[th], in[th + k]);
		in[th + k] = complexsubtraction(a, in[th + k]);
		in[th + k] = complexmult(in[th + k], first[((th - m) * N / k / 2)]);

		printf("\tinside if:\t %i:\t%f\n",th,in[th].real);
		printf("\tinside if:\t %i:\t%f\n",th,in[th+k].real);
	}

	__syncthreads();

	printf("outside: %i:\t%f\n",th,in[th].real);
}

__host__ 
	int main(int argc, char *argv[])
{
	struct com * gpux, * gpuin;
	cudaMalloc((void**)&gpux,6*N*sizeof(struct com));
	cudaMalloc((void**)&gpuin,N*sizeof(struct com));

	dim3 block (BLOCK_SIZE, 1);
	dim3 grid  (((N + block.x - 1) / block.x), 1);

	int help(N/BLOCK_SIZE);

	com * inputel = new com[N];

	for (int i=0;i<N;i++)
	{
		inputel[i].real = sin(2*M_PI*0.3*(i) + 0.123) + cos(2*M_PI*0.4*(i) + 0.321) + sin(2*M_PI*0.234*(i) + 0.555);
		inputel[i].imaginary = 0.0f;
	}

	{
		CHECK( cudaMemcpy((com*)gpuin, inputel, N * sizeof(com), cudaMemcpyHostToDevice));
		calculate<<<grid , block, BLOCK_SIZE * sizeof(com), 0>>>(N, gpux,gpuin);
		cudaDeviceSynchronize();
	}

	CHECK( cudaFree(gpux) );
	CHECK( cudaFree(gpuin) );
	cudaDeviceReset();
	return 0;
}

result:

input array  0: 1.598554
        input array  1: 0.851304
        input array  2: -0.435892
        input array  3: -1.458991
        input array  4: 0.551829
        input array  5: 1.824861
        input array  6: -1.810939
        input array  7: 0.286947
outside: 4:     0.551829
outside: 5:     1.824861
outside: 6:     -1.810939
outside: 7:     0.286947
        inside if:       0:     2.150383        // seems like values of input array have changed, but outside: 4,5,6,7 are not.
        inside if:       1:     2.676165
        inside if:       2:     -2.246831
        inside if:       3:     -1.172044
        inside if:       0:     1.046725
        inside if:       1:     -0.688408
        inside if:       2:     0.000000
        inside if:       3:     1.234564
outside: 0:     2.150383
outside: 1:     2.676165
outside: 2:     -2.246831
outside: 3:     -1.172044

I wanted to synchronize threads across all the blocks. That is not provided by __syncthreads(), which sync threads only within the same block;
Therefore I have decided to devide my kernel into two ones and now it works fine.
thank you all)

This is the printout behavior I expected, and it is opposite of what you originally stated with respect to 4,5,6,7. I expected that it’s possible that 4,5,6,7 could be printed out in their “unchanged” state, which is what you are now showing and apparently drawing attention to with your comment.

Previously you had said:

“last printf function writes in[4].real, in[5].real, in[6].real and in[7].real correctly, with changed values.
While in[0].real, in[1].real, in[2].real and in[3].real are printed before the nested “if” is permormed.”

which I could not explain. But what you are now showing is really that:

“last printf function writes in[0].real, in[1].real, in[2].real and in[3].real correctly, with changed values.
While in[4].real, in[5].real, in[6].real and in[7].real are printed before the nested “if” is permormed.”

And this is sensible and predictable from your code: if the second block executes “first”, then 4,5,6,7 (not 0,1,2,3 as you previously stated) can print out in their “unchanged” state.

Your actual printout is in line with what I had previously expected:

“I would expect some strangeness in behavior, but I would expect that possibly the last 4 values get printed before the changes.”

txbob, you were right! I merely fooled myself. Thank you again!