GeForce 710M

( 2) Multiprocessors, ( 48) CUDA Cores/MP

CUDA Capability Major/Minor version number: 2.1

```
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define _USE_MATH_DEFINES
#include <math.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <iostream>
#include <vector>
#include <fstream>
#include <string>
#include <algorithm>
using namespace std;
#define CHECK(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
printf("Error: %s:%d, ", __FILE__, __LINE__); \
printf("Code:%d, Reason: %s\n", error, cudaGetErrorString(error)); \
getchar(); \
exit(1); \
} \
}
struct com
{
float real;
float imaginary;
};
#define BLOCK_SIZE (1 << 2)
static int N = (1 << 3);
__device__ com complexmult(com a, com b)
{
com dev;
dev.real = a.real * b.real - a.imaginary * b.imaginary;
dev.imaginary = a.real * b.imaginary + a.imaginary * b.real;
return dev;
}
__device__ com complexaddition(com a, com b)
{
com dev;
dev.real = a.real + b.real;
dev.imaginary = a.imaginary + b.imaginary;
return dev;
}
__device__ com complexsubtraction(com a, com b)
{
com dev;
dev.real = a.real - b.real;
dev.imaginary = a.imaginary - b.imaginary;
return dev;
}
__global__
void calculate(int N, struct com * dev, struct com * in)
{
extern __shared__ struct com help[];
int k, m;
com a;
struct com * first = &dev[0], * second= &dev[N]; // вспомогательный для перестановки
struct com * third = &second[N], * fourth= &third[N], * fifth = &fourth[N]; // после перестановки
struct com * sixth = &fifth[blockDim.x];// вспомогательный для перестановки
int th = threadIdx.x + blockDim.x * blockIdx.x;
printf("\tinput array %i:\t%f\n",th,in[th].real);
if(th < N/2){
first[th].real = cospif(2.0 * (float)(th) / (float)(N));
first[th].imaginary = sinpif(2.0 * (float)(th) / (float)(N));
}
__syncthreads();
if(th < N/2){
k = (N / 2); //4
m = N - 2 * k; //0
a = in[th];
in[th] = complexaddition(in[th], in[th + k]);
in[th + k] = complexsubtraction(a, in[th + k]);
in[th + k] = complexmult(in[th + k], first[((th - m) * N / k / 2)]);
printf("\tinside if:\t %i:\t%f\n",th,in[th].real);
printf("\tinside if:\t %i:\t%f\n",th,in[th+k].real);
}
__syncthreads();
printf("outside: %i:\t%f\n",th,in[th].real);
}
__host__
int main(int argc, char *argv[])
{
struct com * gpux, * gpuin;
cudaMalloc((void**)&gpux,6*N*sizeof(struct com));
cudaMalloc((void**)&gpuin,N*sizeof(struct com));
dim3 block (BLOCK_SIZE, 1);
dim3 grid (((N + block.x - 1) / block.x), 1);
int help(N/BLOCK_SIZE);
com * inputel = new com[N];
for (int i=0;i<N;i++)
{
inputel[i].real = sin(2*M_PI*0.3*(i) + 0.123) + cos(2*M_PI*0.4*(i) + 0.321) + sin(2*M_PI*0.234*(i) + 0.555);
inputel[i].imaginary = 0.0f;
}
{
CHECK( cudaMemcpy((com*)gpuin, inputel, N * sizeof(com), cudaMemcpyHostToDevice));
calculate<<<grid , block, BLOCK_SIZE * sizeof(com), 0>>>(N, gpux,gpuin);
cudaDeviceSynchronize();
}
CHECK( cudaFree(gpux) );
CHECK( cudaFree(gpuin) );
cudaDeviceReset();
return 0;
}
```

result:

```
input array 0: 1.598554
input array 1: 0.851304
input array 2: -0.435892
input array 3: -1.458991
input array 4: 0.551829
input array 5: 1.824861
input array 6: -1.810939
input array 7: 0.286947
outside: 4: 0.551829
outside: 5: 1.824861
outside: 6: -1.810939
outside: 7: 0.286947
inside if: 0: 2.150383 // seems like values of input array have changed, but outside: 4,5,6,7 are not.
inside if: 1: 2.676165
inside if: 2: -2.246831
inside if: 3: -1.172044
inside if: 0: 1.046725
inside if: 1: -0.688408
inside if: 2: 0.000000
inside if: 3: 1.234564
outside: 0: 2.150383
outside: 1: 2.676165
outside: 2: -2.246831
outside: 3: -1.172044
```