Memory operations inside a kernel: simultaneous use of one variable by multiple threads

I have simple code in which I pass a variable to a kernel and want the threads to change it. Each thread adds 1 to this integer variable. I launch 10 threads, but when execution has finished the variable contains only 1, as if only one thread had changed it. How do I do this properly?

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

/*
 * Increments the integer that `c` points to by one.
 *
 * NOTE: this is a plain load / add / store sequence, not an atomic
 * operation. When the kernel is launched with several threads that all
 * receive the same pointer, the threads race on that address and
 * increments can be lost.
 */
__global__ void Kernel(int *c)
{
    const int current = *c;  // read the present value
    *c = current + 1;        // write back the incremented value
}

/*
 * Host driver: allocates a single int counter on the GPU, initialises it
 * from the host, launches Kernel with one block of `numberOfThreads`
 * threads that all receive the same counter pointer, copies the counter
 * back and prints it.
 *
 * Returns 0 when every CUDA call succeeded, 1 otherwise.
 */
int main()
{
    // All locals are declared before the first `goto Error` so that no jump
    // crosses an initialisation.
    const int numberOfThreads = 10;  // threads in the single launched block
    int c = 0;                       // host copy of the counter
    int *dev_c = 0;                  // device copy of the counter
    bool succeeded = false;          // set only after the result was copied back
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate one int on the GPU to hold the counter.
    cudaStatus = cudaMalloc((void**)&dev_c, sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the complete initial value to the device. The size argument is in
    // bytes: copying fewer than sizeof(int) bytes would leave the remaining
    // bytes of the device counter uninitialised.
    cudaStatus = cudaMemcpy(dev_c, &c, sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch one block of numberOfThreads threads; every thread gets the
    // same counter pointer.
    Kernel<<<1, numberOfThreads>>>(dev_c);

    // The launch itself returns nothing; launch errors (for example an
    // invalid configuration) have to be fetched explicitly.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the counter from the GPU back to host memory.
    cudaStatus = cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    succeeded = true;

Error:
    // Reached on success as well as on failure; dev_c may still be null here.
    cudaFree(dev_c);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Parallel Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    // Only report a result that was actually computed and copied back.
    if (succeeded) {
        printf("Levensthein: %d\n", c);
    }

    // Keep the console open so the output (or the error) can be read.
    printf("Hit any key to terminate\n");
    getchar();

    return succeeded ? 0 : 1;
}

You need to use atomic operations if multiple threads can modify the same variable in parallel:

/*
 * Adds 1 to the integer that `c` points to using atomicAdd, so the
 * read-modify-write of each calling thread is performed as one
 * indivisible operation on that address.
 */
__global__ void Kernel(int *c)
{
    const int increment = 1;

    // atomicAdd returns the previous value; it is not needed here.
    atomicAdd(c, increment);
}

Thank you, but what should I include to have access to this function? And where can I find more information about these atomic functions?

I included device_functions.h but it didn’t help.

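It seems that no extra include is needed; instead, the project properties in Visual Studio 2010 have to set the code generation option to compute_11,sm_11 (or higher), because atomicAdd on a 32-bit integer in global memory is only available on devices of compute capability 1.1 and above.

The setting is under CUDA C/C++ -> Device -> Code Generation.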