Can't compile because of unistd.h

I am using CUDA SDK 5.0 with Cygwin.
The error occurs when I compile this source code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>     /* strtok() */
#include <sys/types.h>  /* open() */
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>     /* getopt() */

int      _debug;
#include "kmeans.h"

/*---< usage() >------------------------------------------------------------*/
/*
 * Print the command-line help text to stderr and terminate the program.
 *
 *   argv0     - program name, substituted into the "Usage:" line
 *   threshold - default threshold value shown next to the -t switch
 *
 * This function does not return. main() calls it when the arguments are
 * invalid and must not continue with a NULL filename or a bad cluster count.
 */
static void usage(char *argv0, float threshold) {
    /* Pass the format as a string literal so the compiler can check the
     * arguments against the conversion specifiers. */
    fprintf(stderr,
        "Usage: %s [switches] -i filename -n num_clusters\n"
        "       -i filename    : file containing data to be clustered\n"
        "       -b             : input file is in binary format (default no)\n"
        "       -n num_clusters: number of clusters (K must > 1)\n"
        "       -t threshold   : threshold value (default %.4f)\n"
        "       -o             : output timing results (default no)\n"
        "       -d             : enable debug mode\n",
        argv0, threshold);
    exit(EXIT_FAILURE);
}

/*---< main() >-------------------------------------------------------------*/
int main(int argc, char **argv) {
           int     opt;
    extern char   *optarg;
    extern int     optind;
           int     isBinaryFile, is_output_timing;

           int     numClusters, numCoords, numObjs;
           int    *membership;    /* [numObjs] */
           char   *filename;
           float **objects;       /* [numObjs][numCoords] data objects */
           float **clusters;      /* [numClusters][numCoords] cluster center */
           float   threshold;
           double  timing, io_timing, clustering_timing;
           int     loop_iterations;

    /* some default values */
    _debug           = 0;
    threshold        = 0.001;
    numClusters      = 0;
    isBinaryFile     = 0;
    is_output_timing = 0;
    filename         = NULL;

    while ( (opt=getopt(argc,argv,"p:i:n:t:abdo"))!= EOF) {
        switch (opt) {
            case 'i': filename=optarg;
            case 'b': isBinaryFile = 1;
            case 't': threshold=atof(optarg);
            case 'n': numClusters = atoi(optarg);
            case 'o': is_output_timing = 1;
            case 'd': _debug = 1;
            case '?': usage(argv[0], threshold);
            default: usage(argv[0], threshold);

    if (filename == 0 || numClusters <= 1) usage(argv[0], threshold);

    if (is_output_timing) io_timing = wtime();

    /* read data points from file ------------------------------------------*/
    objects = file_read(isBinaryFile, filename, &numObjs, &numCoords);
    if (objects == NULL) exit(1);

    if (is_output_timing) {
        timing            = wtime();
        io_timing         = timing - io_timing;
        clustering_timing = timing;

    /* start the timer for the core computation -----------------------------*/
    /* membership: the cluster id for each data object */
    membership = (int*) malloc(numObjs * sizeof(int));
    assert(membership != NULL);

    clusters = cuda_kmeans(objects, numCoords, numObjs, numClusters, threshold,
                          membership, &loop_iterations);


    if (is_output_timing) {
        timing            = wtime();
        clustering_timing = timing - clustering_timing;

    /* output: the coordinates of the cluster centres ----------------------*/
    file_write(filename, numClusters, numObjs, numCoords, clusters,


    /*---- output performance numbers ---------------------------------------*/
    if (is_output_timing) {
        io_timing += wtime() - timing;
        printf("\nPerforming **** Regular Kmeans (CUDA version) ****\n");

        printf("Input file:     %s\n", filename);
        printf("numObjs       = %d\n", numObjs);
        printf("numCoords     = %d\n", numCoords);
        printf("numClusters   = %d\n", numClusters);
        printf("threshold     = %.4f\n", threshold);

        printf("Loop iterations    = %d\n", loop_iterations);

        printf("I/O time           = %10.4f sec\n", io_timing);
        printf("Computation timing = %10.4f sec\n", clustering_timing);


i get error message : fatal error C1083: Cannot open include file: ‘unistd.h’: No such file or directory
Makefile:118: recipe for target ‘cuda_main.o’ failed
make: *** [cuda_main.o] Error 2

what should i do?

cygwin is not a supported environment for nvcc

This file doesn’t appear to have any cuda-specific syntax in it.

Try compiling it with gcc from cygwin instead of nvcc (rename it to cuda_main.cpp). This will require changes to the Makefile. If gcc from cygwin can’t find unistd.h then your problem has nothing to do with cuda.

The source code above is the one that calls “cuda_kmeans” (around line 90 of the original file).
This is the cuda_kmeans code:

#include <stdio.h>
#include <stdlib.h>

#include "kmeans.h"

//  Round n up to the nearest power of two (returns n itself when n is already
//  a power of two). Intended for positive n that fit in a 32-bit int.
//
//  The initial decrement is what keeps exact powers of two unchanged; without
//  it, 1024 would become 2048 and the reduction kernel would be launched with
//  more threads than needed.
static inline int nextPowerOfTwo(int n) {
    n--;

    //  Smear the highest set bit into every lower bit position...
    n = n >>  1 | n;
    n = n >>  2 | n;
    n = n >>  4 | n;
    n = n >>  8 | n;
    n = n >> 16 | n;
    //  (a 64-bit integer type would need one more step: n = n >> 32 | n)

    //  ...then add one to reach the power of two.
    return ++n;
}

/*----< euclid_dist_2() >----------------------------------------------------*/
/* square of Euclid distance between two multi-dimensional points            */
//  Both arrays are indexed coordinate-major: coordinate i of object objectId
//  is objects[numObjs * i + objectId], and coordinate i of cluster clusterId
//  is clusters[numClusters * i + clusterId].
//
//  Returns the sum over all numCoords coordinates of the squared difference.
//  No square root is taken.
__host__ __device__ inline static
float euclid_dist_2(int    numCoords,
                    int    numObjs,
                    int    numClusters,
                    float *objects,     // [numCoords][numObjs]
                    float *clusters,    // [numCoords][numClusters]
                    int    objectId,
                    int    clusterId)
{
    float ans = 0.0f;

    for (int i = 0; i < numCoords; i++) {
        //  Compute the per-coordinate difference once and square it.
        float diff = objects[numObjs * i + objectId] -
                     clusters[numClusters * i + clusterId];
        ans += diff * diff;
    }

    return ans;
}


/*----< find_nearest_cluster() >---------------------------------------------*/
//  Kernel: each thread handles one object. It finds the cluster whose centre
//  has the smallest squared Euclidean distance to the object, stores that
//  cluster id in membership[], and flags whether the assignment changed.
//  The per-thread flags are then summed within the block and thread 0 writes
//  the block's count of changed memberships to intermediates[blockIdx.x].
//
//  Launch requirements (see cuda_kmeans()):
//    - blockDim.x must be a power of two (tree reduction below);
//    - dynamic shared memory must hold blockDim.x bytes for the flags, plus
//      numClusters * numCoords floats when BLOCK_SHARED_MEM_OPTIMIZATION is on.
__global__ static
void find_nearest_cluster(int numCoords,
                          int numObjs,
                          int numClusters,
                          float *objects,           //  [numCoords][numObjs]
                          float *deviceClusters,    //  [numCoords][numClusters]
                          int *membership,          //  [numObjs]
                          int *intermediates)
{
    extern __shared__ char sharedMemory[];

    //  The type chosen for membershipChanged must be large enough to support
    //  reductions! There are blockDim.x elements, one for each thread in the
    //  block. See numThreadsPerClusterBlock in cuda_kmeans().
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
#if BLOCK_SHARED_MEM_OPTIMIZATION
    //  Cluster centres are cached in shared memory, right after the flags.
    float *clusters = (float *)(sharedMemory + blockDim.x);
#else
    //  Read the cluster centres directly from the array passed by the host.
    float *clusters = deviceClusters;
#endif

    membershipChanged[threadIdx.x] = 0;

#if BLOCK_SHARED_MEM_OPTIMIZATION
    //  BEWARE: We can overrun our shared memory here if there are too many
    //  clusters or too many coordinates! For reference, a Tesla C1060 has 16
    //  KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
    //  shared memory per block.
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
    //  Every thread must see the complete copy before reading from it.
    __syncthreads();
#endif

    int objectId = blockDim.x * blockIdx.x + threadIdx.x;

    if (objectId < numObjs) {
        int   index, i;
        float dist, min_dist;

        /* find the cluster id that has min distance to object */
        index    = 0;
        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, 0);

        for (i=1; i<numClusters; i++) {
            dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, i);
            /* no need square root */
            if (dist < min_dist) { /* find the min and its array index */
                min_dist = dist;
                index    = i;
            }
        }

        if (membership[objectId] != index) {
            membershipChanged[threadIdx.x] = 1;
        }

        /* assign the membership to object objectId */
        membership[objectId] = index;
    }

    //  The barriers and the reduction are deliberately outside the
    //  "objectId < numObjs" branch: __syncthreads() must be reached by every
    //  thread of the block, and in the last block some threads have no object.
    //  Those threads left their flag at 0, so they do not affect the sum.
    __syncthreads();    //  For membershipChanged[]

    //  blockDim.x *must* be a power of two!
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            membershipChanged[threadIdx.x] +=
                membershipChanged[threadIdx.x + s];
        }
        //  Finish this reduction level before starting the next one.
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        intermediates[blockIdx.x] = membershipChanged[0];
    }
}

//  Kernel: sum the first numIntermediates entries of deviceIntermediates[]
//  and store the total in deviceIntermediates[0].
//
//  Must be launched as a single block of numIntermediates2 threads, where
//  numIntermediates2 is a power of two >= numIntermediates, with
//  numIntermediates2 * sizeof(unsigned int) bytes of dynamic shared memory.
__global__ static
void compute_delta(int *deviceIntermediates,
                   int numIntermediates,    //  The actual number of intermediates
                   int numIntermediates2)   //  The next power of two
{
    //  The number of elements in this array should be equal to
    //  numIntermediates2, the number of threads launched. It *must* be a power
    //  of two!
    extern __shared__ unsigned int intermediates[];

    //  Copy global intermediate values into shared memory. Threads beyond the
    //  real element count contribute 0 so they do not affect the sum.
    intermediates[threadIdx.x] =
        (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0;

    //  All values must be in shared memory before the reduction reads them.
    __syncthreads();

    //  numIntermediates2 *must* be a power of two!
    for (unsigned int s = numIntermediates2 / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
        }
        //  Finish this reduction level before starting the next one.
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        deviceIntermediates[0] = intermediates[0];
    }
}

/*----< cuda_kmeans() >-------------------------------------------------------*/
//
//  ----------------------------------------
//  DATA LAYOUT
//
//  objects         [numObjs][numCoords]
//  clusters        [numClusters][numCoords]
//  dimObjects      [numCoords][numObjs]
//  dimClusters     [numCoords][numClusters]
//  newClusters     [numCoords][numClusters]
//  deviceObjects   [numCoords][numObjs]
//  deviceClusters  [numCoords][numClusters]
//  ----------------------------------------
//
//  Runs k-means: the nearest-cluster search for every object is done on the
//  GPU (find_nearest_cluster + compute_delta), the cluster centres are then
//  recomputed on the host. The first numClusters objects are used as the
//  initial centres. Iteration stops when the fraction of objects that changed
//  cluster is <= threshold, or after the loop cap of 500 is hit.
//
//  On return, membership[i] holds the cluster id of object i and
//  *loop_iterations holds the number of iterations executed.
//
/* return an array of cluster centers of size [numClusters][numCoords]       */
float** cuda_kmeans(float **objects,      /* in: [numObjs][numCoords] */
                   int     numCoords,    /* no. features */
                   int     numObjs,      /* no. objects */
                   int     numClusters,  /* no. clusters */
                   float   threshold,    /* % objects change membership */
                   int    *membership,   /* out: [numObjs] */
                   int    *loop_iterations)
{
    int      i, j, index, loop=0;
    int     *newClusterSize; /* [numClusters]: no. objects assigned in each
                                new cluster */
    float    delta;          /* % of objects change their clusters */
    float  **dimObjects;
    float  **clusters;       /* out: [numClusters][numCoords] */
    float  **dimClusters;
    float  **newClusters;    /* [numCoords][numClusters] */

    float *deviceObjects;
    float *deviceClusters;
    int *deviceMembership;
    int *deviceIntermediates;

    //  Copy objects given in [numObjs][numCoords] layout to new
    //  [numCoords][numObjs] layout
    malloc2D(dimObjects, numCoords, numObjs, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numObjs; j++) {
            dimObjects[i][j] = objects[j][i];
        }
    }

    /* pick first numClusters elements of objects[] as initial cluster centers*/
    malloc2D(dimClusters, numCoords, numClusters, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numClusters; j++) {
            dimClusters[i][j] = dimObjects[i][j];
        }
    }

    /* initialize membership[] */
    for (i=0; i<numObjs; i++) membership[i] = -1;

    /* need to initialize newClusterSize and newClusters[0] to all 0 */
    newClusterSize = (int*) calloc(numClusters, sizeof(int));
    assert(newClusterSize != NULL);

    malloc2D(newClusters, numCoords, numClusters, float);
    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));

    //  To support reduction, numThreadsPerClusterBlock *must* be a power of
    //  two, and it *must* be no larger than the number of bits that will
    //  fit into an unsigned char, the type used to keep track of membership
    //  changes in the kernel.
    const unsigned int numThreadsPerClusterBlock = 128;
    const unsigned int numClusterBlocks =
        (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;
#if BLOCK_SHARED_MEM_OPTIMIZATION
    //  Flags plus a full copy of the cluster centres per block.
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char) +
        numClusters * numCoords * sizeof(float);

    cudaDeviceProp deviceProp;
    int deviceNum;
    //  Query the device currently in use; deviceNum must be set before it is
    //  passed to cudaGetDeviceProperties().
    checkCuda(cudaGetDevice(&deviceNum));
    checkCuda(cudaGetDeviceProperties(&deviceProp, deviceNum));

    if (clusterBlockSharedDataSize > deviceProp.sharedMemPerBlock) {
        err("WARNING: Your CUDA hardware has insufficient block shared memory. "
            "You need to recompile with BLOCK_SHARED_MEM_OPTIMIZATION=0. "
            "See the README for details.\n");
    }
#else
    //  Only the per-thread membership-changed flags live in shared memory.
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char);
#endif

    //  compute_delta() reduces one value per cluster block in a single block,
    //  so its thread count is the block count rounded up to a power of two.
    const unsigned int numReductionThreads =
        nextPowerOfTwo(numClusterBlocks);
    const unsigned int reductionBlockSharedDataSize =
        numReductionThreads * sizeof(unsigned int);

    checkCuda(cudaMalloc(&deviceObjects, numObjs*numCoords*sizeof(float)));
    checkCuda(cudaMalloc(&deviceClusters, numClusters*numCoords*sizeof(float)));
    checkCuda(cudaMalloc(&deviceMembership, numObjs*sizeof(int)));
    checkCuda(cudaMalloc(&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));

    checkCuda(cudaMemcpy(deviceObjects, dimObjects[0],
              numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
    checkCuda(cudaMemcpy(deviceMembership, membership,
              numObjs*sizeof(int), cudaMemcpyHostToDevice));

    do {
        checkCuda(cudaMemcpy(deviceClusters, dimClusters[0],
                  numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));

        find_nearest_cluster
            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
            (numCoords, numObjs, numClusters,
             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);

        cudaDeviceSynchronize(); checkLastCudaError();

        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
            (deviceIntermediates, numClusterBlocks, numReductionThreads);

        cudaDeviceSynchronize(); checkLastCudaError();

        //  deviceIntermediates[0] now holds the number of objects whose
        //  membership changed in this iteration.
        int d;
        checkCuda(cudaMemcpy(&d, deviceIntermediates,
                  sizeof(int), cudaMemcpyDeviceToHost));
        delta = (float)d;

        checkCuda(cudaMemcpy(membership, deviceMembership,
                  numObjs*sizeof(int), cudaMemcpyDeviceToHost));

        for (i=0; i<numObjs; i++) {
            /* find the array index of nearest cluster center */
            index = membership[i];

            /* update new cluster centers : sum of objects located within */
            //  Count the object as well: the averaging step below divides by
            //  newClusterSize[], so without this the centres never move.
            newClusterSize[index]++;
            for (j=0; j<numCoords; j++)
                newClusters[j][index] += objects[i][j];
        }

        //  TODO: Flip the nesting order
        //  TODO: Change layout of newClusters to [numClusters][numCoords]
        /* average the sum and replace old cluster centers with newClusters */
        for (i=0; i<numClusters; i++) {
            for (j=0; j<numCoords; j++) {
                //  An empty cluster keeps its previous centre.
                if (newClusterSize[i] > 0)
                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                newClusters[j][i] = 0.0;   /* set back to 0 */
            }
            newClusterSize[i] = 0;   /* set back to 0 */
        }

        //  Convert the changed-object count into a fraction of all objects.
        delta /= numObjs;
    } while (delta > threshold && loop++ < 500);

    *loop_iterations = loop + 1;

    /* allocate a 2D space for returning variable clusters[] (coordinates
       of cluster centers) */
    malloc2D(clusters, numClusters, numCoords, float);
    for (i = 0; i < numClusters; i++) {
        for (j = 0; j < numCoords; j++) {
            clusters[i][j] = dimClusters[j][i];
        }
    }

    //  Release the device buffers allocated above.
    checkCuda(cudaFree(deviceObjects));
    checkCuda(cudaFree(deviceClusters));
    checkCuda(cudaFree(deviceMembership));
    checkCuda(cudaFree(deviceIntermediates));

    //  Release the host temporaries.
    //  NOTE(review): assumes malloc2D() allocates the row-pointer array and
    //  one contiguous data block (reachable through [0]) as two separate
    //  allocations -- confirm against the macro in kmeans.h.
    free(dimObjects[0]);
    free(dimObjects);
    free(dimClusters[0]);
    free(dimClusters);
    free(newClusters[0]);
    free(newClusters);
    free(newClusterSize);

    return clusters;
}

Yes, I expected that. It doesn’t change my advice.

The cuda_kmeans code doesn’t seem to include unistd.h, so presumably it will compile properly with nvcc.

The cuda_main.cpp code should be compilable with gcc (I think.)

Then you link the two compiled objects together.

Something like this should already be happening in your makefile. I’m suggesting that you change the compiler used for the main file to gcc (and rename the file to cuda_main.cpp). Leave the compilation process for the cuda_kmeans code alone; it should still be compiled with nvcc.

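The whole program consists of 4 files.
For the specifics, you can get the code from here

I already renamed the main file to .cpp, compiled it with gcc, and changed the Makefile.
First an error came up with the cuda_io source file, so I renamed it to cuda_io.cpp;
then an error came up with the cuda_wtime source file, so I renamed it to cuda_wtime.cpp,
and it still cannot be compiled.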

I have three versions of this kmeans program: a CUDA version, an MPI version and a sequential version.
Both the MPI and sequential versions include unistd.h, and they compile and run successfully.

Any help? Should I change the compiler? If so, to which one?

that source code above is for calling “cuda_kmeans” lines 90
this is the cuda_kmeans code?

Here’s the steps I used to build that project:

wget --no-check-certificate
cd kmeans-master
cp cuda_io.cpp
cp cuda_main.cpp
cp cuda_wtime.cpp
g++ -c cuda_io.cpp
g++ -c cuda_wtime.cpp
g++ -c cuda_main.cpp
nvcc -c
g++ -o test cuda_io.o cuda_wtime.o cuda_main.o cuda_kmeans.o -L/usr/local/cuda/lib64 -lcudart

and the project built successfully for me.

I’m suggesting you try something similar in your cygwin environment. If it does not work, then I would suggest switching to an ordinary linux environment. That is what the project was originally set up for.

Thank you so much. At first I failed to compile in Cygwin because of a permission error:

cc1plus: fatal error: cuda_io.cpp: Permission denied
compilation terminated.

Then I used cmd (as admin) and all the commands worked, except that nvcc -c produces an ‘.obj’ file, not an ‘.o’ file.
I get an error message when I run the last command:

g++ -o test cuda_io.o cuda_wtime.o cuda_main.o cuda_kmeans.o -L/usr/local/cuda/lib64 -lcudart

it says that cuda_kmeans.o: No Such file or directory

when i change the syntax to

g++ -o test cuda_io.o cuda_wtime.o cuda_main.o cuda_kmeans.obj -L/usr/local/cuda/lib64 -lcudart

it says another error like in the picture

I indicated that what I had done was on linux. You will need to modify it for cygwin. You’ve already discovered the .o -> .obj difference on linux vs. windows (for nvcc). Note that the nvcc you are using on windows is designed to be compatible with visual studio, not cygwin (which is why g++ in this case creates the .o file but nvcc creates the .obj file).

You also need to fix this path:

-L/usr/local/cuda/lib64

to match whatever is the path to your cudart.lib file on your machine.

If you’re not sure what that path is, use the windows file search utility to locate it.

Even if you make that change, it’s possible that it may still not work. As I mentioned, nvcc is designed to be compatible with visual studio on windows, not cygwin. There may be other linker discrepancies or other issues that prevent it from working properly.

The solution would be to switch to a supported linux environment.

my lib64 file is in C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\lib\x64
how can i fix the path?

i will try to linux first, to see if that works thank you

Something like this:

-L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\lib\x64"

I’m not familiar enough with cygwin to tell you if that will work or not.

So what environment did you use for this project? I mean, which Linux distribution did you use? Did you compile from the Linux terminal or from some other environment? And which CUDA SDK did you use? Thank you.

I used Fedora 20, CUDA 7.

And the compile sequence I gave (from linux terminal) seemed to work for me:

wget --no-check-certificate

cd kmeans-master
cp cuda_io.cpp
cp cuda_main.cpp
cp cuda_wtime.cpp
g++ -c cuda_io.cpp
g++ -c cuda_wtime.cpp
g++ -c cuda_main.cpp
nvcc -c
g++ -o test cuda_io.o cuda_wtime.o cuda_main.o cuda_kmeans.o -L/usr/local/cuda/lib64 -lcudart

if i using another linux like ubuntu is that alright? sorry for asking unnecessary question, because i am very new in linux

You can use Ubuntu. I just built the file using txbob’s commands on Ubuntu 14.04 LTS using CUDA 7.

i had install ubuntu 14.04 LTS and cuda toolkit 4.0
all works until syntax

nvcc -c

it says

dimas@dimas-pc:~/Documents/kmeans-master$ nvcc -c
In file included from /usr/local/cuda/bin/…/include/cuda_runtime.h:59:0,
from :0:
/usr/local/cuda/bin/…/include/host_config.h:82:2: error: #error – unsupported GNU version! gcc 4.5 and up are not supported!
#error – unsupported GNU version! gcc 4.5 and up are not supported!
what should i do?

Use a newer CUDA toolkit, like CUDA 7.0

Why do I always fail to install CUDA toolkit 7.0-28? I downloaded the ‘run’ file:
running it after sudo chmod +x *.run — failed
installing from the terminal as described here — failed

Any tips for installing CUDA toolkit 7.0-28?
Do I need to install the NVIDIA driver that comes bundled with CUDA toolkit 7.0?

“failed” is not useful to me.

show your exact command and the exact error output

If you want to install CUDA 7, follow the instructions in the linux getting started guide.

i have download file cuda toolkit that deb 902 mb’cuda-repo-ubuntu1404-7-0-local_7.0-28_amd64.deb’
i follow that instructions pdf for ubuntu

first cd to folder where the file ini it , then
-sudo dpkg -i cuda-repo-ubuntu1404-7-0-local_7.0-28_amd64.deb – works
-sudo apt-get update – it works but output some failed like

Reading package lists… Done
W: Failed to fetch http://cuda-repo/prodtest/ubuntu1404/x86_64/InRelease
W: Failed to fetch http://cuda-repo/prodtest/ubuntu1404/x86_64/Release.gpg Could not resolve ‘cuda-repo’
W: Some index files failed to download. They have been ignored, or old ones used instead.

  • sudo apt-get install cuda – error message come

Reading package lists… Done
Building dependency tree
Reading state information… Done
Some packages could not be installed. This may mean that you have
requested an impossible situation or if you are using the unstable
distribution that some required packages have not yet been created
or been moved out of Incoming.
The following information may help to resolve the situation:

The following packages have unmet dependencies:
unity-control-center : Depends: libcheese-gtk23 (>= 3.4.0) but it is not going to be installed
Depends: libcheese7 (>= 3.0.1) but it is not going to be installed
E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.

how i can fix this?

Finally I can compile the CUDA files, without renaming them to .cpp.
I compiled directly using ‘make cuda’.
Lastly, I want to change the program from k-means to fuzzy c-means. Any advice?
And thank you, txbob, for helping me.