Error in garbage collection stage?

I’m writing a program to sort ~100,000 elements. The program takes input in the form of files called aNUMBER.dat (a1000.dat, a40000.dat, etc) and outputs the sorted numbers, then min, max, and median.

My code is as follows:

[codebox]#include <cuda.h>

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#include <sys/time.h>

//shorthand

#define TODEV cudaMemcpyHostToDevice

#define TOHOST cudaMemcpyDeviceToHost

//Optimization - a total of BLOCK_SIZE * BUBBLE_FACTOR elements are sorted by each block

#define BLOCK_SIZE 512 //Number of threads/block

#define BUBBLE_FACTOR 2 //Element count each thread bubbles over - must be greater than 1 to prevent conflict

#define PER_BLOCK (BLOCK_SIZE * BUBBLE_FACTOR)

#define MAX_BLOCKS

//takes the array, the offset, and the upper bound length of the array

global void BlockBubbleSort(int* a, int N)

{

}

int main(int argc, char** argv)

{

    if (argc < 2)

    {

            printf("Error:  Must specify input file\n");

            exit(1);

    }

    //N is overall dimension of array to read in, determined by filename

    int N, i;

    sscanf(argv[1],"a%d.dat",&N);

size_t MEM_SIZE = sizeof(int) * N; //determine array size

    size_t  THREAD_SIZE     = sizeof(int) * PER_BLOCK;

    int*    a_h             = (int*)malloc(MEM_SIZE);               //array for host

    int*    a_d;            cudaMalloc((void**)&a_d, MEM_SIZE);     //array for device

if (!a_h)

    {

            printf("Error: Failed to allocate host memory\n");

            cudaFree(a_d);

            exit(1);

    }

    if (!a_d)

    {

            printf("Error: failed to allocate device memory\n");

    }

    //Read data into host array

    FILE* handle = fopen(argv[1],"r");

    if (!handle)

    {

            printf("Error: invalid input file (file does not exist\n");

            exit(1);

    }

    for (i = 0 ; i < N ; i++) fscanf(handle, "%d", &a_h[i]);

    fclose(handle);

//Execute kernel

    int BLOCK_COUNT = N/(PER_BLOCK);

    if (N%PER_BLOCK) BLOCK_COUNT += 1;

// cudaMemcpy(a_d, a_h, MEM_SIZE, TODEV);

// BlockBubbleSort<<<BLOCK_COUNT, BLOCK_SIZE, THREAD_SIZE>>>(a_d, N);

// cudaMemcpy(a_h, a_d, MEM_SIZE, TOHOST);

//deallocate resources

    cudaFree(a_d);

float median = 0;

int* merged = (int*)malloc(MEM_SIZE);//do merge on CPU;

    int* midx       = (int*)malloc(BLOCK_COUNT);

    if (!merged)

    {

            printf("Error: Could not allocate merged block memory\n");

            free(a_h);

            exit(1);

    }

    for (i = 0 ; i < BLOCK_COUNT ; i++)

            midx[i] = PER_BLOCK * i;

    int idx = 0;

    int mmidx;

    while (idx < N)

    {

            mmidx = -1;

            for (i = 0 ; i < BLOCK_COUNT ; i++)

            {

                    if (midx[i] < N && midx[i] < PER_BLOCK * (1+i))

                            if (mmidx < 0)

                                    mmidx = i;

                            else if (a_h[midx[i]] < a_h[midx[mmidx]])

                                    mmidx = i;

            }

            merged[idx] = a_h[midx[mmidx]];

            midx[mmidx]++;

            idx++;

    }

    for (i = 0 ; i < N ; i++)

            printf("%d\n", merged[i]);

    printf("Min: %d\nMax: %d\n",merged[0],merged[N-1]);

    median = (float)merged[N/2];

    if (N&1 == 0)

            median = (median + (float)merged[N/2+1])/2.;

    printf("Median: %.1f\n",median);

free(a_h);

    printf("a_h freed\n");

    free(merged);

    printf("merged freed\n");

    free(midx);

    printf("midx freed\n");

}

[/codebox]

My output is:

[…]

midx freed

*** glibc detected *** ./a.out: munmap_chunk(): invalid pointer: 0x0808afe8 ***

======= Backtrace: =========

/lib/tls/i686/cmov/libc.so.6(cfree+0x1bb)[0xb7d6961b]

/usr/lib/libstdc++.so.6(_ZdlPv+0x21)[0xb7f31b11]

/usr/local/cuda/lib/libcudart.so.2[0xb7fb13f3]

/lib/tls/i686/cmov/libc.so.6(__cxa_finalize+0xb1)[0xb7d283b1]

/usr/local/cuda/lib/libcudart.so.2[0xb7f89533]

/usr/local/cuda/lib/libcudart.so.2[0xb7fbd2fc]

/lib/ld-linux.so.2[0xb7fd5fcf]

/lib/tls/i686/cmov/libc.so.6(exit+0xd4)[0xb7d28084]

/lib/tls/i686/cmov/libc.so.6(__libc_start_main+0xe8)[0xb7d10458]

./a.out(__gxx_personality_v0+0x49)[0x8048c81]

======= Memory map: ========

[…]

While this is obviously not the code I actually use to sort, this is still giving me the error even with all of the business code stripped out. It goes away only when I don’t allocate the memory on the device, but the CUDA code runs fine still and the error only pops up after execution. Any idea what’s going wrong?

quick edit; I ssh’d to another machine (I’m doing this on hardware I don’t have direct control over) and with a ~4000 point input file there was no issue, but the 110,000+ file gave the same error.

run it through valgrind. you’re probably trampling host memory somewhere

I don’t have root on these boxes and valgrind’s not installed - any other suggestions until I can convince a professor to put it on?