I’m writing a program to sort ~100,000 elements. The program takes input in the form of files called aNUMBER.dat (a1000.dat, a40000.dat, etc) and outputs the sorted numbers, then min, max, and median.
My code is as follows:
[codebox]#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
//shorthand
#define TODEV cudaMemcpyHostToDevice
#define TOHOST cudaMemcpyDeviceToHost
//Optimization - a total of BLOCK_SIZE * BUBBLE_FACTOR elements are sorted by each block
#define BLOCK_SIZE 512 //Number of threads/block
#define BUBBLE_FACTOR 2 //Element count each thread bubbles over - must be greater than 1 to prevent conflict
#define PER_BLOCK (BLOCK_SIZE * BUBBLE_FACTOR)
#define MAX_BLOCKS
//takes the array, the offset, and the upper bound length of the array
global void BlockBubbleSort(int* a, int N)
{
}
int main(int argc, char** argv)
{
if (argc < 2)
{
printf("Error: Must specify input file\n");
exit(1);
}
//N is overall dimension of array to read in, determined by filename
int N, i;
sscanf(argv[1],"a%d.dat",&N);
size_t MEM_SIZE = sizeof(int) * N; //determine array size
size_t THREAD_SIZE = sizeof(int) * PER_BLOCK;
int* a_h = (int*)malloc(MEM_SIZE); //array for host
int* a_d; cudaMalloc((void**)&a_d, MEM_SIZE); //array for device
if (!a_h)
{
printf("Error: Failed to allocate host memory\n");
cudaFree(a_d);
exit(1);
}
if (!a_d)
{
printf("Error: failed to allocate device memory\n");
}
//Read data into host array
FILE* handle = fopen(argv[1],"r");
if (!handle)
{
printf("Error: invalid input file (file does not exist\n");
exit(1);
}
for (i = 0 ; i < N ; i++) fscanf(handle, "%d", &a_h[i]);
fclose(handle);
//Execute kernel
int BLOCK_COUNT = N/(PER_BLOCK);
if (N%PER_BLOCK) BLOCK_COUNT += 1;
// cudaMemcpy(a_d, a_h, MEM_SIZE, TODEV);
// BlockBubbleSort<<<BLOCK_COUNT, BLOCK_SIZE, THREAD_SIZE>>>(a_d, N);
// cudaMemcpy(a_h, a_d, MEM_SIZE, TOHOST);
//deallocate resources
cudaFree(a_d);
float median = 0;
int* merged = (int*)malloc(MEM_SIZE);//do merge on CPU;
int* midx = (int*)malloc(BLOCK_COUNT);
if (!merged)
{
printf("Error: Could not allocate merged block memory\n");
free(a_h);
exit(1);
}
for (i = 0 ; i < BLOCK_COUNT ; i++)
midx[i] = PER_BLOCK * i;
int idx = 0;
int mmidx;
while (idx < N)
{
mmidx = -1;
for (i = 0 ; i < BLOCK_COUNT ; i++)
{
if (midx[i] < N && midx[i] < PER_BLOCK * (1+i))
if (mmidx < 0)
mmidx = i;
else if (a_h[midx[i]] < a_h[midx[mmidx]])
mmidx = i;
}
merged[idx] = a_h[midx[mmidx]];
midx[mmidx]++;
idx++;
}
for (i = 0 ; i < N ; i++)
printf("%d\n", merged[i]);
printf("Min: %d\nMax: %d\n",merged[0],merged[N-1]);
median = (float)merged[N/2];
if (N&1 == 0)
median = (median + (float)merged[N/2+1])/2.;
printf("Median: %.1f\n",median);
free(a_h);
printf("a_h freed\n");
free(merged);
printf("merged freed\n");
free(midx);
printf("midx freed\n");
}
[/codebox]
My output is:
[…]
midx freed
*** glibc detected *** ./a.out: munmap_chunk(): invalid pointer: 0x0808afe8 ***
======= Backtrace: =========
/lib/tls/i686/cmov/libc.so.6(cfree+0x1bb)[0xb7d6961b]
/usr/lib/libstdc++.so.6(_ZdlPv+0x21)[0xb7f31b11]
/usr/local/cuda/lib/libcudart.so.2[0xb7fb13f3]
/lib/tls/i686/cmov/libc.so.6(__cxa_finalize+0xb1)[0xb7d283b1]
/usr/local/cuda/lib/libcudart.so.2[0xb7f89533]
/usr/local/cuda/lib/libcudart.so.2[0xb7fbd2fc]
/lib/ld-linux.so.2[0xb7fd5fcf]
/lib/tls/i686/cmov/libc.so.6(exit+0xd4)[0xb7d28084]
/lib/tls/i686/cmov/libc.so.6(__libc_start_main+0xe8)[0xb7d10458]
./a.out(__gxx_personality_v0+0x49)[0x8048c81]
======= Memory map: ========
[…]
While this is obviously not the code I actually use to sort, this is still giving me the error even with all of the business code stripped out. It goes away only when I don’t allocate the memory on the device, but the CUDA code runs fine still and the error only pops up after execution. Any idea what’s going wrong?
quick edit; I ssh’d to another machine (I’m doing this on hardware I don’t have direct control over) and with a ~4000 point input file there was no issue, but the 110,000+ file gave the same error.