Program crashes if compiled with both -arch and -m64 on Mac OS X

Is this a bug or a problem with my setup? With a simple cudpp testing program, if I use -arch=sm_20 (or sm_30) together with -m64, the resulting program crashes, but if I use either option alone, the program runs fine.
I am running CUDA 5.0 on Mac OS X 10.8. My card is a GT 650M with compute capability 3.0.

gzhangtest/thrust] nvcc -arch=sm_20 -m64 reducebykey.cu -o reducebykey
gzhangtest/thrust] ./reducebykey
libc++abi.dylib: terminate called throwing an exception
Abort trap: 6

gzhangtest/thrust]$ cat reducebykey.cu
#include <thrust/reduce.h>
#include <thrust/unique.h>
#include <thrust/iterator/discard_iterator.h>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

int main(void) {

thrust::device_vector keys;
thrust::device_vector values;

typename thrust::pair<typename thrust::device_vector::iterator, typename thrust::device_vector::iterator> new_last;

keys.resize(9);
keys[0] = 11;
keys[1] = 11;
keys[2] = 21;
keys[3] = 20;
keys[4] = 21;
keys[5] = 21;
keys[6] = 21;
keys[7] = 37;
keys[8] = 37;

values.resize(9);
values[0] = 0;
values[1] = 1;
values[2] = 2;
values[3] = 3;
values[4] = 4;
values[5] = 5;
values[6] = 6;
values[7] = 7;
values[8] = 8;

thrust::device_vector output_keys(keys.size());
thrust::device_vector output_values(values.size());

new_last = thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());

return 0;
}

thanks,
George
#1