Hi, i’m having this problem:
]$ make
/opt/lib/cuda/bin/nvcc --ptxas-options=-v -I src/ -c src/clock.c -o build/clock.o
/opt/lib/cuda/bin/nvcc --ptxas-options=-v -I src/ -c src/matMul.cu -o build/matMul.o
ptxas info : Compiling entry function ‘__globfunc__Z6matMulPfS_S_i’
ptxas info : Used 9 registers, 44+40 bytes smem, 4 bytes cmem[1]
/opt/lib/cuda/bin/nvcc --ptxas-options=-v -o bin/matMul build/matMul.o build/clock.o
build/clock.o(.text+0x1b): In function `ovhd’:
: undefined reference to `start_counter’
build/clock.o(.text+0x25): In function `ovhd’:
: undefined reference to `get_counter’
build/clock.o(.text+0x61): In function `mhz’:
: undefined reference to `start_counter’
build/clock.o(.text+0x73): In function `mhz’:
: undefined reference to `get_counter’
build/clock.o(.text+0x11f): In function `callibrate’:
: undefined reference to `start_counter’
build/clock.o(.text+0x129): In function `callibrate’:
: undefined reference to `get_counter’
build/clock.o(.text+0x14a): In function `callibrate’:
: undefined reference to `get_counter’
build/clock.o(.text+0x288): In function `start_comp_counter’:
: undefined reference to `start_counter’
build/clock.o(.text+0x29c): In function `get_comp_counter’:
: undefined reference to `get_counter’
collect2: ld returned 1 exit status
make: *** [matMul] Error 255
Does anyone knows what the problem is?
Makefile
[codebox]# source files.
SRC1 = matMul.cu
SRC2 = clock.c
OUT = matMul
include directories
INCLUDES = -I src/
#compilers
GCC = g++
CCC = /opt/lib/cuda/bin/nvcc --ptxas-options=-v
default: $(OUT)
$(OUT):
$(CCC) $(INCLUDES) -c src/$(SRC2) -o build/clock.o
$(CCC) $(INCLUDES) -c src/$(SRC1) -o build/matMul.o
$(CCC) -o bin/$(OUT) build/matMul.o build/clock.o
clean:
rm -f build/*.o bin/* [/codebox]
matMul.cu
[codebox]
#include <stdio.h>
#include “clock.h”
global void matMul(float *A, float *B, float *C,int SIZE){
int row=threadIdx.x;
int line=threadIdx.y;
//float Cvalue = 0;
for(int k=0; k<SIZE; ++k)
{
C[line * SIZE + row] += A[line * SIZE + k] * B[k * SIZE + row];
//float Adelement=A[line * A.with + k];
//float Bdelement=B[k * B.width + row];
//Cvalue += Adelement * Bdelement;
}
//C[line * SIZE + row] = Cvalue;
}
int main(){
//double ticks=0, ticks_before=0;
int SIZE = 1024;
float *Ah = (float*)malloc(SIZE*SIZE*sizeof(float));
float *Bh = (float*)malloc(SIZE*SIZE*sizeof(float));
float *Ch = (float*)malloc(SIZE*SIZE*sizeof(float));
memset(Ah,0,SIZE*SIZE*sizeof(float));
memset(Bh,0,SIZE*SIZE*sizeof(float));
memset(Ch,0,SIZE*SIZE*sizeof(float));
float *Ad;
float *Bd;
float *Cd;
cudaMalloc((void**)&Ad,SIZE*SIZE*sizeof(float));
cudaMalloc((void**)&Bd,SIZE*SIZE*sizeof(float));
cudaMalloc((void**)&Cd,SIZE*SIZE*sizeof(float));
dim3 blockSize = dim3(16*16);
dim3 gridSize = dim3(SIZE/blockSize.x, SIZE/blockSize.y);
cudaMemcpy(Ad, Ah, SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(Bd, Bh, SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(Cd, Ch, SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
//ticks_before = get_counter();
matMul<<<gridSize, blockSize>>>(Ad, Bd, Cd, SIZE);
//ticks = get_counter();
//printf("%f; ", (ticks-ticks_before) / (SIZE*SIZE*SIZE));
cudaMemcpy(Ch,Cd,SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
printf("Done!\n");
return (0);
}[/codebox]
clock.c
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/times.h>
#include “clock.h”
/*
- Routines for using the cycle counter
*/
/* Detect whether running on Alpha */
#ifdef __alpha
#define IS_ALPHA 1
#define IS_ALPHA 0
/* Detect whether running on x86 */
#ifdef i386
#define IS_x86 1
#define IS_x86 0
#if IS_ALPHA
/* Initialize the cycle counter */
static unsigned cyc_hi = 0;
static unsigned cyc_lo = 0;
/* Use Alpha cycle timer to compute cycles. Then use
measured clock speed to compute seconds
*/
/*
-
counterRoutine is an array of Alpha instructions to access
-
the Alpha’s processor cycle counter. It uses the rpcc
-
instruction to access the counter. This 64 bit register is
-
divided into two parts. The lower 32 bits are the cycles
-
used by the current process. The upper 32 bits are wall
-
clock cycles. These instructions read the counter, and
-
convert the lower 32 bits into an unsigned int - this is the
-
user space counter value.
-
NOTE: The counter has a very limited time span. With a
-
450MhZ clock the counter can time things for about 9
-
seconds. */
static unsigned int counterRoutine =
{
0x601fc000u,
0x401f0000u,
0x6bfa8001u
};
/* Cast the above instructions into a function. */
static unsigned int (*counter)(void)= (void *)counterRoutine;
void start_counter()
{
/* Get cycle counter */
cyc_hi = 0;
cyc_lo = counter();
}
double get_counter()
{
unsigned ncyc_hi, ncyc_lo;
unsigned hi, lo, borrow;
double result;
ncyc_lo = counter();
ncyc_hi = 0;
lo = ncyc_lo - cyc_lo;
borrow = lo > ncyc_lo;
hi = ncyc_hi - cyc_hi - borrow;
result = (double) hi * (1 << 30) * 4 + lo;
if (result < 0) {
fprintf(stderr, "Error: Cycle counter returning negative value: %.0f\n", result);
}
return result;
}
#endif /* Alpha */
#if IS_x86
/* $begin x86cyclecounter */
/* Initialize the cycle counter */
static unsigned cyc_hi = 0;
static unsigned cyc_lo = 0;
/* Set *hi and *lo to the high and low order bits of the cycle counter.
Implementation requires assembly code to use the rdtsc instruction. */
void access_counter(unsigned *hi, unsigned *lo)
{
asm("rdtsc; movl %%edx,%0; movl %%eax,%1" /* Read cycle counter */
: "=r" (*hi), "=r" (*lo) /* and move results to */
: /* No input */ /* the two outputs */
: "%edx", "%eax");
}
/* Record the current value of the cycle counter. */
void start_counter()
{
access_counter(&cyc_hi, &cyc_lo);
}
/* Return the number of cycles since the last call to start_counter. */
double get_counter()
{
unsigned ncyc_hi, ncyc_lo;
unsigned hi, lo, borrow;
double result;
/* Get cycle counter */
access_counter(&ncyc_hi, &ncyc_lo);
/* Do double precision subtraction */
lo = ncyc_lo - cyc_lo;
borrow = lo > ncyc_lo;
hi = ncyc_hi - cyc_hi - borrow;
result = (double) hi * (1 << 30) * 4 + lo;
if (result < 0) {
fprintf(stderr, "Error: counter returns neg value: %.0f\n", result);
}
return result;
}
/* $end x86cyclecounter */
#endif /* x86 */
double ovhd()
{
/* Do it twice to eliminate cache effects */
int i;
double result;
for (i = 0; i < 2; i++) {
start_counter();
result = get_counter();
}
return result;
}
/* $begin mhz */
/* Estimate the clock rate by measuring the cycles that elapse */
/* while sleeping for sleeptime seconds */
double mhz(int verbose, int sleeptime)
{
double rate;
start_counter();
sleep(sleeptime);
rate = get_counter() / (1e6*sleeptime);
if (verbose)
printf("Processor clock rate ~= %.1f MHz\n", rate);
return rate;
}
/* $end mhz */
/* Version using a default sleeptime */
double mhz_def(int verbose)
{
return mhz(verbose, 2);
}
/** Special counters that compensate for timer interrupt overhead */
static double cyc_per_tick = 0.0;
#define NEVENT 100
#define THRESHOLD 1000
#define RECORDTHRESH 3000
/* Attempt to see how much time is used by timer interrupt */
static void callibrate(int verbose)
{
double oldt;
struct tms t;
clock_t oldc;
int e = 0;
times(&t);
oldc = t.tms_utime;
start_counter();
oldt = get_counter();
while (e <NEVENT) {
double newt = get_counter();
if (newt-oldt >= THRESHOLD) {
clock_t newc;
times(&t);
newc = t.tms_utime;
if (newc > oldc) {
double cpt = (newt-oldt)/(newc-oldc);
if ((cyc_per_tick == 0.0 || cyc_per_tick > cpt) && cpt > RECORDTHRESH)
cyc_per_tick = cpt;
/*
if (verbose)
printf("Saw event lasting %.0f cycles and %d ticks. Ratio = %f\n",
newt-oldt, (int) (newc-oldc), cpt);
*/
e++;
oldc = newc;
}
oldt = newt;
}
}
if (verbose)
printf("Setting cyc_per_tick to %f\n", cyc_per_tick);
}
static clock_t start_tick = 0;
void start_comp_counter()
{
struct tms t;
if (cyc_per_tick == 0.0)
callibrate(1);
times(&t);
start_tick = t.tms_utime;
start_counter();
}
double get_comp_counter()
{
double time = get_counter();
double ctime;
struct tms t;
clock_t ticks;
times(&t);
ticks = t.tms_utime - start_tick;
ctime = time - ticks*cyc_per_tick;
/*
printf("Measured %.0f cycles. Ticks = %d. Corrected %.0f cycles\n",
time, (int) ticks, ctime);
*/
return ctime;
}
[/codebox]
Thanks