Thank you for your reply.
So, following your advice, I wrote this little work of art that does no I/O and no CPU work:
import numpy as np
import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Activation, Dense, Multiply, Input
from keras import metrics
from keras.optimizers import Adam
from keras import backend as K
import warnings
warnings.filterwarnings("ignore")
class DataGenerator:
    """Synthetic batch source for exercising a training loop.

    The class holds no state. Batches are built in memory from constants
    via ``np.ones`` / ``np.zeros``; nothing is read from disk.
    """

    def create_train(self, batch_size, shape, n_classes=28):
        """Yield an endless stream of constant ``(images, labels)`` batches.

        Args:
            batch_size: Number of samples in every batch.
            shape: Indexable ``(height, width, channels)``; ``shape[2]``
                must equal 3.
            n_classes: Number of label columns per sample. Defaults to 28.

        Yields:
            A tuple ``(images, labels)``: ``images`` is an all-ones float64
            array of shape ``(batch_size, shape[0], shape[1], shape[2])``
            and ``labels`` is an all-zeros float64 array of shape
            ``(batch_size, n_classes)``.

        Raises:
            ValueError: When the generator is first advanced and
                ``shape[2]`` is not 3.
        """
        # Explicit exception instead of ``assert`` so the check survives
        # running Python with -O.
        if shape[2] != 3:
            raise ValueError(
                "expected 3 channels in shape[2], got {!r}".format(shape[2])
            )

        images_shape = (batch_size, shape[0], shape[1], shape[2])
        labels_shape = (batch_size, n_classes)
        while True:
            # np.ones already produces float64, so no follow-up astype()
            # copy of the whole batch is needed. Arrays are allocated anew
            # for each batch so a consumer may modify them freely.
            batch_images = np.ones(images_shape, dtype=np.float64)
            batch_labels = np.zeros(labels_shape, dtype=np.float64)
            yield batch_images, batch_labels
# Generator factory instance; its create_train() is called once for the
# training stream and once for the validation stream further down.
train_datagen = DataGenerator()
def create_model(input_shape, n_out):
    """Build the minimal benchmark network.

    The graph is ``Input -> GlobalAveragePooling2D -> Dense(softmax)``.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.
        n_out: Number of units in the final ``Dense`` layer.

    Returns:
        An uncompiled ``Model`` mapping the input tensor to the dense output.
    """
    inputs = Input(shape=input_shape)
    # Pooling is applied straight to the input, before the single Dense layer.
    pooled = keras.layers.GlobalAveragePooling2D()(inputs)
    outputs = Dense(n_out, activation="softmax")(pooled)
    return Model(inputs=inputs, outputs=outputs)
# Input geometry shared by the model and by both synthetic generators.
image_shape = (512, 512, 3)

model = create_model(input_shape=image_shape, n_out=28)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

# Run configuration.
epochs = 3
batch_size = 10

# Training and validation both draw from the same constant-data source.
train_generator = train_datagen.create_train(batch_size, image_shape)
validation_generator = train_datagen.create_train(batch_size, image_shape)

# The optimizer was created from the string 'adam', so its learning rate is
# overridden here, after compile().
K.set_value(model.optimizer.lr, 0.0001)

# 10000 samples per epoch, expressed as a number of batches.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=10000 // batch_size,
    validation_data=validation_generator,
    validation_steps=20,
    epochs=epochs,
    verbose=1,
)
The results were the same with both cards:
m@dl4:~/retina/models$ CUDA_VISIBLE_DEVICES="1" python t_gv100.py
Using TensorFlow backend.
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) (None, 512, 512, 3) 0
_________________________________________________________________
global_average_pooling2d_1 ( (None, 3) 0
_________________________________________________________________
dense_1 (Dense) (None, 28) 112
=================================================================
Total params: 112
Trainable params: 112
Non-trainable params: 0
_________________________________________________________________
2019-06-21 12:43:32.077198: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1405] Found device 0 with properties:
name: GeForce GTX TITAN X major: 5 minor: 2 memoryClockRate(GHz): 1.076
pciBusID: 0000:03:00.0
totalMemory: 11.93GiB freeMemory: 11.82GiB
2019-06-21 12:43:32.077237: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1484] Adding visible gpu devices: 0
2019-06-21 12:43:32.443339: I tensorflow/core/common_runtime/gpu/gpu_device.cc:965] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-06-21 12:43:32.443389: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0
2019-06-21 12:43:32.443399: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] 0: N
2019-06-21 12:43:32.444021: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1097] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11436 MB memory) -> physical GPU (device: 0, name: GeForce GTX TITAN X, pci bus id: 0000:03:00.0, compute capability: 5.2)
Epoch 1/3
1000/1000 [==============================] - 64s 64ms/step - loss: 0.0365 - acc: 1.0000 - val_loss: 0.0364 - val_acc: 1.0000
Epoch 2/3
1000/1000 [==============================] - 63s 63ms/step - loss: 0.0364 - acc: 1.0000 - val_loss: 0.0364 - val_acc: 1.0000
Epoch 3/3
1000/1000 [==============================] - 64s 64ms/step - loss: 0.0364 - acc: 1.0000 - val_loss: 0.0364 - val_acc: 1.0000
m@dl4:~/retina/models$ CUDA_VISIBLE_DEVICES="0" python t_gv100.py
Using TensorFlow backend.
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) (None, 512, 512, 3) 0
_________________________________________________________________
global_average_pooling2d_1 ( (None, 3) 0
_________________________________________________________________
dense_1 (Dense) (None, 28) 112
=================================================================
Total params: 112
Trainable params: 112
Non-trainable params: 0
_________________________________________________________________
2019-06-21 12:38:35.289458: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1405] Found device 0 with properties:
name: Quadro GV100 major: 7 minor: 0 memoryClockRate(GHz): 1.627
pciBusID: 0000:04:00.0
totalMemory: 31.72GiB freeMemory: 31.41GiB
2019-06-21 12:38:35.289504: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1484] Adding visible gpu devices: 0
2019-06-21 12:38:35.673814: I tensorflow/core/common_runtime/gpu/gpu_device.cc:965] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-06-21 12:38:35.673864: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0
2019-06-21 12:38:35.673874: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] 0: N
2019-06-21 12:38:35.675236: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1097] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 30472 MB memory) -> physical GPU (device: 0, name: Quadro GV100, pci bus id: 0000:04:00.0, compute capability: 7.0)
Epoch 1/3
1000/1000 [==============================] - 64s 64ms/step - loss: 0.0365 - acc: 1.0000 - val_loss: 0.0364 - val_acc: 1.0000
Epoch 2/3
1000/1000 [==============================] - 64s 64ms/step - loss: 0.0364 - acc: 1.0000 - val_loss: 0.0364 - val_acc: 1.0000
Epoch 3/3
1000/1000 [==============================] - 65s 65ms/step - loss: 0.0364 - acc: 1.0000 - val_loss: 0.0364 - val_acc: 1.0000
This still seems wrong to me… I’ll try to run nvprof next. What should I run it on: this script or my real model?