Commit 983ff5a4 authored by Eva Lina Fesefeldt

Comparison of NCG and Adam on TTT

parent 32e91fbf
import tensorflow as tf
# Load MNIST
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # scale pixel values to [0, 1]
x_train = tf.reshape(x_train, (60000, 28*28))
y_train = tf.one_hot(y_train, depth=10)

# Create the model
size_hidden_layer = 30
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as k
model = Sequential()
model.add(Dense(size_hidden_layer, input_dim=28*28, activation='sigmoid'))
model.add(Dense(10, input_dim=size_hidden_layer, activation='sigmoid'))
loss_fn = tf.keras.losses.MeanSquaredError()
model.compile(optimizer="adam", loss = loss_fn)
model.fit(x_train, y_train, epochs=10000)
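
# Sketch (not part of the original script): a quick check of the trained model on the
# held-out MNIST test set, using the same preprocessing as for the training data.
import numpy as np
x_test_flat = tf.reshape(x_test, (10000, 28*28))
y_test_onehot = tf.one_hot(y_test, depth=10)
pred = model.predict(x_test_flat)
print("Test MSE:", loss_fn(y_test_onehot, pred).numpy())
print("Test accuracy:", np.mean(np.argmax(pred, axis=1) == y_test))
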
import tensorflow as tf
import numpy as np
def reelles_skalarprodukt_trainable_shape(v_1, v_2):
    # real inner product of two lists of tensors in "trainable" shape
    total = 0
    for i in range(len(v_1)):
        total += np.sum(v_1[i]*v_2[i])
    return total
# TODO: rewrite for a general model
def vector_flat_shape_to_trainable_shape(v):
    try:
        p, m = v.shape
    except ValueError:
        p = v.shape[0]
    n = int((p-3)/13)  # size of the hidden layer
    slice1 = 9*n
    slice2 = 9*n + n
    slice3 = 9*n + n + n*3
    v1 = tf.reshape(v[:slice1], (9, n))
    v2 = tf.reshape(v[slice1:slice2], (n,))
    v3 = tf.reshape(v[slice2:slice3], (n, 3))
    v4 = tf.reshape(v[slice3:], (3,))
    unit_vector_trainable_shape = [v1, v2, v3, v4]
    return unit_vector_trainable_shape
# TODO: rewrite for general n and a general model
def vector_trainable_shape_to_flat_shape(v_list):
    # number of parameters p
    p = 0
    for i in range(len(v_list)):
        try:
            n, m = v_list[i].shape
        except ValueError:
            n = v_list[i].shape[0]
            m = 1
        finally:
            p += n*m
    n = int((p-3)/13)  # size of the hidden layer
    v1 = v_list[0]
    v2 = v_list[1]
    v3 = v_list[2]
    v4 = v_list[3]
    slice1 = 9*n
    slice2 = 9*n + n
    slice3 = 9*n + n + n*3
    v = np.zeros((p,))
    v[:slice1] = np.reshape(v1, (9*n,))
    v[slice1:slice2] = np.reshape(v2, (n,))
    v[slice2:slice3] = np.reshape(v3, (3*n,))
    v[slice3:] = np.reshape(v4, (3,))
    return v
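
# Sketch (not part of the original helpers): a small self-test that the two conversions
# above are inverse to each other; the hidden-layer size n is a free choice here and the
# parameter count p = 13*n + 3 matches the layout assumed by the functions above.
def _check_flat_trainable_roundtrip(n=5):
    flat = np.random.rand(13*n + 3)
    trainable = vector_flat_shape_to_trainable_shape(flat)
    flat_again = vector_trainable_shape_to_flat_shape(trainable)
    return np.max(np.abs(flat - flat_again))
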
def matrix_trainable_shape_to_flat_shape(model, h):
    # h is the list returned by an outer tape's jacobian() of the flattened gradient
    # w.r.t. [layer1.kernel, layer1.bias, layer2.kernel, layer2.bias]
    layer1 = model.layers[0]
    layer2 = model.layers[1]
    n_params = tf.reduce_prod(layer1.kernel.shape) + tf.reduce_prod(layer2.kernel.shape) + tf.reduce_prod(layer1.bias.shape) + tf.reduce_prod(layer2.bias.shape)
    # h[0] is the derivative of the gradient w.r.t. the layer-1 weights
    n_params_D_weights_1 = tf.reduce_prod(layer1.kernel.shape)
    H_weights_1 = tf.reshape(h[0], [n_params, n_params_D_weights_1])
    # h[1] is the derivative of the gradient w.r.t. the layer-1 biases
    n_params_D_bias_1 = tf.reduce_prod(layer1.bias.shape)
    H_bias_1 = tf.reshape(h[1], [n_params, n_params_D_bias_1])
    # h[2] is the derivative of the gradient w.r.t. the layer-2 weights
    n_params_D_weights_2 = tf.reduce_prod(layer2.kernel.shape)
    H_weights_2 = tf.reshape(h[2], [n_params, n_params_D_weights_2])
    # h[3] is the derivative of the gradient w.r.t. the layer-2 biases
    n_params_D_bias_2 = tf.reduce_prod(layer2.bias.shape)
    H_bias_2 = tf.reshape(h[3], [n_params, n_params_D_bias_2])
    # assemble the Hessian matrix (TODO: preallocate instead of concatenating)
    h_mat = tf.concat([H_weights_1, H_bias_1, H_weights_2, H_bias_2], axis=1)
    return h_mat
def matrix_flat_shape_to_trainable_shape(model, A):
    # TODO: not yet implemented
    layer1 = model.layers[0]
    layer2 = model.layers[1]
    n_params, m = A.shape
    A_trainable = []
    #A_trainable.append(tf.reshape())
# Jacobi-Vector-Products
# Forward Mode AD
import tensorflow as tf
from tensorflow.python.eager import forwardprop
from tensorflow.python.ops.gen_array_ops import reshape
def _back_over_forward_hvp(model, images, labels, vector, loss_fn):
    layer1 = model.layers[0]
    layer2 = model.layers[1]
    x = tf.constant(images)
    with tf.GradientTape() as grad_tape:
        grad_tape.watch(model.trainable_variables)
        with forwardprop.ForwardAccumulator(model.trainable_variables, vector) as acc:
            x = layer1(x)
            x = layer2(x)
            loss = loss_fn(x, labels)
    return grad_tape.gradient(acc.jvp(loss), model.trainable_variables)
def _tf_gradients_forward_over_back_hvp(model, images, labels, vector, loss_fn):
    # note: tf.gradients only works inside a graph context (e.g. under @tf.function)
    with tf.GradientTape() as grad_tape:
        y = model(images)
        loss = loss_fn(y, labels)
    variables = model.trainable_variables
    grads = grad_tape.gradient(loss, variables)
    helpers = tf.nest.map_structure(tf.ones_like, grads)
    transposing = tf.gradients(grads, variables, helpers)
    return tf.gradients(transposing, helpers, vector)
def _back_over_back_hvp(model, images, labels, vector, loss_fn):
    with tf.GradientTape() as outer_tape:
        with tf.GradientTape() as inner_tape:
            y = model(images)
            loss = loss_fn(y, labels)
        grads = inner_tape.gradient(loss, model.trainable_variables)
    return outer_tape.gradient(
        grads, model.trainable_variables, output_gradients=vector)
if __name__ == "__main__":
    x = tf.constant([[2.0, 3.0], [1.0, 4.0]])
    targets = tf.constant([[1.], [-1.]])
    dense = tf.keras.layers.Dense(1)
    dense.build([None, 2])
    # dense.kernel (2 parameters) and dense.bias (1 parameter)
    # primals: parameters, tangents: vector in the Jacobian-vector product
    #with tf.autodiff.ForwardAccumulator(primals=dense.kernel, tangents=tf.constant([[1.], [0.]])) as acc:
    #    loss = tf.reduce_sum((dense(x) - targets) ** 2.)
    #print(acc.jvp(loss))
    #HV = _back_over_forward_hvp()
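
    # Sketch (not in the original file): exercise _back_over_back_hvp on a tiny
    # two-layer model; the toy model, loss, and all-ones tangent below are assumptions
    # chosen only to make the call runnable.
    toy_model = tf.keras.Sequential([
        tf.keras.layers.Dense(4, activation='sigmoid', input_shape=(2,)),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    toy_loss = tf.keras.losses.MeanSquaredError()
    tangent = [tf.ones_like(v) for v in toy_model.trainable_variables]
    hv = _back_over_back_hvp(toy_model, x, targets, tangent, toy_loss)
    print([t.shape for t in hv])
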
import numpy as np
import tensorflow as tf
from tensorflow import keras
from jvp import _back_over_back_hvp
from helper import *
from helper import reelles_skalarprodukt_trainable_shape

def norm_trainable_shape(x):
    return np.sqrt(reelles_skalarprodukt_trainable_shape(x, x))

def CG_trainable(model, train_set, train_labels, loss_fn, b, x_0, m=None, tau=None):
    """Approximately solve H*x = b with (unpreconditioned) CG, where the Hessian H
    of the loss is applied implicitly via Hessian-vector products."""
    n = model.count_params()
    if m is None:
        m = n
    if tau is None:
        tau = 10e-8
    tol = tau*norm_trainable_shape(b)
    x = x_0.copy()
    weights_and_biases_list = model.get_weights()
    Ax = _back_over_back_hvp(model, train_set, train_labels, x, loss_fn)
    r = []
    for i in range(len(weights_and_biases_list)):
        r.append(b[i] - Ax[i])
    p = r.copy()
    rho_minus = reelles_skalarprodukt_trainable_shape(r, r)
    for k in range(m):
        v = _back_over_back_hvp(model, train_set, train_labels, p, loss_fn)
        alpha = rho_minus / reelles_skalarprodukt_trainable_shape(p, v)
        for i in range(len(weights_and_biases_list)):
            r[i] = r[i] - alpha * v[i]
        for i in range(len(weights_and_biases_list)):
            x[i] = x[i] + alpha * p[i]
        if norm_trainable_shape(r) < tol:
            break
        rho = reelles_skalarprodukt_trainable_shape(r, r)
        for i in range(len(weights_and_biases_list)):
            p[i] = r[i] + rho/rho_minus * p[i]
        rho_minus = rho
    return x
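
# Sketch (not part of the original script): a small diagnostic helper that measures how
# well CG solved H*x = b, using one extra Hessian-vector product.
def cg_residual_norm(model, train_set, train_labels, loss_fn, b, x):
    Hx = _back_over_back_hvp(model, train_set, train_labels, x, loss_fn)
    res = [b[i] - Hx[i] for i in range(len(b))]
    return norm_trainable_shape(res)
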
# Create the data set; the training loop below uses train_set / train_labels,
# i.e. the tic-tac-toe data matching the 9-input model defined next
from generate_dataset import generate_tictactoe
train_set, train_labels = generate_tictactoe()

# Create the model
size_hidden_layer = 30
number_of_epochs = 10000
number_of_parameters = 9*size_hidden_layer + size_hidden_layer + 3 * size_hidden_layer + 3
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as k
model = Sequential()
model.add(Dense(size_hidden_layer, input_dim=9, activation='sigmoid'))
model.add(Dense(3, input_dim=size_hidden_layer, activation='sigmoid'))
# Initialize weights and biases from files so that later runs are comparable
filename_W1 = "initializers/W_1_n" + str(size_hidden_layer) + ".npy"
filename_b1 = "initializers/b_1_n" + str(size_hidden_layer) + ".npy"
filename_W2 = "initializers/W_2_n" + str(size_hidden_layer) + ".npy"
filename_b2 = "initializers/b_2_n" + str(size_hidden_layer) + ".npy"
W_1 = np.load(filename_W1)
b_1 = np.load(filename_b1)
W_2 = np.load(filename_W2)
b_2 = np.load(filename_b2)
list_of_weights_and_biases = [W_1, b_1, W_2, b_2]
model.set_weights(list_of_weights_and_biases)
loss_fn = tf.keras.losses.MeanSquaredError()
model.compile(optimizer='adam', loss=loss_fn)
# Pre-training (optional)
#model.fit(train_set, train_labels, batch_size=32, epochs = 1000, verbose=0)
loss = loss_fn(model.predict(train_set), train_labels)
print("Loss before training: ", loss.numpy())
loss_numpy = np.zeros(number_of_epochs)
filename = "results/MSE_loss_NCG_backtracking_epochs" + str(number_of_epochs) + ".npy"
b = [0,0,0,0]
for epoch in range(number_of_epochs):
    x = model.get_weights()

    # Compute the gradient
    layer1 = model.layers[0]
    layer2 = model.layers[1]
    watch = train_set
    with tf.GradientTape() as t1:
        watch = layer1(watch)
        watch = layer2(watch)
        loss = loss_fn(train_labels, watch)
    g = t1.gradient(loss, [layer1.kernel, layer1.bias, layer2.kernel, layer2.bias])

    # Set up the right-hand side b = H*x - g; solving H*x_new = b then
    # yields the Newton step x_new = x - H^{-1}*g
    Hx = _back_over_back_hvp(model, train_set, train_labels, x, loss_fn)
    for i in range(len(g)):
        b[i] = Hx[i] - g[i]

    # Solve approximately for the Newton step with CG
    m = 200
    x_new = CG_trainable(model, train_set, train_labels, loss_fn, b, x, m=m, tau=1e-3)

    # Backtracking line search (Armijo condition)
    descent_dir = []
    for i in range(len(list_of_weights_and_biases)):
        descent_dir.append(x_new[i] - x[i])
    loss = loss_fn(model.predict(train_set), train_labels).numpy()
    alpha = 1
    rho = 0.5
    c = 1e-4
    iter_max = 10000
    for k in range(iter_max):
        # sufficient-decrease threshold for the current step size alpha
        cond = loss + c*alpha*reelles_skalarprodukt_trainable_shape(g, descent_dir)
        test_step = []
        for i in range(len(list_of_weights_and_biases)):
            test_step.append(x[i] + alpha*descent_dir[i])
        model.set_weights(test_step)
        loss_test = loss_fn(model.predict(train_set), train_labels).numpy()
        if loss_test < cond:
            break
        alpha = rho*alpha

    loss = loss_fn(model.predict(train_set), train_labels)
    loss_numpy[epoch] = loss.numpy()
    np.save(filename, loss_numpy)
    if epoch % 10 == 0:
        print("Epoch ", epoch, " Loss: ", loss.numpy())
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from numpy.linalg import norm
from generate_dataset import generate_tictactoe
from helper import matrix_trainable_shape_to_flat_shape
# Create the data
train_set, train_labels = generate_tictactoe()
dataset = tf.data.Dataset.from_tensor_slices((train_set, train_labels))
# Create the model
size_hidden_layer = 30
number_of_parameters = 9*size_hidden_layer + size_hidden_layer + 3 * size_hidden_layer + 3
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as k
model = Sequential()
model.add(Dense(size_hidden_layer, input_dim=9, activation='sigmoid'))
model.add(Dense(3, input_dim=size_hidden_layer, activation='sigmoid'))
W_1 = np.load("results/W_1_trained_epoch0.npy")
b_1 = np.load("results/b_1_trained_epoch0.npy")
W_2 = np.load("results/W_2_trained_epoch0.npy")
b_2 = np.load("results/b_2_trained_epoch0.npy")
list_of_weights_and_biases = [W_1, b_1, W_2, b_2]
model.set_weights(list_of_weights_and_biases)
loss_fn = tf.keras.losses.MeanSquaredError()
# Hessian matrix, used to check conjugacy
layer1 = model.layers[0]
layer2 = model.layers[1]
watch = train_set
with tf.GradientTape() as t2:
    with tf.GradientTape() as t1:
        watch = layer1(watch)
        watch = layer2(watch)
        loss = loss_fn(train_labels, watch)
    g = t1.gradient(loss, [layer1.kernel, layer1.bias, layer2.kernel, layer2.bias])
    gradient_flat = tf.concat([tf.reshape(g[0], [9*size_hidden_layer, 1]),
                               tf.reshape(g[1], [size_hidden_layer, 1]),
                               tf.reshape(g[2], [size_hidden_layer*3, 1]),
                               tf.reshape(g[3], [3, 1])], axis=0)
hessian_trainable = t2.jacobian(gradient_flat, [layer1.kernel, layer1.bias, layer2.kernel, layer2.bias])
hessian_flat = matrix_trainable_shape_to_flat_shape(model, hessian_trainable).numpy()
# Load the data recorded during the CG iteration
P = np.load("results/CG_P_epoch0.npy")
R = np.load("results/CG_R_epoch0.npy")
rho = np.load("results/CG_rho.npy")
trueres = np.load("results/CG_trueres.npy")
rho_flat = np.load("results/CG_rho_flat.npy")
trueres_flat = np.load("results/CG_trueres_flat.npy")
hessian_flat_test = np.load("results/hessian-flat.npy")
print("Fehler Hesse-Matrix", norm(hessian_flat-hessian_flat_test))
temp, m = P.shape
# A-orthogonality of the search directions?
A_ortho_P = np.zeros((m,m))
for i in range(m):
    for k in range(m):
        A_ortho_P[i, k] = np.dot(P[:, i], hessian_flat @ P[:, k])
err_A_ortho_P = np.log(np.abs(A_ortho_P - np.eye(m)))
# Orthogonality of the residuals?
ortho_R = np.zeros((m,m))
for i in range(m):
    for k in range(m):
        ortho_R[i, k] = np.dot(R[:, i], R[:, k])
err_ortho_R = np.log(np.abs(ortho_R - np.eye(m)))
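# Note (sketch): the two double loops above are equivalent to the vectorized forms
#   A_ortho_P = P.T @ (hessian_flat @ P)   and   ortho_R = R.T @ R,
# which avoid the Python-level loops for larger m.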
X = np.arange(1, m+1)
Y = np.arange(1, m+1)
X, Y = np.meshgrid(X, Y)
# Plot
fig = plt.figure(1, figsize=(12,6))
ax = fig.add_subplot(projection='3d')
surface = ax.plot_surface(X, Y, err_A_ortho_P, cmap=plt.cm.jet, norm=plt.Normalize(-16, max(err_A_ortho_P.max(), 0)))
plt.colorbar(surface)
#ax.set_zlim(-16, 0)
ax.view_init(25,245)
plt.title('Error in the A-conjugacy of the search directions')
plt.show()
plt.close()
plt.semilogy(rho[:50], label="Implicit Hessian of the network")
plt.semilogy(rho_flat[:50], label="Explicit Hessian of the network")
plt.legend()
plt.show()
plt.close()
x_norm_flat = np.load("results/norm_x_flat.npy")
p_norm_flat = np.load("results/norm_p_flat.npy")
v_norm_flat = np.load("results/norm_v_flat.npy")
r_norm_flat = np.load("results/norm_r_flat.npy")
x_norm = np.load("results/norm_x.npy")
p_norm = np.load("results/norm_p.npy")
v_norm = np.load("results/norm_v.npy")
r_norm = np.load("results/norm_r.npy")
plt.semilogy(x_norm_flat[:20], 'r-')
plt.semilogy(x_norm[:20], 'r--', label="norm(x)")
plt.semilogy(p_norm_flat[:20], 'b-')
plt.semilogy(p_norm[:20], 'b--', label="norm(p)")
plt.semilogy(v_norm_flat[:20], 'g-')
plt.semilogy(v_norm[:20], 'g--', label="norm(v)")
plt.semilogy(r_norm_flat[:20], 'c-')
plt.semilogy(r_norm[:20], 'c--', label="norm(r)")
plt.legend()
plt.show()
@@ -27,14 +27,6 @@ def CG_trainable(model, train_set, train_labels, loss_fn, b, x_0, m=None, tau=None):
    tol = tau*norm_trainable_shape(b)
    P = np.zeros((n,m))
    R = np.zeros((n,m))
    resvec = np.zeros(m+1)
    trueres = np.zeros(m+1)
    norm_x = np.zeros(m+1)
    norm_r = np.zeros(m+1)
    norm_p = np.zeros(m+1)
    norm_v = np.zeros(m+1)
    x = x_0.copy()
    weights_and_biases_list = model.get_weights()
@@ -47,52 +39,33 @@ def CG_trainable(model, train_set, train_labels, loss_fn, b, x_0, m=None, tau=None):
    rho_minus = reelles_skalarprodukt_trainable_shape(r,r)
    resvec[0] = rho_minus
    trueres[0] = rho_minus
    # save initial values
    norm_x[0] = norm_trainable_shape(x)
    norm_r[0] = norm_trainable_shape(r)
    norm_p[0] = norm_trainable_shape(p)
    for k in range(m):
        v = _back_over_back_hvp(model, train_set, train_labels, p, loss_fn)
        norm_v[k+1] = norm_trainable_shape(v)
        alpha = (rho_minus / reelles_skalarprodukt_trainable_shape(p,v))
        for i in range(len(weights_and_biases_list)):
            r[i] = r[i] - alpha * v[i]
        norm_r[k+1] = norm_trainable_shape(r)
        for i in range(len(weights_and_biases_list)):
            x[i] = x[i] + alpha * p[i]
        norm_x[k+1] = norm_trainable_shape(x)
        if norm_trainable_shape(r) < tol:
            resvec[k+1] = rho
            print("Stopping CG after ", k, " steps")
            break
        rho = reelles_skalarprodukt_trainable_shape(r,r)
        for i in range(len(weights_and_biases_list)):
            p[i] = r[i] + rho/rho_minus * p[i]
        norm_p[k+1] = norm_trainable_shape(p)
        rho_minus = rho
        # save P, R, and the true residual
        P[:,k] = vector_trainable_shape_to_flat_shape(p)
        R[:,k] = vector_trainable_shape_to_flat_shape(r)
        resvec[k+1] = rho
        Hx = _back_over_back_hvp(model, train_set, train_labels, x, loss_fn)
        temp = []
        for i in range(len(weights_and_biases_list)):
            temp.append(b[i] - Hx[i])
        trueres[k+1] = reelles_skalarprodukt_trainable_shape(temp, temp)
    return x, P, R, resvec, trueres, norm_p, norm_x, norm_r, norm_v
    return x
# Create the data
train_set, train_labels = generate_tictactoe()
@@ -100,6 +73,7 @@ dataset = tf.data.Dataset.from_tensor_slices((train_set, train_labels))
# Create the model
size_hidden_layer = 30
number_of_epochs = 10000
number_of_parameters = 9*size_hidden_layer + size_hidden_layer + 3 * size_hidden_layer + 3
from keras.models import Sequential
from keras.layers import Dense
@@ -131,11 +105,12 @@ model.compile(optimizer='adam', loss=loss_fn)
loss = loss_fn(model.predict(train_set), train_labels)
print("Vortrainiert bis Loss: ", loss.numpy())
loss_numpy = np.zeros(number_of_epochs)
filename = "results/MSE_loss_NCG_backtracking_epochs" + str(number_of_epochs) + ".npy"
b = [0,0,0,0]
for epoch in range(1000):
for epoch in range(number_of_epochs):
    x = model.get_weights()
@@ -156,29 +131,11 @@ for epoch in range(1000):
    for i in range(len(g)):
        b[i] = Hx[i] - g[i]
    print("||b||^2 = ", reelles_skalarprodukt_trainable_shape(b, b))
    # Solve approximately for the Newton step with CG
    m = 200
    x_new, P, R, rho, trueres, norm_p, norm_x, norm_r, norm_v = CG_trainable(model, train_set, train_labels, loss_fn, b, x, m=m, tau=1e-3)
    # Save data from the CG iteration for later convergence analysis
    if epoch == 0:
        np.save("results/CG_P_epoch0.npy", P)
        np.save("results/CG_R_epoch0.npy", R)
        np.save("results/CG_rho.npy", rho)
        np.save("results/CG_trueres.npy", trueres)
        np.save("results/norm_p.npy", norm_p)
        np.save("results/norm_x.npy", norm_x)
        np.save("results/norm_r.npy", norm_r)
        np.save("results/norm_v.npy", norm_v)