"""Compare automatic differentiation with hand-derived gradient and Hessian.

The network has 9 inputs, ONE sigmoid hidden neuron and 3 sigmoid outputs,
trained with a mean squared error loss:

    z_1 = x @ W_1 + b_1          (hidden pre-activation, one column)
    u_c = sigma(z_1) * w_c + b_c (pre-activation of output component c)
    f_c = tau(u_c)

Core functionality: ``grad_and_hesse_matrix`` evaluates the analytical
derivatives of the loss with respect to all parameters; the script part
compares the resulting Hessian with the one TensorFlow obtains by autodiff.

Flat parameter order used everywhere: W_1 (row by row), b_1, W_2, b_2.
"""
import numpy as np
from numpy.linalg import norm
import tensorflow as tf
import matplotlib as mpl
import matplotlib.pyplot as plt

from generate_dataset import generate_tictactoe


# --- Activation functions and their derivatives -----------------------------

def tau(x):
    """Output-layer activation: the logistic sigmoid."""
    return 1 / (1 + np.exp(-x))


def ddx_tau(x):
    """Element-wise first derivative of ``tau``."""
    return tau(x) * (1 - tau(x))


def sigma(x):
    """Hidden-layer activation: the logistic sigmoid."""
    return 1 / (1 + np.exp(-x))


def ddx_sigma(x):
    """Element-wise first derivative of ``sigma``."""
    return sigma(x) * (1 - sigma(x))


def ddx2_sigma(x):
    """Element-wise second derivative of ``sigma``."""
    return ddx_sigma(x) * (1 - 2 * sigma(x))


def ddx2_tau(x):
    """Element-wise second derivative of ``tau``."""
    return ddx_tau(x) * (1 - 2 * tau(x))


# --- Chain-rule building blocks ---------------------------------------------
# All primitives take the hidden pre-activation ``z_1`` as a column, the
# output weight(s) ``w`` and the output pre-activation(s) ``u``.  ``w``/``u``
# may describe one output component (scalar / one column) or all of them
# (vector / one column per component); NumPy broadcasting covers both.

def _as_column(values):
    """Return ``values`` as a float column vector of shape (n, 1)."""
    return np.reshape(np.asarray(values, dtype=float), (-1, 1))


def _d2f_dz2(z_1, w, u):
    """Second derivative of the network output w.r.t. ``z_1``."""
    return w * (ddx2_sigma(z_1) * ddx_tau(u)
                + w * ddx_sigma(z_1) ** 2 * ddx2_tau(u))


def _d2f_dz_dw(z_1, w, u):
    """Mixed derivative of the output w.r.t. ``z_1`` and its own output weight."""
    return ddx_sigma(z_1) * (ddx_tau(u) + w * sigma(z_1) * ddx2_tau(u))


def _d2f_dz_db(z_1, w, u):
    """Mixed derivative of the output w.r.t. ``z_1`` and its own output bias."""
    return w * ddx_sigma(z_1) * ddx2_tau(u)


def _all_components(z_1, theta_11, theta_12, theta_13, theta_14, theta_15, theta_16):
    """Return the output weights as a vector and all output pre-activations."""
    w = np.array([theta_11, theta_12, theta_13], dtype=float)
    b = np.array([theta_14, theta_15, theta_16], dtype=float)
    return w, sigma(z_1) * w + b


def _mixed_with_W_1(primitive, component, i, train_set, z_1, weight, bias):
    """Mixed derivative w.r.t. W_1[i] and a parameter of one output component.

    Only output ``component`` depends on that parameter, so the other two
    columns of the returned (n_samples, 3) array stay zero.
    """
    z_1 = _as_column(z_1)
    x_i = _as_column(train_set[:, i])
    u = sigma(z_1) * weight + bias
    result = np.zeros((z_1.shape[0], 3))
    result[:, component] = (x_i * primitive(z_1, weight, u))[:, 0]
    return result


# --- Helpers: second derivatives of the network output ----------------------
# Every helper returns an array of shape (n_samples, 3): rows are the data
# points x_j, columns the three output components entering the local loss.
# ``i`` and ``k`` are zero-based column indices into ``train_set``.

def d_W_1_d_W_1(k, i, train_set, z_1, theta_11, theta_12, theta_13, theta_14, theta_15, theta_16):
    """Second derivative of the output w.r.t. W_1[k] and W_1[i]."""
    z_1 = _as_column(z_1)
    w, u = _all_components(z_1, theta_11, theta_12, theta_13, theta_14, theta_15, theta_16)
    x_i = _as_column(train_set[:, i])
    x_k = _as_column(train_set[:, k])
    return x_i * x_k * _d2f_dz2(z_1, w, u)


def d_b_1_d_w_1(i, train_set, z_1, theta_11, theta_12, theta_13, theta_14, theta_15, theta_16):
    """Mixed derivative of the output w.r.t. b_1 and W_1[i]."""
    z_1 = _as_column(z_1)
    w, u = _all_components(z_1, theta_11, theta_12, theta_13, theta_14, theta_15, theta_16)
    x_i = _as_column(train_set[:, i])
    return x_i * _d2f_dz2(z_1, w, u)


def d_theta11_d_W_1(i, train_set, z_1, theta_11, theta_12, theta_13, theta_14, theta_15, theta_16):
    """Mixed derivative w.r.t. W_2[0, 0] and W_1[i] (only column 0 is non-zero)."""
    return _mixed_with_W_1(_d2f_dz_dw, 0, i, train_set, z_1, theta_11, theta_14)


def d_theta12_d_W_1(i, train_set, z_1, theta_11, theta_12, theta_13, theta_14, theta_15, theta_16):
    """Mixed derivative w.r.t. W_2[0, 1] and W_1[i] (only column 1 is non-zero)."""
    return _mixed_with_W_1(_d2f_dz_dw, 1, i, train_set, z_1, theta_12, theta_15)


def d_theta13_d_W_1(i, train_set, z_1, theta_11, theta_12, theta_13, theta_14, theta_15, theta_16):
    """Mixed derivative w.r.t. W_2[0, 2] and W_1[i] (only column 2 is non-zero)."""
    return _mixed_with_W_1(_d2f_dz_dw, 2, i, train_set, z_1, theta_13, theta_16)


def d_theta14_d_W_1(i, train_set, z_1, theta_11, theta_12, theta_13, theta_14, theta_15, theta_16):
    """Mixed derivative w.r.t. b_2[0] and W_1[i] (only column 0 is non-zero)."""
    return _mixed_with_W_1(_d2f_dz_db, 0, i, train_set, z_1, theta_11, theta_14)


def d_theta15_d_W_1(i, train_set, z_1, theta_11, theta_12, theta_13, theta_14, theta_15, theta_16):
    """Mixed derivative w.r.t. b_2[1] and W_1[i] (only column 1 is non-zero)."""
    return _mixed_with_W_1(_d2f_dz_db, 1, i, train_set, z_1, theta_12, theta_15)


def d_theta16_d_W_1(i, train_set, z_1, theta_11, theta_12, theta_13, theta_14, theta_15, theta_16):
    """Mixed derivative w.r.t. b_2[2] and W_1[i] (only column 2 is non-zero)."""
    return _mixed_with_W_1(_d2f_dz_db, 2, i, train_set, z_1, theta_13, theta_16)


# --- Analytically computed derivatives --------------------------------------

def grad_and_hesse_matrix(model, train_set, train_labels):
    """Return the hand-derived gradient and Hessian of the MSE loss.

    The loss is C(theta) = mean over samples and output components of
    (f(x_j) - y_j)^2.  With the residual r = f - y, the derivatives are

        dC/dtheta_p           = 2/(n*m) * sum(r * df/dtheta_p)
        d2C/dtheta_p dtheta_q = 2/(n*m) * sum(r * d2f/dtheta_p dtheta_q
                                              + df/dtheta_p * df/dtheta_q)

    Args:
        model: Keras model whose ``get_weights()`` yields W_1, b_1, W_2, b_2
            of a network with exactly one hidden neuron.  The model is only
            read (weights and ``predict``); it is not recompiled here.
        train_set: inputs, one row per sample
            (presumably shape (126, 9) -- confirm against generate_tictactoe).
        train_labels: targets, one row per sample, one column per output.

    Returns:
        ``(gradient_hand, hessian_hand)`` with shapes (P,) and (P, P), where
        P = n_inputs + 1 + 2 * n_outputs (16 for the 9-1-3 network) and the
        parameters are ordered W_1, b_1, W_2, b_2.

    Raises:
        ValueError: if the hidden layer does not have exactly one neuron.
    """
    weights_and_biases_list = model.get_weights()
    W_1 = weights_and_biases_list[0]
    b_1 = weights_and_biases_list[1]
    W_2 = weights_and_biases_list[2]
    b_2 = weights_and_biases_list[3]
    if W_1.shape[1] != 1:
        raise ValueError("the analytical formulas assume exactly one hidden neuron")

    x = np.asarray(train_set, dtype=float)
    y = np.asarray(train_labels, dtype=float)
    n_samples, n_inputs = x.shape
    n_outputs = W_2.shape[1]

    # Loss as computed by Keras (for cross-checking only).
    loss_fn = tf.keras.losses.MeanSquaredError()
    loss_keras = loss_fn(train_labels, model.predict(train_set))

    # Forward pass.
    z_1 = x @ W_1 + b_1              # (n_samples, 1)
    a_2 = sigma(z_1)                 # (n_samples, 1)
    z_2 = a_2 @ W_2 + b_2            # (n_samples, n_outputs)
    f_x = tau(z_2)
    residual = f_x - y

    # Mean squared error
    loss_hand = np.mean(residual ** 2)
    print("Keras berechnet als Loss: ", loss_keras.numpy(), "Per Hand: ", loss_hand)

    w = W_2[0, :]                    # output weights, one per component
    scale = 1 / (n_samples * n_outputs)

    # Index layout of the flat parameter vector.
    n_hidden = n_inputs + 1          # W_1 entries followed by b_1
    n_params = n_hidden + 2 * n_outputs
    hidden = slice(0, n_hidden)
    w_idx = n_hidden + np.arange(n_outputs)
    b_idx = w_idx + n_outputs

    # dz_1/dtheta for the hidden parameters: x_i for W_1[i], 1 for b_1.
    x_aug = np.hstack([x, np.ones((n_samples, 1))])

    ### ---------------------------- GRADIENT ---------------------------- ###
    # d_dtheta_f_x_j[p] holds df(x_j)/dtheta_p with rows = data points and
    # columns = output components.
    df_dz = w * ddx_sigma(z_1) * ddx_tau(z_2)
    df_dw = a_2 * ddx_tau(z_2)       # column c: derivative w.r.t. its own weight
    df_db = ddx_tau(z_2)             # column c: derivative w.r.t. its own bias

    d_dtheta_f_x_j = np.zeros((n_params, n_samples, n_outputs))
    d_dtheta_f_x_j[hidden] = x_aug.T[:, :, None] * df_dz[None, :, :]
    for c in range(n_outputs):
        # An output parameter only influences its own output component.
        d_dtheta_f_x_j[w_idx[c], :, c] = df_dw[:, c]
        d_dtheta_f_x_j[b_idx[c], :, c] = df_db[:, c]

    gradient_hand = 2 * scale * np.einsum('pnc,nc->p', d_dtheta_f_x_j, residual)

    ### -------------------------- HESSE MATRIX -------------------------- ###
    # First-order part: sum over samples/components of df/dtheta_p * df/dtheta_q.
    first_order = np.einsum('pnc,qnc->pq', d_dtheta_f_x_j, d_dtheta_f_x_j)

    # Second-order part: sum of residual * d2f/dtheta_p dtheta_q, block by block.
    second_order = np.zeros((n_params, n_params))

    # (W_1, b_1) x (W_1, b_1): d2f = x_i * x_k * d2f/dz_1^2.
    weighted_curvature = np.sum(_d2f_dz2(z_1, w, z_2) * residual, axis=1, keepdims=True)
    second_order[hidden, hidden] = x_aug.T @ (x_aug * weighted_curvature)

    # (W_1, b_1) x W_2 and (W_1, b_1) x b_2.
    hidden_w = x_aug.T @ (_d2f_dz_dw(z_1, w, z_2) * residual)
    hidden_b = x_aug.T @ (_d2f_dz_db(z_1, w, z_2) * residual)
    second_order[hidden, w_idx] = hidden_w
    second_order[w_idx, hidden] = hidden_w.T
    second_order[hidden, b_idx] = hidden_b
    second_order[b_idx, hidden] = hidden_b.T

    # W_2 x W_2, b_2 x W_2 and b_2 x b_2: only parameters of the SAME output
    # component interact; mixed derivatives across different components
    # vanish, so those Hessian entries stay zero.
    weighted_tau2 = ddx2_tau(z_2) * residual
    second_order[w_idx, w_idx] = np.sum(a_2 ** 2 * weighted_tau2, axis=0)
    cross_w_b = np.sum(a_2 * weighted_tau2, axis=0)
    second_order[w_idx, b_idx] = cross_w_b
    second_order[b_idx, w_idx] = cross_w_b
    second_order[b_idx, b_idx] = np.sum(weighted_tau2, axis=0)

    hessian_hand = 2 * scale * (second_order + first_order)
    return gradient_hand, hessian_hand


def imshow_zero_center(image, title):
    """Show ``image`` with a diverging colormap whose center is at zero.

    The color range is taken from the largest finite magnitude, so NaN or
    infinite entries do not distort it.
    """
    image = np.asarray(image)
    finite = np.abs(image[np.isfinite(image)])
    lim = finite.max() if finite.size else 0.0
    if lim == 0:
        lim = 1.0  # avoid a degenerate (empty) color range
    plt.imshow(image, vmin=-lim, vmax=lim, cmap='seismic')
    plt.title(title)
    plt.colorbar()
    plt.show()


def _relative_error(approx, reference):
    """Element-wise (approx - reference) / reference without 0/0 artefacts.

    Where the reference is zero the result is 0 if both values agree exactly
    and NaN (undefined) otherwise.
    """
    error = approx - reference
    relative = np.where(error == 0, 0.0, np.nan)
    np.divide(error, reference, out=relative, where=reference != 0)
    return relative


def main():
    """Build the 9-1-3 model and plot analytical vs. autodiff Hessian errors."""
    # Load model and data
    size_hidden_layer = 1
    from keras.models import Sequential
    from keras.layers import Dense
    tf.keras.backend.set_floatx('float64')

    model = Sequential()
    model.add(Dense(size_hidden_layer, input_dim=9, activation='sigmoid'))
    model.add(Dense(3, input_dim=size_hidden_layer, activation='sigmoid'))

    train_set, train_labels = generate_tictactoe()

    loss_fn = tf.keras.losses.MeanSquaredError()
    model.compile(optimizer='adam', loss=loss_fn)

    # Gradient and Hessian via automatic differentiation
    layer1 = model.layers[0]
    layer2 = model.layers[1]
    variables = [layer1.kernel, layer1.bias, layer2.kernel, layer2.bias]

    x = train_set
    with tf.GradientTape() as t2:
        with tf.GradientTape() as t1:
            x = layer1(x)
            x = layer2(x)
            loss = loss_fn(train_labels, x)
        # The gradient is computed inside t2 so that t2 can differentiate it.
        g = t1.gradient(loss, variables)
        grad = tf.concat([tf.reshape(g_part, [-1, 1]) for g_part in g], axis=0)
    h = t2.jacobian(grad, variables)

    # h[j] is the derivative of the flat gradient w.r.t. variables[j];
    # flatten each to (n_params, size of variable) and join them column-wise.
    n_params = grad.shape[0]
    h_mat_keras = tf.concat(
        [tf.reshape(h_part, [n_params, -1]) for h_part in h], axis=1)

    # Gradient & Hessian as derived by hand
    gradient_hand, hessian_hand = grad_and_hesse_matrix(model, train_set, train_labels)

    # Comparison plots
    hessian_ad = h_mat_keras.numpy()
    imshow_zero_center(hessian_hand - hessian_ad,
                       "Hesse-Matrix n=1 analytisch vs. AD Absoluter Fehler")
    imshow_zero_center(_relative_error(hessian_hand, hessian_ad),
                       "Hesse-Matrix n=1 analytisch vs. AD Relativer Fehler")


if __name__ == "__main__":
    main()