import numpy as np

# definition of the function f (the Rosenbrock function) ...
def f(x):
    return 100*(x[1] - x[0]**2)**2 + (1 - x[0])**2

# ... the gradient of f ...
def gradf(x):
    grad1 = -2*(1 - x[0]) - 400*(x[1] - x[0]**2)*x[0]
    grad2 = 200*(x[1] - x[0]**2)
    return np.array([grad1, grad2])

# ... and the Hessian of f:
def hessf(x):
    h11 = 1200*x[0]**2 - 400*x[1] + 2
    h12 = -400*x[0]
    h22 = 200
    return np.array([[h11, h12], [h12, h22]])


##################################################
# code for step length selection and the BFGS method

def StrongWolfe(f, gradf, x, p,
                initial_value,
                initial_descent,
                initial_step_length = 1.0,
                c1 = 1e-2,
                c2 = 0.99,
                max_extrapolation_iterations = 50,
                max_interpolation_iterations = 20,
                rho = 2.0):
    '''
    Implementation of a bisection based bracketing method
    for the strong Wolfe conditions.
    '''
    # initialise the bounds of the bracketing interval
    alphaR = initial_step_length
    alphaL = 0.0
    # the Armijo condition and the two parts of the strong Wolfe condition
    # are implemented as Boolean variables
    next_x = x + alphaR*p
    next_value = f(next_x)
    next_grad = gradf(next_x)
    Armijo = (next_value <= initial_value + c1*alphaR*initial_descent)
    descentR = np.inner(p, next_grad)
    curvatureLow = (descentR >= c2*initial_descent)
    curvatureHigh = (descentR <= -c2*initial_descent)
    # We start by increasing alphaR as long as Armijo and curvatureHigh hold,
    # but curvatureLow fails (that is, alphaR is definitely too small).
    # Note that curvatureHigh is automatically satisfied if curvatureLow fails.
    # Thus we only need to check whether Armijo holds and curvatureLow fails.
    itnr = 0
    while (itnr < max_extrapolation_iterations
           and (Armijo and (not curvatureLow))):
        itnr += 1
        # the current alphaR becomes the new lower bound for the step length;
        # the upper bound is replaced with a larger step length
        alphaL = alphaR
        alphaR *= rho
        # update function value and gradient
        next_x = x + alphaR*p
        next_value = f(next_x)
        next_grad = gradf(next_x)
        # update the Armijo and Wolfe conditions
        Armijo = (next_value <= initial_value + c1*alphaR*initial_descent)
        descentR = np.inner(p, next_grad)
        curvatureLow = (descentR >= c2*initial_descent)
        curvatureHigh = (descentR <= -c2*initial_descent)
    # At this point we should have a situation where alphaL is too small
    # and alphaR is either satisfactory or too large.
    # (Unless we have stopped because we used too many iterations. There
    # are at the moment no exceptions raised if this is the case.)
    alpha = alphaR
    grad_evals = itnr + 1
    itnr = 0
    # Use bisection in order to find a step length alpha that satisfies
    # all conditions.
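    # The strong Wolfe conditions enforced below are, for the search
    # direction p and step length alpha,
    #   Armijo:     f(x + alpha*p) <= f(x) + c1*alpha*gradf(x)^T p
    #   curvature:  |gradf(x + alpha*p)^T p| <= -c2*gradf(x)^T p.
    # Since initial_descent = gradf(x)^T p is negative for a descent direction,
    # the curvature condition splits into the two bounds curvatureLow and
    # curvatureHigh used here.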
    while (itnr < max_interpolation_iterations
           and (not (Armijo and curvatureLow and curvatureHigh))):
        itnr += 1
        if (Armijo and (not curvatureLow)):
            # the step length alpha was still too small;
            # replace the former lower bound with alpha
            alphaL = alpha
        else:
            # the step length alpha was too large;
            # replace the upper bound with alpha
            alphaR = alpha
        # choose a new step length as the mean of the new bounds
        alpha = (alphaL + alphaR)/2
        # update function value and gradient at the new trial step length alpha
        next_x = x + alpha*p
        next_value = f(next_x)
        next_grad = gradf(next_x)
        # update the Armijo and Wolfe conditions
        Armijo = (next_value <= initial_value + c1*alpha*initial_descent)
        descentR = np.inner(p, next_grad)
        curvatureLow = (descentR >= c2*initial_descent)
        curvatureHigh = (descentR <= -c2*initial_descent)
    # return the next iterate as well as the function value and gradient there
    # (in order to save time in the outer iteration; we have had to do these
    # computations anyway)
    grad_evals += itnr
    return next_x, next_value, next_grad, grad_evals


def BFGS(f, gradf, x_init,
         max_steps = 2000,
         alpha_0 = 1.0,
         gradient_stop = 1e-6):
    '''
    Implementation of the BFGS method.
    '''
    # Initialise the variable x
    x = x_init
    # Initialise the quasi-Newton matrix as the identity
    H = np.identity(np.size(x))
    # Main loop
    n_step = 0
    current_value = f(x)
    current_gradient = gradf(x)
    # count the number of gradient evaluations
    total_grad_evals = 1
    norm_grad = np.linalg.norm(current_gradient)
    while ((n_step < max_steps) and (norm_grad > gradient_stop)):
        n_step += 1
        search_direction = -H@current_gradient
        descent = np.inner(search_direction, current_gradient)
        x_old = x
        gradient_old = current_gradient
        x, current_value, current_gradient, grad_evals = StrongWolfe(
            f, gradf, x, search_direction,
            current_value, descent,
            initial_step_length = alpha_0)
        total_grad_evals += grad_evals
        norm_grad = np.linalg.norm(current_gradient)
        # BFGS update of the inverse Hessian approximation H
        s = x - x_old
        y = current_gradient - gradient_old
        rho = 1/np.inner(y, s)
        if n_step == 1:
            # rescale the initial matrix before the first update
            H = H*(1/(rho*np.inner(y, y)))
        z = H.dot(y)
        H += (-rho*(np.outer(s, z) + np.outer(z, s))
              + rho*(rho*np.inner(y, z) + 1)*np.outer(s, s))
    # print out some information about the final result
    if (n_step < max_steps):
        print('The iteration converged in {} steps.'.format(n_step))
    else:
        print('The iteration failed to converge in {} steps.'.format(n_step))
    print('The final iterate was {}.'.format(x))
    print('The size of the gradient at the final iterate was {}.'.format(np.linalg.norm(current_gradient)))
    print('In total, we required {} gradient evaluations.'.format(total_grad_evals))
    return x


#######################################################
# run tests with the Rosenbrock function

# first test with initialisation at 0
print('First numerical test:')
x_init = np.zeros(2)
print('Initialisation: {}.'.format(x_init))
x = BFGS(f, gradf, x_init)
print('\n')

# run the same test with higher precision (gradient_stop = 1e-12)
print('Second numerical test with higher precision:')
x_init = np.zeros(2)
print('Initialisation: {}.'.format(x_init))
x = BFGS(f, gradf, x_init, gradient_stop = 1e-12)
print('\n')

# run the algorithm with a more challenging starting value
print('Third numerical test with a different starting value:')
x_init = np.array([-10.0, 20.0])
print('Initialisation: {}.'.format(x_init))
x = BFGS(f, gradf, x_init)
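

#######################################################
# Optional sanity check: a minimal sketch (not part of the tests above) that
# compares the analytic gradient gradf against a central finite-difference
# approximation, using only numpy. The helper name check_gradient and the
# test point [0.5, 0.5] are illustrative choices, not fixed by the code above.

def check_gradient(f, gradf, x, h = 1e-6):
    '''Return the norm of the difference between gradf(x) and a
    central finite-difference estimate of the gradient at x.'''
    n = np.size(x)
    fd_grad = np.zeros(n)
    for i in range(n):
        e = np.zeros(n)
        e[i] = 1.0
        fd_grad[i] = (f(x + h*e) - f(x - h*e))/(2*h)
    return np.linalg.norm(fd_grad - gradf(x))

print('\n')
print('Gradient check (central finite differences vs. gradf):')
print('Discrepancy at [0.5, 0.5]: {}'.format(check_gradient(f, gradf, np.array([0.5, 0.5]))))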