import numpy as np

# definition of the Rosenbrock function f ...
def f(x):
    return 100*(x[1]-x[0]**2)**2 + (1-x[0])**2

# ... the gradient of f ...
def gradf(x):
    grad1 = -2*(1-x[0]) - 400*(x[1]-x[0]**2)*x[0]
    grad2 = 200*(x[1]-x[0]**2)
    return np.array([grad1,grad2])

# ... and the Hessian of f:
def hessf(x):
    h11 = 1200*x[0]**2 - 400*x[1] + 2
    h12 = -400*x[0]
    h22 = 200
    return np.array([[h11,h12],[h12,h22]])

##################################################
# code for step length selection and CG method

def StrongWolfe(f,gradf,x,p,
                initial_value,
                initial_descent,
                initial_step_length = 1.0,
                c1 = 1e-2,
                c2 = 0.49,
                max_extrapolation_iterations = 50,
                max_interpolation_iterations = 20,
                rho = 2.0):
    '''
    Implementation of a bisection based bracketing method
    for the strong Wolfe conditions
    '''
    # initialise the bounds of the bracketing interval
    alphaR = initial_step_length
    alphaL = 0.0
    # the Armijo condition and the two parts of the Wolfe condition
    # are implemented as Boolean variables
    next_x = x+alphaR*p
    next_value = f(next_x)
    next_grad = gradf(next_x)
    Armijo = (next_value <= initial_value+c1*alphaR*initial_descent)
    descentR = np.inner(p,next_grad)
    curvatureLow = (descentR >= c2*initial_descent)
    curvatureHigh = (descentR <= -c2*initial_descent)
    # We start by increasing alphaR as long as Armijo and curvatureHigh hold,
    # but curvatureLow fails (that is, alphaR is definitely too small).
    # Note that curvatureHigh is automatically satisfied if curvatureLow fails.
    # Thus we only need to check whether Armijo holds and curvatureLow fails.
    itnr = 0
    while (itnr < max_extrapolation_iterations
           and (Armijo and (not curvatureLow))):
        itnr += 1
        # alphaR is a new lower bound for the step length;
        # the old upper bound alphaR needs to be replaced with a larger step length
        alphaL = alphaR
        alphaR *= rho
        # update function value and gradient
        next_x = x+alphaR*p
        next_value = f(next_x)
        next_grad = gradf(next_x)
        # update the Armijo and Wolfe conditions
        Armijo = (next_value <= initial_value+c1*alphaR*initial_descent)
        descentR = np.inner(p,next_grad)
        curvatureLow = (descentR >= c2*initial_descent)
        curvatureHigh = (descentR <= -c2*initial_descent)
    # At this point we should have a situation where alphaL is too small
    # and alphaR is either satisfactory or too large.
    # (Unless we have stopped because we used too many iterations. There
    # are at the moment no exceptions raised if this is the case.)
    alpha = alphaR
    itnr = 0
    # Use bisection in order to find a step length alpha that satisfies
    # all conditions.
    while (itnr < max_interpolation_iterations
           and (not (Armijo and curvatureLow and curvatureHigh))):
        itnr += 1
        if (Armijo and (not curvatureLow)):
            # the step length alpha was still too small;
            # replace the former lower bound with alpha
            alphaL = alpha
        else:
            # the step length alpha was too large;
            # replace the upper bound with alpha
            alphaR = alpha
        # choose a new step length as the mean of the new bounds
        alpha = (alphaL+alphaR)/2
        # update function value and gradient
        # (note that these have to be evaluated at the new trial step
        # length alpha, not at the upper bound alphaR)
        next_x = x+alpha*p
        next_value = f(next_x)
        next_grad = gradf(next_x)
        # update the Armijo and Wolfe conditions
        Armijo = (next_value <= initial_value+c1*alpha*initial_descent)
        descentR = np.inner(p,next_grad)
        curvatureLow = (descentR >= c2*initial_descent)
        curvatureHigh = (descentR <= -c2*initial_descent)
    # return the next iterate as well as the function value and gradient there
    # (in order to save time in the outer iteration; we have had to do these
    # computations anyway)
    return next_x,next_value,next_grad
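# ------------------------------------------------------------------
# Optional sanity check for the line search (a minimal sketch; the
# test point x_test and the variable names below are illustrative
# choices, not part of the method above). We take one steepest-descent
# direction for the Rosenbrock function and verify that the returned
# iterate satisfies the strong Wolfe conditions with the default
# parameters c1 = 1e-2 and c2 = 0.49.
x_test = np.array([-1.0,1.0])
value_test = f(x_test)
grad_test = gradf(x_test)
p_test = -grad_test
descent_test = np.inner(p_test,grad_test)
x_new,value_new,grad_new = StrongWolfe(f,gradf,x_test,p_test,
                                       value_test,descent_test)
# recover the accepted step length from x_new = x_test + alpha*p_test
alpha_test = np.linalg.norm(x_new-x_test)/np.linalg.norm(p_test)
print('Line search sanity check:')
print('Armijo condition holds: {}'.format(
    value_new <= value_test + 1e-2*alpha_test*descent_test))
print('Curvature condition holds: {}'.format(
    abs(np.inner(p_test,grad_new)) <= 0.49*abs(descent_test)))
print('\n')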
def CG_FR(f,gradf,x_init,
          max_steps = 2000,
          alpha_0 = 1.0,
          gradient_stop = 1e-6):
    '''
    Implementation of the Fletcher-Reeves method.
    '''
    # initialise the variable x
    x = x_init
    # set the current step number to 0
    n_step = 0
    # compute current function value and gradient
    current_value = f(x)
    current_gradient = gradf(x)
    # compute the squared norm of the gradient;
    # in the iteration, we will also need the norm of the previous
    # gradient, so we initialise this variable as well
    old_gradient_norm_2 = np.linalg.norm(current_gradient)**2
    current_gradient_norm_2 = old_gradient_norm_2
    # initialise the search direction
    search_direction = -current_gradient
    # main loop
    while (n_step < max_steps) and (current_gradient_norm_2 > gradient_stop**2):
        # increase the current step number
        n_step += 1
        # compute the expected descent for usage in the step length selection
        descent = np.inner(search_direction,current_gradient)
        # Run the step length selection.
        # For the strong Wolfe conditions, we need to compute
        # both the function value and the gradient at the eventually
        # chosen iterate; in order to avoid doing the same computation
        # twice, we let the step length selection also return these variables.
        x,current_value,current_gradient = StrongWolfe(f,gradf,x,search_direction,
                                                       current_value,descent,
                                                       initial_step_length = alpha_0)
        # update the squared norm of the gradient of the current iterate
        current_gradient_norm_2 = np.linalg.norm(current_gradient)**2
        # compute the beta in the Fletcher-Reeves method
        betaFR = current_gradient_norm_2/old_gradient_norm_2
        # overwrite the size of the old gradient with the current one
        old_gradient_norm_2 = current_gradient_norm_2
        # update the search direction
        search_direction = -current_gradient + betaFR*search_direction
    # Print out some information about the final result.
    # We test the stopping criterion itself rather than the step count,
    # so that convergence in exactly max_steps steps is reported correctly.
    if current_gradient_norm_2 <= gradient_stop**2:
        print('The iteration converged in {} steps.'.format(n_step))
    else:
        print('The iteration failed to converge in {} steps.'.format(n_step))
    print('The final iterate was {}.'.format(x))
    print('The size of the gradient at the final iterate was {}.'.format(
        np.linalg.norm(current_gradient)))
    return x
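# ------------------------------------------------------------------
# Design note on the choice of beta (a minimal sketch; the helper
# below is hypothetical and is not used by CG_FR above). The
# Fletcher-Reeves formula is only one of several standard updates for
# the conjugate search direction. A common alternative is the
# Polak-Ribiere formula truncated at zero ("PR+"), which tends to
# recover better from poor search directions; plugging it into CG_FR
# would additionally require storing a copy of the previous gradient,
# not just its squared norm.
def beta_PRplus(current_gradient,old_gradient):
    # beta = g_new.(g_new - g_old)/||g_old||^2, truncated at 0
    beta = (np.inner(current_gradient,current_gradient-old_gradient)
            /np.inner(old_gradient,old_gradient))
    return max(beta,0.0)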
#######################################################
# run tests with the Rosenbrock function

# test first for an initialisation at 0
print('First numerical test:')
x_init = np.zeros(2)
print('Initialisation: {}.'.format(x_init))
x = CG_FR(f,gradf,x_init)
print('\n')

# run the same test with a higher precision;
# we decrease the stopping tolerance for the gradient norm
print('Second numerical test with higher precision:')
x_init = np.zeros(2)
print('Initialisation: {}.'.format(x_init))
x = CG_FR(f,gradf,x_init,gradient_stop = 1e-12)
print('\n')

# run the algorithm with a more challenging starting value
print('Third numerical test with different starting value:')
x_init = np.array([-10.0,20.0])
print('Initialisation: {}.'.format(x_init))
x = CG_FR(f,gradf,x_init)
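# ------------------------------------------------------------------
# Reference check (a minimal sketch, assuming SciPy is available; not
# part of the method above). The Hessian hessf is defined at the top
# of the script but never used by CG_FR; it can, however, drive
# SciPy's Newton-CG solver, whose result should agree with the tests
# above up to the chosen tolerances.
try:
    from scipy.optimize import minimize
    result = minimize(f, np.zeros(2), method='Newton-CG',
                      jac=gradf, hess=hessf)
    print('SciPy Newton-CG reference solution: {}.'.format(result.x))
except ImportError:
    print('SciPy not available; skipping the reference check.')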