# Rosenbrock: a nonlinear problem survey

Authors
Affiliations
Francesco Picetti
Image and Sound Processing Lab, Politecnico di Milano
Ettore Biondi
SeismoLab, California Institute of Technology

# Testing non-linear solvers on the Rosenbrock function¶

@Author: Ettore Biondi - ebiondi@caltech.edu

## Problem definition¶

In this notebook we show how to set a user-defined objective function and minimize it using different solvers. The function under study is the well-known non-convex Rosenbrock function. Its analytical form for the 2D case is:

$$\phi(x,y) = (1-x)^2 + 100 (y-x^2)^2,$$

in which the unique global minimum is at $x=y=1$. The global minimum is inside a long, narrow, parabolic-shaped flat valley. To find the valley is trivial. To converge to the global minimum, however, is difficult. Hence, this function represents a good testing case for any non-linear optimization scheme.

import numpy as np
import occamypy as o

# Plotting
from matplotlib import rcParams
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.pyplot as plt
# Plotting defaults shared by every figure in this notebook
rcParams.update({
    'image.cmap'     : 'jet',
    'image.aspect'   : 'auto',
    # 'none' (the string) disables interpolation; the Python value None is
    # rejected by the rcParams validator in recent matplotlib releases
    'image.interpolation': 'none',
    'axes.grid'      : False,
    'figure.figsize' : (10, 6),
    'savefig.dpi'    : 300,
    'axes.labelsize' : 14,
    'axes.titlesize' : 16,
    'font.size'      : 14,
    'legend.fontsize': 14,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'text.usetex'    : True,   # NOTE: requires a working LaTeX installation
    'font.family'    : 'serif',
    'font.serif'     : 'Latin Modern Roman',
})

Let's first define the problem object. Our model vector is going to be $\mathbf{m} = [x \,, \, y]^T$. Since the library assumes that the objective function is written in terms of some residual vector (i.e., $\phi(\mathbf{r}(\mathbf{m}))$), we will create a vector containing the objective function as a single scalar value.

class Rosenbrock(o.Problem):
    """
    Rosenbrock function inverse problem:

        f(x, y) = (1 - x)^2 + 100 * (y - x^2)^2

    The model vector is m = [x, y]' and the residual vector holds the
    objective-function value as a single scalar entry.
    """

    def __init__(self, x_initial, y_initial, minBound=None, maxBound=None):
        """
        Construct the Rosenbrock problem.

        Args:
            x_initial: starting value of the x variable
            y_initial: starting value of the y variable
            minBound: optional lower-bound vector [x_l, y_l]
            maxBound: optional upper-bound vector [x_u, y_u]
        """
        # The "data" vector is a single scalar slot storing f(x, y);
        # bounds (if any) are handled by the parent Problem class
        super(Rosenbrock, self).__init__(model=o.VectorNumpy(np.array((x_initial, y_initial))),
                                         data=o.VectorNumpy((1,)),
                                         minBound=minBound, maxBound=maxBound)
        # Setting default problem variables
        self.setDefaults()
        self.linear = False
        return

    def obj_func(self, model):
        """Return the objective value stored in the residual vector.

        NOTE(review): assumes res_func has already been evaluated for this
        model by the solver framework — confirm against occamypy.Problem.
        """
        obj = self.res.arr[0]
        return obj

    def res_func(self, model):
        """Compute the residual, i.e. the Rosenbrock value at `model`."""
        m = model.getNdArray()
        self.res[0] = (1.0 - m[0]) ** 2 + 100.0 * (m[1] - m[0] * m[0]) ** 2
        return self.res

    def grad_func(self, model, residual):
        """Compute the analytic gradient of f at `model`."""
        m = model.getNdArray()
        # df/dx = -2 (1 - x) - 400 x (y - x^2)
        self.grad[0] = -2.0 * (1.0 - m[0]) - 400.0 * m[0] * (m[1] - m[0] * m[0])
        # df/dy = 200 (y - x^2)
        self.grad[1] = 200.0 * (m[1] - m[0] * m[0])
        # Return the gradient vector for consistency with res_func/pert_res_func
        return self.grad

    def pert_res_func(self, model, pert_model):
        """Linearized residual: grad(f)(model) dotted with `pert_model`."""
        m = model.getNdArray()
        dm = pert_model.getNdArray()
        self.pert_res.arr[0] = (-2.0 * (1.0 - m[0]) - 400.0 * m[0] * (m[1] - m[0] * m[0])) * dm[0] \
                               + (200.0 * (m[1] - m[0] * m[0])) * dm[1]
        return self.pert_res

### Instantiation of the inverse problem and of the various solvers¶

# Common starting point shared by every optimization run below
x_init, y_init = -1.0, -1.0
# Build the Rosenbrock problem instance handed to the solvers
Ros_prob = Rosenbrock(x_init, y_init)

Before running any inversion, let's compute the objective function for different values of $x$ and $y$. This step will be useful when we want to plot the optimization path taken by the various tested algorithms.

# Sample the objective on a regular (x, y) grid; this map is only used as
# the background of the optimization-path figures below
x_samples = np.linspace(-1.5, 1.5, 1000)
y_samples = np.linspace(3, -1.5, 1000)  # descending to match plot orientation

obj_ros = o.VectorNumpy((x_samples.size, y_samples.size))
model_test = o.VectorNumpy((2,))

for ix in range(x_samples.size):
    model_test[0] = x_samples[ix]
    for iy in range(y_samples.size):
        model_test[1] = y_samples[iy]
        obj_ros[ix, iy] = Ros_prob.get_obj(model_test)

First we test a non-linear conjugate-gradient method in which a parabolic stepper with three-point interpolation is used.

# Nonlinear CG with very tight tolerances so the iteration budget dominates
niter = 1000
Stop = o.BasicStopper(niter=niter, tolr=1e-32, tolg=1e-32)
NLCGsolver = o.NLCG(Stop)
Ros_prob = Rosenbrock(x_init, y_init)  # fresh problem instance
NLCGsolver.setDefaults(save_obj=True, save_model=True)
NLCGsolver.run(Ros_prob, verbose=True)

# Collect the visited (x, y) points for plotting
x_smpld = [m[0] for m in NLCGsolver.model]
y_smpld = [m[1] for m in NLCGsolver.model]

Let's plot the optimization path taken by the algorithm, which converged to the global minimum in 199 iterations using a parabolic stepper.

# Objective map with the NLCG optimization path overlaid
fig, ax = plt.subplots()

# Points visited by the solver and the dashed path connecting them
ax.scatter(x_smpld, y_smpld, color="red", s=50, marker="+")
ax.plot(x_smpld, y_smpld, "--", color="red")

# Background image of the sampled objective (transposed so x is horizontal)
im = ax.imshow(obj_ros.plot().T, alpha=0.6, clim=(0, 600),
               extent=[1.5, -1.5, -1.5, 3.0])
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.invert_xaxis()

# Reference contour lines of the objective
cs = plt.contour(obj_ros.plot().T,
                 levels=[0.05, 0.1, 0.5, 2, 10, 50, 125, 250, 500, 1000],
                 extent=[-1.5, 1.5, 3.0, -1.5],
                 colors="white", linewidths=(0.8,), linestyles="--")

cax = make_axes_locatable(ax).append_axes("right", size="2%", pad=0.1)
plt.colorbar(im, cax=cax)
plt.suptitle(r"Nonlinear CG with FR beta function")
plt.show()

For the second test, we employ the steepest-descent approach using the same stepper.

# Steepest descent is obtained from NLCG by zeroing the beta term
NLSDsolver = o.NLCG(Stop, beta_type="SD")
Ros_prob = Rosenbrock(x_init, y_init)  # fresh problem instance
NLSDsolver.setDefaults(save_obj=True, save_model=True)
NLSDsolver.run(Ros_prob, verbose=True)

# Collect the visited (x, y) points for plotting
x_smpld = [m[0] for m in NLSDsolver.model]
y_smpld = [m[1] for m in NLSDsolver.model]

Let's again plot the optimization path. In this case, the algorithm only falls within the vicinity of the global minimum but does not reach it even after 1000 iterations. In the figure below, we can see that the algorithm is sampling most of the objective function within the parabolic valley.

# Objective map with the steepest-descent optimization path overlaid
fig, ax = plt.subplots()

# Points visited by the solver and the dashed path connecting them
ax.scatter(x_smpld, y_smpld, color="red", s=50, marker="+")
ax.plot(x_smpld, y_smpld, "--", color="red")

# Background image of the sampled objective (transposed so x is horizontal)
im = ax.imshow(obj_ros.plot().T, alpha=0.6, clim=(0, 600),
               extent=[1.5, -1.5, -1.5, 3.0])
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.invert_xaxis()

# Reference contour lines of the objective
cs = plt.contour(obj_ros.plot().T,
                 levels=[0.05, 0.1, 0.5, 2, 10, 50, 125, 250, 500, 1000],
                 extent=[-1.5, 1.5, 3.0, -1.5],
                 colors="white", linewidths=(0.8,), linestyles="--")

cax = make_axes_locatable(ax).append_axes("right", size="2%", pad=0.1)
plt.colorbar(im, cax=cax)
plt.suptitle(r"Nonlinear Steepest Descent")

plt.show()

In the third test, let's apply the BFGS algorithm to find the function's global minimum.

# L-BFGS driven by an explicit parabolic line-search stepper
ParabStep = o.ParabolicStep()
BFGSsolver = o.LBFGS(Stop, stepper=ParabStep)
Ros_prob = Rosenbrock(x_init, y_init)  # fresh problem instance
BFGSsolver.setDefaults(save_obj=True, save_model=True)
BFGSsolver.run(Ros_prob, verbose=True)

# Collect the visited (x, y) points for plotting
x_smpld = [m[0] for m in BFGSsolver.model]
y_smpld = [m[1] for m in BFGSsolver.model]

The algorithm has precisely reached the global minimum in 24 iterations. We can clearly see that it is able to find an approximation of the local curvature of the objective function. In fact, it needs to sample very few points within the parabolic-shaped valley.

# Objective map with the L-BFGS (parabolic stepper) path overlaid
fig, ax = plt.subplots()

# Points visited by the solver and the dashed path connecting them
ax.scatter(x_smpld, y_smpld, color="red", s=50, marker="+")
ax.plot(x_smpld, y_smpld, "--", color="red")

# Background image of the sampled objective (transposed so x is horizontal)
im = ax.imshow(obj_ros.plot().T, alpha=0.6, clim=(0, 600),
               extent=[1.5, -1.5, -1.5, 3.0])
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.invert_xaxis()

# Reference contour lines of the objective
cs = plt.contour(obj_ros.plot().T,
                 levels=[0.05, 0.1, 0.5, 2, 10, 50, 125, 250, 500, 1000],
                 extent=[-1.5, 1.5, 3.0, -1.5],
                 colors="white", linewidths=(0.8,), linestyles="--")

cax = make_axes_locatable(ax).append_axes("right", size="2%", pad=0.1)
plt.colorbar(im, cax=cax)
plt.suptitle("L-BFGS with Parabolic stepper")
plt.show()

Let's try to solve the same problem using the Strong-Wolfe stepper described by Nocedal and Wright (1999) in Algorithm 3.2 on page 59.

# Switch to a stepper that enforces the strong Wolfe conditions
WolfeStep = o.StrongWolfe(alpha=0.0)
BFGSsolver = o.LBFGS(Stop, stepper=WolfeStep)
Ros_prob = Rosenbrock(x_init, y_init)  # fresh problem instance
BFGSsolver.setDefaults(save_obj=True, save_model=True)
BFGSsolver.run(Ros_prob, verbose=True)

# Collect the visited (x, y) points for plotting
x_smpld = [m[0] for m in BFGSsolver.model]
y_smpld = [m[1] for m in BFGSsolver.model]

The BFGS algorithm combined with this specific stepper reached the objective function's minimum in 35 iterations with 50 and 45 function and gradient evaluations, respectively.

# Objective map with the L-BFGS (Strong-Wolfe stepper) path overlaid
fig, ax = plt.subplots()

# Points visited by the solver and the dashed path connecting them
ax.scatter(x_smpld, y_smpld, color="red", s=50, marker="+")
ax.plot(x_smpld, y_smpld, "--", color="red")

# Background image of the sampled objective (transposed so x is horizontal)
im = ax.imshow(obj_ros.plot().T, alpha=0.6, clim=(0, 600),
               extent=[1.5, -1.5, -1.5, 3.0])
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.invert_xaxis()

# Reference contour lines of the objective
cs = plt.contour(obj_ros.plot().T,
                 levels=[0.05, 0.1, 0.5, 2, 10, 50, 125, 250, 500, 1000],
                 extent=[-1.5, 1.5, 3.0, -1.5],
                 colors="white", linewidths=(0.8,), linestyles="--")

cax = make_axes_locatable(ax).append_axes("right", size="2%", pad=0.1)
plt.colorbar(im, cax=cax)
plt.suptitle("L-BFGS with Strong-Wolfe stepper")
plt.show()

Finally, let's test again the BFGS method, but this time employing the line-search algorithm proposed by Moré and Thuente (1994). Their line-search method uses a bracketing approach in which the strong Wolfe conditions are verified for the tested point. If these conditions are met, the step is considered successful.

# L-BFGS with its default line search (Moré-Thuente CvSrch)
BFGSsolver = o.LBFGS(Stop)
Ros_prob = Rosenbrock(x_init, y_init)  # fresh problem instance
BFGSsolver.setDefaults(save_obj=True, save_model=True)
BFGSsolver.run(Ros_prob, verbose=True)

# Collect the visited (x, y) points for plotting
x_smpld = [m[0] for m in BFGSsolver.model]
y_smpld = [m[1] for m in BFGSsolver.model]

We can see that the algorithm has reached the global minimum in 36 iterations. Moreover, since we employed a different stepping method, in which no parabolic interpolation is used during the optimization, the algorithm had to perform only 41 objective function evaluations, as opposed to the 73 required by the BFGS method when the parabolic stepper was the line-search algorithm of choice.

# Objective map with the L-BFGS (CvSrch stepper) path overlaid
fig, ax = plt.subplots()

# Points visited by the solver and the dashed path connecting them
ax.scatter(x_smpld, y_smpld, color="red", s=50, marker="+")
ax.plot(x_smpld, y_smpld, "--", color="red")

# Background image of the sampled objective (transposed so x is horizontal)
im = ax.imshow(obj_ros.plot().T, alpha=0.6, clim=(0, 600),
               extent=[1.5, -1.5, -1.5, 3.0])
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.invert_xaxis()

# Reference contour lines of the objective
cs = plt.contour(obj_ros.plot().T,
                 levels=[0.05, 0.1, 0.5, 2, 10, 50, 125, 250, 500, 1000],
                 extent=[-1.5, 1.5, 3.0, -1.5],
                 colors="white", linewidths=(0.8,), linestyles="--")

cax = make_axes_locatable(ax).append_axes("right", size="2%", pad=0.1)
plt.colorbar(im, cax=cax)
plt.suptitle("L-BFGS with CvSrch stepper")

plt.show()

### Constrained non-linear optimization¶

We now try to solve the same non-linear problem, but we impose maximum and minimum boundary conditions as follows:

$$\phi(x,y) = (1-x)^2 + 100 (y-x^2)^2,\\ \text{s.t.} \; x_l \leq x \leq x_m \; \text{and} \; y_l \leq y \leq y_m$$

where $x_l$, $y_l$, $x_m$, and $y_m$ represent the minimum and maximum allowed values for each optimization variable, respectively. Let's first define a problem object with two bound vectors having the inequality conditions shown in the previous equation. To solve such a problem, we employ the L-BFGS-B algorithm proposed by Byrd et al. (1995). Let's start with bounds that contain the global minimum of the objective function.

# Lower and upper bounds and the associated vectors
xl, yl = -1.2, -1.2
xu, yu = 1.2, 2.0
minBound = o.VectorNumpy(np.array((xl, yl)))
maxBound = o.VectorNumpy(np.array((xu, yu)))

# Stopper driven by the projected-gradient norm
Stop = o.BasicStopper(niter=1000, tolg_proj=1e-32)
BFGSBsolver = o.LBFGSB(Stop, m_steps=10)
Ros_prob = Rosenbrock(x_init, y_init, minBound, maxBound)
BFGSBsolver.setDefaults(save_obj=True, save_model=True)
BFGSBsolver.run(Ros_prob, verbose=True)

# Collect the visited (x, y) points for plotting
x_smpld = [m[0] for m in BFGSBsolver.model]
y_smpld = [m[1] for m in BFGSBsolver.model]

# Corners of the bound box as a closed polyline for plotting
bounds_points_x = np.array([minBound[0], minBound[0], maxBound[0], maxBound[0], minBound[0]])
bounds_points_y = np.array([minBound[1], maxBound[1], maxBound[1], minBound[1], minBound[1]])


The method is able to reach the global minimum since it is contained within the provided bounds (plotted with white dashed segments).

# Objective map with the L-BFGS-B path and the bound box overlaid
fig, ax = plt.subplots()

# Points visited by the solver and the dashed path connecting them
ax.scatter(x_smpld, y_smpld, color="red", s=50, marker="+")
ax.plot(x_smpld, y_smpld, "--", color="red")

# Box-constraint polyline (white)
ax.scatter(bounds_points_x, bounds_points_y, marker="o", color="w", s=30)
ax.plot(bounds_points_x, bounds_points_y, "--", color="w", linewidth=2)

# Background image of the sampled objective (transposed so x is horizontal)
im = ax.imshow(obj_ros.plot().T, alpha=0.6, clim=(0, 600),
               extent=[1.5, -1.5, -1.5, 3.0])
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.invert_xaxis()

# Reference contour lines of the objective
cs = plt.contour(obj_ros.plot().T,
                 levels=[0.05, 0.1, 0.5, 2, 10, 50, 125, 250, 500, 1000],
                 extent=[-1.5, 1.5, 3.0, -1.5],
                 colors="white", linewidths=(0.8,), linestyles="--")

cax = make_axes_locatable(ax).append_axes("right", size="2%", pad=0.1)
plt.colorbar(im, cax=cax)
plt.suptitle(r"L-BFGS-B with %s upper bound" % maxBound[:].tolist())
plt.show()

Let's now try to reduce the upper bounds and see where the method stops.

# Lower and upper bounds and the associated vectors (tighter upper bounds)
xl, yl = -1.2, -1.2
xu, yu = 0.5, 1.0
minBound = o.VectorNumpy(np.array((xl, yl)))
maxBound = o.VectorNumpy(np.array((xu, yu)))

# Stopper driven by the projected-gradient norm
Stop = o.BasicStopper(niter=1000, tolg_proj=1e-32)
BFGSBsolver = o.LBFGSB(Stop, m_steps=10)
Ros_prob = Rosenbrock(x_init, y_init, minBound, maxBound)
BFGSBsolver.setDefaults(save_obj=True, save_model=True)
BFGSBsolver.run(Ros_prob, verbose=True)

# Collect the visited (x, y) points for plotting
x_smpld = [m[0] for m in BFGSBsolver.model]
y_smpld = [m[1] for m in BFGSBsolver.model]

# Corners of the bound box as a closed polyline for plotting
bounds_points_x = np.array([minBound[0], minBound[0], maxBound[0], maxBound[0], minBound[0]])
bounds_points_y = np.array([minBound[1], maxBound[1], maxBound[1], minBound[1], minBound[1]])


The algorithm reaches the closest feasible minimum point, as we can see from the plot below. This point is indeed located on one of the boundaries.

# Objective map with the L-BFGS-B path and the bound box overlaid
fig, ax = plt.subplots()

# Points visited by the solver and the dashed path connecting them
ax.scatter(x_smpld, y_smpld, color="red", s=50, marker="+")
ax.plot(x_smpld, y_smpld, "--", color="red")

# Box-constraint polyline (white)
ax.scatter(bounds_points_x, bounds_points_y, marker="o", color="w", s=30)
ax.plot(bounds_points_x, bounds_points_y, "--", color="w", linewidth=2)

# Background image of the sampled objective (transposed so x is horizontal)
im = ax.imshow(obj_ros.plot().T, alpha=0.6, clim=(0, 600),
               extent=[1.5, -1.5, -1.5, 3.0])
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.invert_xaxis()

# Reference contour lines of the objective
cs = plt.contour(obj_ros.plot().T,
                 levels=[0.05, 0.1, 0.5, 2, 10, 50, 125, 250, 500, 1000],
                 extent=[-1.5, 1.5, 3.0, -1.5],
                 colors="white", linewidths=(0.8,), linestyles="--")

cax = make_axes_locatable(ax).append_axes("right", size="2%", pad=0.1)
plt.colorbar(im, cax=cax)
plt.suptitle(r"L-BFGS-B with %s upper bound" % maxBound[:].tolist())
plt.show()
# Lower and upper bounds and the associated vectors (box excludes the minimum)
xl, yl = -1.2, -1.2
xu, yu = 0., 0.
minBound = o.VectorNumpy(np.array((xl, yl)))
maxBound = o.VectorNumpy(np.array((xu, yu)))

# Stopper driven by the projected-gradient norm
Stop = o.BasicStopper(niter=1000, tolg_proj=1e-32)
BFGSBsolver = o.LBFGSB(Stop, m_steps=10)
Ros_prob = Rosenbrock(x_init, y_init, minBound, maxBound)  # resetting the problem
BFGSBsolver.setDefaults(save_obj=True, save_model=True)
BFGSBsolver.run(Ros_prob, verbose=True)

# Collect the visited (x, y) points for plotting
x_smpld = [m[0] for m in BFGSBsolver.model]
y_smpld = [m[1] for m in BFGSBsolver.model]

# Corners of the bound box as a closed polyline for plotting
bounds_points_x = np.array([minBound[0], minBound[0], maxBound[0], maxBound[0], minBound[0]])
bounds_points_y = np.array([minBound[1], maxBound[1], maxBound[1], minBound[1], minBound[1]])

# Objective map with the L-BFGS-B path and the bound box overlaid
fig, ax = plt.subplots()

# Points visited by the solver and the dashed path connecting them
ax.scatter(x_smpld, y_smpld, color="red", s=50, marker="+")
ax.plot(x_smpld, y_smpld, "--", color="red")

# Box-constraint polyline (white)
ax.scatter(bounds_points_x, bounds_points_y, marker="o", color="w", s=30)
ax.plot(bounds_points_x, bounds_points_y, "--", color="w", linewidth=2)

# Background image of the sampled objective (transposed so x is horizontal)
im = ax.imshow(obj_ros.plot().T, alpha=0.6, clim=(0, 600),
               extent=[1.5, -1.5, -1.5, 3.0])
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.invert_xaxis()

# Reference contour lines of the objective
cs = plt.contour(obj_ros.plot().T,
                 levels=[0.05, 0.1, 0.5, 2, 10, 50, 125, 250, 500, 1000],
                 extent=[-1.5, 1.5, 3.0, -1.5],
                 colors="white", linewidths=(0.8,), linestyles="--")

cax = make_axes_locatable(ax).append_axes("right", size="2%", pad=0.1)
plt.colorbar(im, cax=cax)
plt.suptitle(r"L-BFGS-B with %s upper bound" % maxBound[:].tolist())
plt.show()

Again, the method is able to reach the closest feasible minimum of the objective function.