To understand the role that GPs can play in optimizing a blackbox function, it helps to view optimization as an application of sequential design.
The role of "modeling" in optimization has a rich history, but the potential of modern statistical modeling is only recently being realized by the mathematical programming, statistics, and machine learning communities.
Statistical methods in optimization, in particular of noisy blackbox functions, probably go back to Box & Draper and the response surface methodology tradition.
However, the modern version is closest to methods described by Mockus et al. (e.g., 1978) on the application of Bayesian methods for seeking the extremum.
Many of these ideas were overlooked until the late 1990s, after GPs became established in the computer experiments literature.
The best reference for the core idea might be Booker, et al. (1999).
The methodology is simple:
1. fit a GP surrogate to the blackbox evaluations gathered so far;
2. find the minimizer of the surrogate's predictive mean;
3. evaluate the blackbox there, augment the data, and repeat.
Observe that Step 2 involves its own inner optimization, but over the surrogate, which is cheap to evaluate.
Before we continue, let's be clear about the problem.
We wish to find
\[ x^* = \mathrm{argmin}_{x \in \mathcal{B}} \; f(x). \]
This means that all we get to do is evaluate \(f\) at inputs \(x \in \mathcal{B}\) of our choosing; no gradients or other structural information are available.
Let's consider an implementation of the Booker et al. idea on a re-scaled/coded version of the Goldstein-Price function.
f <- function(X)
{
if(is.null(nrow(X))) X <- matrix(X, nrow=1)
m <- 8.6928
s <- 2.4269
x1 <- 4 * X[,1] - 2
x2 <- 4 * X[,2] - 2
a <- 1 + (x1 + x2 + 1)^2 * (19 - 14 * x1 + 3 * x1^2 - 14 *
x2 + 6 * x1 * x2 + 3 * x2^2)
b <- 30 + (2 * x1 - 3 * x2)^2 * (18 - 32 * x1 + 12 * x1^2 +
48 * x2 - 36 * x1 * x2 + 27 * x2^2)
f <- log(a * b)
f <- (f - m)/s
return(f)
}
Let's start with a small initial design in the 2d space.
library(lhs)
ninit <- 12
X <- randomLHS(ninit, 2)
y <- f(X)
Now let's fit a (separable) GP to that data, with a small nugget.
library(laGP)
gpi <- newGPsep(X, y, d=0.1, g=1e-6, dK=TRUE)
da <- darg(list(mle=TRUE, max=0.5), X)
mleGPsep(gpi, param="d", tmin=da$min, tmax=da$max, ab=da$ab)$msg
## [1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
Now, set up an objective to search based on the predictive mean.
obj.mean <- function(x, gpi) predGPsep(gpi, matrix(x, nrow=1), lite=TRUE)$mean
That predictive surface (like the function \(f\)) may have many local minima, so we start a local solver from the best input found so far.
m <- which.min(y)
opt <- optim(X[m,], obj.mean, lower=0, upper=1, method="L-BFGS-B", gpi=gpi)
opt$par
## [1] 0.4491681 0.1494378
Here’s what that looks like in the input domain.
plot(X[1:ninit,], xlab="x1", ylab="x2", xlim=c(0,1), ylim=c(0,1))
arrows(X[m,1], X[m,2], opt$par[1], opt$par[2], length=0.1)
Evaluate \(f\) at `opt$par`, update the GP …
ynew <- f(opt$par)
updateGPsep(gpi, matrix(opt$par, nrow=1), ynew)
mle <- mleGPsep(gpi, param="d", tmin=da$min, tmax=da$max, ab=da$ab)
X <- rbind(X, opt$par)
y <- c(y, ynew)
… and solve for the next point.
m <- which.min(y)
opt <- optim(X[m,], obj.mean, lower=0, upper=1, method="L-BFGS-B", gpi=gpi)
opt$par
## [1] 0.4343782 0.3078317
Here’s what that looks like in the input domain.
plot(X, xlab="x1", ylab="x2", xlim=c(0,1), ylim=c(0,1))
n <- nrow(X)
arrows(X[m,1], X[m,2], opt$par[1], opt$par[2], length=0.1)
Update for the most recent point.
ynew <- f(opt$par)
updateGPsep(gpi, matrix(opt$par, nrow=1), ynew)
mle <- mleGPsep(gpi, param="d", tmin=da$min, tmax=da$max, ab=da$ab)
X <- rbind(X, opt$par)
y <- c(y, ynew)
Looping: a few more iterations, stopping when successive objective values change by less than a small tolerance.
while(1) {
m <- which.min(y)
opt <- optim(X[m,], obj.mean, lower=0, upper=1,
method="L-BFGS-B", gpi=gpi)
ynew <- f(opt$par)
if(abs(ynew - y[length(y)]) < 1e-4) break;
updateGPsep(gpi, matrix(opt$par, nrow=1), ynew)
mle <- mleGPsep(gpi, param="d", tmin=da$min, tmax=da$max, ab=da$ab)
X <- rbind(X, opt$par)
y <- c(y, ynew)
}
prog <- y
for(i in 2:length(y))
if(prog[i] > prog[i-1]) prog[i] <- prog[i-1]
plot(prog, type="l")
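Before comparing against alternatives, let's wrap this whole surrogate-assisted scheme into a stand-alone function so that it can be restarted many times.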
optim.surr <- function(f, ninit, stop, tol=1e-4)
{
X <- randomLHS(ninit, 2)
y <- f(X)
gpi <- newGPsep(X, y, d=0.1, g=1e-7, dK=TRUE)
da <- darg(list(mle=TRUE, max=0.5), randomLHS(1000, 2))
mleGPsep(gpi, param="d", tmin=da$min, tmax=da$max, ab=da$ab)
for(i in (ninit+1):stop) {
m <- which.min(y)
opt <- optim(X[m,], obj.mean, lower=0, upper=1,
method="L-BFGS-B", gpi=gpi)
ynew <- f(opt$par)
if(abs(ynew - y[length(y)]) < tol) break;
updateGPsep(gpi, matrix(opt$par, nrow=1), ynew)
mleGPsep(gpi, param="d", tmin=da$min, tmax=da$max, ab=da$ab)
X <- rbind(X, opt$par)
y <- c(y, ynew)
}
deleteGPsep(gpi)
return(list(X=X, y=y))
}
Let's repeatedly solve the problem in this way with 100 random initializations.
reps <- 100
prog <- matrix(NA, nrow=reps, ncol=50)
for(r in 1:reps) {
os <- optim.surr(f, 12, 50)
prog[r,1:length(os$y)] <- os$y
for(i in 2:50) {
if(is.na(prog[r,i]) || prog[r,i] > prog[r,i-1])
prog[r,i] <- prog[r,i-1]
}
}
Note that these are random initializations.
Clearly this is not a global optimization tool.
matplot(t(prog), type="l", col="gray", lty=1, ylab="progress", xlab="its")
How does `optim` compare? First, we need to modify \(f\) so we can keep track of the path of evaluations, since `optim` will also call \(f\) to approximate gradients.
fprime <- function(x)
{
ynew <- f(x)
y <<- c(y, ynew)
return(ynew)
}
Now let's loop a bunch of times; `optim` will only allow us to control the "outer" iterations. Here is the same for loop we did with the surrogate-based optimizer, using `optim` instead.
prog.optim <- matrix(NA, nrow=reps, ncol=50)
for(r in 1:reps) {
y <- c()
os <- optim(runif(2), fprime, lower=0, upper=1, method="L-BFGS-B")
prog.optim[r,1:length(y)] <- y <- y[1:min(50, length(y))]
for(i in 2:length(y)) {
if(is.na(prog.optim[r,i]) || prog.optim[r,i] > prog.optim[r,i-1])
prog.optim[r,i] <- prog.optim[r,i-1]
}
}
`optim` progress is much slower on a fixed budget.
matplot(t(prog.optim), type="l", col="red", lty=1, ylab="progress", xlab="its")
matlines(t(prog), type="l", col="gray", lty=1)
legend("topright", c("surr", "optim"), col=c("gray", "red"), lty=1)
Our surrogate-based optimization is more efficient because each of its choices is informed by a global model of \(f\), whereas `optim` only bases evaluations on a local linear approximation. But surrogate optimization is still mostly a local affair, since we deploy a local `optim` subroutine on the predictive surface.
We need some way to balance exploration and exploitation.
In the mid 1990s, Matthias Schonlau was working on his dissertation on computer experiments and global optimization.
He came up with a heuristic called expected improvement (EI), which is the basis of the so-called efficient global optimization (EGO) algorithm (Jones, Schonlau & Welch, 1998).
His key insight was that predictive uncertainty was underutilized in the surrogate framework for optimization.
Schonlau defined a statistic called the improvement
\[ I(x) = \max\{0, f^{\mathrm{min}}_n - Y(x)\} \]
which is a random variable measuring the amount by which an unknown response \(Y(x)\) lies below the smallest value observed so far,
\[ f^{\mathrm{min}}_n = \min \{y_1, \dots, y_n\}. \]
Now there are lots of things you could imagine doing with the improvement, but Schonlau focused on its expectation.
It is easiest to imagine what the expected improvement might look like through a Monte Carlo approximation: draw \(y^{(t)} \sim Y(x)\) from the predictive distribution and average,
\[ \frac{1}{T} \sum_{t=1}^T \max\{0, f^{\min}_n - y^{(t)} \} \rightarrow \mathbb{E}\{I(x)\} \quad \mbox{ as } T \rightarrow \infty. \]
The cool thing is that if \(Y(x)\) is Gaussian,
the EI has a convenient closed form expression.
\[ \mathbb{E}\{I(x)\} = (f^n_{\min} - \mu_n(x))\,\Phi\!\left( \frac{f^n_{\min} - \mu_n(x)}{\sigma_n(x)}\right) + \sigma_n(x)\, \phi\!\left( \frac{f^n_{\min} - \mu_n(x)}{\sigma_n(x)}\right) \]
Notice how it organically balances exploitation (small \(\mu_n(x)\)) and exploration (large \(\sigma_n(x)\)).
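As a quick sanity check (with made-up values for \(\mu_n(x)\), \(\sigma_n(x)\), and \(f^{\min}_n\)), the Monte Carlo approximation and the closed form agree:
## Monte Carlo vs. closed-form EI at a single hypothetical location
mu <- -0.5; sigma <- 0.3; fmin <- -0.3        ## made-up predictive moments and best value
nmc <- 100000
yt <- rnorm(nmc, mean=mu, sd=sigma)           ## draws from Y(x)
ei.mc <- mean(pmax(0, fmin - yt))             ## Monte Carlo approximation
dn <- (fmin - mu)/sigma
ei.cf <- (fmin - mu)*pnorm(dn) + sigma*dnorm(dn)   ## closed form
c(mc=ei.mc, closed=ei.cf)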
A useful cartoon: see gp_ei_sin.R with the course materials. (That code uses a hodge-podge of libraries, and I didn't want to re-write it.)
The `laGP` package doesn't include an EI calculation, so here is one based on its predictive equations.
EI <- function(gpi, x, fmin, pred=predGPsep)
{
if(is.null(nrow(x))) x <- matrix(x, nrow=1)
p <- pred(gpi, x, lite=TRUE)
d <- fmin - p$mean
sigma <- sqrt(p$s2)
dn <- d/sigma
ei <- d*pnorm(dn) + sigma*dnorm(dn)
return(ei)
}
To use it as an objective in a surrogate-based optimization:
obj.EI <- function(x, fmin, gpi) - EI(gpi, x, fmin)
EI has a "maximizing variance" aspect, which can cause the EI surface to be multi-modal, so a single local search may miss its global maximum.
The number of EI modes will fluctuate as the algorithm runs, so a sensible multi-start scheme might include the best input found so far plus a few random starting locations.
How about the following?
eps <- sqrt(.Machine$double.eps)  ## numerical tolerance (used below)

EI.search <- function(X, y, gpi, multi.start=5)
{
m <- which.min(y)
fmin <- y[m]
start <- matrix(X[m,], nrow=1)
if(multi.start > 1)
start <- rbind(start, randomLHS(multi.start-1, ncol(X)))
xnew <- matrix(NA, nrow=nrow(start), ncol=ncol(X)+1)
for(i in 1:nrow(start)) {
if(EI(gpi, start[i,], fmin) <= eps)
{ out <- list(value=-Inf); next }
out <- optim(start[i,], obj.EI, method="L-BFGS-B",
lower=0, upper=1, gpi=gpi, fmin=fmin)
xnew[i,] <- c(out$par, -out$value)
}
solns <- data.frame(cbind(start, xnew))
names(solns) <- c("s1", "s2", "x1", "x2", "val")
solns <- solns[!is.na(solns$val) & solns$val > sqrt(.Machine$double.eps),]
return(solns)
}
Initializing the GP fit.
X <- randomLHS(ninit, 2)
y <- f(X)
gpi <- newGPsep(X, y, d=0.1, g=1e-6, dK=TRUE)
da <- darg(list(mle=TRUE, max=0.5), X)
Performing an EI search.
solns <- EI.search(X, y, gpi)
m <- which.max(solns$val)
maxei <- solns$val[m]
plot(X, xlab="x1", ylab="x2", xlim=c(0,1), ylim=c(0,1))
arrows(solns$s1, solns$s2, solns$x1, solns$x2, length=0.1)
points(solns$x1[m], solns$x2[m], col=2, pch=20)
Incorporate the new data at the chosen input location.
xnew <- as.matrix(solns[m,3:4])
X <- rbind(X, xnew)
y <- c(y, f(xnew))
updateGPsep(gpi, xnew, y[length(y)])
mle <- mleGPsep(gpi, param="d", tmin=da$min, tmax=da$max, ab=da$ab)
And do another iteration, and update.
solns <- EI.search(X, y, gpi)
m <- which.max(solns$val)
maxei <- c(maxei, solns$val[m])
xnew <- as.matrix(solns[m,3:4])
X <- rbind(X, xnew)
y <- c(y, f(xnew))
updateGPsep(gpi, xnew, y[length(y)])
mle <- mleGPsep(gpi, param="d", tmin=da$min, tmax=da$max, ab=da$ab)
Clearly a multi-modal criterion.
plot(X, xlab="x1", ylab="x2", xlim=c(0,1), ylim=c(0,1))
arrows(solns$s1, solns$s2, solns$x1, solns$x2, length=0.1)
points(solns$x1[m], solns$x2[m], col=2, pch=20)
Careful: similar successive \(y\)-values are no longer a good measure of convergence, so here we simply run out a fixed budget of evaluations.
for(i in nrow(X):50) {
solns <- EI.search(X, y, gpi)
m <- which.max(solns$val)
maxei <- c(maxei, solns$val[m])
xnew <- as.matrix(solns[m,3:4])
ynew <- f(xnew)
X <- rbind(X, xnew); y <- c(y, ynew)
updateGPsep(gpi, xnew, y[length(y)])
mle <- mleGPsep(gpi, param="d", tmin=da$min, tmax=da$max, ab=da$ab)
}
Calculating progress …
prog.ei <- y
for(i in 2:length(y))
if(prog.ei[i] > prog.ei[i-1]) prog.ei[i] <- prog.ei[i-1]
par(mfrow=c(1,2))
plot(prog.ei, type="l", ylab="fmin progress", xlab="its")
plot(maxei, type="l", xlab="its", ylab="max EI")
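As before, let's wrap the EI-based optimizer into a stand-alone function so we can benchmark it over many restarts.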
optim.EI <- function(f, ninit, stop)
{
X <- randomLHS(ninit, 2); y <- f(X)
gpi <- newGPsep(X, y, d=0.1, g=1e-7, dK=TRUE)
da <- darg(list(mle=TRUE, min=eps, max=0.5), X)
mleGPsep(gpi, param="d", tmin=da$min, tmax=da$max, ab=da$ab)
maxei <- c()
for(i in (ninit+1):stop) {
solns <- EI.search(X, y, gpi)
m <- which.max(solns$val)
maxei <- c(maxei, solns$val[m])
xnew <- as.matrix(solns[m,3:4])
ynew <- f(xnew)
updateGPsep(gpi, matrix(xnew, nrow=1), ynew)
mleGPsep(gpi, param="d", tmin=da$min, tmax=da$max, ab=da$ab)
X <- rbind(X, xnew); y <- c(y, ynew)
}
deleteGPsep(gpi)
return(list(X=X, y=y, maxei=maxei))
}
Let's repeatedly solve the problem in this way with 100 random initializations.
reps <- 100
prog.ei <- matrix(NA, nrow=reps, ncol=50)
for(r in 1:reps) {
os <- optim.EI(f, 12, 50)
prog.ei[r,1:length(os$y)] <- os$y
for(i in 2:length(os$y)) {
if(is.na(prog.ei[r,i]) || prog.ei[r,i] > prog.ei[r,i-1])
prog.ei[r,i] <- prog.ei[r,i-1]
}
}
The next slide shows the average progress of our three methods so far.
plot(colMeans(prog.ei), col=1, lwd=2, ylab="fmin progress", type="l")
lines(colMeans(prog), col="gray",lwd=2)
lines(colMeans(prog.optim, na.rm=TRUE), col=2,lwd=2)
legend("topright", c("optim", "surr", "ei"), col=c(2, "gray", 1), lty=1)
Only once or twice out of 100 repeats did EI fail to find the global min within 50 iterations.
boxplot(prog.ei[,50], prog[,50], prog.optim[,50], names=c("ei", "surr", "optim"))
Under certain regularity conditions, the EGO algorithm (i.e., EI searches) will converge to a global optimum.
In practice, it does really well.
You can also show that each sequential EI-based decision is optimal in a one-step-ahead (myopic) sense.
How do you handle noise?
Now let's turn to constrained optimization. First, let's keep it simple and assume the constraints are known.
The problem is
\[ x^* = \mathrm{argmin}_{x \in \mathcal{B}} \; f(x) \quad \mbox{ subject to } \quad c(x) \leq 0. \]
One simple method is to extend EI to what is called expected feasible improvement (EFI) (Schonlau, Jones & Welch, 1998):
\[ \mbox{EFI}(x) = \mathbb{E}\{I(x)\}\, \mathbb{I}(c(x) \leq 0), \]
where \(\mathbb{I}(\cdot)\) is an indicator, zeroing out improvement at infeasible inputs.
The problem is unchanged
\[ x^* = \mathrm{argmin}_{x \in \mathcal{B}} \; f(x) \quad \mbox{ subject to } \quad c(x) \leq 0, \]
but now \(c(x)\) is also a blackbox. We'll need a model for \(c(x)\), and the appropriate model will depend on the nature of the function: for example, a classifier if only feasibility is observed, or a regression model (e.g., a GP) if the constraint values are real-valued.
If \(p_n(x)\) is the predicted probability that input \(x\) satisfies the constraint, then
\[ \mbox{EFI}(x) = \mathbb{E}\{I(x)\}\, p_n(x). \]
Usually the \(c(x)\) are real-valued and exist in multitude, \(\{c^{(j)}\}\), \(j=1,\dots,m\).
EFI adapts nicely to this setting:
\[ \mbox{EFI}(x) = \mathbb{E}\{I(x)\} \prod_{j=1}^m p^{(j)}_n(x), \]
where each \(p^{(j)}_n(x)\) can be evaluated with `pnorm` given GP predictive moments \(\mu_n^{(j)}(x)\) and \(\sigma_n^{(j)2}(x)\). These constrained optimizations are hard even when the objective is "easy", because the feasible region can be awkwardly shaped.
Here is a toy problem to fix ideas.
\[ \min_{x} \left\{ x_1+x_2 : c_1(x) \leq 0, \, c_2(x) \leq 0, \, x\in [0,1]^2 \right\} \]
\[ \begin{aligned} c_1(x) &= \frac{3}{2} - x_1 - 2x_2 - \frac{1}{2}\sin\left(2\pi(x_1^2-2x_2)\right) \\ c_2(x) &= x_1^2+x_2^2-\frac{3}{2} \end{aligned} \]
Even when \(f(x) = x_1 + x_2\) is known, this is hard when \(c(x)\) is a blackbox.
Mathematical programming has efficient algorithms for non-linear (blackbox) optimization under constraints, with provable local convergence.
Statistical approaches, by contrast, have a more global scope, but they offer limited support for constraints.
One such framework involves the so-called augmented Lagrangian (AL):
\[ L_A(x;\lambda, \rho) = f(x) +\lambda^\top c(x) + \frac{1}{2\rho} \sum_{j=1}^m \max \left(0,c_j(x) \right)^2, \]
where \(\lambda \in \mathbb{R}^m_+\) is a vector of Lagrange multipliers and \(\rho > 0\) is a penalty parameter.
AL-based methods thereby transform a constrained problem into a sequence of simpler unconstrained ones.
Without the Lagrangian term \(\lambda^\top c(x)\), the AL reduces to an ordinary (quadratic) penalty method.
Given \((\rho^{k-1}, \lambda^{k-1})\), one solves the unconstrained subproblem for \(x^k\), then updates \(\lambda^k_j = \max\!\big(0, \lambda^{k-1}_j + \tfrac{1}{\rho^{k-1}} c_j(x^k)\big)\) and shrinks \(\rho^k\) (e.g., halves it) if \(x^k\) is still infeasible; the sketch below makes this concrete.
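To make the outer update concrete, here is a tiny sketch of the bookkeeping, with made-up values standing in for \(c(x^k)\) returned by the inner solve.
## one "outer" AL update (sketch)
lambda <- c(0, 0); rho <- 1/2              ## initial multipliers and penalty
cxk <- c(0.2, -0.1)                        ## pretend c(x^k) from the inner solve
lambda <- pmax(0, lambda + (1/rho)*cxk)    ## multiplier update
if(any(cxk > 0)) rho <- rho/2              ## shrink penalty if still infeasible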
See wildprob.R
with the course material.
AL methods are not designed for global optimization; however, the convergence results have a certain robustness.
Even if the “inner” sub-problem
\[ x^k = \mbox{arg}\min_x \left\{ L_A(x;\lambda^{k-1},\rho^{k-1}) : x\in \mathcal{B}\right\} \]
cannot be solved exactly, convergence of the outer loop can still be assured under relatively mild conditions.
How is “outer” convergence determined?
The "inner" solver can be anything. Our interactive demo used Nelder-Mead (`optim`'s default), since approximating derivatives of a blackbox is expensive.
The idea is to train the “inner” solver with all evaluations
\[ (x_1, f(x_1), c(x_1)), \dots, (x_n, f(x_n), c(x_n)) \]
collected over all “inner” and “outer” loops (Gramacy, et al., 2016).
Consider a separate/independent GP model for each component of the AL.
The distribution of the composite random variable
\[ Y(x) = Y_f(x) + \lambda^\top Y_c(x) + \frac{1}{2\rho} \sum_{j=1}^m \max(0, Y_{c_j}(x))^2 \]
can serve as a surrogate for \(L_A(x; \lambda, \rho)\), e.g., guiding the inner `optim`-style search.
The composite posterior mean is available in closed form under GP priors.
\[ \mathbb{E} \{ Y(x)\} = \mu_f^n(x) + \lambda^\top \mu_c^n(x) + \frac{1}{2\rho} \sum_{j=1}^m \mathbb{E}\{ \max(0, Y_{c_j}(x))^2\} \]
A result from generalized EI (Schonlau, Jones & Welch, 1998) helps us work out the expectation inside that sum above.
\[ \begin{aligned} & \mathbb{E} \{ \max(0, Y_{c_j}(x))^2\} = \mathbb{E}\{I_{-Y_{c_j}}(x)\}^2 + \mathbb{V}\mathrm{ar}[I_{-Y_{c_j}}(x)] \\ &= \sigma^{2n}_{c_j}(x)\left[\left( 1+ \left(\frac{\mu^n_{c_j}(x)}{\sigma_{c_j}^n(x)}\right)^{2}\right) \Phi\left(\frac{\mu^n_{c_j}(x)}{\sigma_{c_j}^n(x)} \right) +\frac{\mu^n_{c_j}(x)}{\sigma_{c_j}^n(x)} \phi\left(\frac{\mu^n_{c_j}(x)}{\sigma_{c_j}^n(x)} \right) \right]. \end{aligned} \]
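A quick Monte Carlo check of that expression, using made-up predictive moments for one constraint:
## verify E{max(0, Y)^2} for Y ~ N(mu, sigma^2) against Monte Carlo
mu <- 0.2; sigma <- 0.4
z <- mu/sigma
cf <- sigma^2*((1 + z^2)*pnorm(z) + z*dnorm(z))        ## closed form from above
mc <- mean(pmax(0, rnorm(1e6, mean=mu, sd=sigma))^2)   ## Monte Carlo
c(closed=cf, mc=mc)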
The "max" in the AL makes analytic calculation of EI intractable, so the simplest way to evaluate the EI is via Monte Carlo: sample \(Y_f(x)\) and the \(Y_{c_j}(x)\) from their GP predictive distributions, plug them into the composite, and average the improvements, as in the sketch below.
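Here is what that Monte Carlo calculation might look like at a single candidate \(x\), with made-up predictive moments for the objective and two constraints; `lamin` stands for the smallest AL value seen so far (also assumed).
## Monte Carlo EI for the AL composite at one candidate x (illustrative numbers)
nmc <- 10000
lambda <- c(0.1, 0.2); rho <- 0.5; lamin <- 0.75   ## multipliers, penalty, best AL so far
Yf <- rnorm(nmc, mean=0.8, sd=0.1)                 ## draws from the objective GP at x
Yc1 <- rnorm(nmc, mean=-0.1, sd=0.2)               ## draws from the constraint GPs at x
Yc2 <- rnorm(nmc, mean=0.05, sd=0.1)
Y <- Yf + lambda[1]*Yc1 + lambda[2]*Yc2 +
  (1/(2*rho))*(pmax(0, Yc1)^2 + pmax(0, Yc2)^2)    ## composite AL draws
mean(pmax(0, lamin - Y))                           ## Monte Carlo EI estimate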
But you can remove the “max” and obtain an analytic EI with slack variables.
Slacks also facilitate the only EI-based method for handling mixed (equality and inequality) constraints (Picheny, et al., 2016).
It is too much to code all this up by ourselves for a real-time run in lecture; fortunately there is an `optim.auglag` implementation in `laGP`. Here is an implementation of the toy problem in R, using the format required for `optim.auglag`.
aimprob <- function(X, known.only=FALSE)
{
if(is.null(nrow(X))) X <- matrix(X, nrow=1)
f <- rowSums(X)
if(known.only) return(list(obj=f))
c1 <- 1.5-X[,1]-2*X[,2]-0.5*sin(2*pi*(X[,1]^2-2*X[,2]))
c2 <- rowSums(X^2)-1.5
return(list(obj=f, c=cbind(c1,c2)))
}
And we’ll work in the following bounding box.
B <- matrix(c(rep(0,2),rep(1,2)),ncol=2)
One non-AL version (EFI) is also provided by `laGP`.
efi <- optim.efi(aimprob, B, end=50, verb=0)
Here is one AL version ("EY"), guided by the posterior mean surface of the AL composite of separate surrogate models.
ey <- optim.auglag(aimprob, B, end=50, ey.tol=1, verb=0)
And three variations with EI on the AL composite: Monte Carlo (the default), slack variables (`slack=TRUE`), and a slack-variable variant (`slack=2`).
ei.mc <- optim.auglag(aimprob, B, end=50, verb=0)
ei.sl <- optim.auglag(aimprob, B, end=50, slack=TRUE, verb=0)
ei.slopt <- optim.auglag(aimprob, B, end=50, slack=2, verb=0)
plot(efi$prog, type="l", ylim=c(0.6, 1.6), ylab="progress", xlab="evaluations")
lines(ey$prog, col=2, lty=2); lines(ei.mc$prog, col=3, lty=3)
lines(ei.sl$prog, col=4, lty=4); lines(ei.slopt$prog, col=5, lty=5)
legend("topright", c("EFI", "EY", "EI.mc", "EI.sl", "EI.slopt"), col=1:5, lty=1:5)
Averaging results over 100 restarts gives a fuller comparison; by contrast, `optim` working directly on the AL required 100+ evaluations for local convergence.
Want something harder? Sure, no problem. How about this crazy one?
f2d <- function(x, y=NULL)
{
if(is.null(y)) {
if(!is.matrix(x)) x <- matrix(x, ncol=2)
y <- x[,2]; x <- x[,1]
}
g <- function(z)
return(exp(-(z-1)^2) + exp(-0.8*(z+1)^2) - 0.05*sin(8*(z+0.1)))
return(-g(x)*g(y))
}
aimprob2 <- function(X, known.only = FALSE)
{
if(is.null(nrow(X))) X <- matrix(X, nrow=1)
if(known.only) stop("no outputs are treated as known")
f <- f2d(4*(X-0.5))
c1 <- 1.5 - X[,1] - 2*X[,2] - 0.5*sin(2*pi*(X[,1]^2 - 2*X[,2]))
c2 <- rowSums(X^2)-1.5
return(list(obj=f, c=cbind(c1,c2)))
}
Here is a little function to re-draw the surface on-demand.
plot.aimprob2 <- function()
{
x <- seq(0,1, length=200)
X <- expand.grid(x, x)
out <- aimprob2(as.matrix(X))
fv <- out$obj
fv[out$c[,1] > 0 | out$c[,2] > 0] <- NA
fi <- out$obj
fi[!(out$c[,1] > 0 | out$c[,2] > 0)] <- NA
plot(0, 0, type="n", xlim=B[1,], ylim=B[2,], xlab="x1", ylab="x2")
contour(x, x, matrix(out$c[,1], ncol=length(x)), nlevels=1, levels=0,
drawlabels=FALSE, add=TRUE, lwd=2)
contour(x, x, matrix(out$c[,2], ncol=length(x)), nlevels=1, levels=0,
drawlabels=FALSE, add=TRUE, lwd=2)
contour(x, x, matrix(fv, ncol=length(x)), nlevels=10, add=TRUE,
col="forestgreen")
contour(x, x, matrix(fi, ncol=length(x)), nlevels=13, add=TRUE, col=2, lty=2)
}
plot.aimprob2()
Pretty fast progress.
out2 <- optim.auglag(aimprob2, B, fhat=TRUE, start=20, end=50, verb=0)
plot(out2$prog, type="l", ylab="best valid value", xlab="blackbox evaluations")
plot.aimprob2()
v <- apply(out2$C, 1, function(x) { all(x <= 0) })
X <- out2$X[v,]; obj <- out2$obj[v]; xbest <- X[which.min(obj),]
points(xbest[1], xbest[2], pch=10, col="blue", cex=1.5)
How about initializing an `optim` search from that point to see if we can "drill down" any further?
aimprob2.AL <- function(x, B, lambda, rho)
{
if(any(x < B[,1]) | any(x > B[,2])) return(Inf)
fc <- aimprob2(x)
al <- fc$obj + lambda%*%drop(fc$c) + rep(1/(2*rho),2)%*%pmax(0,drop(fc$c))^2
return(al)
}
## loop over AL updates until a valid solution is found
lambda <- out2$lambda[nrow(out2$lambda),]; rho <- out2$rho[length(out2$rho)]
while(1) {
o <- optim(xbest, aimprob2.AL, control=list(maxit=15),
B=B, lambda=lambda, rho=rho)
fc <- aimprob2(o$par)
if(all(fc$c <= 0)) { break
} else {
lambda <- pmax(0, lambda + (1/rho)*fc$c)
rho <- rho/2; xbest <- o$par
}
}
plot.aimprob2()
points(o$par[1], o$par[2], pch=18, col="blue")
segments(xbest[1], xbest[2], o$par[1], o$par[2])
For further comparison with `optim` directly on the AL, see `demo("ALfhat")` in the `laGP` package. Two other demos show a mixed constraints setup (`demo("GSBP")`) and a further challenging constrained problem (`demo("LAH")`). Try `demo("GSBP")`: such a crazy surface!