Release e10 · kyegomez/Sophia

Here are five optimization suggestions for the Sophia class:

Use torch.einsum to compute the dot product in the hutchinson method.
Use torch.no_grad() to avoid unnecessary gradient computations during the parameter update.
Use in-place operations for updating the parameters.
Cache group['eps'] and group['rho'] in local variables to avoid repeated dictionary lookups inside the parameter loop.
Use a more efficient method to compute the softmax and loss in the gauss_newton_bartlett method.
Pseudocode
Modify the hutchinson method to use torch.einsum for the dot product.
Use torch.no_grad() in the step method during the parameter update.
Replace add_ with addcdiv_ for in-place operations in the step method.
Cache group['eps'] and group['rho'] in local variables at the top of each parameter group in the step method.
Compute the softmax and loss more efficiently in the gauss_newton_bartlett method.
PyTorch Python Code
import torch

class Sophia(torch.optim.Optimizer):
    """Sophia optimizer: momentum preconditioned by a clipped diagonal Hessian.

    Each step keeps an exponential moving average ``m`` of the gradient and,
    every ``k`` steps, refreshes an exponential moving average ``h`` of a
    diagonal curvature estimate. Parameters are then moved by
    ``lr * clip(m / max(h, eps), rho)`` after decoupled weight decay.

    Args:
        model: Callable mapping one element of ``input_data`` to logits.
            Only used by the "Gauss-Newton-Bartlett" estimator.
        input_data: Sized iterable of model inputs. Only used by the
            "Gauss-Newton-Bartlett" estimator.
        params: Iterable of parameters (or parameter-group dicts) to optimize.
        lr: Learning rate.
        betas: ``(beta1, beta2)`` decay rates for ``m`` and ``h``.
        eps: Lower bound applied to ``h`` before dividing.
        weight_decay: Decoupled weight-decay coefficient.
        k: Number of steps between curvature refreshes.
        estimator: ``"Hutchinson"`` or ``"Gauss-Newton-Bartlett"``.
        rho: Element-wise clipping threshold for the preconditioned update.
        loss_function: ``loss_function(logits, labels)`` used by the
            Gauss-Newton-Bartlett estimator, where ``labels`` are sampled
            class indices. Defaults to ``torch.nn.functional.cross_entropy``.

    Note:
        The Hutchinson estimator differentiates ``p.grad`` again, so the
        caller must produce gradients with ``loss.backward(create_graph=True)``.
    """

    def __init__(self, model, input_data, params, lr=1e-3, betas=(0.9, 0.999),
                 eps=1e-8, weight_decay=0, k=10, estimator="Hutchinson", rho=1,
                 loss_function=None):
        self.model = model
        self.input_data = input_data
        if loss_function is None:
            loss_function = torch.nn.functional.cross_entropy
        self.loss_function = loss_function
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                        k=k, estimator=estimator, rho=rho)
        super().__init__(params, defaults)

    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: Optional callable that re-evaluates the model and
                returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if none was given.

        Raises:
            RuntimeError: If a parameter has a sparse gradient.
            ValueError: If a group's ``estimator`` is not recognized.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # Pass 1: refresh m and h for every parameter BEFORE any parameter is
        # modified. Curvature estimates differentiate through graphs that may
        # reference other parameters, so updating in the same loop would
        # invalidate (or silently skew) later estimates.
        pending = []
        for group in self.param_groups:
            beta1, beta2 = group['betas']
            k = group['k']
            estimator = group['estimator']
            for p in group['params']:
                if p.grad is None:
                    continue
                if p.grad.is_sparse:
                    raise RuntimeError("Sophia does not support sparse gradients")

                state = self.state[p]
                if not state:
                    state['step'] = 0
                    state['m'] = torch.zeros_like(p)
                    state['h'] = torch.zeros_like(p)

                m, h = state['m'], state['h']
                state['step'] += 1

                m.mul_(beta1).add_(p.grad.detach(), alpha=1 - beta1)

                # Refresh on steps 1, k+1, 2k+1, ...; written this way so that
                # k == 1 refreshes on every step instead of never.
                if (state['step'] - 1) % k == 0:
                    if estimator == "Hutchinson":
                        hessian_estimate = self.hutchinson(p, p.grad)
                    elif estimator == "Gauss-Newton-Bartlett":
                        hessian_estimate = self.gauss_newton_bartlett(p, p.grad)
                    else:
                        raise ValueError("Invalid estimator choice")
                    h.mul_(beta2).add_(hessian_estimate, alpha=1 - beta2)

                pending.append((p, group, m, h))

        # Pass 2: apply the parameter updates.
        with torch.no_grad():
            for p, group, m, h in pending:
                lr = group['lr']
                eps = group['eps']
                rho = group['rho']
                weight_decay = group['weight_decay']

                # Decoupled weight decay only; it is not also folded into the
                # gradient, which would apply the decay twice.
                if weight_decay != 0:
                    p.mul_(1 - lr * weight_decay)

                # Flooring h at eps guards against zero/negative curvature
                # estimates; clipping bounds each coordinate's move by lr * rho.
                update = (m / h.clamp(min=eps)).clamp_(min=-rho, max=rho)
                p.add_(update, alpha=-lr)

        return loss

    def hutchinson(self, p, grad):
        """Return Hutchinson's diagonal Hessian estimate ``u * (H @ u)``.

        Args:
            p: Parameter to differentiate with respect to.
            grad: Gradient of the loss w.r.t. ``p`` that still carries its
                autograd graph.

        Raises:
            RuntimeError: If ``grad`` is detached from the autograd graph.
        """
        if not grad.requires_grad:
            raise RuntimeError(
                "Hutchinson estimator needs a differentiable gradient; "
                "call loss.backward(create_graph=True) before step()"
            )
        u = torch.randn_like(grad)
        grad_dot_u = torch.einsum("...,...->", grad, u)
        hessian_vector_product = torch.autograd.grad(
            grad_dot_u, p, retain_graph=True, allow_unused=True
        )[0]
        if hessian_vector_product is None:
            # The gradient does not depend on p, so this Hessian block is zero.
            return torch.zeros_like(u)
        return u * hessian_vector_product

    def gauss_newton_bartlett(self, p, grad):
        """Return the Gauss-Newton-Bartlett diagonal Hessian estimate for ``p``.

        Labels are sampled from the model's own softmax distribution, the
        mean loss on those labels is differentiated w.r.t. ``p``, and the
        result is ``B * g_hat * g_hat`` with ``B = len(self.input_data)``.

        Args:
            p: Parameter to differentiate with respect to.
            grad: Unused; kept so both estimators share a signature.

        Raises:
            ValueError: If ``model`` or ``input_data`` is missing or empty.
        """
        if self.model is None or self.input_data is None or len(self.input_data) == 0:
            raise ValueError(
                "Gauss-Newton-Bartlett estimator requires a model and non-empty input_data"
            )
        # NOTE(review): this re-runs the model once per parameter; batching
        # the estimate across parameters would be cheaper.
        batch_count = len(self.input_data)
        total_loss = 0
        for xb in self.input_data:
            logits = self.model(xb)
            with torch.no_grad():
                # assumes the class dimension is last, i.e. logits are (C,)
                # or (N, C) -- TODO confirm for other model outputs
                probs = torch.softmax(logits, dim=-1)
                labels = torch.multinomial(
                    probs.reshape(-1, probs.shape[-1]), 1
                ).reshape(probs.shape[:-1])
            total_loss = total_loss + self.loss_function(logits, labels)
        g_hat = torch.autograd.grad(total_loss / batch_count, p)[0]
        return batch_count * g_hat * g_hat

Copy code
This updated Sophia class incorporates the suggested optimizations, making the code more efficient and potentially faster.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

e10