4 changes: 0 additions & 4 deletions modules/ui/OptimizerParamsWindow.py
@@ -116,18 +116,14 @@ def create_dynamic_ui(
'fused': {'title': 'Fused', 'tooltip': 'Whether to use a fused implementation if available. This implementation is usually faster and requires less memory.', 'type': 'bool'},
'fused_back_pass': {'title': 'Fused Back Pass', 'tooltip': 'Whether to fuse the back propagation pass with the optimizer step. This reduces VRAM usage, but is not compatible with gradient accumulation.', 'type': 'bool'},
'growth_rate': {'title': 'Growth Rate', 'tooltip': 'Limit for D estimate growth rate.', 'type': 'float'},
'initial_accumulator_value': {'title': 'Initial Accumulator Value', 'tooltip': 'Initial value for Adagrad optimizer.', 'type': 'float'},
'initial_accumulator': {'title': 'Initial Accumulator', 'tooltip': 'Sets the starting value for both moment estimates to ensure numerical stability and balanced adaptive updates early in training.', 'type': 'float'},
'is_paged': {'title': 'Is Paged', 'tooltip': 'Whether the optimizer\'s internal state should be paged to CPU.', 'type': 'bool'},
'log_every': {'title': 'Log Every', 'tooltip': 'Intervals at which logging should occur.', 'type': 'int'},
'lr_decay': {'title': 'LR Decay', 'tooltip': 'Rate at which learning rate decreases.', 'type': 'float'},
'max_unorm': {'title': 'Max Unorm', 'tooltip': 'Maximum value for gradient clipping by norms.', 'type': 'float'},
'maximize': {'title': 'Maximize', 'tooltip': 'Whether to maximize the optimization function.', 'type': 'bool'},
'min_8bit_size': {'title': 'Min 8bit Size', 'tooltip': 'Minimum tensor size for 8-bit quantization.', 'type': 'int'},
'quant_block_size': {'title': 'Quant Block Size', 'tooltip': 'Size of a block of normalized 8-bit quantization data. Larger values increase memory efficiency at the cost of data precision.', 'type': 'int'},
'momentum': {'title': 'Momentum', 'tooltip': 'Factor to accelerate SGD in the relevant direction.', 'type': 'float'},
'nesterov': {'title': 'Nesterov', 'tooltip': 'Whether to enable Nesterov momentum.', 'type': 'bool'},
'no_prox': {'title': 'No Prox', 'tooltip': 'Whether to use proximal updates or not.', 'type': 'bool'},
'optim_bits': {'title': 'Optim Bits', 'tooltip': 'Number of bits used for optimization.', 'type': 'int'},
'percentile_clipping': {'title': 'Percentile Clipping', 'tooltip': 'Gradient clipping based on percentile values.', 'type': 'int'},
'relative_step': {'title': 'Relative Step', 'tooltip': 'Whether to use a relative step size.', 'type': 'bool'},
8 changes: 0 additions & 8 deletions modules/util/config/TrainConfig.py
@@ -58,18 +58,14 @@ class TrainOptimizerConfig(BaseConfig):
fused: bool
fused_back_pass: bool
growth_rate: float
initial_accumulator_value: int
initial_accumulator: float
is_paged: bool
log_every: int
lr_decay: float
max_unorm: float
maximize: bool
min_8bit_size: int
quant_block_size: int
momentum: float
nesterov: bool
no_prox: bool
optim_bits: int
percentile_clipping: int
r: float
@@ -177,18 +173,14 @@ def default_values():
data.append(("fused", False, bool, False))
data.append(("fused_back_pass", False, bool, False))
data.append(("growth_rate", None, float, True))
data.append(("initial_accumulator_value", None, int, True))
data.append(("initial_accumulator", None, float, True))
data.append(("is_paged", False, bool, False))
data.append(("log_every", None, int, True))
data.append(("lr_decay", None, float, True))
data.append(("max_unorm", None, float, True))
data.append(("maximize", False, bool, False))
data.append(("min_8bit_size", None, int, True))
data.append(("quant_block_size", None, int, True))
data.append(("momentum", None, float, True))
data.append(("nesterov", False, bool, False))
data.append(("no_prox", False, bool, False))
data.append(("optim_bits", None, int, True))
data.append(("percentile_clipping", None, int, True))
data.append(("r", None, float, True))
134 changes: 0 additions & 134 deletions modules/util/create.py
@@ -289,33 +289,6 @@ def create_optimizer(
is_paged=optimizer_config.is_paged if optimizer_config.is_paged is not None else False,
)

# ADAGRAD Optimizer
case Optimizer.ADAGRAD:
import bitsandbytes as bnb
optimizer = bnb.optim.Adagrad(
params=parameters,
lr=config.learning_rate,
weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0,
eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-10,
lr_decay=optimizer_config.lr_decay if optimizer_config.lr_decay is not None else 0,
initial_accumulator_value=optimizer_config.initial_accumulator_value if optimizer_config.initial_accumulator_value is not None else 0,
)

# ADAGRAD_8BIT Optimizer
case Optimizer.ADAGRAD_8BIT:
import bitsandbytes as bnb
optimizer = bnb.optim.Adagrad8bit(
params=parameters,
lr=config.learning_rate,
weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0,
eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-10,
lr_decay=optimizer_config.lr_decay if optimizer_config.lr_decay is not None else 0,
initial_accumulator_value=optimizer_config.initial_accumulator_value if optimizer_config.initial_accumulator_value is not None else 0,
min_8bit_size=optimizer_config.min_8bit_size if optimizer_config.min_8bit_size is not None else 4096,
percentile_clipping=optimizer_config.percentile_clipping if optimizer_config.percentile_clipping is not None else 100,
block_wise=optimizer_config.block_wise if optimizer_config.block_wise is not None else True,
)

# RMSPROP Optimizer
case Optimizer.RMSPROP:
import bitsandbytes as bnb
@@ -473,83 +446,6 @@ def create_optimizer(
foreach=optimizer_config.foreach if optimizer_config.foreach is not None else False
)

# DADAPT_SGD Optimizer
case Optimizer.DADAPT_SGD:
import dadaptation as da
optimizer = da.DAdaptSGD(
params=parameters,
lr=config.learning_rate,
momentum=optimizer_config.momentum if optimizer_config.momentum is not None else 0.0,
weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0,
log_every=optimizer_config.log_every if optimizer_config.log_every is not None else 0,
d0=optimizer_config.d0 if optimizer_config.d0 is not None else 1e-6,
growth_rate=optimizer_config.growth_rate if optimizer_config.growth_rate is not None else float('inf'),
fsdp_in_use=optimizer_config.fsdp_in_use if optimizer_config.fsdp_in_use is not None else False,
)

# DADAPT_ADAM Optimizer
case Optimizer.DADAPT_ADAM:
import dadaptation as da
optimizer = da.DAdaptAdam(
params=parameters,
lr=config.learning_rate,
betas=(optimizer_config.beta1 if optimizer_config.beta1 is not None else 0.9,
optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999),
eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-8,
weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0,
log_every=optimizer_config.log_every if optimizer_config.log_every is not None else 0,
decouple=optimizer_config.decouple if optimizer_config.decouple is not None else False,
use_bias_correction=optimizer_config.use_bias_correction if optimizer_config.use_bias_correction is not None else False,
d0=optimizer_config.d0 if optimizer_config.d0 is not None else 1e-6,
growth_rate=optimizer_config.growth_rate if optimizer_config.growth_rate is not None else float('inf'),
fsdp_in_use=optimizer_config.fsdp_in_use if optimizer_config.fsdp_in_use is not None else False,
)

# DADAPT_ADAN Optimizer
case Optimizer.DADAPT_ADAN:
import dadaptation as da
optimizer = da.DAdaptAdan(
params=parameters,
lr=config.learning_rate,
betas=(optimizer_config.beta1 if optimizer_config.beta1 is not None else 0.98,
optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.92,
optimizer_config.beta3 if optimizer_config.beta3 is not None else 0.99),
eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-8,
weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0.02,
no_prox=optimizer_config.no_prox if optimizer_config.no_prox is not None else False,
log_every=optimizer_config.log_every if optimizer_config.log_every is not None else 0,
d0=optimizer_config.d0 if optimizer_config.d0 is not None else 1e-6,
growth_rate=optimizer_config.growth_rate if optimizer_config.growth_rate is not None else float('inf'),
)

# DADAPT_ADA_GRAD Optimizer
case Optimizer.DADAPT_ADA_GRAD:
import dadaptation as da
optimizer = da.DAdaptAdaGrad(
params=parameters,
lr=config.learning_rate,
momentum=optimizer_config.momentum if optimizer_config.momentum is not None else 0,
log_every=optimizer_config.log_every if optimizer_config.log_every is not None else 0,
weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0.0,
eps=optimizer_config.eps if optimizer_config.eps is not None else 0.0,
d0=optimizer_config.d0 if optimizer_config.d0 is not None else 1e-6,
growth_rate=optimizer_config.growth_rate if optimizer_config.growth_rate is not None else float('inf'),
)

# DADAPT_LION Optimizer
case Optimizer.DADAPT_LION:
import dadaptation as da
optimizer = da.DAdaptLion(
params=parameters,
lr=config.learning_rate,
betas=(optimizer_config.beta1 if optimizer_config.beta1 is not None else 0.9,
optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999),
weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0.0,
log_every=optimizer_config.log_every if optimizer_config.log_every is not None else 0,
d0=optimizer_config.d0 if optimizer_config.d0 is not None else 1e-6,
fsdp_in_use=optimizer_config.fsdp_in_use if optimizer_config.fsdp_in_use is not None else False,
)

# PRODIGY Optimizer
case Optimizer.PRODIGY:
import prodigyopt
@@ -1010,18 +906,6 @@ def create_optimizer(
degenerated_to_sgd=optimizer_config.degenerated_to_sgd if optimizer_config.degenerated_to_sgd is not None else True,
)

# TIGER Optimizer
case Optimizer.TIGER:
from pytorch_optimizer.optimizer.tiger import Tiger
optimizer = Tiger(
params=parameters,
lr=config.learning_rate if config.learning_rate is not None else 0,
weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0,
beta=optimizer_config.beta1 if optimizer_config.beta1 is not None else 0.9,
weight_decouple=optimizer_config.decoupled_decay if optimizer_config.decoupled_decay is not None else True,
fixed_decay=optimizer_config.fixed_decay if optimizer_config.fixed_decay is not None else False,
)

# AIDA Optimizer
case Optimizer.AIDA:
from pytorch_optimizer.optimizer.aida import Aida
@@ -1060,24 +944,6 @@ def create_optimizer(
eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-6,
)

# YOGI Optimizer
case Optimizer.YOGI:
from pytorch_optimizer.optimizer.yogi import Yogi
optimizer = Yogi(
params=parameters,
lr=config.learning_rate,
betas=(optimizer_config.beta1 if optimizer_config.beta1 is not None else 0.9,
optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999),
weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0.0,
weight_decouple=optimizer_config.decoupled_decay if optimizer_config.decoupled_decay is not None else True,
fixed_decay=optimizer_config.fixed_decay if optimizer_config.fixed_decay is not None else False,
r=optimizer_config.r if optimizer_config.r is not None else 0.95,
adanorm=optimizer_config.adanorm if optimizer_config.adanorm is not None else False,
adam_debias=optimizer_config.adam_debias if optimizer_config.adam_debias is not None else False,
initial_accumulator=optimizer_config.initial_accumulator if optimizer_config.initial_accumulator is not None else 1e-6,
eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-3,
)

if state_dict is not None and optimizer is not None:
if 'param_group_mapping' not in state_dict:
# Old method of loading the optimizer state. This only works if the param groups did not change.
20 changes: 1 addition & 19 deletions modules/util/enum/Optimizer.py
@@ -4,11 +4,7 @@


class Optimizer(Enum):
# Sorted by origin (BNB / torch first, then DADAPT), then by adapter name, then interleaved by variant.

# BNB Standard & 8-bit
ADAGRAD = 'ADAGRAD'
ADAGRAD_8BIT = 'ADAGRAD_8BIT'
# Sorted by origin (BNB / torch first), then by adapter name, then interleaved by variant.

# 32 bit is torch and not bnb
ADAM = 'ADAM'
@@ -48,13 +44,6 @@ class Optimizer(Enum):
SCHEDULE_FREE_ADAMW = 'SCHEDULE_FREE_ADAMW'
SCHEDULE_FREE_SGD = 'SCHEDULE_FREE_SGD'

# DADAPT
DADAPT_ADA_GRAD = 'DADAPT_ADA_GRAD'
DADAPT_ADAM = 'DADAPT_ADAM'
DADAPT_ADAN = 'DADAPT_ADAN'
DADAPT_LION = 'DADAPT_LION'
DADAPT_SGD = 'DADAPT_SGD'

# Prodigy
PRODIGY = 'PRODIGY'
PRODIGY_PLUS_SCHEDULE_FREE = 'PRODIGY_PLUS_SCHEDULE_FREE'
@@ -75,18 +64,11 @@ class Optimizer(Enum):

#Pytorch Optimizers
ADABELIEF = 'ADABELIEF'
TIGER = 'TIGER'
AIDA = 'AIDA'
YOGI = 'YOGI'

@property
def is_adaptive(self):
return self in [
self.DADAPT_SGD,
self.DADAPT_ADAM,
self.DADAPT_ADAN,
self.DADAPT_ADA_GRAD,
self.DADAPT_LION,
self.PRODIGY,
self.PRODIGY_PLUS_SCHEDULE_FREE,
self.PRODIGY_ADV,
86 changes: 0 additions & 86 deletions modules/util/optimizer_util.py
@@ -106,27 +106,6 @@ def init_model_parameters(
"stochastic_rounding": True,
"fused_back_pass": False,
},
Optimizer.ADAGRAD: {
"lr_decay": 0,
"weight_decay": 0,
"initial_accumulator_value": 0,
"eps": 1e-10,
"optim_bits": 32,
"min_8bit_size": 4096,
"percentile_clipping": 100,
"block_wise": True,
},
Optimizer.ADAGRAD_8BIT: {
"lr_decay": 0,
"weight_decay": 0,
"initial_accumulator_value": 0,
"eps": 1e-10,
"optim_bits": 8,
"min_8bit_size": 4096,
"percentile_clipping": 100,
"block_wise": True,
"fused_back_pass": False,
},
Optimizer.ADAM_8BIT: {
"beta1": 0.9,
"beta2": 0.999,
@@ -335,53 +314,6 @@ def init_model_parameters(
"schedulefree_c": 0.0,
"use_orthograd": False,
},
Optimizer.DADAPT_ADA_GRAD: {
"momentum": 0,
"log_every": 0,
"weight_decay": 0.0,
"eps": 0.0,
"d0": 1e-6,
"growth_rate": float('inf'),
},
Optimizer.DADAPT_ADAN: {
"beta1": 0.98,
"beta2": 0.92,
"beta3": 0.99,
"eps": 1e-8,
"weight_decay": 0.02,
"no_prox": False,
"log_every": 0,
"d0": 1e-6,
"growth_rate": float('inf'),
},
Optimizer.DADAPT_ADAM: {
"beta1": 0.9,
"beta2": 0.999,
"eps": 1e-8,
"weight_decay": 0,
"log_every": 0,
"decouple": False,
"use_bias_correction": False,
"d0": 1e-6,
"growth_rate": float('inf'),
"fsdp_in_use": False,
},
Optimizer.DADAPT_SGD: {
"momentum": 0.0,
"weight_decay": 0,
"log_every": 0,
"d0": 1e-6,
"growth_rate": float('inf'),
"fsdp_in_use": False,
},
Optimizer.DADAPT_LION: {
"beta1": 0.9,
"beta2": 0.999,
"weight_decay": 0.0,
"log_every": 0,
"d0": 1e-6,
"fsdp_in_use": False,
},
Optimizer.ADAM: {
"beta1": 0.9,
"beta2": 0.999,
@@ -657,12 +589,6 @@ def init_model_parameters(
"rectify": True,
"degenerated_to_sgd": True,
},
Optimizer.TIGER: {
"beta1": 0.965,
"weight_decay": 0.01,
"decoupled_decay": True,
"fixed_decay": False,
},
Optimizer.AIDA: {
"beta1": 0.9,
"beta2": 0.999,
@@ -680,16 +606,4 @@ def init_model_parameters(
"adam_debias": False,
"eps": 1e-8,
},
Optimizer.YOGI: {
"beta1": 0.9,
"beta2": 0.999,
"weight_decay": 0.0,
"decoupled_decay": True,
"fixed_decay": False,
"r": 0.95,
"adanorm": False,
"adam_debias": False,
"initial_accumulator": 1e-6,
"eps": 1e-3,
},
}
1 change: 0 additions & 1 deletion requirements-global.txt
@@ -36,7 +36,6 @@ open-clip-torch==2.32.0
-e git+https://github.com/Nerogar/mgds.git@5bbafa5#egg=mgds

# optimizers
dadaptation==3.2 # dadaptation optimizers
lion-pytorch==0.2.3 # lion optimizer
prodigyopt==1.1.2 # prodigy optimizer
schedulefree==1.4.1 # schedule-free optimizers