Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
996c462
Adds basic validation, undo, redo, debouncing, error display and prve…
O-J1 Feb 7, 2026
6ba5c9a
Adds path_entry validation, prevent overwrite toggle and remove file_…
O-J1 Feb 7, 2026
adbec55
Merge branch 'Nerogar:master' into input-validation
O-J1 Feb 7, 2026
9052e8e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 7, 2026
cf18297
Adds extra validation behaviour (range, required), required int and f…
O-J1 Feb 8, 2026
be0f2db
Merge branch 'input-validation' of https://github.com/O-J1/OneTrainer…
O-J1 Feb 8, 2026
0650422
Delete dead code found in TrainUI.py
O-J1 Feb 8, 2026
18bfe6f
Make commits require validation to pass in order to write to trainConfig
O-J1 Feb 8, 2026
ef4a81f
Merge branch 'Nerogar:master' into input-validation
O-J1 Feb 8, 2026
12781f3
Adds tooltips, autocorrection and run name functionality
O-J1 Feb 9, 2026
f1b7a4b
Tweak autocorrect, bump up debounce to 500ms
O-J1 Feb 9, 2026
678df2f
Merge branch 'Nerogar:master' into input-tooltips-and-co
O-J1 Feb 12, 2026
da6b032
Merge branch 'master' into input-tooltips-and-co
O-J1 Feb 22, 2026
5219964
Merge branch 'master' of https://github.com/Nerogar/OneTrainer into i…
O-J1 Feb 22, 2026
2757f77
minor comment change
O-J1 Feb 22, 2026
e97bdcd
Remove redundant migrations
O-J1 Feb 22, 2026
2959b58
Code simplification
O-J1 Feb 23, 2026
15fae60
Remove treating learning rates specially
O-J1 Mar 3, 2026
2a47ba9
Fix typo in config_version
O-J1 Mar 3, 2026
9ca9806
Allow certain float fields to be negative
O-J1 Mar 3, 2026
ab4b687
Add st_ctime and mtime fallback for backup datetime extraction
O-J1 Mar 3, 2026
832c579
Adjust constant name, safer friendly run names
O-J1 Mar 6, 2026
a6f82f1
Merge branch 'Nerogar:master' into input-tooltips-and-co
O-J1 Mar 6, 2026
7449b2b
Initial commit of phase 3 for always on tensorboard.
O-J1 Mar 6, 2026
715945d
Merge branch 'Nerogar:master' into tb-always-on-v2
O-J1 Mar 7, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions modules/trainer/BaseTrainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,17 @@ def _start_tensorboard(self):
if self.config.tensorboard_expose:
tensorboard_args.append("--bind_all")

self.tensorboard_subprocess = subprocess.Popen(tensorboard_args)

self.tensorboard_subprocess = subprocess.Popen(
tensorboard_args, stderr=subprocess.DEVNULL,
)

def _stop_tensorboard(self):
self.tensorboard_subprocess.kill()
if hasattr(self, 'tensorboard_subprocess') and self.tensorboard_subprocess:
try:
self.tensorboard_subprocess.terminate()
self.tensorboard_subprocess.wait(timeout=5)
except subprocess.TimeoutExpired:
self.tensorboard_subprocess.kill()
except Exception:
pass
4 changes: 2 additions & 2 deletions modules/trainer/CloudTrainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def __init__(self, config: TrainConfig, callbacks: TrainCallbacks, commands: Tra

tensorboard_log_dir = os.path.join(config.workspace_dir, "tensorboard")
os.makedirs(Path(tensorboard_log_dir).absolute(), exist_ok=True)
if config.tensorboard and not config.cloud.tensorboard_tunnel and not config.tensorboard_always_on:
if config.tensorboard_is_train_only and not config.cloud.tensorboard_tunnel:
super()._start_tensorboard()

match config.cloud.type:
Expand Down Expand Up @@ -113,7 +113,7 @@ def train(self):

def end(self):
try:
if self.config.tensorboard and not self.config.cloud.tensorboard_tunnel and not self.config.tensorboard_always_on:
if self.config.tensorboard_is_train_only and not self.config.cloud.tensorboard_tunnel:
super()._stop_tensorboard()

if self.config.cloud.delete_workspace and not self.error_caught and not self.commands.get_stop_command():
Expand Down
77 changes: 46 additions & 31 deletions modules/trainer/GenericTrainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,13 @@ def __init__(self, config: TrainConfig, callbacks: TrainCallbacks, commands: Tra
super().__init__(config, callbacks, commands)

if multi.is_master():
tensorboard_log_dir = os.path.join(config.workspace_dir, "tensorboard")
os.makedirs(Path(tensorboard_log_dir).absolute(), exist_ok=True)
self.tensorboard = SummaryWriter(os.path.join(tensorboard_log_dir, f"{config.save_filename_prefix}{get_string_timestamp()}"))
if config.tensorboard and not config.tensorboard_always_on:
if config.tensorboard_enabled:
tensorboard_log_dir = os.path.join(config.workspace_dir, "tensorboard")
os.makedirs(Path(tensorboard_log_dir).absolute(), exist_ok=True)
self.tensorboard = SummaryWriter(os.path.join(tensorboard_log_dir, f"{config.save_filename_prefix}{get_string_timestamp()}"))
else:
self.tensorboard = None
if config.tensorboard_is_train_only:
super()._start_tensorboard()

self.model = None
Expand Down Expand Up @@ -183,17 +186,20 @@ def __prune_backups(self, backups_to_keep: int):
backup_dirpath = os.path.join(self.config.workspace_dir, "backup")
if os.path.exists(backup_dirpath):
backup_directories = sorted(
[dirpath for dirpath in os.listdir(backup_dirpath) if
os.path.isdir(os.path.join(backup_dirpath, dirpath))],
[name for name in os.listdir(backup_dirpath) if
os.path.isdir(os.path.join(backup_dirpath, name))],
key=lambda n: TrainConfig._extract_backup_datetime(
os.path.join(backup_dirpath, n), n
),
reverse=True,
)

for dirpath in backup_directories[backups_to_keep:]:
dirpath = os.path.join(backup_dirpath, dirpath)
for name in backup_directories[backups_to_keep:]:
full = os.path.join(backup_dirpath, name)
try:
shutil.rmtree(dirpath)
shutil.rmtree(full)
except Exception:
print(f"Could not delete old rolling backup {dirpath}")
print(f"Could not delete old rolling backup {full}")

return

Expand Down Expand Up @@ -240,7 +246,7 @@ def __sample_loop(
)

def on_sample_default(sampler_output: ModelSamplerOutput):
if self.config.samples_to_tensorboard and sampler_output.file_type == FileType.IMAGE:
if self.config.samples_to_tensorboard and self.tensorboard is not None and sampler_output.file_type == FileType.IMAGE:
self.tensorboard.add_image(
f"sample{str(i)} - {safe_prompt}", pil_to_tensor(sampler_output.data), # noqa: B023
train_progress.global_step
Expand Down Expand Up @@ -400,18 +406,20 @@ def __validate(self, train_progress: TrainProgress):
for concept_seed, total_loss in accumulated_loss_per_concept.items():
average_loss = total_loss / concept_counts[concept_seed]

self.tensorboard.add_scalar(f"loss/validation_step/{mapping_seed_to_label[concept_seed]}",
average_loss,
train_progress.global_step)
if self.tensorboard is not None:
self.tensorboard.add_scalar(f"loss/validation_step/{mapping_seed_to_label[concept_seed]}",
average_loss,
train_progress.global_step)

if len(concept_counts) > 1:
total_loss = sum(accumulated_loss_per_concept[key] for key in concept_counts)
total_count = sum(concept_counts[key] for key in concept_counts)
total_average_loss = total_loss / total_count

self.tensorboard.add_scalar("loss/validation_step/total_average",
total_average_loss,
train_progress.global_step)
if self.tensorboard is not None:
self.tensorboard.add_scalar("loss/validation_step/total_average",
total_average_loss,
train_progress.global_step)

def __save_backup_config(self, backup_path):
config_path = os.path.join(backup_path, "onetrainer_config")
Expand All @@ -433,7 +441,8 @@ def __backup(self, train_progress: TrainProgress, print_msg: bool = True, print_

self.callbacks.on_update_status("Creating backup")

backup_name = f"{get_string_timestamp()}-backup-{train_progress.filename_string()}"
safe_prefix = path_util.safe_filename(self.config.save_filename_prefix, max_length=None)
backup_name = f"{safe_prefix}{get_string_timestamp()}-backup-{train_progress.filename_string()}"
backup_path = os.path.join(self.config.workspace_dir, "backup", backup_name)

# Special case for schedule-free optimizers.
Expand Down Expand Up @@ -480,10 +489,11 @@ def __save(self, train_progress: TrainProgress, print_msg: bool = True, print_cb

self.callbacks.on_update_status("Saving")

safe_prefix = path_util.safe_filename(self.config.save_filename_prefix, max_length=None)
save_path = os.path.join(
self.config.workspace_dir,
"save",
f"{self.config.save_filename_prefix}{get_string_timestamp()}-save-{train_progress.filename_string()}{self.config.output_model_format.file_extension()}"
f"{safe_prefix}{get_string_timestamp()}-save-{train_progress.filename_string()}{self.config.output_model_format.file_extension()}"
)
if print_msg:
print_cb("Saving " + save_path)
Expand Down Expand Up @@ -784,15 +794,17 @@ def sample_commands_fun():
has_gradient = False

if multi.is_master():
self.model_setup.report_to_tensorboard(
self.model, self.config, lr_scheduler, self.tensorboard
)
if self.tensorboard is not None:
self.model_setup.report_to_tensorboard(
self.model, self.config, lr_scheduler, self.tensorboard
)

accumulated_loss_cpu = accumulated_loss.item()
if math.isnan(accumulated_loss_cpu):
raise RuntimeError("Training loss became NaN. This may be due to invalid parameters, precision issues, or a bug in the loss computation.")

self.tensorboard.add_scalar("loss/train_step",accumulated_loss_cpu , train_progress.global_step)
if self.tensorboard is not None:
self.tensorboard.add_scalar("loss/train_step",accumulated_loss_cpu , train_progress.global_step)
ema_loss = ema_loss or accumulated_loss_cpu
ema_loss_steps += 1
ema_loss_decay = min(0.99, 1 - (1 / ema_loss_steps))
Expand All @@ -801,19 +813,21 @@ def sample_commands_fun():
'loss': accumulated_loss_cpu,
'smooth loss': ema_loss,
})
self.tensorboard.add_scalar("smooth_loss/train_step", ema_loss, train_progress.global_step)
if self.tensorboard is not None:
self.tensorboard.add_scalar("smooth_loss/train_step", ema_loss, train_progress.global_step)

accumulated_loss = 0.0
self.model_setup.after_optimizer_step(self.model, self.config, train_progress)

if self.model.ema:
assert multi.is_master()
update_step = train_progress.global_step // self.config.gradient_accumulation_steps
self.tensorboard.add_scalar(
"ema_decay",
self.model.ema.get_current_decay(update_step),
train_progress.global_step
)
if self.tensorboard is not None:
self.tensorboard.add_scalar(
"ema_decay",
self.model.ema.get_current_decay(update_step),
train_progress.global_step
)
self.model.ema.step(
self.parameters,
update_step
Expand Down Expand Up @@ -874,9 +888,10 @@ def end(self):
self.model.to(self.temp_device)

if multi.is_master():
self.tensorboard.close()
if self.tensorboard is not None:
self.tensorboard.close()

if self.config.tensorboard and not self.config.tensorboard_always_on:
if self.config.tensorboard_is_train_only:
super()._stop_tensorboard()

for handle in self.grad_hook_handles:
Expand Down
2 changes: 1 addition & 1 deletion modules/ui/MuonAdamWindow.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,6 @@ def create_adam_params_ui(self, master):
components.label(master, row, col, title, tooltip=tooltip)

if param_type != 'bool':
components.entry(master, row, col + 1, self.adam_ui_state, key)
components.entry(master, row, col + 1, self.adam_ui_state, key, allow_negative=True)
else:
components.switch(master, row, col + 1, self.adam_ui_state, key)
2 changes: 1 addition & 1 deletion modules/ui/OptimizerParamsWindow.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def create_dynamic_ui(
self.toggle_muon_adam_button()
elif type != 'bool':
components.entry(master, row, col + 1, self.optimizer_ui_state, key,
command=self.update_user_pref)
command=self.update_user_pref, allow_negative=True)
else:
components.switch(master, row, col + 1, self.optimizer_ui_state, key,
command=self.update_user_pref)
Expand Down
Loading