Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
996c462
Adds basic validation, undo, redo, debouncing, error display and prve…
O-J1 Feb 7, 2026
6ba5c9a
Adds path_entry validation, prevent overwrite toggle and remove file_…
O-J1 Feb 7, 2026
adbec55
Merge branch 'Nerogar:master' into input-validation
O-J1 Feb 7, 2026
9052e8e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 7, 2026
cf18297
Adds extra validation behaviour (range, required), required int and f…
O-J1 Feb 8, 2026
be0f2db
Merge branch 'input-validation' of https://github.com/O-J1/OneTrainer…
O-J1 Feb 8, 2026
0650422
Delete dead code found in TrainUI.py
O-J1 Feb 8, 2026
18bfe6f
Make commits require validation to pass in order to write to trainConfig
O-J1 Feb 8, 2026
ef4a81f
Merge branch 'Nerogar:master' into input-validation
O-J1 Feb 8, 2026
12781f3
Adds tooltips, autocorrection and run name functionality
O-J1 Feb 9, 2026
f1b7a4b
Tweak autocorrect, bump up debounce to 500ms
O-J1 Feb 9, 2026
678df2f
Merge branch 'Nerogar:master' into input-tooltips-and-co
O-J1 Feb 12, 2026
da6b032
Merge branch 'master' into input-tooltips-and-co
O-J1 Feb 22, 2026
5219964
Merge branch 'master' of https://github.com/Nerogar/OneTrainer into i…
O-J1 Feb 22, 2026
2757f77
minor comment change
O-J1 Feb 22, 2026
e97bdcd
Remove redundant migrations
O-J1 Feb 22, 2026
2959b58
Code simplification
O-J1 Feb 23, 2026
15fae60
Remove treating learning rates specially
O-J1 Mar 3, 2026
2a47ba9
Fix typo in config_version
O-J1 Mar 3, 2026
9ca9806
Allow certain float fields to be negative
O-J1 Mar 3, 2026
ab4b687
Add st_ctime and mtime fallback for backup datetime extraction
O-J1 Mar 3, 2026
832c579
Adjust constant name, safer friendly run names
O-J1 Mar 6, 2026
a6f82f1
Merge branch 'Nerogar:master' into input-tooltips-and-co
O-J1 Mar 6, 2026
7449b2b
Initial commit of phase 3 for always on tensorboard.
O-J1 Mar 6, 2026
715945d
Merge branch 'Nerogar:master' into tb-always-on-v2
O-J1 Mar 7, 2026
d0e6489
Initial commit of phase 4 - Drag-n-Drop
O-J1 Mar 7, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions modules/trainer/BaseTrainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,17 @@ def _start_tensorboard(self):
if self.config.tensorboard_expose:
tensorboard_args.append("--bind_all")

self.tensorboard_subprocess = subprocess.Popen(tensorboard_args)

self.tensorboard_subprocess = subprocess.Popen(
tensorboard_args, stderr=subprocess.DEVNULL,
)

def _stop_tensorboard(self):
self.tensorboard_subprocess.kill()
if hasattr(self, 'tensorboard_subprocess') and self.tensorboard_subprocess:
try:
self.tensorboard_subprocess.terminate()
self.tensorboard_subprocess.wait(timeout=5)
except subprocess.TimeoutExpired:
self.tensorboard_subprocess.kill()
except Exception:
pass
4 changes: 2 additions & 2 deletions modules/trainer/CloudTrainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def __init__(self, config: TrainConfig, callbacks: TrainCallbacks, commands: Tra

tensorboard_log_dir = os.path.join(config.workspace_dir, "tensorboard")
os.makedirs(Path(tensorboard_log_dir).absolute(), exist_ok=True)
if config.tensorboard and not config.cloud.tensorboard_tunnel and not config.tensorboard_always_on:
if config.tensorboard_is_train_only and not config.cloud.tensorboard_tunnel:
super()._start_tensorboard()

match config.cloud.type:
Expand Down Expand Up @@ -113,7 +113,7 @@ def train(self):

def end(self):
try:
if self.config.tensorboard and not self.config.cloud.tensorboard_tunnel and not self.config.tensorboard_always_on:
if self.config.tensorboard_is_train_only and not self.config.cloud.tensorboard_tunnel:
super()._stop_tensorboard()

if self.config.cloud.delete_workspace and not self.error_caught and not self.commands.get_stop_command():
Expand Down
77 changes: 46 additions & 31 deletions modules/trainer/GenericTrainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,13 @@ def __init__(self, config: TrainConfig, callbacks: TrainCallbacks, commands: Tra
super().__init__(config, callbacks, commands)

if multi.is_master():
tensorboard_log_dir = os.path.join(config.workspace_dir, "tensorboard")
os.makedirs(Path(tensorboard_log_dir).absolute(), exist_ok=True)
self.tensorboard = SummaryWriter(os.path.join(tensorboard_log_dir, f"{config.save_filename_prefix}{get_string_timestamp()}"))
if config.tensorboard and not config.tensorboard_always_on:
if config.tensorboard_enabled:
tensorboard_log_dir = os.path.join(config.workspace_dir, "tensorboard")
os.makedirs(Path(tensorboard_log_dir).absolute(), exist_ok=True)
self.tensorboard = SummaryWriter(os.path.join(tensorboard_log_dir, f"{config.save_filename_prefix}{get_string_timestamp()}"))
else:
self.tensorboard = None
if config.tensorboard_is_train_only:
super()._start_tensorboard()

self.model = None
Expand Down Expand Up @@ -183,17 +186,20 @@ def __prune_backups(self, backups_to_keep: int):
backup_dirpath = os.path.join(self.config.workspace_dir, "backup")
if os.path.exists(backup_dirpath):
backup_directories = sorted(
[dirpath for dirpath in os.listdir(backup_dirpath) if
os.path.isdir(os.path.join(backup_dirpath, dirpath))],
[name for name in os.listdir(backup_dirpath) if
os.path.isdir(os.path.join(backup_dirpath, name))],
key=lambda n: TrainConfig._extract_backup_datetime(
os.path.join(backup_dirpath, n), n
),
reverse=True,
)

for dirpath in backup_directories[backups_to_keep:]:
dirpath = os.path.join(backup_dirpath, dirpath)
for name in backup_directories[backups_to_keep:]:
full = os.path.join(backup_dirpath, name)
try:
shutil.rmtree(dirpath)
shutil.rmtree(full)
except Exception:
print(f"Could not delete old rolling backup {dirpath}")
print(f"Could not delete old rolling backup {full}")

return

Expand Down Expand Up @@ -240,7 +246,7 @@ def __sample_loop(
)

def on_sample_default(sampler_output: ModelSamplerOutput):
if self.config.samples_to_tensorboard and sampler_output.file_type == FileType.IMAGE:
if self.config.samples_to_tensorboard and self.tensorboard is not None and sampler_output.file_type == FileType.IMAGE:
self.tensorboard.add_image(
f"sample{str(i)} - {safe_prompt}", pil_to_tensor(sampler_output.data), # noqa: B023
train_progress.global_step
Expand Down Expand Up @@ -400,18 +406,20 @@ def __validate(self, train_progress: TrainProgress):
for concept_seed, total_loss in accumulated_loss_per_concept.items():
average_loss = total_loss / concept_counts[concept_seed]

self.tensorboard.add_scalar(f"loss/validation_step/{mapping_seed_to_label[concept_seed]}",
average_loss,
train_progress.global_step)
if self.tensorboard is not None:
self.tensorboard.add_scalar(f"loss/validation_step/{mapping_seed_to_label[concept_seed]}",
average_loss,
train_progress.global_step)

if len(concept_counts) > 1:
total_loss = sum(accumulated_loss_per_concept[key] for key in concept_counts)
total_count = sum(concept_counts[key] for key in concept_counts)
total_average_loss = total_loss / total_count

self.tensorboard.add_scalar("loss/validation_step/total_average",
total_average_loss,
train_progress.global_step)
if self.tensorboard is not None:
self.tensorboard.add_scalar("loss/validation_step/total_average",
total_average_loss,
train_progress.global_step)

def __save_backup_config(self, backup_path):
config_path = os.path.join(backup_path, "onetrainer_config")
Expand All @@ -433,7 +441,8 @@ def __backup(self, train_progress: TrainProgress, print_msg: bool = True, print_

self.callbacks.on_update_status("Creating backup")

backup_name = f"{get_string_timestamp()}-backup-{train_progress.filename_string()}"
safe_prefix = path_util.safe_filename(self.config.save_filename_prefix, max_length=None)
backup_name = f"{safe_prefix}{get_string_timestamp()}-backup-{train_progress.filename_string()}"
backup_path = os.path.join(self.config.workspace_dir, "backup", backup_name)

# Special case for schedule-free optimizers.
Expand Down Expand Up @@ -480,10 +489,11 @@ def __save(self, train_progress: TrainProgress, print_msg: bool = True, print_cb

self.callbacks.on_update_status("Saving")

safe_prefix = path_util.safe_filename(self.config.save_filename_prefix, max_length=None)
save_path = os.path.join(
self.config.workspace_dir,
"save",
f"{self.config.save_filename_prefix}{get_string_timestamp()}-save-{train_progress.filename_string()}{self.config.output_model_format.file_extension()}"
f"{safe_prefix}{get_string_timestamp()}-save-{train_progress.filename_string()}{self.config.output_model_format.file_extension()}"
)
if print_msg:
print_cb("Saving " + save_path)
Expand Down Expand Up @@ -784,15 +794,17 @@ def sample_commands_fun():
has_gradient = False

if multi.is_master():
self.model_setup.report_to_tensorboard(
self.model, self.config, lr_scheduler, self.tensorboard
)
if self.tensorboard is not None:
self.model_setup.report_to_tensorboard(
self.model, self.config, lr_scheduler, self.tensorboard
)

accumulated_loss_cpu = accumulated_loss.item()
if math.isnan(accumulated_loss_cpu):
raise RuntimeError("Training loss became NaN. This may be due to invalid parameters, precision issues, or a bug in the loss computation.")

self.tensorboard.add_scalar("loss/train_step",accumulated_loss_cpu , train_progress.global_step)
if self.tensorboard is not None:
self.tensorboard.add_scalar("loss/train_step",accumulated_loss_cpu , train_progress.global_step)
ema_loss = ema_loss or accumulated_loss_cpu
ema_loss_steps += 1
ema_loss_decay = min(0.99, 1 - (1 / ema_loss_steps))
Expand All @@ -801,19 +813,21 @@ def sample_commands_fun():
'loss': accumulated_loss_cpu,
'smooth loss': ema_loss,
})
self.tensorboard.add_scalar("smooth_loss/train_step", ema_loss, train_progress.global_step)
if self.tensorboard is not None:
self.tensorboard.add_scalar("smooth_loss/train_step", ema_loss, train_progress.global_step)

accumulated_loss = 0.0
self.model_setup.after_optimizer_step(self.model, self.config, train_progress)

if self.model.ema:
assert multi.is_master()
update_step = train_progress.global_step // self.config.gradient_accumulation_steps
self.tensorboard.add_scalar(
"ema_decay",
self.model.ema.get_current_decay(update_step),
train_progress.global_step
)
if self.tensorboard is not None:
self.tensorboard.add_scalar(
"ema_decay",
self.model.ema.get_current_decay(update_step),
train_progress.global_step
)
self.model.ema.step(
self.parameters,
update_step
Expand Down Expand Up @@ -874,9 +888,10 @@ def end(self):
self.model.to(self.temp_device)

if multi.is_master():
self.tensorboard.close()
if self.tensorboard is not None:
self.tensorboard.close()

if self.config.tensorboard and not self.config.tensorboard_always_on:
if self.config.tensorboard_is_train_only:
super()._stop_tensorboard()

for handle in self.grad_hook_handles:
Expand Down
33 changes: 33 additions & 0 deletions modules/ui/ConceptTab.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import pathlib
import platform
from tkinter import BooleanVar, StringVar

from modules.ui.ConceptWindow import ConceptWindow
Expand All @@ -10,6 +11,7 @@
from modules.util.enum.ConceptType import ConceptType
from modules.util.image_util import load_image
from modules.util.ui import components
from modules.util.ui.dnd import bind_file_drop
from modules.util.ui.UIState import UIState
from modules.util.ui.validation import DebounceTimer

Expand Down Expand Up @@ -43,6 +45,37 @@ def __init__(self, master, train_config: TrainConfig, ui_state: UIState):
# wrap toolbar if too narrow
self.top_frame.bind('<Configure>', lambda e: self._maybe_reposition_toolbar(e.width))

def _create_element_list(self, **filters):
    # Rebuild the element list via the base class, then (re)attach the
    # drag-and-drop handlers — the list widget may have been recreated,
    # which would drop any previously bound handlers.
    super()._create_element_list(**filters)
    self._bind_drop_targets()

def _get_empty_state_text(self) -> str:
if platform.system() == "Linux":
return "Your concepts are empty, click the Add button to add a concept"
return "Your concepts are empty, either drag n drop a folder or click the Add button to add a concept"

def _bind_drop_targets(self):
    """Enable creating concepts by dropping folders onto the tab.

    Binds a file-drop handler to both the element list and the toolbar
    frame. Each dropped directory becomes a new concept whose path is the
    dropped folder; the concept name defaults to the folder's basename
    when the freshly created element has none. Dropped non-directory
    paths are ignored.
    """
    def _on_drop(paths: list[str]):
        for dropped_path in paths:
            # only directories can become concepts; skip plain files
            if not os.path.isdir(dropped_path):
                continue

            concept = self.create_new_element()
            concept.path = dropped_path

            if not concept.name:
                # normpath strips a trailing separator so basename is non-empty
                concept.name = os.path.basename(os.path.normpath(dropped_path))

            self._append_existing_element(concept)

    # The _dnd_bound flag prevents double-binding when the list is rebuilt;
    # the flag is only set when bind_file_drop returns truthy (presumably
    # indicating the platform supports drag-and-drop — confirm in dnd module).
    if self.element_list is not None and not getattr(self.element_list, "_dnd_bound", False):
        if bind_file_drop(self.element_list, _on_drop):
            self.element_list._dnd_bound = True

    if self.top_frame is not None and not getattr(self.top_frame, "_dnd_bound", False):
        if bind_file_drop(self.top_frame, _on_drop):
            self.top_frame._dnd_bound = True

def create_widget(self, master, element, i, open_command, remove_command, clone_command, save_command):
    # Factory hook used by the base list class to build the row widget
    # for one concept element.
    return ConceptWidget(master, element, i, open_command, remove_command, clone_command, save_command)

Expand Down
55 changes: 37 additions & 18 deletions modules/ui/ConfigList.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(
self.show_toggle_button = show_toggle_button
self.is_opening_window = False
self._is_current_item_enabled = False
self.empty_state_label = None

self.master.grid_rowconfigure(0, weight=0)
self.master.grid_rowconfigure(1, weight=1)
Expand Down Expand Up @@ -107,6 +108,19 @@ def open_element_window(self, i, ui_state) -> ctk.CTkToplevel:
def _refresh_show_disabled_text(self):
    # No-op hook; presumably overridden by subclasses that expose the
    # show-disabled toggle (see show_toggle_button) — confirm in subclasses.
    return

def _get_empty_state_text(self) -> str:
    # Hook for subclasses: return a non-empty string to show a centered
    # placeholder label when the list has no elements. The default empty
    # string disables the placeholder entirely.
    return ""

def _update_empty_state_visibility(self):
if self.empty_state_label is None:
return

should_show = len(self.current_config) == 0
if should_show:
self.empty_state_label.place(relx=0.5, rely=0.5, anchor="center")
else:
self.empty_state_label.place_forget()

def _reset_filters(self): # pragma: no cover - default noop
search_var = getattr(self, 'search_var', None)
filter_var = getattr(self, 'filter_var', None)
Expand Down Expand Up @@ -183,6 +197,19 @@ def _initialize_all_widgets(self):
if self.is_full_width:
self.element_list.grid_columnconfigure(0, weight=1)

empty_state_text = self._get_empty_state_text()
if empty_state_text:
label_parent = getattr(self.element_list, "_parent_canvas", self.element_list)
self.empty_state_label = ctk.CTkLabel(
label_parent,
text=empty_state_text,
justify="center",
anchor="center",
wraplength=560,
)
else:
self.empty_state_label = None

for i, element in enumerate(self.current_config):
widget = self.create_widget(
self.element_list, element, i,
Expand All @@ -193,9 +220,13 @@ def _initialize_all_widgets(self):
)
self.widgets.append(widget)

self._update_empty_state_visibility()

def _update_widget_visibility(self):
visible_index = 0

self._update_empty_state_visibility()

for i, widget in enumerate(self.widgets):
if i < len(self.current_config):
element = self.current_config[i]
Expand Down Expand Up @@ -232,12 +263,15 @@ def __add_config(self):

def __add_element(self):
new_element = self.create_new_element()
self.current_config.append(new_element)
self._append_existing_element(new_element)

def _append_existing_element(self, element: BaseConfig):
self.current_config.append(element)
# incremental insertion if widgets already initialized, else fall back to full rebuild
if self.widgets_initialized and self.element_list is not None:
i = len(self.current_config) - 1
widget = self.create_widget(
self.element_list, new_element, i,
self.element_list, element, i,
self.__open_element_window,
self.__remove_element,
self.__clone_element,
Expand All @@ -255,22 +289,7 @@ def __clone_element(self, clone_i, modify_element_fun=None):

if modify_element_fun is not None:
new_element = modify_element_fun(new_element)
self.current_config.append(new_element)
if self.widgets_initialized and self.element_list is not None:
i = len(self.current_config) - 1
widget = self.create_widget(
self.element_list, new_element, i,
self.__open_element_window,
self.__remove_element,
self.__clone_element,
self.save_current_config
)
self.widgets.append(widget)
self._update_widget_visibility()
else:
self.widgets_initialized = False
self._create_element_list()
self.save_current_config()
self._append_existing_element(new_element)

def __remove_element(self, remove_i):
self.current_config.pop(remove_i)
Expand Down
2 changes: 1 addition & 1 deletion modules/ui/MuonAdamWindow.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,6 @@ def create_adam_params_ui(self, master):
components.label(master, row, col, title, tooltip=tooltip)

if param_type != 'bool':
components.entry(master, row, col + 1, self.adam_ui_state, key)
components.entry(master, row, col + 1, self.adam_ui_state, key, allow_negative=True)
else:
components.switch(master, row, col + 1, self.adam_ui_state, key)
2 changes: 1 addition & 1 deletion modules/ui/OptimizerParamsWindow.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def create_dynamic_ui(
self.toggle_muon_adam_button()
elif type != 'bool':
components.entry(master, row, col + 1, self.optimizer_ui_state, key,
command=self.update_user_pref)
command=self.update_user_pref, allow_negative=True)
else:
components.switch(master, row, col + 1, self.optimizer_ui_state, key,
command=self.update_user_pref)
Expand Down
Loading