Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
53a62ac
initialize_with_decision_trees argument. default False (preserving pr…
gAldeia Nov 2, 2025
998ebd9
Better error message
gAldeia Nov 3, 2025
7be9710
Fixed print of invalid variables
gAldeia Nov 3, 2025
f3e2bba
Updating the auprc metric. Still need some work
gAldeia Nov 4, 2025
c7a5591
New tests
gAldeia Nov 4, 2025
6899509
Avoid generating complex equations in tests
gAldeia Nov 6, 2025
73a9aa2
Attempt to fix failing test cases of final model selection schema
gAldeia Nov 9, 2025
5826386
New attempt and improved log messages
gAldeia Nov 9, 2025
dac4b4e
Fixed conditional in final model selection
gAldeia Nov 9, 2025
cd079d3
Trying to make tests run faster
gAldeia Nov 9, 2025
4ec4c2b
Make brush work with 0 generations again
gAldeia Nov 9, 2025
18f8e81
Updated partial fit signature. It replicates the best individual by d…
gAldeia Nov 12, 2025
e5f854b
lockking nodes or weights
gAldeia Nov 12, 2025
cacf8fc
Variation is aware of locked weights. changed how I handle failed var…
gAldeia Nov 12, 2025
e0bd9bb
Sample notebook to test fixed nodes and weights
gAldeia Nov 12, 2025
d54e001
Simplifying at last gen
gAldeia Nov 26, 2025
eee7170
Casting when calculating the diff
gAldeia Nov 26, 2025
e64839a
Modified complexity to ignore Mul operation from weighted nodes
gAldeia Dec 5, 2025
47f9c22
improves printing expressions with 1.00* weights
gAldeia Dec 15, 2025
2ae370c
Fixed final model selection not working for regression
gAldeia Dec 15, 2025
313bb9f
Improved deap as an optional dependency
gAldeia Dec 17, 2025
28c59ae
Trying to make brush more deterministic
gAldeia Dec 18, 2025
83e3482
assert is_fitted is false
gAldeia Dec 24, 2025
ef9dc99
boolean operators fixed. new comparison operators
gAldeia Dec 24, 2025
65cffd8
New test for boolean operators. updated output of one guide notebook
gAldeia Dec 24, 2025
c473282
Default signature for new operators
gAldeia Jan 15, 2026
b96ade2
Improved feature type retrieval when fitting without using a pandas d…
gAldeia Feb 5, 2026
8410b1e
SplitOn defaults to ArrayXf(ArraXb, ArrayXf, ArrayXf)
gAldeia Mar 9, 2026
f1c3109
replace program (for better interface with brush-python-parser)
gAldeia Mar 10, 2026
3621a26
Also providing interface to work with json
gAldeia Mar 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
433 changes: 433 additions & 0 deletions docs/guide/fixing_nodes.ipynb

Large diffs are not rendered by default.

758 changes: 354 additions & 404 deletions docs/guide/working_with_programs.ipynb

Large diffs are not rendered by default.

52 changes: 38 additions & 14 deletions pybrush/BrushEstimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ def fit(self, X, y):
feature_types=self.feature_types_,
validation_size=self.validation_size,
shuffle_split=self.shuffle_split)

# If it failed to infer datatypes from a dict, we will retrieve it
# from the brush type sniffer, because this helps calling predict later
# (it keeps data type consistent, which is important since brush is strongly typed)
if not isinstance(X, pd.DataFrame):
self.feature_names_ = self.data_.get_feature_names()
self.feature_types_ = self.data_.get_feature_types()

# These have a default behavior to return something meaningful if
# no values are set
Expand All @@ -107,6 +114,7 @@ def fit(self, X, y):
self.parameters_.functions,
self.parameters_.weights_init)

# Creating a new brush engine
self.engine_ = None
if self.mode == 'classification':
self.engine_ = ( ClassifierEngine
Expand All @@ -133,7 +141,8 @@ def fit(self, X, y):

return self

def partial_fit(self, X, y, lock_nodes_depth=0, keep_leaves_unlocked=True):
def partial_fit(self, X, y, *,
lock_nodes_depth=0, keep_leaves_unlocked=True, keep_current_weights=False):
"""
Fit an estimator to X,y, without resetting the estimator.

Expand All @@ -147,6 +156,9 @@ def partial_fit(self, X, y, lock_nodes_depth=0, keep_leaves_unlocked=True):
The depth of the tree to lock. Default is 0.
keep_leaves_unlocked : bool, optional
Whether to skip leaves when locking nodes. Default is True.
keep_current_weights : bool, optional
Whether to keep current weights at the spot they appear, preventing
them from being changed during optimization. Default is False.
"""

if isinstance(X, pd.DataFrame):
Expand All @@ -172,9 +184,17 @@ def partial_fit(self, X, y, lock_nodes_depth=0, keep_leaves_unlocked=True):
# This updates the parameters (such as class weights)
self.engine_.params = new_parameters

self.engine_.lock_nodes(lock_nodes_depth, keep_leaves_unlocked)
# replicating the best individual
self.engine_.set_population([self.best_estimator_ for _ in range(self.pop_size)])

self.engine_.lock_nodes(lock_nodes_depth, keep_leaves_unlocked, keep_current_weights)
self.engine_.fit(new_data)
self.engine_.lock_nodes(0, False) # unlocking everything
# self.engine_.lock_nodes(0, False, False) # unlocking everything

# getting a new reference to the search space (it is not serialized, so
# this ensures that loading a model with pickle and calling either fit()
# or partial_fit() will restore the search space reference)
self.search_space_ = self.engine_.search_space

self.archive_ = self.engine_.get_archive()
self.population_ = self.engine_.get_population()
Expand Down Expand Up @@ -225,6 +245,8 @@ def _update_final_model(self, data=None):
if data is None:
data = self.validation_ #.get_validation_data()

y = np.array(data.y)

candidate = None
if self.final_model_selection == "smallest_complexity":
candidates = [p for p in self.archive_ if p.fitness.size > 1 + (4 if self.mode == 'classification' else 0)]
Expand All @@ -244,48 +266,50 @@ def _update_final_model(self, data=None):
}
loss_f = loss_f_dict[self.parameters_.scorer]

def eval(ind, data, sample=None):
def eval(ind, sample=None):
if sample is None:
sample = np.arange(len(data.y))
sample = np.arange(len(y))

if self.parameters_.scorer in ["log", "average_precision_score"]:
y_pred = np.array(ind.predict_proba(data))
else: # accuracy, balanced accuracy, or regression metrics
y_pred = np.array(ind.predict(data))

y_pred = np.nan_to_num(y_pred) # Protecting the evaluation
# y_pred = np.nan_to_num(y_pred) # Protecting the evaluation

# if user_defined, sample_weight is given by the user's custom weights. if
# support, I calculate it here. otherwise, no weight is used
if self.class_weights not in ['unbalanced', 'balanced_accuracy']:
if self.mode == 'classification' \
and (self.class_weights not in ['unbalanced'] and self.parameters_.scorer not in ['balanced_accuracy']):
sample_weight = []
if isinstance(self.class_weights, list): # using user-defined values
sample_weight = [self.class_weights[int(label)] for label in data.y]
sample_weight = [self.class_weights[int(label)] for label in y]
else: # support
# Calculate class weights by support
classes, counts = np.unique(data.y, return_counts=True)
classes, counts = np.unique(y[sample], return_counts=True)

support_weights = {
int(cls): len(data.y) / (len(classes)*count)
int(cls): len(y[sample]) / (len(classes)*count)
if count > 0 else 0.0 for cls, count in zip(classes, counts)}

sample_weight = [support_weights[int(label)] for label in data.y]
# classes and support weights are calculated with y[sample].
# sample_weight will be indexed in the function call, so we use raw y.
sample_weight = [support_weights[int(label)] for label in y]
sample_weight = np.array(sample_weight)
return loss_f(y[sample], y_pred[sample], sample_weight=sample_weight[sample])
else: # unbalanced metrics, ignoring weights
return loss_f(y[sample], y_pred[sample])

y = np.array(data.y)
np.random.seed(0)
val_samples = []
for i in range(100):
sample = np.random.randint(0, len(y), size=len(y))
val_samples.append( eval(self.best_estimator_, data, sample) )
val_samples.append( eval(self.best_estimator_, sample) )

lower_ci, upper_ci = np.quantile(val_samples,0.05), np.quantile(val_samples,0.95)

# Recalculate metric with new data
new_losses = [eval(ind, data) for ind in self.archive_]
new_losses = [eval(ind) for ind in self.archive_]

# Filter for overlapping points. Adding the best estimator to assert there is at least one sample
candidates = [(l, p) for l, p in zip(new_losses, self.archive_) if lower_ci <= l <= upper_ci]
Expand Down
10 changes: 9 additions & 1 deletion pybrush/EstimatorInterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ class EstimatorInterface():
The inexact simplification algorithm works by mapping similar expressions
to the same hash, and retrieving the simplest one when doing the
simplification of an expression.
start_from_decision_trees: boolean, optional (default: false)
Whether the initial population should only contain decision trees
(that is, trees using only SplitOn and SplitBest operators).
batch_size : float, default 1.0
Percentage of training data to sample every generation. If `1.0`, then
all data is used. Very small values can improve execution time, but
Expand Down Expand Up @@ -137,7 +140,7 @@ class EstimatorInterface():
* `"best_validation_ci"`: The less complex solution that is within
the 95% confidence interval of the best solution's validation loss, with
the confidence interval estimated with the inner validation partition of
the data passed to `fit` or `fit_partial`;
the data passed to `fit` or `partial_fit`;

If a custom function is passed, then it should have the signature
`Callable[[List[Dict], List[Dict]], Dict]]`, which means that it takes
Expand Down Expand Up @@ -203,6 +206,7 @@ def __init__(self,
validation_size: float = 0.2,
constants_simplification=True,
inexact_simplification=True,
start_from_decision_trees=False,
batch_size: float = 1.0,
sel: str = "lexicase",
surv: str = "nsga2",
Expand Down Expand Up @@ -238,6 +242,7 @@ def __init__(self,
self.objectives = objectives
self.constants_simplification=constants_simplification
self.inexact_simplification=inexact_simplification
self.start_from_decision_trees=start_from_decision_trees
self.scorer = scorer
self.shuffle_split = shuffle_split
self.initialization = initialization
Expand Down Expand Up @@ -319,6 +324,9 @@ def _wrap_parameters(self, y, **extra_kwargs):
params.max_stall = self.max_stall
params.max_time = self.max_time

# Initial population
params.start_from_decision_trees = self.start_from_decision_trees

# Sampling probabilities
params.weights_init = self.weights_init
params.bandit = self.bandit
Expand Down
19 changes: 17 additions & 2 deletions pybrush/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,21 @@
from ._brush import RegressorSelector, ClassifierSelector, MultiClassifierSelector
from ._brush import RegressorVariator, ClassifierVariator, MultiClassifierVariator

# full estimator implementations --------------------
from pybrush.DeapEstimator import DeapClassifier, DeapRegressor
from pybrush.BrushEstimator import BrushClassifier, BrushRegressor

# deap api
try:
from pybrush import deap_api
except ImportError:
import warnings

class _DeapAPIWarning:
def __getattr__(self, name):
warnings.warn(
"deap_api could not be imported. Please install required dependencies.",
ImportWarning,
stacklevel=2
)
raise AttributeError(f"deap_api is not available")

deap_api = _DeapAPIWarning()
21 changes: 0 additions & 21 deletions pybrush/_versionstr copy.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from pandas.api.types import is_float_dtype, is_bool_dtype, is_integer_dtype

from pybrush.EstimatorInterface import EstimatorInterface
from pybrush.deap_api import nsga2
from pybrush.deap_api.nsga2 import nsga2
from pybrush import individual
from pybrush import RegressorEvaluator, ClassifierEvaluator, MultiClassifierEvaluator
from pybrush import RegressorSelector, ClassifierSelector, MultiClassifierSelector
Expand Down
3 changes: 2 additions & 1 deletion pybrush/deap_api/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from pybrush.deap_api.nsga2 import nsga2
from pybrush.deap_api.nsga2 import nsga2
from pybrush.deap_api.DeapEstimator import DeapClassifier, DeapRegressor
2 changes: 0 additions & 2 deletions src/bandit/dummy.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
namespace Brush {
namespace MAB {

// TODO: rename dummy to static or fixed

class DummyBandit : public BanditOperator
{
public:
Expand Down
10 changes: 7 additions & 3 deletions src/bindings/bind_dataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,12 @@ void bind_dataset(py::module & m)
py::arg("ref_dataset"),
py::arg("feature_names")
)

.def_readwrite("y", &br::Data::Dataset::y)

.def("get_feature_types", &br::Data::Dataset::get_feature_types)
.def("get_feature_names", [](const br::Data::Dataset &d) {return d.feature_names; }) // wrapping it into a function to keep consistent with get_feature_types. brush feature types are not native to python, so that's why we need that function to cast it to something python can understand.

.def_readwrite("y", &br::Data::Dataset::y) // TODO: should this be read only?

// .def_readwrite("features", &br::Data::Dataset::features)
.def("get_n_samples", &br::Data::Dataset::get_n_samples)
.def("get_n_features", &br::Data::Dataset::get_n_features)
Expand All @@ -76,7 +80,7 @@ void bind_dataset(py::module & m)
.def("get_batch_size", &br::Data::Dataset::get_batch_size)
.def("set_batch_size", &br::Data::Dataset::set_batch_size)
.def("split", &br::Data::Dataset::split)
.def("get_X", &br::Data::Dataset::get_X)
.def("get_X", &br::Data::Dataset::get_X)
;

m.def("read_csv", &br::Data::read_csv, py::arg("path"), py::arg("target"), py::arg("sep")=',');
Expand Down
1 change: 1 addition & 0 deletions src/bindings/bind_engines.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ void bind_engine(py::module& m, string name)
&T::lock_nodes,
py::arg("end_depth") = 0,
py::arg("keep_leaves_unlocked") = true,
py::arg("keep_current_weights") = false,
stream_redirect()
)
.def(py::pickle(
Expand Down
8 changes: 8 additions & 0 deletions src/bindings/bind_individuals.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ void bind_individual(py::module& m, string name)
.def("fit",
static_cast<Class &(Class::*)(const Ref<const ArrayXXf> &X, const Ref<const ArrayXf> &y)>(&Class::fit),
"fit from X,y data")
.def("replace_program",
static_cast<Class &(Class::*)(const br::Program<PT>&)>(&Class::replace_program),
py::arg("new_program"),
"Replace the current program with a new program, invalidating fitness")
.def("replace_program",
static_cast<Class &(Class::*)(const json&)>(&Class::replace_program),
py::arg("json_program"),
"Replace the current program from a JSON representation, invalidating fitness")
.def("predict",
static_cast<RetType (Class::*)(const Dataset &d)>(&Class::predict),
"predict from Dataset object")
Expand Down
1 change: 1 addition & 0 deletions src/bindings/bind_params.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ void bind_params(py::module& m)
.def_property("num_islands", &Brush::Parameters::get_num_islands, &Brush::Parameters::set_num_islands)
.def_property("constants_simplification", &Brush::Parameters::get_constants_simplification, &Brush::Parameters::set_constants_simplification)
.def_property("inexact_simplification", &Brush::Parameters::get_inexact_simplification, &Brush::Parameters::set_inexact_simplification)
.def_property("start_from_decision_trees", &Brush::Parameters::get_start_from_decision_trees, &Brush::Parameters::set_start_from_decision_trees)
.def("set_n_classes", &Brush::Parameters::set_n_classes)
.def("set_class_weights", &Brush::Parameters::set_class_weights)
.def("set_class_weights_type", &Brush::Parameters::set_class_weights_type)
Expand Down
9 changes: 9 additions & 0 deletions src/bindings/bind_programs.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ void bind_program(py::module& m, string name)
.def("fit",
static_cast<T &(T::*)(const Ref<const ArrayXXf> &X, const Ref<const ArrayXf> &y)>(&T::fit),
"fit from X,y data")
.def("replace_program",
static_cast<T &(T::*)(const T&)>(&T::replace_program),
py::arg("new_program"),
"Replace the current program with a new program, invalidating fitness")
.def("replace_program",
static_cast<T &(T::*)(const json&)>(&T::replace_program),
py::arg("json_program"),
"Replace the current program from a JSON representation, invalidating fitness")
.def("predict",
static_cast<RetType (T::*)(const Dataset &d)>(&T::predict),
"predict from Dataset object")
Expand All @@ -40,6 +48,7 @@ void bind_program(py::module& m, string name)
&T::lock_nodes,
py::arg("end_depth") = 0,
py::arg("keep_leaves_unlocked") = true,
py::arg("keep_current_weights") = false,
stream_redirect()
)
.def("get_model",
Expand Down
Loading
Loading