Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
53a62ac
initialize_with_decision_trees argument. default False (preserving pr…
gAldeia Nov 2, 2025
998ebd9
Better error message
gAldeia Nov 3, 2025
7be9710
Fixed print of invalid variables
gAldeia Nov 3, 2025
f3e2bba
Updating the auprc metric. Still need some work
gAldeia Nov 4, 2025
c7a5591
New tests
gAldeia Nov 4, 2025
6899509
Avoid generating complex equations in tests
gAldeia Nov 6, 2025
73a9aa2
Attempt to fix failing test cases of final model selection schema
gAldeia Nov 9, 2025
5826386
New attempt and improved log messages
gAldeia Nov 9, 2025
dac4b4e
Fixed conditional in final model selection
gAldeia Nov 9, 2025
cd079d3
Trying to make tests run faster
gAldeia Nov 9, 2025
4ec4c2b
Make brush work with 0 generations again
gAldeia Nov 9, 2025
18f8e81
Updated partial fit signature. It replicates the best individual by d…
gAldeia Nov 12, 2025
e5f854b
lockking nodes or weights
gAldeia Nov 12, 2025
cacf8fc
Variation is aware of locked weights. changed how I handle failed var…
gAldeia Nov 12, 2025
e0bd9bb
Sample notebook to test fixed nodes and weights
gAldeia Nov 12, 2025
d54e001
Simplifying at last gen
gAldeia Nov 26, 2025
eee7170
Casting when calculating the diff
gAldeia Nov 26, 2025
e64839a
Modified complexity to ignore Mul operation from weighted nodes
gAldeia Dec 5, 2025
47f9c22
improves printing expressions with 1.00* weights
gAldeia Dec 15, 2025
2ae370c
Fixed final model selection not working for regression
gAldeia Dec 15, 2025
313bb9f
Improved deap as an optional dependency
gAldeia Dec 17, 2025
28c59ae
Trying to make brush more deterministic
gAldeia Dec 18, 2025
83e3482
assert is_fitted is false
gAldeia Dec 24, 2025
ef9dc99
boolean operators fixed. new comparison operators
gAldeia Dec 24, 2025
65cffd8
New test for boolean operators. updated output of one guide notebook
gAldeia Dec 24, 2025
c473282
Default signature for new operators
gAldeia Jan 15, 2026
b96ade2
Improved feature type retrieval when fitting without using a pandas d…
gAldeia Feb 5, 2026
8410b1e
SplitOn defaults to ArrayXf(ArraXb, ArrayXf, ArrayXf)
gAldeia Mar 9, 2026
f1c3109
replace program (for better interface with brush-python-parser)
gAldeia Mar 10, 2026
3621a26
Also providing interface to work with json
gAldeia Mar 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
433 changes: 433 additions & 0 deletions docs/guide/fixing_nodes.ipynb

Large diffs are not rendered by default.

758 changes: 354 additions & 404 deletions docs/guide/working_with_programs.ipynb

Large diffs are not rendered by default.

52 changes: 38 additions & 14 deletions pybrush/BrushEstimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ def fit(self, X, y):
feature_types=self.feature_types_,
validation_size=self.validation_size,
shuffle_split=self.shuffle_split)

# If it failed to infer datatypes from a dict, we will retrieve it
# from the brush type sniffer, because this helps calling predict later
# (it keeps data type consistent, which is important since brush is strongly typed)
if not isinstance(X, pd.DataFrame):
self.feature_names_ = self.data_.get_feature_names()
self.feature_types_ = self.data_.get_feature_types()

# These have a default behavior to return something meaningful if
# no values are set
Expand All @@ -107,6 +114,7 @@ def fit(self, X, y):
self.parameters_.functions,
self.parameters_.weights_init)

# Creating a new brush engine
self.engine_ = None
if self.mode == 'classification':
self.engine_ = ( ClassifierEngine
Expand All @@ -133,7 +141,8 @@ def fit(self, X, y):

return self

def partial_fit(self, X, y, lock_nodes_depth=0, keep_leaves_unlocked=True):
def partial_fit(self, X, y, *,
lock_nodes_depth=0, keep_leaves_unlocked=True, keep_current_weights=False):
"""
Fit an estimator to X,y, without resetting the estimator.

Expand All @@ -147,6 +156,9 @@ def partial_fit(self, X, y, lock_nodes_depth=0, keep_leaves_unlocked=True):
The depth of the tree to lock. Default is 0.
keep_leaves_unlocked : bool, optional
Whether to skip leaves when locking nodes. Default is True.
keep_current_weights : bool, optional
Whether to keep current weights at the spot they appear, preventing
them from being changed during optimization. Default is False.
"""

if isinstance(X, pd.DataFrame):
Expand All @@ -172,9 +184,17 @@ def partial_fit(self, X, y, lock_nodes_depth=0, keep_leaves_unlocked=True):
# This updates the parameters (such as class weights)
self.engine_.params = new_parameters

self.engine_.lock_nodes(lock_nodes_depth, keep_leaves_unlocked)
# replicating the best individual
self.engine_.set_population([self.best_estimator_ for _ in range(self.pop_size)])

self.engine_.lock_nodes(lock_nodes_depth, keep_leaves_unlocked, keep_current_weights)
self.engine_.fit(new_data)
self.engine_.lock_nodes(0, False) # unlocking everything
# self.engine_.lock_nodes(0, False, False) # unlocking everything

# getting a new reference to the search space (it is not serialized, so
# this ensures that loading a model with pickle and calling either fit()
# or partial_fit() will restore the search space reference)
self.search_space_ = self.engine_.search_space

self.archive_ = self.engine_.get_archive()
self.population_ = self.engine_.get_population()
Expand Down Expand Up @@ -225,6 +245,8 @@ def _update_final_model(self, data=None):
if data is None:
data = self.validation_ #.get_validation_data()

y = np.array(data.y)

candidate = None
if self.final_model_selection == "smallest_complexity":
candidates = [p for p in self.archive_ if p.fitness.size > 1 + (4 if self.mode == 'classification' else 0)]
Expand All @@ -244,48 +266,50 @@ def _update_final_model(self, data=None):
}
loss_f = loss_f_dict[self.parameters_.scorer]

def eval(ind, data, sample=None):
def eval(ind, sample=None):
if sample is None:
sample = np.arange(len(data.y))
sample = np.arange(len(y))

if self.parameters_.scorer in ["log", "average_precision_score"]:
y_pred = np.array(ind.predict_proba(data))
else: # accuracy, balanced accuracy, or regression metrics
y_pred = np.array(ind.predict(data))

y_pred = np.nan_to_num(y_pred) # Protecting the evaluation
# y_pred = np.nan_to_num(y_pred) # Protecting the evaluation

# if user_defined, sample_weight is given by the user's custom weights. if
# support, I calculate it here. otherwise, no weight is used
if self.class_weights not in ['unbalanced', 'balanced_accuracy']:
if self.mode == 'classification' \
and (self.class_weights not in ['unbalanced'] and self.parameters_.scorer not in ['balanced_accuracy']):
sample_weight = []
if isinstance(self.class_weights, list): # using user-defined values
sample_weight = [self.class_weights[int(label)] for label in data.y]
sample_weight = [self.class_weights[int(label)] for label in y]
else: # support
# Calculate class weights by support
classes, counts = np.unique(data.y, return_counts=True)
classes, counts = np.unique(y[sample], return_counts=True)

support_weights = {
int(cls): len(data.y) / (len(classes)*count)
int(cls): len(y[sample]) / (len(classes)*count)
if count > 0 else 0.0 for cls, count in zip(classes, counts)}

sample_weight = [support_weights[int(label)] for label in data.y]
# classes and support weights are calculated with y[sample].
# sample_weight will be indexed in the function call, so we use raw y.
sample_weight = [support_weights[int(label)] for label in y]
sample_weight = np.array(sample_weight)
return loss_f(y[sample], y_pred[sample], sample_weight=sample_weight[sample])
else: # unbalanced metrics, ignoring weights
return loss_f(y[sample], y_pred[sample])

y = np.array(data.y)
np.random.seed(0)
val_samples = []
for i in range(100):
sample = np.random.randint(0, len(y), size=len(y))
val_samples.append( eval(self.best_estimator_, data, sample) )
val_samples.append( eval(self.best_estimator_, sample) )

lower_ci, upper_ci = np.quantile(val_samples,0.05), np.quantile(val_samples,0.95)

# Recalculate metric with new data
new_losses = [eval(ind, data) for ind in self.archive_]
new_losses = [eval(ind) for ind in self.archive_]

# Filter for overlapping points. Adding the best estimator to assert there is at least one sample
candidates = [(l, p) for l, p in zip(new_losses, self.archive_) if lower_ci <= l <= upper_ci]
Expand Down
10 changes: 9 additions & 1 deletion pybrush/EstimatorInterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ class EstimatorInterface():
The inexact simplification algorithm works by mapping similar expressions
to the same hash, and retrieving the simplest one when doing the
simplification of an expression.
start_from_decision_trees: boolean, optional (default: false)
Whether the initial population should only contain decision trees
(that is, trees using only SplitOn and SplitBest operators).
batch_size : float, default 1.0
Percentage of training data to sample every generation. If `1.0`, then
all data is used. Very small values can improve execution time, but
Expand Down Expand Up @@ -137,7 +140,7 @@ class EstimatorInterface():
* `"best_validation_ci"`: The less complex solution that is within
the 95% confidence interval of the best solution's validation loss, with
the confidence interval estimated with the inner validation partition of
the data passed to `fit` or `fit_partial`;
the data passed to `fit` or `partial_fit`;

If a custom function is passed, then it should have the signature
`Callable[[List[Dict], List[Dict]], Dict]]`, which means that it takes
Expand Down Expand Up @@ -203,6 +206,7 @@ def __init__(self,
validation_size: float = 0.2,
constants_simplification=True,
inexact_simplification=True,
start_from_decision_trees=False,
batch_size: float = 1.0,
sel: str = "lexicase",
surv: str = "nsga2",
Expand Down Expand Up @@ -238,6 +242,7 @@ def __init__(self,
self.objectives = objectives
self.constants_simplification=constants_simplification
self.inexact_simplification=inexact_simplification
self.start_from_decision_trees=start_from_decision_trees
self.scorer = scorer
self.shuffle_split = shuffle_split
self.initialization = initialization
Expand Down Expand Up @@ -319,6 +324,9 @@ def _wrap_parameters(self, y, **extra_kwargs):
params.max_stall = self.max_stall
params.max_time = self.max_time

# Initial population
params.start_from_decision_trees = self.start_from_decision_trees

# Sampling probabilities
params.weights_init = self.weights_init
params.bandit = self.bandit
Expand Down
19 changes: 17 additions & 2 deletions pybrush/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,21 @@
from ._brush import RegressorSelector, ClassifierSelector, MultiClassifierSelector
from ._brush import RegressorVariator, ClassifierVariator, MultiClassifierVariator

# full estimator implementations --------------------
from pybrush.DeapEstimator import DeapClassifier, DeapRegressor
from pybrush.BrushEstimator import BrushClassifier, BrushRegressor

# deap api
try:
from pybrush import deap_api
except ImportError:
import warnings

class _DeapAPIWarning:
def __getattr__(self, name):
warnings.warn(
"deap_api could not be imported. Please install required dependencies.",
ImportWarning,
stacklevel=2
)
raise AttributeError(f"deap_api is not available")

deap_api = _DeapAPIWarning()
21 changes: 0 additions & 21 deletions pybrush/_versionstr copy.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from pandas.api.types import is_float_dtype, is_bool_dtype, is_integer_dtype

from pybrush.EstimatorInterface import EstimatorInterface
from pybrush.deap_api import nsga2
from pybrush.deap_api.nsga2 import nsga2
from pybrush import individual
from pybrush import RegressorEvaluator, ClassifierEvaluator, MultiClassifierEvaluator
from pybrush import RegressorSelector, ClassifierSelector, MultiClassifierSelector
Expand Down
3 changes: 2 additions & 1 deletion pybrush/deap_api/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from pybrush.deap_api.nsga2 import nsga2
from pybrush.deap_api.nsga2 import nsga2
from pybrush.deap_api.DeapEstimator import DeapClassifier, DeapRegressor
2 changes: 0 additions & 2 deletions src/bandit/dummy.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
namespace Brush {
namespace MAB {

// TODO: rename dummy to static or fixed

class DummyBandit : public BanditOperator
{
public:
Expand Down
10 changes: 7 additions & 3 deletions src/bindings/bind_dataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,12 @@ void bind_dataset(py::module & m)
py::arg("ref_dataset"),
py::arg("feature_names")
)

.def_readwrite("y", &br::Data::Dataset::y)

.def("get_feature_types", &br::Data::Dataset::get_feature_types)
.def("get_feature_names", [](const br::Data::Dataset &d) {return d.feature_names; }) // wrapping it into a function to keep consistent with get_feature_types. brush feature types are not native to python, so that's why we need that function to cast it to something python can understand.

.def_readwrite("y", &br::Data::Dataset::y) // TODO: should this be read only?

// .def_readwrite("features", &br::Data::Dataset::features)
.def("get_n_samples", &br::Data::Dataset::get_n_samples)
.def("get_n_features", &br::Data::Dataset::get_n_features)
Expand All @@ -76,7 +80,7 @@ void bind_dataset(py::module & m)
.def("get_batch_size", &br::Data::Dataset::get_batch_size)
.def("set_batch_size", &br::Data::Dataset::set_batch_size)
.def("split", &br::Data::Dataset::split)
.def("get_X", &br::Data::Dataset::get_X)
.def("get_X", &br::Data::Dataset::get_X)
;

m.def("read_csv", &br::Data::read_csv, py::arg("path"), py::arg("target"), py::arg("sep")=',');
Expand Down
1 change: 1 addition & 0 deletions src/bindings/bind_engines.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ void bind_engine(py::module& m, string name)
&T::lock_nodes,
py::arg("end_depth") = 0,
py::arg("keep_leaves_unlocked") = true,
py::arg("keep_current_weights") = false,
stream_redirect()
)
.def(py::pickle(
Expand Down
8 changes: 8 additions & 0 deletions src/bindings/bind_individuals.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ void bind_individual(py::module& m, string name)
.def("fit",
static_cast<Class &(Class::*)(const Ref<const ArrayXXf> &X, const Ref<const ArrayXf> &y)>(&Class::fit),
"fit from X,y data")
.def("replace_program",
static_cast<Class &(Class::*)(const br::Program<PT>&)>(&Class::replace_program),
py::arg("new_program"),
"Replace the current program with a new program, invalidating fitness")
.def("replace_program",
static_cast<Class &(Class::*)(const json&)>(&Class::replace_program),
py::arg("json_program"),
"Replace the current program from a JSON representation, invalidating fitness")
.def("predict",
static_cast<RetType (Class::*)(const Dataset &d)>(&Class::predict),
"predict from Dataset object")
Expand Down
1 change: 1 addition & 0 deletions src/bindings/bind_params.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ void bind_params(py::module& m)
.def_property("num_islands", &Brush::Parameters::get_num_islands, &Brush::Parameters::set_num_islands)
.def_property("constants_simplification", &Brush::Parameters::get_constants_simplification, &Brush::Parameters::set_constants_simplification)
.def_property("inexact_simplification", &Brush::Parameters::get_inexact_simplification, &Brush::Parameters::set_inexact_simplification)
.def_property("start_from_decision_trees", &Brush::Parameters::get_start_from_decision_trees, &Brush::Parameters::set_start_from_decision_trees)
.def("set_n_classes", &Brush::Parameters::set_n_classes)
.def("set_class_weights", &Brush::Parameters::set_class_weights)
.def("set_class_weights_type", &Brush::Parameters::set_class_weights_type)
Expand Down
9 changes: 9 additions & 0 deletions src/bindings/bind_programs.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ void bind_program(py::module& m, string name)
.def("fit",
static_cast<T &(T::*)(const Ref<const ArrayXXf> &X, const Ref<const ArrayXf> &y)>(&T::fit),
"fit from X,y data")
.def("replace_program",
static_cast<T &(T::*)(const T&)>(&T::replace_program),
py::arg("new_program"),
"Replace the current program with a new program, invalidating fitness")
.def("replace_program",
static_cast<T &(T::*)(const json&)>(&T::replace_program),
py::arg("json_program"),
"Replace the current program from a JSON representation, invalidating fitness")
.def("predict",
static_cast<RetType (T::*)(const Dataset &d)>(&T::predict),
"predict from Dataset object")
Expand All @@ -40,6 +48,7 @@ void bind_program(py::module& m, string name)
&T::lock_nodes,
py::arg("end_depth") = 0,
py::arg("keep_leaves_unlocked") = true,
py::arg("keep_current_weights") = false,
stream_redirect()
)
.def("get_model",
Expand Down
Loading
Loading