diff --git a/.gitignore b/.gitignore index af32788ff..be3cd74c1 100644 --- a/.gitignore +++ b/.gitignore @@ -114,6 +114,7 @@ celerybeat.pid # Environments .env +.Renviron .venv env/ venv/ diff --git a/Makefile b/Makefile index 078b6ac19..de573b998 100644 --- a/Makefile +++ b/Makefile @@ -67,9 +67,10 @@ version: @:$(call check_defined, tag, new semver version tag to use on pyproject.toml) @poetry version $(tag) @echo "__version__ = \"$$(poetry version -s)\"" > validmind/__version__.py + @sed -i '' 's/^Version: .*/Version: '"$$(poetry version -s)"'/' r/validmind/DESCRIPTION @echo "Version updated to $$(poetry version -s)" - @echo "Commiting changes to pyproject.toml and __version__.py with message: $$(poetry version -s)" - @git add pyproject.toml validmind/__version__.py + @echo "Commiting changes to pyproject.toml, __version__.py and r/validmind/DESCRIPTION with message: $$(poetry version -s)" + @git add pyproject.toml validmind/__version__.py r/validmind/DESCRIPTION @git commit -m "$$(poetry version -s)" generate-test-id-types: diff --git a/README.md b/README.md index 487780fc9..4078f355f 100644 --- a/README.md +++ b/README.md @@ -118,21 +118,71 @@ You can install the `transformers`, `torch` and `openai` dependencies using the poetry install --extras llm ``` -### Installing R dependencies +### Setting up R support -If you want to use the R support that is provided by the ValidMind Library, you must have R installed on your machine. You can download R from [CRAN](https://cran.r-project.org/). On a Mac, you can install R using Homebrew: +#### 1. Install R + +You can download R from [CRAN](https://cran.r-project.org/). On macOS, the easiest way is via Homebrew: ```bash brew install r ``` -Once you have R installed, install the `r-support` extra to install the necessary dependencies for R by running: +#### 2. Install Python dependencies + +Install `rpy2` so the Python library can interface with R models. 
On macOS, you may need to build from source to match your R version: ```bash -poetry install +# Try the standard install first pip install rpy2 + +# If you get R library loading errors, rebuild against your installed R: +R_HOME=$(Rscript -e 'cat(R.home())') pip install --no-binary :all: --force-reinstall rpy2 +``` + +#### 3. Install R packages + +Open R (type `R` in your terminal) and install the required packages: + +```r +install.packages(c("reticulate", "dplyr", "caTools", "knitr", "glue", "plotly", "htmltools", "rmarkdown", "DT", "base64enc")) +``` + +Then install the ValidMind R package from source: + +```r +install.packages("r/validmind", repos = NULL, type = "source") ``` +#### 4. Set up VS Code / Cursor for R + +No RStudio required. Install the **R extension** (`REditorSupport.r`) in VS Code or Cursor: + +1. Open Extensions (`Cmd+Shift+X`) and search for "R" +2. Install the **R** extension by REditorSupport +3. Optionally install the `languageserver` R package for autocomplete: `install.packages("languageserver")` + +With the extension installed: +- Open `.Rmd` files and run chunks with `Cmd+Shift+Enter` +- Render full documents with `Cmd+Shift+K` +- Use the R terminal panel for interactive sessions + +Alternatively, you can run R notebooks as Jupyter notebooks by installing the R kernel: + +```r +install.packages("IRkernel") +IRkernel::installspec() +``` + +Then create/open `.ipynb` files in VS Code and select the R kernel. + +#### 5. 
Run the quickstart notebooks + +Launch R from the repository root (so dataset paths resolve correctly) and run through the notebooks in `notebooks/quickstart/`: + +- `quickstart_model_documentation.Rmd` — model documentation workflow +- `quickstart_model_validation.Rmd` — model validation workflow + ### Versioning Make sure you bump the package version before merging a PR with the following command: diff --git a/notebooks/code_sharing/r/r_custom_tests.Rmd b/notebooks/code_sharing/r/r_custom_tests.Rmd index 63e52f96a..cd931a562 100644 --- a/notebooks/code_sharing/r/r_custom_tests.Rmd +++ b/notebooks/code_sharing/r/r_custom_tests.Rmd @@ -109,15 +109,11 @@ Get your code snippet: Next, replace this placeholder with your own code snippet: ```{r} -# Find the path to your Python runtime by running `python -V` in your terminal -# python_version <- "" - vm_r <- vm( api_host = "https://api.prod.validmind.ai/api/v1/tracking", api_key = "...", api_secret = "...", - model = "...", - python_version = python_version + model = "..." 
) ``` diff --git a/notebooks/code_sharing/r/r_customer_churn_demo.Rmd b/notebooks/code_sharing/r/r_customer_churn_demo.Rmd index ab4c44eb9..8aae0e652 100644 --- a/notebooks/code_sharing/r/r_customer_churn_demo.Rmd +++ b/notebooks/code_sharing/r/r_customer_churn_demo.Rmd @@ -24,7 +24,10 @@ We will train a sample model and demonstrate the following documentation functio ```{r setup, include=FALSE} library(reticulate) -python_version <- "" +python_version <- Sys.getenv("VALIDMIND_PYTHON", Sys.which("python")) +if (nchar(python_version) > 0 && !startsWith(python_version, "/")) { + python_version <- file.path(getwd(), python_version) +} use_python(python_version) library(validmind) @@ -64,11 +67,10 @@ The code snippet can be copied and pasted directly in the cell below to initiali ```{r} vm_r <- vm( + api_host="https://app.prod.validmind.ai/api/v1/tracking", api_key="", api_secret="", - model="", - python_version=python_version, - api_host="https://app.prod.validmind.ai/api/v1/tracking" + model="" ) ``` diff --git a/notebooks/code_sharing/r/r_customer_churn_demo_xgboost.Rmd b/notebooks/code_sharing/r/r_customer_churn_demo_xgboost.Rmd index 44e9e9318..597df3f31 100644 --- a/notebooks/code_sharing/r/r_customer_churn_demo_xgboost.Rmd +++ b/notebooks/code_sharing/r/r_customer_churn_demo_xgboost.Rmd @@ -24,7 +24,10 @@ We will train a sample model and demonstrate the following documentation functio ```{r setup, include=FALSE} library(reticulate) -python_version <- "" +python_version <- Sys.getenv("VALIDMIND_PYTHON", Sys.which("python")) +if (nchar(python_version) > 0 && !startsWith(python_version, "/")) { + python_version <- file.path(getwd(), python_version) +} use_python(python_version) library(validmind) @@ -65,11 +68,10 @@ The code snippet can be copied and pasted directly in the cell below to initiali ```{r} vm_r <- vm( + api_host="https://app.prod.validmind.ai/api/v1/tracking", api_key="", api_secret="", - model="", - python_version=python_version, - 
api_host="https://app.prod.validmind.ai/api/v1/tracking" + model="" ) ``` diff --git a/notebooks/code_sharing/r/r_mortality_demo.Rmd b/notebooks/code_sharing/r/r_mortality_demo.Rmd index b81eed50e..a8bb252bb 100644 --- a/notebooks/code_sharing/r/r_mortality_demo.Rmd +++ b/notebooks/code_sharing/r/r_mortality_demo.Rmd @@ -86,7 +86,10 @@ The features used in the mortality model are: ```{r setup, include=FALSE} library(reticulate) -python_version <- "" +python_version <- Sys.getenv("VALIDMIND_PYTHON", Sys.which("python")) +if (nchar(python_version) > 0 && !startsWith(python_version, "/")) { + python_version <- file.path(getwd(), python_version) +} use_python(python_version) library(magrittr) # needs to be run every time you start R and want to use %>% @@ -156,11 +159,10 @@ initialize the ValidMind Library when run: ```{r} vm_r <- vm( + api_host="https://app.prod.validmind.ai/api/v1/tracking", api_key="", api_secret="", - model="", - python_version=python_version, - api_host="https://app.prod.validmind.ai/api/v1/tracking" + model="" ) ``` diff --git a/notebooks/code_sharing/r/r_time_series_data_validation.Rmd b/notebooks/code_sharing/r/r_time_series_data_validation.Rmd index 2965eb390..bc2279e63 100644 --- a/notebooks/code_sharing/r/r_time_series_data_validation.Rmd +++ b/notebooks/code_sharing/r/r_time_series_data_validation.Rmd @@ -17,7 +17,10 @@ Finally, define and **configure** the specific use case we are working on by set ```{r setup, include=FALSE} library(reticulate) -python_version <- "" +python_version <- Sys.getenv("VALIDMIND_PYTHON", Sys.which("python")) +if (nchar(python_version) > 0 && !startsWith(python_version, "/")) { + python_version <- file.path(getwd(), python_version) +} use_python(python_version) library(validmind) @@ -58,11 +61,10 @@ The code snippet can be copied and pasted directly in the cell below to initiali ```{r} vm_r <- vm( + api_host="https://app.prod.validmind.ai/api/v1/tracking", api_key="", api_secret="", - model="", - 
python_version=python_version, - api_host="https://app.prod.validmind.ai/api/v1/tracking" + model="" ) ``` diff --git a/notebooks/code_sharing/r/r_time_series_model_validation.Rmd b/notebooks/code_sharing/r/r_time_series_model_validation.Rmd index 0a1c08b56..08cd9596e 100644 --- a/notebooks/code_sharing/r/r_time_series_model_validation.Rmd +++ b/notebooks/code_sharing/r/r_time_series_model_validation.Rmd @@ -17,7 +17,10 @@ Finally, define and **configure** the specific use case we are working on by set ```{r setup, include=FALSE} library(reticulate) -python_version <- "/Users/erichare/.pyenv/versions/3.10.10/bin/python" +python_version <- Sys.getenv("VALIDMIND_PYTHON", Sys.which("python")) +if (nchar(python_version) > 0 && !startsWith(python_version, "/")) { + python_version <- file.path(getwd(), python_version) +} use_python(python_version) library(magrittr) # needs to be run every time you start R and want to use %>% @@ -75,11 +78,10 @@ The code snippet can be copied and pasted directly in the cell below to initiali ```{r} vm_r <- vm( - api_key="b34dfe4dcb5491212be3eefe77c85cd6", - api_secret="40f8d2d583baa9e730a7f8872dd57e2f4657c7918c13fa259ba7ccda8a60e858", - model="clmp6k8e800ds19mot0zu8o34", - python_version=python_version, - api_host="https://app.prod.validmind.ai/api/v1/tracking" + api_host="https://app.prod.validmind.ai/api/v1/tracking", + api_key="", + api_secret="", + model="" ) ``` diff --git a/notebooks/quickstart/quickstart_model_documentation.Rmd b/notebooks/quickstart/quickstart_model_documentation.Rmd new file mode 100644 index 000000000..251899d47 --- /dev/null +++ b/notebooks/quickstart/quickstart_model_documentation.Rmd @@ -0,0 +1,227 @@ +--- +title: "Quickstart for Model Documentation (R)" +author: "ValidMind" +date: "2026-03-18" +output: html_document +--- + +# Quickstart for Model Documentation + +Learn the basics of using ValidMind to document models as part of a model development workflow using R. 
This notebook uses the ValidMind R package (a `reticulate` wrapper around the Python library) to generate a draft of documentation for a binary classification model. + +We will: + +1. Import a sample dataset and preprocess it +2. Split the datasets and initialize them for use with ValidMind +3. Train a logistic regression (GLM) model and initialize it for use with testing +4. Run the full suite of documentation tests, sending results to the ValidMind Platform + +## Setting up + +The Python path is auto-configured via the `VALIDMIND_PYTHON` environment variable. +If not set, it falls back to the system Python. For local development, create a +`.Renviron` file in the project root with `VALIDMIND_PYTHON=.venv/bin/python`. + +```{r setup, include=FALSE} +library(reticulate) + +python_version <- Sys.getenv("VALIDMIND_PYTHON", Sys.which("python")) +if (nchar(python_version) > 0 && !startsWith(python_version, "/")) { + python_version <- file.path(getwd(), python_version) +} +use_python(python_version, required = TRUE) + +library(validmind) +library(dplyr) +library(caTools) +library(knitr) + +knitr::opts_chunk$set(warning = FALSE, message = FALSE) +``` + +## Initialize the ValidMind Library + +Log in to the [ValidMind Platform](https://app.prod.validmind.ai) and register a model: + +1. Navigate to **Inventory** and click **+ Register Model**. +2. Under **Documents > Development**, select the `Binary classification` template. +3. Go to **Getting Started**, select `Development` from the **DOCUMENT** drop-down, and copy the code snippet. 
+ +Replace the placeholder values below with your own credentials: + +```{r} +vm_r <- vm( + api_host = "https://app.prod.validmind.ai/api/v1/tracking", + api_key = "", + api_secret = "", + model = "", + document = "documentation" +) +``` + +## Preview the documentation template + +Verify the connection and see the documentation structure: + +```{r} +vm_r$preview_template() +``` + +## Load the demo dataset + +We use the Bank Customer Churn dataset for this demonstration: + +```{r} +customer_churn <- reticulate::import( + "validmind.datasets.classification.customer_churn" +) + +cat(sprintf( + paste0( + "Loaded demo dataset with:\n\n\t- Target column: '%s'", + "\n\t- Class labels: %s\n" + ), + customer_churn$target_column, + paste( + names(customer_churn$class_labels), + customer_churn$class_labels, + sep = ": ", collapse = ", " + ) +)) + +data <- customer_churn$load_data() +head(data) +``` + +## Initialize the raw dataset + +Before running tests, initialize a ValidMind dataset object for the raw data: + +```{r} +vm_raw_dataset <- vm_r$init_dataset( + dataset = data, + input_id = "raw_dataset", + target_column = customer_churn$target_column, + class_labels = customer_churn$class_labels +) +``` + +## Preprocess the raw dataset + +Handle categorical variables using one-hot encoding and remove unnecessary columns: + +```{r} +# load_data() already drops RowNumber, CustomerId, Surname +# One-hot encode categorical variables +geo_dummies <- model.matrix(~ Geography - 1, data = data) +gender_dummies <- model.matrix(~ Gender - 1, data = data) +data_processed <- data %>% select(-Geography, -Gender) +data_processed <- cbind(data_processed, geo_dummies, gender_dummies) +``` + +### Split the dataset + +Split into training (60%), validation (20%), and test (20%) sets: + +```{r} +set.seed(42) + +# First split: 80% train+validation, 20% test +target_col <- customer_churn$target_column +split1 <- sample.split(data_processed[[target_col]], SplitRatio = 0.8) +train_val_data <- 
subset(data_processed, split1 == TRUE) +test_data <- subset(data_processed, split1 == FALSE) + +# Second split: 75% train, 25% validation (of the 80% = 60/20 overall) +split2 <- sample.split(train_val_data[[target_col]], SplitRatio = 0.75) +train_data <- subset(train_val_data, split2 == TRUE) +validation_data <- subset(train_val_data, split2 == FALSE) +``` + +## Train a logistic regression model + +Train a GLM with a binomial family (logistic regression): + +```{r} +formula <- as.formula(paste(target_col, "~ .")) +model <- glm(formula, data = train_data, family = binomial) +summary(model) +``` + +## Initialize the ValidMind datasets + +```{r} +vm_train_ds <- vm_r$init_dataset( + dataset = train_data, + input_id = "train_dataset", + target_column = customer_churn$target_column +) + +vm_test_ds <- vm_r$init_dataset( + dataset = test_data, + input_id = "test_dataset", + target_column = customer_churn$target_column +) +``` + +## Initialize a model object + +Save the R model and initialize it with ValidMind: + +```{r} +model_path <- save_model(model) + +vm_model <- vm_r$init_r_model( + model_path = model_path, + input_id = "model" +) +``` + +### Assign predictions + +Link model predictions to the training and testing datasets: + +```{r} +vm_train_ds$assign_predictions(model = vm_model) +vm_test_ds$assign_predictions(model = vm_model) +``` + +## Run the full suite of tests + +Build the test configuration that maps each test to its required inputs: + +```{r} +# Import the test config helper from the Python customer_churn module +customer_churn <- reticulate::import( + "validmind.datasets.classification.customer_churn" +) +test_config <- customer_churn$get_demo_test_config() +``` + +Preview the test configuration: + +```{r} +vm_utils <- reticulate::import("validmind.utils") +vm_utils$preview_test_config(test_config) +``` + +Run the full documentation test suite and upload results to the ValidMind Platform: + +```{r} +full_suite <- vm_r$run_documentation_tests(config = 
test_config) +``` + +## Next steps + +Head to the [ValidMind Platform](https://app.prod.validmind.ai) to view the generated documentation: + +1. Navigate to **Inventory** and select your model. +2. Click **Development** under Documents to see the full draft of your model documentation. + +From there, you can make qualitative edits, collaborate with validators, and submit for approval. + +--- + +*Copyright 2023-2026 ValidMind Inc. All rights reserved.* +*Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.* +*SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial* diff --git a/notebooks/quickstart/quickstart_model_validation.Rmd b/notebooks/quickstart/quickstart_model_validation.Rmd new file mode 100644 index 000000000..6e43444e4 --- /dev/null +++ b/notebooks/quickstart/quickstart_model_validation.Rmd @@ -0,0 +1,335 @@ +--- +title: "Quickstart for Model Validation (R)" +author: "ValidMind" +date: "2026-03-18" +output: html_document +--- + +# Quickstart for Model Validation + +Learn the basics of using ValidMind to validate models as part of a model validation workflow using R. This notebook uses the ValidMind R package (a `reticulate` wrapper around the Python library) to generate a draft of a validation report for a binary classification model. + +We will: + +1. Import a sample dataset and preprocess it, then initialize datasets for use with ValidMind +2. Independently verify data quality tests performed on datasets by model development +3. Train a champion model for evaluation +4. Run model evaluation tests with the ValidMind Library + +## Setting up + +The Python path is auto-configured via the `VALIDMIND_PYTHON` environment variable. +If not set, it falls back to the system Python. For local development, create a +`.Renviron` file in the project root with `VALIDMIND_PYTHON=.venv/bin/python`. 
+ +```{r setup, include=FALSE} +library(reticulate) + +python_version <- Sys.getenv("VALIDMIND_PYTHON", Sys.which("python")) +if (nchar(python_version) > 0 && !startsWith(python_version, "/")) { + python_version <- file.path(getwd(), python_version) +} +use_python(python_version, required = TRUE) + +library(validmind) +library(dplyr) +library(caTools) +library(knitr) + +knitr::opts_chunk$set(warning = FALSE, message = FALSE) +``` + +## Initialize the ValidMind Library + +Log in to the [ValidMind Platform](https://app.prod.validmind.ai) and register a model: + +1. Navigate to **Inventory** and click **+ Register Model**. +2. Assign yourself as a **Validator** (remove yourself from Owner and Developer roles). +3. Under **Documents > Validation**, select the `Generic Validation Report` template. +4. Go to **Getting Started**, select `Validation` from the **DOCUMENT** drop-down, and copy the code snippet. + +Replace the placeholder values below with your own credentials: + +```{r} +vm_r <- vm( + api_host = "https://app.prod.validmind.ai/api/v1/tracking", + api_key = "", + api_secret = "", + model = "", + document = "validation-report" +) +``` + +## Preview the validation report template + +Verify the connection and see the validation report structure: + +```{r} +vm_r$preview_template() +``` + +## Identify available tests + +List the tasks and tags available in the ValidMind test library: + +```{r} +vm_r$tests$list_tasks_and_tags() +``` + +List all data quality tests for classification: + +```{r} +vm_r$tests$list_tests(tags = list("data_quality"), task = "classification") +``` + +## Load the demo dataset + +We use the Bank Customer Churn dataset for this demonstration: + +```{r} +customer_churn <- reticulate::import( + "validmind.datasets.classification.customer_churn" +) + +cat(sprintf( + paste0( + "Loaded demo dataset with:\n\n\t- Target column: '%s'", + "\n\t- Class labels: %s\n" + ), + customer_churn$target_column, + paste( + names(customer_churn$class_labels), + 
customer_churn$class_labels, + sep = ": ", collapse = ", " + ) +)) + +data <- customer_churn$load_data() +head(data) +``` + +## Preprocess the raw dataset + +Handle categorical variables using one-hot encoding and remove unnecessary columns: + +```{r} +# load_data() already drops RowNumber, CustomerId, Surname +# One-hot encode categorical variables +geo_dummies <- model.matrix(~ Geography - 1, data = data) +gender_dummies <- model.matrix(~ Gender - 1, data = data) +data_processed <- data %>% select(-Geography, -Gender) +data_processed <- cbind(data_processed, geo_dummies, gender_dummies) +``` + +### Split the dataset + +Split into training (60%), validation (20%), and test (20%) sets: + +```{r} +set.seed(42) + +# First split: 80% train+validation, 20% test +target_col <- customer_churn$target_column +split1 <- sample.split(data_processed[[target_col]], SplitRatio = 0.8) +train_val_data <- subset(data_processed, split1 == TRUE) +test_data <- subset(data_processed, split1 == FALSE) + +# Second split: 75% train, 25% validation (of the 80% = 60/20 overall) +split2 <- sample.split(train_val_data[[target_col]], SplitRatio = 0.75) +train_data <- subset(train_val_data, split2 == TRUE) +validation_data <- subset(train_val_data, split2 == FALSE) +``` + +### Separate features and targets + +```{r} +x_train <- train_data %>% select(-all_of(target_col)) +y_train <- train_data[[target_col]] +``` + +## Initialize the ValidMind datasets + +```{r} +vm_raw_dataset <- vm_r$init_dataset( + dataset = data, + input_id = "raw_dataset", + target_column = customer_churn$target_column, + class_labels = customer_churn$class_labels +) + +vm_train_ds <- vm_r$init_dataset( + dataset = train_data, + input_id = "train_dataset", + target_column = customer_churn$target_column +) + +vm_validation_ds <- vm_r$init_dataset( + dataset = validation_data, + input_id = "validation_dataset", + target_column = customer_churn$target_column +) + +vm_test_ds <- vm_r$init_dataset( + dataset = test_data, + 
input_id = "test_dataset", + target_column = customer_churn$target_column +) +``` + +## Run data quality tests + +### Run an individual data quality test + +Run the ClassImbalance test on the raw dataset and log it to the platform: + +```{r} +vm_r$tests$run_test( + test_id = "validmind.data_validation.ClassImbalance", + inputs = list(dataset = vm_raw_dataset) +)$log() +``` + +### Run data comparison tests + +Compare class imbalance across dataset splits: + +```{r} +comparison_tests <- list( + "validmind.data_validation.ClassImbalance:train_vs_validation" = list( + input_grid = list(dataset = list("train_dataset", "validation_dataset")) + ), + "validmind.data_validation.ClassImbalance:train_vs_test" = list( + input_grid = list(dataset = list("train_dataset", "test_dataset")) + ) +) + +for (test_name in names(comparison_tests)) { + cat(paste0("Running: ", test_name, "\n")) + config <- comparison_tests[[test_name]] + tryCatch({ + vm_r$tests$run_test( + test_name, + input_grid = config$input_grid + )$log() + }, error = function(e) { + cat(paste0("Error running test ", test_name, ": ", e$message, "\n")) + }) +} +``` + +## Train the champion model + +Train a logistic regression (GLM) to serve as the champion model: + +```{r} +formula <- as.formula(paste(target_col, "~ .")) +model <- glm(formula, data = train_data, family = binomial) +summary(model) +``` + +## Initialize the model object + +Save the R model and initialize it with ValidMind: + +```{r} +model_path <- save_model(model) + +vm_xgboost <- vm_r$init_r_model( + model_path = model_path, + input_id = "xgboost_champion" +) +``` + +### Assign predictions + +Link model predictions to the training and testing datasets: + +```{r} +vm_train_ds$assign_predictions(model = vm_xgboost) +vm_test_ds$assign_predictions(model = vm_xgboost) +``` + +## Run model evaluation tests + +### Run model performance tests + +List available model performance tests: + +```{r} +vm_r$tests$list_tests(tags = list("model_performance"), task = 
"classification") +``` + +Run and log performance tests: + +```{r} +performance_tests <- c( + "validmind.model_validation.sklearn.ClassifierPerformance:xgboost_champion", + "validmind.model_validation.sklearn.ConfusionMatrix:xgboost_champion", + "validmind.model_validation.sklearn.ROCCurve:xgboost_champion" +) + +for (test in performance_tests) { + cat(paste0("Running: ", test, "\n")) + vm_r$tests$run_test( + test, + inputs = list(dataset = vm_test_ds, model = vm_xgboost) + )$log() +} +``` + +### Run diagnostic tests + +Assess the model for overfitting: + +```{r} +vm_r$tests$run_test( + test_id = paste0( + "validmind.model_validation.sklearn.OverfitDiagnosis", + ":xgboost_champion" + ), + input_grid = list( + datasets = list(list(vm_train_ds, vm_test_ds)), + model = list(vm_xgboost) + ) +)$log() +``` + +Test robustness: + +```{r} +vm_r$tests$run_test( + test_id = paste0( + "validmind.model_validation.sklearn.RobustnessDiagnosis", + ":xgboost_champion" + ), + input_grid = list( + datasets = list(list(vm_train_ds, vm_test_ds)), + model = list(vm_xgboost) + ) +)$log() +``` + +### Run feature importance tests + +Note: `PermutationFeatureImportance` and `SHAPGlobalImportance` are not supported for R models. + +```{r} +vm_r$tests$run_test( + "validmind.model_validation.FeaturesAUC:xgboost_champion", + inputs = list(dataset = vm_test_ds, model = vm_xgboost) +)$log() +``` + +## Next steps + +Head to the [ValidMind Platform](https://app.prod.validmind.ai) to view the validation report: + +1. Navigate to **Inventory** and select your model. +2. Click **Validation** under Documents. +3. Include your logged test results as evidence, create risk assessment notes, and assess compliance. + +--- + +*Copyright 2023-2026 ValidMind Inc. 
All rights reserved.* +*Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.* +*SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial* diff --git a/pyproject.toml b/pyproject.toml index 7fd360c68..6de558bf2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "validmind" -version = "2.12.3" +version = "2.12.4" description = "ValidMind Library" readme = "README.pypi.md" requires-python = ">=3.9,<3.15" diff --git a/r/validmind/DESCRIPTION b/r/validmind/DESCRIPTION index 1a64d1f58..37e80f97d 100644 --- a/r/validmind/DESCRIPTION +++ b/r/validmind/DESCRIPTION @@ -1,7 +1,7 @@ Package: validmind Type: Package Title: Interface to the 'ValidMind' Platform -Version: 0.1.2 +Version: 2.12.4 Authors@R: c(person("Andres", "Rodriguez", role = c("aut", "cre","cph"), email = "andres@validmind.ai")) Maintainer: Andres Rodriguez diff --git a/r/validmind/R/platform.R b/r/validmind/R/platform.R index 426b10d40..5b828918f 100644 --- a/r/validmind/R/platform.R +++ b/r/validmind/R/platform.R @@ -3,8 +3,11 @@ #' @param api_key The ValidMind API key #' @param api_secret The ValidMind API secret #' @param model The ValidMind model -#' @param python_version The Python Version to use +#' @param python_version The path to the Python binary to use. Defaults to +#' the VALIDMIND_PYTHON environment variable, or the system Python. #' @param api_host The ValidMind host, defaulting to local +#' @param document The document type to associate with this session +#' (e.g. "documentation", "validation-report"). Defaults to NULL. 
#' #' @importFrom reticulate import use_python py_config #' @@ -14,28 +17,47 @@ #' @examples #'\dontrun{ #' vm_r <- vm( +#' api_host="https://app.prod.validmind.ai/api/v1/tracking", #' api_key="", #' api_secret="", #' model="", -#' python_version=python_version, -#' api_host="https://app.prod.validmind.ai/api/v1/tracking" +#' document="documentation" #' ) #'} #' #' @export -vm <- function(api_key, api_secret, model, python_version, - api_host = "http://localhost:3000/api/v1/tracking") { - use_python(python_version) +vm <- function(api_key, api_secret, model, + python_version = Sys.getenv("VALIDMIND_PYTHON", Sys.which("python")), + api_host = "http://localhost:3000/api/v1/tracking", + document = NULL) { + # Resolve relative paths (e.g. ".venv/bin/python") against the working directory + if (nchar(python_version) > 0 && !startsWith(python_version, "/")) { + python_version <- file.path(getwd(), python_version) + } + use_python(python_version, required = TRUE) + + # Set environment variables BEFORE Python initializes (required for rpy2 compatibility) + # R_HOME: so rpy2 can find the R installation + # RPY2_CFFI_MODE: use ABI mode so rpy2 attaches to the existing R session + # started by reticulate rather than trying to start a new one + Sys.setenv(R_HOME = R.home()) + Sys.setenv(RPY2_CFFI_MODE = "ABI") vm <- import("validmind") - vm$init( + init_args <- list( api_host = api_host, api_key = api_key, api_secret = api_secret, model = model ) + if (!is.null(document)) { + init_args$document <- document + } + + do.call(vm$init, init_args) + return(vm) } diff --git a/r/validmind/README.md b/r/validmind/README.md index ed74b8149..4351790cb 100644 --- a/r/validmind/README.md +++ b/r/validmind/README.md @@ -1,5 +1,27 @@ # ValidMind R Package +## Prerequisites + +Install the required R packages: + +```r +install.packages(c("reticulate", "dplyr", "caTools", "knitr", "glue", "plotly", "htmltools", "rmarkdown", "DT", "base64enc")) +``` + +You also need a Python environment with the 
`validmind` Python package and `rpy2` installed: + +```bash +pip install validmind rpy2 +``` + +**Note:** On macOS, if `rpy2` fails to find R libraries, rebuild it from source against your installed R: + +```bash +R_HOME=$(Rscript -e 'cat(R.home())') pip install --no-binary :all: --force-reinstall rpy2 +``` + +Point `python_version` to your Python binary (e.g. the one in your project's `.venv`). + ## Installation You can install ValidMind from CRAN: @@ -20,23 +42,91 @@ Or you can install the package from source. Ensure you are in the `r/validmind` devtools::install() ``` +For local development, you can skip `devtools` entirely and install directly from the repo path: + +```r +install.packages("/path/to/validmind-library/r/validmind", repos = NULL, type = "source") +``` + +## Configuring the Python path + +The R package needs to know which Python binary to use (the one with `validmind` installed). It reads the `VALIDMIND_PYTHON` environment variable, falling back to the system Python if not set. + +### Option 1: `.Renviron` file (recommended) + +Create a `.Renviron` file in the project root (or `~/.Renviron` for a global setting): + +``` +VALIDMIND_PYTHON=.venv/bin/python +``` + +R reads this file automatically on startup, before any code runs. Relative paths are resolved against the working directory. + +### Option 2: `.env` file in the repo + +If the project uses Poetry with in-project virtualenvs (`.venv/` inside the repo), add to the repo's `.env` file: + +``` +VALIDMIND_PYTHON=.venv/bin/python +``` + +### No configuration needed in Docker / CI + +When `VALIDMIND_PYTHON` is not set, the package falls back to `Sys.which("python")`, which resolves to the system Python — correct for environments where `validmind` is installed globally. 
+ ## QuickStart -You can connect to your ValidMind profile by providing the appropriate credentials: +Connect to your ValidMind profile: ```r vm_r <- vm( + api_host="https://api.prod.validmind.ai/api/v1/tracking", api_key="", api_secret="", model="", - python_version="", - api_host="https://api.prod.validmind.ai/api/v1/tracking" + document="documentation" ) ``` -## Fleshed out example +The `python_version` parameter is no longer required — it defaults to `VALIDMIND_PYTHON` or the system Python. The `document` parameter specifies which document type to associate with the session (e.g. `"documentation"` or `"validation-report"`). + +### Quickstart notebooks + +See the `notebooks/quickstart/` folder for full working examples (additional demos live in `notebooks/code_sharing/r/`): + +- **`quickstart_model_documentation.Rmd`** — End-to-end model documentation workflow: load data, preprocess, train a GLM model, and run the full documentation test suite. +- **`quickstart_model_validation.Rmd`** — End-to-end model validation workflow: load data, run data quality tests, train a champion GLM model, and run model evaluation tests. -Please see the `notebooks/code-sharing/r` folder for examples of how to use! +These notebooks can be run from VS Code (with the R extension), RStudio, or interactively in a terminal R session. When running interactively, launch R from the repository root so that relative dataset paths resolve correctly. 
+ +### Key APIs available via reticulate + +Since the R package returns the full Python `validmind` module, you can call any Python API directly: + +```r +# Preview the documentation template +vm_r$preview_template() + +# Initialize datasets +vm_dataset <- vm_r$init_dataset(dataset=df, input_id="my_dataset", target_column="target") + +# Initialize R models +model_path <- save_model(model) +vm_model <- vm_r$init_r_model(model_path=model_path, input_id="model") + +# Assign predictions +vm_dataset$assign_predictions(model=vm_model) + +# Run the full documentation test suite +vm_r$run_documentation_tests(config=test_config) + +# Run individual tests +vm_r$tests$run_test("validmind.data_validation.ClassImbalance", inputs=list(dataset=vm_dataset))$log() + +# List available tests +vm_r$tests$list_tests(tags=list("data_quality"), task="classification") +vm_r$tests$list_tasks_and_tags() +``` ## Troubleshooting diff --git a/tests/test_validmind_tests_module.py b/tests/test_validmind_tests_module.py index 4ee984c74..4118f6449 100644 --- a/tests/test_validmind_tests_module.py +++ b/tests/test_validmind_tests_module.py @@ -7,6 +7,7 @@ from typing import Callable, List import pandas as pd +from pandas.io.formats.style import Styler from validmind.tests import ( list_tags, @@ -37,8 +38,11 @@ def test_list_tasks(self): def test_list_tasks_and_tags(self): tasks_and_tags = list_tasks_and_tags() - self.assertIsInstance(tasks_and_tags, pd.io.formats.style.Styler) - df = tasks_and_tags.data + # Returns a Styler in notebooks, plain DataFrame otherwise + if isinstance(tasks_and_tags, Styler): + df = tasks_and_tags.data + else: + df = tasks_and_tags self.assertTrue(len(df) > 0) self.assertTrue(all(isinstance(task, str) for task in df["Task"])) self.assertTrue(all(isinstance(tag, str) for tag in df["Tags"])) @@ -51,8 +55,11 @@ def test_list_tests(self): def test_list_tests_pretty(self): tests = list_tests(pretty=True) - self.assertIsInstance(tests, pd.io.formats.style.Styler) - df = 
tests.data + # Returns a Styler in notebooks, plain DataFrame otherwise + if isinstance(tests, Styler): + df = tests.data + else: + df = tests self.assertTrue(len(df) > 0) # check has the columns: ID, Name, Description, Required Inputs, Params self.assertTrue("ID" in df.columns) diff --git a/validmind/__version__.py b/validmind/__version__.py index 521faacaf..896b89678 100644 --- a/validmind/__version__.py +++ b/validmind/__version__.py @@ -1 +1 @@ -__version__ = "2.12.3" +__version__ = "2.12.4" diff --git a/validmind/client.py b/validmind/client.py index 5c5ceff39..c0fe8b2ca 100644 --- a/validmind/client.py +++ b/validmind/client.py @@ -282,33 +282,23 @@ def init_r_model( """ Initialize a VM Model from an R model. - LogisticRegression and LinearRegression models are converted to sklearn models by extracting - the coefficients and intercept from the R model. XGB models are loaded using the xgboost - since xgb models saved in .json or .bin format can be loaded directly with either Python or R. + The model must first be saved to an .RData file using the R package's + ``save_model()`` function. This function then uses rpy2 to load the model + into Python for testing and validation. Args: - model_path (str): The path to the R model saved as an RDS or XGB file. + model_path (str): The path to the R model saved as an .RData file. input_id (str): The input ID for the model. Defaults to "model". Returns: VMModel: A VM Model instance. """ - - # TODO: proper check for supported models - # - # if model.get("method") not in R_MODEL_METHODS: - # raise UnsupportedRModelError( - # "R model method must be one of {}. Got {}".format( - # R_MODEL_METHODS, model.get("method") - # ) - # ) - - # first we need to load the model using rpy2 - # since rpy2 is an extra we need to conditionally import it try: import rpy2.robjects as robjects - except ImportError: - raise MissingRExtrasError() + except Exception as e: + raise MissingRExtrasError( + f"`rpy2` is required to use R models. 
Import failed: {e}" + ) r = robjects.r loaded_objects = r.load(model_path) @@ -321,6 +311,14 @@ def init_r_model( input_id=input_id, ) + metadata = get_model_info(vm_model) + log_input( + input_id=input_id, + type="model", + metadata=metadata, + ) + input_registry.add(key=input_id, obj=vm_model) + return vm_model diff --git a/validmind/models/r_model.py b/validmind/models/r_model.py index 377c80bd8..cf36e308c 100644 --- a/validmind/models/r_model.py +++ b/validmind/models/r_model.py @@ -98,11 +98,10 @@ def r_predict(self, new_data_r): Instead, there is a global predict() method that returns the predicted values according to the model type. """ - # Use the predict method on the loaded model (assuming the model's name in R is 'model') predicted_probs = self.r.predict( self.model, newdata=new_data_r, type="response" ) - return predicted_probs + return np.array(predicted_probs) def r_xgb_predict(self, new_data_r): """ @@ -114,7 +113,7 @@ def r_xgb_predict(self, new_data_r): predicted_probs = self.r.predict( self.model, newdata=new_data_r, type="response" ) - return predicted_probs + return np.array(predicted_probs) def predict_proba(self, new_data): """ @@ -127,24 +126,30 @@ def predict(self, new_data, return_probs=False): Converts the predicted probabilities to classes """ try: + from rpy2.robjects import conversion, default_converter from rpy2.robjects import pandas2ri except ImportError: raise MissingRExtrasError() - # Activate the pandas conversion for rpy2 - pandas2ri.activate() - new_data_class = get_full_class_name(new_data) if new_data_class == "numpy.ndarray": - # We need to reconstruct the DataFrame from the ndarray using the column names - new_data = pd.DataFrame(new_data, columns=self.test_ds.feature_columns) + # Reconstruct a DataFrame from the ndarray using column names + # from the model's training data + try: + model_terms = list(self.r.attr(self.model.rx2["terms"], "term.labels")) + new_data = pd.DataFrame(new_data, columns=model_terms) + except 
Exception: + # Fallback: use generic column names + new_data = pd.DataFrame(new_data) elif new_data_class != "pandas.core.frame.DataFrame": raise ValueError( f"new_data must be a DataFrame or ndarray. Got {new_data_class}" ) - new_data_r = pandas2ri.py2rpy(new_data) + # Use context manager for pandas conversion (activate/deactivate is deprecated) + with conversion.localconverter(default_converter + pandas2ri.converter): + new_data_r = conversion.get_conversion().py2rpy(new_data) if self.__model_class() == "xgb.Booster": predicted_probs = self.r_xgb_predict(new_data_r) diff --git a/validmind/template.py b/validmind/template.py index 8fe191389..d4aec2a0e 100644 --- a/validmind/template.py +++ b/validmind/template.py @@ -190,20 +190,37 @@ def _create_section_html(tree: List[Dict[str, Any]]) -> str: return StatefulHTMLRenderer.render_accordion(accordion_items, accordion_titles) +def _print_section_tree(sections: List[Dict[str, Any]], indent: int = 0) -> None: + """Print a plain-text representation of the template section tree.""" + prefix = " " * indent + for i, section in enumerate(sections): + number = f"{i + 1}." if indent == 0 else "" + print(f"{prefix}{number} {section['title']} ('{section['id']}')") + + for content in section.get("contents", []): + content_type = CONTENT_TYPE_MAP.get( + content["content_type"], content["content_type"] + ) + print(f"{prefix} - [{content_type}] {content['content_id']}") + + if section.get("sections"): + _print_section_tree(section["sections"], indent + 1) + + def preview_template(template: str) -> None: - """Preview a template in Jupyter Notebook. + """Preview a template in Jupyter Notebook or plain text. Args: template (dict): The template to preview. 
""" + section_tree = _convert_sections_to_section_tree(template["sections"]) + if not is_notebook(): - logger.warning("preview_template() only works in Jupyter Notebook") + _print_section_tree(section_tree) return html_content = StatefulHTMLRenderer.get_base_css() - html_content += _create_section_html( - _convert_sections_to_section_tree(template["sections"]) - ) + html_content += _create_section_html(section_tree) display(html_content) diff --git a/validmind/utils.py b/validmind/utils.py index bd1cec418..af136184e 100644 --- a/validmind/utils.py +++ b/validmind/utils.py @@ -359,6 +359,9 @@ def format_number(number): def format_dataframe(df: pd.DataFrame) -> pd.DataFrame: """Format a pandas DataFrame for display purposes.""" + if not is_notebook(): + return df + df = df.style.set_properties(**{"text-align": "left"}).hide(axis="index") return df.set_table_styles([dict(selector="th", props=[("text-align", "left")])]) @@ -509,7 +512,7 @@ def get_dataset_info(dataset): def preview_test_config(config): - """Preview test configuration in a collapsible HTML section. + """Preview test configuration in a collapsible HTML section or plain text. Args: config (dict): Test configuration dictionary. @@ -521,6 +524,10 @@ def preview_test_config(config): logger.error(f"JSON serialization failed: {e}") return + if not is_notebook(): + print(formatted_json) + return + collapsible_html = f"""