From 9fdfd930f6ca90c449e79da4e3696846bf14cab7 Mon Sep 17 00:00:00 2001 From: Andrey Shevchenko Date: Sun, 8 Mar 2026 21:21:17 +0300 Subject: [PATCH 1/3] added Jaccard distance --- src/metrics/distance/jaccard.rs | 101 ++++++++++++++++++++++++++++++++ src/metrics/distance/mod.rs | 7 +++ 2 files changed, 108 insertions(+) create mode 100644 src/metrics/distance/jaccard.rs diff --git a/src/metrics/distance/jaccard.rs b/src/metrics/distance/jaccard.rs new file mode 100644 index 00000000..589f52cf --- /dev/null +++ b/src/metrics/distance/jaccard.rs @@ -0,0 +1,101 @@ +//! # Jaccard Distance +//! +//! Jaccard Distance measures dissimilarity between two integer-valued vectors of the same length. +//! Given two vectors \\( x \in ℝ^n \\), \\( y \in ℝ^n \\) the Jaccard distance between \\( x \\) and \\( y \\) is defined as +//! +//! \\[ d(x, y) = 1 - \frac{|x \cap y|}{|x \cup y|} \\] +//! +//! where \\(|x \cap y|\\) is the number of positions where both vectors are non-zero, +//! and \\(|x \cup y|\\) is the number of positions where at least one of the vectors is non-zero. +//! +//! Example: +//! +//! ``` +//! use smartcore::metrics::distance::Distance; +//! use smartcore::metrics::distance::jaccard::Jaccard; +//! +//! let a = vec![1, 0, 1, 1]; +//! let b = vec![1, 1, 0, 1]; +//! +//! let j: f64 = Jaccard::new().distance(&a, &b); +//! +//! ``` +//! +//! +//! 
+ +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; +use std::marker::PhantomData; + +use super::Distance; +use crate::linalg::basic::arrays::ArrayView1; +use crate::numbers::basenum::Number; + +/// Jaccard distance between two integer-valued vectors +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] +pub struct Jaccard { + _t: PhantomData, +} + +impl Jaccard { + /// instantiate the initial structure + pub fn new() -> Jaccard { + Jaccard { _t: PhantomData } + } +} + +impl Default for Jaccard { + fn default() -> Self { + Self::new() + } +} + +impl> Distance for Jaccard { + fn distance(&self, x: &A, y: &A) -> f64 { + if x.shape() != y.shape() { + panic!("Input vector sizes are different"); + } + + let (intersection, union): (usize, usize) = x + .iterator(0) + .zip(y.iterator(0)) + .map(|(a, b)| { + let a_nz = *a != T::zero(); + let b_nz = *b != T::zero(); + + match (a_nz, b_nz) { + (true, true) => (1, 1), + (true, false) | (false, true) => (0, 1), + (false, false) => (0, 0), + } + }) + .fold((0, 0), |acc, v| (acc.0 + v.0, acc.1 + v.1)); + + if union == 0 { + 0.0 + } else { + 1.0 - intersection as f64 / union as f64 + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[cfg_attr( + all(target_arch = "wasm32", not(target_os = "wasi")), + wasm_bindgen_test::wasm_bindgen_test + )] + #[test] + fn jaccard_distance() { + let a = vec![1, 0, 1, 1]; + let b = vec![1, 1, 0, 1]; + + let j: f64 = Jaccard::new().distance(&a, &b); + + assert!((j - 0.5).abs() < 1e-8); + } +} diff --git a/src/metrics/distance/mod.rs b/src/metrics/distance/mod.rs index 6fdbaa46..f720013e 100644 --- a/src/metrics/distance/mod.rs +++ b/src/metrics/distance/mod.rs @@ -19,6 +19,8 @@ pub mod cosine; pub mod euclidian; /// Hamming Distance between two strings is the number of positions at which the corresponding symbols are different. pub mod hamming; +/// Jaccard distance between two integer-valued vectors. 
+pub mod jaccard; /// The Mahalanobis distance is the distance between two points in multivariate space. pub mod mahalanobis; /// Also known as rectilinear distance, city block distance, taxicab metric. @@ -67,6 +69,11 @@ impl Distances { hamming::Hamming::new() } + /// Jaccard distance, see [`Jaccard`](jaccard/index.html) + pub fn jaccard() -> jaccard::Jaccard { + jaccard::Jaccard::new() + } + /// Mahalanobis distance, see [`Mahalanobis`](mahalanobis/index.html) pub fn mahalanobis, C: Array2 + LUDecomposable>( data: &M, From ece4f28446de178c3d1e9528d0429346259bf179 Mon Sep 17 00:00:00 2001 From: Andrey Shevchenko Date: Tue, 10 Mar 2026 11:48:09 +0300 Subject: [PATCH 2/3] two encounters of a bad pattern is_none() + unwrap(). FIXED. --- src/ensemble/base_forest_regressor.rs | 34 ++++++++++++-------- src/ensemble/random_forest_classifier.rs | 41 ++++++++++++++---------- 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/src/ensemble/base_forest_regressor.rs b/src/ensemble/base_forest_regressor.rs index dc504446..4209034c 100644 --- a/src/ensemble/base_forest_regressor.rs +++ b/src/ensemble/base_forest_regressor.rs @@ -161,25 +161,31 @@ impl, Y: Array1 /// Predict OOB classes for `x`. `x` is expected to be equal to the dataset used in training. 
pub fn predict_oob(&self, x: &X) -> Result { let (n, _) = x.shape(); - if self.samples.is_none() { - Err(Failed::because( - FailedError::PredictFailed, - "Need samples=true for OOB predictions.", - )) - } else if self.samples.as_ref().unwrap()[0].len() != n { - Err(Failed::because( + + let samples = match &self.samples { + Some(s) => s, + None => { + return Err(Failed::because( + FailedError::PredictFailed, + "Need samples=true for OOB predictions.", + )) + } + }; + + if samples[0].len() != n { + return Err(Failed::because( FailedError::PredictFailed, "Prediction matrix must match matrix used in training for OOB predictions.", - )) - } else { - let mut result = Y::zeros(n); + )); + } - for i in 0..n { - result.set(i, self.predict_for_row_oob(x, i)); - } + let mut result = Y::zeros(n); - Ok(result) + for i in 0..n { + result.set(i, self.predict_for_row_oob(x, i)); } + + Ok(result) } fn predict_for_row_oob(&self, x: &X, row: usize) -> TY { diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index dabb2480..f4e8db3c 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -539,27 +539,34 @@ impl, Y: Array1 Result { let (n, _) = x.shape(); - if self.samples.is_none() { - Err(Failed::because( - FailedError::PredictFailed, - "Need samples=true for OOB predictions.", - )) - } else if self.samples.as_ref().unwrap()[0].len() != n { - Err(Failed::because( + + let samples = match &self.samples { + Some(s) => s, + None => { + return Err(Failed::because( + FailedError::PredictFailed, + "Need samples=true for OOB predictions.", + )); + } + }; + + if samples[0].len() != n { + return Err(Failed::because( FailedError::PredictFailed, "Prediction matrix must match matrix used in training for OOB predictions.", - )) - } else { - let mut result = Y::zeros(n); + )); + } - for i in 0..n { - result.set( - i, - self.classes.as_ref().unwrap()[self.predict_for_row_oob(x, i)], - ); - } - Ok(result) + 
let mut result = Y::zeros(n); + + for i in 0..n { + result.set( + i, + self.classes.as_ref().unwrap()[self.predict_for_row_oob(x, i)], + ); } + + Ok(result) } fn predict_for_row_oob(&self, x: &X, row: usize) -> usize { From ff65a0838548e2efed868ff8d3c68081868f5441 Mon Sep 17 00:00:00 2001 From: Andrey Shevchenko Date: Tue, 10 Mar 2026 17:19:20 +0300 Subject: [PATCH 3/3] added 3 tests, incl. symmetry test. Now 4 in total. --- src/metrics/distance/jaccard.rs | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/metrics/distance/jaccard.rs b/src/metrics/distance/jaccard.rs index 589f52cf..4834e2e3 100644 --- a/src/metrics/distance/jaccard.rs +++ b/src/metrics/distance/jaccard.rs @@ -89,6 +89,7 @@ mod tests { all(target_arch = "wasm32", not(target_os = "wasi")), wasm_bindgen_test::wasm_bindgen_test )] + #[test] fn jaccard_distance() { let a = vec![1, 0, 1, 1]; @@ -98,4 +99,38 @@ mod tests { assert!((j - 0.5).abs() < 1e-8); } + + #[test] + fn jaccard_identical_vectors() { + let a = vec![1, 0, 1, 0]; + let b = vec![1, 0, 1, 0]; + + let j: f64 = Jaccard::new().distance(&a, &b); + + assert!((j - 0.0).abs() < 1e-8); + } + + #[test] + fn jaccard_both_zero_vectors() { + let a = vec![0, 0, 0]; + let b = vec![0, 0, 0]; + + let j: f64 = Jaccard::new().distance(&a, &b); + + assert!((j - 0.0).abs() < 1e-8); + } + + #[test] + fn jaccard_symmetry() { + let a = vec![1, 0, 1, 1]; + let b = vec![0, 1, 1, 0]; + + let j = Jaccard::new(); + + let d1 = j.distance(&a, &b); + let d2 = j.distance(&b, &a); + + assert!((d1 - d2).abs() < 1e-12); + } } +