diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index 4c45285..a4ee8ab 100644 --- a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -137,7 +137,7 @@ pub fn norm(arr: List(Float), p: Float) -> Float { /// let assert Ok(tol) = elementary.power(-10.0, -6.0) /// /// // Empty lists returns 0.0 -/// metrics.float_manhatten_distance([], []) +/// metrics.manhatten_distance([], []) /// |> should.equal(Ok(0.0)) /// /// // Differing lengths returns error @@ -567,13 +567,36 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) /// /// /// +/// The Jaccard index measures similarity between two sets of elements. Mathematically, the Jaccard index +/// is defined as: +/// +/// \\[ +/// \text{JI}(X, Y) = \frac{|X \cap Y|}{|X \cup Y|} \in \left[0, 1\right] +/// \\] +/// +/// where: +/// +/// - $$X$$ and $$Y$$ are two sets being compared, +/// - $$|X \cap Y|$$ represents the size of the intersection of the two sets +/// - $$|X \cup Y|$$ denotes the size of the union of the two sets +/// +/// The value of the Jaccard index ranges from 0 to 1, where 0 indicates that the two sets share no elements +/// and 1 indicates that the sets are identical. The Jaccard index is a special case of the +/// [Tversky index](#tversky_index) (with $$\alpha=\beta=1$$). +/// ///
/// Example: /// /// import gleeunit/should /// import gleam_community/maths/metrics +/// import gleam/set /// /// pub fn example () { +/// let xset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) +/// let yset: set.Set(String) = +/// set.from_list(["monkey", "rhino", "ostrich", "salmon"]) +/// metrics.jaccard_index(xset, yset) +/// |> should.equal(1.0 /. 7.0) /// } ///
/// @@ -583,8 +606,8 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) /// /// /// -pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float { - let assert Ok(result) = tversky_index(aset, bset, 1.0, 1.0) +pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float { + let assert Ok(result) = tversky_index(xset, yset, 1.0, 1.0) result } @@ -594,13 +617,36 @@ pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float { /// /// /// +/// The Sørensen-Dice coefficient measures the similarity between two sets of elements. Mathematically, the +/// coefficient is defined as: +/// +/// \\[ +/// \text{DSC}(X, Y) = \frac{2 \times |X \cap Y|}{|X| + |Y|} \in \left[0, 1\right] +/// \\] +/// +/// where: +/// - $$X$$ and $$Y$$ are two sets being compared +/// - $$|X \cap Y|$$ is the size of the intersection of the two sets (i.e., the number of elements common to both sets) +/// - $$|X|$$ and $$|Y|$$ are the sizes of the sets $$X$$ and $$Y$$, respectively +/// +/// The coefficient ranges from 0 to 1, where 0 indicates no similarity (the sets share no elements) and 1 +/// indicates perfect similarity (the sets are identical). The higher the coefficient, the greater the similarity +/// between the two sets. The Sørensen-Dice coefficient is a special case of the +/// [Tversky index](#tversky_index) (with $$\alpha=\beta=0.5$$). +/// ///
/// Example: /// /// import gleeunit/should /// import gleam_community/maths/metrics +/// import gleam/set /// /// pub fn example () { +/// let xset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) +/// let yset: set.Set(String) = +/// set.from_list(["monkey", "rhino", "ostrich", "salmon", "spider"]) +/// metrics.sorensen_dice_coefficient(xset, yset) +/// |> should.equal(2.0 *. 1.0 /. { 4.0 +. 5.0 }) /// } ///
/// @@ -610,8 +656,8 @@ pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float { /// /// /// -pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { - let assert Ok(result) = tversky_index(aset, bset, 0.5, 0.5) +pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { + let assert Ok(result) = tversky_index(xset, yset, 0.5, 0.5) result } @@ -621,15 +667,39 @@ pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { /// /// /// -/// The Tversky index is a generalization of the Sørensen–Dice coefficient and the Jaccard index. +/// The Tversky index is a generalization of the Jaccard index and Sørensen-Dice coefficient, which adds +/// flexibility through two parameters, $$\alpha$$ and $$\beta$$, allowing for asymmetric similarity +/// measures between sets. The Tversky index is defined as: /// +/// \\[ +/// \text{TI}(X, Y) = \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|} +/// \\] +/// +/// where: +/// +/// - $$X$$ and $$Y$$ are the sets being compared +/// - $$|X - Y|$$ and $$|Y - X|$$ are the sizes of the relative complements of $$Y$$ in $$X$$ and $$X$$ in $$Y$$, respectively, +/// - $$\alpha$$ and $$\beta$$ are parameters that weigh the relative importance of the elements unique to $$X$$ and $$Y$$ +/// +/// The Tversky index reduces to the Jaccard index when $$\alpha = \beta = 1$$ and to the Sørensen-Dice +/// coefficient when $$\alpha = \beta = 0.5$$. For non-negative $$\alpha$$ and $$\beta$$, the Tversky index always lies between 0 and 1. +/// The index equals 0 when there is no intersection between the two sets, indicating no similarity. Choosing $$\alpha \neq \beta$$ makes the +/// measure asymmetric in $$X$$ and $$Y$$, but the denominator is never smaller than the numerator, so the upper limit of 1 still holds. +/// ///
/// Example: /// /// import gleeunit/should /// import gleam_community/maths/metrics +/// import gleam/set /// /// pub fn example () { +/// let yset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) +/// let xset: set.Set(String) = +/// set.from_list(["monkey", "rhino", "ostrich", "salmon"]) +/// // Test Jaccard index (alpha = beta = 1) +/// metrics.tversky_index(xset, yset, 1.0, 1.0) +/// |> should.equal(Ok(1.0 /. 7.0)) /// } ///
/// @@ -640,23 +710,23 @@ pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { /// /// pub fn tversky_index( - aset: set.Set(a), - bset: set.Set(a), + xset: set.Set(a), + yset: set.Set(a), alpha: Float, beta: Float, ) -> Result(Float, String) { case alpha >=. 0.0, beta >=. 0.0 { True, True -> { let intersection: Float = - set.intersection(aset, bset) + set.intersection(xset, yset) |> set.size() |> conversion.int_to_float() let difference1: Float = - set.difference(aset, bset) + set.difference(xset, yset) |> set.size() |> conversion.int_to_float() let difference2: Float = - set.difference(bset, aset) + set.difference(yset, xset) |> set.size() |> conversion.int_to_float() intersection @@ -684,14 +754,39 @@ pub fn tversky_index( /// /// /// -/// +/// The Overlap coefficient, also known as the Szymkiewicz–Simpson coefficient, is a measure of +/// similarity between two sets that focuses on the size of the intersection relative to the +/// smaller of the two sets. It is defined mathematically as: +/// +/// \\[ +/// \text{OC}(X, Y) = \frac{|X \cap Y|}{\min(|X|, |Y|)} \in \left[0, 1\right] +/// \\] +/// +/// where: +/// +/// - $$X$$ and $$Y$$ are the sets being compared +/// - $$|X \cap Y|$$ is the size of the intersection of the sets +/// - $$\min(|X|, |Y|)$$ is the size of the smaller set among $$X$$ and $$Y$$ +/// +/// The coefficient ranges from 0 to 1, where 0 indicates no overlap and 1 indicates that the +/// smaller set is a subset of the larger set. This measure is especially useful in situations +/// where the similarity in terms of the proportion of overlap is more relevant than the +/// difference in sizes between the two sets. +/// ///
/// Example: /// /// import gleeunit/should /// import gleam_community/maths/metrics +/// import gleam/set /// /// pub fn example () { +/// let set_a: set.Set(String) = +/// set.from_list(["horse", "dog", "hippo", "monkey", "bird"]) +/// let set_b: set.Set(String) = +/// set.from_list(["monkey", "bird", "ostrich", "salmon"]) +/// metrics.overlap_coefficient(set_a, set_b) +/// |> should.equal(2.0 /. 4.0) /// } ///
/// @@ -701,13 +796,92 @@ pub fn tversky_index( /// /// /// -pub fn overlap_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { +pub fn overlap_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { let intersection: Float = - set.intersection(aset, bset) + set.intersection(xset, yset) |> set.size() |> conversion.int_to_float() let minsize: Float = - piecewise.minimum(set.size(aset), set.size(bset), int.compare) + piecewise.minimum(set.size(xset), set.size(yset), int.compare) |> conversion.int_to_float() intersection /. minsize } + +///
+/// +/// Spot a typo? Open an issue! +/// +///
+/// +/// Calculate the cosine similarity between two lists (representing vectors): +/// +/// \\[ +/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}} +/// \\] +/// +/// In the formula, $n$ is the length of the two lists and $x_i, y_i$ are the values in the respective input lists indexed by $i$. The numerator +/// represents the dot product of the two vectors, while the denominator is the product of the magnitudes (Euclidean norms) of the two vectors. +/// The cosine similarity provides a value between -1 and 1, where 1 means the vectors are in the same direction, -1 means they are in exactly +/// opposite directions, and 0 indicates orthogonality. +/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// // Two orthogonal vectors +/// metrics.cosine_similarity([-1.0, 1.0, 0.0], [1.0, 1.0, -1.0]) +/// |> should.equal(Ok(0.0)) +/// +/// // Two identical (parallel) vectors +/// metrics.cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) +/// |> should.equal(Ok(1.0)) +/// +/// // Two parallel, but oppositely oriented vectors +/// metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0]) +/// |> should.equal(Ok(-1.0)) +/// } +///
+/// +///
+/// +/// Back to top ↑ +/// +///
+/// +pub fn cosine_similarity( + xarr: List(Float), + yarr: List(Float), +) -> Result(Float, String) { + let xlen: Int = list.length(xarr) + let ylen: Int = list.length(yarr) + case xarr, yarr { + [], _ -> + "Invalid input argument: The list xarr is empty." + |> Error + _, [] -> + "Invalid input argument: The list yarr is empty." + |> Error + _, _ -> { + case xlen == ylen { + False -> + "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr)." + |> Error + True -> { + list.fold( + list.zip(xarr, yarr), + 0.0, + fn(acc: Float, a: #(Float, Float)) -> Float { + let result: Float = pair.first(a) *. pair.second(a) + result +. acc + }, + ) + /. { norm(xarr, 2.0) *. norm(yarr, 2.0) } + |> Ok + } + } + } + } +} diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index 2cb5ef1..e2f7307 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -235,6 +235,27 @@ pub fn example_jaccard_index_test() { |> should.equal(1.0 /. 7.0) } +pub fn example_sorensen_dice_coefficient_test() { + metrics.sorensen_dice_coefficient(set.from_list([]), set.from_list([])) + |> should.equal(0.0) + + let set_a: set.Set(Int) = set.from_list([0, 1, 2, 5, 6, 8, 9]) + let set_b: set.Set(Int) = set.from_list([0, 2, 3, 4, 5, 7, 9]) + metrics.sorensen_dice_coefficient(set_a, set_b) + |> should.equal(2.0 *. 4.0 /. { 7.0 +. 7.0 }) + + let set_c: set.Set(Int) = set.from_list([0, 1, 2, 3, 4, 5]) + let set_d: set.Set(Int) = set.from_list([6, 7, 8, 9, 10]) + metrics.sorensen_dice_coefficient(set_c, set_d) + |> should.equal(2.0 *. 0.0 /. { 6.0 +. 5.0 }) + + let set_e: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) + let set_f: set.Set(String) = + set.from_list(["monkey", "rhino", "ostrich", "salmon", "spider"]) + metrics.sorensen_dice_coefficient(set_e, set_f) + |> should.equal(2.0 *. 1.0 /. { 4.0 +. 
5.0 }) +} + pub fn example_overlap_coefficient_test() { metrics.overlap_coefficient(set.from_list([]), set.from_list([])) |> should.equal(0.0) @@ -250,9 +271,39 @@ pub fn example_overlap_coefficient_test() { |> should.equal(0.0 /. 5.0) let set_e: set.Set(String) = - set.from_list(["cat", "dog", "hippo", "monkey", "rhino"]) + set.from_list(["horse", "dog", "hippo", "monkey", "bird"]) let set_f: set.Set(String) = - set.from_list(["monkey", "rhino", "ostrich", "salmon"]) + set.from_list(["monkey", "bird", "ostrich", "salmon"]) metrics.overlap_coefficient(set_e, set_f) |> should.equal(2.0 /. 4.0) } + +pub fn example_cosine_similarity_test() { + // Empty lists returns an error + metrics.cosine_similarity([], []) + |> should.be_error() + + // One empty list returns an error + metrics.cosine_similarity([1.0, 2.0, 3.0], []) + |> should.be_error() + + // One empty list returns an error + metrics.cosine_similarity([], [1.0, 2.0, 3.0]) + |> should.be_error() + + // Different sized lists return an error + metrics.cosine_similarity([1.0, 2.0], [1.0, 2.0, 3.0, 4.0]) + |> should.be_error() + + // Two orthogonal vectors (represented by lists) + metrics.cosine_similarity([-1.0, 1.0, 0.0], [1.0, 1.0, -1.0]) + |> should.equal(Ok(0.0)) + + // Two identical (parallel) vectors (represented by lists) + metrics.cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) + |> should.equal(Ok(1.0)) + + // Two parallel, but oppositely oriented vectors (represented by lists) + metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0]) + |> should.equal(Ok(-1.0)) +}