From 72d0500dc4584698e587ecf5c16e48046669b3ed Mon Sep 17 00:00:00 2001 From: Eduardo Farinati Date: Sat, 16 Mar 2024 12:56:23 -0300 Subject: [PATCH 01/16] fix: small typo on incomplete_gamma error --- src/gleam_community/maths/special.gleam | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gleam_community/maths/special.gleam b/src/gleam_community/maths/special.gleam index 12e2f87..318511d 100644 --- a/src/gleam_community/maths/special.gleam +++ b/src/gleam_community/maths/special.gleam @@ -174,7 +174,7 @@ pub fn incomplete_gamma(a: Float, x: Float) -> Result(Float, String) { } False -> - "Invlaid input argument: a <= 0 or x < 0. Valid input is a > 0 and x >= 0." + "Invalid input argument: a <= 0 or x < 0. Valid input is a > 0 and x >= 0." |> Error } } From d7b4841c42660c098ab860762fbce2e3706b44b0 Mon Sep 17 00:00:00 2001 From: NicklasXYZ Date: Sun, 17 Mar 2024 00:03:50 +0100 Subject: [PATCH 02/16] Add set similarity measures --- src/gleam_community/maths/metrics.gleam | 129 +++++++++++++++++- test/gleam_community/maths/metrics_test.gleam | 22 +++ 2 files changed, 147 insertions(+), 4 deletions(-) diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index e35e336..cd9bfa6 100644 --- a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -30,6 +30,9 @@ //// * [`manhatten_distance`](#float_manhatten_distance) //// * [`minkowski_distance`](#minkowski_distance) //// * [`euclidean_distance`](#euclidean_distance) +//// * [`jaccard_index`](#jaccard_index) +//// * [`sorensen_dice_coefficient`](#sorensen_dice_coefficient) +//// * [`tversky_index`](#tversky_index) //// * **Basic statistical measures** //// * [`mean`](#mean) //// * [`median`](#median) @@ -44,6 +47,7 @@ import gleam_community/maths/predicates import gleam_community/maths/conversion import gleam/list import gleam/pair +import gleam/set import gleam/float ///
@@ -292,7 +296,7 @@ pub fn euclidean_distance( } ///
-/// +/// /// Spot a typo? Open an issue! /// ///
@@ -347,7 +351,7 @@ pub fn mean(arr: List(Float)) -> Result(Float, String) { } ///
-/// +/// /// Spot a typo? Open an issue! /// ///
@@ -414,7 +418,7 @@ pub fn median(arr: List(Float)) -> Result(Float, String) { } ///
-/// +/// /// Spot a typo? Open an issue! /// ///
@@ -490,7 +494,7 @@ pub fn variance(arr: List(Float), ddof: Int) -> Result(Float, String) { } ///
-/// +/// /// Spot a typo? Open an issue! /// ///
@@ -555,3 +559,120 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) } } } + +///
+/// +/// Spot a typo? Open an issue! +/// +///
+/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// } +///
+/// +///
+/// +/// Back to top ↑ +/// +///
+/// +pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float { + let assert Ok(result) = tversky_index(aset, bset, 1.0, 1.0) + result +} + +///
+/// +/// Spot a typo? Open an issue! +/// +///
+/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// } +///
+/// +///
+/// +/// Back to top ↑ +/// +///
+/// +pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { + let assert Ok(result) = tversky_index(aset, bset, 0.5, 0.5) + result +} + +///
+/// +/// Spot a typo? Open an issue! +/// +///
+/// +/// The Tversky index is a generalization of the Sørensen–Dice coefficient and the Jaccard index. +/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// } +///
+/// +///
+/// +/// Back to top ↑ +/// +///
+/// +pub fn tversky_index( + aset: set.Set(a), + bset: set.Set(a), + alpha: Float, + beta: Float, +) -> Result(Float, String) { + case alpha >=. 0.0, beta >=. 0.0 { + True, True -> { + let intersection: Float = + set.intersection(aset, bset) + |> set.size() + |> conversion.int_to_float() + let difference1: Float = + set.difference(aset, bset) + |> set.size() + |> conversion.int_to_float() + let difference2: Float = + set.difference(bset, aset) + |> set.size() + |> conversion.int_to_float() + intersection + /. { intersection +. alpha *. difference1 +. beta *. difference2 } + |> Ok + } + False, True -> { + "Invalid input argument: alpha < 0. Valid input is alpha >= 0." + |> Error + } + True, False -> { + "Invalid input argument: beta < 0. Valid input is beta >= 0." + |> Error + } + _, _ -> { + "Invalid input argument: alpha < 0 and beta < 0. Valid input is alpha >= 0 and beta >= 0." + |> Error + } + } +} diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index 8e407e6..cbd8d5e 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -2,6 +2,7 @@ import gleam_community/maths/elementary import gleam_community/maths/metrics import gleam_community/maths/predicates import gleeunit/should +import gleam/set pub fn float_list_norm_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) @@ -212,3 +213,24 @@ pub fn example_standard_deviation_test() { |> metrics.standard_deviation(ddof) |> should.equal(Ok(1.0)) } + +pub fn example_jaccard_index_test() { + metrics.jaccard_index(set.from_list([]), set.from_list([])) + |> should.equal(0.0) + + let set_a: set.Set(Int) = set.from_list([0, 1, 2, 5, 6, 8, 9]) + let set_b: set.Set(Int) = set.from_list([0, 2, 3, 4, 5, 7, 9]) + metrics.jaccard_index(set_a, set_b) + |> should.equal(4.0 /. 
10.0) + + let set_c: set.Set(Int) = set.from_list([0, 1, 2, 3, 4, 5]) + let set_d: set.Set(Int) = set.from_list([6, 7, 8, 9, 10]) + metrics.jaccard_index(set_c, set_d) + |> should.equal(0.0 /. 11.0) + + let set_e: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) + let set_f: set.Set(String) = + set.from_list(["monkey", "rhino", "ostrich", "salmon"]) + metrics.jaccard_index(set_e, set_f) + |> should.equal(1.0 /. 7.0) +} From c825bb522f6a3228107e254dd8149deb92c4d6f3 Mon Sep 17 00:00:00 2001 From: NicklasXYZ Date: Sun, 17 Mar 2024 00:18:45 +0100 Subject: [PATCH 03/16] Add overlap coefficient --- src/gleam_community/maths/metrics.gleam | 35 +++++++++++++++++++ test/gleam_community/maths/metrics_test.gleam | 22 ++++++++++++ 2 files changed, 57 insertions(+) diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index cd9bfa6..4c45285 100644 --- a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -49,6 +49,7 @@ import gleam/list import gleam/pair import gleam/set import gleam/float +import gleam/int ///
/// @@ -676,3 +677,37 @@ pub fn tversky_index( } } } + +/// +/// +/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// } +///
+/// +/// +/// +pub fn overlap_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { + let intersection: Float = + set.intersection(aset, bset) + |> set.size() + |> conversion.int_to_float() + let minsize: Float = + piecewise.minimum(set.size(aset), set.size(bset), int.compare) + |> conversion.int_to_float() + intersection /. minsize +} diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index cbd8d5e..2cb5ef1 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -234,3 +234,25 @@ pub fn example_jaccard_index_test() { metrics.jaccard_index(set_e, set_f) |> should.equal(1.0 /. 7.0) } + +pub fn example_overlap_coefficient_test() { + metrics.overlap_coefficient(set.from_list([]), set.from_list([])) + |> should.equal(0.0) + + let set_a: set.Set(Int) = set.from_list([0, 1, 2, 5, 6, 8, 9]) + let set_b: set.Set(Int) = set.from_list([0, 2, 3, 4, 5, 7, 9]) + metrics.overlap_coefficient(set_a, set_b) + |> should.equal(4.0 /. 7.0) + + let set_c: set.Set(Int) = set.from_list([0, 1, 2, 3, 4, 5]) + let set_d: set.Set(Int) = set.from_list([6, 7, 8, 9, 10]) + metrics.overlap_coefficient(set_c, set_d) + |> should.equal(0.0 /. 5.0) + + let set_e: set.Set(String) = + set.from_list(["cat", "dog", "hippo", "monkey", "rhino"]) + let set_f: set.Set(String) = + set.from_list(["monkey", "rhino", "ostrich", "salmon"]) + metrics.overlap_coefficient(set_e, set_f) + |> should.equal(2.0 /. 
4.0) +} From 24e496a4a844a3386b94f4a672340b44bdf1f617 Mon Sep 17 00:00:00 2001 From: NicklasXYZ Date: Tue, 19 Mar 2024 15:04:44 +0100 Subject: [PATCH 04/16] Add new distance & similarity measures --- src/gleam_community/maths/metrics.gleam | 204 ++++++++++++++++-- test/gleam_community/maths/metrics_test.gleam | 55 ++++- 2 files changed, 242 insertions(+), 17 deletions(-) diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index 4c45285..a4ee8ab 100644 --- a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -137,7 +137,7 @@ pub fn norm(arr: List(Float), p: Float) -> Float { /// let assert Ok(tol) = elementary.power(-10.0, -6.0) /// /// // Empty lists returns 0.0 -/// metrics.float_manhatten_distance([], []) +/// metrics.manhatten_distance([], []) /// |> should.equal(Ok(0.0)) /// /// // Differing lengths returns error @@ -567,13 +567,36 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) /// ///
/// +/// The Jaccard index measures similarity between two sets of elements. Mathematically, the Jaccard index +/// is defined as: +/// +/// \\[ +/// \text{JI}(X, Y) = \frac{|X \cap Y|}{|X \cup Y|} \in \left[0, 1\right] +/// \\] +/// +/// where: +/// +/// - $$X$$ and $$Y$$ are two sets being compared, +/// - $$|X \cap Y|$$ represents the size of the intersection of the two sets +/// - $$|X \cup Y|$$ denotes the size of the union of the two sets +/// +/// The value of the Jaccard index ranges from 0 to 1, where 0 indicates that the two sets share no elements +/// and 1 indicates that the sets are identical. The Jaccard index is a special case of the +/// [Tversky index](#tversky_index) (with $$\alpha=\beta=1$$). +/// ///
/// Example: /// /// import gleeunit/should /// import gleam_community/maths/metrics +/// import gleam/set /// /// pub fn example () { +/// let xset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) +/// let yset: set.Set(String) = +/// set.from_list(["monkey", "rhino", "ostrich", "salmon"]) +/// metrics.jaccard_index(xset, yset) +/// |> should.equal(1.0 /. 7.0) /// } ///
/// @@ -583,8 +606,8 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) /// ///
/// -pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float { - let assert Ok(result) = tversky_index(aset, bset, 1.0, 1.0) +pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float { + let assert Ok(result) = tversky_index(xset, yset, 1.0, 1.0) result } @@ -594,13 +617,36 @@ pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float { /// /// /// +/// The Sørensen-Dice coefficient measures the similarity between two sets of elements. Mathematically, the +/// coefficient is defined as: +/// +/// \\[ +/// \text{DSC}(X, Y) = \frac{2 \times |X \cap Y|}{|X| + |Y|} \in \left[0, 1\right] +/// \\] +/// +/// where: +/// - $$X$$ and $$Y$$ are two sets being compared +/// - $$|X \cap Y|$$ is the size of the intersection of the two sets (i.e., the number of elements common to both sets) +/// - $$|X|$$ and $$|Y|$$ are the sizes of the sets $$X$$ and $$Y$$, respectively +/// +/// The coefficient ranges from 0 to 1, where 0 indicates no similarity (the sets share no elements) and 1 +/// indicates perfect similarity (the sets are identical). The higher the coefficient, the greater the similarity +/// between the two sets. The Sørensen-Dice coefficient is a special case of the +/// [Tversky index](#tversky_index) (with $$\alpha=\beta=0.5$$). +/// ///
/// Example: /// /// import gleeunit/should /// import gleam_community/maths/metrics +/// import gleam/set /// /// pub fn example () { +/// let xset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) +/// let yset: set.Set(String) = +/// set.from_list(["monkey", "rhino", "ostrich", "salmon", "spider"]) +/// metrics.sorensen_dice_coefficient(xset, yset) +/// |> should.equal(2.0 *. 1.0 /. { 4.0 +. 5.0 }) /// } ///
/// @@ -610,8 +656,8 @@ pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float { /// /// /// -pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { - let assert Ok(result) = tversky_index(aset, bset, 0.5, 0.5) +pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { + let assert Ok(result) = tversky_index(xset, yset, 0.5, 0.5) result } @@ -621,15 +667,39 @@ pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { /// /// /// -/// The Tversky index is a generalization of the Sørensen–Dice coefficient and the Jaccard index. +/// The Tversky index is a generalization of the Jaccard index and Sørensen-Dice coefficient, which adds +/// flexibility through two parameters, $$\alpha$$ and $$\beta$$, allowing for asymmetric similarity +/// measures between sets. The Tversky index is defined as: /// +/// \\[ +/// \text{TI}(X, Y) = \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|} +/// \\] +/// +/// where: +/// +/// - $$X$$ and $$Y$$ are the sets being compared +/// - $$|X - Y|$$ and $$|Y - X|$$ are the sizes of the relative complements of $$Y$$ in $$X$$ and $$X$$ in $$Y$$, respectively, +/// - $$\alpha$$ and $$\beta$$ are parameters that weigh the relative importance of the elements unique to $$X$$ and $$Y$$ +/// +/// The Tversky index reduces to the Jaccard index when $$\alpha = \beta = 1$$ and to the Sørensen-Dice +/// coefficient when $$\alpha = \beta = 0.5$$. For non-negative $$\alpha$$ and $$\beta$$, the Tversky index lies between 0 and 1. +/// The index equals 0 when there is no intersection between the two sets, indicating no similarity. However, unlike the Jaccard +/// index and the Sørensen-Dice coefficient, the Tversky index is not symmetric in $$X$$ and $$Y$$ when $$\alpha \neq \beta$$. +/// ///
/// Example: /// /// import gleeunit/should /// import gleam_community/maths/metrics +/// import gleam/set /// /// pub fn example () { +/// let yset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) +/// let xset: set.Set(String) = +/// set.from_list(["monkey", "rhino", "ostrich", "salmon"]) +/// // Test Jaccard index (alpha = beta = 1) +/// metrics.tversky_index(xset, yset, 1.0, 1.0) +/// |> should.equal(1.0 /. 7.0) /// } ///
/// @@ -640,23 +710,23 @@ pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { /// /// pub fn tversky_index( - aset: set.Set(a), - bset: set.Set(a), + xset: set.Set(a), + yset: set.Set(a), alpha: Float, beta: Float, ) -> Result(Float, String) { case alpha >=. 0.0, beta >=. 0.0 { True, True -> { let intersection: Float = - set.intersection(aset, bset) + set.intersection(xset, yset) |> set.size() |> conversion.int_to_float() let difference1: Float = - set.difference(aset, bset) + set.difference(xset, yset) |> set.size() |> conversion.int_to_float() let difference2: Float = - set.difference(bset, aset) + set.difference(yset, xset) |> set.size() |> conversion.int_to_float() intersection @@ -684,14 +754,39 @@ pub fn tversky_index( /// /// /// -/// +/// The Overlap coefficient, also known as the Szymkiewicz–Simpson coefficient, is a measure of +/// similarity between two sets that focuses on the size of the intersection relative to the +/// smaller of the two sets. It is defined mathematically as: +/// +/// \\[ +/// \text{OC}(X, Y) = \frac{|X \cap Y|}{\min(|X|, |Y|)} \in \left[0, 1\right] +/// \\] +/// +/// where: +/// +/// - $$X$$ and $$Y$$ are the sets being compared +/// - $$|X \cap Y|$$ is the size of the intersection of the sets +/// - $$\min(|X|, |Y|)$$ is the size of the smaller set among $$X$$ and $$Y$$ +/// +/// The coefficient ranges from 0 to 1, where 0 indicates no overlap and 1 indicates that the +/// smaller set is a subset of the larger set. This measure is especially useful in situations +/// where the similarity in terms of the proportion of overlap is more relevant than the +/// difference in sizes between the two sets. +/// ///
/// Example: /// /// import gleeunit/should /// import gleam_community/maths/metrics +/// import gleam/set /// /// pub fn example () { +/// let set_a: set.Set(String) = +/// set.from_list(["horse", "dog", "hippo", "monkey", "bird"]) +/// let set_b: set.Set(String) = +/// set.from_list(["monkey", "bird", "ostrich", "salmon"]) +/// metrics.overlap_coefficient(set_a, set_b) +/// |> should.equal(2.0 /. 4.0) /// } ///
/// @@ -701,13 +796,92 @@ pub fn tversky_index( /// /// /// -pub fn overlap_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { +pub fn overlap_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { let intersection: Float = - set.intersection(aset, bset) + set.intersection(xset, yset) |> set.size() |> conversion.int_to_float() let minsize: Float = - piecewise.minimum(set.size(aset), set.size(bset), int.compare) + piecewise.minimum(set.size(xset), set.size(yset), int.compare) |> conversion.int_to_float() intersection /. minsize } + +///
+/// +/// Spot a typo? Open an issue! +/// +///
+/// +/// Calculate the cosine similarity between two lists (representing vectors): +/// +/// \\[ +/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}} +/// \\] +/// +/// In the formula, $n$ is the length of the two lists and $x_i, y_i$ are the values in the respective input lists indexed by $i$. The numerator +/// represents the dot product of the two vectors, while the denominator is the product of the magnitudes (Euclidean norms) of the two vectors. +/// The cosine similarity provides a value between -1 and 1, where 1 means the vectors are in the same direction, -1 means they are in exactly +/// opposite directions, and 0 indicates orthogonality. +/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// // Two orthogonal vectors +/// metrics.cosine_similarity([-1.0, 1.0, 0.0], [1.0, 1.0, -1.0]) +/// |> should.equal(Ok(0.0)) +/// +/// // Two identical (parallel) vectors +/// metrics.cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) +/// |> should.equal(Ok(1.0)) +/// +/// // Two parallel, but oppositely oriented vectors +/// metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0]) +/// |> should.equal(Ok(-1.0)) +/// } +///
+/// +///
+/// +/// Back to top ↑ +/// +///
+/// +pub fn cosine_similarity( + xarr: List(Float), + yarr: List(Float), +) -> Result(Float, String) { + let xlen: Int = list.length(xarr) + let ylen: Int = list.length(yarr) + case xarr, yarr { + [], _ -> + "Invalid input argument: The list xarr is empty." + |> Error + _, [] -> + "Invalid input argument: The list yarr is empty." + |> Error + _, _ -> { + case xlen == ylen { + False -> + "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr)." + |> Error + True -> { + list.fold( + list.zip(xarr, yarr), + 0.0, + fn(acc: Float, a: #(Float, Float)) -> Float { + let result: Float = pair.first(a) *. pair.second(a) + result +. acc + }, + ) + /. { norm(xarr, 2.0) *. norm(yarr, 2.0) } + |> Ok + } + } + } + } +} diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index 2cb5ef1..e2f7307 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -235,6 +235,27 @@ pub fn example_jaccard_index_test() { |> should.equal(1.0 /. 7.0) } +pub fn example_sorensen_dice_coefficient_test() { + metrics.sorensen_dice_coefficient(set.from_list([]), set.from_list([])) + |> should.equal(0.0) + + let set_a: set.Set(Int) = set.from_list([0, 1, 2, 5, 6, 8, 9]) + let set_b: set.Set(Int) = set.from_list([0, 2, 3, 4, 5, 7, 9]) + metrics.sorensen_dice_coefficient(set_a, set_b) + |> should.equal(2.0 *. 4.0 /. { 7.0 +. 7.0 }) + + let set_c: set.Set(Int) = set.from_list([0, 1, 2, 3, 4, 5]) + let set_d: set.Set(Int) = set.from_list([6, 7, 8, 9, 10]) + metrics.sorensen_dice_coefficient(set_c, set_d) + |> should.equal(2.0 *. 0.0 /. { 6.0 +. 5.0 }) + + let set_e: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) + let set_f: set.Set(String) = + set.from_list(["monkey", "rhino", "ostrich", "salmon", "spider"]) + metrics.sorensen_dice_coefficient(set_e, set_f) + |> should.equal(2.0 *. 1.0 /. { 4.0 +. 
5.0 }) +} + pub fn example_overlap_coefficient_test() { metrics.overlap_coefficient(set.from_list([]), set.from_list([])) |> should.equal(0.0) @@ -250,9 +271,39 @@ pub fn example_overlap_coefficient_test() { |> should.equal(0.0 /. 5.0) let set_e: set.Set(String) = - set.from_list(["cat", "dog", "hippo", "monkey", "rhino"]) + set.from_list(["horse", "dog", "hippo", "monkey", "bird"]) let set_f: set.Set(String) = - set.from_list(["monkey", "rhino", "ostrich", "salmon"]) + set.from_list(["monkey", "bird", "ostrich", "salmon"]) metrics.overlap_coefficient(set_e, set_f) |> should.equal(2.0 /. 4.0) } + +pub fn example_cosine_similarity_test() { + // Empty lists returns an error + metrics.cosine_similarity([], []) + |> should.be_error() + + // One empty list returns an error + metrics.cosine_similarity([1.0, 2.0, 3.0], []) + |> should.be_error() + + // One empty list returns an error + metrics.cosine_similarity([], [1.0, 2.0, 3.0]) + |> should.be_error() + + // Different sized lists returns an error + metrics.cosine_similarity([1.0, 2.0], [1.0, 2.0, 3.0, 4.0]) + |> should.be_error() + + // Two orthogonal vectors (represented by lists) + metrics.cosine_similarity([-1.0, 1.0, 0.0], [1.0, 1.0, -1.0]) + |> should.equal(Ok(0.0)) + + // Two identical (parallel) vectors (represented by lists) + metrics.cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) + |> should.equal(Ok(1.0)) + + // Two parallel, but oppositely oriented vectors (represented by lists) + metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0]) + |> should.equal(Ok(-1.0)) +} From fc4a4a8b0994534b0533c162efc0028a6f63da10 Mon Sep 17 00:00:00 2001 From: NicklasXYZ Date: Tue, 19 Mar 2024 15:18:07 +0100 Subject: [PATCH 05/16] Fix typos --- src/gleam_community/maths/metrics.gleam | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index a4ee8ab..772b80d 100644 --- 
a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -25,14 +25,17 @@ //// //// Metrics: A module offering functions for calculating distances and other types of metrics. //// -//// * **Distances** +//// * **Distance measures** //// * [`norm`](#norm) -//// * [`manhatten_distance`](#float_manhatten_distance) +//// * [`manhatten_distance`](#manhatten_distance) //// * [`minkowski_distance`](#minkowski_distance) //// * [`euclidean_distance`](#euclidean_distance) +//// * [`cosine_similarity`](#cosine_similarity) +//// * **Set & string similarity measures** //// * [`jaccard_index`](#jaccard_index) //// * [`sorensen_dice_coefficient`](#sorensen_dice_coefficient) //// * [`tversky_index`](#tversky_index) +//// * [`overlap_coefficient`](#overlap_coefficient) //// * **Basic statistical measures** //// * [`mean`](#mean) //// * [`median`](#median) @@ -571,7 +574,7 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) /// is defined as: /// /// \\[ -/// \text{JI}(X, Y) = \frac{|X \cap Y|}{|X \cup Y|} \in \left[0, 1\right] +/// \frac{|X \cap Y|}{|X \cup Y|} \\; \in \\; \left[0, 1\right] /// \\] /// /// where: @@ -621,7 +624,7 @@ pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float { /// coefficient is defined as: /// /// \\[ -/// \text{DSC}(X, Y) = \frac{2 \times |X \cap Y|}{|X| + |Y|} \in \left[0, 1\right] +/// \frac{2 |X \cap Y|}{|X| + |Y|} \\; \in \\; \left[0, 1\right] /// \\] /// /// where: @@ -672,7 +675,7 @@ pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { /// measures between sets. The Tversky index is defined as: /// /// \\[ -/// \text{TI}(X, Y) = \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|} +/// \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|} /// \\] /// /// where: @@ -759,7 +762,7 @@ pub fn tversky_index( /// smaller of the two sets. 
It is defined mathematically as: /// /// \\[ -/// \text{OC}(X, Y) = \frac{|X \cap Y|}{\min(|X|, |Y|)} \in \left[0, 1\right] +/// \frac{|X \cap Y|}{\min(|X|, |Y|)} \\; \in \\; \left[0, 1\right] /// \\] /// /// where: @@ -816,13 +819,13 @@ pub fn overlap_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { /// Calculate the cosine similarity between two lists (representing vectors): /// /// \\[ -/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}} +/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}} \\; \in \\; \left[-1, 1\right] /// \\] /// -/// In the formula, $n$ is the length of the two lists and $x_i, y_i$ are the values in the respective input lists indexed by $i$. The numerator -/// represents the dot product of the two vectors, while the denominator is the product of the magnitudes (Euclidean norms) of the two vectors. -/// The cosine similarity provides a value between -1 and 1, where 1 means the vectors are in the same direction, -1 means they are in exactly -/// opposite directions, and 0 indicates orthogonality. +/// In the formula, $$n$$ is the length of the two lists and $$x_i$$, $$y_i$$ are the values in the respective input lists indexed by $$i$$. +/// The numerator represents the dot product of the two vectors, while the denominator is the product of the magnitudes (Euclidean norms) of +/// the two vectors. The cosine similarity provides a value between -1 and 1, where 1 means the vectors are in the same direction, -1 means +/// they are in exactly opposite directions, and 0 indicates orthogonality. /// ///
/// Example: From 587a47acde4cfb42e51a8766c44b4eae624e211e Mon Sep 17 00:00:00 2001 From: NicklasXYZ Date: Wed, 20 Mar 2024 22:00:39 +0100 Subject: [PATCH 06/16] Add is_prime function + doc improvements --- src/gleam_community/maths/predicates.gleam | 127 +++++++++++++++--- .../maths/predicates_test.gleam | 34 +++++ 2 files changed, 143 insertions(+), 18 deletions(-) diff --git a/src/gleam_community/maths/predicates.gleam b/src/gleam_community/maths/predicates.gleam index f8d357c..ca81d33 100644 --- a/src/gleam_community/maths/predicates.gleam +++ b/src/gleam_community/maths/predicates.gleam @@ -61,7 +61,7 @@ import gleam_community/maths/arithmetics /// Example /// /// import gleeunit/should -/// import gleam_community/maths/tests +/// import gleam_community/maths/predicates /// /// pub fn example () { /// let val: Float = 99. @@ -115,7 +115,7 @@ fn float_absolute_difference(a: Float, b: Float) -> Float { /// /// import gleeunit/should /// import gleam/list -/// import gleam_community/maths/tests +/// import gleam_community/maths/predicates /// /// pub fn example () { /// let val: Float = 99. @@ -126,7 +126,7 @@ fn float_absolute_difference(a: Float, b: Float) -> Float { /// // if 'val' is within 1 percent of 'ref_val' +/- 0.1 /// let rtol: Float = 0.01 /// let atol: Float = 0.10 -/// tests.all_close(xarr, yarr, rtol, atol) +/// predicates.all_close(xarr, yarr, rtol, atol) /// |> fn(zarr: Result(List(Bool), String)) -> Result(Bool, Nil) { /// case zarr { /// Ok(arr) -> @@ -181,13 +181,13 @@ pub fn all_close( /// Example /// /// import gleeunit/should -/// import gleam_community/maths/tests +/// import gleam_community/maths/predicates /// /// pub fn example () { -/// tests.is_fractional(0.3333) +/// predicates.is_fractional(0.3333) /// |> should.equal(True) /// -/// tests.is_fractional(1.0) +/// predicates.is_fractional(1.0) /// |> should.equal(False) /// } ///
@@ -218,15 +218,15 @@ fn do_ceiling(a: Float) -> Float /// Example: /// /// import gleeunit/should -/// import gleam_community/maths/tests +/// import gleam_community/maths/predicates /// /// pub fn example() { /// // Check if 4 is a power of 2 (it is) -/// tests.is_power(4, 2) +/// predicates.is_power(4, 2) /// |> should.equal(True) /// /// // Check if 5 is a power of 2 (it is not) -/// tests.is_power(5, 2) +/// predicates.is_power(5, 2) /// |> should.equal(False) /// } /// @@ -266,13 +266,13 @@ pub fn is_power(x: Int, y: Int) -> Bool { /// Example: /// /// import gleeunit/should -/// import gleam_community/maths/tests +/// import gleam_community/maths/predicates /// /// pub fn example() { -/// tests.is_perfect(6) +/// predicates.is_perfect(6) /// |> should.equal(True) /// -/// tests.is_perfect(28) +/// predicates.is_perfect(28) /// |> should.equal(True) /// } /// @@ -308,13 +308,13 @@ fn do_sum(arr: List(Int)) -> Int { /// Example: /// /// import gleeunit/should -/// import gleam_community/maths/tests +/// import gleam_community/maths/predicates /// /// pub fn example() { -/// tests.is_even(-3) +/// predicates.is_even(-3) /// |> should.equal(False) /// -/// tests.is_even(-4) +/// predicates.is_even(-4) /// |> should.equal(True) /// } /// @@ -341,13 +341,13 @@ pub fn is_even(x: Int) -> Bool { /// Example: /// /// import gleeunit/should -/// import gleam_community/maths/tests +/// import gleam_community/maths/predicates /// /// pub fn example() { -/// tests.is_odd(-3) +/// predicates.is_odd(-3) /// |> should.equal(True) /// -/// tests.is_odd(-4) +/// predicates.is_odd(-4) /// |> should.equal(False) /// } /// @@ -361,3 +361,94 @@ pub fn is_even(x: Int) -> Bool { pub fn is_odd(x: Int) -> Bool { x % 2 != 0 } + +///
+/// +/// Spot a typo? Open an issue! +/// +///
+/// +/// A function that tests whether a given integer value $$x \in \mathbb{Z}$$ is a prime number. +/// A prime number is a natural number greater than 1 that has no positive divisors other than 1 and itself. +/// +/// The function uses the Miller-Rabin primality test to assess if $$x$$ is prime. It is a probabilistic +/// test, so it can mistakenly identify a composite number as prime. However, the probability of such errors decreases +/// with more testing iterations (the function uses 64 iterations internally, which is typically more than sufficient). +/// The Miller-Rabin test is particularly useful for large numbers. +/// +///
+/// Details +/// +/// Examples of prime numbers: +/// - $$2$$ is a prime number since it has only two divisors: $$1$$ and $$2$$. +/// - $$7$$ is a prime number since it has only two divisors: $$1$$ and $$7$$. +/// - $$4$$ is not a prime number since it has divisors other than $$1$$ and itself, such as $$2$$. +/// +///
+/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/predicates +/// +/// pub fn example() { +/// predicates.is_prime(2) +/// |> should.equal(True) +/// +/// predicates.is_prime(4) +/// |> should.equal(False) +/// +/// // Test the 2nd Carmichael number +/// predicates.is_prime(1105) +/// |> should.equal(False) +/// } +///
+/// +///
+/// +/// Back to top ↑ +/// +///
/// +pub fn is_prime(x: Int) -> Bool { + case x { + x if x < 2 -> { + False + } + x if x == 2 -> { + True + } + _ -> { + miller_rabin_test(x, 64) + } + } +} + +fn miller_rabin_test(n: Int, k: Int) -> Bool { + case n, k { + _, 0 -> True + _, _ -> { + // Generate a random int in the range [2, n - 1] + let random_candidate: Int = 2 + int.random(n - 2) + case powmod_with_check(random_candidate, n - 1, n) == 1 { + True -> miller_rabin_test(n, k - 1) + False -> False + } + } + } +} + +fn powmod_with_check(base: Int, exponent: Int, modulus: Int) -> Int { + case exponent, { exponent % 2 } == 0 { + 0, _ -> 1 + _, True -> { + let x: Int = powmod_with_check(base, exponent / 2, modulus) + case { x * x } % modulus, x != 1 && x != { modulus - 1 } { + 1, True -> 0 + _, _ -> { x * x } % modulus + } + } + _, _ -> { base * powmod_with_check(base, exponent - 1, modulus) } % modulus + } +} diff --git a/test/gleam_community/maths/predicates_test.gleam b/test/gleam_community/maths/predicates_test.gleam index a7a13d8..4130aab 100644 --- a/test/gleam_community/maths/predicates_test.gleam +++ b/test/gleam_community/maths/predicates_test.gleam @@ -136,3 +136,37 @@ pub fn int_is_perfect_test() { predicates.is_perfect(13) |> should.equal(False) } + +pub fn int_is_prime_test() { + predicates.is_prime(1) + |> should.equal(False) + + predicates.is_prime(2) + |> should.equal(True) + + predicates.is_prime(3) + |> should.equal(True) + + predicates.is_prime(5) + |> should.equal(True) + + predicates.is_prime(7) + |> should.equal(True) + + predicates.is_prime(11) + |> should.equal(True) + + predicates.is_prime(42) + |> should.equal(False) + + predicates.is_prime(7919) + |> should.equal(True) + + // Test 1st Carmichael number + predicates.is_prime(561) + |> should.equal(False) + + // Test 2nd Carmichael number + predicates.is_prime(1105) + |> should.equal(False) +} From 5ac794dac84ab902d8b5e9e84e25f38a639cb1e6 Mon Sep 17 00:00:00 2001 From: NicklasXYZ Date: Wed, 20 Mar 2024 22:40:30 +0100 Subject: [PATCH 
07/16] fix docs + add another test case --- src/gleam_community/maths/predicates.gleam | 1 + test/gleam_community/maths/predicates_test.gleam | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/src/gleam_community/maths/predicates.gleam b/src/gleam_community/maths/predicates.gleam index ca81d33..c5d5f01 100644 --- a/src/gleam_community/maths/predicates.gleam +++ b/src/gleam_community/maths/predicates.gleam @@ -33,6 +33,7 @@ //// * [`is_perfect`](#is_perfect) //// * [`is_even`](#is_even) //// * [`is_odd`](#is_odd) +//// * [`is_prime`](#is_prime) import gleam/pair import gleam/int diff --git a/test/gleam_community/maths/predicates_test.gleam b/test/gleam_community/maths/predicates_test.gleam index 4130aab..b2e6d57 100644 --- a/test/gleam_community/maths/predicates_test.gleam +++ b/test/gleam_community/maths/predicates_test.gleam @@ -138,6 +138,10 @@ pub fn int_is_perfect_test() { } pub fn int_is_prime_test() { + // Test a negative integer, i.e., not a natural number + predicates.is_prime(-7) + |> should.equal(False) + predicates.is_prime(1) |> should.equal(False) From ac3d23733da2ccc3f1a3a219c747477018c33892 Mon Sep 17 00:00:00 2001 From: NicklasXYZ <18580183+NicklasXYZ@users.noreply.github.com> Date: Thu, 4 Apr 2024 18:49:40 +0200 Subject: [PATCH 08/16] Fix typos. 
Add metric: Chebyshev distance --- src/gleam_community/maths/arithmetics.gleam | 16 +- src/gleam_community/maths/metrics.gleam | 154 ++++++++++++++---- test/gleam_community/maths/metrics_test.gleam | 45 ++++- 3 files changed, 166 insertions(+), 49 deletions(-) diff --git a/src/gleam_community/maths/arithmetics.gleam b/src/gleam_community/maths/arithmetics.gleam index a238abc..dbd3ffe 100644 --- a/src/gleam_community/maths/arithmetics.gleam +++ b/src/gleam_community/maths/arithmetics.gleam @@ -289,7 +289,7 @@ pub fn proper_divisors(n: Int) -> List(Int) { /// /// /// -/// Calculcate the sum of the elements in a list: +/// Calculate the sum of the elements in a list: /// /// \\[ /// \sum_{i=1}^n x_i @@ -337,7 +337,7 @@ pub fn float_sum(arr: List(Float)) -> Float { /// /// /// -/// Calculcate the sum of the elements in a list: +/// Calculate the sum of the elements in a list: /// /// \\[ /// \sum_{i=1}^n x_i @@ -385,7 +385,7 @@ pub fn int_sum(arr: List(Int)) -> Int { /// /// /// -/// Calculcate the product of the elements in a list: +/// Calculate the product of the elements in a list: /// /// \\[ /// \prod_{i=1}^n x_i @@ -433,7 +433,7 @@ pub fn float_product(arr: List(Float)) -> Float { /// /// /// -/// Calculcate the product of the elements in a list: +/// Calculate the product of the elements in a list: /// /// \\[ /// \prod_{i=1}^n x_i @@ -481,7 +481,7 @@ pub fn int_product(arr: List(Int)) -> Int { /// /// /// -/// Calculcate the cumulative sum of the elements in a list: +/// Calculate the cumulative sum of the elements in a list: /// /// \\[ /// v_j = \sum_{i=1}^j x_i \\;\\; \forall j = 1,\dots, n @@ -530,7 +530,7 @@ pub fn float_cumulative_sum(arr: List(Float)) -> List(Float) { /// /// /// -/// Calculcate the cumulative sum of the elements in a list: +/// Calculate the cumulative sum of the elements in a list: /// /// \\[ /// v_j = \sum_{i=1}^j x_i \\;\\; \forall j = 1,\dots, n @@ -579,7 +579,7 @@ pub fn int_cumulative_sum(arr: List(Int)) -> List(Int) { /// /// 
/// -/// Calculcate the cumulative product of the elements in a list: +/// Calculate the cumulative product of the elements in a list: /// /// \\[ /// v_j = \prod_{i=1}^j x_i \\;\\; \forall j = 1,\dots, n @@ -629,7 +629,7 @@ pub fn float_cumumlative_product(arr: List(Float)) -> List(Float) { /// /// /// -/// Calculcate the cumulative product of the elements in a list: +/// Calculate the cumulative product of the elements in a list: /// /// \\[ /// v_j = \prod_{i=1}^j x_i \\;\\; \forall j = 1,\dots, n diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index 772b80d..8b204cb 100644 --- a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -28,8 +28,9 @@ //// * **Distance measures** //// * [`norm`](#norm) //// * [`manhatten_distance`](#manhatten_distance) -//// * [`minkowski_distance`](#minkowski_distance) //// * [`euclidean_distance`](#euclidean_distance) +//// * [`chebyshev_distance`](#chebyshev_distance) +//// * [`minkowski_distance`](#minkowski_distance) //// * [`cosine_similarity`](#cosine_similarity) //// * **Set & string similarity measures** //// * [`jaccard_index`](#jaccard_index) @@ -60,7 +61,7 @@ import gleam/int /// /// /// -/// Calculcate the $$p$$-norm of a list (representing a vector): +/// Calculate the $$p$$-norm of a list (representing a vector): /// /// \\[ /// \left( \sum_{i=1}^n \left|x_i\right|^{p} \right)^{\frac{1}{p}} @@ -120,13 +121,13 @@ pub fn norm(arr: List(Float), p: Float) -> Float { /// /// /// -/// Calculcate the Manhatten distance between two lists (representing vectors): +/// Calculate the Manhatten distance between two lists (representing vectors): /// /// \\[ /// \sum_{i=1}^n \left|x_i - y_i \right| /// \\] /// -/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i, j$$. 
+/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. /// ///
/// Example: @@ -139,9 +140,9 @@ pub fn norm(arr: List(Float), p: Float) -> Float { /// pub fn example () { /// let assert Ok(tol) = elementary.power(-10.0, -6.0) /// -/// // Empty lists returns 0.0 +/// // Empty lists returns an error /// metrics.manhatten_distance([], []) -/// |> should.equal(Ok(0.0)) +/// |> should.be_error() /// /// // Differing lengths returns error /// metrics.manhatten_distance([], [1.0]) @@ -173,13 +174,13 @@ pub fn manhatten_distance( /// /// /// -/// Calculcate the Minkowski distance between two lists (representing vectors): +/// Calculate the Minkowski distance between two lists (representing vectors): /// /// \\[ /// \left( \sum_{i=1}^n \left|x_i - y_i \right|^{p} \right)^{\frac{1}{p}} /// \\] /// -/// In the formula, $$p >= 1$$ is the order, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i, j$$. +/// In the formula, $$p >= 1$$ is the order, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. /// /// The Minkowski distance is a generalization of both the Euclidean distance ($$p=2$$) and the Manhattan distance ($$p = 1$$). /// @@ -196,7 +197,7 @@ pub fn manhatten_distance( /// /// // Empty lists returns 0.0 /// metrics.minkowski_distance([], [], 1.0) -/// |> should.equal(Ok(0.0)) +/// |> should.be_error() /// /// // Differing lengths returns error /// metrics.minkowski_distance([], [1.0], 1.0) @@ -224,25 +225,35 @@ pub fn minkowski_distance( yarr: List(Float), p: Float, ) -> Result(Float, String) { - let xlen: Int = list.length(xarr) - let ylen: Int = list.length(yarr) - case xlen == ylen { - False -> - "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr)." + case xarr, yarr { + [], _ -> + "Invalid input argument: The list xarr is empty." |> Error - True -> - case p <. 1.0 { - True -> - "Invalid input argument: p < 1. Valid input is p >= 1." 
- |> Error + _, [] -> + "Invalid input argument: The list yarr is empty." + |> Error + _, _ -> { + let xlen: Int = list.length(xarr) + let ylen: Int = list.length(yarr) + case xlen == ylen { False -> - list.zip(xarr, yarr) - |> list.map(fn(tuple: #(Float, Float)) -> Float { - pair.first(tuple) -. pair.second(tuple) - }) - |> norm(p) - |> Ok + "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr)." + |> Error + True -> + case p <. 1.0 { + True -> + "Invalid input argument: p < 1. Valid input is p >= 1." + |> Error + False -> + list.zip(xarr, yarr) + |> list.map(fn(tuple: #(Float, Float)) -> Float { + pair.first(tuple) -. pair.second(tuple) + }) + |> norm(p) + |> Ok + } } + } } } @@ -252,13 +263,13 @@ pub fn minkowski_distance( /// /// /// -/// Calculcate the Euclidean distance between two lists (representing vectors): +/// Calculate the Euclidean distance between two lists (representing vectors): /// /// \\[ /// \left( \sum_{i=1}^n \left|x_i - y_i \right|^{2} \right)^{\frac{1}{2}} /// \\] /// -/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i, j$$. +/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. /// ///
/// Example: @@ -273,7 +284,7 @@ pub fn minkowski_distance( /// /// // Empty lists returns 0.0 /// metrics.euclidean_distance([], []) -/// |> should.equal(Ok(0.0)) +/// |> should.be_error() /// /// // Differing lengths returns error /// metrics.euclidean_distance([], [1.0]) @@ -305,7 +316,82 @@ pub fn euclidean_distance( /// /// /// -/// Calculcate the arithmetic mean of the elements in a list: +/// Calculate the Chebyshev distance between two lists (representing vectors): +/// +/// \\[ +/// \text{max}_{i=1}^n \left|x_i - y_i \right| +/// \\] +/// +/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. +/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/elementary +/// import gleam_community/maths/metrics +/// import gleam_community/maths/predicates +/// +/// pub fn example () { +/// // Empty lists returns an error +/// metrics.chebyshev_distance([], []) +/// |> should.be_error() +/// +/// // Differing lengths returns error +/// metrics.chebyshev_distance([], [1.0]) +/// |> should.be_error() +/// +/// metrics.chebyshev_distance([-5.0, -10.0, -3.0], [-1.0, -12.0, -3.0]) +/// |> should.equal(Ok(4.0)) +/// } +///
+/// +/// +/// +pub fn chebyshev_distance( + xarr: List(Float), + yarr: List(Float), +) -> Result(Float, String) { + case xarr, yarr { + [], _ -> + "Invalid input argument: The list xarr is empty." + |> Error + _, [] -> + "Invalid input argument: The list yarr is empty." + |> Error + _, _ -> { + let xlen: Int = list.length(xarr) + let ylen: Int = list.length(yarr) + case xlen == ylen { + False -> + "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr)." + |> Error + True -> { + let differences = + list.zip(xarr, yarr) + |> list.map(fn(tuple: #(Float, Float)) -> Float { + { pair.first(tuple) -. pair.second(tuple) } + |> piecewise.float_absolute_value() + }) + differences + |> piecewise.list_maximum(float.compare) + } + } + } + } +} + +/// +/// +/// Calculate the arithmetic mean of the elements in a list: /// /// \\[ /// \bar{x} = \frac{1}{n}\sum_{i=1}^n x_i @@ -360,7 +446,7 @@ pub fn mean(arr: List(Float)) -> Result(Float, String) { /// /// /// -/// Calculcate the median of the elements in a list. +/// Calculate the median of the elements in a list. /// ///
/// Example: @@ -427,7 +513,7 @@ pub fn median(arr: List(Float)) -> Result(Float, String) { /// /// /// -/// Calculcate the sample variance of the elements in a list: +/// Calculate the sample variance of the elements in a list: /// \\[ /// s^{2} = \frac{1}{n - d} \sum_{i=1}^{n}(x_i - \bar{x}) /// \\] @@ -503,7 +589,7 @@ pub fn variance(arr: List(Float), ddof: Int) -> Result(Float, String) { /// /// /// -/// Calculcate the sample standard deviation of the elements in a list: +/// Calculate the sample standard deviation of the elements in a list: /// \\[ /// s = \left(\frac{1}{n - d} \sum_{i=1}^{n}(x_i - \bar{x})\right)^{\frac{1}{2}} /// \\] @@ -858,8 +944,6 @@ pub fn cosine_similarity( xarr: List(Float), yarr: List(Float), ) -> Result(Float, String) { - let xlen: Int = list.length(xarr) - let ylen: Int = list.length(yarr) case xarr, yarr { [], _ -> "Invalid input argument: The list xarr is empty." @@ -868,6 +952,8 @@ pub fn cosine_similarity( "Invalid input argument: The list yarr is empty." |> Error _, _ -> { + let xlen: Int = list.length(xarr) + let ylen: Int = list.length(yarr) case xlen == ylen { False -> "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr)." 
diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index e2f7307..4ba9cd9 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -53,9 +53,9 @@ pub fn float_list_norm_test() { pub fn float_list_manhatten_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) - // Empty lists returns 0.0 + // Empty lists returns an error metrics.manhatten_distance([], []) - |> should.equal(Ok(0.0)) + |> should.be_error() // Differing lengths returns error metrics.manhatten_distance([], [1.0]) @@ -85,9 +85,9 @@ pub fn float_list_manhatten_test() { pub fn float_list_minkowski_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) - // Empty lists returns 0.0 + // Empty lists returns an error metrics.minkowski_distance([], [], 1.0) - |> should.equal(Ok(0.0)) + |> should.be_error() // Differing lengths returns error metrics.minkowski_distance([], [1.0], 1.0) @@ -141,9 +141,9 @@ pub fn float_list_minkowski_test() { pub fn float_list_euclidean_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) - // Empty lists returns 0.0 + // Empty lists returns an error metrics.euclidean_distance([], []) - |> should.equal(Ok(0.0)) + |> should.be_error() // Differing lengths returns error metrics.euclidean_distance([], [1.0]) @@ -291,7 +291,7 @@ pub fn example_cosine_similarity_test() { metrics.cosine_similarity([], [1.0, 2.0, 3.0]) |> should.be_error() - // Differen sized lists returns an error + // Different sized lists returns an error metrics.cosine_similarity([1.0, 2.0], [1.0, 2.0, 3.0, 4.0]) |> should.be_error() @@ -307,3 +307,34 @@ pub fn example_cosine_similarity_test() { metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0]) |> should.equal(Ok(-1.0)) } + +pub fn example_chebyshev_distance() { + // Empty lists returns an error + metrics.chebyshev_distance([], []) + |> should.be_error() + + // One empty list returns an error + 
metrics.chebyshev_distance([1.0, 2.0, 3.0], []) + |> should.be_error() + + // One empty list returns an error + metrics.chebyshev_distance([], [1.0, 2.0, 3.0]) + |> should.be_error() + + // Different sized lists returns an error + metrics.chebyshev_distance([1.0, 2.0], [1.0, 2.0, 3.0, 4.0]) + |> should.be_error() + + // Try different types of valid input + metrics.chebyshev_distance([1.0, 0.0], [0.0, 2.0]) + |> should.equal(Ok(2.0)) + + metrics.chebyshev_distance([1.0, 0.0], [2.0, 0.0]) + |> should.equal(Ok(3.0)) + + metrics.chebyshev_distance([-5.0, -10.0, -3.0], [-1.0, -12.0, -3.0]) + |> should.equal(Ok(4.0)) + + metrics.chebyshev_distance([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) + |> should.equal(Ok(0.0)) +} From e8bbcce58762b8d30edda512a854db1da8d7ec91 Mon Sep 17 00:00:00 2001 From: NicklasXYZ <18580183+NicklasXYZ@users.noreply.github.com> Date: Wed, 10 Apr 2024 23:09:02 +0200 Subject: [PATCH 09/16] Add Levenshtein distance, fix typos, align doc examples with tests --- src/gleam_community/maths/metrics.gleam | 254 ++++++++++++++---- test/gleam_community/maths/metrics_test.gleam | 87 ++++-- 2 files changed, 257 insertions(+), 84 deletions(-) diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index 8b204cb..d5ddc69 100644 --- a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -23,11 +23,12 @@ //// //// --- //// -//// Metrics: A module offering functions for calculating distances and other types of metrics. +//// Metrics: A module offering functions for calculating distances and other +//// types of metrics. 
//// //// * **Distance measures** //// * [`norm`](#norm) -//// * [`manhatten_distance`](#manhatten_distance) +//// * [`manhattan_distance`](#manhattan_distance) //// * [`euclidean_distance`](#euclidean_distance) //// * [`chebyshev_distance`](#chebyshev_distance) //// * [`minkowski_distance`](#minkowski_distance) @@ -54,6 +55,7 @@ import gleam/pair import gleam/set import gleam/float import gleam/int +import gleam/string /// /// -/// Calculate the Manhatten distance between two lists (representing vectors): +/// Calculate the Manhattan distance between two lists (representing vectors): /// /// \\[ /// \sum_{i=1}^n \left|x_i - y_i \right| /// \\] /// -/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. +/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the +/// values in the respective input lists indexed by $$i$$. /// ///
/// Example: @@ -141,14 +145,14 @@ pub fn norm(arr: List(Float), p: Float) -> Float { /// let assert Ok(tol) = elementary.power(-10.0, -6.0) /// /// // Empty lists returns an error -/// metrics.manhatten_distance([], []) +/// metrics.manhattan_distance([], []) /// |> should.be_error() /// /// // Differing lengths returns error -/// metrics.manhatten_distance([], [1.0]) +/// metrics.manhattan_distance([], [1.0]) /// |> should.be_error() /// -/// let assert Ok(result) = metrics.manhatten_distance([0.0, 0.0], [1.0, 2.0]) +/// let assert Ok(result) = metrics.manhattan_distance([0.0, 0.0], [1.0, 2.0]) /// result /// |> predicates.is_close(3.0, 0.0, tol) /// |> should.be_true() @@ -161,7 +165,7 @@ pub fn norm(arr: List(Float), p: Float) -> Float { /// /// /// -pub fn manhatten_distance( +pub fn manhattan_distance( xarr: List(Float), yarr: List(Float), ) -> Result(Float, String) { @@ -180,9 +184,11 @@ pub fn manhatten_distance( /// \left( \sum_{i=1}^n \left|x_i - y_i \right|^{p} \right)^{\frac{1}{p}} /// \\] /// -/// In the formula, $$p >= 1$$ is the order, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. +/// In the formula, $$p >= 1$$ is the order, $$n$$ is the length of the two lists +/// and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. /// -/// The Minkowski distance is a generalization of both the Euclidean distance ($$p=2$$) and the Manhattan distance ($$p = 1$$). +/// The Minkowski distance is a generalization of both the Euclidean distance +/// ($$p=2$$) and the Manhattan distance ($$p = 1$$). /// ///
/// Example: @@ -195,7 +201,7 @@ pub fn manhatten_distance( /// pub fn example () { /// let assert Ok(tol) = elementary.power(-10.0, -6.0) /// -/// // Empty lists returns 0.0 +/// // Empty lists returns an error /// metrics.minkowski_distance([], [], 1.0) /// |> should.be_error() /// @@ -269,7 +275,8 @@ pub fn minkowski_distance( /// \left( \sum_{i=1}^n \left|x_i - y_i \right|^{2} \right)^{\frac{1}{2}} /// \\] /// -/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. +/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the +/// values in the respective input lists indexed by $$i$$. /// ///
/// Example: @@ -282,11 +289,11 @@ pub fn minkowski_distance( /// pub fn example () { /// let assert Ok(tol) = elementary.power(-10.0, -6.0) /// -/// // Empty lists returns 0.0 +/// // Empty lists returns an error /// metrics.euclidean_distance([], []) /// |> should.be_error() /// -/// // Differing lengths returns error +/// // Differing lengths returns an error /// metrics.euclidean_distance([], [1.0]) /// |> should.be_error() /// @@ -322,7 +329,8 @@ pub fn euclidean_distance( /// \text{max}_{i=1}^n \left|x_i - y_i \right| /// \\] /// -/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. +/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the +/// values in the respective input lists indexed by $$i$$. /// ///
/// Example: @@ -397,8 +405,8 @@ pub fn chebyshev_distance( /// \bar{x} = \frac{1}{n}\sum_{i=1}^n x_i /// \\] /// -/// In the formula, $$n$$ is the sample size (the length of the list) and -/// $$x_i$$ is the sample point in the input list indexed by $$i$$. +/// In the formula, $$n$$ is the sample size (the length of the list) and $$x_i$$ +/// is the sample point in the input list indexed by $$i$$. /// ///
/// Example: @@ -514,12 +522,13 @@ pub fn median(arr: List(Float)) -> Result(Float, String) { /// /// /// Calculate the sample variance of the elements in a list: +/// /// \\[ /// s^{2} = \frac{1}{n - d} \sum_{i=1}^{n}(x_i - \bar{x}) /// \\] /// -/// In the formula, $$n$$ is the sample size (the length of the list) and -/// $$x_i$$ is the sample point in the input list indexed by $$i$$. +/// In the formula, $$n$$ is the sample size (the length of the list) and $$x_i$$ +/// is the sample point in the input list indexed by $$i$$. /// Furthermore, $$\bar{x}$$ is the sample mean and $$d$$ is the "Delta /// Degrees of Freedom", and is by default set to $$d = 0$$, which gives a biased /// estimate of the sample variance. Setting $$d = 1$$ gives an unbiased estimate. @@ -594,11 +603,12 @@ pub fn variance(arr: List(Float), ddof: Int) -> Result(Float, String) { /// s = \left(\frac{1}{n - d} \sum_{i=1}^{n}(x_i - \bar{x})\right)^{\frac{1}{2}} /// \\] /// -/// In the formula, $$n$$ is the sample size (the length of the list) and -/// $$x_i$$ is the sample point in the input list indexed by $$i$$. +/// In the formula, $$n$$ is the sample size (the length of the list) and $$x_i$$ +/// is the sample point in the input list indexed by $$i$$. /// Furthermore, $$\bar{x}$$ is the sample mean and $$d$$ is the "Delta /// Degrees of Freedom", and is by default set to $$d = 0$$, which gives a biased -/// estimate of the sample standard deviation. Setting $$d = 1$$ gives an unbiased estimate. +/// estimate of the sample standard deviation. Setting $$d = 1$$ gives an unbiased +/// estimate. /// ///
/// Example: @@ -656,8 +666,8 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) /// /// /// -/// The Jaccard index measures similarity between two sets of elements. Mathematically, the Jaccard index -/// is defined as: +/// The Jaccard index measures similarity between two sets of elements. +/// Mathematically, the Jaccard index is defined as: /// /// \\[ /// \frac{|X \cap Y|}{|X \cup Y|} \\; \in \\; \left[0, 1\right] @@ -669,9 +679,10 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) /// - $$|X \cap Y|$$ represents the size of the intersection of the two sets /// - $$|X \cup Y|$$ denotes the size of the union of the two sets /// -/// The value of the Jaccard index ranges from 0 to 1, where 0 indicates that the two sets share no elements -/// and 1 indicates that the sets are identical. The Jaccard index is a special case of the -/// [Tversky index](#tversky_index) (with $$\alpha=\beta=1$$). +/// The value of the Jaccard index ranges from 0 to 1, where 0 indicates that the +/// two sets share no elements and 1 indicates that the sets are identical. The +/// Jaccard index is a special case of the [Tversky index](#tversky_index) (with +/// $$\alpha=\beta=1$$). /// ///
/// Example: @@ -706,8 +717,8 @@ pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float { /// /// /// -/// The Sørensen-Dice coefficient measures the similarity between two sets of elements. Mathematically, the -/// coefficient is defined as: +/// The Sørensen-Dice coefficient measures the similarity between two sets of +/// elements. Mathematically, the coefficient is defined as: /// /// \\[ /// \frac{2 |X \cap Y|}{|X| + |Y|} \\; \in \\; \left[0, 1\right] @@ -715,12 +726,14 @@ pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float { /// /// where: /// - $$X$$ and $$Y$$ are two sets being compared -/// - $$|X \cap Y|$$ is the size of the intersection of the two sets (i.e., the number of elements common to both sets) +/// - $$|X \cap Y|$$ is the size of the intersection of the two sets (i.e., the +/// number of elements common to both sets) /// - $$|X|$$ and $$|Y|$$ are the sizes of the sets $$X$$ and $$Y$$, respectively /// -/// The coefficient ranges from 0 to 1, where 0 indicates no similarity (the sets share no elements) and 1 -/// indicates perfect similarity (the sets are identical). The higher the coefficient, the greater the similarity -/// between the two sets. The Sørensen-Dice coefficient is a special case of the +/// The coefficient ranges from 0 to 1, where 0 indicates no similarity (the sets +/// share no elements) and 1 indicates perfect similarity (the sets are identical). +/// The higher the coefficient, the greater the similarity between the two sets. +/// The Sørensen-Dice coefficient is a special case of the /// [Tversky index](#tversky_index) (with $$\alpha=\beta=0.5$$). /// ///
@@ -756,9 +769,10 @@ pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { /// /// /// -/// The Tversky index is a generalization of the Jaccard index and Sørensen-Dice coefficient, which adds -/// flexibility through two parameters, $$\alpha$$ and $$\beta$$, allowing for asymmetric similarity -/// measures between sets. The Tversky index is defined as: +/// The Tversky index is a generalization of the Jaccard index and Sørensen-Dice +/// coefficient, which adds flexibility through two parameters, $$\alpha$$ and +/// $$\beta$$, allowing for asymmetric similarity measures between sets. The +/// Tversky index is defined as: /// /// \\[ /// \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|} @@ -767,13 +781,17 @@ pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { /// where: /// /// - $$X$$ and $$Y$$ are the sets being compared -/// - $$|X - Y|$$ and $$|Y - X|$$ are the sizes of the relative complements of $$Y$$ in $$X$$ and $$X$$ in $$Y$$, respectively, -/// - $$\alpha$$ and $$\beta$$ are parameters that weigh the relative importance of the elements unique to $$X$$ and $$Y$$ +/// - $$|X - Y|$$ and $$|Y - X|$$ are the sizes of the relative complements of +/// $$Y$$ in $$X$$ and $$X$$ in $$Y$$, respectively, +/// - $$\alpha$$ and $$\beta$$ are parameters that weigh the relative importance +/// of the elements unique to $$X$$ and $$Y$$ /// -/// The Tversky index reduces to the Jaccard index when \(\alpha = \beta = 1\) and to the Sorensen-Dice -/// coefficient when \(\alpha = \beta = 0.5\). In general, the Tversky index can take on any non-negative value, including 0. -/// The index equals 0 when there is no intersection between the two sets, indicating no similarity. However, unlike similarity -/// measures bounded strictly between 0 and 1, the Tversky index does not have a strict upper limit of 1 when $$\alpha \neq \beta$$. 
+/// The Tversky index reduces to the Jaccard index when $$\alpha = \beta = 1$$ and +/// to the Sørensen-Dice coefficient when $$\alpha = \beta = 0.5$$. In general, the +/// Tversky index can take on any non-negative value, including 0. The index equals +/// 0 when there is no intersection between the two sets, indicating no similarity. +/// However, unlike similarity measures bounded strictly between 0 and 1, the +/// Tversky index does not have a strict upper limit of 1 when $$\alpha \neq \beta$$. /// ///
/// Example: @@ -843,9 +861,10 @@ pub fn tversky_index( /// /// /// -/// The Overlap coefficient, also known as the Szymkiewicz–Simpson coefficient, is a measure of -/// similarity between two sets that focuses on the size of the intersection relative to the -/// smaller of the two sets. It is defined mathematically as: +/// The Overlap coefficient, also known as the Szymkiewicz–Simpson coefficient, is +/// a measure of similarity between two sets that focuses on the size of the +/// intersection relative to the smaller of the two sets. It is defined +/// mathematically as: /// /// \\[ /// \frac{|X \cap Y|}{\min(|X|, |Y|)} \\; \in \\; \left[0, 1\right] @@ -857,10 +876,11 @@ pub fn tversky_index( /// - $$|X \cap Y|$$ is the size of the intersection of the sets /// - $$\min(|X|, |Y|)$$ is the size of the smaller set among $$X$$ and $$Y$$ /// -/// The coefficient ranges from 0 to 1, where 0 indicates no overlap and 1 indicates that the -/// smaller set is a suyset of the larger set. This measure is especially useful in situations -/// where the similarity in terms of the proportion of overlap is more relevant than the -/// difference in sizes between the two sets. +/// The coefficient ranges from 0 to 1, where 0 indicates no overlap and 1 +/// indicates that the smaller set is a subset of the larger set. This +/// measure is especially useful in situations where the similarity in terms +/// of the proportion of overlap is more relevant than the difference in sizes +/// between the two sets. /// ///
/// Example: @@ -905,13 +925,18 @@ pub fn overlap_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { /// Calculate the cosine similarity between two lists (representing vectors): /// /// \\[ -/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}} \\; \in \\; \left[-1, 1\right] +/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} +/// \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}} +/// \\; \in \\; \left[-1, 1\right] /// \\] /// -/// In the formula, $$n$$ is the length of the two lists and $$x_i$$, $$y_i$$ are the values in the respective input lists indexed by $$i$$. -/// The numerator represents the dot product of the two vectors, while the denominator is the product of the magnitudes (Euclidean norms) of -/// the two vectors. The cosine similarity provides a value between -1 and 1, where 1 means the vectors are in the same direction, -1 means -/// they are in exactly opposite directions, and 0 indicates orthogonality. +/// In the formula, $$n$$ is the length of the two lists and $$x_i$$, $$y_i$$ are +/// the values in the respective input lists indexed by $$i$$. The numerator +/// represents the dot product of the two vectors, while the denominator is the +/// product of the magnitudes (Euclidean norms) of the two vectors. The cosine +/// similarity provides a value between -1 and 1, where 1 means the vectors are +/// in the same direction, -1 means they are in exactly opposite directions, +/// and 0 indicates orthogonality. /// ///
/// Example: @@ -974,3 +999,122 @@ pub fn cosine_similarity( } } } + +/// +/// +/// Calculate the Levenshtein distance between two strings, i.e., measure the +/// difference between two strings (essentially sequences). It is defined as +/// the minimum number of single-character edits required to change one string +/// into the other, using operations: +/// - insertions +/// - deletions +/// - substitutions +/// +/// Note: The implementation is primarily based on the elixir implementation +/// [https://hex.pm/packages/levenshtein](levenshtein). +/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// metrics.levenshtein_distance("hello", "hello") +/// |> should.equal(0) +/// +/// metrics.levenshtein_distance("cat", "cut") +/// |> should.equal(1) +/// +/// metrics.levenshtein_distance("kitten", "sitting") +/// |> should.equal(3) +/// } +///
+/// +/// +/// +/// +pub fn levenshtein_distance(xstring: String, ystring: String) -> Int { + case xstring, ystring { + xstring, ystring if xstring == ystring -> { + 0 + } + xstring, ystring if xstring == "" -> { + string.length(ystring) + } + xstring, ystring if ystring == "" -> { + string.length(xstring) + } + _, _ -> { + let xstring_graphemes = string.to_graphemes(xstring) + let ystring_graphemes = string.to_graphemes(ystring) + let ystring_length = list.length(ystring_graphemes) + let distance_list = list.range(0, ystring_length) + + do_edit_distance(xstring_graphemes, ystring_graphemes, distance_list, 1) + } + } +} + +fn do_edit_distance( + xstring: List(String), + ystring: List(String), + distance_list: List(Int), + step: Int, +) -> Int { + case xstring { + // Safe as 'distance_list' is never empty + [] -> { + let assert Ok(last) = list.last(distance_list) + last + } + [xstring_head, ..xstring_tail] -> { + let new_distance_list = + distance_list_helper(ystring, distance_list, xstring_head, [step], step) + do_edit_distance(xstring_tail, ystring, new_distance_list, step + 1) + } + } +} + +fn distance_list_helper( + ystring: List(String), + distance_list: List(Int), + grapheme: String, + new_distance_list: List(Int), + last_distance: Int, +) -> List(Int) { + case ystring { + [] -> list.reverse(new_distance_list) + [ystring_head, ..ystring_tail] -> { + let assert [distance_list_head, ..distance_list_tail] = distance_list + let difference = case ystring_head == grapheme { + True -> { + 0 + } + False -> { + 1 + } + } + let assert [first, ..] 
= distance_list_tail + let min = + last_distance + 1 + |> piecewise.minimum(first + 1, int.compare) + |> piecewise.minimum(distance_list_head + difference, int.compare) + distance_list_helper( + ystring_tail, + distance_list_tail, + grapheme, + [min, ..new_distance_list], + min, + ) + } + } +} diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index 4ba9cd9..8fca446 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -50,38 +50,24 @@ pub fn float_list_norm_test() { |> should.be_true() } -pub fn float_list_manhatten_test() { +pub fn float_list_manhattan_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) // Empty lists returns an error - metrics.manhatten_distance([], []) + metrics.manhattan_distance([], []) |> should.be_error() // Differing lengths returns error - metrics.manhatten_distance([], [1.0]) + metrics.manhattan_distance([], [1.0]) |> should.be_error() - // Manhatten distance (p = 1) - let assert Ok(result) = metrics.manhatten_distance([0.0, 0.0], [1.0, 2.0]) + // manhattan distance (p = 1) + let assert Ok(result) = metrics.manhattan_distance([0.0, 0.0], [1.0, 2.0]) result |> predicates.is_close(3.0, 0.0, tol) |> should.be_true() } -// pub fn int_list_manhatten_test() { -// // Empty lists returns 0 -// metrics.int_manhatten_distance([], []) -// |> should.equal(Ok(0)) - -// // Differing lengths returns error -// metrics.int_manhatten_distance([], [1]) -// |> should.be_error() - -// let assert Ok(result) = metrics.int_manhatten_distance([0, 0], [1, 2]) -// result -// |> should.equal(3) -// } - pub fn float_list_minkowski_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) @@ -130,7 +116,7 @@ pub fn float_list_minkowski_test() { |> predicates.is_close(2.23606797749979, 0.0, tol) |> should.be_true() - // Manhatten distance (p = 1) + // Manhattan distance (p = 1) let assert Ok(result) = metrics.minkowski_distance([0.0, 0.0], 
[1.0, 2.0], 1.0) result @@ -156,7 +142,7 @@ pub fn float_list_euclidean_test() { |> should.be_true() } -pub fn example_mean_test() { +pub fn mean_test() { // An empty list returns an error [] |> metrics.mean() @@ -168,7 +154,7 @@ pub fn example_mean_test() { |> should.equal(Ok(2.0)) } -pub fn example_median_test() { +pub fn median_test() { // An empty list returns an error [] |> metrics.median() @@ -184,7 +170,7 @@ pub fn example_median_test() { |> should.equal(Ok(2.5)) } -pub fn example_variance_test() { +pub fn variance_test() { // Degrees of freedom let ddof: Int = 1 @@ -199,7 +185,7 @@ pub fn example_variance_test() { |> should.equal(Ok(1.0)) } -pub fn example_standard_deviation_test() { +pub fn standard_deviation_test() { // Degrees of freedom let ddof: Int = 1 @@ -214,7 +200,7 @@ pub fn example_standard_deviation_test() { |> should.equal(Ok(1.0)) } -pub fn example_jaccard_index_test() { +pub fn jaccard_index_test() { metrics.jaccard_index(set.from_list([]), set.from_list([])) |> should.equal(0.0) @@ -235,7 +221,7 @@ pub fn example_jaccard_index_test() { |> should.equal(1.0 /. 7.0) } -pub fn example_sorensen_dice_coefficient_test() { +pub fn sorensen_dice_coefficient_test() { metrics.sorensen_dice_coefficient(set.from_list([]), set.from_list([])) |> should.equal(0.0) @@ -256,7 +242,7 @@ pub fn example_sorensen_dice_coefficient_test() { |> should.equal(2.0 *. 1.0 /. { 4.0 +. 5.0 }) } -pub fn example_overlap_coefficient_test() { +pub fn overlap_coefficient_test() { metrics.overlap_coefficient(set.from_list([]), set.from_list([])) |> should.equal(0.0) @@ -278,7 +264,7 @@ pub fn example_overlap_coefficient_test() { |> should.equal(2.0 /. 
4.0) } -pub fn example_cosine_similarity_test() { +pub fn cosine_similarity_test() { // Empty lists returns an error metrics.cosine_similarity([], []) |> should.be_error() @@ -308,7 +294,7 @@ pub fn example_cosine_similarity_test() { |> should.equal(Ok(-1.0)) } -pub fn example_chebyshev_distance() { +pub fn chebyshev_distance_test() { // Empty lists returns an error metrics.chebyshev_distance([], []) |> should.be_error() @@ -330,6 +316,9 @@ pub fn example_chebyshev_distance() { |> should.equal(Ok(2.0)) metrics.chebyshev_distance([1.0, 0.0], [2.0, 0.0]) + |> should.equal(Ok(1.0)) + + metrics.chebyshev_distance([1.0, 0.0], [-2.0, 0.0]) |> should.equal(Ok(3.0)) metrics.chebyshev_distance([-5.0, -10.0, -3.0], [-1.0, -12.0, -3.0]) @@ -338,3 +327,43 @@ pub fn example_chebyshev_distance() { metrics.chebyshev_distance([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) |> should.equal(Ok(0.0)) } + +pub fn edit_distance_test() { + // Try different types of valid input... + + // Requires 5 insertions to transform the empty string into "hello" + metrics.levenshtein_distance("", "hello") + |> should.equal(5) + // Requires 5 deletions to remove all characters from "hello" to match the empty string + metrics.levenshtein_distance("hello", "") + |> should.equal(5) + + // Requires 2 deletions to remove two 'b's and 1 substitution to change 'b' to 'a' + metrics.levenshtein_distance("bbb", "a") + |> should.equal(3) + // Requires 2 insertions to add two 'b's and 1 substitution to change 'a' to 'b' + metrics.levenshtein_distance("a", "bbb") + |> should.equal(3) + + // No changes needed, since the strings are identical + metrics.levenshtein_distance("hello", "hello") + |> should.equal(0) + + // Requires 1 substitution to change 'a' to 'u' + metrics.levenshtein_distance("cat", "cut") + |> should.equal(1) + + // Requires 2 substitutions (k -> s, e -> i) and 1 insertion (g at the end) + metrics.levenshtein_distance("kitten", "sitting") + |> should.equal(3) + + // Some more complex cases, involving multiple 
insertions, deletions, and substitutions + metrics.levenshtein_distance("gggtatccat", "cctaggtccct") + |> should.equal(6) + + metrics.levenshtein_distance( + "This is a longer string", + "This is also a much longer string", + ) + |> should.equal(10) +} From e135ea33183bf1ca9a2f0786b881fa6ff50f4503 Mon Sep 17 00:00:00 2001 From: NicklasXYZ <18580183+NicklasXYZ@users.noreply.github.com> Date: Wed, 10 Apr 2024 23:11:14 +0200 Subject: [PATCH 10/16] Rename metrics test function --- test/gleam_community/maths/metrics_test.gleam | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index 8fca446..e5888a0 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -328,7 +328,7 @@ pub fn chebyshev_distance_test() { |> should.equal(Ok(0.0)) } -pub fn edit_distance_test() { +pub fn levenshtein_distance_test() { // Try different types of valid input... 
// Requires 5 insertions to transform the empty string into "hello" From 71db9b1d526488ffb8736252e8b80ad50cf65f1f Mon Sep 17 00:00:00 2001 From: NicklasXYZ <18580183+NicklasXYZ@users.noreply.github.com> Date: Wed, 10 Apr 2024 23:28:28 +0200 Subject: [PATCH 11/16] fix link --- src/gleam_community/maths/metrics.gleam | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index d5ddc69..237b900 100644 --- a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -38,6 +38,7 @@ //// * [`sorensen_dice_coefficient`](#sorensen_dice_coefficient) //// * [`tversky_index`](#tversky_index) //// * [`overlap_coefficient`](#overlap_coefficient) +//// * [`levenshtein_distance`](#levenshtein_distance) //// * **Basic statistical measures** //// * [`mean`](#mean) //// * [`median`](#median) @@ -1014,8 +1015,8 @@ pub fn cosine_similarity( /// - deletions /// - substitutions /// -/// Note: The implementation is primarily based on the elixir implementation -/// [https://hex.pm/packages/levenshtein](levenshtein). +/// Note: The implementation is primarily based on the Elixir implementation +/// [levenshtein](https://hex.pm/packages/levenshtein). /// ///
/// Example: From d9c642062a1c3cc1007f41c47b268f902b0de8a2 Mon Sep 17 00:00:00 2001 From: NicklasXYZ <18580183+NicklasXYZ@users.noreply.github.com> Date: Wed, 10 Apr 2024 23:32:42 +0200 Subject: [PATCH 12/16] Run gleam update --- manifest.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifest.toml b/manifest.toml index 6c5a157..4a7c920 100644 --- a/manifest.toml +++ b/manifest.toml @@ -3,7 +3,7 @@ packages = [ { name = "gleam_stdlib", version = "0.36.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "C0D14D807FEC6F8A08A7C9EF8DFDE6AE5C10E40E21325B2B29365965D82EB3D4" }, - { name = "gleeunit", version = "1.0.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "D364C87AFEB26BDB4FB8A5ABDE67D635DC9FA52D6AB68416044C35B096C6882D" }, + { name = "gleeunit", version = "1.1.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "72CDC3D3F719478F26C4E2C5FED3E657AC81EC14A47D2D2DEBB8693CA3220C3B" }, ] [requirements] From b7f7b29e48274b98136519e4b076812eb06acda8 Mon Sep 17 00:00:00 2001 From: NicklasXYZ <18580183+NicklasXYZ@users.noreply.github.com> Date: Sat, 13 Apr 2024 23:38:39 +0200 Subject: [PATCH 13/16] Add canberra and bary-curtis distance --- src/gleam_community/maths/arithmetics.gleam | 77 +++-- src/gleam_community/maths/metrics.gleam | 309 +++++++++++++----- .../maths/arithmetics_test.gleam | 19 +- test/gleam_community/maths/metrics_test.gleam | 165 ++++++++-- 4 files changed, 432 insertions(+), 138 deletions(-) diff --git a/src/gleam_community/maths/arithmetics.gleam b/src/gleam_community/maths/arithmetics.gleam index dbd3ffe..b37a067 100644 --- a/src/gleam_community/maths/arithmetics.gleam +++ b/src/gleam_community/maths/arithmetics.gleam @@ -44,6 +44,9 @@ import gleam/int import gleam/list +import gleam/option +import gleam/pair +import gleam/result import 
gleam_community/maths/conversion import gleam_community/maths/elementary import gleam_community/maths/piecewise @@ -289,29 +292,32 @@ pub fn proper_divisors(n: Int) -> List(Int) { /// /// /// -/// Calculate the sum of the elements in a list: +/// Calculate the (weighted) sum of the elements in a list: /// /// \\[ -/// \sum_{i=1}^n x_i +/// \sum_{i=1}^n w_i x_i /// \\] /// -/// In the formula, $$n$$ is the length of the list and $$x_i \in \mathbb{R}$$ is the value in the input list indexed by $$i$$. +/// In the formula, $$n$$ is the length of the list and $$x_i \in \mathbb{R}$$ is +/// the value in the input list indexed by $$i$$, while $$w_i \in \mathbb{R}$$ is +/// a corresponding weight ($$w_i = 1.0\;\forall i=1...n$$ by default). /// ///
/// Example: /// /// import gleeunit/should +/// import gleam/option /// import gleam_community/maths/arithmetics /// /// pub fn example () { /// // An empty list returns an error /// [] -/// |> arithmetics.float_sum() +/// |> arithmetics.float_sum(option.None) /// |> should.equal(0.0) /// /// // Valid input returns a result /// [1.0, 2.0, 3.0] -/// |> arithmetics.float_sum() +/// |> arithmetics.float_sum(option.None) /// |> should.equal(6.0) /// } ///
@@ -322,12 +328,18 @@ pub fn proper_divisors(n: Int) -> List(Int) { /// /// /// -pub fn float_sum(arr: List(Float)) -> Float { - case arr { - [] -> 0.0 - _ -> +pub fn float_sum(arr: List(Float), weights: option.Option(List(Float))) -> Float { + case arr, weights { + [], _ -> 0.0 + _, option.None -> arr |> list.fold(0.0, fn(acc: Float, a: Float) -> Float { a +. acc }) + _, option.Some(warr) -> { + list.zip(arr, warr) + |> list.fold(0.0, fn(acc: Float, a: #(Float, Float)) -> Float { + pair.first(a) *. pair.second(a) +. acc + }) + } } } @@ -385,29 +397,32 @@ pub fn int_sum(arr: List(Int)) -> Int { /// /// /// -/// Calculate the product of the elements in a list: +/// Calculate the (weighted) product of the elements in a list: /// /// \\[ -/// \prod_{i=1}^n x_i +/// \prod_{i=1}^n x_i^{w_i} /// \\] /// -/// In the formula, $$n$$ is the length of the list and $$x_i \in \mathbb{R}$$ is the value in the input list indexed by $$i$$. -/// +/// In the formula, $$n$$ is the length of the list and $$x_i \in \mathbb{R}$$ is +/// the value in the input list indexed by $$i$$, while $$w_i \in \mathbb{R}$$ is +/// a corresponding weight ($$w_i = 1.0\;\forall i=1...n$$ by default). +/// ///
/// Example: /// /// import gleeunit/should +/// import gleam/option /// import gleam_community/maths/arithmetics /// /// pub fn example () { /// // An empty list returns 0.0 /// [] -/// |> arithmetics.float_product() +/// |> arithmetics.float_product(option.None) /// |> should.equal(0.0) /// /// // Valid input returns a result /// [1.0, 2.0, 3.0] -/// |> arithmetics.float_product() +/// |> arithmetics.float_product(option.None) /// |> should.equal(6.0) /// } ///
@@ -418,12 +433,36 @@ pub fn int_sum(arr: List(Int)) -> Int { /// /// /// -pub fn float_product(arr: List(Float)) -> Float { - case arr { - [] -> 1.0 - _ -> +pub fn float_product( + arr: List(Float), + weights: option.Option(List(Float)), +) -> Result(Float, String) { + case arr, weights { + [], _ -> + 1.0 + |> Ok + _, option.None -> arr |> list.fold(1.0, fn(acc: Float, a: Float) -> Float { a *. acc }) + |> Ok + _, option.Some(warr) -> { + let results = + list.zip(arr, warr) + |> list.map(fn(a: #(Float, Float)) -> Result(Float, String) { + pair.first(a) + |> elementary.power(pair.second(a)) + }) + |> result.all + case results { + Ok(prods) -> + prods + |> list.fold(1.0, fn(acc: Float, a: Float) -> Float { a *. acc }) + |> Ok + Error(msg) -> + msg + |> Error + } + } } } diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index 237b900..84cb8b2 100644 --- a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -33,6 +33,8 @@ //// * [`chebyshev_distance`](#chebyshev_distance) //// * [`minkowski_distance`](#minkowski_distance) //// * [`cosine_similarity`](#cosine_similarity) +//// * [`canberra_distance`](#canberra_distance) +//// * [`braycurtis_distance`](#braycurtis_distance) //// * **Set & string similarity measures** //// * [`jaccard_index`](#jaccard_index) //// * [`sorensen_dice_coefficient`](#sorensen_dice_coefficient) @@ -57,6 +59,48 @@ import gleam/set import gleam/float import gleam/int import gleam/string +import gleam/option + +/// Utility function that checks all lists have the expected length +/// Primarily used by all distance measures taking List(Float) as input +fn check_lists( + xarr: List(Float), + yarr: List(Float), + weights: option.Option(List(Float)), +) -> Result(Bool, String) { + case xarr, yarr { + [], _ -> + "Invalid input argument: The list xarr is empty." + |> Error + _, [] -> + "Invalid input argument: The list yarr is empty." 
+ |> Error + _, _ -> { + let xlen: Int = list.length(xarr) + let ylen: Int = list.length(yarr) + case xlen == ylen, weights { + False, _ -> + "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr)." + |> Error + True, option.None -> { + True + |> Ok + } + True, option.Some(warr) -> { + let wlen: Int = list.length(warr) + case xlen == wlen { + True -> + True + |> Ok + False -> + "Invalid input argument: length(weights) != length(xarr) and length(weights) != length(yarr). Valid input is when length(weights) == length(xarr) == length(yarr)." + |> Error + } + } + } + } + } +} ///
/// @@ -169,8 +213,9 @@ pub fn norm(arr: List(Float), p: Float) -> Float { pub fn manhattan_distance( xarr: List(Float), yarr: List(Float), + weights: option.Option(List(Float)), ) -> Result(Float, String) { - minkowski_distance(xarr, yarr, 1.0) + minkowski_distance(xarr, yarr, 1.0, weights) } ///
@@ -231,34 +276,24 @@ pub fn minkowski_distance( xarr: List(Float), yarr: List(Float), p: Float, + weights: option.Option(List(Float)), ) -> Result(Float, String) { - case xarr, yarr { - [], _ -> - "Invalid input argument: The list xarr is empty." + case check_lists(xarr, yarr, weights) { + Error(msg) -> + msg |> Error - _, [] -> - "Invalid input argument: The list yarr is empty." - |> Error - _, _ -> { - let xlen: Int = list.length(xarr) - let ylen: Int = list.length(yarr) - case xlen == ylen { - False -> - "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr)." - |> Error + Ok(_) -> { + case p <. 1.0 { True -> - case p <. 1.0 { - True -> - "Invalid input argument: p < 1. Valid input is p >= 1." - |> Error - False -> - list.zip(xarr, yarr) - |> list.map(fn(tuple: #(Float, Float)) -> Float { - pair.first(tuple) -. pair.second(tuple) - }) - |> norm(p) - |> Ok - } + "Invalid input argument: p < 1. Valid input is p >= 1." + |> Error + False -> + list.zip(xarr, yarr) + |> list.map(fn(tuple: #(Float, Float)) -> Float { + pair.first(tuple) -. pair.second(tuple) + }) + |> norm(p) + |> Ok } } } @@ -314,8 +349,9 @@ pub fn minkowski_distance( pub fn euclidean_distance( xarr: List(Float), yarr: List(Float), + weights: option.Option(List(Float)), ) -> Result(Float, String) { - minkowski_distance(xarr, yarr, 2.0) + minkowski_distance(xarr, yarr, 2.0, weights) } ///
@@ -364,32 +400,21 @@ pub fn euclidean_distance( pub fn chebyshev_distance( xarr: List(Float), yarr: List(Float), + weights: option.Option(List(Float)), ) -> Result(Float, String) { - case xarr, yarr { - [], _ -> - "Invalid input argument: The list xarr is empty." + case check_lists(xarr, yarr, weights) { + Error(msg) -> + msg |> Error - _, [] -> - "Invalid input argument: The list yarr is empty." - |> Error - _, _ -> { - let xlen: Int = list.length(xarr) - let ylen: Int = list.length(yarr) - case xlen == ylen { - False -> - "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr)." - |> Error - True -> { - let differences = - list.zip(xarr, yarr) - |> list.map(fn(tuple: #(Float, Float)) -> Float { - { pair.first(tuple) -. pair.second(tuple) } - |> piecewise.float_absolute_value() - }) - differences - |> piecewise.list_maximum(float.compare) - } - } + Ok(_) -> { + let differences = + list.zip(xarr, yarr) + |> list.map(fn(tuple: #(Float, Float)) -> Float { + { pair.first(tuple) -. pair.second(tuple) } + |> piecewise.float_absolute_value() + }) + differences + |> piecewise.list_maximum(float.compare) } } } @@ -441,7 +466,7 @@ pub fn mean(arr: List(Float)) -> Result(Float, String) { |> Error _ -> arr - |> arithmetics.float_sum() + |> arithmetics.float_sum(option.None) |> fn(a: Float) -> Float { a /. conversion.int_to_float(list.length(arr)) } @@ -579,7 +604,7 @@ pub fn variance(arr: List(Float), ddof: Int) -> Result(Float, String) { let assert Ok(result) = elementary.power(a -. mean, 2.0) result }) - |> arithmetics.float_sum() + |> arithmetics.float_sum(option.None) |> fn(a: Float) -> Float { a /. { @@ -969,34 +994,23 @@ pub fn overlap_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { pub fn cosine_similarity( xarr: List(Float), yarr: List(Float), + weights: option.Option(List(Float)), ) -> Result(Float, String) { - case xarr, yarr { - [], _ -> - "Invalid input argument: The list xarr is empty." 
+ case check_lists(xarr, yarr, weights) { + Error(msg) -> + msg |> Error - _, [] -> - "Invalid input argument: The list yarr is empty." - |> Error - _, _ -> { - let xlen: Int = list.length(xarr) - let ylen: Int = list.length(yarr) - case xlen == ylen { - False -> - "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr)." - |> Error - True -> { - list.fold( - list.zip(xarr, yarr), - 0.0, - fn(acc: Float, a: #(Float, Float)) -> Float { - let result: Float = pair.first(a) *. pair.second(a) - result +. acc - }, - ) - /. { norm(xarr, 2.0) *. norm(yarr, 2.0) } - |> Ok - } - } + Ok(_) -> { + list.fold( + list.zip(xarr, yarr), + 0.0, + fn(acc: Float, a: #(Float, Float)) -> Float { + let result: Float = pair.first(a) *. pair.second(a) + result +. acc + }, + ) + /. { norm(xarr, 2.0) *. norm(yarr, 2.0) } + |> Ok } } } @@ -1119,3 +1133,140 @@ fn distance_list_helper( } } } + +/// +/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// } +///
+/// +/// +/// +/// +pub fn canberra_distance( + xarr: List(Float), + yarr: List(Float), + weights: option.Option(List(Float)), +) -> Result(Float, String) { + case check_lists(xarr, yarr, weights) { + Error(msg) -> + msg + |> Error + Ok(_) -> { + let arr: List(Float) = + list.zip(xarr, yarr) + |> list.map(canberra_distance_helper) + case weights { + option.None -> { + arr + |> arithmetics.float_sum(option.None) + |> Ok + } + _ -> { + arr + |> arithmetics.float_sum(weights) + |> Ok + } + } + } + } +} + +fn canberra_distance_helper(tuple: #(Float, Float)) -> Float { + let numerator: Float = + piecewise.float_absolute_value({ pair.first(tuple) -. pair.second(tuple) }) + let denominator: Float = { + piecewise.float_absolute_value(pair.first(tuple)) + +. piecewise.float_absolute_value(pair.second(tuple)) + } + numerator /. denominator +} + +/// +/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// } +///
+/// +/// +/// +/// +pub fn braycurtis_distance( + xarr: List(Float), + yarr: List(Float), + weights: option.Option(List(Float)), +) -> Result(Float, String) { + case check_lists(xarr, yarr, weights) { + Error(msg) -> + msg + |> Error + Ok(_) -> { + let zipped_arr: List(#(Float, Float)) = list.zip(xarr, yarr) + let numerator_elements: List(Float) = + zipped_arr + |> list.map(fn(tuple: #(Float, Float)) -> Float { + piecewise.float_absolute_value({ + pair.first(tuple) -. pair.second(tuple) + }) + }) + let denominator_elements: List(Float) = + zipped_arr + |> list.map(fn(tuple: #(Float, Float)) -> Float { + piecewise.float_absolute_value({ + pair.first(tuple) +. pair.second(tuple) + }) + }) + + case weights { + option.None -> { + let numerator = + numerator_elements + |> arithmetics.float_sum(option.None) + let denominator = + denominator_elements + |> arithmetics.float_sum(option.None) + { numerator /. denominator } + |> Ok + } + _ -> { + let numerator = + numerator_elements + |> arithmetics.float_sum(weights) + let denominator = + denominator_elements + |> arithmetics.float_sum(weights) + { numerator /. 
denominator } + |> Ok + } + } + } + } +} diff --git a/test/gleam_community/maths/arithmetics_test.gleam b/test/gleam_community/maths/arithmetics_test.gleam index aa694b7..7ec5ad5 100644 --- a/test/gleam_community/maths/arithmetics_test.gleam +++ b/test/gleam_community/maths/arithmetics_test.gleam @@ -1,5 +1,6 @@ import gleam_community/maths/arithmetics import gleeunit/should +import gleam/option pub fn int_gcd_test() { arithmetics.gcd(1, 1) @@ -100,16 +101,16 @@ pub fn int_divisors_test() { pub fn float_list_sum_test() { // An empty list returns 0 [] - |> arithmetics.float_sum() + |> arithmetics.float_sum(option.None) |> should.equal(0.0) // Valid input returns a result [1.0, 2.0, 3.0] - |> arithmetics.float_sum() + |> arithmetics.float_sum(option.None) |> should.equal(6.0) [-2.0, 4.0, 6.0] - |> arithmetics.float_sum() + |> arithmetics.float_sum(option.None) |> should.equal(8.0) } @@ -132,17 +133,17 @@ pub fn int_list_sum_test() { pub fn float_list_product_test() { // An empty list returns 0 [] - |> arithmetics.float_product() - |> should.equal(1.0) + |> arithmetics.float_product(option.None) + |> should.equal(Ok(1.0)) // Valid input returns a result [1.0, 2.0, 3.0] - |> arithmetics.float_product() - |> should.equal(6.0) + |> arithmetics.float_product(option.None) + |> should.equal(Ok(6.0)) [-2.0, 4.0, 6.0] - |> arithmetics.float_product() - |> should.equal(-48.0) + |> arithmetics.float_product(option.None) + |> should.equal(Ok(-48.0)) } pub fn int_list_product_test() { diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index e5888a0..3ccf758 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -3,6 +3,7 @@ import gleam_community/maths/metrics import gleam_community/maths/predicates import gleeunit/should import gleam/set +import gleam/option pub fn float_list_norm_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) @@ -54,15 +55,16 @@ pub fn 
float_list_manhattan_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) // Empty lists returns an error - metrics.manhattan_distance([], []) + metrics.manhattan_distance([], [], option.None) |> should.be_error() // Differing lengths returns error - metrics.manhattan_distance([], [1.0]) + metrics.manhattan_distance([], [1.0], option.None) |> should.be_error() // manhattan distance (p = 1) - let assert Ok(result) = metrics.manhattan_distance([0.0, 0.0], [1.0, 2.0]) + let assert Ok(result) = + metrics.manhattan_distance([0.0, 0.0], [1.0, 2.0], option.None) result |> predicates.is_close(3.0, 0.0, tol) |> should.be_true() @@ -72,53 +74,53 @@ pub fn float_list_minkowski_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) // Empty lists returns an error - metrics.minkowski_distance([], [], 1.0) + metrics.minkowski_distance([], [], 1.0, option.None) |> should.be_error() // Differing lengths returns error - metrics.minkowski_distance([], [1.0], 1.0) + metrics.minkowski_distance([], [1.0], 1.0, option.None) |> should.be_error() // Test order < 1 - metrics.minkowski_distance([0.0, 0.0], [0.0, 0.0], -1.0) + metrics.minkowski_distance([0.0, 0.0], [0.0, 0.0], -1.0, option.None) |> should.be_error() // Check that the function agrees, at some arbitrary input // points, with known function values let assert Ok(result) = - metrics.minkowski_distance([1.0, 1.0], [1.0, 1.0], 1.0) + metrics.minkowski_distance([1.0, 1.0], [1.0, 1.0], 1.0, option.None) result |> predicates.is_close(0.0, 0.0, tol) |> should.be_true() let assert Ok(result) = - metrics.minkowski_distance([0.0, 0.0], [1.0, 1.0], 10.0) + metrics.minkowski_distance([0.0, 0.0], [1.0, 1.0], 10.0, option.None) result |> predicates.is_close(1.0717734625362931, 0.0, tol) |> should.be_true() let assert Ok(result) = - metrics.minkowski_distance([0.0, 0.0], [1.0, 1.0], 100.0) + metrics.minkowski_distance([0.0, 0.0], [1.0, 1.0], 100.0, option.None) result |> predicates.is_close(1.0069555500567189, 0.0, tol) |> 
should.be_true() let assert Ok(result) = - metrics.minkowski_distance([0.0, 0.0], [1.0, 1.0], 10.0) + metrics.minkowski_distance([0.0, 0.0], [1.0, 1.0], 10.0, option.None) result |> predicates.is_close(1.0717734625362931, 0.0, tol) |> should.be_true() // Euclidean distance (p = 2) let assert Ok(result) = - metrics.minkowski_distance([0.0, 0.0], [1.0, 2.0], 2.0) + metrics.minkowski_distance([0.0, 0.0], [1.0, 2.0], 2.0, option.None) result |> predicates.is_close(2.23606797749979, 0.0, tol) |> should.be_true() // Manhattan distance (p = 1) let assert Ok(result) = - metrics.minkowski_distance([0.0, 0.0], [1.0, 2.0], 1.0) + metrics.minkowski_distance([0.0, 0.0], [1.0, 2.0], 1.0, option.None) result |> predicates.is_close(3.0, 0.0, tol) |> should.be_true() @@ -128,15 +130,16 @@ pub fn float_list_euclidean_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) // Empty lists returns an error - metrics.euclidean_distance([], []) + metrics.euclidean_distance([], [], option.None) |> should.be_error() // Differing lengths returns error - metrics.euclidean_distance([], [1.0]) + metrics.euclidean_distance([], [1.0], option.None) |> should.be_error() // Euclidean distance (p = 2) - let assert Ok(result) = metrics.euclidean_distance([0.0, 0.0], [1.0, 2.0]) + let assert Ok(result) = + metrics.euclidean_distance([0.0, 0.0], [1.0, 2.0], option.None) result |> predicates.is_close(2.23606797749979, 0.0, tol) |> should.be_true() @@ -266,65 +269,69 @@ pub fn overlap_coefficient_test() { pub fn cosine_similarity_test() { // Empty lists returns an error - metrics.cosine_similarity([], []) + metrics.cosine_similarity([], [], option.None) |> should.be_error() // One empty list returns an error - metrics.cosine_similarity([1.0, 2.0, 3.0], []) + metrics.cosine_similarity([1.0, 2.0, 3.0], [], option.None) |> should.be_error() // One empty list returns an error - metrics.cosine_similarity([], [1.0, 2.0, 3.0]) + metrics.cosine_similarity([], [1.0, 2.0, 3.0], option.None) |> 
should.be_error() // Different sized lists returns an error - metrics.cosine_similarity([1.0, 2.0], [1.0, 2.0, 3.0, 4.0]) + metrics.cosine_similarity([1.0, 2.0], [1.0, 2.0, 3.0, 4.0], option.None) |> should.be_error() // Two orthogonal vectors (represented by lists) - metrics.cosine_similarity([-1.0, 1.0, 0.0], [1.0, 1.0, -1.0]) + metrics.cosine_similarity([-1.0, 1.0, 0.0], [1.0, 1.0, -1.0], option.None) |> should.equal(Ok(0.0)) // Two identical (parallel) vectors (represented by lists) - metrics.cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) + metrics.cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0], option.None) |> should.equal(Ok(1.0)) // Two parallel, but oppositely oriented vectors (represented by lists) - metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0]) + metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0], option.None) |> should.equal(Ok(-1.0)) } pub fn chebyshev_distance_test() { // Empty lists returns an error - metrics.chebyshev_distance([], []) + metrics.chebyshev_distance([], [], option.None) |> should.be_error() // One empty list returns an error - metrics.chebyshev_distance([1.0, 2.0, 3.0], []) + metrics.chebyshev_distance([1.0, 2.0, 3.0], [], option.None) |> should.be_error() // One empty list returns an error - metrics.chebyshev_distance([], [1.0, 2.0, 3.0]) + metrics.chebyshev_distance([], [1.0, 2.0, 3.0], option.None) |> should.be_error() // Different sized lists returns an error - metrics.chebyshev_distance([1.0, 2.0], [1.0, 2.0, 3.0, 4.0]) + metrics.chebyshev_distance([1.0, 2.0], [1.0, 2.0, 3.0, 4.0], option.None) |> should.be_error() // Try different types of valid input - metrics.chebyshev_distance([1.0, 0.0], [0.0, 2.0]) + metrics.chebyshev_distance([1.0, 0.0], [0.0, 2.0], option.None) |> should.equal(Ok(2.0)) - metrics.chebyshev_distance([1.0, 0.0], [2.0, 0.0]) + metrics.chebyshev_distance([1.0, 0.0], [2.0, 0.0], option.None) |> should.equal(Ok(1.0)) - metrics.chebyshev_distance([1.0, 0.0], [-2.0, 0.0]) + 
metrics.chebyshev_distance([1.0, 0.0], [-2.0, 0.0], option.None) |> should.equal(Ok(3.0)) - metrics.chebyshev_distance([-5.0, -10.0, -3.0], [-1.0, -12.0, -3.0]) + metrics.chebyshev_distance( + [-5.0, -10.0, -3.0], + [-1.0, -12.0, -3.0], + option.None, + ) |> should.equal(Ok(4.0)) - metrics.chebyshev_distance([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) + metrics.chebyshev_distance([1.0, 2.0, 3.0], [1.0, 2.0, 3.0], option.None) |> should.equal(Ok(0.0)) } @@ -367,3 +374,99 @@ pub fn levenshtein_distance_test() { ) |> should.equal(10) } + +pub fn canberra_distance_test() { + // Empty lists returns an error + metrics.canberra_distance([], [], option.None) + |> should.be_error() + + // One empty list returns an error + metrics.canberra_distance([1.0, 2.0, 3.0], [], option.None) + |> should.be_error() + + // One empty list returns an error + metrics.canberra_distance([], [1.0, 2.0, 3.0], option.None) + |> should.be_error() + + // Different sized lists returns an error + metrics.canberra_distance([1.0, 2.0], [1.0, 2.0, 3.0, 4.0], option.None) + |> should.be_error() + + // Try different types of valid input + metrics.canberra_distance([0.0, 0.0], [0.0, 0.0], option.None) + |> should.equal(Ok(0.0)) + + metrics.canberra_distance([1.0, 2.0], [-2.0, -1.0], option.None) + |> should.equal(Ok(2.0)) + + metrics.canberra_distance([1.0, 0.0], [0.0, 2.0], option.None) + |> should.equal(Ok(2.0)) + + metrics.canberra_distance([1.0, 0.0], [2.0, 0.0], option.None) + |> should.equal(Ok(1.0 /. 
3.0)) + + metrics.canberra_distance([1.0, 0.0], [0.0, 2.0], option.Some([1.0, 1.0])) + |> should.equal(Ok(2.0)) + + metrics.canberra_distance([1.0, 0.0], [0.0, 2.0], option.Some([1.0, 0.5])) + |> should.equal(Ok(1.5)) + + metrics.canberra_distance([1.0, 0.0], [0.0, 2.0], option.Some([0.5, 0.5])) + |> should.equal(Ok(1.0)) + + // Different sized lists (weights) returns an error + metrics.canberra_distance( + [1.0, 2.0, 3.0], + [1.0, 2.0, 3.0], + option.Some([1.0]), + ) + |> should.be_error() +} + +pub fn braycurtis_distance_test() { + // Empty lists returns an error + metrics.braycurtis_distance([], [], option.None) + |> should.be_error() + + // One empty list returns an error + metrics.braycurtis_distance([1.0, 2.0, 3.0], [], option.None) + |> should.be_error() + + // One empty list returns an error + metrics.braycurtis_distance([], [1.0, 2.0, 3.0], option.None) + |> should.be_error() + + // Different sized lists returns an error + metrics.braycurtis_distance([1.0, 2.0], [1.0, 2.0, 3.0, 4.0], option.None) + |> should.be_error() + + // Try different types of valid input + metrics.braycurtis_distance([0.0, 0.0], [0.0, 0.0], option.None) + |> should.equal(Ok(0.0)) + + metrics.braycurtis_distance([1.0, 2.0], [-2.0, -1.0], option.None) + |> should.equal(Ok(3.0)) + + metrics.braycurtis_distance([1.0, 0.0], [0.0, 2.0], option.None) + |> should.equal(Ok(1.0)) + + metrics.braycurtis_distance([1.0, 2.0], [3.0, 4.0], option.None) + |> should.equal(Ok(0.4)) + + metrics.braycurtis_distance([1.0, 2.0], [3.0, 4.0], option.Some([1.0, 1.0])) + |> should.equal(Ok(0.4)) + + metrics.braycurtis_distance([1.0, 2.0], [3.0, 4.0], option.Some([0.5, 1.0])) + |> should.equal(Ok(0.375)) + + metrics.braycurtis_distance([1.0, 2.0], [3.0, 4.0], option.Some([0.25, 0.25])) + |> should.equal(Ok(0.4)) + + // Different sized lists (weights) returns an error + metrics.braycurtis_distance( + [1.0, 2.0, 3.0], + [1.0, 2.0, 3.0], + option.Some([1.0]), + ) + |> should.be_error() +} From 
2313363a9e921f11df2675f64226d566b81d1af2 Mon Sep 17 00:00:00 2001 From: NicklasXYZ <18580183+NicklasXYZ@users.noreply.github.com> Date: Sun, 14 Apr 2024 16:54:10 +0200 Subject: [PATCH 14/16] Allow passing weights in distance calculations --- src/gleam_community/maths/arithmetics.gleam | 91 ++-- src/gleam_community/maths/combinatorics.gleam | 6 +- src/gleam_community/maths/conversion.gleam | 6 +- src/gleam_community/maths/elementary.gleam | 6 +- src/gleam_community/maths/metrics.gleam | 427 ++++++++++++------ src/gleam_community/maths/piecewise.gleam | 6 +- src/gleam_community/maths/predicates.gleam | 36 +- src/gleam_community/maths/sequences.gleam | 24 +- src/gleam_community/maths/special.gleam | 4 +- .../maths/arithmetics_test.gleam | 6 +- test/gleam_community/maths/metrics_test.gleam | 263 +++++++++-- 11 files changed, 626 insertions(+), 249 deletions(-) diff --git a/src/gleam_community/maths/arithmetics.gleam b/src/gleam_community/maths/arithmetics.gleam index b37a067..ebffe30 100644 --- a/src/gleam_community/maths/arithmetics.gleam +++ b/src/gleam_community/maths/arithmetics.gleam @@ -1,6 +1,6 @@ -//// -//// -//// +//// +//// +//// //// -//// +//// +//// +//// //// -//// +//// +//// +//// //// -//// +//// +//// +//// //// -//// +//// +//// +//// //// -//// +//// +//// +//// ////