diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam
index 4c45285..a4ee8ab 100644
--- a/src/gleam_community/maths/metrics.gleam
+++ b/src/gleam_community/maths/metrics.gleam
@@ -137,7 +137,7 @@ pub fn norm(arr: List(Float), p: Float) -> Float {
/// let assert Ok(tol) = elementary.power(-10.0, -6.0)
///
/// // Empty lists returns 0.0
-/// metrics.float_manhatten_distance([], [])
+/// metrics.manhatten_distance([], [])
/// |> should.equal(Ok(0.0))
///
/// // Differing lengths returns error
@@ -567,13 +567,36 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String)
///
///
///
+/// The Jaccard index measures similarity between two sets of elements. Mathematically, the Jaccard index
+/// is defined as:
+///
+/// \\[
+/// \text{JI}(X, Y) = \frac{|X \cap Y|}{|X \cup Y|} \in \left[0, 1\right]
+/// \\]
+///
+/// where:
+///
+/// - $$X$$ and $$Y$$ are two sets being compared,
+/// - $$|X \cap Y|$$ represents the size of the intersection of the two sets
+/// - $$|X \cup Y|$$ denotes the size of the union of the two sets
+///
+/// The value of the Jaccard index ranges from 0 to 1, where 0 indicates that the two sets share no elements
+/// and 1 indicates that the sets are identical. The Jaccard index is a special case of the
+/// [Tversky index](#tversky_index) (with $$\alpha=\beta=1$$).
+///
///
/// Example:
///
/// import gleeunit/should
/// import gleam_community/maths/metrics
+/// import gleam/set
///
/// pub fn example () {
+/// let xset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"])
+/// let yset: set.Set(String) =
+/// set.from_list(["monkey", "rhino", "ostrich", "salmon"])
+/// metrics.jaccard_index(xset, yset)
+/// |> should.equal(1.0 /. 7.0)
/// }
///
///
@@ -583,8 +606,8 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String)
///
///
///
-pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float {
- let assert Ok(result) = tversky_index(aset, bset, 1.0, 1.0)
+pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float {
+ let assert Ok(result) = tversky_index(xset, yset, 1.0, 1.0)
result
}
@@ -594,13 +617,36 @@ pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float {
///
///
///
+/// The Sørensen-Dice coefficient measures the similarity between two sets of elements. Mathematically, the
+/// coefficient is defined as:
+///
+/// \\[
+/// \text{DSC}(X, Y) = \frac{2 \times |X \cap Y|}{|X| + |Y|} \in \left[0, 1\right]
+/// \\]
+///
+/// where:
+/// - $$X$$ and $$Y$$ are two sets being compared
+/// - $$|X \cap Y|$$ is the size of the intersection of the two sets (i.e., the number of elements common to both sets)
+/// - $$|X|$$ and $$|Y|$$ are the sizes of the sets $$X$$ and $$Y$$, respectively
+///
+/// The coefficient ranges from 0 to 1, where 0 indicates no similarity (the sets share no elements) and 1
+/// indicates perfect similarity (the sets are identical). The higher the coefficient, the greater the similarity
+/// between the two sets. The Sørensen-Dice coefficient is a special case of the
+/// [Tversky index](#tversky_index) (with $$\alpha=\beta=0.5$$).
+///
///
/// Example:
///
/// import gleeunit/should
/// import gleam_community/maths/metrics
+/// import gleam/set
///
/// pub fn example () {
+/// let xset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"])
+/// let yset: set.Set(String) =
+/// set.from_list(["monkey", "rhino", "ostrich", "salmon", "spider"])
+/// metrics.sorensen_dice_coefficient(xset, yset)
+/// |> should.equal(2.0 *. 1.0 /. { 4.0 +. 5.0 })
/// }
///
///
@@ -610,8 +656,8 @@ pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float {
///
///
///
-pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float {
- let assert Ok(result) = tversky_index(aset, bset, 0.5, 0.5)
+pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float {
+ let assert Ok(result) = tversky_index(xset, yset, 0.5, 0.5)
result
}
@@ -621,15 +667,39 @@ pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float {
///
///
///
-/// The Tversky index is a generalization of the Sørensen–Dice coefficient and the Jaccard index.
+/// The Tversky index is a generalization of the Jaccard index and Sørensen-Dice coefficient, which adds
+/// flexibility through two parameters, $$\alpha$$ and $$\beta$$, allowing for asymmetric similarity
+/// measures between sets. The Tversky index is defined as:
///
+/// \\[
+/// \text{TI}(X, Y) = \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|}
+/// \\]
+///
+/// where:
+///
+/// - $$X$$ and $$Y$$ are the sets being compared
+/// - $$|X - Y|$$ and $$|Y - X|$$ are the sizes of the relative complements of $$Y$$ in $$X$$ and $$X$$ in $$Y$$, respectively,
+/// - $$\alpha$$ and $$\beta$$ are parameters that weigh the relative importance of the elements unique to $$X$$ and $$Y$$
+///
+/// The Tversky index reduces to the Jaccard index when $$\alpha = \beta = 1$$ and to the Sørensen-Dice
+/// coefficient when $$\alpha = \beta = 0.5$$. For non-negative $$\alpha$$ and $$\beta$$, the Tversky index lies between 0 and 1.
+/// The index equals 0 when there is no intersection between the two sets, indicating no similarity, and equals 1 when the sets
+/// are identical and non-empty. When $$\alpha \neq \beta$$ the measure is asymmetric: in general $$\text{TI}(X, Y) \neq \text{TI}(Y, X)$$.
+///
///
/// Example:
///
/// import gleeunit/should
/// import gleam_community/maths/metrics
+/// import gleam/set
///
/// pub fn example () {
+/// let xset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"])
+/// let yset: set.Set(String) =
+/// set.from_list(["monkey", "rhino", "ostrich", "salmon"])
+/// // Test Jaccard index (alpha = beta = 1)
+/// metrics.tversky_index(xset, yset, 1.0, 1.0)
+/// |> should.equal(Ok(1.0 /. 7.0))
/// }
///
///
@@ -640,23 +710,23 @@ pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float {
///
///
pub fn tversky_index(
- aset: set.Set(a),
- bset: set.Set(a),
+ xset: set.Set(a),
+ yset: set.Set(a),
alpha: Float,
beta: Float,
) -> Result(Float, String) {
case alpha >=. 0.0, beta >=. 0.0 {
True, True -> {
let intersection: Float =
- set.intersection(aset, bset)
+ set.intersection(xset, yset)
|> set.size()
|> conversion.int_to_float()
let difference1: Float =
- set.difference(aset, bset)
+ set.difference(xset, yset)
|> set.size()
|> conversion.int_to_float()
let difference2: Float =
- set.difference(bset, aset)
+ set.difference(yset, xset)
|> set.size()
|> conversion.int_to_float()
intersection
@@ -684,14 +754,39 @@ pub fn tversky_index(
///
///
///
-///
+/// The Overlap coefficient, also known as the Szymkiewicz–Simpson coefficient, is a measure of
+/// similarity between two sets that focuses on the size of the intersection relative to the
+/// smaller of the two sets. It is defined mathematically as:
+///
+/// \\[
+/// \text{OC}(X, Y) = \frac{|X \cap Y|}{\min(|X|, |Y|)} \in \left[0, 1\right]
+/// \\]
+///
+/// where:
+///
+/// - $$X$$ and $$Y$$ are the sets being compared
+/// - $$|X \cap Y|$$ is the size of the intersection of the sets
+/// - $$\min(|X|, |Y|)$$ is the size of the smaller set among $$X$$ and $$Y$$
+///
+/// The coefficient ranges from 0 to 1, where 0 indicates no overlap and 1 indicates that the
+/// smaller set is a subset of the larger set. This measure is especially useful in situations
+/// where the similarity in terms of the proportion of overlap is more relevant than the
+/// difference in sizes between the two sets.
+///
///
/// Example:
///
/// import gleeunit/should
/// import gleam_community/maths/metrics
+/// import gleam/set
///
/// pub fn example () {
+/// let set_a: set.Set(String) =
+/// set.from_list(["horse", "dog", "hippo", "monkey", "bird"])
+/// let set_b: set.Set(String) =
+/// set.from_list(["monkey", "bird", "ostrich", "salmon"])
+/// metrics.overlap_coefficient(set_a, set_b)
+/// |> should.equal(2.0 /. 4.0)
/// }
///
///
@@ -701,13 +796,92 @@ pub fn tversky_index(
///
///
///
-pub fn overlap_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float {
+pub fn overlap_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float {
let intersection: Float =
- set.intersection(aset, bset)
+ set.intersection(xset, yset)
|> set.size()
|> conversion.int_to_float()
let minsize: Float =
- piecewise.minimum(set.size(aset), set.size(bset), int.compare)
+ piecewise.minimum(set.size(xset), set.size(yset), int.compare)
|> conversion.int_to_float()
intersection /. minsize
}
+
+///
+///
+/// Calculate the cosine similarity between two lists (representing vectors):
+///
+/// \\[
+/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}}
+/// \\]
+///
+/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. The numerator
+/// represents the dot product of the two vectors, while the denominator is the product of the magnitudes (Euclidean norms) of the two vectors.
+/// The cosine similarity provides a value between -1 and 1, where 1 means the vectors are in the same direction, -1 means they are in exactly
+/// opposite directions, and 0 indicates orthogonality.
+///
+///
+/// Example:
+///
+/// import gleeunit/should
+/// import gleam_community/maths/metrics
+///
+/// pub fn example () {
+/// // Two orthogonal vectors
+/// metrics.cosine_similarity([-1.0, 1.0, 0.0], [1.0, 1.0, -1.0])
+/// |> should.equal(Ok(0.0))
+///
+/// // Two identical (parallel) vectors
+/// metrics.cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0])
+/// |> should.equal(Ok(1.0))
+///
+/// // Two parallel, but oppositely oriented vectors
+/// metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0])
+/// |> should.equal(Ok(-1.0))
+/// }
+///
+///
+///
+///
+pub fn cosine_similarity(
+ xarr: List(Float),
+ yarr: List(Float),
+) -> Result(Float, String) {
+ let xlen: Int = list.length(xarr)
+ let ylen: Int = list.length(yarr)
+ case xarr, yarr {
+ [], _ ->
+ "Invalid input argument: The list xarr is empty."
+ |> Error
+ _, [] ->
+ "Invalid input argument: The list yarr is empty."
+ |> Error
+ _, _ -> {
+ case xlen == ylen {
+ False ->
+ "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr)."
+ |> Error
+ True -> {
+ list.fold(
+ list.zip(xarr, yarr),
+ 0.0,
+ fn(acc: Float, a: #(Float, Float)) -> Float {
+ let result: Float = pair.first(a) *. pair.second(a)
+ result +. acc
+ },
+ )
+ /. { norm(xarr, 2.0) *. norm(yarr, 2.0) }
+ |> Ok
+ }
+ }
+ }
+ }
+}
diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam
index 2cb5ef1..e2f7307 100644
--- a/test/gleam_community/maths/metrics_test.gleam
+++ b/test/gleam_community/maths/metrics_test.gleam
@@ -235,6 +235,27 @@ pub fn example_jaccard_index_test() {
|> should.equal(1.0 /. 7.0)
}
+pub fn example_sorensen_dice_coefficient_test() {
+ metrics.sorensen_dice_coefficient(set.from_list([]), set.from_list([]))
+ |> should.equal(0.0)
+
+ let set_a: set.Set(Int) = set.from_list([0, 1, 2, 5, 6, 8, 9])
+ let set_b: set.Set(Int) = set.from_list([0, 2, 3, 4, 5, 7, 9])
+ metrics.sorensen_dice_coefficient(set_a, set_b)
+ |> should.equal(2.0 *. 4.0 /. { 7.0 +. 7.0 })
+
+ let set_c: set.Set(Int) = set.from_list([0, 1, 2, 3, 4, 5])
+ let set_d: set.Set(Int) = set.from_list([6, 7, 8, 9, 10])
+ metrics.sorensen_dice_coefficient(set_c, set_d)
+ |> should.equal(2.0 *. 0.0 /. { 6.0 +. 5.0 })
+
+ let set_e: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"])
+ let set_f: set.Set(String) =
+ set.from_list(["monkey", "rhino", "ostrich", "salmon", "spider"])
+ metrics.sorensen_dice_coefficient(set_e, set_f)
+ |> should.equal(2.0 *. 1.0 /. { 4.0 +. 5.0 })
+}
+
pub fn example_overlap_coefficient_test() {
metrics.overlap_coefficient(set.from_list([]), set.from_list([]))
|> should.equal(0.0)
@@ -250,9 +271,39 @@ pub fn example_overlap_coefficient_test() {
|> should.equal(0.0 /. 5.0)
let set_e: set.Set(String) =
- set.from_list(["cat", "dog", "hippo", "monkey", "rhino"])
+ set.from_list(["horse", "dog", "hippo", "monkey", "bird"])
let set_f: set.Set(String) =
- set.from_list(["monkey", "rhino", "ostrich", "salmon"])
+ set.from_list(["monkey", "bird", "ostrich", "salmon"])
metrics.overlap_coefficient(set_e, set_f)
|> should.equal(2.0 /. 4.0)
}
+
+pub fn example_cosine_similarity_test() {
+ // Empty lists return an error
+ metrics.cosine_similarity([], [])
+ |> should.be_error()
+
+ // One empty list returns an error
+ metrics.cosine_similarity([1.0, 2.0, 3.0], [])
+ |> should.be_error()
+
+ // One empty list returns an error
+ metrics.cosine_similarity([], [1.0, 2.0, 3.0])
+ |> should.be_error()
+
+ // Different sized lists return an error
+ metrics.cosine_similarity([1.0, 2.0], [1.0, 2.0, 3.0, 4.0])
+ |> should.be_error()
+
+ // Two orthogonal vectors (represented by lists)
+ metrics.cosine_similarity([-1.0, 1.0, 0.0], [1.0, 1.0, -1.0])
+ |> should.equal(Ok(0.0))
+
+ // Two identical (parallel) vectors (represented by lists)
+ metrics.cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0])
+ |> should.equal(Ok(1.0))
+
+ // Two parallel, but oppositely oriented vectors (represented by lists)
+ metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0])
+ |> should.equal(Ok(-1.0))
+}