Add new distance & similarity measures

This commit is contained in:
NicklasXYZ 2024-03-19 15:04:44 +01:00
parent c825bb522f
commit 24e496a4a8
2 changed files with 242 additions and 17 deletions

View file

@ -137,7 +137,7 @@ pub fn norm(arr: List(Float), p: Float) -> Float {
/// let assert Ok(tol) = elementary.power(-10.0, -6.0)
///
/// // Empty lists returns 0.0
/// metrics.float_manhatten_distance([], [])
/// metrics.manhatten_distance([], [])
/// |> should.equal(Ok(0.0))
///
/// // Differing lengths returns error
@ -567,13 +567,36 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String)
/// </a>
/// </div>
///
/// The Jaccard index measures similarity between two sets of elements. Mathematically, the Jaccard index
/// is defined as:
///
/// \\[
/// \text{JI}(X, Y) = \frac{|X \cap Y|}{|X \cup Y|} \in \left[0, 1\right]
/// \\]
///
/// where:
///
/// - $$X$$ and $$Y$$ are two sets being compared,
/// - $$|X \cap Y|$$ represents the size of the intersection of the two sets
/// - $$|X \cup Y|$$ denotes the size of the union of the two sets
///
/// The value of the Jaccard index ranges from 0 to 1, where 0 indicates that the two sets share no elements
/// and 1 indicates that the sets are identical. The Jaccard index is a special case of the
/// [Tversky index](#tversky_index) (with $$\alpha=\beta=1$$).
///
/// <details>
/// <summary>Example:</summary>
///
/// import gleeunit/should
/// import gleam_community/maths/metrics
/// import gleam/set
///
/// pub fn example () {
/// let xset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"])
/// let yset: set.Set(String) =
/// set.from_list(["monkey", "rhino", "ostrich", "salmon"])
/// metrics.jaccard_index(xset, yset)
/// |> should.equal(1.0 /. 7.0)
/// }
/// </details>
///
@ -583,8 +606,8 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String)
/// </a>
/// </div>
///
pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float {
let assert Ok(result) = tversky_index(aset, bset, 1.0, 1.0)
pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float {
  // The Jaccard index is the Tversky index evaluated with alpha = beta = 1.0.
  // NOTE(review): the assertion presumes tversky_index only returns an Error
  // for negative weights, so it should never fail here - confirm.
  let assert Ok(index) =
    xset
    |> tversky_index(yset, 1.0, 1.0)
  index
}
@ -594,13 +617,36 @@ pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float {
/// </a>
/// </div>
///
/// The Sørensen-Dice coefficient measures the similarity between two sets of elements. Mathematically, the
/// coefficient is defined as:
///
/// \\[
/// \text{DSC}(X, Y) = \frac{2 \times |X \cap Y|}{|X| + |Y|} \in \left[0, 1\right]
/// \\]
///
/// where:
/// - $$X$$ and $$Y$$ are two sets being compared
/// - $$|X \cap Y|$$ is the size of the intersection of the two sets (i.e., the number of elements common to both sets)
/// - $$|X|$$ and $$|Y|$$ are the sizes of the sets $$X$$ and $$Y$$, respectively
///
/// The coefficient ranges from 0 to 1, where 0 indicates no similarity (the sets share no elements) and 1
/// indicates perfect similarity (the sets are identical). The higher the coefficient, the greater the similarity
/// between the two sets. The Sørensen-Dice coefficient is a special case of the
/// [Tversky index](#tversky_index) (with $$\alpha=\beta=0.5$$).
///
/// <details>
/// <summary>Example:</summary>
///
/// import gleeunit/should
/// import gleam_community/maths/metrics
/// import gleam/set
///
/// pub fn example () {
/// let xset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"])
/// let yset: set.Set(String) =
/// set.from_list(["monkey", "rhino", "ostrich", "salmon", "spider"])
/// metrics.sorensen_dice_coefficient(xset, yset)
/// |> should.equal(2.0 *. 1.0 /. { 4.0 +. 5.0 })
/// }
/// </details>
///
@ -610,8 +656,8 @@ pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float {
/// </a>
/// </div>
///
pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float {
let assert Ok(result) = tversky_index(aset, bset, 0.5, 0.5)
pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float {
  // The Sørensen-Dice coefficient is the Tversky index evaluated with
  // alpha = beta = 0.5.
  // NOTE(review): the assertion presumes tversky_index only returns an Error
  // for negative weights, so it should never fail here - confirm.
  let assert Ok(coefficient) =
    xset
    |> tversky_index(yset, 0.5, 0.5)
  coefficient
}
@ -621,15 +667,39 @@ pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float {
/// </a>
/// </div>
///
/// The Tversky index is a generalization of the Sørensen–Dice coefficient and the Jaccard index.
/// The Tversky index is a generalization of the Jaccard index and Sørensen-Dice coefficient, which adds
/// flexibility through two parameters, $$\alpha$$ and $$\beta$$, allowing for asymmetric similarity
/// measures between sets. The Tversky index is defined as:
///
/// \\[
/// \text{TI}(X, Y) = \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|}
/// \\]
///
/// where:
///
/// - $$X$$ and $$Y$$ are the sets being compared
/// - $$|X - Y|$$ and $$|Y - X|$$ are the sizes of the relative complements of $$Y$$ in $$X$$ and $$X$$ in $$Y$$, respectively,
/// - $$\alpha$$ and $$\beta$$ are parameters that weigh the relative importance of the elements unique to $$X$$ and $$Y$$
///
/// The Tversky index reduces to the Jaccard index when $$\alpha = \beta = 1$$ and to the Sørensen-Dice
/// coefficient when $$\alpha = \beta = 0.5$$. For non-negative $$\alpha$$ and $$\beta$$ the Tversky index takes values in
/// the interval $$[0, 1]$$. The index equals 0 when there is no intersection between the two sets, indicating no similarity,
/// and it equals 1 when the weighted difference terms vanish (for example, when the two sets are identical). When
/// $$\alpha \neq \beta$$ the index is asymmetric, i.e., in general $$\text{TI}(X, Y) \neq \text{TI}(Y, X)$$.
///
/// <details>
/// <summary>Example:</summary>
///
/// import gleeunit/should
/// import gleam_community/maths/metrics
/// import gleam/set
///
/// pub fn example () {
/// let yset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"])
/// let xset: set.Set(String) =
/// set.from_list(["monkey", "rhino", "ostrich", "salmon"])
/// // Test Jaccard index (alpha = beta = 1)
/// metrics.tversky_index(xset, yset, 1.0, 1.0)
/// |> should.equal(1.0 /. 7.0)
/// }
/// </details>
///
@ -640,23 +710,23 @@ pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float {
/// </div>
///
pub fn tversky_index(
aset: set.Set(a),
bset: set.Set(a),
xset: set.Set(a),
yset: set.Set(a),
alpha: Float,
beta: Float,
) -> Result(Float, String) {
case alpha >=. 0.0, beta >=. 0.0 {
True, True -> {
let intersection: Float =
set.intersection(aset, bset)
set.intersection(xset, yset)
|> set.size()
|> conversion.int_to_float()
let difference1: Float =
set.difference(aset, bset)
set.difference(xset, yset)
|> set.size()
|> conversion.int_to_float()
let difference2: Float =
set.difference(bset, aset)
set.difference(yset, xset)
|> set.size()
|> conversion.int_to_float()
intersection
@ -684,14 +754,39 @@ pub fn tversky_index(
/// </a>
/// </div>
///
///
/// The Overlap coefficient, also known as the Szymkiewicz–Simpson coefficient, is a measure of
/// similarity between two sets that focuses on the size of the intersection relative to the
/// smaller of the two sets. It is defined mathematically as:
///
/// \\[
/// \text{OC}(X, Y) = \frac{|X \cap Y|}{\min(|X|, |Y|)} \in \left[0, 1\right]
/// \\]
///
/// where:
///
/// - $$X$$ and $$Y$$ are the sets being compared
/// - $$|X \cap Y|$$ is the size of the intersection of the sets
/// - $$\min(|X|, |Y|)$$ is the size of the smaller set among $$X$$ and $$Y$$
///
/// The coefficient ranges from 0 to 1, where 0 indicates no overlap and 1 indicates that the
/// smaller set is a subset of the larger set. This measure is especially useful in situations
/// where the similarity in terms of the proportion of overlap is more relevant than the
/// difference in sizes between the two sets.
///
/// <details>
/// <summary>Example:</summary>
///
/// import gleeunit/should
/// import gleam_community/maths/metrics
/// import gleam/set
///
/// pub fn example () {
/// let set_a: set.Set(String) =
/// set.from_list(["horse", "dog", "hippo", "monkey", "bird"])
/// let set_b: set.Set(String) =
/// set.from_list(["monkey", "bird", "ostrich", "salmon"])
/// metrics.overlap_coefficient(set_a, set_b)
/// |> should.equal(2.0 /. 4.0)
/// }
/// </details>
///
@ -701,13 +796,92 @@ pub fn tversky_index(
/// </a>
/// </div>
///
pub fn overlap_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float {
pub fn overlap_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float {
let intersection: Float =
set.intersection(aset, bset)
set.intersection(xset, yset)
|> set.size()
|> conversion.int_to_float()
let minsize: Float =
piecewise.minimum(set.size(aset), set.size(bset), int.compare)
piecewise.minimum(set.size(xset), set.size(yset), int.compare)
|> conversion.int_to_float()
intersection /. minsize
}
/// <div style="text-align: right;">
/// <a href="https://github.com/gleam-community/maths/issues">
/// <small>Spot a typo? Open an issue!</small>
/// </a>
/// </div>
///
/// Calculate the cosine similarity between two lists (representing vectors):
///
/// \\[
/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}}
/// \\]
///
/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. The numerator
/// represents the dot product of the two vectors, while the denominator is the product of the magnitudes (Euclidean norms) of the two vectors.
/// The cosine similarity provides a value between -1 and 1, where 1 means the vectors are in the same direction, -1 means they are in exactly
/// opposite directions, and 0 indicates orthogonality.
///
/// <details>
/// <summary>Example:</summary>
///
/// import gleeunit/should
/// import gleam_community/maths/metrics
///
/// pub fn example () {
/// // Two orthogonal vectors
/// metrics.cosine_similarity([-1.0, 1.0, 0.0], [1.0, 1.0, -1.0])
/// |> should.equal(Ok(0.0))
///
/// // Two identical (parallel) vectors
/// metrics.cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0])
/// |> should.equal(Ok(1.0))
///
/// // Two parallel, but oppositely oriented vectors
/// metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0])
/// |> should.equal(Ok(-1.0))
/// }
/// </details>
///
/// <div style="text-align: right;">
/// <a href="#">
/// <small>Back to top </small>
/// </a>
/// </div>
///
pub fn cosine_similarity(
  xarr: List(Float),
  yarr: List(Float),
) -> Result(Float, String) {
  case xarr, yarr {
    // An empty first list is reported first, even when both lists are empty.
    [], _ -> Error("Invalid input argument: The list xarr is empty.")
    _, [] -> Error("Invalid input argument: The list yarr is empty.")
    _, _ ->
      // Both lists are non-empty; they must also have the same length.
      case list.length(xarr) == list.length(yarr) {
        False ->
          Error(
            "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr).",
          )
        True -> {
          // Numerator: dot product, accumulated over the element pairs.
          let dot_product: Float =
            list.zip(xarr, yarr)
            |> list.fold(0.0, fn(acc: Float, xy: #(Float, Float)) -> Float {
              let #(x, y) = xy
              acc +. x *. y
            })
          // Denominator: product of the two norms with p = 2.0.
          // NOTE(review): if either list contains only zeros the denominator
          // is 0.0; Gleam's `/.` is expected to evaluate to 0.0 in that case
          // rather than fail - confirm that Ok(0.0) is the intended result.
          let magnitudes: Float = norm(xarr, 2.0) *. norm(yarr, 2.0)
          Ok(dot_product /. magnitudes)
        }
      }
  }
}

View file

@ -235,6 +235,27 @@ pub fn example_jaccard_index_test() {
|> should.equal(1.0 /. 7.0)
}
pub fn example_sorensen_dice_coefficient_test() {
  // Two empty sets are expected to score 0.0
  let empty_score: Float =
    metrics.sorensen_dice_coefficient(set.from_list([]), set.from_list([]))
  should.equal(empty_score, 0.0)

  // Two sets of seven integers sharing four elements (0, 2, 5 and 9)
  let first_ints: set.Set(Int) = set.from_list([0, 1, 2, 5, 6, 8, 9])
  let second_ints: set.Set(Int) = set.from_list([0, 2, 3, 4, 5, 7, 9])
  first_ints
  |> metrics.sorensen_dice_coefficient(second_ints)
  |> should.equal(2.0 *. 4.0 /. { 7.0 +. 7.0 })

  // Disjoint sets (six and five elements) share nothing
  let low_ints: set.Set(Int) = set.from_list([0, 1, 2, 3, 4, 5])
  let high_ints: set.Set(Int) = set.from_list([6, 7, 8, 9, 10])
  low_ints
  |> metrics.sorensen_dice_coefficient(high_ints)
  |> should.equal(2.0 *. 0.0 /. { 6.0 +. 5.0 })

  // String sets of four and five elements sharing only "monkey"
  let pets: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"])
  let wildlife: set.Set(String) =
    set.from_list(["monkey", "rhino", "ostrich", "salmon", "spider"])
  pets
  |> metrics.sorensen_dice_coefficient(wildlife)
  |> should.equal(2.0 *. 1.0 /. { 4.0 +. 5.0 })
}
pub fn example_overlap_coefficient_test() {
metrics.overlap_coefficient(set.from_list([]), set.from_list([]))
|> should.equal(0.0)
@ -250,9 +271,39 @@ pub fn example_overlap_coefficient_test() {
|> should.equal(0.0 /. 5.0)
let set_e: set.Set(String) =
set.from_list(["cat", "dog", "hippo", "monkey", "rhino"])
set.from_list(["horse", "dog", "hippo", "monkey", "bird"])
let set_f: set.Set(String) =
set.from_list(["monkey", "rhino", "ostrich", "salmon"])
set.from_list(["monkey", "bird", "ostrich", "salmon"])
metrics.overlap_coefficient(set_e, set_f)
|> should.equal(2.0 /. 4.0)
}
pub fn example_cosine_similarity_test() {
  // Two empty lists are rejected
  should.be_error(metrics.cosine_similarity([], []))

  // An empty second list is rejected
  should.be_error(metrics.cosine_similarity([1.0, 2.0, 3.0], []))

  // An empty first list is rejected
  should.be_error(metrics.cosine_similarity([], [1.0, 2.0, 3.0]))

  // Lists of different lengths are rejected
  should.be_error(metrics.cosine_similarity([1.0, 2.0], [1.0, 2.0, 3.0, 4.0]))

  // Orthogonal vectors (dot product is zero) give 0.0
  should.equal(
    metrics.cosine_similarity([-1.0, 1.0, 0.0], [1.0, 1.0, -1.0]),
    Ok(0.0),
  )

  // Identical vectors (same direction) give 1.0
  should.equal(
    metrics.cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]),
    Ok(1.0),
  )

  // Vectors pointing in exactly opposite directions give -1.0
  should.equal(
    metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0]),
    Ok(-1.0),
  )
}