mirror of
https://github.com/sigmasternchen/gleam-community-maths
synced 2025-03-15 07:59:01 +00:00
Merge pull request #18 from NicklasXYZ/main
Add Levenshtein distance and fix docs
This commit is contained in:
commit
6a6baddb36
2 changed files with 257 additions and 84 deletions
|
@ -23,11 +23,12 @@
|
||||||
////
|
////
|
||||||
//// ---
|
//// ---
|
||||||
////
|
////
|
||||||
//// Metrics: A module offering functions for calculating distances and other types of metrics.
|
//// Metrics: A module offering functions for calculating distances and other
|
||||||
|
//// types of metrics.
|
||||||
////
|
////
|
||||||
//// * **Distance measures**
|
//// * **Distance measures**
|
||||||
//// * [`norm`](#norm)
|
//// * [`norm`](#norm)
|
||||||
//// * [`manhatten_distance`](#manhatten_distance)
|
//// * [`manhattan_distance`](#manhattan_distance)
|
||||||
//// * [`euclidean_distance`](#euclidean_distance)
|
//// * [`euclidean_distance`](#euclidean_distance)
|
||||||
//// * [`chebyshev_distance`](#chebyshev_distance)
|
//// * [`chebyshev_distance`](#chebyshev_distance)
|
||||||
//// * [`minkowski_distance`](#minkowski_distance)
|
//// * [`minkowski_distance`](#minkowski_distance)
|
||||||
|
@ -54,6 +55,7 @@ import gleam/pair
|
||||||
import gleam/set
|
import gleam/set
|
||||||
import gleam/float
|
import gleam/float
|
||||||
import gleam/int
|
import gleam/int
|
||||||
|
import gleam/string
|
||||||
|
|
||||||
/// <div style="text-align: right;">
|
/// <div style="text-align: right;">
|
||||||
/// <a href="https://github.com/gleam-community/maths/issues">
|
/// <a href="https://github.com/gleam-community/maths/issues">
|
||||||
|
@ -67,7 +69,8 @@ import gleam/int
|
||||||
/// \left( \sum_{i=1}^n \left|x_i\right|^{p} \right)^{\frac{1}{p}}
|
/// \left( \sum_{i=1}^n \left|x_i\right|^{p} \right)^{\frac{1}{p}}
|
||||||
/// \\]
|
/// \\]
|
||||||
///
|
///
|
||||||
/// In the formula, $$n$$ is the length of the list and $$x_i$$ is the value in the input list indexed by $$i$$.
|
/// In the formula, $$n$$ is the length of the list and $$x_i$$ is the value in
|
||||||
|
/// the input list indexed by $$i$$.
|
||||||
///
|
///
|
||||||
/// <details>
|
/// <details>
|
||||||
/// <summary>Example:</summary>
|
/// <summary>Example:</summary>
|
||||||
|
@ -121,13 +124,14 @@ pub fn norm(arr: List(Float), p: Float) -> Float {
|
||||||
/// </a>
|
/// </a>
|
||||||
/// </div>
|
/// </div>
|
||||||
///
|
///
|
||||||
/// Calculate the Manhatten distance between two lists (representing vectors):
|
/// Calculate the Manhattan distance between two lists (representing vectors):
|
||||||
///
|
///
|
||||||
/// \\[
|
/// \\[
|
||||||
/// \sum_{i=1}^n \left|x_i - y_i \right|
|
/// \sum_{i=1}^n \left|x_i - y_i \right|
|
||||||
/// \\]
|
/// \\]
|
||||||
///
|
///
|
||||||
/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$.
|
/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the
|
||||||
|
/// values in the respective input lists indexed by $$i$$.
|
||||||
///
|
///
|
||||||
/// <details>
|
/// <details>
|
||||||
/// <summary>Example:</summary>
|
/// <summary>Example:</summary>
|
||||||
|
@ -141,14 +145,14 @@ pub fn norm(arr: List(Float), p: Float) -> Float {
|
||||||
/// let assert Ok(tol) = elementary.power(-10.0, -6.0)
|
/// let assert Ok(tol) = elementary.power(-10.0, -6.0)
|
||||||
///
|
///
|
||||||
/// // Empty lists returns an error
|
/// // Empty lists returns an error
|
||||||
/// metrics.manhatten_distance([], [])
|
/// metrics.manhattan_distance([], [])
|
||||||
/// |> should.be_error()
|
/// |> should.be_error()
|
||||||
///
|
///
|
||||||
/// // Differing lengths returns error
|
/// // Differing lengths returns error
|
||||||
/// metrics.manhatten_distance([], [1.0])
|
/// metrics.manhattan_distance([], [1.0])
|
||||||
/// |> should.be_error()
|
/// |> should.be_error()
|
||||||
///
|
///
|
||||||
/// let assert Ok(result) = metrics.manhatten_distance([0.0, 0.0], [1.0, 2.0])
|
/// let assert Ok(result) = metrics.manhattan_distance([0.0, 0.0], [1.0, 2.0])
|
||||||
/// result
|
/// result
|
||||||
/// |> predicates.is_close(3.0, 0.0, tol)
|
/// |> predicates.is_close(3.0, 0.0, tol)
|
||||||
/// |> should.be_true()
|
/// |> should.be_true()
|
||||||
|
@ -161,7 +165,7 @@ pub fn norm(arr: List(Float), p: Float) -> Float {
|
||||||
/// </a>
|
/// </a>
|
||||||
/// </div>
|
/// </div>
|
||||||
///
|
///
|
||||||
pub fn manhatten_distance(
|
pub fn manhattan_distance(
|
||||||
xarr: List(Float),
|
xarr: List(Float),
|
||||||
yarr: List(Float),
|
yarr: List(Float),
|
||||||
) -> Result(Float, String) {
|
) -> Result(Float, String) {
|
||||||
|
@ -180,9 +184,11 @@ pub fn manhatten_distance(
|
||||||
/// \left( \sum_{i=1}^n \left|x_i - y_i \right|^{p} \right)^{\frac{1}{p}}
|
/// \left( \sum_{i=1}^n \left|x_i - y_i \right|^{p} \right)^{\frac{1}{p}}
|
||||||
/// \\]
|
/// \\]
|
||||||
///
|
///
|
||||||
/// In the formula, $$p >= 1$$ is the order, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$.
|
/// In the formula, $$p \geq 1$$ is the order, $$n$$ is the length of the two lists
|
||||||
|
/// and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$.
|
||||||
///
|
///
|
||||||
/// The Minkowski distance is a generalization of both the Euclidean distance ($$p=2$$) and the Manhattan distance ($$p = 1$$).
|
/// The Minkowski distance is a generalization of both the Euclidean distance
|
||||||
|
/// ($$p=2$$) and the Manhattan distance ($$p = 1$$).
|
||||||
///
|
///
|
||||||
/// <details>
|
/// <details>
|
||||||
/// <summary>Example:</summary>
|
/// <summary>Example:</summary>
|
||||||
|
@ -195,7 +201,7 @@ pub fn manhatten_distance(
|
||||||
/// pub fn example () {
|
/// pub fn example () {
|
||||||
/// let assert Ok(tol) = elementary.power(-10.0, -6.0)
|
/// let assert Ok(tol) = elementary.power(-10.0, -6.0)
|
||||||
///
|
///
|
||||||
/// // Empty lists returns 0.0
|
/// // Empty lists returns an error
|
||||||
/// metrics.minkowski_distance([], [], 1.0)
|
/// metrics.minkowski_distance([], [], 1.0)
|
||||||
/// |> should.be_error()
|
/// |> should.be_error()
|
||||||
///
|
///
|
||||||
|
@ -269,7 +275,8 @@ pub fn minkowski_distance(
|
||||||
/// \left( \sum_{i=1}^n \left|x_i - y_i \right|^{2} \right)^{\frac{1}{2}}
|
/// \left( \sum_{i=1}^n \left|x_i - y_i \right|^{2} \right)^{\frac{1}{2}}
|
||||||
/// \\]
|
/// \\]
|
||||||
///
|
///
|
||||||
/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$.
|
/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the
|
||||||
|
/// values in the respective input lists indexed by $$i$$.
|
||||||
///
|
///
|
||||||
/// <details>
|
/// <details>
|
||||||
/// <summary>Example:</summary>
|
/// <summary>Example:</summary>
|
||||||
|
@ -282,11 +289,11 @@ pub fn minkowski_distance(
|
||||||
/// pub fn example () {
|
/// pub fn example () {
|
||||||
/// let assert Ok(tol) = elementary.power(-10.0, -6.0)
|
/// let assert Ok(tol) = elementary.power(-10.0, -6.0)
|
||||||
///
|
///
|
||||||
/// // Empty lists returns 0.0
|
/// // Empty lists returns an error
|
||||||
/// metrics.euclidean_distance([], [])
|
/// metrics.euclidean_distance([], [])
|
||||||
/// |> should.be_error()
|
/// |> should.be_error()
|
||||||
///
|
///
|
||||||
/// // Differing lengths returns error
|
/// // Differing lengths returns an error
|
||||||
/// metrics.euclidean_distance([], [1.0])
|
/// metrics.euclidean_distance([], [1.0])
|
||||||
/// |> should.be_error()
|
/// |> should.be_error()
|
||||||
///
|
///
|
||||||
|
@ -322,7 +329,8 @@ pub fn euclidean_distance(
|
||||||
/// \text{max}_{i=1}^n \left|x_i - y_i \right|
|
/// \text{max}_{i=1}^n \left|x_i - y_i \right|
|
||||||
/// \\]
|
/// \\]
|
||||||
///
|
///
|
||||||
/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$.
|
/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the
|
||||||
|
/// values in the respective input lists indexed by $$i$$.
|
||||||
///
|
///
|
||||||
/// <details>
|
/// <details>
|
||||||
/// <summary>Example:</summary>
|
/// <summary>Example:</summary>
|
||||||
|
@ -397,8 +405,8 @@ pub fn chebyshev_distance(
|
||||||
/// \bar{x} = \frac{1}{n}\sum_{i=1}^n x_i
|
/// \bar{x} = \frac{1}{n}\sum_{i=1}^n x_i
|
||||||
/// \\]
|
/// \\]
|
||||||
///
|
///
|
||||||
/// In the formula, $$n$$ is the sample size (the length of the list) and
|
/// In the formula, $$n$$ is the sample size (the length of the list) and $$x_i$$
|
||||||
/// $$x_i$$ is the sample point in the input list indexed by $$i$$.
|
/// is the sample point in the input list indexed by $$i$$.
|
||||||
///
|
///
|
||||||
/// <details>
|
/// <details>
|
||||||
/// <summary>Example:</summary>
|
/// <summary>Example:</summary>
|
||||||
|
@ -514,12 +522,13 @@ pub fn median(arr: List(Float)) -> Result(Float, String) {
|
||||||
/// </div>
|
/// </div>
|
||||||
///
|
///
|
||||||
/// Calculate the sample variance of the elements in a list:
|
/// Calculate the sample variance of the elements in a list:
|
||||||
|
///
|
||||||
/// \\[
|
/// \\[
|
||||||
/// s^{2} = \frac{1}{n - d} \sum_{i=1}^{n}(x_i - \bar{x})^{2}
|
/// s^{2} = \frac{1}{n - d} \sum_{i=1}^{n}(x_i - \bar{x})^{2}
|
||||||
/// \\]
|
/// \\]
|
||||||
///
|
///
|
||||||
/// In the formula, $$n$$ is the sample size (the length of the list) and
|
/// In the formula, $$n$$ is the sample size (the length of the list) and $$x_i$$
|
||||||
/// $$x_i$$ is the sample point in the input list indexed by $$i$$.
|
/// is the sample point in the input list indexed by $$i$$.
|
||||||
/// Furthermore, $$\bar{x}$$ is the sample mean and $$d$$ is the "Delta
|
/// Furthermore, $$\bar{x}$$ is the sample mean and $$d$$ is the "Delta
|
||||||
/// Degrees of Freedom", and is by default set to $$d = 0$$, which gives a biased
|
/// Degrees of Freedom", and is by default set to $$d = 0$$, which gives a biased
|
||||||
/// estimate of the sample variance. Setting $$d = 1$$ gives an unbiased estimate.
|
/// estimate of the sample variance. Setting $$d = 1$$ gives an unbiased estimate.
|
||||||
|
@ -594,11 +603,12 @@ pub fn variance(arr: List(Float), ddof: Int) -> Result(Float, String) {
|
||||||
/// s = \left(\frac{1}{n - d} \sum_{i=1}^{n}(x_i - \bar{x})^{2}\right)^{\frac{1}{2}}
|
/// s = \left(\frac{1}{n - d} \sum_{i=1}^{n}(x_i - \bar{x})^{2}\right)^{\frac{1}{2}}
|
||||||
/// \\]
|
/// \\]
|
||||||
///
|
///
|
||||||
/// In the formula, $$n$$ is the sample size (the length of the list) and
|
/// In the formula, $$n$$ is the sample size (the length of the list) and $$x_i$$
|
||||||
/// $$x_i$$ is the sample point in the input list indexed by $$i$$.
|
/// is the sample point in the input list indexed by $$i$$.
|
||||||
/// Furthermore, $$\bar{x}$$ is the sample mean and $$d$$ is the "Delta
|
/// Furthermore, $$\bar{x}$$ is the sample mean and $$d$$ is the "Delta
|
||||||
/// Degrees of Freedom", and is by default set to $$d = 0$$, which gives a biased
|
/// Degrees of Freedom", and is by default set to $$d = 0$$, which gives a biased
|
||||||
/// estimate of the sample standard deviation. Setting $$d = 1$$ gives an unbiased estimate.
|
/// estimate of the sample standard deviation. Setting $$d = 1$$ gives an unbiased
|
||||||
|
/// estimate.
|
||||||
///
|
///
|
||||||
/// <details>
|
/// <details>
|
||||||
/// <summary>Example:</summary>
|
/// <summary>Example:</summary>
|
||||||
|
@ -656,8 +666,8 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String)
|
||||||
/// </a>
|
/// </a>
|
||||||
/// </div>
|
/// </div>
|
||||||
///
|
///
|
||||||
/// The Jaccard index measures similarity between two sets of elements. Mathematically, the Jaccard index
|
/// The Jaccard index measures similarity between two sets of elements.
|
||||||
/// is defined as:
|
/// Mathematically, the Jaccard index is defined as:
|
||||||
///
|
///
|
||||||
/// \\[
|
/// \\[
|
||||||
/// \frac{|X \cap Y|}{|X \cup Y|} \\; \in \\; \left[0, 1\right]
|
/// \frac{|X \cap Y|}{|X \cup Y|} \\; \in \\; \left[0, 1\right]
|
||||||
|
@ -669,9 +679,10 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String)
|
||||||
/// - $$|X \cap Y|$$ represents the size of the intersection of the two sets
|
/// - $$|X \cap Y|$$ represents the size of the intersection of the two sets
|
||||||
/// - $$|X \cup Y|$$ denotes the size of the union of the two sets
|
/// - $$|X \cup Y|$$ denotes the size of the union of the two sets
|
||||||
///
|
///
|
||||||
/// The value of the Jaccard index ranges from 0 to 1, where 0 indicates that the two sets share no elements
|
/// The value of the Jaccard index ranges from 0 to 1, where 0 indicates that the
|
||||||
/// and 1 indicates that the sets are identical. The Jaccard index is a special case of the
|
/// two sets share no elements and 1 indicates that the sets are identical. The
|
||||||
/// [Tversky index](#tversky_index) (with $$\alpha=\beta=1$$).
|
/// Jaccard index is a special case of the [Tversky index](#tversky_index) (with
|
||||||
|
/// $$\alpha=\beta=1$$).
|
||||||
///
|
///
|
||||||
/// <details>
|
/// <details>
|
||||||
/// <summary>Example:</summary>
|
/// <summary>Example:</summary>
|
||||||
|
@ -706,8 +717,8 @@ pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float {
|
||||||
/// </a>
|
/// </a>
|
||||||
/// </div>
|
/// </div>
|
||||||
///
|
///
|
||||||
/// The Sørensen-Dice coefficient measures the similarity between two sets of elements. Mathematically, the
|
/// The Sørensen-Dice coefficient measures the similarity between two sets of
|
||||||
/// coefficient is defined as:
|
/// elements. Mathematically, the coefficient is defined as:
|
||||||
///
|
///
|
||||||
/// \\[
|
/// \\[
|
||||||
/// \frac{2 |X \cap Y|}{|X| + |Y|} \\; \in \\; \left[0, 1\right]
|
/// \frac{2 |X \cap Y|}{|X| + |Y|} \\; \in \\; \left[0, 1\right]
|
||||||
|
@ -715,12 +726,14 @@ pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float {
|
||||||
///
|
///
|
||||||
/// where:
|
/// where:
|
||||||
/// - $$X$$ and $$Y$$ are two sets being compared
|
/// - $$X$$ and $$Y$$ are two sets being compared
|
||||||
/// - $$|X \cap Y|$$ is the size of the intersection of the two sets (i.e., the number of elements common to both sets)
|
/// - $$|X \cap Y|$$ is the size of the intersection of the two sets (i.e., the
|
||||||
|
/// number of elements common to both sets)
|
||||||
/// - $$|X|$$ and $$|Y|$$ are the sizes of the sets $$X$$ and $$Y$$, respectively
|
/// - $$|X|$$ and $$|Y|$$ are the sizes of the sets $$X$$ and $$Y$$, respectively
|
||||||
///
|
///
|
||||||
/// The coefficient ranges from 0 to 1, where 0 indicates no similarity (the sets share no elements) and 1
|
/// The coefficient ranges from 0 to 1, where 0 indicates no similarity (the sets
|
||||||
/// indicates perfect similarity (the sets are identical). The higher the coefficient, the greater the similarity
|
/// share no elements) and 1 indicates perfect similarity (the sets are identical).
|
||||||
/// between the two sets. The Sørensen-Dice coefficient is a special case of the
|
/// The higher the coefficient, the greater the similarity between the two sets.
|
||||||
|
/// The Sørensen-Dice coefficient is a special case of the
|
||||||
/// [Tversky index](#tversky_index) (with $$\alpha=\beta=0.5$$).
|
/// [Tversky index](#tversky_index) (with $$\alpha=\beta=0.5$$).
|
||||||
///
|
///
|
||||||
/// <details>
|
/// <details>
|
||||||
|
@ -756,9 +769,10 @@ pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float {
|
||||||
/// </a>
|
/// </a>
|
||||||
/// </div>
|
/// </div>
|
||||||
///
|
///
|
||||||
/// The Tversky index is a generalization of the Jaccard index and Sørensen-Dice coefficient, which adds
|
/// The Tversky index is a generalization of the Jaccard index and Sørensen-Dice
|
||||||
/// flexibility through two parameters, $$\alpha$$ and $$\beta$$, allowing for asymmetric similarity
|
/// coefficient, which adds flexibility through two parameters, $$\alpha$$ and
|
||||||
/// measures between sets. The Tversky index is defined as:
|
/// $$\beta$$, allowing for asymmetric similarity measures between sets. The
|
||||||
|
/// Tversky index is defined as:
|
||||||
///
|
///
|
||||||
/// \\[
|
/// \\[
|
||||||
/// \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|}
|
/// \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|}
|
||||||
|
@ -767,13 +781,17 @@ pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float {
|
||||||
/// where:
|
/// where:
|
||||||
///
|
///
|
||||||
/// - $$X$$ and $$Y$$ are the sets being compared
|
/// - $$X$$ and $$Y$$ are the sets being compared
|
||||||
/// - $$|X - Y|$$ and $$|Y - X|$$ are the sizes of the relative complements of $$Y$$ in $$X$$ and $$X$$ in $$Y$$, respectively,
|
/// - $$|X - Y|$$ and $$|Y - X|$$ are the sizes of the relative complements of
|
||||||
/// - $$\alpha$$ and $$\beta$$ are parameters that weigh the relative importance of the elements unique to $$X$$ and $$Y$$
|
/// $$Y$$ in $$X$$ and $$X$$ in $$Y$$, respectively,
|
||||||
|
/// - $$\alpha$$ and $$\beta$$ are parameters that weigh the relative importance
|
||||||
|
/// of the elements unique to $$X$$ and $$Y$$
|
||||||
///
|
///
|
||||||
/// The Tversky index reduces to the Jaccard index when \(\alpha = \beta = 1\) and to the Sorensen-Dice
|
/// The Tversky index reduces to the Jaccard index when $$\alpha = \beta = 1$$ and
|
||||||
/// coefficient when \(\alpha = \beta = 0.5\). In general, the Tversky index can take on any non-negative value, including 0.
|
/// to the Sørensen-Dice coefficient when $$\alpha = \beta = 0.5$$. In general, the
|
||||||
/// The index equals 0 when there is no intersection between the two sets, indicating no similarity. However, unlike similarity
|
/// Tversky index can take on any non-negative value, including 0. The index equals
|
||||||
/// measures bounded strictly between 0 and 1, the Tversky index does not have a strict upper limit of 1 when $$\alpha \neq \beta$$.
|
/// 0 when there is no intersection between the two sets, indicating no similarity.
|
||||||
|
/// However, unlike similarity measures bounded strictly between 0 and 1, the
|
||||||
|
/// Tversky index does not have a strict upper limit of 1 when $$\alpha \neq \beta$$.
|
||||||
///
|
///
|
||||||
/// <details>
|
/// <details>
|
||||||
/// <summary>Example:</summary>
|
/// <summary>Example:</summary>
|
||||||
|
@ -843,9 +861,10 @@ pub fn tversky_index(
|
||||||
/// </a>
|
/// </a>
|
||||||
/// </div>
|
/// </div>
|
||||||
///
|
///
|
||||||
/// The Overlap coefficient, also known as the Szymkiewicz–Simpson coefficient, is a measure of
|
/// The Overlap coefficient, also known as the Szymkiewicz–Simpson coefficient, is
|
||||||
/// similarity between two sets that focuses on the size of the intersection relative to the
|
/// a measure of similarity between two sets that focuses on the size of the
|
||||||
/// smaller of the two sets. It is defined mathematically as:
|
/// intersection relative to the smaller of the two sets. It is defined
|
||||||
|
/// mathematically as:
|
||||||
///
|
///
|
||||||
/// \\[
|
/// \\[
|
||||||
/// \frac{|X \cap Y|}{\min(|X|, |Y|)} \\; \in \\; \left[0, 1\right]
|
/// \frac{|X \cap Y|}{\min(|X|, |Y|)} \\; \in \\; \left[0, 1\right]
|
||||||
|
@ -857,10 +876,11 @@ pub fn tversky_index(
|
||||||
/// - $$|X \cap Y|$$ is the size of the intersection of the sets
|
/// - $$|X \cap Y|$$ is the size of the intersection of the sets
|
||||||
/// - $$\min(|X|, |Y|)$$ is the size of the smaller set among $$X$$ and $$Y$$
|
/// - $$\min(|X|, |Y|)$$ is the size of the smaller set among $$X$$ and $$Y$$
|
||||||
///
|
///
|
||||||
/// The coefficient ranges from 0 to 1, where 0 indicates no overlap and 1 indicates that the
|
/// The coefficient ranges from 0 to 1, where 0 indicates no overlap and 1
|
||||||
/// smaller set is a subset of the larger set. This measure is especially useful in situations
|
/// indicates that the smaller set is a subset of the larger set. This
|
||||||
/// where the similarity in terms of the proportion of overlap is more relevant than the
|
/// measure is especially useful in situations where the similarity in terms
|
||||||
/// difference in sizes between the two sets.
|
/// of the proportion of overlap is more relevant than the difference in sizes
|
||||||
|
/// between the two sets.
|
||||||
///
|
///
|
||||||
/// <details>
|
/// <details>
|
||||||
/// <summary>Example:</summary>
|
/// <summary>Example:</summary>
|
||||||
|
@ -905,13 +925,18 @@ pub fn overlap_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float {
|
||||||
/// Calculate the cosine similarity between two lists (representing vectors):
|
/// Calculate the cosine similarity between two lists (representing vectors):
|
||||||
///
|
///
|
||||||
/// \\[
|
/// \\[
|
||||||
/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}} \\; \in \\; \left[-1, 1\right]
|
/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}}
|
||||||
|
/// \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}}
|
||||||
|
/// \\; \in \\; \left[-1, 1\right]
|
||||||
/// \\]
|
/// \\]
|
||||||
///
|
///
|
||||||
/// In the formula, $$n$$ is the length of the two lists and $$x_i$$, $$y_i$$ are the values in the respective input lists indexed by $$i$$.
|
/// In the formula, $$n$$ is the length of the two lists and $$x_i$$, $$y_i$$ are
|
||||||
/// The numerator represents the dot product of the two vectors, while the denominator is the product of the magnitudes (Euclidean norms) of
|
/// the values in the respective input lists indexed by $$i$$. The numerator
|
||||||
/// the two vectors. The cosine similarity provides a value between -1 and 1, where 1 means the vectors are in the same direction, -1 means
|
/// represents the dot product of the two vectors, while the denominator is the
|
||||||
/// they are in exactly opposite directions, and 0 indicates orthogonality.
|
/// product of the magnitudes (Euclidean norms) of the two vectors. The cosine
|
||||||
|
/// similarity provides a value between -1 and 1, where 1 means the vectors are
|
||||||
|
/// in the same direction, -1 means they are in exactly opposite directions,
|
||||||
|
/// and 0 indicates orthogonality.
|
||||||
///
|
///
|
||||||
/// <details>
|
/// <details>
|
||||||
/// <summary>Example:</summary>
|
/// <summary>Example:</summary>
|
||||||
|
@ -974,3 +999,122 @@ pub fn cosine_similarity(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <div style="text-align: right;">
|
||||||
|
/// <a href="https://github.com/gleam-community/maths/issues">
|
||||||
|
/// <small>Spot a typo? Open an issue!</small>
|
||||||
|
/// </a>
|
||||||
|
/// </div>
|
||||||
|
///
|
||||||
|
/// Calculate the Levenshtein distance between two strings, i.e., measure the
|
||||||
|
/// difference between two strings (essentially sequences). It is defined as
|
||||||
|
/// the minimum number of single-character edits required to change one string
|
||||||
|
/// into the other, using operations:
|
||||||
|
/// - insertions
|
||||||
|
/// - deletions
|
||||||
|
/// - substitutions
|
||||||
|
///
|
||||||
|
/// Note: The implementation is primarily based on the Elixir implementation:
|
||||||
|
/// [levenshtein](https://hex.pm/packages/levenshtein).
|
||||||
|
///
|
||||||
|
/// <details>
|
||||||
|
/// <summary>Example:</summary>
|
||||||
|
///
|
||||||
|
/// import gleeunit/should
|
||||||
|
/// import gleam_community/maths/metrics
|
||||||
|
///
|
||||||
|
/// pub fn example () {
|
||||||
|
/// metrics.levenshtein_distance("hello", "hello")
|
||||||
|
/// |> should.equal(0)
|
||||||
|
///
|
||||||
|
/// metrics.levenshtein_distance("cat", "cut")
|
||||||
|
/// |> should.equal(1)
|
||||||
|
///
|
||||||
|
/// metrics.levenshtein_distance("kitten", "sitting")
|
||||||
|
/// |> should.equal(3)
|
||||||
|
/// }
|
||||||
|
/// </details>
|
||||||
|
///
|
||||||
|
/// <div style="text-align: right;">
|
||||||
|
/// <a href="#">
|
||||||
|
/// <small>Back to top ↑</small>
|
||||||
|
/// </a>
|
||||||
|
/// </div>
|
||||||
|
///
|
||||||
|
///
|
||||||
|
pub fn levenshtein_distance(xstring: String, ystring: String) -> Int {
|
||||||
|
case xstring, ystring {
|
||||||
|
xstring, ystring if xstring == ystring -> {
|
||||||
|
0
|
||||||
|
}
|
||||||
|
xstring, ystring if xstring == "" -> {
|
||||||
|
string.length(ystring)
|
||||||
|
}
|
||||||
|
xstring, ystring if ystring == "" -> {
|
||||||
|
string.length(xstring)
|
||||||
|
}
|
||||||
|
_, _ -> {
|
||||||
|
let xstring_graphemes = string.to_graphemes(xstring)
|
||||||
|
let ystring_graphemes = string.to_graphemes(ystring)
|
||||||
|
let ystring_length = list.length(ystring_graphemes)
|
||||||
|
let distance_list = list.range(0, ystring_length)
|
||||||
|
|
||||||
|
do_edit_distance(xstring_graphemes, ystring_graphemes, distance_list, 1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn do_edit_distance(
|
||||||
|
xstring: List(String),
|
||||||
|
ystring: List(String),
|
||||||
|
distance_list: List(Int),
|
||||||
|
step: Int,
|
||||||
|
) -> Int {
|
||||||
|
case xstring {
|
||||||
|
// Safe as 'distance_list' is never empty
|
||||||
|
[] -> {
|
||||||
|
let assert Ok(last) = list.last(distance_list)
|
||||||
|
last
|
||||||
|
}
|
||||||
|
[xstring_head, ..xstring_tail] -> {
|
||||||
|
let new_distance_list =
|
||||||
|
distance_list_helper(ystring, distance_list, xstring_head, [step], step)
|
||||||
|
do_edit_distance(xstring_tail, ystring, new_distance_list, step + 1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn distance_list_helper(
|
||||||
|
ystring: List(String),
|
||||||
|
distance_list: List(Int),
|
||||||
|
grapheme: String,
|
||||||
|
new_distance_list: List(Int),
|
||||||
|
last_distance: Int,
|
||||||
|
) -> List(Int) {
|
||||||
|
case ystring {
|
||||||
|
[] -> list.reverse(new_distance_list)
|
||||||
|
[ystring_head, ..ystring_tail] -> {
|
||||||
|
let assert [distance_list_head, ..distance_list_tail] = distance_list
|
||||||
|
let difference = case ystring_head == grapheme {
|
||||||
|
True -> {
|
||||||
|
0
|
||||||
|
}
|
||||||
|
False -> {
|
||||||
|
1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let assert [first, ..] = distance_list_tail
|
||||||
|
let min =
|
||||||
|
last_distance + 1
|
||||||
|
|> piecewise.minimum(first + 1, int.compare)
|
||||||
|
|> piecewise.minimum(distance_list_head + difference, int.compare)
|
||||||
|
distance_list_helper(
|
||||||
|
ystring_tail,
|
||||||
|
distance_list_tail,
|
||||||
|
grapheme,
|
||||||
|
[min, ..new_distance_list],
|
||||||
|
min,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -50,38 +50,24 @@ pub fn float_list_norm_test() {
|
||||||
|> should.be_true()
|
|> should.be_true()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn float_list_manhatten_test() {
|
pub fn float_list_manhattan_test() {
|
||||||
let assert Ok(tol) = elementary.power(-10.0, -6.0)
|
let assert Ok(tol) = elementary.power(-10.0, -6.0)
|
||||||
|
|
||||||
// Empty lists returns an error
|
// Empty lists returns an error
|
||||||
metrics.manhatten_distance([], [])
|
metrics.manhattan_distance([], [])
|
||||||
|> should.be_error()
|
|> should.be_error()
|
||||||
|
|
||||||
// Differing lengths returns error
|
// Differing lengths returns error
|
||||||
metrics.manhatten_distance([], [1.0])
|
metrics.manhattan_distance([], [1.0])
|
||||||
|> should.be_error()
|
|> should.be_error()
|
||||||
|
|
||||||
// Manhatten distance (p = 1)
|
// Manhattan distance (p = 1)
|
||||||
let assert Ok(result) = metrics.manhatten_distance([0.0, 0.0], [1.0, 2.0])
|
let assert Ok(result) = metrics.manhattan_distance([0.0, 0.0], [1.0, 2.0])
|
||||||
result
|
result
|
||||||
|> predicates.is_close(3.0, 0.0, tol)
|
|> predicates.is_close(3.0, 0.0, tol)
|
||||||
|> should.be_true()
|
|> should.be_true()
|
||||||
}
|
}
|
||||||
|
|
||||||
// pub fn int_list_manhatten_test() {
|
|
||||||
// // Empty lists returns 0
|
|
||||||
// metrics.int_manhatten_distance([], [])
|
|
||||||
// |> should.equal(Ok(0))
|
|
||||||
|
|
||||||
// // Differing lengths returns error
|
|
||||||
// metrics.int_manhatten_distance([], [1])
|
|
||||||
// |> should.be_error()
|
|
||||||
|
|
||||||
// let assert Ok(result) = metrics.int_manhatten_distance([0, 0], [1, 2])
|
|
||||||
// result
|
|
||||||
// |> should.equal(3)
|
|
||||||
// }
|
|
||||||
|
|
||||||
pub fn float_list_minkowski_test() {
|
pub fn float_list_minkowski_test() {
|
||||||
let assert Ok(tol) = elementary.power(-10.0, -6.0)
|
let assert Ok(tol) = elementary.power(-10.0, -6.0)
|
||||||
|
|
||||||
|
@ -130,7 +116,7 @@ pub fn float_list_minkowski_test() {
|
||||||
|> predicates.is_close(2.23606797749979, 0.0, tol)
|
|> predicates.is_close(2.23606797749979, 0.0, tol)
|
||||||
|> should.be_true()
|
|> should.be_true()
|
||||||
|
|
||||||
// Manhatten distance (p = 1)
|
// Manhattan distance (p = 1)
|
||||||
let assert Ok(result) =
|
let assert Ok(result) =
|
||||||
metrics.minkowski_distance([0.0, 0.0], [1.0, 2.0], 1.0)
|
metrics.minkowski_distance([0.0, 0.0], [1.0, 2.0], 1.0)
|
||||||
result
|
result
|
||||||
|
@ -156,7 +142,7 @@ pub fn float_list_euclidean_test() {
|
||||||
|> should.be_true()
|
|> should.be_true()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn example_mean_test() {
|
pub fn mean_test() {
|
||||||
// An empty list returns an error
|
// An empty list returns an error
|
||||||
[]
|
[]
|
||||||
|> metrics.mean()
|
|> metrics.mean()
|
||||||
|
@ -168,7 +154,7 @@ pub fn example_mean_test() {
|
||||||
|> should.equal(Ok(2.0))
|
|> should.equal(Ok(2.0))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn example_median_test() {
|
pub fn median_test() {
|
||||||
// An empty list returns an error
|
// An empty list returns an error
|
||||||
[]
|
[]
|
||||||
|> metrics.median()
|
|> metrics.median()
|
||||||
|
@ -184,7 +170,7 @@ pub fn example_median_test() {
|
||||||
|> should.equal(Ok(2.5))
|
|> should.equal(Ok(2.5))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn example_variance_test() {
|
pub fn variance_test() {
|
||||||
// Degrees of freedom
|
// Degrees of freedom
|
||||||
let ddof: Int = 1
|
let ddof: Int = 1
|
||||||
|
|
||||||
|
@ -199,7 +185,7 @@ pub fn example_variance_test() {
|
||||||
|> should.equal(Ok(1.0))
|
|> should.equal(Ok(1.0))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn example_standard_deviation_test() {
|
pub fn standard_deviation_test() {
|
||||||
// Degrees of freedom
|
// Degrees of freedom
|
||||||
let ddof: Int = 1
|
let ddof: Int = 1
|
||||||
|
|
||||||
|
@ -214,7 +200,7 @@ pub fn example_standard_deviation_test() {
|
||||||
|> should.equal(Ok(1.0))
|
|> should.equal(Ok(1.0))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn example_jaccard_index_test() {
|
pub fn jaccard_index_test() {
|
||||||
metrics.jaccard_index(set.from_list([]), set.from_list([]))
|
metrics.jaccard_index(set.from_list([]), set.from_list([]))
|
||||||
|> should.equal(0.0)
|
|> should.equal(0.0)
|
||||||
|
|
||||||
|
@ -235,7 +221,7 @@ pub fn example_jaccard_index_test() {
|
||||||
|> should.equal(1.0 /. 7.0)
|
|> should.equal(1.0 /. 7.0)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn example_sorensen_dice_coefficient_test() {
|
pub fn sorensen_dice_coefficient_test() {
|
||||||
metrics.sorensen_dice_coefficient(set.from_list([]), set.from_list([]))
|
metrics.sorensen_dice_coefficient(set.from_list([]), set.from_list([]))
|
||||||
|> should.equal(0.0)
|
|> should.equal(0.0)
|
||||||
|
|
||||||
|
@ -256,7 +242,7 @@ pub fn example_sorensen_dice_coefficient_test() {
|
||||||
|> should.equal(2.0 *. 1.0 /. { 4.0 +. 5.0 })
|
|> should.equal(2.0 *. 1.0 /. { 4.0 +. 5.0 })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn example_overlap_coefficient_test() {
|
pub fn overlap_coefficient_test() {
|
||||||
metrics.overlap_coefficient(set.from_list([]), set.from_list([]))
|
metrics.overlap_coefficient(set.from_list([]), set.from_list([]))
|
||||||
|> should.equal(0.0)
|
|> should.equal(0.0)
|
||||||
|
|
||||||
|
@ -278,7 +264,7 @@ pub fn example_overlap_coefficient_test() {
|
||||||
|> should.equal(2.0 /. 4.0)
|
|> should.equal(2.0 /. 4.0)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn example_cosine_similarity_test() {
|
pub fn cosine_similarity_test() {
|
||||||
// Empty lists returns an error
|
// Empty lists returns an error
|
||||||
metrics.cosine_similarity([], [])
|
metrics.cosine_similarity([], [])
|
||||||
|> should.be_error()
|
|> should.be_error()
|
||||||
|
@ -308,7 +294,7 @@ pub fn example_cosine_similarity_test() {
|
||||||
|> should.equal(Ok(-1.0))
|
|> should.equal(Ok(-1.0))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn example_chebyshev_distance() {
|
pub fn chebyshev_distance_test() {
|
||||||
// Empty lists returns an error
|
// Empty lists returns an error
|
||||||
metrics.chebyshev_distance([], [])
|
metrics.chebyshev_distance([], [])
|
||||||
|> should.be_error()
|
|> should.be_error()
|
||||||
|
@ -330,6 +316,9 @@ pub fn example_chebyshev_distance() {
|
||||||
|> should.equal(Ok(2.0))
|
|> should.equal(Ok(2.0))
|
||||||
|
|
||||||
metrics.chebyshev_distance([1.0, 0.0], [2.0, 0.0])
|
metrics.chebyshev_distance([1.0, 0.0], [2.0, 0.0])
|
||||||
|
|> should.equal(Ok(1.0))
|
||||||
|
|
||||||
|
metrics.chebyshev_distance([1.0, 0.0], [-2.0, 0.0])
|
||||||
|> should.equal(Ok(3.0))
|
|> should.equal(Ok(3.0))
|
||||||
|
|
||||||
metrics.chebyshev_distance([-5.0, -10.0, -3.0], [-1.0, -12.0, -3.0])
|
metrics.chebyshev_distance([-5.0, -10.0, -3.0], [-1.0, -12.0, -3.0])
|
||||||
|
@ -338,3 +327,43 @@ pub fn example_chebyshev_distance() {
|
||||||
metrics.chebyshev_distance([1.0, 2.0, 3.0], [1.0, 2.0, 3.0])
|
metrics.chebyshev_distance([1.0, 2.0, 3.0], [1.0, 2.0, 3.0])
|
||||||
|> should.equal(Ok(0.0))
|
|> should.equal(Ok(0.0))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn levenshtein_distance_test() {
|
||||||
|
// Try different types of valid input...
|
||||||
|
|
||||||
|
// Requires 5 insertions to transform the empty string into "hello"
|
||||||
|
metrics.levenshtein_distance("", "hello")
|
||||||
|
|> should.equal(5)
|
||||||
|
// Requires 5 deletions to remove all characters from "hello" to match the empty string
|
||||||
|
metrics.levenshtein_distance("hello", "")
|
||||||
|
|> should.equal(5)
|
||||||
|
|
||||||
|
// Requires 2 deletions to remove two 'b's and 1 substitution to change 'b' to 'a'
|
||||||
|
metrics.levenshtein_distance("bbb", "a")
|
||||||
|
|> should.equal(3)
|
||||||
|
// Requires 2 insertions to add two 'b's and 1 substitution to change 'a' to 'b'
|
||||||
|
metrics.levenshtein_distance("a", "bbb")
|
||||||
|
|> should.equal(3)
|
||||||
|
|
||||||
|
// No changes needed, since the strings are identical
|
||||||
|
metrics.levenshtein_distance("hello", "hello")
|
||||||
|
|> should.equal(0)
|
||||||
|
|
||||||
|
// Requires 1 substitution to change 'a' to 'u'
|
||||||
|
metrics.levenshtein_distance("cat", "cut")
|
||||||
|
|> should.equal(1)
|
||||||
|
|
||||||
|
// Requires 2 substitutions (k -> s, e -> i) and 1 insertion (g at the end)
|
||||||
|
metrics.levenshtein_distance("kitten", "sitting")
|
||||||
|
|> should.equal(3)
|
||||||
|
|
||||||
|
// Some more complex cases, involving multiple insertions, deletions, and substitutions
|
||||||
|
metrics.levenshtein_distance("gggtatccat", "cctaggtccct")
|
||||||
|
|> should.equal(6)
|
||||||
|
|
||||||
|
metrics.levenshtein_distance(
|
||||||
|
"This is a longer string",
|
||||||
|
"This is also a much longer string",
|
||||||
|
)
|
||||||
|
|> should.equal(10)
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue