cargo/util/
edit_distance.rs

1use std::{cmp, mem};
2
3/// Finds the [edit distance] between two strings.
4///
5/// Returns `None` if the distance exceeds the limit.
6///
7/// [edit distance]: https://en.wikipedia.org/wiki/Edit_distance
8pub fn edit_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
9    // Comparing the strings lowercased will result in a difference in capitalization being less distance away
10    // than being a completely different letter. Otherwise `CHECK` is as far away from `check` as it
11    // is from `build` (both with a distance of 5). For a single letter shortcut (e.g. `b` or `c`), they will
12    // all be as far away from any capital single letter entry (all with a distance of 1).
13    // By first lowercasing the strings, `C` and `c` are closer than `C` and `b`, for example.
14    let a = a.to_lowercase();
15    let b = b.to_lowercase();
16
17    let mut a = &a.chars().collect::<Vec<_>>()[..];
18    let mut b = &b.chars().collect::<Vec<_>>()[..];
19
20    // Ensure that `b` is the shorter string, minimizing memory use.
21    if a.len() < b.len() {
22        mem::swap(&mut a, &mut b);
23    }
24
25    let min_dist = a.len() - b.len();
26    // If we know the limit will be exceeded, we can return early.
27    if min_dist > limit {
28        return None;
29    }
30
31    // Strip common prefix.
32    while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_first().zip(a.split_first()) {
33        if a_char != b_char {
34            break;
35        }
36        a = a_rest;
37        b = b_rest;
38    }
39    // Strip common suffix.
40    while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_last().zip(a.split_last()) {
41        if a_char != b_char {
42            break;
43        }
44        a = a_rest;
45        b = b_rest;
46    }
47
48    // If either string is empty, the distance is the length of the other.
49    // We know that `b` is the shorter string, so we don't need to check `a`.
50    if b.len() == 0 {
51        return Some(min_dist);
52    }
53
54    let mut prev_prev = vec![usize::MAX; b.len() + 1];
55    let mut prev = (0..=b.len()).collect::<Vec<_>>();
56    let mut current = vec![0; b.len() + 1];
57
58    // row by row
59    for i in 1..=a.len() {
60        current[0] = i;
61        let a_idx = i - 1;
62
63        // column by column
64        for j in 1..=b.len() {
65            let b_idx = j - 1;
66
67            // There is no cost to substitute a character with itself.
68            let substitution_cost = if a[a_idx] == b[b_idx] { 0 } else { 1 };
69
70            current[j] = cmp::min(
71                // deletion
72                prev[j] + 1,
73                cmp::min(
74                    // insertion
75                    current[j - 1] + 1,
76                    // substitution
77                    prev[j - 1] + substitution_cost,
78                ),
79            );
80
81            if (i > 1) && (j > 1) && (a[a_idx] == b[b_idx - 1]) && (a[a_idx - 1] == b[b_idx]) {
82                // transposition
83                current[j] = cmp::min(current[j], prev_prev[j - 2] + 1);
84            }
85        }
86
87        // Rotate the buffers, reusing the memory.
88        [prev_prev, prev, current] = [prev, current, prev_prev];
89    }
90
91    // `prev` because we already rotated the buffers.
92    let distance = prev[b.len()];
93    (distance <= limit).then_some(distance)
94}
95
96/// Find the closest element from `iter` matching `choice`. The `key` callback
97/// is used to select a `&str` from the iterator to compare against `choice`.
98pub fn closest<'a, T>(
99    choice: &str,
100    iter: impl Iterator<Item = T>,
101    key: impl Fn(&T) -> &'a str,
102) -> Option<T> {
103    // Only consider candidates with an edit distance of 3 or less so we don't
104    // suggest out-of-the-blue options.
105    iter.filter_map(|e| Some((edit_distance(choice, key(&e), 3)?, e)))
106        .min_by_key(|t| t.0)
107        .map(|t| t.1)
108}
109
110/// Version of `closest` that returns a common "suggestion" that can be tacked
111/// onto the end of an error message.
112pub fn closest_msg<'a, T>(
113    choice: &str,
114    iter: impl Iterator<Item = T>,
115    key: impl Fn(&T) -> &'a str,
116    kind: &str,
117) -> String {
118    match closest(choice, iter, &key) {
119        Some(e) => format!(
120            "\n\nhelp: a {kind} with a similar name exists: `{}`",
121            key(&e)
122        ),
123        None => String::new(),
124    }
125}
126
127#[test]
128fn test_edit_distance() {
129    use std::char::{from_u32, MAX};
130    // Test bytelength agnosticity
131    for c in (0u32..MAX as u32)
132        .filter_map(from_u32)
133        .map(|i| i.to_string())
134    {
135        assert_eq!(edit_distance(&c, &c, usize::MAX), Some(0));
136    }
137
138    let a = "\nMäry häd ä little lämb\n\nLittle lämb\n";
139    let b = "\nMary häd ä little lämb\n\nLittle lämb\n";
140    let c = "Mary häd ä little lämb\n\nLittle lämb\n";
141    assert_eq!(edit_distance(a, b, usize::MAX), Some(1));
142    assert_eq!(edit_distance(b, a, usize::MAX), Some(1));
143    assert_eq!(edit_distance(a, c, usize::MAX), Some(2));
144    assert_eq!(edit_distance(c, a, usize::MAX), Some(2));
145    assert_eq!(edit_distance(b, c, usize::MAX), Some(1));
146    assert_eq!(edit_distance(c, b, usize::MAX), Some(1));
147}