Thrill  0.1
levenshtein.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * tlx/string/levenshtein.hpp
3  *
4  * Part of tlx - http://panthema.net/tlx
5  *
6  * Copyright (C) 2007-2018 Timo Bingmann <[email protected]>
7  *
8  * All rights reserved. Published under the Boost Software License, Version 1.0
9  ******************************************************************************/
10 
11 #ifndef TLX_STRING_LEVENSHTEIN_HEADER
12 #define TLX_STRING_LEVENSHTEIN_HEADER
13 
14 #include <tlx/simple_vector.hpp>
15 #include <tlx/string/to_lower.hpp>
16 
17 #include <algorithm>
18 #include <cstring>
19 #include <string>
20 
21 namespace tlx {
22 
23 //! \addtogroup tlx_string
24 //! \{
25 
26 // *** Parameter Struct and Algorithm ***
27 
28 /*!
29  * Standard parameters to levenshtein distance function. Costs are all 1 and
30  * characters are compared directly.
31  */
33  static const unsigned int cost_insert_delete = 1;
34  static const unsigned int cost_replace = 1;
35 
36  static inline bool char_equal(const char& a, const char& b)
37  { return (a == b); }
38 };
39 
40 /*!
41  * Standard parameters to Levenshtein distance function. Costs are all 1 and
42  * characters are compared case-insensitively.
43  */
45  static const unsigned int cost_insert_delete = 1;
46  static const unsigned int cost_replace = 1;
47 
48  static inline bool char_equal(const char& a, const char& b)
49  { return to_lower(a) == to_lower(b); }
50 };
51 
52 /*!
53  * Computes the Levenshtein string distance also called edit distance between
54  * two strings. The distance is the minimum number of
55  * replacements/inserts/deletes needed to change one string into the
56  * other. Implemented with time complexity O(|n|+|m|) and memory complexity
57  * O(2*max(|n|,|m|))
58  *
59  * \param a first string
60  * \param a_size size of first string
61  * \param b second string
62  * \param b_size size of second string
63  * \return Levenshtein distance
64  */
65 template <typename Param>
66 static inline
67 size_t levenshtein_algorithm(const char* a, size_t a_size,
68  const char* b, size_t b_size) {
69  // if one of the strings is zero, then all characters of the other must
70  // be inserted.
71  if (a_size == 0) return b_size * Param::cost_insert_delete;
72  if (b_size == 0) return a_size * Param::cost_insert_delete;
73 
74  // make "as" the longer string and "bs" the shorter.
75  if (a_size < b_size) {
76  std::swap(a, b);
77  std::swap(a_size, b_size);
78  }
79 
80  // only allocate two rows of the needed matrix.
81  simple_vector<size_t> lastrow(a_size + 1);
82  simple_vector<size_t> thisrow(a_size + 1);
83 
84  // fill this row with ascending ordinals.
85  for (size_t i = 0; i < a_size + 1; i++) {
86  thisrow[i] = i;
87  }
88 
89  // compute distance
90  for (size_t j = 1; j < b_size + 1; j++)
91  {
92  // switch rows
93  std::swap(lastrow, thisrow);
94 
95  // compute new row
96  thisrow[0] = j;
97 
98  for (size_t i = 1; i < a_size + 1; i++)
99  {
100  // three-way mimimum of
101  thisrow[i] = std::min(
102  std::min(
103  // left plus insert cost
104  thisrow[i - 1] + Param::cost_insert_delete,
105  // top plus delete cost
106  lastrow[i] + Param::cost_insert_delete),
107  // top left plus replacement cost
108  lastrow[i - 1] + (
109  Param::char_equal(a[i - 1], b[j - 1])
110  ? 0 : Param::cost_replace)
111  );
112  }
113  }
114 
115  // result is in the last cell of the last computed row
116  return thisrow[a_size];
117 }
118 
119 /*!
120  * Computes the Levenshtein string distance between two strings. The distance
121  * is the minimum number of replacements/inserts/deletes needed to change one
122  * string into the other.
123  *
124  * \param a first string
125  * \param b second string
126  * \return Levenshtein distance
127  */
128 static inline size_t levenshtein(const char* a, const char* b) {
129  return levenshtein_algorithm<LevenshteinStandardParameters>(
130  a, std::strlen(a), b, std::strlen(b));
131 }
132 
133 /*!
134  * Computes the Levenshtein string distance between two strings. The distance
135  * is the minimum number of replacements/inserts/deletes needed to change one
136  * string into the other.
137  *
138  * \param a first string
139  * \param b second string
140  * \return Levenshtein distance
141  */
142 static inline size_t levenshtein_icase(const char* a, const char* b) {
143  return levenshtein_algorithm<LevenshteinStandardICaseParameters>(
144  a, std::strlen(a), b, std::strlen(b));
145 }
146 
147 /*!
148  * Computes the Levenshtein string distance between two strings. The distance
149  * is the minimum number of replacements/inserts/deletes needed to change one
150  * string into the other.
151  *
152  * \param a first string
153  * \param b second string
154  * \return Levenshtein distance
155  */
156 static inline size_t levenshtein(const std::string& a, const std::string& b) {
157  return levenshtein_algorithm<LevenshteinStandardParameters>(
158  a.data(), a.size(), b.data(), b.size());
159 }
160 
161 /*!
162  * Computes the Levenshtein string distance between two strings. The distance
163  * is the minimum number of replacements/inserts/deletes needed to change one
164  * string into the other. Character comparison is done case-insensitively.
165  *
166  * \param a first string
167  * \param b second string
168  * \return Levenshtein distance
169  */
170 static inline
171 size_t levenshtein_icase(const std::string& a, const std::string& b) {
172  return levenshtein_algorithm<LevenshteinStandardICaseParameters>(
173  a.data(), a.size(), b.data(), b.size());
174 }
175 
176 //! \}
177 
178 } // namespace tlx
179 
180 #endif // !TLX_STRING_LEVENSHTEIN_HEADER
181 
182 /******************************************************************************/
static size_t levenshtein_icase(const char *a, const char *b)
Computes the Levenshtein string distance between two strings.
static size_t levenshtein(const char *a, const char *b)
Computes the Levenshtein string distance between two strings.
char to_lower(char ch)
Transform the given character to lower case without any localization.
Definition: to_lower.cpp:17
Simpler non-growing vector without initialization.
Standard parameters to levenshtein distance function.
Definition: levenshtein.hpp:32
void swap(CountingPtr< A, D > &a1, CountingPtr< A, D > &a2) noexcept
static const unsigned int cost_replace
Definition: levenshtein.hpp:34
std::basic_string< char, std::char_traits< char >, Allocator< char > > string
string with Manager tracking
Definition: allocator.hpp:220
static bool char_equal(const char &a, const char &b)
Definition: levenshtein.hpp:48
static uint_pair min()
return an uint_pair instance containing the smallest value possible
Definition: uint_types.hpp:217
static const unsigned int cost_insert_delete
Definition: levenshtein.hpp:33
static bool char_equal(const char &a, const char &b)
Definition: levenshtein.hpp:36
static size_t levenshtein_algorithm(const char *a, size_t a_size, const char *b, size_t b_size)
Computes the Levenshtein string distance also called edit distance between two strings.
Definition: levenshtein.hpp:67
Standard parameters to Levenshtein distance function.
Definition: levenshtein.hpp:44