编辑距离(levenshtein distance)C/C++实现
程序员文章站
2022-04-01 17:43:37
...
编辑距离(levenshtein distance)C/C++ 实现
Levenshtein distance 距离简介
Levenshtein distance 的由来
Levenshtein Distance,又称编辑距离,指的是两个字符串之间,由一个转换成另一个所需的最少编辑操作次数。许可的编辑操作包括将一个字符替换成另一个字符,插入一个字符,删除一个字符。levenshtein() 函数返回两个字符串之间的 Levenshtein 距离。编辑距离的算法是首先由俄国科学家Levenshtein提出的,故又叫Levenshtein Distance。
编辑距离的内涵
编辑距离从三个方面进行度量:
- 替换
- 插入
- 删除
如下表所示:
操作 | 距离 | 解析 |
---|---|---|
替换 | cat rat | “c”被"r"替换了 |
插入 | cat coat | “c”和"a"之间插入了"o" |
删除 | cat at | “a”左边的"c"被删除了 |
编辑距离示例
Saturday 与 Sunday 的编辑距离:
- Sunday (原本)
- Saunday ( “S” 和 “u” 之间增加 “a” )
- Satunday ( “a” 和 “u” 之间增加 “t” )
- Saturday ( “n” 改为 “r” )
故 Saturday 与 Sunday 的编辑距离为 3 。
编辑距离 C 语言实现
代码参考*:https://en.wikipedia.org/wiki/Levenshtein_distance
最小时间复杂度
#include <stdio.h>
#include <string.h>
#define MIN3(a,b,c) ((a)<(b)?((a)<(c)?(a):(c)):((b)<(c)?(b):(c)))
int cmp_levenshtein( const char *s1, const char *s2 );
int main( void )
{
char str1[] = "Today is Saturday.";
char str2[] = "Tomorrow is Sunday.";
int d = cmp_levenshtein( str1, str2 );
printf( "edit distance: %d\n", d );
return 0;
}
int cmp_levenshtein( const char *s1, const char *s2 )
{
int row = strlen(s1); /* s1 的长度 */
int col = strlen(s2); /* s2 的长度 */
int mat[row][col]; /* C99 - variable-length array */
for( int i=0; i<row; ++i ) { /* 数组的行 */
for( int j=0; j<col; ++j ) { /* 数组的列 */
if( i == 0 ) {
mat[i][j] = j; /* 初始化第1行为 [ 0 1 2 ... ] */
}
else if( j == 0 ) {
mat[i][j] = i; /* 初始化第1列为 [ 0 1 2 ... ] */
}
else {
int cost = ( s1[i-1] == s2[j-1] ) ? 0 : 1; /* 记录s1[i-1]与s2[j-1]是否相等 */
mat[i][j] = MIN3( mat[i-1][j ] + 1, /* 取三者的最小值 */
mat[i ][j-1] + 1,
mat[i-1][j-1] + cost);
}
}
}
return mat[row-1][col-1];
}
最小空间复杂度
#include <stdlib.h>
#include <string.h>
#define MIN3(a,b,c) ((a)<(b)?((a)<(c)?(a):(c)):((b)<(c)?(b):(c)))
#define SWAP(a,b) do { \
typeof(a) tmp = (a); \
(a) = (b); \
(b) = tmp; \
}while(0)
int cmp_levenshtein( const char *s1, const char *s2 )
{
// create two work vectors of integer distances
int m = strlen(s1);
int n = strlen(s2);
int *v1 = (int*)malloc( (n+1)*sizeof(int) );
if( v1==NULL ) {
return -1;
}
int *v2 = (int*)malloc( (n+1)*sizeof(int) );
if( v2==NULL ) {
free( v1 );
return -1;
}
// initialize v1 (the previous row of distances)
// this row is A[0][i]: edit distance for an empty s1
// the distance is just the number of characters to delete from s2
for( int i=0; i<=n; ++i ) {
v1[i] = i;
}
// calculate v2 (current row distances) from the previous row v1
for( int i=0; i<m; ++i ) {
// first element of v2 is A[i+1][0]
// edit distance is delete (i+1) chars from s to match empty s2
v2[0] = i+1;
// use formula to fill in the rest of the row
for( int j=0; j<n; ++j ) {
// calculating costs for A[i+1][j+1]
int deletionCost = v1[j+1] + 1;
int insertionCost = v2[j] + 1;
int substitutionCost = v1[j];
if( s1[i] != s2[j] ) {
++ substitutionCost;
}
v2[j+1] = MIN3( deletionCost, insertionCost, substitutionCost );
}
// copy v2 (current row) to v1 (previous row) for next iteration
SWAP( v1, v2 );
}
// after the last swap, the results of v2 are now in v1
int retval = v1[n];
free(v1);
free(v2);
return retval;
}
编辑距离 C++ 语言实现
#include <string>
#include <iostream>
class levenshtein
{
public:
static int compare( const std::string& s1, const std::string& s2 )
{
// create two work vectors of integer distances
const int m = s1.size();
const int n = s2.size();
int* v1 = new int[n+1];
int* v2 = new int[n+1];
// initialize v1 (the previous row of distances)
// this row is A[0][i]: edit distance for an empty s1
// the distance is just the number of characters to delete from s2
for( int i = 0; i <= n; ++i ) {
v1[i] = i;
}
// calculate v2 (current row distances) from the previous row v1
for( int i = 0; i < m; ++i ) {
// first element of v2 is A[i+1][0]
// edit distance is delete (i+1) chars from s to match empty s2
v2[0] = i+1;
// use formula to fill in the rest of the row
for( int j = 0; j < n; ++j ) {
// calculating costs for A[i+1][j+1]
int deletionCost = v1[j+1] + 1;
int insertionCost = v2[j] + 1;
int substitutionCost = v1[j];
if( s1[i] != s2[j] ) {
++ substitutionCost;
}
v2[j+1] = min3( deletionCost, insertionCost, substitutionCost );
}
// copy v2 (current row) to v1 (previous row) for next iteration
swap( v1, v2 );
}
// after the last swap, the results of v2 are now in v1
int retval = v1[n];
delete []v1;
delete []v2;
return retval;
}
private:
static int min3( int a, int b, int c )
{
if ( a < b ) {
return a < c ? a : c ;
}
else {
return b < c ? b : c ;
}
}
static void swap( int*& a, int*& b )
{
int* tmp = a;
a = b;
b = tmp;
}
};
int main( void )
{
const std::string s1 = "Today is Saturday.";
const std::string s2 = "Tomorrow is Sunday.";
int d = levenshtein::compare( s1, s2 );
std::cout << "edit distance: " << d << std::endl;
return 0;
}
或者:
#include <iostream>
#include <vector>
#include <string>
#include <cmath>
using namespace std;
#define min(a,b) ((a<b)?a:b)
//算法
int ldistance(const string source, const string target)
{
//step 1
int n = source.length();
int m = target.length();
if (m == 0) return n;
if (n == 0) return m;
//Construct a matrix
typedef vector< vector<int> > Tmatrix;
Tmatrix matrix(n + 1);
for (int i = 0; i <= n; i++) matrix[i].resize(m + 1);
//step 2 Initialize
for (int i = 1; i <= n; i++) matrix[i][0] = i;
for (int i = 1; i <= m; i++) matrix[0][i] = i;
//step 3
for (int i = 1; i <= n; i++)
{
const char si = source[i - 1];
//step 4
for (int j = 1; j <= m; j++)
{
const char dj = target[j - 1];
//step 5
int cost;
if (si == dj){
cost = 0;
}
else{
cost = 1;
}
//step 6
const int above = matrix[i - 1][j] + 1;
const int left = matrix[i][j - 1] + 1;
const int diag = matrix[i - 1][j - 1] + cost;
matrix[i][j] = min(above, min(left, diag));
}
}//step7
return matrix[n][m];
}
void main(){
string s;
s = "ABC";
string d;
d = "abcd";
//cout << "source=";
//cin >> s;
//cout << "diag=";
//cin >> d;
int dist = ldistance(s, d);
cout << "dist=" << dist << endl;
system("pause");
}
上一篇: 欧几里得算法求最大公约数
下一篇: 动态规划——有顺序的背包问题