一种改进的变长字符串相似性排序算法

我正在寻找一种字符串相似度算法,它能够在可变长度的字符串上产生比通常建议的(莱文斯坦距离、 soundex 等)更好的结果。

比如说,

给出字符串 A“ Robert”

然后是字符串 B“ Amy Robertson”

会是一个更好的匹配

字符串 C: “ Richard”

此外,这个算法应该是语言不可知的(也适用于英语以外的语言)。

74984 次浏览

Catalysoft 的西蒙•怀特(Simon White)写了一篇文章,介绍了一种非常聪明的算法,可以比较相邻的字符对,这种算法对我的目的非常有效:

Http://www.catalysoft.com/articles/strikeamatch.html

Simon 有一个 Java 版本的算法,下面我写了一个 PL/Ruby 版本的算法(取自 Mark Wong-VanHaren 在相关论坛条目评论中的普通 Ruby 版本) ,这样我就可以在我的 PostgreSQL 查询中使用它:

CREATE FUNCTION string_similarity(str1 varchar, str2 varchar)
RETURNS float8 AS '


str1.downcase!
pairs1 = (0..str1.length-2).collect {|i| str1[i,2]}.reject {
|pair| pair.include? " "}
str2.downcase!
pairs2 = (0..str2.length-2).collect {|i| str2[i,2]}.reject {
|pair| pair.include? " "}
union = pairs1.size + pairs2.size
intersection = 0
pairs1.each do |p1|
0.upto(pairs2.size-1) do |i|
if p1 == pairs2[i]
intersection += 1
pairs2.slice!(i)
break
end
end
end
(2.0 * intersection) / union


' LANGUAGE 'plruby';

非常有效!

那么莱文斯坦距离呢,除以第一个字符串的长度(或者除以两个字符串的 min/max/avg 长度) ?到目前为止,这招对我很管用。

字符串相似度量 包含在字符串比较中使用的许多不同度量的概述(维基百科也有一个概述)。这些度量标准中的许多都是在库 模拟量度中实现的。

还有一个度量的例子,没有包含在给定的概述中,例如 压缩距离(试图近似于 Kolmogorov 的复杂性) ,它可以用于比您提供的文本稍长的文本。

您可能还会考虑更广泛的 自然语言处理主题。这些R 软件包可以让你快速入门(或者至少提供一些想法)。

最后一个编辑-搜索这个主题的其他问题在 SO,有相当多的相关问题。

Marzagao 的回答很棒,我把它转换成了 C # ,所以我想我应该把它贴在这里:

Pastebin Link

/// <summary>
/// This class implements string comparison algorithm
/// based on character pair similarity
/// Source: http://www.catalysoft.com/articles/StrikeAMatch.html
/// </summary>
public class SimilarityTool
{
/// <summary>
/// Compares the two strings based on letter pair matches
/// </summary>
/// <param name="str1"></param>
/// <param name="str2"></param>
/// <returns>The percentage match from 0.0 to 1.0 where 1.0 is 100%</returns>
public double CompareStrings(string str1, string str2)
{
List<string> pairs1 = WordLetterPairs(str1.ToUpper());
List<string> pairs2 = WordLetterPairs(str2.ToUpper());


int intersection = 0;
int union = pairs1.Count + pairs2.Count;


for (int i = 0; i < pairs1.Count; i++)
{
for (int j = 0; j < pairs2.Count; j++)
{
if (pairs1[i] == pairs2[j])
{
intersection++;
pairs2.RemoveAt(j);//Must remove the match to prevent "GGGG" from appearing to match "GG" with 100% success


break;
}
}
}


return (2.0 * intersection) / union;
}


/// <summary>
/// Gets all letter pairs for each
/// individual word in the string
/// </summary>
/// <param name="str"></param>
/// <returns></returns>
private List<string> WordLetterPairs(string str)
{
List<string> AllPairs = new List<string>();


// Tokenize the string and put the tokens/words into an array
string[] Words = Regex.Split(str, @"\s");


// For each word
for (int w = 0; w < Words.Length; w++)
{
if (!string.IsNullOrEmpty(Words[w]))
{
// Find the pairs of characters
String[] PairsInWord = LetterPairs(Words[w]);


for (int p = 0; p < PairsInWord.Length; p++)
{
AllPairs.Add(PairsInWord[p]);
}
}
}


return AllPairs;
}


/// <summary>
/// Generates an array containing every
/// two consecutive letters in the input string
/// </summary>
/// <param name="str"></param>
/// <returns></returns>
private string[] LetterPairs(string str)
{
int numPairs = str.Length - 1;


string[] pairs = new string[numPairs];


for (int i = 0; i < numPairs; i++)
{
pairs[i] = str.Substring(i, 2);
}


return pairs;
}
}

这次讨论真的很有帮助,谢谢。我将算法转换为 VBA,以便与 Excel 一起使用,并编写了一个工作表函数的几个版本,一个用于简单地比较一对字符串,另一个用于比较一个字符串和一个字符串范围/数组。StrSimLookup 版本以字符串、数组索引或相似性度量的形式返回最后一个最佳匹配。

这个实现产生的结果与 Simon White 网站上亚马逊示例中列出的相同,只是在低得分比赛中有一些小的例外; 不确定这种差异是从哪里来的,可能是 VBA 的 Split 函数,但我还没有研究过,因为它对我的目的来说工作得很好。

'Implements functions to rate how similar two strings are on
'a scale of 0.0 (completely dissimilar) to 1.0 (exactly similar)
'Source:   http://www.catalysoft.com/articles/StrikeAMatch.html
'Author: Bob Chatham, bob.chatham at gmail.com
'9/12/2010


Option Explicit


Public Function stringSimilarity(str1 As String, str2 As String) As Variant
'Simple version of the algorithm that computes the similiarity metric
'between two strings.
'NOTE: This verision is not efficient to use if you're comparing one string
'with a range of other values as it will needlessly calculate the pairs for the
'first string over an over again; use the array-optimized version for this case.


Dim sPairs1 As Collection
Dim sPairs2 As Collection


Set sPairs1 = New Collection
Set sPairs2 = New Collection


WordLetterPairs str1, sPairs1
WordLetterPairs str2, sPairs2


stringSimilarity = SimilarityMetric(sPairs1, sPairs2)


Set sPairs1 = Nothing
Set sPairs2 = Nothing


End Function


Public Function strSimA(str1 As Variant, rRng As Range) As Variant
'Return an array of string similarity indexes for str1 vs every string in input range rRng
Dim sPairs1 As Collection
Dim sPairs2 As Collection
Dim arrOut As Variant
Dim l As Long, j As Long


Set sPairs1 = New Collection


WordLetterPairs CStr(str1), sPairs1


l = rRng.Count
ReDim arrOut(1 To l)
For j = 1 To l
Set sPairs2 = New Collection
WordLetterPairs CStr(rRng(j)), sPairs2
arrOut(j) = SimilarityMetric(sPairs1, sPairs2)
Set sPairs2 = Nothing
Next j


strSimA = Application.Transpose(arrOut)


End Function


Public Function strSimLookup(str1 As Variant, rRng As Range, Optional returnType) As Variant
'Return either the best match or the index of the best match
'depending on returnTYype parameter) between str1 and strings in rRng)
' returnType = 0 or omitted: returns the best matching string
' returnType = 1           : returns the index of the best matching string
' returnType = 2           : returns the similarity metric


Dim sPairs1 As Collection
Dim sPairs2 As Collection
Dim metric, bestMetric As Double
Dim i, iBest As Long
Const RETURN_STRING As Integer = 0
Const RETURN_INDEX As Integer = 1
Const RETURN_METRIC As Integer = 2


If IsMissing(returnType) Then returnType = RETURN_STRING


Set sPairs1 = New Collection


WordLetterPairs CStr(str1), sPairs1


bestMetric = -1
iBest = -1


For i = 1 To rRng.Count
Set sPairs2 = New Collection
WordLetterPairs CStr(rRng(i)), sPairs2
metric = SimilarityMetric(sPairs1, sPairs2)
If metric > bestMetric Then
bestMetric = metric
iBest = i
End If
Set sPairs2 = Nothing
Next i


If iBest = -1 Then
strSimLookup = CVErr(xlErrValue)
Exit Function
End If


Select Case returnType
Case RETURN_STRING
strSimLookup = CStr(rRng(iBest))
Case RETURN_INDEX
strSimLookup = iBest
Case Else
strSimLookup = bestMetric
End Select


End Function


Public Function strSim(str1 As String, str2 As String) As Variant
Dim ilen, iLen1, ilen2 As Integer


iLen1 = Len(str1)
ilen2 = Len(str2)


If iLen1 >= ilen2 Then ilen = ilen2 Else ilen = iLen1


strSim = stringSimilarity(Left(str1, ilen), Left(str2, ilen))


End Function


Sub WordLetterPairs(str As String, pairColl As Collection)
'Tokenize str into words, then add all letter pairs to pairColl


Dim Words() As String
Dim word, nPairs, pair As Integer


Words = Split(str)


If UBound(Words) < 0 Then
Set pairColl = Nothing
Exit Sub
End If


For word = 0 To UBound(Words)
nPairs = Len(Words(word)) - 1
If nPairs > 0 Then
For pair = 1 To nPairs
pairColl.Add Mid(Words(word), pair, 2)
Next pair
End If
Next word


End Sub


Private Function SimilarityMetric(sPairs1 As Collection, sPairs2 As Collection) As Variant
'Helper function to calculate similarity metric given two collections of letter pairs.
'This function is designed to allow the pair collections to be set up separately as needed.
'NOTE: sPairs2 collection will be altered as pairs are removed; copy the collection
'if this is not the desired behavior.
'Also assumes that collections will be deallocated somewhere else


Dim Intersect As Double
Dim Union As Double
Dim i, j As Long


If sPairs1.Count = 0 Or sPairs2.Count = 0 Then
SimilarityMetric = CVErr(xlErrNA)
Exit Function
End If


Union = sPairs1.Count + sPairs2.Count
Intersect = 0


For i = 1 To sPairs1.Count
For j = 1 To sPairs2.Count
If StrComp(sPairs1(i), sPairs2(j)) = 0 Then
Intersect = Intersect + 1
sPairs2.Remove j
Exit For
End If
Next j
Next i


SimilarityMetric = (2 * Intersect) / Union


End Function

下面是 Marzagao’s的另一个版本,这是用 Python 编写的:

def get_bigrams(string):
"""
Take a string and return a list of bigrams.
"""
s = string.lower()
return [s[i:i+2] for i in list(range(len(s) - 1))]


def string_similarity(str1, str2):
"""
Perform bigram comparison between two strings
and return a percentage match in decimal form.
"""
pairs1 = get_bigrams(str1)
pairs2 = get_bigrams(str2)
union  = len(pairs1) + len(pairs2)
hit_count = 0
for x in pairs1:
for y in pairs2:
if x == y:
hit_count += 1
break
return (2.0 * hit_count) / union


if __name__ == "__main__":
"""
Run a test using the example taken from:
http://www.catalysoft.com/articles/StrikeAMatch.html
"""
w1 = 'Healed'
words = ['Heard', 'Healthy', 'Help', 'Herded', 'Sealed', 'Sold']


for w2 in words:
print('Healed --- ' + w2)
print(string_similarity(w1, w2))
print()

我把 Simon White 的算法翻译成 PL/pgSQL,这是我的贡献。

<!-- language: lang-sql -->


create or replace function spt1.letterpairs(in p_str varchar)
returns varchar  as
$$
declare


v_numpairs integer := length(p_str)-1;
v_pairs varchar[];


begin


for i in 1 .. v_numpairs loop
v_pairs[i] := substr(p_str, i, 2);
end loop;


return v_pairs;


end;
$$ language 'plpgsql';


--===================================================================


create or replace function spt1.wordletterpairs(in p_str varchar)
returns varchar as
$$
declare
v_allpairs varchar[];
v_words varchar[];
v_pairsinword varchar[];
begin
v_words := regexp_split_to_array(p_str, '[[:space:]]');


for i in 1 .. array_length(v_words, 1) loop
v_pairsinword := spt1.letterpairs(v_words[i]);


if v_pairsinword is not null then
for j in 1 .. array_length(v_pairsinword, 1) loop
v_allpairs := v_allpairs || v_pairsinword[j];
end loop;
end if;


end loop;




return v_allpairs;
end;
$$ language 'plpgsql';


--===================================================================


create or replace function spt1.arrayintersect(ANYARRAY, ANYARRAY)
returns anyarray as
$$
select array(select unnest($1) intersect select unnest($2))
$$ language 'sql';


--===================================================================


create or replace function spt1.comparestrings(in p_str1 varchar, in p_str2 varchar)
returns float as
$$
declare
v_pairs1 varchar[];
v_pairs2 varchar[];
v_intersection integer;
v_union integer;
begin
v_pairs1 := wordletterpairs(upper(p_str1));
v_pairs2 := wordletterpairs(upper(p_str2));
v_union := array_length(v_pairs1, 1) + array_length(v_pairs2, 1);


v_intersection := array_length(arrayintersect(v_pairs1, v_pairs2), 1);


return (2.0 * v_intersection / v_union);
end;
$$ language 'plpgsql';

嘿,伙计们,我用 javascript 试过了,但是我是新手,有人知道更快的方法吗?

function get_bigrams(string) {
// Takes a string and returns a list of bigrams
var s = string.toLowerCase();
var v = new Array(s.length-1);
for (i = 0; i< v.length; i++){
v[i] =s.slice(i,i+2);
}
return v;
}


function string_similarity(str1, str2){
/*
Perform bigram comparison between two strings
and return a percentage match in decimal form
*/
var pairs1 = get_bigrams(str1);
var pairs2 = get_bigrams(str2);
var union = pairs1.length + pairs2.length;
var hit_count = 0;
for (x in pairs1){
for (y in pairs2){
if (pairs1[x] == pairs2[y]){
hit_count++;
}
}
}
return ((2.0 * hit_count) / union);
}




var w1 = 'Healed';
var word =['Heard','Healthy','Help','Herded','Sealed','Sold']
for (w2 in word){
console.log('Healed --- ' + word[w2])
console.log(string_similarity(w1,word[w2]));
}

下面是 Simon White 推荐的 StrikeAMatch 算法的 PHP 实现。它的优点(如链接中所说)如下:

  • 词汇相似性的真实反映应该被认为是相似的。特别是,重要的子字符串重叠应该指向字符串之间的高度相似性。

  • 对词序变化的鲁棒性 -两个字符串包含相同的词,但在不同的顺序,应该被认为是相似的。另一方面,如果一个字符串只是包含在另一个字符中的随机变位,那么它(通常)应该被认为是不相似的。

  • 语言独立性 -该算法不仅适用于英语,而且适用于许多不同的语言。

<?php
/**
* LetterPairSimilarity algorithm implementation in PHP
* @author Igal Alkon
* @link http://www.catalysoft.com/articles/StrikeAMatch.html
*/
class LetterPairSimilarity
{
/**
* @param $str
* @return mixed
*/
private function wordLetterPairs($str)
{
$allPairs = array();


// Tokenize the string and put the tokens/words into an array


$words = explode(' ', $str);


// For each word
for ($w = 0; $w < count($words); $w++)
{
// Find the pairs of characters
$pairsInWord = $this->letterPairs($words[$w]);


for ($p = 0; $p < count($pairsInWord); $p++)
{
$allPairs[] = $pairsInWord[$p];
}
}


return $allPairs;
}


/**
* @param $str
* @return array
*/
private function letterPairs($str)
{
$numPairs = mb_strlen($str)-1;
$pairs = array();


for ($i = 0; $i < $numPairs; $i++)
{
$pairs[$i] = mb_substr($str,$i,2);
}


return $pairs;
}


/**
* @param $str1
* @param $str2
* @return float
*/
public function compareStrings($str1, $str2)
{
$pairs1 = $this->wordLetterPairs(strtoupper($str1));
$pairs2 = $this->wordLetterPairs(strtoupper($str2));


$intersection = 0;


$union = count($pairs1) + count($pairs2);


for ($i=0; $i < count($pairs1); $i++)
{
$pair1 = $pairs1[$i];


$pairs2 = array_values($pairs2);
for($j = 0; $j < count($pairs2); $j++)
{
$pair2 = $pairs2[$j];
if ($pair1 === $pair2)
{
$intersection++;
unset($pairs2[$j]);
break;
}
}
}


return (2.0*$intersection)/$union;
}
}

Dice 系数算法(Simon White/marzagao 的答案)是在 Ruby 中实现的 在 amatch gem 中使用 double _ range _ 類似的方法

Https://github.com/flori/amatch

这个 gem 还包含一些近似匹配和字符串比较算法的实现: Levenshtein 编辑距离、 Sellers 编辑距离、汉明距离、最长公共子序列长度、最长公共子字符串长度、对距离度量、 Jaro-Winkler 度量。

约翰 · 拉特利奇的答案的简短版本:

def get_bigrams(string):
'''
Takes a string and returns a list of bigrams
'''
s = string.lower()
return {s[i:i+2] for i in xrange(len(s) - 1)}


def string_similarity(str1, str2):
'''
Perform bigram comparison between two strings
and return a percentage match in decimal form
'''
pairs1 = get_bigrams(str1)
pairs2 = get_bigrams(str2)
return (2.0 * len(pairs1 & pairs2)) / (len(pairs1) + len(pairs2))

算法的一个更快的 PHP 版本:

/**
*
* @param $str
* @return mixed
*/
private static function wordLetterPairs ($str)
{
$allPairs = array();


// Tokenize the string and put the tokens/words into an array


$words = explode(' ', $str);


// For each word
for ($w = 0; $w < count($words); $w ++) {
// Find the pairs of characters
$pairsInWord = self::letterPairs($words[$w]);


for ($p = 0; $p < count($pairsInWord); $p ++) {
$allPairs[$pairsInWord[$p]] = $pairsInWord[$p];
}
}


return array_values($allPairs);
}


/**
*
* @param $str
* @return array
*/
private static function letterPairs ($str)
{
$numPairs = mb_strlen($str) - 1;
$pairs = array();


for ($i = 0; $i < $numPairs; $i ++) {
$pairs[$i] = mb_substr($str, $i, 2);
}


return $pairs;
}


/**
*
* @param $str1
* @param $str2
* @return float
*/
public static function compareStrings ($str1, $str2)
{
$pairs1 = self::wordLetterPairs(mb_strtolower($str1));
$pairs2 = self::wordLetterPairs(mb_strtolower($str2));




$union = count($pairs1) + count($pairs2);


$intersection = count(array_intersect($pairs1, $pairs2));


return (2.0 * $intersection) / $union;
}

对于我的数据(大约2300个比较) ,我使用 伊格尔 · 阿尔康解决方案的运行时间为0.58秒,而使用我的解决方案的运行时间为0.35秒。

一个漂亮的 Scala 版本:

  def pairDistance(s1: String, s2: String): Double = {


def strToPairs(s: String, acc: List[String]): List[String] = {
if (s.size < 2) acc
else strToPairs(s.drop(1),
if (s.take(2).contains(" ")) acc else acc ::: List(s.take(2)))
}


val lst1 = strToPairs(s1.toUpperCase, List())
val lst2 = strToPairs(s2.toUpperCase, List())


(2.0 * lst2.intersect(lst1).size) / (lst1.size + lst2.size)


}

我的 JavaScript 实现接受一个字符串或字符串数组,以及一个可选层(默认层为0.5)。如果传递给它一个字符串,它将返回 true 或 false,这取决于字符串的相似性得分是大于还是等于底部。如果传递给它一个字符串数组,它将返回一个字符串数组,这些字符串的相似性得分大于或等于底部,按得分排序。

例子:

'Healed'.fuzzy('Sealed');      // returns true
'Healed'.fuzzy('Help');        // returns false
'Healed'.fuzzy('Help', 0.25);  // returns true


'Healed'.fuzzy(['Sold', 'Herded', 'Heard', 'Help', 'Sealed', 'Healthy']);
// returns ["Sealed", "Healthy"]


'Healed'.fuzzy(['Sold', 'Herded', 'Heard', 'Help', 'Sealed', 'Healthy'], 0);
// returns ["Sealed", "Healthy", "Heard", "Herded", "Help", "Sold"]

这就是:

(function(){
var default_floor = 0.5;


function pairs(str){
var pairs = []
, length = str.length - 1
, pair;
str = str.toLowerCase();
for(var i = 0; i < length; i++){
pair = str.substr(i, 2);
if(!/\s/.test(pair)){
pairs.push(pair);
}
}
return pairs;
}


function similarity(pairs1, pairs2){
var union = pairs1.length + pairs2.length
, hits = 0;


for(var i = 0; i < pairs1.length; i++){
for(var j = 0; j < pairs2.length; j++){
if(pairs1[i] == pairs2[j]){
pairs2.splice(j--, 1);
hits++;
break;
}
}
}
return 2*hits/union || 0;
}


String.prototype.fuzzy = function(strings, floor){
var str1 = this
, pairs1 = pairs(this);


floor = typeof floor == 'number' ? floor : default_floor;


if(typeof(strings) == 'string'){
return str1.length > 1 && strings.length > 1 && similarity(pairs1, pairs(strings)) >= floor || str1.toLowerCase() == strings.toLowerCase();
}else if(strings instanceof Array){
var scores = {};


strings.map(function(str2){
scores[str2] = str1.length > 1 ? similarity(pairs1, pairs(str2)) : 1*(str1.toLowerCase() == str2.toLowerCase());
});


return strings.filter(function(str){
return scores[str] >= floor;
}).sort(function(a, b){
return scores[b] - scores[a];
});
}
};
})();

基于 Michael La Voie 令人敬畏的 C # 版本,根据将其作为扩展方法的请求,下面是我想到的。这样做的主要好处是,您可以按照匹配百分比对通用列表进行排序。例如,假设您的对象中有一个名为“ City”的字符串字段。用户搜索“ Chester”,您希望按照匹配的降序返回结果。例如,您希望在 Rochester 之前显示 Chester 的字面匹配。为此,向对象添加两个新属性:

    public string SearchText { get; set; }
public double PercentMatch
{
get
{
return City.ToUpper().PercentMatchTo(this.SearchText.ToUpper());
}
}

然后在每个对象上,将 SearchText 设置为用户搜索的内容。然后你就可以很容易地对它进行分类,比如:

    zipcodes = zipcodes.OrderByDescending(x => x.PercentMatch);

下面是一个小小的修改,使它成为一个扩展方法:

    /// <summary>
/// This class implements string comparison algorithm
/// based on character pair similarity
/// Source: http://www.catalysoft.com/articles/StrikeAMatch.html
/// </summary>
public static double PercentMatchTo(this string str1, string str2)
{
List<string> pairs1 = WordLetterPairs(str1.ToUpper());
List<string> pairs2 = WordLetterPairs(str2.ToUpper());


int intersection = 0;
int union = pairs1.Count + pairs2.Count;


for (int i = 0; i < pairs1.Count; i++)
{
for (int j = 0; j < pairs2.Count; j++)
{
if (pairs1[i] == pairs2[j])
{
intersection++;
pairs2.RemoveAt(j);//Must remove the match to prevent "GGGG" from appearing to match "GG" with 100% success


break;
}
}
}


return (2.0 * intersection) / union;
}


/// <summary>
/// Gets all letter pairs for each
/// individual word in the string
/// </summary>
/// <param name="str"></param>
/// <returns></returns>
private static List<string> WordLetterPairs(string str)
{
List<string> AllPairs = new List<string>();


// Tokenize the string and put the tokens/words into an array
string[] Words = Regex.Split(str, @"\s");


// For each word
for (int w = 0; w < Words.Length; w++)
{
if (!string.IsNullOrEmpty(Words[w]))
{
// Find the pairs of characters
String[] PairsInWord = LetterPairs(Words[w]);


for (int p = 0; p < PairsInWord.Length; p++)
{
AllPairs.Add(PairsInWord[p]);
}
}
}


return AllPairs;
}


/// <summary>
/// Generates an array containing every
/// two consecutive letters in the input string
/// </summary>
/// <param name="str"></param>
/// <returns></returns>
private static  string[] LetterPairs(string str)
{
int numPairs = str.Length - 1;


string[] pairs = new string[numPairs];


for (int i = 0; i < numPairs; i++)
{
pairs[i] = str.Substring(i, 2);
}


return pairs;
}

下面是 R 版本:

get_bigrams <- function(str)
{
lstr = tolower(str)
bigramlst = list()
for(i in 1:(nchar(str)-1))
{
bigramlst[[i]] = substr(str, i, i+1)
}
return(bigramlst)
}


str_similarity <- function(str1, str2)
{
pairs1 = get_bigrams(str1)
pairs2 = get_bigrams(str2)
unionlen  = length(pairs1) + length(pairs2)
hit_count = 0
for(x in 1:length(pairs1)){
for(y in 1:length(pairs2)){
if (pairs1[[x]] == pairs2[[y]])
hit_count = hit_count + 1
}
}
return ((2.0 * hit_count) / unionlen)
}

一个 Haskell 版本ーー请随意提出编辑建议,因为我没有做太多的 Haskell 工作。

import Data.Char
import Data.List


-- Convert a string into words, then get the pairs of words from that phrase
wordLetterPairs :: String -> [String]
wordLetterPairs s1 = concat $ map pairs $ words s1


-- Converts a String into a list of letter pairs.
pairs :: String -> [String]
pairs [] = []
pairs (x:[]) = []
pairs (x:ys) = [x, head ys]:(pairs ys)


-- Calculates the match rating for two strings
matchRating :: String -> String -> Double
matchRating s1 s2 = (numberOfMatches * 2) / totalLength
where pairsS1 = wordLetterPairs $ map toLower s1
pairsS2 = wordLetterPairs $ map toLower s2
numberOfMatches = fromIntegral $ length $ pairsS1 `intersect` pairsS2
totalLength = fromIntegral $ length pairsS1 + length pairsS2

抱歉,答案不是作者发明的。这是一个众所周知的算法,首先由数字设备公司提出,通常被称为瓦。

Http://www.hpl.hp.com/techreports/compaq-dec/src-tn-1997-015.pdf

Clojure:

(require '[clojure.set :refer [intersection]])


(defn bigrams [s]
(->> (split s #"\s+")
(mapcat #(partition 2 1 %))
(set)))


(defn string-similarity [a b]
(let [a-pairs (bigrams a)
b-pairs (bigrams b)
total-count (+ (count a-pairs) (count b-pairs))
match-count (count (intersection a-pairs b-pairs))
similarity (/ (* 2 match-count) total-count)]
similarity))

受到 这些算法的启发,在 C99中发布 Marzagao 的回答

double dice_match(const char *string1, const char *string2) {


//check fast cases
if (((string1 != NULL) && (string1[0] == '\0')) ||
((string2 != NULL) && (string2[0] == '\0'))) {
return 0;
}
if (string1 == string2) {
return 1;
}


size_t strlen1 = strlen(string1);
size_t strlen2 = strlen(string2);
if (strlen1 < 2 || strlen2 < 2) {
return 0;
}


size_t length1 = strlen1 - 1;
size_t length2 = strlen2 - 1;


double matches = 0;
int i = 0, j = 0;


//get bigrams and compare
while (i < length1 && j < length2) {
char a[3] = {string1[i], string1[i + 1], '\0'};
char b[3] = {string2[j], string2[j + 1], '\0'};
int cmp = strcmpi(a, b);
if (cmp == 0) {
matches += 2;
}
i++;
j++;
}


return matches / (length1 + length2);
}

基于 原创文章的一些测试:

#include <stdio.h>


void article_test1() {
char *string1 = "FRANCE";
char *string2 = "FRENCH";
printf("====%s====\n", __func__);
printf("%2.f%% == 40%%\n", dice_match(string1, string2) * 100);
}




void article_test2() {
printf("====%s====\n", __func__);
char *string = "Healed";
char *ss[] = {"Heard", "Healthy", "Help",
"Herded", "Sealed", "Sold"};
int correct[] = {44, 55, 25, 40, 80, 0};
for (int i = 0; i < 6; ++i) {
printf("%2.f%% == %d%%\n", dice_match(string, ss[i]) * 100, correct[i]);
}
}


void multicase_test() {
char *string1 = "FRaNcE";
char *string2 = "fREnCh";
printf("====%s====\n", __func__);
printf("%2.f%% == 40%%\n", dice_match(string1, string2) * 100);


}


void gg_test() {
char *string1 = "GG";
char *string2 = "GGGGG";
printf("====%s====\n", __func__);
printf("%2.f%% != 100%%\n", dice_match(string1, string2) * 100);
}




int main() {
article_test1();
article_test2();
multicase_test();
gg_test();


return 0;
}

下面是另一个基于 Sørensen 的相似度-骰子索引(marzagao 的回答) ,这个是用 c + + 11写的:

/*
* Similarity based in Sørensen–Dice index.
*
* Returns the Similarity between _str1 and _str2.
*/
double similarity_sorensen_dice(const std::string& _str1, const std::string& _str2) {
// Base case: if some string is empty.
if (_str1.empty() || _str2.empty()) {
return 1.0;
}


auto str1 = upper_string(_str1);
auto str2 = upper_string(_str2);


// Base case: if the strings are equals.
if (str1 == str2) {
return 0.0;
}


// Base case: if some string does not have bigrams.
if (str1.size() < 2 || str2.size() < 2) {
return 1.0;
}


// Extract bigrams from str1
auto num_pairs1 = str1.size() - 1;
std::unordered_set<std::string> str1_bigrams;
str1_bigrams.reserve(num_pairs1);
for (unsigned i = 0; i < num_pairs1; ++i) {
str1_bigrams.insert(str1.substr(i, 2));
}


// Extract bigrams from str2
auto num_pairs2 = str2.size() - 1;
std::unordered_set<std::string> str2_bigrams;
str2_bigrams.reserve(num_pairs2);
for (unsigned int i = 0; i < num_pairs2; ++i) {
str2_bigrams.insert(str2.substr(i, 2));
}


// Find the intersection between the two sets.
int intersection = 0;
if (str1_bigrams.size() < str2_bigrams.size()) {
const auto it_e = str2_bigrams.end();
for (const auto& bigram : str1_bigrams) {
intersection += str2_bigrams.find(bigram) != it_e;
}
} else {
const auto it_e = str1_bigrams.end();
for (const auto& bigram : str2_bigrams) {
intersection += str1_bigrams.find(bigram) != it_e;
}
}


// Returns similarity coefficient.
return (2.0 * intersection) / (num_pairs1 + num_pairs2);
}

我正在寻找@marzagao 的回答所指示的算法的纯 Ruby 实现。不幸的是,@marzagao 表示的链接中断了。在@s01ipsist 的回答中,他指出 Ruby gem 火柴的实现不是纯粹的 Ruby。所以我搜索了一下,发现 gem 模糊匹配给你上有纯 Ruby 实现(尽管这个 gem 使用 amatch)。我希望这能帮到像我这样的人。

**I've converted marzagao's answer to Java.**


import org.apache.commons.lang3.StringUtils; //Add a apache commons jar in pom.xml


import java.util.ArrayList;
import java.util.Collections;
import java.util.List;


public class SimilarityComparator {
public static void main(String[] args) {
String str0 = "Nischal";
String str1 = "Nischal";
double v = compareStrings(str0, str1);
System.out.println("Similarity betn " + str0 + " and " + str1 + " = " + v);


}


private static double compareStrings(String str1, String str2) {
List<String> pairs1 = wordLetterPairs(str1.toUpperCase());
List<String> pairs2 = wordLetterPairs(str2.toUpperCase());


int intersection = 0;
int union = pairs1.size() + pairs2.size();


for (String s : pairs1) {
for (int j = 0; j < pairs2.size(); j++) {
if (s.equals(pairs2.get(j))) {
intersection++;
pairs2.remove(j);
break;
}
}
}
return (2.0 * intersection) / union;
}


private static List<String> wordLetterPairs(String str) {
List<String> AllPairs = new ArrayList<>();
String[] Words = str.split("\\s");
for (String word : Words) {
if (StringUtils.isNotBlank(word)) {
String[] PairsInWord = letterPairs(word);
Collections.addAll(AllPairs, PairsInWord);
}
}
return AllPairs;
}


private static String[] letterPairs(String str) {
int numPairs = str.length() - 1;
String[] pairs = new String[numPairs];
for (int i = 0; i < numPairs; i++) {
try {
pairs[i] = str.substring(i, i + 2);
} catch (Exception e) {
pairs[i] = str.substring(i, numPairs);
}
}
return pairs;
}
}

为什么不用 JavaScript 实现,我还解释了算法。

算法

  • 输入: FranceFrench
  • 将它们映射到它们的大写字符(使算法对大小写不敏感) ,然后将它们分成字符对:
FRANCE: {FR, RA, AN, NC, CE}
FRENCH: {FR, RE, EN, NC, CH}
  • 找到那个十字路口:

intersection

  • 结果:

algorithm

实施

function similarity(s1, s2) {
const
set1 = pairs(s1.toUpperCase()), // [ FR, RA, AN, NC, CE ]
set2 = pairs(s2.toUpperCase()), // [ FR, RE, EN, NC, CH ]
intersection = set1.filter(x => set2.includes(x)); // [ FR, NC ]
// Tips: Instead of `2` multiply by `200`, To get percentage.
return (intersection.length * 2) / (set1.length + set2.length);
}
function pairs(input) {
const tokenized = [];
for (let i = 0; i < input.length - 1; i++)
tokenized.push(input.substring(i, 2 + i));


return tokenized;
}
console.log(similarity("FRANCE", "FRENCH"));

按词语相似度排列结果

  1. 80% 密封
  2. 健康-55%
  3. 听说44%
  4. 羊群占40%
  5. 帮助-25%
  6. 成交0%

来自同一个 来源

下面是另一个遵循原始 文章的 c + + 实现,它最小化了动态内存分配。

它在示例中获得了相同的匹配值,但我认为最好也考虑单个字符的单词。

//---------------------------------------------------------------------------
// Similarity based on Sørensen–Dice index
double calc_similarity( const std::string_view s1, const std::string_view s2 )
{
// Check banal cases
if( s1.empty() || s2.empty() )
{// Empty string is never similar to another
return 0.0;
}
else if( s1==s2 )
{// Perfectly equal
return 1.0;
}
else if( s1.length()==1 || s2.length()==1 )
{// Single (not equal) characters have zero similarity
return 0.0;
}


/////////////////////////////////////////////////////////////////////////
// Represents a pair of adjacent characters
class charpair_t final
{
public:
charpair_t(const char a, const char b) noexcept : c1(a), c2(b) {}
[[nodiscard]] bool operator==(const charpair_t& other) const noexcept { return c1==other.c1 && c2==other.c2; }
private:
char c1, c2;
};


/////////////////////////////////////////////////////////////////////////
// Collects and access a sequence of adjacent characters (skipping spaces)
class charpairs_t final
{
public:
charpairs_t(const std::string_view s)
{
assert( !s.empty() );
const std::size_t i_last = s.size()-1;
std::size_t i = 0;
chpairs.reserve(i_last);
while( i<i_last )
{
// Accepting also single-character words (the second is a space)
//if( !std::isspace(s[i]) ) chpairs.emplace_back( std::tolower(s[i]), std::tolower(s[i+1]) );
// Skipping single-character words (as in the original article)
if( std::isspace(s[i]) ) ; // Skip
else if( std::isspace(s[i+1]) ) ++i; // Skip also next
else chpairs.emplace_back( std::tolower(s[i]), std::tolower(s[i+1]) );
++i;
}
}


[[nodiscard]] auto size() const noexcept { return chpairs.size(); }
[[nodiscard]] auto cbegin() const noexcept { return chpairs.cbegin(); }
[[nodiscard]] auto cend() const noexcept { return chpairs.cend(); }
auto erase(std::vector<charpair_t>::const_iterator i) { return chpairs.erase(i); }


private:
std::vector<charpair_t> chpairs;
};


charpairs_t chpairs1{s1},
chpairs2{s2};
const double orig_avg_bigrams_count = 0.5 * static_cast<double>(chpairs1.size() + chpairs2.size());
std::size_t matching_bigrams_count = 0;
for( auto ib1=chpairs1.cbegin(); ib1!=chpairs1.cend(); ++ib1 )
{
for( auto ib2=chpairs2.cbegin(); ib2!=chpairs2.cend(); )
{
if( *ib1==*ib2 )
{
++matching_bigrams_count;
ib2 = chpairs2.erase(ib2); // Avoid to match the same character pair multiple times
break;
}
else ++ib2;
}
}
return static_cast<double>(matching_bigrams_count) / orig_avg_bigrams_count;
}