Pure Perl Fuzzy String Matching
Code is copyright (c) 2013 by Tim Ellis of Fifth Sigma, Inc. and is released under the terms of the Gnu GPL v.2. If you have questions about what this means, contact me.
First, here is the code:
sub fuzzyWordMatch {
my $first = shift; my $second = shift;
$first = lc($first); $first =~ s/[^a-z]//g;
$second = lc($second); $second =~ s/[^a-z]//g;
my $fuzzyMatch = 0;
my $cutoffRatio = 0.3;
my $charSameRatio = 99;
if (length($second) > length($first)) {
($second,$first) = ($first,$second);
}
my $diffWordLen = length($first) - length($second);
if ($diffWordLen > 4) { return (0,0); }
my $sumWordLen = length($first) + length($second);
my $firstLen = length($first);
my $secondLen = length($second);
$first = " $first ";
$second = " $second ";
my $charSame = 0;
foreach my $i (1 .. $firstLen+1) {
my $testChar = substr($first,$i,1);
#print "$testChar~[".substr($second,$i-1,3)."] ";
if ($testChar ne " ") {
if ($testChar eq substr($second,$i,1)) {
$charSame += 1;
} elsif ($testChar eq substr($second,$i-1,1) || $testChar eq substr($second,$i+1,1)) {
$charSame += 0.3;
}
}
$testChar = substr($first,$firstLen+1-$i,1);
#print "$testChar~[".substr($second,$secondLen+0-$i,3)."] ";
if ($testChar ne " ") {
if ($testChar eq substr($second,$secondLen+1-$i,1)) {
$charSame += 1;
} elsif ($testChar eq substr($second,$secondLen+0-$i,1) || $testChar eq substr($second,$secondLen+2-$i,1)) {
$charSame += 0.3;
}
}
}
$charSameRatio = int((($charSame / 2) / $sumWordLen) * 1000) / 1000;
if ($charSameRatio >= $cutoffRatio) {
$fuzzyMatch = 1;
}
return ($fuzzyMatch, $charSameRatio);
}
Next, here is some code that runs some tests against it:
my @inputs = qw(stationary stationery statonary statoonary);
my $inpLen = @inputs;
foreach my $i (0 .. $inpLen - 2) {
foreach my $j ($i + 1 .. $inpLen - 1) {
my $str1 = $inputs[$i];
my $str2 = $inputs[$j];
my ($match,$ratio) = fuzzyWordMatch($str1,$str2);
if ($match) {
print " $str1 <~=> $str2 ($ratio)\n";
} else {
print " x $str1 $str2 ($ratio)\n";
}
}
}
print "\n";
Good luck!