The Jaro similarity measures how similar two strings are. It is defined as
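$$
sim_j =
\begin{cases}
0 & \text{if } m = 0 \\
\dfrac{1}{3}\left(\dfrac{m}{|s_1|} + \dfrac{m}{|s_2|} + \dfrac{m - t}{m}\right) & \text{otherwise}
\end{cases}
\tag{1}
$$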
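where $m$ is the number of matching characters, $t$ is the number of transpositions (half the number of matched characters that appear in a different order in the two strings), $|s_1|$ is the length of string 1 (the string is treated as a list of characters), and
$|s_2|$ is the length of string 2.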
This is the code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def jaro(val1, val2):
    """Compute the Jaro similarity between two strings.

    Based on:
    Matthew A. Jaro (1989). Advances in record linkage methodology
    as applied to the 1985 census of Tampa Florida. Journal of the
    American Statistical Association. 84 (406): 414–20.

    A character of ``val1`` at index ``i`` matches the first equal,
    not-yet-matched character of ``val2`` whose index lies within
    ``max(len(val1), len(val2)) // 2 - 1`` positions of ``i``.  The number of
    transpositions is half the number of positions at which the matched
    characters of ``val1`` and the matched characters of ``val2`` (each taken
    in its own string's order) differ.

    Args:
        val1: First string.
        val2: Second string.

    Returns:
        A float between 0.0 and 1.0, rounded to 4 decimal places.  0.0 is
        returned when either string is empty or no characters match, 1.0
        when the strings are equal.

    Implemented by Shark
    """
    # If at least one of the values is empty return 0
    if val1 == '' or val2 == '':
        return 0.0
    # If both values exactly match return 1
    if val1 == val2:
        return 1.0

    # 1 get |s1| and |s2|
    len1 = len(val1)
    len2 = len(val2)
    halflen = max(len1, len2) // 2 - 1

    # 2 get matches
    # Matched positions of val2 are tracked with flags rather than by
    # overwriting characters, so no input character can be mistaken for an
    # "already matched" marker.
    matched2 = [False] * len2
    matches1 = []  # matched characters of val1, in val1 order
    for i, char in enumerate(val1):
        start = max(0, i - halflen)
        end = min(i + halflen + 1, len2)
        for j in range(start, end):
            if not matched2[j] and val2[j] == char:
                matched2[j] = True
                matches1.append(char)
                break

    m = len(matches1)
    if m == 0:
        return 0.0

    # 3 get transpositions
    # The matched characters of val2 must be read in val2's own order;
    # collecting them in val1's order would make both sequences identical
    # and the transposition count always zero.
    matches2 = [char for char, hit in zip(val2, matched2) if hit]
    t = sum(a != b for a, b in zip(matches1, matches2)) / 2.0

    # 4 get jaro
    score = (m/len1 + m/len2 + (m-t)/m)/3
    assert (score >= 0) and (score <= 1), f'Jaro score {score} should be in [0, 1]'
    return round(score, 4)
def _jaro_assigned_chars(source, target, halflen):
    """Return the characters of source that can be assigned to target.

    Each character of ``source`` at index ``i`` is assigned to the first
    equal, not-yet-used character of ``target`` whose index lies in
    ``[i - halflen, i + halflen]`` (clipped to ``target``).  The assigned
    characters are returned as a list, in ``source`` order.
    """
    target_len = len(target)
    # Used positions are tracked with flags instead of overwriting characters
    # with a marker, so inputs containing the marker cannot match used slots.
    used = [False] * target_len
    assigned = []
    for i, char in enumerate(source):
        start = max(0, i - halflen)
        end = min(i + halflen + 1, target_len)
        for j in range(start, end):
            if not used[j] and target[j] == char:
                used[j] = True
                assigned.append(char)
                break
    return assigned


def jaro_comp(val1, val2):
    """Calculate the similarity between the two given attribute values based on
    the Jaro comparison function.

    As described in 'An Application of the Fellegi-Sunter Model of Record
    Linkage to the 1990 U.S. Decennial Census' by William E. Winkler and Yves
    Thibaudeau.

    The common characters are determined in two independent passes (val1
    against val2, then val2 against val1); the transposition count is half
    the number of positions at which the two resulting character sequences
    differ.

    Args:
        val1: First string.
        val2: Second string.

    Returns:
        A float between 0.0 and 1.0, rounded to 4 decimal places.  0.0 is
        returned when either value is empty or no common characters are
        found, 1.0 when the values are equal.

    Implemented by ANU
    """
    # If at least one of the values is empty return 0
    if val1 == '' or val2 == '':
        return 0.0
    # If both attribute values exactly match return 1
    if val1 == val2:
        return 1.0

    len1 = len(val1)  # Number of characters in val1
    len2 = len(val2)  # Number of characters in val2
    halflen = max(len1, len2) // 2 - 1

    assignment1 = _jaro_assigned_chars(val1, val2, halflen)  # Assigned in val1
    assignment2 = _jaro_assigned_chars(val2, val1, halflen)  # Assigned in val2

    common = float(len(assignment1))
    if len(assignment1) != len(assignment2):
        # Defensive: should the two passes ever disagree, use the mean count.
        common = float(len(assignment1) + len(assignment2)) / 2.0

    if common == 0:  # No common characters within half length of strings
        return 0.0

    # Calculate number of transpositions; zip() stops at the shorter sequence
    # so differing lengths cannot cause an out-of-range access.
    transposition = sum(a != b for a, b in zip(assignment1, assignment2)) / 2.0

    jaro_sim = 1./3.*(common / float(len1) + common / float(len2) +
                      (common - transposition) / common)

    assert (jaro_sim >= 0.0) and (jaro_sim <= 1.0), \
        'Similarity weight outside 0-1: %f' % (jaro_sim)
    return round(jaro_sim, 4)
def test():
    """Check that jaro agrees with jaro_comp and reproduces two known scores."""

    def implementations_agree(left, right):
        # Both implementations round to 4 decimals, so exact equality is used.
        return jaro(left, right) == jaro_comp(left, right)

    name_pairs = (
        ('jones', 'johnson'),
        ('michelle', 'michael'),
        ('shackleford', 'shackleford'),
    )
    for left, right in name_pairs:
        assert implementations_agree(left, right)

    known_scores = (
        ('CRATE', 'TRACE', 0.7333),
        ('DwAyNE', 'DuANE', 0.8222),
    )
    for left, right, expected in known_scores:
        assert jaro(left, right) == expected
    for left, right, _ in known_scores:
        assert implementations_agree(left, right)


# Run the checks as soon as the module is loaded.
test()