Home > AI > Uncategorized

Jaro similarity

Jaro similarity measures how similar two strings are, on a scale from 0 (nothing in common) to 1 (identical). It is defined as:

\begin{equation*}\mathrm{Jaro} = \begin{cases} 0 & \text{if } m = 0 \\ \frac{1}{3}\left(\frac{m}{|s1|} + \frac{m}{|s2|} + \frac{m-t}{m}\right) & \text{otherwise} \end{cases} \tag{1}\end{equation*}

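Here m is the number of matching characters, t is the number of transpositions (half the number of matched characters that appear in a different order in the two strings), |s1| is the length of string 1 (each string is treated as a list of characters), and |s2| is the length of string 2.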

This is the code

def jaro(val1: str, val2: str) -> float:
    """Compute the Jaro similarity between two strings.

    Reference:
        Matthew A. Jaro (1989). Advances in record linkage methodology
        as applied to the 1985 census of Tampa Florida. Journal of the
        American Statistical Association. 84 (406): 414–20.

    Implemented by Shark.

    Args:
        val1: First string.
        val2: Second string.

    Returns:
        The Jaro similarity rounded to 4 decimal places: 0.0 when either
        string is empty or no characters match, 1.0 when the strings are
        equal, otherwise a value in between.
    """
    # If at least one of the values is empty return 0
    if val1 == '' or val2 == '':
        return 0.0
    # If both values exactly match return 1
    if val1 == val2:
        return 1.0

    # 1 get |s1| and |s2|
    len1 = len(val1)
    len2 = len(val2)
    # Two characters only count as a match when their positions differ by
    # at most this many places.
    halflen = max(len1, len2) // 2 - 1

    # 2 get matches
    # Track consumed positions of val2 with flags rather than overwriting
    # them with a marker character: a marker could itself occur in the
    # input and be matched again.
    used2 = [False] * len2
    matched1 = []  # matched characters, in val1 order
    for i, char in enumerate(val1):
        start = max(0, i - halflen)
        end = min(i + halflen + 1, len2)
        for j in range(start, end):
            if not used2[j] and val2[j] == char:
                used2[j] = True
                matched1.append(char)
                break

    m = len(matched1)
    if m == 0:
        return 0.0

    # 3 get transpositions
    # The matched characters of val2 must be read in val2's own order;
    # pairing each val1 character with the character it matched would
    # always compare equal and hide every transposition.
    matched2 = [char for char, used in zip(val2, used2) if used]
    t = sum(a != b for a, b in zip(matched1, matched2)) / 2.0

    # 4 get jaro
    score = (m / len1 + m / len2 + (m - t) / m) / 3
    assert 0 <= score <= 1, f'Jaro score {score} should be in [0, 1]'
    return round(score, 4)
def jaro_comp(val1, val2):
    """Return the Jaro similarity of two attribute values.

    Follows the Jaro comparison function described in 'An Application of the
    Fellegi-Sunter Model of Record Linkage to the 1990 U.S. Decennial Census'
    by William E. Winkler and Yves Thibaudeau.

    The result is rounded to 4 decimal places and lies between 0.0 and 1.0.

    Implemented by ANU
    """
    # An empty value is never similar to anything.
    if val1 == '' or val2 == '':
        return 0.0
    # Identical values are a perfect match.
    if val1 == val2:
        return 1.0

    size1 = len(val1)
    size2 = len(val2)
    halflen = int(max(size1, size2) / 2) - 1
    JARO_MARKER_CHAR = chr(1)  # Overwrites a character once it is assigned

    def _assigned(source, target):
        # Collect, in source order, the characters of source that are found
        # in a still-unassigned slot of target within the search window.
        remaining = target
        limit = len(target)
        picked = []
        for pos, char in enumerate(source):
            lo = max(0, pos - halflen)
            hi = min(pos + halflen + 1, limit)
            hit = remaining.find(char, lo, hi)
            if hit > -1:
                picked.append(char)
                remaining = (remaining[:hit] + JARO_MARKER_CHAR
                             + remaining[hit + 1:])
        return ''.join(picked)

    assigned1 = _assigned(val1, val2)  # Scan of the first string
    assigned2 = _assigned(val2, val1)  # Independent scan of the second string

    common = len(assigned1)
    if common != len(assigned2):
        # The two scans disagree on the count: use their mean.
        common = float(common + len(assigned2)) / 2.0
    if common == 0:  # No common characters within half length of strings
        return 0.0

    mismatches = 0
    for pos, char in enumerate(assigned1):
        # Index into the second sequence directly, so a shorter second
        # sequence raises IndexError rather than being silently truncated.
        if char != assigned2[pos]:
            mismatches += 1
    transposition = mismatches / 2.0

    common = float(common)
    jaro_sim = 1. / 3. * (common / float(size1) + common / float(size2)
                          + (common - transposition) / common)
    assert (jaro_sim >= 0.0) and (jaro_sim <= 1.0), \
        'Similarity weight outside 0-1: %f' % (jaro_sim)
    return round(jaro_sim, 4)
def test():
    """Check jaro() against jaro_comp() and against two fixed scores."""
    # Pairs on which both implementations must agree.
    agreement_pairs = [
        ('jones', 'johnson'),
        ('michelle', 'michael'),
        ('shackleford', 'shackleford'),
    ]
    for left, right in agreement_pairs:
        assert jaro(left, right) == jaro_comp(left, right)

    # Pairs with a fixed expected score for jaro().
    known_scores = [
        ('CRATE', 'TRACE', 0.7333),
        ('DwAyNE', 'DuANE', 0.8222),
    ]
    for left, right, expected in known_scores:
        assert jaro(left, right) == expected

    # The same pairs must also agree across both implementations.
    for left, right, _ in known_scores:
        assert jaro(left, right) == jaro_comp(left, right)
# Run the self-checks above whenever this module is executed or imported.
test()
view raw jaro.py hosted with ❤ by GitHub
Related posts:
Relevant tags:

Leave a Reply