First, we introduce imagehash.
To date, when we encounter duplicates, we either remove them or put them in the same fold to reduce noise.
Here is the result: the similarity matrix.
array([[0. , 1. , 0.6328125],
[1. , 0. , 0.6328125],
[0.6328125, 0.6328125, 0. ]])
Display the duplicate images.
# Show up to `count` duplicate image pairs.
#
# Reads from the surrounding notebook:
#   duplicates - a pair of index sequences (rows, cols) iterated in lockstep
#   paths      - indexable by those indices, yielding image file paths
#   sims2      - 2-D similarity matrix indexed as sims2[row, col]
count = 20
tmp = 0  # number of pairs displayed so far
pairs = {}  # not used in this cell; kept in case a later cell reads it
seen = set()  # unordered index pairs already handled
for row, col in zip(*duplicates):
    # Skip the diagonal: an image trivially matches itself.
    if row == col:
        continue

    # (i, j) and (j, i) describe the same pair; only handle it once.
    pair_key = (min(row, col), max(row, col))
    if pair_key in seen:
        continue
    seen.add(pair_key)

    path1 = paths[row]
    path2 = paths[col]
    print(path1)
    print(path2)
    print(sims2[row, col])

    image1 = cv2.imread(path1)
    image2 = cv2.imread(path2)
    # cv2.imread signals failure by returning None instead of raising.
    if image1 is None or image2 is None:
        print("could not read image pair, skipping")
        continue

    # OpenCV decodes to BGR, while matplotlib's imshow expects RGB.
    image1 = cv2.cvtColor(image1, cv2.COLOR_BGR2RGB)
    image2 = cv2.cvtColor(image2, cv2.COLOR_BGR2RGB)

    # Side by side unless the first image is at least twice as wide as it is
    # tall, in which case the two images are stacked vertically.
    height, width = image1.shape[:2]
    if height > width / 2:
        fig, ax = plt.subplots(figsize=(20, 20), ncols=2)
    else:
        fig, ax = plt.subplots(figsize=(20, 20), nrows=2)
    ax[0].imshow(image1)
    ax[1].imshow(image2)
    plt.show()

    tmp += 1
    # Stop once exactly `count` pairs have been displayed.
    if tmp >= count:
        break