188 lines
5.6 KiB
Python
188 lines
5.6 KiB
Python
"""
The image module contains the :class:`Images` class, which is used to track
image objects (JPG, DNG, etc.).

.. moduleauthor:: Jaisen Mathai <jaisen@jmathai.com>
"""
|
|
|
|
import imghdr
|
|
import os
|
|
|
|
import imagehash
|
|
import numpy as np
|
|
from PIL import Image as img
|
|
from PIL import UnidentifiedImageError
|
|
|
|
from ordigi import LOG
|
|
|
|
# Optional HEIC/HEIF support (experimental, not tested).
# PYHEIF records whether the pyheif_pillow_opener plugin could be imported.
PYHEIF = False
try:
    from pyheif_pillow_opener import register_heif_opener

    PYHEIF = True
    # Register the opener so that Pillow can open HEIF/HEIC images.
    register_heif_opener()
except ImportError as import_error:
    # The plugin is optional: log the import failure and carry on without it.
    LOG.info(import_error)
|
|
|
|
|
|
class Image:
    """Image file class.

    :param img_path: Path to the image file.
    :param int hash_size: Size handed to ``imagehash.average_hash`` when the
        image is hashed (see :meth:`get_hash`).
    """

    def __init__(self, img_path, hash_size=8):
        self.img_path = img_path
        self.hash_size = hash_size

    def is_image(self):
        """Check whether the file is an image.

        gh-4: this only checks that the file is an image; it does not
        validate it against the list of supported types.

        :returns: bool
        """
        # NOTE(review): imghdr is deprecated since Python 3.11 and removed in
        # 3.13 (PEP 594); this first check will need a replacement eventually.
        if imghdr.what(self.img_path) is not None:
            return True

        # Pillow is used as a fallback: imghdr won't detect all variants of
        # images (https://bugs.python.org/issue28591), see
        # https://github.com/jmathai/elodie/issues/281
        #
        # Pillow doesn't decode or load the raster data unless it really has
        # to. Opening a file only reads the header to determine the format
        # and the properties required to decode it; the rest of the file is
        # not processed until later.
        try:
            # The context manager closes the underlying file handle, which
            # was previously left open after every fallback check.
            with img.open(self.img_path) as image:
                return image.format is not None
        except (OSError, UnidentifiedImageError):
            return False

    def get_hash(self):
        """Get image hash.

        :returns: the ``hash`` attribute of the average hash computed by
            ``imagehash`` for this file, or ``None`` when Pillow cannot open
            or identify the file.
        """
        try:
            with img.open(self.img_path) as image:
                return imagehash.average_hash(image, self.hash_size).hash
        except (OSError, UnidentifiedImageError):
            return None
|
|
|
|
|
|
class Images:
    """A collection of image objects with duplicate and similarity helpers.

    :param images: The image objects to work on. Each one is expected to
        expose ``img_path`` and ``get_hash()``, as :class:`Image` does.
    :param int hash_size: Size handed to ``imagehash.average_hash``; hashes
        are treated as having ``hash_size ** 2`` bits when rating similarity.
    """

    #: Valid extensions for image files.
    extensions = (
        'arw',
        'cr2',
        'dng',
        'gif',
        'heic',
        'jpeg',
        'jpg',
        'nef',
        'png',
        'rw2',
    )

    def __init__(self, images, hash_size=8):
        self.images = images
        self.duplicates = []
        self.hash_size = hash_size
        self.log = LOG.getChild(self.__class__.__name__)
        if not PYHEIF:
            self.log.info("No module named 'pyheif_pillow_opener'")

    def get_images_hashes(self):
        """Get image hashes.

        Yield the average hash of every image in ``self.images``, in order.
        Errors raised while opening an image are not caught here.
        """
        for image in self.images:
            with img.open(image.img_path) as opened:
                yield imagehash.average_hash(opened, self.hash_size)

    def find_duplicates(self, img_path):
        """Find duplicates.

        :param img_path: Path recorded for every hash and reported for every
            duplicate hash.
        :returns: list with one ``img_path`` entry per repeated hash.
        """
        # NOTE(review): the loop only sees hashes, never the path of the image
        # that produced them, so every entry refers to the ``img_path``
        # argument. This looks unintended; behaviour is kept as-is until the
        # expected contract is confirmed with the callers.
        duplicates = []
        hashes = {}
        for temp_hash in self.get_images_hashes():
            if temp_hash in hashes:
                self.log.info(
                    "Duplicate {} \nfound for image {}\n".format(
                        img_path, hashes[temp_hash]
                    )
                )
                duplicates.append(img_path)
            else:
                hashes[temp_hash] = img_path

        return duplicates

    def remove_duplicates(self, duplicates):
        """Remove duplicate files.

        Deletion is best-effort: an ``OSError`` is logged and the remaining
        files are still processed.
        """
        for duplicate in duplicates:
            try:
                os.remove(duplicate)
            except OSError as error:
                self.log.error(error)

    def remove_duplicates_interactive(self, duplicates):
        """Remove duplicate files: interactive mode.

        Ask for confirmation on stdin and delete only when the answer is
        ``y`` (case-insensitive, surrounding whitespace ignored).
        """
        if not duplicates:
            self.log.info("No duplicates found")
            return

        answer = input(f"Do you want to delete these {duplicates} images? Y/n: ")
        if answer.strip().lower() == 'y':
            self.remove_duplicates(duplicates)
            self.log.info('Duplicates images deleted successfully!')

    def diff(self, hash1, hash2):
        """Return the number of positions at which the two hashes differ."""
        return np.count_nonzero(hash1 != hash2)

    def similarity(self, img_diff):
        """Similarity rate in %.

        :param img_diff: Number of differing hash bits, as returned by
            :meth:`diff`.
        :returns: rounded percentage of the ``hash_size ** 2`` bits that match.
        """
        threshold_img = img_diff / (self.hash_size ** 2)
        similarity_img = round((1 - threshold_img) * 100)

        return similarity_img

    def find_similar(self, image0, similarity=80):
        """
        Find similar images

        :param image0: Reference image; yields nothing if its hash is ``None``.
        :param similarity: Minimum similarity, in percent, for a match.
        :returns: img_path generator
        """
        hash1 = image0.get_hash()

        if hash1 is None:
            return

        self.log.info(f"Finding similar images to {image0.img_path}")

        # Largest number of differing bits still accepted. Subtract in percent
        # first, then scale: ``1 - similarity / 100`` is inexact in floating
        # point (1 - 0.8 == 0.19999999999999996), which truncated the limit
        # one bit too low for some sizes and rejected images sitting exactly
        # on the requested similarity.
        diff_limit = int((100 - similarity) * self.hash_size ** 2 / 100)

        for image in self.images:
            # img_path must offer is_file(), e.g. a pathlib.Path.
            if not image.img_path.is_file():
                continue
            # Never report the reference image as similar to itself.
            if image.img_path == image0.img_path:
                continue
            hash2 = image.get_hash()
            # Be sure that hash are not None
            if hash2 is None:
                continue

            img_diff = self.diff(hash1, hash2)
            if img_diff <= diff_limit:
                similarity_img = self.similarity(img_diff)
                self.log.info(
                    f"{image.img_path} image found {similarity_img}% similar to {image0.img_path}"
                )
                yield image.img_path
|