Merge CompareImages and Photo class and refactoring

This commit is contained in:
Cédric Leporcq 2021-08-13 19:20:49 +02:00
parent 080541fc9a
commit c41a546822
2 changed files with 90 additions and 104 deletions

View File

@ -18,7 +18,7 @@ from dozo import constants
from dozo import geolocation from dozo import geolocation
from dozo.media.media import get_media_class, get_all_subclasses from dozo.media.media import get_media_class, get_all_subclasses
from dozo.media.photo import CompareImages from dozo.media.photo import Photo
from dozo.summary import Summary from dozo.summary import Summary
@ -684,9 +684,9 @@ class FileSystem(object):
for filename in filenames: for filename in filenames:
file_paths.add(os.path.join(dirname, filename)) file_paths.add(os.path.join(dirname, filename))
ci = CompareImages(file_paths, logger=self.logger) photo = Photo(logger=self.logger)
images = set([ i for i in ci.get_images() ]) images = set([ i for i in photo.get_images(file_paths) ])
for image in images: for image in images:
if not os.path.isfile(image): if not os.path.isfile(image):
continue continue
@ -698,7 +698,7 @@ class FileSystem(object):
# metadata = media.get_metadata() # metadata = media.get_metadata()
similar = False similar = False
moved_imgs = set() moved_imgs = set()
for img_path in ci.find_similar(image, similarity): for img_path in photo.find_similar(image, file_paths, similarity):
similar = True similar = True
checksum2 = self.checksum(img_path) checksum2 = self.checksum(img_path)
# move image into directory # move image into directory
@ -727,8 +727,8 @@ class FileSystem(object):
if not result: if not result:
has_errors = True has_errors = True
for moved_img in moved_imgs: # for moved_img in moved_imgs:
ci.file_paths.remove(moved_img) # os.remove(moved_img)
return self.summary, has_errors return self.summary, has_errors

View File

@ -10,7 +10,7 @@ import imghdr
import logging import logging
import numpy as np import numpy as np
import os import os
from PIL import Image from PIL import Image, UnidentifiedImageError
import time import time
from .media import Media from .media import Media
@ -28,45 +28,33 @@ class Photo(Media):
#: Valid extensions for photo files. #: Valid extensions for photo files.
extensions = ('arw', 'cr2', 'dng', 'gif', 'heic', 'jpeg', 'jpg', 'nef', 'png', 'rw2') extensions = ('arw', 'cr2', 'dng', 'gif', 'heic', 'jpeg', 'jpg', 'nef', 'png', 'rw2')
def __init__(self, source=None, ignore_tags=set()): def __init__(self, source=None, hash_size=8, ignore_tags=set(),
logger=logging.getLogger()):
super().__init__(source, ignore_tags) super().__init__(source, ignore_tags)
# We only want to parse EXIF once so we store it here self.hash_size = hash_size
self.exif = None self.logger = logger
logger.setLevel(logging.INFO)
# Optionally import Pillow - see gh-325 # HEIC extension support (experimental, not tested)
# https://github.com/jmathai/elodie/issues/325 self.pyheif = False
self.pillow = None
try: try:
from PIL import Image from pyheif_pillow_opener import register_heif_opener
self.pillow = Image self.pyheif = True
except ImportError: # Allow to open HEIF/HEIC images from pillow
pass register_heif_opener()
except ImportError as e:
self.logger.info(e)
def is_valid(self):
"""Check the file extension against valid file extensions.
The list of valid file extensions come from self.extensions. This
also checks whether the file is an image.
def is_image(self, img_path):
"""Check whether the file is an image.
:returns: bool :returns: bool
""" """
source = self.source
# HEIC is not well supported yet so we special case it.
# https://github.com/python-pillow/Pillow/issues/2806
extension = os.path.splitext(source)[1][1:].lower()
if(extension != 'heic'):
# gh-4 This checks if the source file is an image. # gh-4 This checks if the source file is an image.
# It doesn't validate against the list of supported types. # It doesn't validate against the list of supported types.
# We check with imghdr and pillow. # We check with imghdr and pillow.
if(imghdr.what(source) is None): if imghdr.what(img_path) is None:
# Pillow is used as a fallback and if it's not available we trust # Pillow is used as a fallback
# what imghdr returned.
if(self.pillow is None):
return False
else:
# imghdr won't detect all variants of images (https://bugs.python.org/issue28591) # imghdr won't detect all variants of images (https://bugs.python.org/issue28591)
# see https://github.com/jmathai/elodie/issues/281 # see https://github.com/jmathai/elodie/issues/281
# before giving up, we use `pillow` imaging library to detect file type # before giving up, we use `pillow` imaging library to detect file type
@ -77,44 +65,35 @@ class Photo(Media):
# things like mode, size, and other properties required to decode the file, # things like mode, size, and other properties required to decode the file,
# but the rest of the file is not processed until later. # but the rest of the file is not processed until later.
try: try:
im = self.pillow.open(source) im = Image.open(img_path)
except IOError: except (IOError, UnidentifiedImageError):
return False return False
if(im.format is None): if(im.format is None):
return False return False
return extension in self.extensions return True
def get_images(self, file_paths):
class CompareImages:
def __init__(self, file_paths, hash_size=8, logger=logging.getLogger()):
self.file_paths = file_paths
self.hash_size = hash_size
self.logger = logger
logger.setLevel(logging.INFO)
def get_images(self):
''' '''
:returns: img_path generator :returns: img_path generator
''' '''
for img_path in self.file_paths: for img_path in file_paths:
if imghdr.what(img_path) is not None: if self.is_image(img_path):
yield img_path yield img_path
def get_images_hashes(self, file_paths):
def find_duplicates(self): """Get image hashes"""
"""
Find duplicates
"""
hashes = {} hashes = {}
duplicates = [] duplicates = []
# Searching for duplicates. # Searching for duplicates.
for img_path in self.get_images(): for img_path in self.get_images(file_paths):
if imghdr.what(img_path) is not None:
with Image.open(img_path) as img: with Image.open(img_path) as img:
temp_hash = imagehash.average_hash(img, self.hash_size) yield imagehash.average_hash(img, self.hash_size)
def find_duplicates(self, file_paths):
"""Find duplicates"""
for temp_hash in get_images_hashes(file_paths):
if temp_hash in hashes: if temp_hash in hashes:
self.logger.info("Duplicate {} \nfound for image {}\n".format(img_path, hashes[temp_hash])) self.logger.info("Duplicate {} \nfound for image {}\n".format(img_path, hashes[temp_hash]))
duplicates.append(img_path) duplicates.append(img_path)
@ -123,7 +102,6 @@ class CompareImages:
return duplicates return duplicates
def remove_duplicates(self, duplicates): def remove_duplicates(self, duplicates):
for duplicate in duplicates: for duplicate in duplicates:
try: try:
@ -131,7 +109,6 @@ class CompareImages:
except OSError as error: except OSError as error:
self.logger.error(error) self.logger.error(error)
def remove_duplicates_interactive(self, duplicates): def remove_duplicates_interactive(self, duplicates):
if len(duplicates) != 0: if len(duplicates) != 0:
answer = input(f"Do you want to delete these {duplicates} images? Y/n: ") answer = input(f"Do you want to delete these {duplicates} images? Y/n: ")
@ -141,31 +118,40 @@ class CompareImages:
else: else:
self.logger.info("No duplicates found") self.logger.info("No duplicates found")
def get_hash(self, img_path):
with Image.open(img_path) as img:
return imagehash.average_hash(img, self.hash_size).hash
def find_similar(self, image, similarity=80): def diff(self, hash1, hash2):
return np.count_nonzero(hash1 != hash2)
def similarity(self, img_diff):
threshold_img = img_diff / (self.hash_size**2)
similarity_img = round((1 - threshold_img) * 100)
return similarity_img
def find_similar(self, image, file_paths, similarity=80):
''' '''
Find similar images Find similar images
:returns: img_path generator :returns: img_path generator
''' '''
hash1 = ''
if self.is_image(image):
hash1 = self.get_hash(image)
self.logger.info(f'Finding similar images to {image}')
threshold = 1 - similarity/100 threshold = 1 - similarity/100
diff_limit = int(threshold*(self.hash_size**2)) diff_limit = int(threshold*(self.hash_size**2))
hash1 = '' for img_path in self.get_images(file_paths):
if imghdr.what(image) is not None:
with Image.open(image) as img:
hash1 = imagehash.average_hash(img, self.hash_size).hash
self.logger.info(f'Finding similar images to {image}')
for img_path in self.get_images():
if img_path == image: if img_path == image:
continue continue
with Image.open(img_path) as img: hash2 = self.get_hash(img_path)
hash2 = imagehash.average_hash(img, self.hash_size).hash img_diff = self.diff(hash1, hash2)
if img_diff <= diff_limit:
diff_images = np.count_nonzero(hash1 != hash2) similarity_img = self.similarity(img_diff)
if diff_images <= diff_limit:
threshold_img = diff_images / (self.hash_size**2)
similarity_img = round((1 - threshold_img) * 100)
self.logger.info(f'{img_path} image found {similarity_img}% similar to {image}') self.logger.info(f'{img_path} image found {similarity_img}% similar to {image}')
yield img_path yield img_path