Merge CompareImages into the Photo class and refactor

This commit is contained in:
Cédric Leporcq 2021-08-13 19:20:49 +02:00
parent 080541fc9a
commit c41a546822
2 changed files with 90 additions and 104 deletions

View File

@@ -18,7 +18,7 @@ from dozo import constants
from dozo import geolocation from dozo import geolocation
from dozo.media.media import get_media_class, get_all_subclasses from dozo.media.media import get_media_class, get_all_subclasses
from dozo.media.photo import CompareImages from dozo.media.photo import Photo
from dozo.summary import Summary from dozo.summary import Summary
@@ -684,9 +684,9 @@ class FileSystem(object):
for filename in filenames: for filename in filenames:
file_paths.add(os.path.join(dirname, filename)) file_paths.add(os.path.join(dirname, filename))
ci = CompareImages(file_paths, logger=self.logger) photo = Photo(logger=self.logger)
images = set([ i for i in ci.get_images() ]) images = set([ i for i in photo.get_images(file_paths) ])
for image in images: for image in images:
if not os.path.isfile(image): if not os.path.isfile(image):
continue continue
@@ -698,7 +698,7 @@ class FileSystem(object):
# metadata = media.get_metadata() # metadata = media.get_metadata()
similar = False similar = False
moved_imgs = set() moved_imgs = set()
for img_path in ci.find_similar(image, similarity): for img_path in photo.find_similar(image, file_paths, similarity):
similar = True similar = True
checksum2 = self.checksum(img_path) checksum2 = self.checksum(img_path)
# move image into directory # move image into directory
@@ -727,8 +727,8 @@ class FileSystem(object):
if not result: if not result:
has_errors = True has_errors = True
for moved_img in moved_imgs: # for moved_img in moved_imgs:
ci.file_paths.remove(moved_img) # os.remove(moved_img)
return self.summary, has_errors return self.summary, has_errors

View File

@@ -10,7 +10,7 @@ import imghdr
import logging import logging
import numpy as np import numpy as np
import os import os
from PIL import Image from PIL import Image, UnidentifiedImageError
import time import time
from .media import Media from .media import Media
@@ -28,102 +28,80 @@ class Photo(Media):
#: Valid extensions for photo files. #: Valid extensions for photo files.
extensions = ('arw', 'cr2', 'dng', 'gif', 'heic', 'jpeg', 'jpg', 'nef', 'png', 'rw2') extensions = ('arw', 'cr2', 'dng', 'gif', 'heic', 'jpeg', 'jpg', 'nef', 'png', 'rw2')
def __init__(self, source=None, ignore_tags=set()): def __init__(self, source=None, hash_size=8, ignore_tags=set(),
logger=logging.getLogger()):
super().__init__(source, ignore_tags) super().__init__(source, ignore_tags)
# We only want to parse EXIF once so we store it here
self.exif = None
# Optionally import Pillow - see gh-325
# https://github.com/jmathai/elodie/issues/325
self.pillow = None
try:
from PIL import Image
self.pillow = Image
except ImportError:
pass
def is_valid(self):
"""Check the file extension against valid file extensions.
The list of valid file extensions come from self.extensions. This
also checks whether the file is an image.
:returns: bool
"""
source = self.source
# HEIC is not well supported yet so we special case it.
# https://github.com/python-pillow/Pillow/issues/2806
extension = os.path.splitext(source)[1][1:].lower()
if(extension != 'heic'):
# gh-4 This checks if the source file is an image.
# It doesn't validate against the list of supported types.
# We check with imghdr and pillow.
if(imghdr.what(source) is None):
# Pillow is used as a fallback and if it's not available we trust
# what imghdr returned.
if(self.pillow is None):
return False
else:
# imghdr won't detect all variants of images (https://bugs.python.org/issue28591)
# see https://github.com/jmathai/elodie/issues/281
# before giving up, we use `pillow` imaging library to detect file type
#
# It is important to note that the library doesn't decode or load the
# raster data unless it really has to. When you open a file,
# the file header is read to determine the file format and extract
# things like mode, size, and other properties required to decode the file,
# but the rest of the file is not processed until later.
try:
im = self.pillow.open(source)
except IOError:
return False
if(im.format is None):
return False
return extension in self.extensions
class CompareImages:
def __init__(self, file_paths, hash_size=8, logger=logging.getLogger()):
self.file_paths = file_paths
self.hash_size = hash_size self.hash_size = hash_size
self.logger = logger self.logger = logger
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
def get_images(self): # HEIC extension support (experimental, not tested)
self.pyheif = False
try:
from pyheif_pillow_opener import register_heif_opener
self.pyheif = True
# Allow to open HEIF/HEIC images from pillow
register_heif_opener()
except ImportError as e:
self.logger.info(e)
def is_image(self, img_path):
"""Check whether the file is an image.
:returns: bool
"""
# gh-4 This checks if the source file is an image.
# It doesn't validate against the list of supported types.
# We check with imghdr and pillow.
if imghdr.what(img_path) is None:
# Pillow is used as a fallback
# imghdr won't detect all variants of images (https://bugs.python.org/issue28591)
# see https://github.com/jmathai/elodie/issues/281
# before giving up, we use `pillow` imaging library to detect file type
#
# It is important to note that the library doesn't decode or load the
# raster data unless it really has to. When you open a file,
# the file header is read to determine the file format and extract
# things like mode, size, and other properties required to decode the file,
# but the rest of the file is not processed until later.
try:
im = Image.open(img_path)
except (IOError, UnidentifiedImageError):
return False
if(im.format is None):
return False
return True
def get_images(self, file_paths):
''' '''
:returns: img_path generator :returns: img_path generator
''' '''
for img_path in self.file_paths: for img_path in file_paths:
if imghdr.what(img_path) is not None: if self.is_image(img_path):
yield img_path yield img_path
def get_images_hashes(self, file_paths):
def find_duplicates(self): """Get image hashes"""
"""
Find duplicates
"""
hashes = {} hashes = {}
duplicates = [] duplicates = []
# Searching for duplicates. # Searching for duplicates.
for img_path in self.get_images(): for img_path in self.get_images(file_paths):
if imghdr.what(img_path) is not None: with Image.open(img_path) as img:
with Image.open(img_path) as img: yield imagehash.average_hash(img, self.hash_size)
temp_hash = imagehash.average_hash(img, self.hash_size)
if temp_hash in hashes: def find_duplicates(self, file_paths):
self.logger.info("Duplicate {} \nfound for image {}\n".format(img_path, hashes[temp_hash])) """Find duplicates"""
duplicates.append(img_path) for temp_hash in get_images_hashes(file_paths):
else: if temp_hash in hashes:
hashes[temp_hash] = img_path self.logger.info("Duplicate {} \nfound for image {}\n".format(img_path, hashes[temp_hash]))
duplicates.append(img_path)
else:
hashes[temp_hash] = img_path
return duplicates return duplicates
def remove_duplicates(self, duplicates): def remove_duplicates(self, duplicates):
for duplicate in duplicates: for duplicate in duplicates:
try: try:
@@ -131,7 +109,6 @@ class CompareImages:
except OSError as error: except OSError as error:
self.logger.error(error) self.logger.error(error)
def remove_duplicates_interactive(self, duplicates): def remove_duplicates_interactive(self, duplicates):
if len(duplicates) != 0: if len(duplicates) != 0:
answer = input(f"Do you want to delete these {duplicates} images? Y/n: ") answer = input(f"Do you want to delete these {duplicates} images? Y/n: ")
@@ -141,32 +118,41 @@ class CompareImages:
else: else:
self.logger.info("No duplicates found") self.logger.info("No duplicates found")
def get_hash(self, img_path):
with Image.open(img_path) as img:
return imagehash.average_hash(img, self.hash_size).hash
def find_similar(self, image, similarity=80): def diff(self, hash1, hash2):
return np.count_nonzero(hash1 != hash2)
def similarity(self, img_diff):
threshold_img = img_diff / (self.hash_size**2)
similarity_img = round((1 - threshold_img) * 100)
return similarity_img
def find_similar(self, image, file_paths, similarity=80):
''' '''
Find similar images Find similar images
:returns: img_path generator :returns: img_path generator
''' '''
hash1 = ''
if self.is_image(image):
hash1 = self.get_hash(image)
self.logger.info(f'Finding similar images to {image}')
threshold = 1 - similarity/100 threshold = 1 - similarity/100
diff_limit = int(threshold*(self.hash_size**2)) diff_limit = int(threshold*(self.hash_size**2))
hash1 = '' for img_path in self.get_images(file_paths):
if imghdr.what(image) is not None:
with Image.open(image) as img:
hash1 = imagehash.average_hash(img, self.hash_size).hash
self.logger.info(f'Finding similar images to {image}')
for img_path in self.get_images():
if img_path == image: if img_path == image:
continue continue
with Image.open(img_path) as img: hash2 = self.get_hash(img_path)
hash2 = imagehash.average_hash(img, self.hash_size).hash img_diff = self.diff(hash1, hash2)
if img_diff <= diff_limit:
diff_images = np.count_nonzero(hash1 != hash2) similarity_img = self.similarity(img_diff)
if diff_images <= diff_limit: self.logger.info(f'{img_path} image found {similarity_img}% similar to {image}')
threshold_img = diff_images / (self.hash_size**2) yield img_path
similarity_img = round((1 - threshold_img) * 100)
self.logger.info(f'{img_path} image found {similarity_img}% similar to {image}')
yield img_path