Merge CompareImages and Photo class and refactoring
This commit is contained in:
parent
080541fc9a
commit
c41a546822
|
@ -18,7 +18,7 @@ from dozo import constants
|
||||||
from dozo import geolocation
|
from dozo import geolocation
|
||||||
|
|
||||||
from dozo.media.media import get_media_class, get_all_subclasses
|
from dozo.media.media import get_media_class, get_all_subclasses
|
||||||
from dozo.media.photo import CompareImages
|
from dozo.media.photo import Photo
|
||||||
from dozo.summary import Summary
|
from dozo.summary import Summary
|
||||||
|
|
||||||
|
|
||||||
|
@ -684,9 +684,9 @@ class FileSystem(object):
|
||||||
for filename in filenames:
|
for filename in filenames:
|
||||||
file_paths.add(os.path.join(dirname, filename))
|
file_paths.add(os.path.join(dirname, filename))
|
||||||
|
|
||||||
ci = CompareImages(file_paths, logger=self.logger)
|
photo = Photo(logger=self.logger)
|
||||||
|
|
||||||
images = set([ i for i in ci.get_images() ])
|
images = set([ i for i in photo.get_images(file_paths) ])
|
||||||
for image in images:
|
for image in images:
|
||||||
if not os.path.isfile(image):
|
if not os.path.isfile(image):
|
||||||
continue
|
continue
|
||||||
|
@ -698,7 +698,7 @@ class FileSystem(object):
|
||||||
# metadata = media.get_metadata()
|
# metadata = media.get_metadata()
|
||||||
similar = False
|
similar = False
|
||||||
moved_imgs = set()
|
moved_imgs = set()
|
||||||
for img_path in ci.find_similar(image, similarity):
|
for img_path in photo.find_similar(image, file_paths, similarity):
|
||||||
similar = True
|
similar = True
|
||||||
checksum2 = self.checksum(img_path)
|
checksum2 = self.checksum(img_path)
|
||||||
# move image into directory
|
# move image into directory
|
||||||
|
@ -727,8 +727,8 @@ class FileSystem(object):
|
||||||
if not result:
|
if not result:
|
||||||
has_errors = True
|
has_errors = True
|
||||||
|
|
||||||
for moved_img in moved_imgs:
|
# for moved_img in moved_imgs:
|
||||||
ci.file_paths.remove(moved_img)
|
# os.remove(moved_img)
|
||||||
|
|
||||||
return self.summary, has_errors
|
return self.summary, has_errors
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,7 @@ import imghdr
|
||||||
import logging
|
import logging
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import os
|
import os
|
||||||
from PIL import Image
|
from PIL import Image, UnidentifiedImageError
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from .media import Media
|
from .media import Media
|
||||||
|
@ -28,102 +28,80 @@ class Photo(Media):
|
||||||
#: Valid extensions for photo files.
|
#: Valid extensions for photo files.
|
||||||
extensions = ('arw', 'cr2', 'dng', 'gif', 'heic', 'jpeg', 'jpg', 'nef', 'png', 'rw2')
|
extensions = ('arw', 'cr2', 'dng', 'gif', 'heic', 'jpeg', 'jpg', 'nef', 'png', 'rw2')
|
||||||
|
|
||||||
def __init__(self, source=None, ignore_tags=set()):
|
def __init__(self, source=None, hash_size=8, ignore_tags=set(),
|
||||||
|
logger=logging.getLogger()):
|
||||||
super().__init__(source, ignore_tags)
|
super().__init__(source, ignore_tags)
|
||||||
|
|
||||||
# We only want to parse EXIF once so we store it here
|
|
||||||
self.exif = None
|
|
||||||
|
|
||||||
# Optionally import Pillow - see gh-325
|
|
||||||
# https://github.com/jmathai/elodie/issues/325
|
|
||||||
self.pillow = None
|
|
||||||
try:
|
|
||||||
from PIL import Image
|
|
||||||
self.pillow = Image
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def is_valid(self):
|
|
||||||
"""Check the file extension against valid file extensions.
|
|
||||||
|
|
||||||
The list of valid file extensions come from self.extensions. This
|
|
||||||
also checks whether the file is an image.
|
|
||||||
|
|
||||||
:returns: bool
|
|
||||||
"""
|
|
||||||
source = self.source
|
|
||||||
|
|
||||||
# HEIC is not well supported yet so we special case it.
|
|
||||||
# https://github.com/python-pillow/Pillow/issues/2806
|
|
||||||
extension = os.path.splitext(source)[1][1:].lower()
|
|
||||||
if(extension != 'heic'):
|
|
||||||
# gh-4 This checks if the source file is an image.
|
|
||||||
# It doesn't validate against the list of supported types.
|
|
||||||
# We check with imghdr and pillow.
|
|
||||||
if(imghdr.what(source) is None):
|
|
||||||
# Pillow is used as a fallback and if it's not available we trust
|
|
||||||
# what imghdr returned.
|
|
||||||
if(self.pillow is None):
|
|
||||||
return False
|
|
||||||
else:
|
|
||||||
# imghdr won't detect all variants of images (https://bugs.python.org/issue28591)
|
|
||||||
# see https://github.com/jmathai/elodie/issues/281
|
|
||||||
# before giving up, we use `pillow` imaging library to detect file type
|
|
||||||
#
|
|
||||||
# It is important to note that the library doesn't decode or load the
|
|
||||||
# raster data unless it really has to. When you open a file,
|
|
||||||
# the file header is read to determine the file format and extract
|
|
||||||
# things like mode, size, and other properties required to decode the file,
|
|
||||||
# but the rest of the file is not processed until later.
|
|
||||||
try:
|
|
||||||
im = self.pillow.open(source)
|
|
||||||
except IOError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
if(im.format is None):
|
|
||||||
return False
|
|
||||||
|
|
||||||
return extension in self.extensions
|
|
||||||
|
|
||||||
|
|
||||||
class CompareImages:
|
|
||||||
def __init__(self, file_paths, hash_size=8, logger=logging.getLogger()):
|
|
||||||
self.file_paths = file_paths
|
|
||||||
self.hash_size = hash_size
|
self.hash_size = hash_size
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
logger.setLevel(logging.INFO)
|
logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
def get_images(self):
|
# HEIC extension support (experimental, not tested)
|
||||||
|
self.pyheif = False
|
||||||
|
try:
|
||||||
|
from pyheif_pillow_opener import register_heif_opener
|
||||||
|
self.pyheif = True
|
||||||
|
# Allow to open HEIF/HEIC images from pillow
|
||||||
|
register_heif_opener()
|
||||||
|
except ImportError as e:
|
||||||
|
self.logger.info(e)
|
||||||
|
|
||||||
|
def is_image(self, img_path):
|
||||||
|
"""Check whether the file is an image.
|
||||||
|
:returns: bool
|
||||||
|
"""
|
||||||
|
# gh-4 This checks if the source file is an image.
|
||||||
|
# It doesn't validate against the list of supported types.
|
||||||
|
# We check with imghdr and pillow.
|
||||||
|
if imghdr.what(img_path) is None:
|
||||||
|
# Pillow is used as a fallback
|
||||||
|
# imghdr won't detect all variants of images (https://bugs.python.org/issue28591)
|
||||||
|
# see https://github.com/jmathai/elodie/issues/281
|
||||||
|
# before giving up, we use `pillow` imaging library to detect file type
|
||||||
|
#
|
||||||
|
# It is important to note that the library doesn't decode or load the
|
||||||
|
# raster data unless it really has to. When you open a file,
|
||||||
|
# the file header is read to determine the file format and extract
|
||||||
|
# things like mode, size, and other properties required to decode the file,
|
||||||
|
# but the rest of the file is not processed until later.
|
||||||
|
try:
|
||||||
|
im = Image.open(img_path)
|
||||||
|
except (IOError, UnidentifiedImageError):
|
||||||
|
return False
|
||||||
|
|
||||||
|
if(im.format is None):
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
def get_images(self, file_paths):
|
||||||
'''
|
'''
|
||||||
:returns: img_path generator
|
:returns: img_path generator
|
||||||
'''
|
'''
|
||||||
for img_path in self.file_paths:
|
for img_path in file_paths:
|
||||||
if imghdr.what(img_path) is not None:
|
if self.is_image(img_path):
|
||||||
yield img_path
|
yield img_path
|
||||||
|
|
||||||
|
def get_images_hashes(self, file_paths):
|
||||||
def find_duplicates(self):
|
"""Get image hashes"""
|
||||||
"""
|
|
||||||
Find duplicates
|
|
||||||
"""
|
|
||||||
|
|
||||||
hashes = {}
|
hashes = {}
|
||||||
duplicates = []
|
duplicates = []
|
||||||
# Searching for duplicates.
|
# Searching for duplicates.
|
||||||
for img_path in self.get_images():
|
for img_path in self.get_images(file_paths):
|
||||||
if imghdr.what(img_path) is not None:
|
with Image.open(img_path) as img:
|
||||||
with Image.open(img_path) as img:
|
yield imagehash.average_hash(img, self.hash_size)
|
||||||
temp_hash = imagehash.average_hash(img, self.hash_size)
|
|
||||||
if temp_hash in hashes:
|
def find_duplicates(self, file_paths):
|
||||||
self.logger.info("Duplicate {} \nfound for image {}\n".format(img_path, hashes[temp_hash]))
|
"""Find duplicates"""
|
||||||
duplicates.append(img_path)
|
for temp_hash in get_images_hashes(file_paths):
|
||||||
else:
|
if temp_hash in hashes:
|
||||||
hashes[temp_hash] = img_path
|
self.logger.info("Duplicate {} \nfound for image {}\n".format(img_path, hashes[temp_hash]))
|
||||||
|
duplicates.append(img_path)
|
||||||
|
else:
|
||||||
|
hashes[temp_hash] = img_path
|
||||||
|
|
||||||
return duplicates
|
return duplicates
|
||||||
|
|
||||||
|
|
||||||
def remove_duplicates(self, duplicates):
|
def remove_duplicates(self, duplicates):
|
||||||
for duplicate in duplicates:
|
for duplicate in duplicates:
|
||||||
try:
|
try:
|
||||||
|
@ -131,7 +109,6 @@ class CompareImages:
|
||||||
except OSError as error:
|
except OSError as error:
|
||||||
self.logger.error(error)
|
self.logger.error(error)
|
||||||
|
|
||||||
|
|
||||||
def remove_duplicates_interactive(self, duplicates):
|
def remove_duplicates_interactive(self, duplicates):
|
||||||
if len(duplicates) != 0:
|
if len(duplicates) != 0:
|
||||||
answer = input(f"Do you want to delete these {duplicates} images? Y/n: ")
|
answer = input(f"Do you want to delete these {duplicates} images? Y/n: ")
|
||||||
|
@ -141,32 +118,41 @@ class CompareImages:
|
||||||
else:
|
else:
|
||||||
self.logger.info("No duplicates found")
|
self.logger.info("No duplicates found")
|
||||||
|
|
||||||
|
def get_hash(self, img_path):
|
||||||
|
with Image.open(img_path) as img:
|
||||||
|
return imagehash.average_hash(img, self.hash_size).hash
|
||||||
|
|
||||||
def find_similar(self, image, similarity=80):
|
def diff(self, hash1, hash2):
|
||||||
|
return np.count_nonzero(hash1 != hash2)
|
||||||
|
|
||||||
|
def similarity(self, img_diff):
|
||||||
|
threshold_img = img_diff / (self.hash_size**2)
|
||||||
|
similarity_img = round((1 - threshold_img) * 100)
|
||||||
|
|
||||||
|
return similarity_img
|
||||||
|
|
||||||
|
def find_similar(self, image, file_paths, similarity=80):
|
||||||
'''
|
'''
|
||||||
Find similar images
|
Find similar images
|
||||||
:returns: img_path generator
|
:returns: img_path generator
|
||||||
'''
|
'''
|
||||||
|
hash1 = ''
|
||||||
|
if self.is_image(image):
|
||||||
|
hash1 = self.get_hash(image)
|
||||||
|
|
||||||
|
self.logger.info(f'Finding similar images to {image}')
|
||||||
|
|
||||||
threshold = 1 - similarity/100
|
threshold = 1 - similarity/100
|
||||||
diff_limit = int(threshold*(self.hash_size**2))
|
diff_limit = int(threshold*(self.hash_size**2))
|
||||||
|
|
||||||
hash1 = ''
|
for img_path in self.get_images(file_paths):
|
||||||
if imghdr.what(image) is not None:
|
|
||||||
with Image.open(image) as img:
|
|
||||||
hash1 = imagehash.average_hash(img, self.hash_size).hash
|
|
||||||
|
|
||||||
self.logger.info(f'Finding similar images to {image}')
|
|
||||||
for img_path in self.get_images():
|
|
||||||
if img_path == image:
|
if img_path == image:
|
||||||
continue
|
continue
|
||||||
with Image.open(img_path) as img:
|
hash2 = self.get_hash(img_path)
|
||||||
hash2 = imagehash.average_hash(img, self.hash_size).hash
|
img_diff = self.diff(hash1, hash2)
|
||||||
|
if img_diff <= diff_limit:
|
||||||
diff_images = np.count_nonzero(hash1 != hash2)
|
similarity_img = self.similarity(img_diff)
|
||||||
if diff_images <= diff_limit:
|
self.logger.info(f'{img_path} image found {similarity_img}% similar to {image}')
|
||||||
threshold_img = diff_images / (self.hash_size**2)
|
yield img_path
|
||||||
similarity_img = round((1 - threshold_img) * 100)
|
|
||||||
self.logger.info(f'{img_path} image found {similarity_img}% similar to {image}')
|
|
||||||
yield img_path
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue