2015-10-07 10:48:01 +02:00
|
|
|
"""
|
2016-01-08 23:49:06 +01:00
|
|
|
The photo module contains the :class:`Photo` class, which is used to track
|
|
|
|
image objects (JPG, DNG, etc.).
|
|
|
|
|
|
|
|
.. moduleauthor:: Jaisen Mathai <jaisen@jmathai.com>
|
2015-10-07 10:48:01 +02:00
|
|
|
"""
|
|
|
|
|
2021-07-26 20:50:51 +02:00
|
|
|
import imagehash
|
2015-12-11 08:07:01 +01:00
|
|
|
import imghdr
|
2021-07-26 20:50:51 +02:00
|
|
|
import logging
|
|
|
|
import numpy as np
|
2015-10-07 10:48:01 +02:00
|
|
|
import os
|
2021-08-13 19:20:49 +02:00
|
|
|
from PIL import Image, UnidentifiedImageError
|
2015-10-07 10:48:01 +02:00
|
|
|
import time
|
2016-06-21 20:19:40 +02:00
|
|
|
|
2016-03-12 20:09:28 +01:00
|
|
|
from .media import Media
|
2015-10-07 10:48:01 +02:00
|
|
|
|
2016-01-02 08:23:06 +01:00
|
|
|
|
2015-10-07 10:48:01 +02:00
|
|
|
class Photo(Media):
    """A photo object.

    Adds helpers based on perceptual (average) hashing on top of
    :class:`Media` to detect duplicate and visually similar images.

    :param str source: The fully qualified path to the photo file
    :param int hash_size: Side length of the average hash; a hash holds
        ``hash_size ** 2`` bits.
    :param set ignore_tags: Passed through to :class:`Media`. A new empty set
        is used when omitted.
    :param logging.Logger logger: Logger that receives every message emitted
        by this class.
    """

    __name__ = 'Photo'

    #: Valid extensions for photo files.
    extensions = (
        'arw', 'cr2', 'dng', 'gif', 'heic', 'jpeg', 'jpg', 'nef', 'png', 'rw2',
    )

    def __init__(self, source=None, hash_size=8, ignore_tags=None,
                 logger=logging.getLogger()):
        # Build the set per call: a ``set()`` default is created only once
        # and would be shared by every instance that omits ``ignore_tags``.
        if ignore_tags is None:
            ignore_tags = set()
        super().__init__(source, ignore_tags)

        self.hash_size = hash_size
        self.logger = logger
        # NOTE(review): with the default argument this changes the level of
        # the root logger for the whole process - confirm this is intended.
        logger.setLevel(logging.INFO)

        # HEIC extension support (experimental, not tested)
        self.pyheif = False
        try:
            from pyheif_pillow_opener import register_heif_opener
            # Allow to open HEIF/HEIC images from pillow
            register_heif_opener()
            # Only advertise support once the opener is really registered.
            self.pyheif = True
        except ImportError as e:
            self.logger.info(e)

    def is_image(self, img_path):
        """Check whether the file is an image.

        gh-4: this only checks that the file is an image; it does not
        validate it against the list of supported :attr:`extensions`.

        :param str img_path: Path of the file to inspect.
        :returns: bool -- ``True`` when ``imghdr`` or Pillow recognises the
            file, ``False`` otherwise (including unreadable paths).
        """
        try:
            if imghdr.what(img_path) is not None:
                return True
        except OSError:
            # A path that cannot be read is not an image.
            return False

        # Pillow is used as a fallback: imghdr won't detect all variants of
        # images (https://bugs.python.org/issue28591), see
        # https://github.com/jmathai/elodie/issues/281
        #
        # Opening a file with Pillow only reads the header to determine the
        # format; the raster data is not decoded until it is really needed.
        try:
            # The context manager releases the file handle Pillow opened.
            with Image.open(img_path) as im:
                return im.format is not None
        except (IOError, UnidentifiedImageError):
            return False

    def get_images(self, file_paths):
        """Filter paths, keeping those that are images.

        :param file_paths: Iterable of file paths.
        :returns: img_path generator
        """
        for img_path in file_paths:
            if self.is_image(img_path):
                yield img_path

    def _iter_image_hashes(self, file_paths):
        """Yield ``(img_path, hash)`` for every image found in *file_paths*."""
        for img_path in self.get_images(file_paths):
            with Image.open(img_path) as img:
                yield img_path, imagehash.average_hash(img, self.hash_size)

    def get_images_hashes(self, file_paths):
        """Get image hashes

        :param file_paths: Iterable of file paths; non-images are skipped.
        :returns: generator of average hashes, one per image
        """
        for _, img_hash in self._iter_image_hashes(file_paths):
            yield img_hash

    def find_duplicates(self, file_paths):
        """Find duplicates

        Two images are duplicates when their average hashes are equal. The
        first path seen for a hash is kept as the reference; every later
        path with the same hash is reported.

        :param file_paths: Iterable of file paths; non-images are skipped.
        :returns: list of paths that duplicate an earlier image
        """
        # Maps each hash to the first path that produced it.
        hashes = {}
        duplicates = []
        for img_path, temp_hash in self._iter_image_hashes(file_paths):
            if temp_hash in hashes:
                self.logger.info(
                    "Duplicate %s \nfound for image %s\n",
                    img_path, hashes[temp_hash],
                )
                duplicates.append(img_path)
            else:
                hashes[temp_hash] = img_path

        return duplicates

    def remove_duplicates(self, duplicates):
        """Delete every file listed in *duplicates*.

        A file that cannot be removed is logged as an error and skipped, so
        one failure does not stop the remaining deletions.

        :param duplicates: Iterable of file paths to delete.
        :returns: list of the paths that were actually removed
        """
        removed = []
        for duplicate in duplicates:
            try:
                os.remove(duplicate)
            except OSError as error:
                self.logger.error(error)
            else:
                removed.append(duplicate)
        return removed

    def remove_duplicates_interactive(self, duplicates):
        """Ask for confirmation on stdin, then delete *duplicates*.

        Files are deleted only when the answer is ``y`` (case-insensitive,
        surrounding whitespace ignored); any other answer, including an
        empty one, leaves them untouched.

        :param duplicates: Sized collection of file paths.
        """
        if len(duplicates) == 0:
            self.logger.info("No duplicates found")
            return

        answer = input(f"Do you want to delete these {duplicates} images? Y/n: ")
        if answer.strip().lower() == 'y':
            # Report only the files that were really deleted; ``or ()`` keeps
            # this working if an override of remove_duplicates returns None.
            for duplicate in self.remove_duplicates(duplicates) or ():
                self.logger.info(f'{duplicate} deleted successfully!')

    def get_hash(self, img_path):
        """Return the raw average-hash array of the image at *img_path*.

        :param str img_path: Path of the image.
        :returns: the ``hash`` attribute of the computed ``ImageHash``
        """
        with Image.open(img_path) as img:
            return imagehash.average_hash(img, self.hash_size).hash

    def diff(self, hash1, hash2):
        """Count the positions at which two hash arrays differ.

        :param hash1: Hash array as returned by :meth:`get_hash`.
        :param hash2: Hash array as returned by :meth:`get_hash`.
        :returns: number of differing elements
        """
        return np.count_nonzero(hash1 != hash2)

    def similarity(self, img_diff):
        """Convert a difference count into a similarity percentage.

        :param int img_diff: Number of differing hash bits, see :meth:`diff`.
        :returns: int -- rounded percentage of the ``hash_size ** 2`` bits
            that are identical
        """
        threshold_img = img_diff / (self.hash_size**2)
        similarity_img = round((1 - threshold_img) * 100)

        return similarity_img

    def find_similar(self, image, file_paths, similarity=80):
        """Find similar images

        :param str image: Path of the reference image.
        :param file_paths: Iterable of candidate paths; non-images and the
            reference path itself are skipped.
        :param similarity: Minimum similarity, in percent, a candidate must
            reach to be reported.
        :returns: img_path generator
        """
        if not self.is_image(image):
            # Without a reference hash there is nothing to compare against.
            self.logger.warning(f'{image} is not an image, nothing to compare')
            return

        hash1 = self.get_hash(image)

        self.logger.info(f'Finding similar images to {image}')

        # Largest number of differing bits still considered similar. Multiply
        # before dividing: ``1 - similarity/100`` is inexact in floating
        # point and could truncate to one below the intended limit.
        diff_limit = int((100 - similarity) * self.hash_size**2 / 100)

        for img_path in self.get_images(file_paths):
            if img_path == image:
                continue
            hash2 = self.get_hash(img_path)
            img_diff = self.diff(hash1, hash2)
            if img_diff <= diff_limit:
                similarity_img = self.similarity(img_diff)
                self.logger.info(
                    f'{img_path} image found {similarity_img}% similar to {image}'
                )
                yield img_path
|
2021-07-26 20:50:51 +02:00
|
|
|
|
|
|
|
|