Merge CompareImages and Photo classes and refactor

2021-08-13 19:20:49 +02:00 · 2021-08-13 19:20:49 +02:00 · c41a546822
commit c41a546822
parent 080541fc9a
2 changed files with 90 additions and 104 deletions
--- a/dozo/filesystem.py
+++ b/dozo/filesystem.py
@ -18,7 +18,7 @@ from dozo import constants
 from dozo import geolocation
 from dozo.media.media import get_media_class, get_all_subclasses
-from dozo.media.photo import CompareImages
+from dozo.media.photo import Photo
 from dozo.summary import Summary
@ -684,9 +684,9 @@ class FileSystem(object):
            for filename in filenames:
                file_paths.add(os.path.join(dirname, filename))
-            ci = CompareImages(file_paths, logger=self.logger)
+            photo = Photo(logger=self.logger)
-            images = set([ i for i in ci.get_images() ])
+            images = set([ i for i in photo.get_images(file_paths) ])
            for image in images:
                if not os.path.isfile(image):
                    continue
@ -698,7 +698,7 @@ class FileSystem(object):
                #     metadata = media.get_metadata()
                similar = False
                moved_imgs = set()
-                for img_path in ci.find_similar(image, similarity):
+                for img_path in photo.find_similar(image, file_paths, similarity):
                    similar = True
                    checksum2 = self.checksum(img_path)
                    # move image into directory
@ -727,8 +727,8 @@ class FileSystem(object):
                    if not result:
                        has_errors = True
-                for moved_img in moved_imgs:
+                # for moved_img in moved_imgs:
-                    ci.file_paths.remove(moved_img)
+                #     os.remove(moved_img)
        return self.summary, has_errors
--- a/dozo/media/photo.py
+++ b/dozo/media/photo.py
@ -10,7 +10,7 @@ import imghdr
 import logging
 import numpy as np
 import os
-from PIL import Image
+from PIL import Image, UnidentifiedImageError
 import time
 from .media import Media
@ -28,102 +28,80 @@ class Photo(Media):
    #: Valid extensions for photo files.
    extensions = ('arw', 'cr2', 'dng', 'gif', 'heic', 'jpeg', 'jpg', 'nef', 'png', 'rw2')
-    def __init__(self, source=None, ignore_tags=set()):
+    def __init__(self, source=None, hash_size=8, ignore_tags=set(),
            logger=logging.getLogger()):
        super().__init__(source, ignore_tags)
        # We only want to parse EXIF once so we store it here
        self.exif = None
        # Optionally import Pillow - see gh-325
        # https://github.com/jmathai/elodie/issues/325
        self.pillow = None
        try:
            from PIL import Image
            self.pillow = Image
        except ImportError:
            pass
    def is_valid(self):
        """Check the file extension against valid file extensions.

        The list of valid file extensions comes from self.extensions. For
        every extension except ``heic`` this also checks whether the file
        content is an image, first with imghdr and then with Pillow.

        :returns: bool
        """
        source = self.source

        # HEIC is not well supported yet so we special case it.
        # https://github.com/python-pillow/Pillow/issues/2806
        extension = os.path.splitext(source)[1][1:].lower()

        # gh-4 This checks if the source file is an image.
        # It doesn't validate against the list of supported types.
        # We check with imghdr and pillow.
        if extension != 'heic' and imghdr.what(source) is None:
            # Pillow is used as a fallback and if it's not available we trust
            #   what imghdr returned.
            if self.pillow is None:
                return False

            # imghdr won't detect all variants of images (https://bugs.python.org/issue28591)
            # see https://github.com/jmathai/elodie/issues/281
            # before giving up, we use `pillow` imaging library to detect file type
            #
            # It is important to note that the library doesn't decode or load the
            # raster data unless it really has to. When you open a file,
            # the file header is read to determine the file format and extract
            # things like mode, size, and other properties required to decode the file,
            # but the rest of the file is not processed until later.
            try:
                # Opening is lazy and leaves the underlying file open, so the
                # image is closed explicitly once the format has been read.
                with self.pillow.open(source) as im:
                    image_format = im.format
            except IOError:
                return False

            if image_format is None:
                return False

        return extension in self.extensions
 class CompareImages:
    def __init__(self, file_paths, hash_size=8, logger=logging.getLogger()):
        self.file_paths = file_paths
        self.hash_size = hash_size
        self.logger = logger
        logger.setLevel(logging.INFO)
-    def get_images(self):
+        # HEIC extension support (experimental, not tested)
        self.pyheif = False
        try:
            from pyheif_pillow_opener import register_heif_opener
            self.pyheif = True
            # Allow to open HEIF/HEIC images from pillow
            register_heif_opener()
        except ImportError as e:
            self.logger.info(e)
    def is_image(self, img_path):
        """Check whether the file is an image.

        :param img_path: path of the file to probe
        :returns: bool
        """
        # gh-4 This checks if the source file is an image.
        # It doesn't validate against the list of supported types.
        # We check with imghdr and pillow.
        if imghdr.what(img_path) is not None:
            return True

        # Pillow is used as a fallback
        # imghdr won't detect all variants of images (https://bugs.python.org/issue28591)
        # see https://github.com/jmathai/elodie/issues/281
        # before giving up, we use `pillow` imaging library to detect file type
        #
        # It is important to note that the library doesn't decode or load the
        # raster data unless it really has to. When you open a file,
        # the file header is read to determine the file format and extract
        # things like mode, size, and other properties required to decode the file,
        # but the rest of the file is not processed until later.
        try:
            # Opening is lazy and leaves the underlying file open, so the
            # image is closed explicitly once the format has been read.
            with Image.open(img_path) as im:
                image_format = im.format
        except (IOError, UnidentifiedImageError):
            return False

        return image_format is not None
    def get_images(self, file_paths):
        '''
        :returns: img_path generator
        '''
-        for img_path in self.file_paths:
+        for img_path in file_paths:
-            if imghdr.what(img_path) is not None:
+            if self.is_image(img_path):
                yield img_path
-
+    def get_images_hashes(self, file_paths):
-    def find_duplicates(self):
+        """Get image hashes"""
        """
        Find duplicates
        """
        hashes = {}
        duplicates = []
        # Searching for duplicates.
-        for img_path in self.get_images():
+        for img_path in self.get_images(file_paths):
-            if imghdr.what(img_path) is not None:
+            with Image.open(img_path) as img:
-                with Image.open(img_path) as img:
+                yield imagehash.average_hash(img, self.hash_size)
-                    temp_hash = imagehash.average_hash(img, self.hash_size)
+
-                    if temp_hash in hashes:
+    def find_duplicates(self, file_paths):
-                        self.logger.info("Duplicate {} \nfound for image {}\n".format(img_path, hashes[temp_hash]))
+        """Find duplicates"""
-                        duplicates.append(img_path)
+        for temp_hash in get_images_hashes(file_paths):
-                    else:
+            if temp_hash in hashes:
-                        hashes[temp_hash] = img_path
+                self.logger.info("Duplicate {} \nfound for image {}\n".format(img_path, hashes[temp_hash]))
                duplicates.append(img_path)
            else:
                hashes[temp_hash] = img_path
        return duplicates
    def remove_duplicates(self, duplicates):
        for duplicate in duplicates:
            try:
@ -131,7 +109,6 @@ class CompareImages:
            except OSError as error:
                self.logger.error(error)
    def remove_duplicates_interactive(self, duplicates):
        if len(duplicates) != 0:
            answer = input(f"Do you want to delete these {duplicates} images? Y/n: ")
@ -141,32 +118,41 @@ class CompareImages:
        else:
            self.logger.info("No duplicates found")
    def get_hash(self, img_path):
        """Return the ``hash`` attribute of the average hash of an image.

        The image is opened with Pillow and hashed with
        ``imagehash.average_hash`` using ``self.hash_size``.

        :param img_path: path of the image to hash
        :returns: the ``.hash`` value of the computed average hash
        """
        with Image.open(img_path) as opened_image:
            average = imagehash.average_hash(opened_image, self.hash_size)
            return average.hash
-    def find_similar(self, image, similarity=80):
+    def diff(self, hash1, hash2):
        return np.count_nonzero(hash1 != hash2)
    def similarity(self, img_diff):
        """Convert a hash difference count into a similarity percentage.

        :param img_diff: number of differing hash positions
        :returns: rounded percentage, where 100 means no differing position
            out of ``self.hash_size ** 2``
        """
        # A hash holds hash_size x hash_size positions.
        total_positions = self.hash_size**2
        mismatch_ratio = img_diff / total_positions
        return round((1 - mismatch_ratio) * 100)
    def find_similar(self, image, file_paths, similarity=80):
        '''
        Find similar images
        :returns: img_path generator
        '''
        hash1 = ''
        if self.is_image(image):
            hash1 = self.get_hash(image)
        self.logger.info(f'Finding similar images to {image}')
        threshold = 1 - similarity/100
        diff_limit = int(threshold*(self.hash_size**2))
-        hash1 = ''
+        for img_path in self.get_images(file_paths):
        if imghdr.what(image) is not None:
            with Image.open(image) as img:
                hash1 = imagehash.average_hash(img, self.hash_size).hash
        self.logger.info(f'Finding similar images to {image}')
        for img_path in self.get_images():
            if img_path == image:
                continue
-            with Image.open(img_path) as img:
+            hash2 = self.get_hash(img_path)
-                hash2 = imagehash.average_hash(img, self.hash_size).hash
+            img_diff = self.diff(hash1, hash2)
-
+            if img_diff <= diff_limit:
-                diff_images = np.count_nonzero(hash1 != hash2)
+                similarity_img = self.similarity(img_diff)
-                if diff_images <= diff_limit:
+                self.logger.info(f'{img_path} image found {similarity_img}% similar to {image}')
-                    threshold_img = diff_images / (self.hash_size**2)
+                yield img_path
                    similarity_img = round((1 - threshold_img) * 100)
                    self.logger.info(f'{img_path} image found {similarity_img}% similar to {image}')
                    yield img_path