Use Path class for sort_similar_images function and test it

parent 4156e769d0
commit 1a78962012
@@ -300,14 +300,14 @@ def compare(**kwargs):
             mode='move', dry_run=dry_run, logger=logger)

     if kwargs['revert_compare']:
-        summary, has_errors = collection.revert_compare(path, dry_run)
+        summary, result = collection.revert_compare(path)
     else:
-        summary, has_errors = collection.sort_similar_images(path, kwargs['similarity'])
+        summary, result = collection.sort_similar_images(path, kwargs['similarity'])

     if verbose or debug:
         summary.print()

-    if has_errors:
+    if not result:
         sys.exit(1)

@@ -518,18 +518,13 @@ class Collection(object):
             # return file_path and subdir
             yield file_path

-    def _create_directory(self, directory_path, path, media):
+    def _create_directory(self, directory_path, media):
         """Create a directory if it does not already exist.

         :param Path: A fully qualified path of the to create.
         :returns: bool
         """
-        try:
-            parts = directory_path.relative_to(path).parts
-        except ValueError:
-            # directory_path is not the subpath of path
-            pass
-        else:
+        parts = directory_path.relative_to(self.root).parts
         for i, part in enumerate(parts):
             dir_path = self.root / Path(*parts[0:i+1])
             if dir_path.is_file():
@@ -556,27 +551,6 @@ class Collection(object):
             directory_path.mkdir(parents=True, exist_ok=True)
             self.logger.info(f'Create {directory_path}')

-    def create_directory(self, directory_path):
-        """Create a directory if it does not already exist.
-
-        :param str directory_name: A fully qualified path of the
-            to create.
-        :returns: bool
-        """
-        try:
-            if os.path.exists(directory_path):
-                return True
-            else:
-                if not self.dry_run:
-                    os.makedirs(directory_path)
-                self.logger.info(f'Create {directory_path}')
-                return True
-        except OSError:
-            # OSError is thrown for cases like no permission
-            pass
-
-        return False
-
     def _check_path(self, path):
         """
         :param: str path
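For reference, the two pathlib calls that replace the removed helper can be exercised on their own. A minimal standalone sketch with hypothetical paths (not code from this repository):

    from pathlib import Path

    root = Path('/tmp/ordigi-root')            # hypothetical collection root
    target = root / '2021' / '07' / 'paris'

    # relative_to() raises ValueError when target is not located under root,
    # which is what the removed try/except around it guarded against.
    parts = target.relative_to(root).parts     # ('2021', '07', 'paris')

    # mkdir(parents=True, exist_ok=True) creates the missing parents and
    # silently succeeds when the directory already exists, covering what the
    # removed create_directory() helper did with os.path.exists() + os.makedirs().
    target.mkdir(parents=True, exist_ok=True)
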
@@ -591,16 +565,6 @@ class Collection(object):

         return path

-    def check_path(self, path):
-        path = os.path.abspath(os.path.expanduser(path))
-
-        # some error checking
-        if not os.path.exists(path):
-            self.logger.error(f'Directory {path} does not exist')
-            sys.exit(1)
-
-        return path
-
     def set_utime_from_metadata(self, date_media, file_path):
         """ Set the modification time on the file based on the file name.
         """
@@ -648,7 +612,7 @@ class Collection(object):

             # Dedup path
             dest_path = self.root.joinpath(*dedup_path)
-            self._create_directory(dest_path.parent.name, path, media)
+            self._create_directory(dest_path.parent.name, media)

             result = self.sort_file(src_path, dest_path, remove_duplicates)

@@ -720,20 +684,16 @@ class Collection(object):
             media = Media(src_path, path, self.album_from_folder,
                     ignore_tags, self.interactive, self.logger,
                     self.use_date_filename, self.use_file_dates)
-            if media:
-                metadata = media.get_metadata(self.root, loc, self.db, self.cache)
-                # Get the destination path according to metadata
-                relpath = Path(self.get_path(metadata))
-            else:
-                # Keep same directory structure
-                relpath = src_path.relative_to(path)
+            metadata = media.get_metadata(self.root, loc, self.db, self.cache)
+            # Get the destination path according to metadata
+            relpath = Path(self.get_path(metadata))

             files_data.append((copy(media), relpath))

         # Create directories
         for media, relpath in files_data:
             dest_directory = self.root / relpath.parent
-            self._create_directory(dest_directory, path, media)
+            self._create_directory(dest_directory, media)

         # sort files and solve conflicts
         for media, relpath in files_data:
@@ -764,40 +724,11 @@ class Collection(object):

         return self.summary, record

-    def set_hash(self, result, src_path, dest_path, src_checksum):
-        if result:
-            # Check if file remain the same
-            result = self._checkcomp(dest_path, src_checksum)
-            has_errors = False
-            if result:
-                if not self.dry_run:
-                    self._add_db_data(dest_path, metadata, checksum)
-
-                if dest_path:
-                    self.logger.info(f'{src_path} -> {dest_path}')
-
-                self.summary.append((src_path, dest_path))
-
-            else:
-                self.logger.error(f'Files {src_path} and {dest_path} are not identical')
-                # sys.exit(1)
-                self.summary.append((src_path, False))
-                has_errors = True
-        else:
-            self.summary.append((src_path, False))
-            has_errors = True
-
-        return has_errors
-
-    def move_file(self, img_path, dest_path, checksum):
+    def move_file(self, img_path, dest_path):
         if not self.dry_run:
-            try:
-                shutil.move(img_path, dest_path)
-            except OSError as error:
-                self.logger.error(error)
+            shutil.move(img_path, dest_path)

         self.logger.info(f'move: {img_path} -> {dest_path}')
-        return self.set_hash(True, img_path, dest_path, checksum)

     def _get_images(self, path):
         """
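A standalone sketch of the same guarded-move pattern, with hypothetical names rather than the project's own method; note that shutil.move() documents path-like support for both arguments from Python 3.9:

    import shutil
    from pathlib import Path

    def move_file(src: Path, dest: Path, dry_run: bool = False) -> None:
        # A dry run only logs the intended move; otherwise the file is moved.
        # Path objects can be passed straight to shutil.move().
        if not dry_run:
            shutil.move(src, dest)
        print(f'move: {src} -> {dest}')
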
@@ -813,88 +744,86 @@ class Collection(object):
             image = Image(src_path)

             if image.is_image():
-                yield src_path
+                yield image

     def sort_similar_images(self, path, similarity=80):

-        has_errors = False
+        result = True
         path = self._check_path(path)
-        img_paths = set([ x for x in self._get_images(path) ])
-        i = Images(img_paths, logger=self.logger)
-        for image in img_paths:
-            if not os.path.isfile(image):
+        images = set([ x for x in self._get_images(path) ])
+        i = Images(images, logger=self.logger)
+        for image in images:
+            if not image.img_path.is_file():
                 continue
-            checksum1 = utils.checksum(image)
-            # Process files
-            # media = Media(src_path, False, self.logger)
-            # TODO compare metadata
-            # if media:
-            #     metadata = media.get_metadata()
+            media_ref = Media(image.img_path, path, self.logger)
+            # Todo: compare metadata?
+            metadata = media_ref.get_metadata(self.root, db=self.db, cache=self.cache)
             similar = False
             moved_imgs = set()
             for img_path in i.find_similar(image, similarity):
                 similar = True
-                checksum2 = utils.checksum(img_path)
+                media = Media(img_path, path, self.logger)
+                metadata = media.get_metadata(self.root, db=self.db, cache=self.cache)
                 # move image into directory
-                name = os.path.splitext(os.path.basename(image))[0]
+                name = img_path.stem
                 directory_name = 'similar_to_' + name
-                dest_directory = os.path.join(os.path.dirname(img_path),
-                        directory_name)
-                dest_path = os.path.join(dest_directory, os.path.basename(img_path))
+                dest_directory = img_path.parent / directory_name
+                dest_path = dest_directory / img_path.name
+                dest_directory.mkdir(exist_ok=True)

-                result = self.create_directory(dest_directory)
                 # Move the simlars file into the destination directory
-                if result:
-                    result = self.move_file(img_path, dest_path, checksum2)
+                self.move_file(img_path, dest_path)
                 moved_imgs.add(img_path)
-                if not result:
-                    has_errors = True
+                if self._record_file(img_path, dest_path, media):
+                    self.summary.append((img_path, dest_path))
                 else:
-                    has_errors = True
+                    self.summary.append((img_path, False))
+                    result = False

             if similar:
-                dest_path = os.path.join(dest_directory,
-                        os.path.basename(image))
-                result = self.move_file(image, dest_path, checksum1)
-                moved_imgs.add(image)
-                if not result:
-                    has_errors = True
+                img_path = image.img_path
+                dest_path = dest_directory / img_path.name
+                self.move_file(img_path, dest_path)
+                moved_imgs.add(img_path)
+                if self._record_file(img_path, dest_path, media_ref):
+                    self.summary.append((img_path, dest_path))
+                else:
+                    self.summary.append((img_path, False))
+                    result = False

-            # for moved_img in moved_imgs:
-            #     os.remove(moved_img)
-
-        return self.summary, has_errors
+        return self.summary, result

     def revert_compare(self, path):

-        has_errors = False
-        path = self.check_path(path)
-        for dirname, dirnames, filenames, level in self.walklevel(path, None):
-            if dirname == os.path.join(path, '.ordigi'):
-                continue
-
+        result = True
+        path = self._check_path(path)
+        dirnames = set()
+        moved_files = set()
+        for src_path in self._get_files_in_path(path, glob=self.glob,
+                extensions=self.filter_by_ext):
+            dirname = src_path.parent.name
             if dirname.find('similar_to') == 0:
-                continue
-
-            for subdir in dirnames:
-                if subdir.find('similar_to') == 0:
-                    file_names = os.listdir(os.path.abspath(os.path.join(dirname, subdir)))
-                    for file_name in file_names:
-                        # move file to initial folder
-                        img_path = os.path.join(dirname, subdir, file_name)
-                        if os.path.isdir(img_path):
-                            continue
-                        checksum = utils.checksum(img_path)
-                        dest_path = os.path.join(dirname, os.path.basename(img_path))
-                        result = self.move_file(img_path, dest_path, checksum)
-                        if not result:
-                            has_errors = True
-                    # remove directory
-                    try:
-                        os.rmdir(os.path.join (dirname, subdir))
-                    except OSError as error:
-                        self.logger.error(error)
+                dirnames.add(src_path.parent)
+
+                # move file to initial folder and update metadata
+                media = Media(src_path, path, self.logger)
+                metadata = media.get_metadata(self.root, db=self.db, cache=self.cache)
+                dest_path = Path(src_path.parent.parent, src_path.name)
+                self.move_file(src_path, dest_path)
+                moved_files.add(src_path)
+                if self._record_file(src_path, dest_path, media):
+                    self.summary.append((src_path, dest_path))
+                else:
+                    self.summary.append((src_path, False))
+                    result = False
+
+        for dirname in dirnames:
+            # remove 'similar_to*' directories
+            try:
+                dirname.rmdir()
+            except OSError as error:
+                self.logger.error(error)

-        return self.summary, has_errors
+        return self.summary, result

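The os.path calls dropped in this hunk map onto pathlib as follows. A standalone sketch using a hypothetical file path, purely for illustration:

    import os.path
    from pathlib import Path

    p = Path('/photos/similar/IMG_0001.jpg')   # hypothetical image path

    # Equivalences used by the rewrite above:
    assert p.stem == os.path.splitext(os.path.basename(p))[0]    # 'IMG_0001'
    assert p.name == os.path.basename(p)                         # 'IMG_0001.jpg'
    assert p.parent == Path(os.path.dirname(p))                  # /photos/similar
    assert p.parent / 'similar_to_IMG_0001' == Path(
        os.path.join(os.path.dirname(p), 'similar_to_IMG_0001'))
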
@@ -61,8 +61,11 @@ class Image():
         return True

     def get_hash(self):
-        with img.open(self.img_path) as img_path:
-            return imagehash.average_hash(img_path, self.hash_size).hash
+        try:
+            with img.open(self.img_path) as image:
+                return imagehash.average_hash(image, self.hash_size).hash
+        except (OSError, UnidentifiedImageError):
+            return None


 class Images():
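For reference, the hashing call wrapped by get_hash() can be tried in isolation. A standalone sketch with a hypothetical function name and path argument:

    import imagehash
    from PIL import Image, UnidentifiedImageError

    def safe_average_hash(img_path, hash_size=8):
        # Returns the hash's boolean matrix, or None when the file cannot be
        # read or is not a recognized image, mirroring the try/except added above.
        try:
            with Image.open(img_path) as image:
                return imagehash.average_hash(image, hash_size).hash
        except (OSError, UnidentifiedImageError):
            return None
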
@@ -75,33 +78,31 @@ class Images():
     #: Valid extensions for image files.
     extensions = ('arw', 'cr2', 'dng', 'gif', 'heic', 'jpeg', 'jpg', 'nef', 'png', 'rw2')

-    def __init__(self, img_paths=set(), hash_size=8, logger=logging.getLogger()):
+    def __init__(self, images=set(), hash_size=8, logger=logging.getLogger()):

-        self.img_paths = img_paths
+        self.images = images
         self.duplicates = []
         self.hash_size = hash_size
         self.logger = logger

     def add_images(self, file_paths):
-        ''':returns: img_path generator
-        '''
         for img_path in file_paths:
             image = Image(img_path)
             if image.is_image():
-                self.img_paths.add(img_path)
+                self.images.add(image)

     def get_images_hashes(self):
         """Get image hashes"""
         hashes = {}
         # Searching for duplicates.
-        for img_path in self.img_paths:
-            with img.open(img_path) as img:
+        for image in self.images:
+            with img.open(image.img_path) as img:
                 yield imagehash.average_hash(img, self.hash_size)

     def find_duplicates(self, img_path):
         """Find duplicates"""
         duplicates = []
-        for temp_hash in get_images_hashes(self.img_paths):
+        for temp_hash in get_images_hashes():
             if temp_hash in hashes:
                 self.logger.info("Duplicate {} \nfound for image {}\n".format(img_path, hashes[temp_hash]))
                 duplicates.append(img_path)
@@ -136,28 +137,34 @@ class Images():
         return similarity_img

     def find_similar(self, image, similarity=80):
-        '''
+        """
         Find similar images
         :returns: img_path generator
-        '''
-        hash1 = ''
-        image = Image(image)
-        if image.is_image():
-            hash1 = image.get_hash()
+        """
+        hash1 = image.get_hash()

-        self.logger.info(f'Finding similar images to {image}')
+        if hash1 is None:
+            return None
+
+        self.logger.info(f'Finding similar images to {image.img_path}')

         threshold = 1 - similarity/100
         diff_limit = int(threshold*(self.hash_size**2))

-        for img_path in self.img_paths:
-            if img_path == image:
+        for img in self.images:
+            if not img.img_path.is_file():
                 continue
-            hash2 = image.get_hash()
+            if img.img_path == image.img_path:
+                continue
+            hash2 = img.get_hash()
+            # Be sure that hash are not None
+            if hash2 is None:
+                continue
+
             img_diff = self.diff(hash1, hash2)
             if img_diff <= diff_limit:
                 similarity_img = self.similarity(img_diff)
-                self.logger.info(f'{img_path} image found {similarity_img}% similar to {image}')
-                yield img_path
+                self.logger.info(f'{img.img_path} image found {similarity_img}% similar to {image}')
+                yield img.img_path

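The threshold logic in find_similar() turns the similarity percentage into a maximum Hamming distance between hash matrices. A standalone sketch, assuming diff() counts differing bits (that method is not shown in this hunk):

    import numpy as np

    def is_similar(hash1, hash2, hash_size=8, similarity=80):
        # hash1/hash2 are boolean matrices as returned by imagehash's .hash.
        # With similarity=80 and hash_size=8 the limit is int(0.2 * 64) = 12
        # differing bits out of 64.
        diff = np.count_nonzero(hash1 != hash2)      # Hamming distance
        threshold = 1 - similarity / 100
        diff_limit = int(threshold * (hash_size ** 2))
        return diff <= diff_limit
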
@@ -11,10 +11,11 @@ from time import sleep

 from .conftest import randomize_files, randomize_db
 from ordigi import constants
+from ordigi.collection import Collection
 from ordigi.database import Sqlite
 from ordigi.exiftool import ExifToolCaching, exiftool_is_running, terminate_exiftool
-from ordigi.collection import Collection
 from ordigi.geolocation import GeoLocation
+from ordigi import log
 from ordigi.media import Media
 from ordigi import utils

@@ -179,7 +180,21 @@ class TestCollection:
         for path in paths:
             assert isinstance(path, Path)

+    def test_sort_similar_images(self, tmp_path):
+        path = tmp_path / 'collection'
+        shutil.copytree(self.src_path, path)
+        logger = log.get_logger(True, True)
+        collection = Collection(path, None, mode='move', logger=logger)
+        summary, result = collection.sort_similar_images(path, similarity=60)
+
+        # Summary is created and there is no errors
+        assert summary, summary
+        assert result, result
+
+        summary, result = collection.revert_compare(path)
+
+        # Summary is created and there is no errors
+        assert summary, summary
+        assert result, result
+
-    # TODO Sort similar images into a directory
-    # collection.sort_similar

@@ -74,8 +74,6 @@ class TestMetadata:
         assert not media.has_exif_data()

     def test_get_date_media(self):
-        # collection = Collection(tmp_path, self.path_format,
-        #        use_date_filename=True, use_file_dates=True)
         for file_path in self.file_paths:
             exif_data = ExifToolCaching(str(file_path)).asdict()
             media = Media(file_path, self.src_path, use_date_filename=True,