Verify checksum in check_db

This commit is contained in:
Cédric Leporcq 2022-07-28 07:58:13 +02:00
parent 81462677a7
commit f5faa70bc8
2 changed files with 84 additions and 56 deletions

View File

@ -819,7 +819,7 @@ class Collection(SortMedias):
def init(self, loc):
"""Init collection db"""
for file_path in self.get_collection_files():
metadata = self.medias.get_metadata(file_path, self.root, loc)
metadata = self.medias.get_metadata(file_path, self.root, loc=loc)
metadata['file_path'] = os.path.relpath(file_path, self.root)
self.db.add_file_data(metadata)
@ -827,6 +827,46 @@ class Collection(SortMedias):
return self.summary
def check_files(self):
    """Check the integrity of every file under the collection root.

    Each file yielded by ``self.paths.get_files(self.root)`` is checksummed
    with ``utils.checksum`` and compared with the checksum the database
    returns for its root-relative path. The outcome is recorded in
    ``self.summary`` under the ``'check'`` action; a mismatch is also
    logged as an error.

    Returns:
        ``self.summary`` once every file has been recorded.
    """
    for file_path in self.paths.get_files(self.root):
        checksum = utils.checksum(file_path)
        # Query the db with the same string form of the relative path that
        # the other lookups use (os.path.relpath), not a Path object.
        relpath = os.path.relpath(file_path, self.root)
        if checksum == self.db.sqlite.get_checksum(relpath):
            self.summary.append('check', True, file_path)
        else:
            self.log.error(f'{file_path} is corrupted')
            self.summary.append('check', False, file_path)

    return self.summary
def file_in_db(self, file_path, db_rows):
    """Tell whether a collection file is listed among the database rows.

    Args:
        file_path: Path of the file; assumed to be inside ``self.root``.
        db_rows: Container of root-relative file paths read from the db.

    Returns:
        True if the path of ``file_path`` relative to ``self.root`` is in
        ``db_rows``, False otherwise.
    """
    # file_path is assumed to be inside the collection root directory.
    relpath = os.path.relpath(file_path, self.root)
    return relpath in db_rows
def _check_file(self, file_path, file_checksum):
    """Check whether the file checksum has changed since it was stored.

    Args:
        file_path: Path of the file; it is looked up in the db by its path
            relative to ``self.root``.
        file_checksum: Checksum of the file to compare with the db value.

    Returns:
        None if the db returns no checksum for the file, False if the
        stored checksum differs from ``file_checksum``, True if they match.
        Note that callers testing the result with ``not`` treat None the
        same way as a mismatch.
    """
    relpath = os.path.relpath(file_path, self.root)
    db_checksum = self.db.sqlite.get_checksum(relpath)
    if not db_checksum:
        # Nothing stored to compare against.
        return None

    if db_checksum != file_checksum:
        self.log.warning(f'{file_path} checksum has changed')
        self.log.info(
            f'file_checksum={file_checksum},\ndb_checksum={db_checksum}'
        )
        return False

    return True
def check_db(self):
"""
Check if db FilePath match to collection filesystem
@ -835,14 +875,20 @@ class Collection(SortMedias):
file_paths = list(self.get_collection_files())
db_rows = [row['FilePath'] for row in self.db.sqlite.get_rows('metadata')]
for file_path in file_paths:
# Assuming file_path are inside collection root dir
relpath = os.path.relpath(file_path, self.root)
# If file not in database
if relpath not in db_rows:
result = self.file_in_db(file_path, db_rows)
checksum = utils.checksum(file_path)
if not result:
self.log.error('Db data is not accurate')
self.log.info(f'{file_path} not in db')
return False
elif not self._check_file(file_path, checksum):
# We don't want to silently ignore or correct this without
# resetting the cache, as it could be due to file corruption
self.log.error(f'modified or corrupted file.')
self.log.info(
'Use ordigi update --checksum or --reset-cache, check database integrity or try to restore the file'
)
return False
nb_files = len(file_paths)
nb_row = len(db_rows)
@ -886,7 +932,7 @@ class Collection(SortMedias):
return self.summary
def update(self, loc):
def update(self, loc, update_checksum=False):
"""Update collection db"""
file_paths = list(self.get_collection_files())
db_rows = list(self.db.sqlite.get_rows('metadata'))
@ -901,9 +947,22 @@ class Collection(SortMedias):
for file_path in file_paths:
relpath = os.path.relpath(file_path, self.root)
metadata = {}
checksum = utils.checksum(file_path)
if not self._check_file(file_path, checksum) and update_checksum:
# metadata will fill checksum from file
metadata = self.medias.get_metadata(
file_path, self.root, checksum, loc=loc
)
metadata['file_path'] = relpath
# set row attribute to the file
self.db.add_file_data(metadata)
self.summary.append('update', file_path)
# If file not in database
if relpath not in db_paths:
metadata = self.medias.get_metadata(file_path, self.root, loc)
metadata = self.medias.get_metadata(file_path, self.root, loc=loc)
metadata['file_path'] = relpath
# Check if file checksum is in invalid rows
row = []
@ -927,19 +986,6 @@ class Collection(SortMedias):
return self.summary
def check_files(self):
    """Check the integrity of every file under the collection root.

    Each file yielded by ``self.paths.get_files(self.root)`` is checksummed
    with ``utils.checksum`` and compared with the checksum the database
    returns for its root-relative path. The outcome is recorded in
    ``self.summary`` under the ``'check'`` action; a mismatch is also
    logged as an error.

    Returns:
        ``self.summary`` once every file has been recorded.
    """
    for file_path in self.paths.get_files(self.root):
        checksum = utils.checksum(file_path)
        # Query the db with the same string form of the relative path that
        # the other lookups use (os.path.relpath), not a Path object.
        relpath = os.path.relpath(file_path, self.root)
        if checksum == self.db.sqlite.get_checksum(relpath):
            self.summary.append('check', True, file_path)
        else:
            self.log.error(f'{file_path} is corrupted')
            self.summary.append('check', False, file_path)

    return self.summary
def set_utime_from_metadata(self, date_media, file_path):
"""Set the modification time on the file based on the file name."""

View File

@ -279,6 +279,7 @@ class Media(ReadExif):
ignore_tags=None,
interactive=False,
cache=True,
checksum=None,
use_date_filename=False,
use_file_dates=False,
):
@ -292,6 +293,11 @@ class Media(ReadExif):
self.album_from_folder = album_from_folder
self.cache = cache
if checksum:
self.checksum = checksum
else:
self.checksum = utils.checksum(file_path)
self.interactive = interactive
self.log = LOG.getChild(self.__class__.__name__)
self.metadata = None
@ -527,30 +533,6 @@ class Media(ReadExif):
return db.get_metadata(relpath, 'LocationId')
def _check_file(self, db, root, check=True):
    """Look up the db checksum of the file when it lies under ``root``.

    Args:
        db: Object exposing ``get_checksum(relpath)``.
        root: Directory the file is expected to be under.
        check: When true, a stored checksum that differs from
            ``self.metadata['checksum']`` is treated as fatal.

    Returns:
        ``(relpath, db_checksum)`` when the file is inside ``root``,
        ``(None, None)`` otherwise.

    Raises:
        SystemExit: if ``check`` is true and the stored checksum does not
            match the file checksum.
    """
    file_path = os.path.normpath(str(self.file_path))
    root_path = os.path.normpath(str(root))
    try:
        # Compare whole path components: a plain string-prefix test would
        # accept '/data/col2/x' as being inside '/data/col'.
        inside_root = os.path.commonpath([root_path, file_path]) == root_path
    except ValueError:
        # Raised for mixed absolute/relative paths or different drives.
        inside_root = False

    if not inside_root:
        return None, None

    relpath = os.path.relpath(self.file_path, root)
    db_checksum = db.get_checksum(relpath)
    file_checksum = self.metadata['checksum']
    # Check if checksum match
    if check and db_checksum and db_checksum != file_checksum:
        self.log.error(f'{self.file_path} checksum has changed, modified or corrupted file.')
        self.log.error(
            f'file_checksum={file_checksum},\ndb_checksum={db_checksum}'
        )
        self.log.info(
            'Use ordigi update --checksum or --reset-cache, check database integrity or try to restore the file'
        )
        # We don't want to silently ignore or correct this without
        # resetting the cache, as it could be due to file corruption
        sys.exit(1)

    return relpath, db_checksum
def set_location_from_db(self, location_id, db):
self.metadata['location_id'] = location_id
@ -604,13 +586,13 @@ class Media(ReadExif):
if not album or album == '':
self.metadata['album'] = folder
def get_metadata(self, root, loc=None, db=None, cache=False, check=True):
def get_metadata(self, root, loc=None, db=None, cache=False):
"""
Get a dictionary of metadata from exif.
All keys will be present and have a value of None if not obtained.
"""
self.metadata = {}
self.metadata['checksum'] = utils.checksum(self.file_path)
self.metadata['checksum'] = self.checksum
db_checksum = False
location_id = None
@ -621,7 +603,6 @@ class Media(ReadExif):
location_id = self._set_metadata_from_db(db, relpath)
self.set_location_from_db(location_id, db)
else:
# file not in db
self.metadata['src_dir'] = str(self.src_dir)
self.metadata['subdirs'] = str(
self.file_path.relative_to(self.src_dir).parent
@ -688,7 +669,7 @@ class Medias:
self.datas = {}
self.theme = request.load_theme()
def get_media(self, file_path, src_dir):
def get_media(self, file_path, src_dir, checksum=None):
media = Media(
file_path,
src_dir,
@ -696,23 +677,24 @@ class Medias:
self.exif_opt['ignore_tags'],
self.interactive,
self.exif_opt['cache'],
checksum,
self.exif_opt['use_date_filename'],
self.exif_opt['use_file_dates'],
)
return media
def get_media_data(self, file_path, src_dir, loc=None, check=True):
media = self.get_media(file_path, src_dir)
def get_media_data(self, file_path, src_dir, checksum=None, loc=None):
media = self.get_media(file_path, src_dir, checksum)
media.get_metadata(
self.root, loc, self.db.sqlite, self.exif_opt['cache'], check
self.root, loc, self.db.sqlite, self.exif_opt['cache']
)
return media
def get_metadata(self, src_path, src_dir, loc=None, check=True):
def get_metadata(self, src_path, src_dir, checksum=None, loc=None):
"""Get metadata"""
return self.get_media_data(src_path, src_dir, loc, check).metadata
return self.get_media_data(src_path, src_dir, checksum, loc).metadata
def get_paths(self, src_dirs, imp=False):
"""Get paths"""
@ -739,7 +721,7 @@ class Medias:
"""Get medias datas"""
for src_dir, src_path in self.get_paths(src_dirs, imp=imp):
# Get file metadata
media = self.get_media_data(src_path, src_dir, loc)
media = self.get_media_data(src_path, src_dir, loc=loc)
yield src_path, media
@ -747,7 +729,7 @@ class Medias:
"""Get medias data"""
for src_dir, src_path in self.get_paths(src_dirs, imp=imp):
# Get file metadata
metadata = self.get_metadata(src_path, src_dir, loc)
metadata = self.get_metadata(src_path, src_dir, loc=loc)
yield src_path, metadata