From b1ed551c3125278d14a69750fea2bfe39cf68530 Mon Sep 17 00:00:00 2001
From: Jon Bergli Heier
Date: Wed, 28 Oct 2020 20:14:05 +0100
Subject: Add fbin-backup.py

This is a backup script which will copy files to a target storage, and
optionally add database entries as well.

It works by specifying a source and a target config file. Both should have
separate storage configured, and files will be copied from the source
storage to the target storage.

The list of files to copy is read from the source database, which means this
must be accessible from where the script is run, not just the storage. If
updating database entries the target database must also be accessible.

The target database is also checked for any existing file hashes, and any
files that would cause a collision will be skipped.
---
 fbin-backup.py                  | 90 +++++++++++++++++++++++++++++++++++++++++
 fbin/file_storage/base.py       |  8 ++++
 fbin/file_storage/filesystem.py | 24 +++++++---
 fbin/file_storage/s3.py         | 18 ++++++++-
 4 files changed, 131 insertions(+), 9 deletions(-)
 create mode 100644 fbin-backup.py
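
A minimal usage sketch (the options and the STORAGE_MODULE/FILE_DIRECTORY/
THUMB_DIRECTORY keys come from the code below; the file names, paths and
SQLALCHEMY_DATABASE_URI are assumptions, the latter presuming the app uses
Flask-SQLAlchemy, as db.init_app() suggests):

    # backup-target.cfg -- loaded with Flask's config.from_pyfile(), so plain Python
    STORAGE_MODULE = '.file_storage.filesystem'   # default when unset
    FILE_DIRECTORY = '/mnt/backup/files'          # assumed example path
    THUMB_DIRECTORY = '/mnt/backup/thumbs'        # assumed example path
    SQLALCHEMY_DATABASE_URI = 'sqlite:////mnt/backup/fbin.db'  # only needed with --update-db

    # Dry run first, then copy the missing files and add their DB rows:
    #   python fbin-backup.py -t backup-target.cfg --dry-run
    #   python fbin-backup.py -t backup-target.cfg --update-db
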
diff --git a/fbin-backup.py b/fbin-backup.py
new file mode 100644
index 0000000..5ef3a87
--- /dev/null
+++ b/fbin-backup.py
@@ -0,0 +1,90 @@
+import argparse
+import importlib
+import os
+
+from flask import Flask, Response
+
+from fbin.db import db, User, File
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-s', '--source-config-file', default='fbin/fbin.cfg')
+parser.add_argument('-t', '--target-config-file', required=True)
+parser.add_argument('--update-db', action='store_true', help='Update DB as well')
+parser.add_argument('-n', '--dry-run', action='store_true', help='Do not update anything')
+args = parser.parse_args()
+
+class DummyFile:
+    def __init__(self, stream, filename, content_length):
+        self.stream = stream
+        self.filename = filename
+        self.content_length = content_length
+
+    def save(self, fp):
+        chunk = self.stream.read(10*1024)
+        while chunk:
+            fp.write(chunk)
+            chunk = self.stream.read(10*1024)
+
+class IterStream:
+    def __init__(self, it):
+        self._it = it
+
+    def read(self, n):
+        try:
+            return next(self._it)
+        except StopIteration:
+            return None
+
+def main():
+    from fbin.file_storage.s3 import Storage as S3Storage
+    from fbin.file_storage.filesystem import Storage as FSStorage
+    source_storage = importlib.import_module(source_app.config.get('STORAGE_MODULE', '.file_storage.filesystem'), package='fbin').Storage(source_app)
+    target_storage = importlib.import_module(target_app.config.get('STORAGE_MODULE', '.file_storage.filesystem'), package='fbin').Storage(target_app)
+    copy_list = []
+    with source_app.app_context():
+        db.init_app(source_app)
+        print('Finding existing files')
+        for f in db.session.query(File).all():
+            if f.user:
+                db.session.refresh(f.user)
+            if source_storage.file_exists(f) and not target_storage.file_exists(f):
+                print('COPY: ', end='')
+                copy_list.append(f)
+            else:
+                print('SKIP: ', end='')
+            print(f.hash, f.filename)
+    if not copy_list:
+        print('No valid files found')
+        return
+    print('Copying {} files'.format(len(copy_list)))
+    for f in copy_list:
+        if args.dry_run:
+            print('Would copy', f.hash, f.filename)
+            continue
+        print('Copying', f.hash, f.filename)
+        with source_app.app_context():
+            db.init_app(source_app)
+            with source_app.test_request_context():
+                source = source_storage.get_file(f)
+                if isinstance(source, str) and os.path.exists(source):
+                    source = open(source, 'rb')
+                elif isinstance(source, Response):
+                    source = IterStream(source.get_app_iter({'REQUEST_METHOD': 'GET'}))
+                df = DummyFile(source, f.filename, f.size)
+        with target_app.app_context():
+            db.init_app(target_app)
+            if args.update_db:
+                if db.session.query(File).filter(File.hash == f.hash).one():
+                    print(' Cannot copy this file; hash already exists in target DB')
+                else:
+                    target_storage.store_file(df, f.hash, f.user, f.ip)
+            else:
+                target_storage.upload_file(df, f.hash, f.user)
+
+source_app = Flask('source')
+target_app = Flask('target')
+with source_app.app_context():
+    source_app.config.from_pyfile(args.source_config_file)
+with target_app.app_context():
+    target_app.config.from_pyfile(args.target_config_file)
+main()
diff --git a/fbin/file_storage/base.py b/fbin/file_storage/base.py
index abdf580..aa2c510 100644
--- a/fbin/file_storage/base.py
+++ b/fbin/file_storage/base.py
@@ -13,6 +13,10 @@ class BaseStorage:
         if size_limit is not None and file.size > size_limit:
             raise FileSizeError('The file size is too large (max {})'.format(File.pretty_size(size_limit)))
 
+    def upload_file(self, uploaded_file, file_hash, user):
+        '''Upload data from uploaded_file.'''
+        raise NotImplementedError()
+
     def add_file(self, file_hash, filename, size, user=None, ip=None, verify=True):
         '''Adds the file to the database.
 
@@ -29,6 +33,10 @@ class BaseStorage:
         '''Store uploaded_file.'''
         raise NotImplementedError()
 
+    def file_exists(self, f):
+        '''Return True if the specified file exists.'''
+        raise NotImplementedError()
+
     def get_file(self, f):
         '''Return a file object for the specified file.
 
diff --git a/fbin/file_storage/filesystem.py b/fbin/file_storage/filesystem.py
index 3a640bb..7951d88 100644
--- a/fbin/file_storage/filesystem.py
+++ b/fbin/file_storage/filesystem.py
@@ -10,7 +10,7 @@ class Storage(BaseStorage):
         os.makedirs(self.app.config['FILE_DIRECTORY'], exist_ok=True)
         os.makedirs(self.app.config['THUMB_DIRECTORY'], exist_ok=True)
 
-    def store_file(self, uploaded_file, file_hash, user, ip):
+    def upload_file(self, uploaded_file, file_hash, user):
         size = uploaded_file.content_length
         if hasattr(uploaded_file.stream, 'file'):
             temp = None
@@ -20,17 +20,25 @@ class Storage(BaseStorage):
             uploaded_file.save(temp.file)
             temp_path = temp.name
             size = os.path.getsize(temp_path)
+        new_path = os.path.join(self.app.config['FILE_DIRECTORY'], file_hash + os.path.splitext(uploaded_file.filename)[1])
+        os.rename(temp_path, new_path)
+        if self.app.config.get('DESTINATION_MODE'):
+            os.chmod(new_path, self.app.config.get('DESTINATION_MODE'))
+        return new_path, size
+
+    def store_file(self, uploaded_file, file_hash, user, ip):
+        file_path, size = self.upload_file(uploaded_file, file_hash, user)
         try:
-            new_file = self.add_file(file_hash, uploaded_file.filename, size, user, ip)
-            if new_file:
-                os.rename(temp_path, new_file.get_path())
-                if self.app.config.get('DESTINATION_MODE'):
-                    os.chmod(new_file.get_path(), self.app.config.get('DESTINATION_MODE'))
-            return new_file
+            return self.add_file(file_hash, uploaded_file.filename, size, user, ip)
         except:
-            os.unlink(temp.name)
+            if os.path.exists(file_path):
+                os.unlink(file_path)
             raise
 
+    def file_exists(self, f):
+        path = f.get_path()
+        return os.path.exists(path)
+
     def get_file(self, f):
         path = f.get_path()
         if not os.path.exists(path):
diff --git a/fbin/file_storage/s3.py b/fbin/file_storage/s3.py
index a11488f..e2dd1ea 100644
--- a/fbin/file_storage/s3.py
+++ b/fbin/file_storage/s3.py
@@ -21,7 +21,7 @@ class Storage(BaseStorage):
             key += '_thumb'
         return key
 
-    def store_file(self, uploaded_file, file_hash, user, ip):
+    def upload_file(self, uploaded_file, file_hash, user):
         bucket = self.client.Bucket(self.app.config['S3_BUCKET'])
         key = self._get_object_key(file_hash, user.id if user else 0)
         obj = bucket.upload_fileobj(Fileobj=uploaded_file.stream, Key=key)
@@ -29,12 +29,28 @@
         if not size:
             obj = self.client.ObjectSummary(self.app.config['S3_BUCKET'], key)
             size = obj.size
+        return size
+
+    def store_file(self, uploaded_file, file_hash, user, ip):
+        size = self.upload_file(uploaded_file, file_hash, user)
         try:
             return self.add_file(file_hash, uploaded_file.filename, size, user, ip)
         except:
             obj.delete()
             raise
 
+    def file_exists(self, f):
+        key = self.get_object_key(f)
+        bucket = self.app.config['S3_BUCKET']
+        obj = self.client.Object(bucket, key)
+        try:
+            meta = obj.load()
+            return True
+        except botocore.exceptions.ClientError as e:
+            if e.response['Error']['Code'] == '404':
+                return False
+            raise
+
     def get_file(self, f, thumb=False):
         key = self.get_object_key(f, thumb=thumb)
         if thumb:
-- 
cgit v1.2.3