Source code for ae.files

"""
generic file object helpers
===========================

this namespace portion is pure Python code, providing helpers for file object and content managing. it only depends
on the :mod:`ae.base` namespace portion.

.. hint:: more helper functions to manage directory/folder structures are provided by the :mod:`ae.paths` portion.

the helper function :func:`copy_bytes` provides recoverable copies of binary files and file streams, with progress
callbacks for every copied chunk/buffer.

:func:`file_lines` and :func:`read_file_text` are helpers to read/load text file contents.

the function :func:`write_file_text` stores a string to a text file.

the helper function :func:`file_transfer_progress` puts the number of transferred bytes in a short and user-readable
format, to be displayed as progress string in a file transfer.

:class:`RegisteredFile` and :class:`CachedFile` encapsulate and optionally cache the contents of a file within a file
object. instances of these classes are compatible with the file objects provided by Python's :mod:`pathlib` module. but
also pure path strings can be used as file objects (see also the :data:`FileObject` type).

all these types of file objects are supported by the class :class:`~ae.paths.FilesRegister` from the
:mod:`ae.paths` portion.


registered file
---------------

a registered file object represents a single file on your file system and can be instantiated from one of the classes
:class:`RegisteredFile` or :class:`CachedFile` provided by this module/portion::

    from ae.files import RegisteredFile

    rf = RegisteredFile('path/to/the/file_name.extension')

    assert str(rf) == 'path/to/the/file_name.extension'
    assert rf.path == 'path/to/the/file_name.extension'
    assert rf.stem == 'file_name'
    assert rf.ext == '.extension'
    assert rf.properties == {}

file properties will be automatically attached to each file object instance with the instance attribute
:attr:`~RegisteredFile.properties`. in the last example it results in an empty dictionary because the
:attr:`~RegisteredFile.path` of this file object does not contain folder names with an underscore character.


file properties
^^^^^^^^^^^^^^^

file property names and values are automatically determined from the names of their subfolders, specified in the
:attr:`~RegisteredFile.path` attribute. every subfolder name containing an underscore character in the format
<property-name>_<value> will be interpreted as a file property::

    rf = RegisteredFile('property1_69/property2_3.69/property3_whatever/file_name.ext')
    assert rf.properties['property1'] == 69
    assert rf.properties['property2'] == 3.69
    assert rf.properties['property3'] == 'whatever'

the property types `int`, `float` and `string` are recognized and converted into a property value. boolean values can be
specified as ``1`` and ``0`` integers.


cached file
-----------

a cached file created from the :class:`CachedFile` class behaves like a :ref:`registered file` and additionally provides the
possibility to cache parts or the whole file content as well as the file pointer of the opened file::

    cf = CachedFile('integer_69/float_3.69/string_whatever/file_name.ext')

    assert str(cf) == 'integer_69/float_3.69/string_whatever/file_name.ext'
    assert cf.path == 'integer_69/float_3.69/string_whatever/file_name.ext'
    assert cf.stem == 'file_name'
    assert cf.ext == '.ext'
    assert cf.properties['integer'] == 69
    assert cf.properties['float'] == 3.69
    assert cf.properties['string'] == 'whatever'

on instantiation of the :class:`CachedFile` file object the default file object loader function
:func:`_default_object_loader` will be used, which opens a file stream via Python's :func:`open` built-in.
alternatively, you can specify a specific file object loader with the :paramref:`~CachedFile.object_loader` parameter
or by assigning a callable directly to the :attr:`~CachedFile.object_loader` attribute::

    cf = CachedFile('integer_69/float_3.69/string_whatever/file_name.ext',
                    object_loader=lambda cached_file_obj: my_open_method(cached_file_obj.path))

the cached file object is accessible via the :attr:`~CachedFile.loaded_object` attribute of the cached file object
instance::

    assert isinstance(cf.loaded_object, TextIOWrapper)
    cf.loaded_object.seek(...)
    cf.loaded_object.read(...)

    cf.loaded_object.close()

"""
import os
import pathlib
from typing import Any, BinaryIO, Callable, Dict, List, Optional, Tuple, Union, cast

from ae.base import dummy_function, norm_line_sep, read_file, write_file                                # type: ignore


__version__ = '0.3.27'


#: default size in bytes of the copy buffer/chunk; used as default of the ``buf_size`` argument of :func:`copy_bytes`
COPY_BUF_LEN = 16 * 1024


FileObject = Union[str, 'RegisteredFile', 'CachedFile', pathlib.Path, pathlib.PurePath, Any]
""" file object type, e.g. a file path str or any class or callable where the returned instance/value is either a string
    or an object with a `stem` attribute (holding the file name w/o extension), like e.g. :class:`CachedFile`,
    :class:`RegisteredFile`, :class:`pathlib.Path` or :class:`pathlib.PurePath`.
"""
PropertyType = Union[int, float, str]                                           #: types of file property values
PropertiesType = Dict[str, PropertyType]                                        #: dict of file properties
FilenameOrStream = Union[str, BinaryIO]                                         #: file name or file stream pointer


# pylint: disable=too-many-arguments,too-many-locals,too-many-branches,too-many-statements
def copy_bytes(src_file: FilenameOrStream, dst_file: FilenameOrStream, *,
               transferred_bytes: int = 0, total_bytes: int = 0, buf_size: int = COPY_BUF_LEN,
               overwrite: bool = False, move_file: bool = False, recoverable: bool = False,
               errors: Optional[List[str]] = None,
               progress_func: Callable = dummy_function, **progress_kwargs) -> str:
    """ recoverable copy of a file or stream (file-like object), optionally with progress callbacks.

    :param src_file:            source file name or opened stream (file-like) object. if passing a non-seekable
                                stream together with a non-zero value in :paramref:`~copy_bytes.transferred_bytes`,
                                then the source stream has to be set to the correct position before you call this
                                function. if passing any source stream, then also the total file/stream size has
                                to be passed into the :paramref:`~copy_bytes.total_bytes` parameter. source
                                streams do also not support a True value in the :paramref:`~copy_bytes.move_file`
                                argument.
    :param dst_file:            destination file name or opened stream (file-like) object. for a destination
                                stream, recoverable copies and copies with a True value in the
                                :paramref:`~copy_bytes.overwrite` argument are not allowed; always use a
                                destination file name if you need a recoverable/overwriting copy.
    :param transferred_bytes:   file offset at which the copy process starts. if not passed for recoverable copies,
                                then `copy_bytes` will determine this value from the file length of the
                                destination file.
    :param total_bytes:         source file size in bytes (needed only if :paramref:`~copy_bytes.src_file` is a
                                stream).
    :param buf_size:            size of copy buffer/chunk in bytes (that get copied before each progress callback).
    :param overwrite:           pass True to allow overwriting of the destination file. if the destination file
                                exists already, then this function will return an error (when this argument gets
                                not specified or has a value that evaluates as False). a non-recoverable copy that
                                starts at offset 0 replaces the content of an existing destination file; in all
                                other cases the copied bytes get appended to the destination file.
    :param move_file:           pass True to delete the source file on complete copying (only works if the source
                                is specified as file name, not as stream).
    :param recoverable:         pass True to allow recoverable file copy (only works if the destination is
                                specified as file name, not as stream).
    :param errors:              pass an empty list to get a list of detailed error messages. if the passed list
                                is not empty, then nothing gets copied and an empty string is returned.
    :param progress_func:       optional callback to dispatch or break/cancel the copy progress for large files.
                                if the callback returns a non-empty value, it will be interpreted as cancel reason,
                                the copy process will be stopped, and an error will be returned.
    :param progress_kwargs:     optional additional kwargs passed to the progress function. the kwargs
                                `total_bytes` and `transferred_bytes` will be updated before the callback.
    :return:                    destination file name/stream as string or empty string on error.

    .. hint::
        this function is extending the compatible Python functions :func:`shutil.copyfileobj`,
        :func:`shutil.copyfile`, :func:`shutil.copy`, :func:`shutil.copy2` and
        :meth:`http.server.SimpleHTTPRequestHandler.copyfile` with recoverability and a progress callback.
        it can also be used as an argument for the :paramref:`~shutil.copytree.copy_function` parameter of e.g.
        :func:`shutil.copytree` and :func:`shutil.move`.
    """
    src_named = isinstance(src_file, str)
    dst_named = isinstance(dst_file, str)
    if not isinstance(errors, list):
        errors = []

    # validate the argument combination before any file gets opened/created
    if progress_func is dummy_function and progress_kwargs:
        errors.append(f"no progress callback function passed but {progress_kwargs=}")
    if not src_named:
        if not total_bytes:
            errors.append("total_bytes has to be specified for source file-stream")
        if move_file:
            errors.append("source file-stream cannot be moved")
    if not dst_named and (overwrite or recoverable):
        errors.append("destination file-stream cannot be overwritten or recovered (pass file name instead)")
    if dst_named and not overwrite and os.path.exists(dst_file):    # type: ignore # mypy does not recognize src_named
        errors.append("destination file exists already (pass True to the overwrite parameter to overwrite)")
    if errors:
        return ""

    # a plain overwrite starting at offset 0 has to truncate an existing destination file; appending is only
    # correct when resuming (recoverable) or when the caller starts at an explicit non-zero offset.
    dst_mode = "wb" if overwrite and not recoverable and not transferred_bytes else "ab+"

    src_fp: BinaryIO = cast(BinaryIO, cast(object, None))
    try:
        # pylint: disable-next=consider-using-with
        src_fp = open(cast(str, src_file), "rb") if src_named else cast(BinaryIO, src_file)
        # pylint: disable-next=consider-using-with
        dst_fp = open(cast(str, dst_file), dst_mode) if dst_named else cast(BinaryIO, dst_file)
    except (OSError, Exception) as ex:      # pylint: disable=broad-exception-caught
        errors.append(str(ex))
        if src_named and src_fp:            # only close the source if it got opened by this function
            src_fp.close()
        return ""

    try:
        if not total_bytes:
            total_bytes = os.fstat(src_fp.fileno()).st_size     # ALT: src_fp.seek(0, 2) and src_fp.tell()

        if recoverable:
            if not transferred_bytes:
                # resume at the end of the already copied part of the destination file
                transferred_bytes = os.fstat(dst_fp.fileno()).st_size
            if transferred_bytes and src_fp.seekable():
                src_fp.seek(transferred_bytes)
            # in recoverable mode the destination gets reopened and closed for each single chunk
            dst_fp.close()

        while transferred_bytes < total_bytes:
            chunk = src_fp.read(buf_size)
            if not chunk:
                errors.append("source chunk is empty before reaching the end of the file")
                break

            if recoverable:
                with open(dst_file, "ab+") as dst_fp:   # type: ignore # mypy does not recognize src_named ensuring str
                    dst_fp.write(chunk)
            else:
                dst_fp.write(chunk)

            transferred_bytes += len(chunk)

            progress_kwargs.update(transferred_bytes=transferred_bytes, total_bytes=total_bytes)
            cancel_reason = progress_func(**progress_kwargs)
            if cancel_reason:
                errors.append(f"progress function request cancellation; {cancel_reason=}")
                break

    except (OSError, Exception) as ex:      # pylint: disable=broad-exception-caught
        errors.append(str(ex))

    finally:
        # streams passed in by the caller stay open; only close what got opened by this function
        if dst_named and not dst_fp.closed:
            dst_fp.close()
        if src_named:
            src_fp.close()

    if errors:
        return ""

    # delete the source only after a complete and error-free copy; done outside of the finally clause, so that it
    # never runs while an exception (e.g. KeyboardInterrupt) is propagating out of the copy loop.
    if move_file:
        os.remove(src_file)                 # type: ignore # silly mypy does not recognize src_named ensuring str

    return str(dst_file)
def file_lines(file_path: str, encoding: Optional[str] = None) -> Tuple[str, ...]:
    """ load a text file and return its lines as a tuple.

    :param file_path:           path/name of the text file to load.
    :param encoding:            encoding used to decode the file content.
    :return:                    tuple with one string per line of the file content (after normalizing the line
                                separators), or an empty tuple if no file content could be read.
    """
    content = read_file_text(file_path, encoding=encoding)
    if not content:
        content = ""        # None (read error) as well as an empty file result in an empty tuple
    normalized = norm_line_sep(content)
    return tuple(normalized.splitlines())
def file_transfer_progress(transferred_bytes: int, total_bytes: int = 0, decimal_places: int = 3) -> str:
    """ return string to display the transfer progress of transferred bytes in short and user-readable format.

    :param transferred_bytes:   number of transferred bytes.
    :param total_bytes:         number of total bytes. if not specified, zero or equal to
                                :paramref:`~file_transfer_progress.transferred_bytes` then only the transferred
                                bytes will be displayed.
    :param decimal_places:      number of decimal places (should be between 0 and 3). only used for values with a
                                fractional part; whole numbers are displayed without decimal places.
    :return:                    formatted string to display the progress of the currently running transfer, e.g.
                                ``'512 Bytes / 2 KBytes'`` or ``'1.500 KBytes'``.
    """
    units = ("", "K", "M", "G", "T")

    def _unit_size(size: float) -> Tuple[float, str]:
        # scale down by 1024 for every unit step, but never beyond the largest unit: sizes of 1024 TBytes or more
        # are displayed as (big) TBytes value instead of getting divided once more without a matching unit.
        for unit in units[:-1]:
            if size < 1024.0:
                return size, unit + "Bytes"
            size /= 1024.0
        return size, units[-1] + "Bytes"

    def _number(value: float) -> str:
        places = decimal_places if value % 1 > 0 else 0
        return f"{value:.{places}f}"

    trs, tru = _unit_size(transferred_bytes)
    if total_bytes and transferred_bytes != total_bytes:
        tos, tou = _unit_size(total_bytes)
        # display the unit only once if transferred and total bytes are in the same unit
        tru = ("" if tru == tou else tru + " ") + f"/ {_number(tos)} {tou}"

    return f"{_number(trs)} {tru}"
def read_file_text(file_path: str, encoding: Optional[str] = None, error_handling: str = 'ignore'
                   ) -> Optional[str]:
    """ load the content of a text file into a string, without raising file or decoding exceptions.

    :param file_path:           file path/name to load into a string.
    :param encoding:            encoding used to load and convert/interpret the file content (see built-in `open`).
    :param error_handling:      passed on as `error_handling` argument to the `read_file` helper.
    :return:                    the file contents as a string. if loading the file fails with an `OSError` (like
                                e.g. `FileNotFoundError` or `PermissionError`) or with a `ValueError`, then an
                                empty string gets returned if :paramref:`~read_file_text.error_handling` is
                                unspecified or set to `'ignore'`, otherwise `None`.
    """
    try:
        content = read_file(file_path, encoding=encoding, error_handling=error_handling)
    except (OSError, ValueError):       # FileNotFoundError and PermissionError are subclasses of OSError
        if error_handling == 'ignore':
            return ""
        return None
    return content
def write_file_text(text_or_lines: Union[str, List[str], Tuple[str, ...]], file_path: str,
                    encoding: Optional[str] = None) -> bool:
    """ write the passed text string or list of line strings into the text file specified by file_path argument.

    :param text_or_lines:       new file content either passed as string or as list/tuple (of any length) of line
                                strings (will be concatenated with the line separator of the current OS:
                                os.linesep).
    :param file_path:           file path/name to write the passed content into (overwriting any previous content!).
    :param encoding:            encoding used to write/convert/interpret the file content to write.
    :return:                    True if the content got written to the file, False on error/exception.
                                this function suppresses and catches `OSError` (including its subclasses like
                                `FileExistsError`, `FileNotFoundError` and `PermissionError`) and `ValueError`.
    """
    if isinstance(text_or_lines, str):
        content = text_or_lines
    else:
        # NOTE(review): if write_file opens the file in text mode with newline translation then os.linesep may
        # get translated a second time on Windows - confirm against the implementation of write_file.
        content = os.linesep.join(text_or_lines)

    try:
        write_file(file_path, content, encoding=encoding)
    except (OSError, ValueError):       # the file-related exception classes are all subclasses of OSError
        return False
    return True
class RegisteredFile:
    """ represents a single file - see also :ref:`registered file` examples. """

    def __init__(self, file_path: str, **kwargs):
        """ initialize the registered file instance.

        :param file_path:       file path string. each folder name in it with the format <property-name>_<value>
                                gets added as file property (see :meth:`.add_property`).
        :param kwargs:          not supported, only there to have compatibility to :class:`CachedFile` to detect
                                invalid kwargs.
        """
        assert not kwargs, "RegisteredFile does not have any kwargs - maybe want to use CachedFile as file_class."
        self.path: str = file_path                  #: file path
        self.stem: str                              #: file basename without extension
        self.ext: str                               #: file name extension
        dir_name, base_name = os.path.split(file_path)
        self.stem, self.ext = os.path.splitext(base_name)

        self.properties: PropertiesType = {}        #: file properties
        # os.path.split() accepts both path separators of the OS, so normalize the alternative separator (e.g. the
        # forward slash on Windows) before the folder names get split; os.path.altsep is None on POSIX systems.
        if os.path.altsep:
            dir_name = dir_name.replace(os.path.altsep, os.path.sep)
        for folder in dir_name.split(os.path.sep):
            parts = folder.split("_", maxsplit=1)   # only the first underscore separates the name from the value
            if len(parts) == 2:
                self.add_property(*parts)

    # NOTE: defining __eq__ without __hash__ makes the instances of this class unhashable (Python data model).
    def __eq__(self, other: object) -> bool:
        """ allow equality checks.

        :param other:           another file object to compare this instance with.
        :return:                True if other is an instance of the class of this instance (or of a subclass of
                                it) and has the same file path, else False.
        """
        return isinstance(other, self.__class__) and other.path == self.path

    def __repr__(self):
        """ for config var storage and eval recovery.

        :return:                evaluable/recoverable representation of this object.
        """
        return f"{self.__class__.__name__}({self.path!r})"

    def __str__(self):
        """ return the file path of the registered file.

        :return:                file path string of this file object.
        """
        return self.path

    def add_property(self, property_name: str, str_value: str):
        """ add a property to this file object instance.

        :param property_name:   name of the property to add (an already existing property gets overwritten).
        :param str_value:       literal of the property value. gets converted to `int` if possible, else to
                                `float` and if both conversions fail then it is stored unchanged as string.
        """
        try:
            property_value: PropertyType = int(str_value)
        except ValueError:
            try:
                property_value = float(str_value)
            except ValueError:
                property_value = str_value
        self.properties[property_name] = property_value
def _default_object_loader(file_obj: FileObject):
    """ default file object loader: open the file and hand the opened file handle over to the caller.

    :param file_obj:            file object; its string representation (``str(file_obj)``) is used as the path
                                of the file to open.
    :return:                    file handle of the file, opened via the built-in :func:`open` with its default
                                mode and encoding. the caller is responsible for closing it.
    """
    file_path = str(file_obj)
    file_handle = open(file_path)   # pylint: disable=unspecified-encoding
    return file_handle
class CachedFile(RegisteredFile):
    """ represents a cacheable registered file object - see also :ref:`cached file` examples. """

    def __init__(self, file_path: str,
                 object_loader: Callable[['CachedFile', ], Any] = _default_object_loader,
                 late_loading: bool = True):
        """ create a cached file object instance.

        :param file_path:       path string of the file.
        :param object_loader:   callable converting the file_obj into a cached object (available
                                via :attr:`~CachedFile.loaded_object`).
        :param late_loading:    pass False to convert/load file_obj cache early, directly at instantiation.
        """
        super().__init__(file_path)
        self.object_loader = object_loader
        self.late_loading = late_loading
        self._loaded_object = None if late_loading else object_loader(self)

    @property
    def loaded_object(self) -> Any:
        """ loaded object class instance property.

        :return:                the loaded and cached file object. with late loading the object loader gets
                                called on the first access of this property; a loader result of None is not
                                cached, so the loader will be called again on the next access.
        """
        # compare against None instead of checking the truthiness, so that a falsy loaded object (like an empty
        # string, an empty container or zero) stays cached and does not get reloaded on every property access.
        if self.late_loading and self._loaded_object is None:
            self._loaded_object = self.object_loader(self)
        return self._loaded_object