chore: add virtual environment to the repository
- Add the backend_service/venv virtual environment - includes all Python dependency packages - Note: the virtual environment is roughly 393 MB and contains 12,655 files
backend_service/venv/lib/python3.13/site-packages/fsspec/__init__.py
@@ -0,0 +1,71 @@
from . import caching
from ._version import __version__  # noqa: F401
from .callbacks import Callback
from .compression import available_compressions
from .core import get_fs_token_paths, open, open_files, open_local, url_to_fs
from .exceptions import FSTimeoutError
from .mapping import FSMap, get_mapper
from .registry import (
    available_protocols,
    filesystem,
    get_filesystem_class,
    register_implementation,
    registry,
)
from .spec import AbstractFileSystem

__all__ = [
    "AbstractFileSystem",
    "FSTimeoutError",
    "FSMap",
    "filesystem",
    "register_implementation",
    "get_filesystem_class",
    "get_fs_token_paths",
    "get_mapper",
    "open",
    "open_files",
    "open_local",
    "registry",
    "caching",
    "Callback",
    "available_protocols",
    "available_compressions",
    "url_to_fs",
]


def process_entries():
    try:
        from importlib.metadata import entry_points
    except ImportError:
        return
    if entry_points is not None:
        try:
            eps = entry_points()
        except TypeError:
            pass  # importlib-metadata < 0.8
        else:
            if hasattr(eps, "select"):  # Python 3.10+ / importlib_metadata >= 3.9.0
                specs = eps.select(group="fsspec.specs")
            else:
                specs = eps.get("fsspec.specs", [])
            registered_names = {}
            for spec in specs:
                err_msg = f"Unable to load filesystem from {spec}"
                name = spec.name
                if name in registered_names:
                    continue
                registered_names[name] = True
                register_implementation(
                    name,
                    spec.value.replace(":", "."),
                    errtxt=err_msg,
                    # We take our implementations as the ones to overload with if
                    # for some reason we encounter some, may be the same, already
                    # registered
                    clobber=True,
                )


process_entries()
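For illustration, a minimal sketch of how the registry helpers re-exported here are typically used; the ``MyFileSystem`` class and the "myproto" protocol name are hypothetical, not part of fsspec.

import fsspec
from fsspec.registry import register_implementation
from fsspec.spec import AbstractFileSystem


class MyFileSystem(AbstractFileSystem):
    # a do-nothing filesystem, only to show the registration round-trip
    protocol = "myproto"


# register the class under its protocol name, then look it up by protocol
register_implementation("myproto", MyFileSystem, clobber=True)
fs = fsspec.filesystem("myproto")
assert isinstance(fs, MyFileSystem)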
backend_service/venv/lib/python3.13/site-packages/fsspec/_version.py
@@ -0,0 +1,34 @@
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    VERSION_TUPLE = object
    COMMIT_ID = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

__version__ = version = '2025.10.0'
__version_tuple__ = version_tuple = (2025, 10, 0)

__commit_id__ = commit_id = None
backend_service/venv/lib/python3.13/site-packages/fsspec/archive.py
@@ -0,0 +1,75 @@
import operator

from fsspec import AbstractFileSystem
from fsspec.utils import tokenize


class AbstractArchiveFileSystem(AbstractFileSystem):
    """
    A generic superclass for implementing Archive-based filesystems.

    Currently, it is shared amongst
    :class:`~fsspec.implementations.zip.ZipFileSystem`,
    :class:`~fsspec.implementations.libarchive.LibArchiveFileSystem` and
    :class:`~fsspec.implementations.tar.TarFileSystem`.
    """

    def __str__(self):
        return f"<Archive-like object {type(self).__name__} at {id(self)}>"

    __repr__ = __str__

    def ukey(self, path):
        return tokenize(path, self.fo, self.protocol)

    def _all_dirnames(self, paths):
        """Returns *all* directory names for each path in paths, including intermediate
        ones.

        Parameters
        ----------
        paths: Iterable of path strings
        """
        if len(paths) == 0:
            return set()

        dirnames = {self._parent(path) for path in paths} - {self.root_marker}
        return dirnames | self._all_dirnames(dirnames)

    def info(self, path, **kwargs):
        self._get_dirs()
        path = self._strip_protocol(path)
        if path in {"", "/"} and self.dir_cache:
            return {"name": "", "type": "directory", "size": 0}
        if path in self.dir_cache:
            return self.dir_cache[path]
        elif path + "/" in self.dir_cache:
            return self.dir_cache[path + "/"]
        else:
            raise FileNotFoundError(path)

    def ls(self, path, detail=True, **kwargs):
        self._get_dirs()
        paths = {}
        for p, f in self.dir_cache.items():
            p = p.rstrip("/")
            if "/" in p:
                root = p.rsplit("/", 1)[0]
            else:
                root = ""
            if root == path.rstrip("/"):
                paths[p] = f
            elif all(
                (a == b)
                for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
            ):
                # root directory entry
                ppath = p.rstrip("/").split("/", 1)[0]
                if ppath not in paths:
                    out = {"name": ppath, "size": 0, "type": "directory"}
                    paths[ppath] = out
        if detail:
            out = sorted(paths.values(), key=operator.itemgetter("name"))
            return out
        else:
            return sorted(paths)
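As a hedged usage sketch of the archive machinery above (ZipFileSystem is one of the subclasses named in the docstring), an in-memory zip can be browsed like any other filesystem; the file names here are arbitrary.

import io
import zipfile

import fsspec

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as z:
    z.writestr("data/hello.txt", "hello world")
buf.seek(0)

# ZipFileSystem accepts a file-like object via ``fo``; ls()/info() go through
# the shared AbstractArchiveFileSystem logic shown above
fs = fsspec.filesystem("zip", fo=buf)
print(fs.ls("data", detail=False))
print(fs.cat("data/hello.txt"))  # b"hello world"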
backend_service/venv/lib/python3.13/site-packages/fsspec/asyn.py (new file, 1097 lines; diff suppressed because it is too large)
backend_service/venv/lib/python3.13/site-packages/fsspec/caching.py (new file, 1004 lines; diff suppressed because it is too large)
backend_service/venv/lib/python3.13/site-packages/fsspec/callbacks.py
@@ -0,0 +1,324 @@
|
||||
from functools import wraps
|
||||
|
||||
|
||||
class Callback:
|
||||
"""
|
||||
Base class and interface for callback mechanism
|
||||
|
||||
This class can be used directly for monitoring file transfers by
|
||||
providing ``callback=Callback(hooks=...)`` (see the ``hooks`` argument,
|
||||
below), or subclassed for more specialised behaviour.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
size: int (optional)
|
||||
Nominal quantity for the value that corresponds to a complete
|
||||
transfer, e.g., total number of tiles or total number of
|
||||
bytes
|
||||
value: int (0)
|
||||
Starting internal counter value
|
||||
hooks: dict or None
|
||||
A dict of named functions to be called on each update. The signature
|
||||
of these must be ``f(size, value, **kwargs)``
|
||||
"""
|
||||
|
||||
def __init__(self, size=None, value=0, hooks=None, **kwargs):
|
||||
self.size = size
|
||||
self.value = value
|
||||
self.hooks = hooks or {}
|
||||
self.kw = kwargs
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, *exc_args):
|
||||
self.close()
|
||||
|
||||
def close(self):
|
||||
"""Close callback."""
|
||||
|
||||
def branched(self, path_1, path_2, **kwargs):
|
||||
"""
|
||||
Return callback for child transfers
|
||||
|
||||
If this callback is operating at a higher level, e.g., put, which may
|
||||
trigger transfers that can also be monitored. The function returns a callback
|
||||
that has to be passed to the child method, e.g., put_file,
|
||||
as `callback=` argument.
|
||||
|
||||
The implementation uses `callback.branch` for compatibility.
|
||||
When implementing callbacks, it is recommended to override this function instead
|
||||
of `branch` and avoid calling `super().branched(...)`.
|
||||
|
||||
Prefer using this function over `branch`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_1: str
|
||||
Child's source path
|
||||
path_2: str
|
||||
Child's destination path
|
||||
**kwargs:
|
||||
Arbitrary keyword arguments
|
||||
|
||||
Returns
|
||||
-------
|
||||
callback: Callback
|
||||
A callback instance to be passed to the child method
|
||||
"""
|
||||
self.branch(path_1, path_2, kwargs)
|
||||
# mutate kwargs so that we can force the caller to pass "callback=" explicitly
|
||||
return kwargs.pop("callback", DEFAULT_CALLBACK)
|
||||
|
||||
def branch_coro(self, fn):
|
||||
"""
|
||||
Wrap a coroutine and pass a new child callback to it.
|
||||
"""
|
||||
|
||||
@wraps(fn)
|
||||
async def func(path1, path2: str, **kwargs):
|
||||
with self.branched(path1, path2, **kwargs) as child:
|
||||
return await fn(path1, path2, callback=child, **kwargs)
|
||||
|
||||
return func
|
||||
|
||||
def set_size(self, size):
|
||||
"""
|
||||
Set the internal maximum size attribute
|
||||
|
||||
Usually called if not initially set at instantiation. Note that this
|
||||
triggers a ``call()``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
size: int
|
||||
"""
|
||||
self.size = size
|
||||
self.call()
|
||||
|
||||
def absolute_update(self, value):
|
||||
"""
|
||||
Set the internal value state
|
||||
|
||||
Triggers ``call()``
|
||||
|
||||
Parameters
|
||||
----------
|
||||
value: int
|
||||
"""
|
||||
self.value = value
|
||||
self.call()
|
||||
|
||||
def relative_update(self, inc=1):
|
||||
"""
|
||||
Delta increment the internal counter
|
||||
|
||||
Triggers ``call()``
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inc: int
|
||||
"""
|
||||
self.value += inc
|
||||
self.call()
|
||||
|
||||
def call(self, hook_name=None, **kwargs):
|
||||
"""
|
||||
Execute hook(s) with current state
|
||||
|
||||
Each function is passed the internal size and current value
|
||||
|
||||
Parameters
|
||||
----------
|
||||
hook_name: str or None
|
||||
If given, execute on this hook
|
||||
kwargs: passed on to (all) hook(s)
|
||||
"""
|
||||
if not self.hooks:
|
||||
return
|
||||
kw = self.kw.copy()
|
||||
kw.update(kwargs)
|
||||
if hook_name:
|
||||
if hook_name not in self.hooks:
|
||||
return
|
||||
return self.hooks[hook_name](self.size, self.value, **kw)
|
||||
for hook in self.hooks.values() or []:
|
||||
hook(self.size, self.value, **kw)
|
||||
|
||||
def wrap(self, iterable):
|
||||
"""
|
||||
Wrap an iterable to call ``relative_update`` on each iteration
|
||||
|
||||
Parameters
|
||||
----------
|
||||
iterable: Iterable
|
||||
The iterable that is being wrapped
|
||||
"""
|
||||
for item in iterable:
|
||||
self.relative_update()
|
||||
yield item
|
||||
|
||||
def branch(self, path_1, path_2, kwargs):
|
||||
"""
|
||||
Set callbacks for child transfers
|
||||
|
||||
If this callback is operating at a higher level, e.g., put, which may
|
||||
trigger transfers that can also be monitored. The passed kwargs are
|
||||
to be *mutated* to add ``callback=``, if this class supports branching
|
||||
to children.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_1: str
|
||||
Child's source path
|
||||
path_2: str
|
||||
Child's destination path
|
||||
kwargs: dict
|
||||
arguments passed to child method, e.g., put_file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
return None
|
||||
|
||||
def no_op(self, *_, **__):
|
||||
pass
|
||||
|
||||
def __getattr__(self, item):
|
||||
"""
|
||||
If undefined methods are called on this class, nothing happens
|
||||
"""
|
||||
return self.no_op
|
||||
|
||||
@classmethod
|
||||
def as_callback(cls, maybe_callback=None):
|
||||
"""Transform callback=... into Callback instance
|
||||
|
||||
For the special value of ``None``, return the global instance of
|
||||
``NoOpCallback``. This is an alternative to including
|
||||
``callback=DEFAULT_CALLBACK`` directly in a method signature.
|
||||
"""
|
||||
if maybe_callback is None:
|
||||
return DEFAULT_CALLBACK
|
||||
return maybe_callback
|
||||
|
||||
|
||||
class NoOpCallback(Callback):
|
||||
"""
|
||||
This implementation of Callback does exactly nothing
|
||||
"""
|
||||
|
||||
def call(self, *args, **kwargs):
|
||||
return None
|
||||
|
||||
|
||||
class DotPrinterCallback(Callback):
|
||||
"""
|
||||
Simple example Callback implementation
|
||||
|
||||
Almost identical to Callback with a hook that prints a char; here we
|
||||
demonstrate how the outer layer may print "#" and the inner layer "."
|
||||
"""
|
||||
|
||||
def __init__(self, chr_to_print="#", **kwargs):
|
||||
self.chr = chr_to_print
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def branch(self, path_1, path_2, kwargs):
|
||||
"""Mutate kwargs to add new instance with different print char"""
|
||||
kwargs["callback"] = DotPrinterCallback(".")
|
||||
|
||||
def call(self, **kwargs):
|
||||
"""Just outputs a character"""
|
||||
print(self.chr, end="")
|
||||
|
||||
|
||||
class TqdmCallback(Callback):
|
||||
"""
|
||||
A callback to display a progress bar using tqdm
|
||||
|
||||
Parameters
|
||||
----------
|
||||
tqdm_kwargs : dict, (optional)
|
||||
Any argument accepted by the tqdm constructor.
|
||||
See the `tqdm doc <https://tqdm.github.io/docs/tqdm/#__init__>`_.
|
||||
Will be forwarded to `tqdm_cls`.
|
||||
tqdm_cls: (optional)
|
||||
subclass of `tqdm.tqdm`. If not passed, it will default to `tqdm.tqdm`.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import fsspec
|
||||
>>> from fsspec.callbacks import TqdmCallback
|
||||
>>> fs = fsspec.filesystem("memory")
|
||||
>>> path2distant_data = "/your-path"
|
||||
>>> fs.upload(
|
||||
".",
|
||||
path2distant_data,
|
||||
recursive=True,
|
||||
callback=TqdmCallback(),
|
||||
)
|
||||
|
||||
You can forward args to tqdm using the ``tqdm_kwargs`` parameter.
|
||||
|
||||
>>> fs.upload(
|
||||
".",
|
||||
path2distant_data,
|
||||
recursive=True,
|
||||
callback=TqdmCallback(tqdm_kwargs={"desc": "Your tqdm description"}),
|
||||
)
|
||||
|
||||
You can also customize the progress bar by passing a subclass of `tqdm`.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
class TqdmFormat(tqdm):
|
||||
'''Provides a `total_time` format parameter'''
|
||||
@property
|
||||
def format_dict(self):
|
||||
d = super().format_dict
|
||||
total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1)
|
||||
d.update(total_time=self.format_interval(total_time) + " in total")
|
||||
return d
|
||||
|
||||
>>> with TqdmCallback(
|
||||
tqdm_kwargs={
|
||||
"desc": "desc",
|
||||
"bar_format": "{total_time}: {percentage:.0f}%|{bar}{r_bar}",
|
||||
},
|
||||
tqdm_cls=TqdmFormat,
|
||||
) as callback:
|
||||
fs.upload(".", path2distant_data, recursive=True, callback=callback)
|
||||
"""
|
||||
|
||||
def __init__(self, tqdm_kwargs=None, *args, **kwargs):
|
||||
try:
|
||||
from tqdm import tqdm
|
||||
|
||||
except ImportError as exce:
|
||||
raise ImportError(
|
||||
"Using TqdmCallback requires tqdm to be installed"
|
||||
) from exce
|
||||
|
||||
self._tqdm_cls = kwargs.pop("tqdm_cls", tqdm)
|
||||
self._tqdm_kwargs = tqdm_kwargs or {}
|
||||
self.tqdm = None
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def call(self, *args, **kwargs):
|
||||
if self.tqdm is None:
|
||||
self.tqdm = self._tqdm_cls(total=self.size, **self._tqdm_kwargs)
|
||||
self.tqdm.total = self.size
|
||||
self.tqdm.update(self.value - self.tqdm.n)
|
||||
|
||||
def close(self):
|
||||
if self.tqdm is not None:
|
||||
self.tqdm.close()
|
||||
self.tqdm = None
|
||||
|
||||
def __del__(self):
|
||||
return self.close()
|
||||
|
||||
|
||||
DEFAULT_CALLBACK = _DEFAULT_CALLBACK = NoOpCallback()
|
||||
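A short, hedged sketch of the ``hooks`` mechanism described in the ``Callback`` docstring above; the hook name "progress" is arbitrary.

from fsspec.callbacks import Callback


def progress(size, value, **kwargs):
    # hooks must accept (size, value, **kwargs)
    print(f"{value}/{size}")


cb = Callback(size=3, hooks={"progress": progress})
for _ in cb.wrap(["a", "b", "c"]):  # relative_update() fires the hook per item
    pass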
backend_service/venv/lib/python3.13/site-packages/fsspec/compression.py
@@ -0,0 +1,182 @@
|
||||
"""Helper functions for a standard streaming compression API"""
|
||||
|
||||
from zipfile import ZipFile
|
||||
|
||||
import fsspec.utils
|
||||
from fsspec.spec import AbstractBufferedFile
|
||||
|
||||
|
||||
def noop_file(file, mode, **kwargs):
|
||||
return file
|
||||
|
||||
|
||||
# TODO: files should also be available as contexts
|
||||
# should be functions of the form func(infile, mode=, **kwargs) -> file-like
|
||||
compr = {None: noop_file}
|
||||
|
||||
|
||||
def register_compression(name, callback, extensions, force=False):
|
||||
"""Register an "inferable" file compression type.
|
||||
|
||||
Registers transparent file compression type for use with fsspec.open.
|
||||
Compression can be specified by name in open, or "infer"-ed for any files
|
||||
ending with the given extensions.
|
||||
|
||||
Args:
|
||||
name: (str) The compression type name. Eg. "gzip".
|
||||
callback: A callable of form (infile, mode, **kwargs) -> file-like.
|
||||
Accepts an input file-like object, the target mode and kwargs.
|
||||
Returns a wrapped file-like object.
|
||||
extensions: (str, Iterable[str]) A file extension, or list of file
|
||||
extensions for which to infer this compression scheme. Eg. "gz".
|
||||
force: (bool) Force re-registration of compression type or extensions.
|
||||
|
||||
Raises:
|
||||
ValueError: If name or extensions already registered, and not force.
|
||||
|
||||
"""
|
||||
if isinstance(extensions, str):
|
||||
extensions = [extensions]
|
||||
|
||||
# Validate registration
|
||||
if name in compr and not force:
|
||||
raise ValueError(f"Duplicate compression registration: {name}")
|
||||
|
||||
for ext in extensions:
|
||||
if ext in fsspec.utils.compressions and not force:
|
||||
raise ValueError(f"Duplicate compression file extension: {ext} ({name})")
|
||||
|
||||
compr[name] = callback
|
||||
|
||||
for ext in extensions:
|
||||
fsspec.utils.compressions[ext] = name
|
||||
|
||||
|
||||
def unzip(infile, mode="rb", filename=None, **kwargs):
|
||||
if "r" not in mode:
|
||||
filename = filename or "file"
|
||||
z = ZipFile(infile, mode="w", **kwargs)
|
||||
fo = z.open(filename, mode="w")
|
||||
fo.close = lambda closer=fo.close: closer() or z.close()
|
||||
return fo
|
||||
z = ZipFile(infile)
|
||||
if filename is None:
|
||||
filename = z.namelist()[0]
|
||||
return z.open(filename, mode="r", **kwargs)
|
||||
|
||||
|
||||
register_compression("zip", unzip, "zip")
|
||||
|
||||
try:
|
||||
from bz2 import BZ2File
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
register_compression("bz2", BZ2File, "bz2")
|
||||
|
||||
try: # pragma: no cover
|
||||
from isal import igzip
|
||||
|
||||
def isal(infile, mode="rb", **kwargs):
|
||||
return igzip.IGzipFile(fileobj=infile, mode=mode, **kwargs)
|
||||
|
||||
register_compression("gzip", isal, "gz")
|
||||
except ImportError:
|
||||
from gzip import GzipFile
|
||||
|
||||
register_compression(
|
||||
"gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz"
|
||||
)
|
||||
|
||||
try:
|
||||
from lzma import LZMAFile
|
||||
|
||||
register_compression("lzma", LZMAFile, "lzma")
|
||||
register_compression("xz", LZMAFile, "xz")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import lzmaffi
|
||||
|
||||
register_compression("lzma", lzmaffi.LZMAFile, "lzma", force=True)
|
||||
register_compression("xz", lzmaffi.LZMAFile, "xz", force=True)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class SnappyFile(AbstractBufferedFile):
|
||||
def __init__(self, infile, mode, **kwargs):
|
||||
import snappy
|
||||
|
||||
super().__init__(
|
||||
fs=None, path="snappy", mode=mode.strip("b") + "b", size=999999999, **kwargs
|
||||
)
|
||||
self.infile = infile
|
||||
if "r" in mode:
|
||||
self.codec = snappy.StreamDecompressor()
|
||||
else:
|
||||
self.codec = snappy.StreamCompressor()
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
self.buffer.seek(0)
|
||||
out = self.codec.add_chunk(self.buffer.read())
|
||||
self.infile.write(out)
|
||||
return True
|
||||
|
||||
def seek(self, loc, whence=0):
|
||||
raise NotImplementedError("SnappyFile is not seekable")
|
||||
|
||||
def seekable(self):
|
||||
return False
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Get the specified set of bytes from remote"""
|
||||
data = self.infile.read(end - start)
|
||||
return self.codec.decompress(data)
|
||||
|
||||
|
||||
try:
|
||||
import snappy
|
||||
|
||||
snappy.compress(b"")
|
||||
# Snappy may use the .sz file extension, but this is not part of the
|
||||
# standard implementation.
|
||||
register_compression("snappy", SnappyFile, [])
|
||||
|
||||
except (ImportError, NameError, AttributeError):
|
||||
pass
|
||||
|
||||
try:
|
||||
import lz4.frame
|
||||
|
||||
register_compression("lz4", lz4.frame.open, "lz4")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
# zstd in the standard library for python >= 3.14
|
||||
from compression.zstd import ZstdFile
|
||||
|
||||
register_compression("zstd", ZstdFile, "zst")
|
||||
|
||||
except ImportError:
|
||||
try:
|
||||
import zstandard as zstd
|
||||
|
||||
def zstandard_file(infile, mode="rb"):
|
||||
if "r" in mode:
|
||||
cctx = zstd.ZstdDecompressor()
|
||||
return cctx.stream_reader(infile)
|
||||
else:
|
||||
cctx = zstd.ZstdCompressor(level=10)
|
||||
return cctx.stream_writer(infile)
|
||||
|
||||
register_compression("zstd", zstandard_file, "zst")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
def available_compressions():
|
||||
"""Return a list of the implemented compressions."""
|
||||
return list(compr)
|
||||
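A hedged sketch of ``register_compression`` in use: the stdlib ``GzipFile`` is registered again under a made-up ".gzip" extension, with ``force=True`` to bypass the duplicate checks above. The registered name can then be passed as ``compression=`` to ``fsspec.open``, or inferred from a filename suffix.

from gzip import GzipFile

from fsspec.compression import available_compressions, register_compression

register_compression(
    "gzip",
    lambda f, **kwargs: GzipFile(fileobj=f, **kwargs),
    "gzip",  # extra extension to infer from, i.e. "*.gzip"
    force=True,
)
assert "gzip" in available_compressions()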
backend_service/venv/lib/python3.13/site-packages/fsspec/config.py
@@ -0,0 +1,131 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import configparser
|
||||
import json
|
||||
import os
|
||||
import warnings
|
||||
from typing import Any
|
||||
|
||||
conf: dict[str, dict[str, Any]] = {}
|
||||
default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec")
|
||||
conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir)
|
||||
|
||||
|
||||
def set_conf_env(conf_dict, envdict=os.environ):
|
||||
"""Set config values from environment variables
|
||||
|
||||
Looks for variables of the form ``FSSPEC_<protocol>`` and
|
||||
``FSSPEC_<protocol>_<kwarg>``. For ``FSSPEC_<protocol>`` the value is parsed
|
||||
as a json dictionary and used to ``update`` the config of the
|
||||
corresponding protocol. For ``FSSPEC_<protocol>_<kwarg>`` there is no
|
||||
attempt to convert the string value, but the kwarg keys will be lower-cased.
|
||||
|
||||
The ``FSSPEC_<protocol>_<kwarg>`` variables are applied after the
|
||||
``FSSPEC_<protocol>`` ones.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
conf_dict : dict(str, dict)
|
||||
This dict will be mutated
|
||||
envdict : dict-like(str, str)
|
||||
Source for the values - usually the real environment
|
||||
"""
|
||||
kwarg_keys = []
|
||||
for key in envdict:
|
||||
if key.startswith("FSSPEC_") and len(key) > 7 and key[7] != "_":
|
||||
if key.count("_") > 1:
|
||||
kwarg_keys.append(key)
|
||||
continue
|
||||
try:
|
||||
value = json.loads(envdict[key])
|
||||
except json.decoder.JSONDecodeError as ex:
|
||||
warnings.warn(
|
||||
f"Ignoring environment variable {key} due to a parse failure: {ex}"
|
||||
)
|
||||
else:
|
||||
if isinstance(value, dict):
|
||||
_, proto = key.split("_", 1)
|
||||
conf_dict.setdefault(proto.lower(), {}).update(value)
|
||||
else:
|
||||
warnings.warn(
|
||||
f"Ignoring environment variable {key} due to not being a dict:"
|
||||
f" {type(value)}"
|
||||
)
|
||||
elif key.startswith("FSSPEC"):
|
||||
warnings.warn(
|
||||
f"Ignoring environment variable {key} due to having an unexpected name"
|
||||
)
|
||||
|
||||
for key in kwarg_keys:
|
||||
_, proto, kwarg = key.split("_", 2)
|
||||
conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[key]
|
||||
|
||||
|
||||
def set_conf_files(cdir, conf_dict):
|
||||
"""Set config values from files
|
||||
|
||||
Scans for INI and JSON files in the given directory, and uses their
|
||||
contents to set the config. In case of repeated values, later values
|
||||
win.
|
||||
|
||||
In the case of INI files, all values are strings, and these will not
|
||||
be converted.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cdir : str
|
||||
Directory to search
|
||||
conf_dict : dict(str, dict)
|
||||
This dict will be mutated
|
||||
"""
|
||||
if not os.path.isdir(cdir):
|
||||
return
|
||||
allfiles = sorted(os.listdir(cdir))
|
||||
for fn in allfiles:
|
||||
if fn.endswith(".ini"):
|
||||
ini = configparser.ConfigParser()
|
||||
ini.read(os.path.join(cdir, fn))
|
||||
for key in ini:
|
||||
if key == "DEFAULT":
|
||||
continue
|
||||
conf_dict.setdefault(key, {}).update(dict(ini[key]))
|
||||
if fn.endswith(".json"):
|
||||
with open(os.path.join(cdir, fn)) as f:
|
||||
js = json.load(f)
|
||||
for key in js:
|
||||
conf_dict.setdefault(key, {}).update(dict(js[key]))
|
||||
|
||||
|
||||
def apply_config(cls, kwargs, conf_dict=None):
|
||||
"""Supply default values for kwargs when instantiating class
|
||||
|
||||
Augments the passed kwargs, by finding entries in the config dict
|
||||
which match the class's ``.protocol`` attribute (one or more str)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cls : file system implementation
|
||||
kwargs : dict
|
||||
conf_dict : dict of dict
|
||||
Typically this is the global configuration
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict : the modified set of kwargs
|
||||
"""
|
||||
if conf_dict is None:
|
||||
conf_dict = conf
|
||||
protos = cls.protocol if isinstance(cls.protocol, (tuple, list)) else [cls.protocol]
|
||||
kw = {}
|
||||
for proto in protos:
|
||||
# default kwargs from the current state of the config
|
||||
if proto in conf_dict:
|
||||
kw.update(conf_dict[proto])
|
||||
# explicit kwargs always win
|
||||
kw.update(**kwargs)
|
||||
kwargs = kw
|
||||
return kwargs
|
||||
|
||||
|
||||
set_conf_files(conf_dir, conf)
|
||||
set_conf_env(conf)
|
||||
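A hedged sketch of the environment-variable parsing implemented by ``set_conf_env`` above, driven by an explicit ``envdict`` instead of the real environment; the option names are made up.

from fsspec.config import set_conf_env

cfg = {}
set_conf_env(
    cfg,
    envdict={
        "FSSPEC_MEMORY": '{"some_option": true}',  # JSON dict form
        "FSSPEC_MEMORY_OTHER": "value",  # per-kwarg form, kept as a string
    },
)
assert cfg == {"memory": {"some_option": True, "other": "value"}}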
backend_service/venv/lib/python3.13/site-packages/fsspec/conftest.py
@@ -0,0 +1,125 @@
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from collections import deque
|
||||
from collections.abc import Generator, Sequence
|
||||
|
||||
import pytest
|
||||
|
||||
import fsspec
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def m():
|
||||
"""
|
||||
Fixture providing a memory filesystem.
|
||||
"""
|
||||
m = fsspec.filesystem("memory")
|
||||
m.store.clear()
|
||||
m.pseudo_dirs.clear()
|
||||
m.pseudo_dirs.append("")
|
||||
try:
|
||||
yield m
|
||||
finally:
|
||||
m.store.clear()
|
||||
m.pseudo_dirs.clear()
|
||||
m.pseudo_dirs.append("")
|
||||
|
||||
|
||||
class InstanceCacheInspector:
|
||||
"""
|
||||
Helper class to inspect instance caches of filesystem classes in tests.
|
||||
"""
|
||||
|
||||
def clear(self) -> None:
|
||||
"""
|
||||
Clear instance caches of all currently imported filesystem classes.
|
||||
"""
|
||||
classes = deque([fsspec.spec.AbstractFileSystem])
|
||||
while classes:
|
||||
cls = classes.popleft()
|
||||
cls.clear_instance_cache()
|
||||
classes.extend(cls.__subclasses__())
|
||||
|
||||
def gather_counts(self, *, omit_zero: bool = True) -> dict[str, int]:
|
||||
"""
|
||||
Gather counts of filesystem instances in the instance caches
|
||||
of all currently imported filesystem classes.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
omit_zero:
|
||||
Whether to omit instance types with no cached instances.
|
||||
"""
|
||||
out: dict[str, int] = {}
|
||||
classes = deque([fsspec.spec.AbstractFileSystem])
|
||||
while classes:
|
||||
cls = classes.popleft()
|
||||
count = len(cls._cache) # there is no public interface for the cache
|
||||
# note: skip intermediate AbstractFileSystem subclasses
|
||||
# if they proxy the protocol attribute via a property.
|
||||
if isinstance(cls.protocol, (Sequence, str)):
|
||||
key = cls.protocol if isinstance(cls.protocol, str) else cls.protocol[0]
|
||||
if count or not omit_zero:
|
||||
out[key] = count
|
||||
classes.extend(cls.__subclasses__())
|
||||
return out
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def instance_caches() -> Generator[InstanceCacheInspector, None, None]:
|
||||
"""
|
||||
Fixture to ensure empty filesystem instance caches before and after a test.
|
||||
|
||||
Used by default for all tests.
|
||||
Clears caches of all imported filesystem classes.
|
||||
Can be used to write test assertions about instance caches.
|
||||
|
||||
Usage:
|
||||
|
||||
def test_something(instance_caches):
|
||||
# Test code here
|
||||
fsspec.open("file://abc")
|
||||
fsspec.open("memory://foo/bar")
|
||||
|
||||
# Test assertion
|
||||
assert instance_caches.gather_counts() == {"file": 1, "memory": 1}
|
||||
|
||||
Returns
|
||||
-------
|
||||
instance_caches: An instance cache inspector for clearing and inspecting caches.
|
||||
"""
|
||||
ic = InstanceCacheInspector()
|
||||
|
||||
ic.clear()
|
||||
try:
|
||||
yield ic
|
||||
finally:
|
||||
ic.clear()
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def ftp_writable(tmpdir):
|
||||
"""
|
||||
Fixture providing a writable FTP filesystem.
|
||||
"""
|
||||
pytest.importorskip("pyftpdlib")
|
||||
|
||||
d = str(tmpdir)
|
||||
with open(os.path.join(d, "out"), "wb") as f:
|
||||
f.write(b"hello" * 10000)
|
||||
P = subprocess.Popen(
|
||||
[sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"]
|
||||
)
|
||||
try:
|
||||
time.sleep(1)
|
||||
yield "localhost", 2121, "user", "pass"
|
||||
finally:
|
||||
P.terminate()
|
||||
P.wait()
|
||||
try:
|
||||
shutil.rmtree(tmpdir)
|
||||
except Exception:
|
||||
pass
|
||||
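A hedged sketch of a test relying on the ``m`` (memory filesystem) fixture defined above; path and payload are arbitrary.

def test_roundtrip(m):
    m.pipe("/folder/a.txt", b"payload")  # write bytes into the memory store
    assert m.cat("/folder/a.txt") == b"payload"  # the fixture clears the store afterwards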
backend_service/venv/lib/python3.13/site-packages/fsspec/core.py (new file, 743 lines)
@@ -0,0 +1,743 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from glob import has_magic
|
||||
from pathlib import Path
|
||||
|
||||
# for backwards compat, we export cache things from here too
|
||||
from fsspec.caching import ( # noqa: F401
|
||||
BaseCache,
|
||||
BlockCache,
|
||||
BytesCache,
|
||||
MMapCache,
|
||||
ReadAheadCache,
|
||||
caches,
|
||||
)
|
||||
from fsspec.compression import compr
|
||||
from fsspec.config import conf
|
||||
from fsspec.registry import filesystem, get_filesystem_class
|
||||
from fsspec.utils import (
|
||||
_unstrip_protocol,
|
||||
build_name_function,
|
||||
infer_compression,
|
||||
stringify_path,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("fsspec")
|
||||
|
||||
|
||||
class OpenFile:
|
||||
"""
|
||||
File-like object to be used in a context
|
||||
|
||||
Can layer (buffered) text-mode and compression over any file-system, which
|
||||
are typically binary-only.
|
||||
|
||||
These instances are safe to serialize, as the low-level file object
|
||||
is not created until invoked using ``with``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fs: FileSystem
|
||||
The file system to use for opening the file. Should be a subclass or duck-type
|
||||
with ``fsspec.spec.AbstractFileSystem``
|
||||
path: str
|
||||
Location to open
|
||||
mode: str like 'rb', optional
|
||||
Mode of the opened file
|
||||
compression: str or None, optional
|
||||
Compression to apply
|
||||
encoding: str or None, optional
|
||||
The encoding to use if opened in text mode.
|
||||
errors: str or None, optional
|
||||
How to handle encoding errors if opened in text mode.
|
||||
newline: None or str
|
||||
Passed to TextIOWrapper in text mode, how to handle line endings.
|
||||
autoopen: bool
|
||||
If True, calls open() immediately. Mostly used by pickle
|
||||
pos: int
|
||||
If given and autoopen is True, seek to this location immediately
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
path,
|
||||
mode="rb",
|
||||
compression=None,
|
||||
encoding=None,
|
||||
errors=None,
|
||||
newline=None,
|
||||
):
|
||||
self.fs = fs
|
||||
self.path = path
|
||||
self.mode = mode
|
||||
self.compression = get_compression(path, compression)
|
||||
self.encoding = encoding
|
||||
self.errors = errors
|
||||
self.newline = newline
|
||||
self.fobjects = []
|
||||
|
||||
def __reduce__(self):
|
||||
return (
|
||||
OpenFile,
|
||||
(
|
||||
self.fs,
|
||||
self.path,
|
||||
self.mode,
|
||||
self.compression,
|
||||
self.encoding,
|
||||
self.errors,
|
||||
self.newline,
|
||||
),
|
||||
)
|
||||
|
||||
def __repr__(self):
|
||||
return f"<OpenFile '{self.path}'>"
|
||||
|
||||
def __enter__(self):
|
||||
mode = self.mode.replace("t", "").replace("b", "") + "b"
|
||||
|
||||
try:
|
||||
f = self.fs.open(self.path, mode=mode)
|
||||
except FileNotFoundError as e:
|
||||
if has_magic(self.path):
|
||||
raise FileNotFoundError(
|
||||
"%s not found. The URL contains glob characters: you maybe needed\n"
|
||||
"to pass expand=True in fsspec.open() or the storage_options of \n"
|
||||
"your library. You can also set the config value 'open_expand'\n"
|
||||
"before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True.",
|
||||
self.path,
|
||||
) from e
|
||||
raise
|
||||
|
||||
self.fobjects = [f]
|
||||
|
||||
if self.compression is not None:
|
||||
compress = compr[self.compression]
|
||||
f = compress(f, mode=mode[0])
|
||||
self.fobjects.append(f)
|
||||
|
||||
if "b" not in self.mode:
|
||||
# assume, for example, that 'r' is equivalent to 'rt' as in builtin
|
||||
f = PickleableTextIOWrapper(
|
||||
f, encoding=self.encoding, errors=self.errors, newline=self.newline
|
||||
)
|
||||
self.fobjects.append(f)
|
||||
|
||||
return self.fobjects[-1]
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.close()
|
||||
|
||||
@property
|
||||
def full_name(self):
|
||||
return _unstrip_protocol(self.path, self.fs)
|
||||
|
||||
def open(self):
|
||||
"""Materialise this as a real open file without context
|
||||
|
||||
The OpenFile object should be explicitly closed to avoid enclosed file
|
||||
instances persisting. You must, therefore, keep a reference to the OpenFile
|
||||
during the life of the file-like it generates.
|
||||
"""
|
||||
return self.__enter__()
|
||||
|
||||
def close(self):
|
||||
"""Close all encapsulated file objects"""
|
||||
for f in reversed(self.fobjects):
|
||||
if "r" not in self.mode and not f.closed:
|
||||
f.flush()
|
||||
f.close()
|
||||
self.fobjects.clear()
|
||||
|
||||
|
||||
class OpenFiles(list):
|
||||
"""List of OpenFile instances
|
||||
|
||||
Can be used in a single context, which opens and closes all of the
|
||||
contained files. Normal list access to get the elements works as
|
||||
normal.
|
||||
|
||||
A special case is made for caching filesystems - the files will
|
||||
be down/uploaded together at the start or end of the context, and
|
||||
this may happen concurrently, if the target filesystem supports it.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, mode="rb", fs=None):
|
||||
self.mode = mode
|
||||
self.fs = fs
|
||||
self.files = []
|
||||
super().__init__(*args)
|
||||
|
||||
def __enter__(self):
|
||||
if self.fs is None:
|
||||
raise ValueError("Context has already been used")
|
||||
|
||||
fs = self.fs
|
||||
while True:
|
||||
if hasattr(fs, "open_many"):
|
||||
# check for concurrent cache download; or set up for upload
|
||||
self.files = fs.open_many(self)
|
||||
return self.files
|
||||
if hasattr(fs, "fs") and fs.fs is not None:
|
||||
fs = fs.fs
|
||||
else:
|
||||
break
|
||||
return [s.__enter__() for s in self]
|
||||
|
||||
def __exit__(self, *args):
|
||||
fs = self.fs
|
||||
[s.__exit__(*args) for s in self]
|
||||
if "r" not in self.mode:
|
||||
while True:
|
||||
if hasattr(fs, "open_many"):
|
||||
# check for concurrent cache upload
|
||||
fs.commit_many(self.files)
|
||||
return
|
||||
if hasattr(fs, "fs") and fs.fs is not None:
|
||||
fs = fs.fs
|
||||
else:
|
||||
break
|
||||
|
||||
def __getitem__(self, item):
|
||||
out = super().__getitem__(item)
|
||||
if isinstance(item, slice):
|
||||
return OpenFiles(out, mode=self.mode, fs=self.fs)
|
||||
return out
|
||||
|
||||
def __repr__(self):
|
||||
return f"<List of {len(self)} OpenFile instances>"
|
||||
|
||||
|
||||
def open_files(
|
||||
urlpath,
|
||||
mode="rb",
|
||||
compression=None,
|
||||
encoding="utf8",
|
||||
errors=None,
|
||||
name_function=None,
|
||||
num=1,
|
||||
protocol=None,
|
||||
newline=None,
|
||||
auto_mkdir=True,
|
||||
expand=True,
|
||||
**kwargs,
|
||||
):
|
||||
"""Given a path or paths, return a list of ``OpenFile`` objects.
|
||||
|
||||
For writing, a str path must contain the "*" character, which will be filled
|
||||
in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2.
|
||||
|
||||
For either reading or writing, can instead provide explicit list of paths.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
urlpath: string or list
|
||||
Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
|
||||
to read from alternative filesystems. To read from multiple files you
|
||||
can pass a globstring or a list of paths, with the caveat that they
|
||||
must all have the same protocol.
|
||||
mode: 'rb', 'wt', etc.
|
||||
compression: string or None
|
||||
If given, open file using compression codec. Can either be a compression
|
||||
name (a key in ``fsspec.compression.compr``) or "infer" to guess the
|
||||
compression from the filename suffix.
|
||||
encoding: str
|
||||
For text mode only
|
||||
errors: None or str
|
||||
Passed to TextIOWrapper in text mode
|
||||
name_function: function or None
|
||||
if opening a set of files for writing, those files do not yet exist,
|
||||
so we need to generate their names by formatting the urlpath for
|
||||
each sequence number
|
||||
num: int [1]
|
||||
if writing mode, number of files we expect to create (passed to
|
||||
name_function)
|
||||
protocol: str or None
|
||||
If given, overrides the protocol found in the URL.
|
||||
newline: bytes or None
|
||||
Used for line terminator in text mode. If None, uses system default;
|
||||
if blank, uses no translation.
|
||||
auto_mkdir: bool (True)
|
||||
If in write mode, this will ensure the target directory exists before
|
||||
writing, by calling ``fs.mkdirs(exist_ok=True)``.
|
||||
expand: bool
|
||||
**kwargs: dict
|
||||
Extra options that make sense to a particular storage connection, e.g.
|
||||
host, port, username, password, etc.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> files = open_files('2015-*-*.csv') # doctest: +SKIP
|
||||
>>> files = open_files(
|
||||
... 's3://bucket/2015-*-*.csv.gz', compression='gzip'
|
||||
... ) # doctest: +SKIP
|
||||
|
||||
Returns
|
||||
-------
|
||||
An ``OpenFiles`` instance, which is a list of ``OpenFile`` objects that can
|
||||
be used as a single context
|
||||
|
||||
Notes
|
||||
-----
|
||||
For a full list of the available protocols and the implementations that
|
||||
they map across to see the latest online documentation:
|
||||
|
||||
- For implementations built into ``fsspec`` see
|
||||
https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
|
||||
- For implementations in separate packages see
|
||||
https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
|
||||
"""
|
||||
fs, fs_token, paths = get_fs_token_paths(
|
||||
urlpath,
|
||||
mode,
|
||||
num=num,
|
||||
name_function=name_function,
|
||||
storage_options=kwargs,
|
||||
protocol=protocol,
|
||||
expand=expand,
|
||||
)
|
||||
if fs.protocol == "file":
|
||||
fs.auto_mkdir = auto_mkdir
|
||||
elif "r" not in mode and auto_mkdir:
|
||||
parents = {fs._parent(path) for path in paths}
|
||||
for parent in parents:
|
||||
try:
|
||||
fs.makedirs(parent, exist_ok=True)
|
||||
except PermissionError:
|
||||
pass
|
||||
return OpenFiles(
|
||||
[
|
||||
OpenFile(
|
||||
fs,
|
||||
path,
|
||||
mode=mode,
|
||||
compression=compression,
|
||||
encoding=encoding,
|
||||
errors=errors,
|
||||
newline=newline,
|
||||
)
|
||||
for path in paths
|
||||
],
|
||||
mode=mode,
|
||||
fs=fs,
|
||||
)
|
||||
|
||||
|
||||
def _un_chain(path, kwargs):
|
||||
# Avoid a circular import
|
||||
from fsspec.implementations.chained import ChainedFileSystem
|
||||
|
||||
if "::" in path:
|
||||
x = re.compile(".*[^a-z]+.*") # test for non protocol-like single word
|
||||
bits = []
|
||||
for p in path.split("::"):
|
||||
if "://" in p or x.match(p):
|
||||
bits.append(p)
|
||||
else:
|
||||
bits.append(p + "://")
|
||||
else:
|
||||
bits = [path]
|
||||
# [[url, protocol, kwargs], ...]
|
||||
out = []
|
||||
previous_bit = None
|
||||
kwargs = kwargs.copy()
|
||||
for bit in reversed(bits):
|
||||
protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
|
||||
cls = get_filesystem_class(protocol)
|
||||
extra_kwargs = cls._get_kwargs_from_urls(bit)
|
||||
kws = kwargs.pop(protocol, {})
|
||||
if bit is bits[0]:
|
||||
kws.update(kwargs)
|
||||
kw = dict(
|
||||
**{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
|
||||
**kws,
|
||||
)
|
||||
bit = cls._strip_protocol(bit)
|
||||
if "target_protocol" not in kw and issubclass(cls, ChainedFileSystem):
|
||||
bit = previous_bit
|
||||
out.append((bit, protocol, kw))
|
||||
previous_bit = bit
|
||||
out.reverse()
|
||||
return out
|
||||
|
||||
|
||||
def url_to_fs(url, **kwargs):
|
||||
"""
|
||||
Turn fully-qualified and potentially chained URL into filesystem instance
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url : str
|
||||
The fsspec-compatible URL
|
||||
**kwargs: dict
|
||||
Extra options that make sense to a particular storage connection, e.g.
|
||||
host, port, username, password, etc.
|
||||
|
||||
Returns
|
||||
-------
|
||||
filesystem : FileSystem
|
||||
The new filesystem discovered from ``url`` and created with
|
||||
``**kwargs``.
|
||||
urlpath : str
|
||||
The file-systems-specific URL for ``url``.
|
||||
"""
|
||||
url = stringify_path(url)
|
||||
# non-FS arguments that appear in fsspec.open()
|
||||
# inspect could keep this in sync with open()'s signature
|
||||
known_kwargs = {
|
||||
"compression",
|
||||
"encoding",
|
||||
"errors",
|
||||
"expand",
|
||||
"mode",
|
||||
"name_function",
|
||||
"newline",
|
||||
"num",
|
||||
}
|
||||
kwargs = {k: v for k, v in kwargs.items() if k not in known_kwargs}
|
||||
chain = _un_chain(url, kwargs)
|
||||
inkwargs = {}
|
||||
# Reverse iterate the chain, creating a nested target_* structure
|
||||
for i, ch in enumerate(reversed(chain)):
|
||||
urls, protocol, kw = ch
|
||||
if i == len(chain) - 1:
|
||||
inkwargs = dict(**kw, **inkwargs)
|
||||
continue
|
||||
inkwargs["target_options"] = dict(**kw, **inkwargs)
|
||||
inkwargs["target_protocol"] = protocol
|
||||
inkwargs["fo"] = urls
|
||||
urlpath, protocol, _ = chain[0]
|
||||
fs = filesystem(protocol, **inkwargs)
|
||||
return fs, urlpath
|
||||
|
||||
|
||||
DEFAULT_EXPAND = conf.get("open_expand", False)
|
||||
|
||||
|
||||
def open(
|
||||
urlpath,
|
||||
mode="rb",
|
||||
compression=None,
|
||||
encoding="utf8",
|
||||
errors=None,
|
||||
protocol=None,
|
||||
newline=None,
|
||||
expand=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Given a path or paths, return one ``OpenFile`` object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
urlpath: string or list
|
||||
Absolute or relative filepath. Prefix with a protocol like ``s3://``
|
||||
to read from alternative filesystems. Should not include glob
|
||||
character(s).
|
||||
mode: 'rb', 'wt', etc.
|
||||
compression: string or None
|
||||
If given, open file using compression codec. Can either be a compression
|
||||
name (a key in ``fsspec.compression.compr``) or "infer" to guess the
|
||||
compression from the filename suffix.
|
||||
encoding: str
|
||||
For text mode only
|
||||
errors: None or str
|
||||
Passed to TextIOWrapper in text mode
|
||||
protocol: str or None
|
||||
If given, overrides the protocol found in the URL.
|
||||
newline: bytes or None
|
||||
Used for line terminator in text mode. If None, uses system default;
|
||||
if blank, uses no translation.
|
||||
expand: bool or None
|
||||
Whether to regard file paths containing special glob characters as needing
|
||||
expansion (finding the first match) or absolute. Setting False allows using
|
||||
paths which do embed such characters. If None (default), this argument
|
||||
takes its value from the DEFAULT_EXPAND module variable, which takes
|
||||
its initial value from the "open_expand" config value at startup, which will
|
||||
be False if not set.
|
||||
**kwargs: dict
|
||||
Extra options that make sense to a particular storage connection, e.g.
|
||||
host, port, username, password, etc.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> openfile = open('2015-01-01.csv') # doctest: +SKIP
|
||||
>>> openfile = open(
|
||||
... 's3://bucket/2015-01-01.csv.gz', compression='gzip'
|
||||
... ) # doctest: +SKIP
|
||||
>>> with openfile as f:
|
||||
... df = pd.read_csv(f) # doctest: +SKIP
|
||||
...
|
||||
|
||||
Returns
|
||||
-------
|
||||
``OpenFile`` object.
|
||||
|
||||
Notes
|
||||
-----
|
||||
For a full list of the available protocols and the implementations that
|
||||
they map across to see the latest online documentation:
|
||||
|
||||
- For implementations built into ``fsspec`` see
|
||||
https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
|
||||
- For implementations in separate packages see
|
||||
https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
|
||||
"""
|
||||
expand = DEFAULT_EXPAND if expand is None else expand
|
||||
out = open_files(
|
||||
urlpath=[urlpath],
|
||||
mode=mode,
|
||||
compression=compression,
|
||||
encoding=encoding,
|
||||
errors=errors,
|
||||
protocol=protocol,
|
||||
newline=newline,
|
||||
expand=expand,
|
||||
**kwargs,
|
||||
)
|
||||
if not out:
|
||||
raise FileNotFoundError(urlpath)
|
||||
return out[0]
|
||||
|
||||
|
||||
def open_local(
|
||||
url: str | list[str] | Path | list[Path],
|
||||
mode: str = "rb",
|
||||
**storage_options: dict,
|
||||
) -> str | list[str]:
|
||||
"""Open file(s) which can be resolved to local
|
||||
|
||||
For files which either are local, or get downloaded upon open
|
||||
(e.g., by file caching)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str or list(str)
|
||||
mode: str
|
||||
Must be read mode
|
||||
storage_options:
|
||||
passed on to FS for or used by open_files (e.g., compression)
|
||||
"""
|
||||
if "r" not in mode:
|
||||
raise ValueError("Can only ensure local files when reading")
|
||||
of = open_files(url, mode=mode, **storage_options)
|
||||
if not getattr(of[0].fs, "local_file", False):
|
||||
raise ValueError(
|
||||
"open_local can only be used on a filesystem which"
|
||||
" has attribute local_file=True"
|
||||
)
|
||||
with of as files:
|
||||
paths = [f.name for f in files]
|
||||
if (isinstance(url, str) and not has_magic(url)) or isinstance(url, Path):
|
||||
return paths[0]
|
||||
return paths
|
||||
|
||||
|
||||
def get_compression(urlpath, compression):
|
||||
if compression == "infer":
|
||||
compression = infer_compression(urlpath)
|
||||
if compression is not None and compression not in compr:
|
||||
raise ValueError(f"Compression type {compression} not supported")
|
||||
return compression
|
||||
|
||||
|
||||
def split_protocol(urlpath):
|
||||
"""Return protocol, path pair"""
|
||||
urlpath = stringify_path(urlpath)
|
||||
if "://" in urlpath:
|
||||
protocol, path = urlpath.split("://", 1)
|
||||
if len(protocol) > 1:
|
||||
# excludes Windows paths
|
||||
return protocol, path
|
||||
if urlpath.startswith("data:"):
|
||||
return urlpath.split(":", 1)
|
||||
return None, urlpath
|
||||
|
||||
|
||||
def strip_protocol(urlpath):
|
||||
"""Return only path part of full URL, according to appropriate backend"""
|
||||
protocol, _ = split_protocol(urlpath)
|
||||
cls = get_filesystem_class(protocol)
|
||||
return cls._strip_protocol(urlpath)
|
||||
|
||||
|
||||
def expand_paths_if_needed(paths, mode, num, fs, name_function):
|
||||
"""Expand paths if they have a ``*`` in them (write mode) or any of ``*?[]``
|
||||
in them (read mode).
|
||||
|
||||
:param paths: list of paths
|
||||
mode: str
|
||||
Mode in which to open files.
|
||||
num: int
|
||||
If opening in writing mode, number of files we expect to create.
|
||||
fs: filesystem object
|
||||
name_function: callable
|
||||
If opening in writing mode, this callable is used to generate path
|
||||
names. Names are generated for each partition by
|
||||
``urlpath.replace('*', name_function(partition_index))``.
|
||||
:return: list of paths
|
||||
"""
|
||||
expanded_paths = []
|
||||
paths = list(paths)
|
||||
|
||||
if "w" in mode: # read mode
|
||||
if sum(1 for p in paths if "*" in p) > 1:
|
||||
raise ValueError(
|
||||
"When writing data, only one filename mask can be specified."
|
||||
)
|
||||
num = max(num, len(paths))
|
||||
|
||||
for curr_path in paths:
|
||||
if "*" in curr_path:
|
||||
# expand using name_function
|
||||
expanded_paths.extend(_expand_paths(curr_path, name_function, num))
|
||||
else:
|
||||
expanded_paths.append(curr_path)
|
||||
# if we generated more paths than asked for, trim the list
|
||||
if len(expanded_paths) > num:
|
||||
expanded_paths = expanded_paths[:num]
|
||||
|
||||
else: # read mode
|
||||
for curr_path in paths:
|
||||
if has_magic(curr_path):
|
||||
# expand using glob
|
||||
expanded_paths.extend(fs.glob(curr_path))
|
||||
else:
|
||||
expanded_paths.append(curr_path)
|
||||
|
||||
return expanded_paths
|
||||
|
||||
|
||||
def get_fs_token_paths(
|
||||
urlpath,
|
||||
mode="rb",
|
||||
num=1,
|
||||
name_function=None,
|
||||
storage_options=None,
|
||||
protocol=None,
|
||||
expand=True,
|
||||
):
|
||||
"""Filesystem, deterministic token, and paths from a urlpath and options.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
urlpath: string or iterable
|
||||
Absolute or relative filepath, URL (may include protocols like
|
||||
``s3://``), or globstring pointing to data.
|
||||
mode: str, optional
|
||||
Mode in which to open files.
|
||||
num: int, optional
|
||||
If opening in writing mode, number of files we expect to create.
|
||||
name_function: callable, optional
|
||||
If opening in writing mode, this callable is used to generate path
|
||||
names. Names are generated for each partition by
|
||||
``urlpath.replace('*', name_function(partition_index))``.
|
||||
storage_options: dict, optional
|
||||
Additional keywords to pass to the filesystem class.
|
||||
protocol: str or None
|
||||
To override the protocol specifier in the URL
|
||||
expand: bool
|
||||
Expand string paths for writing, assuming the path is a directory
|
||||
"""
|
||||
if isinstance(urlpath, (list, tuple, set)):
|
||||
if not urlpath:
|
||||
raise ValueError("empty urlpath sequence")
|
||||
urlpath0 = stringify_path(next(iter(urlpath)))
|
||||
else:
|
||||
urlpath0 = stringify_path(urlpath)
|
||||
storage_options = storage_options or {}
|
||||
if protocol:
|
||||
storage_options["protocol"] = protocol
|
||||
chain = _un_chain(urlpath0, storage_options or {})
|
||||
inkwargs = {}
|
||||
# Reverse iterate the chain, creating a nested target_* structure
|
||||
for i, ch in enumerate(reversed(chain)):
|
||||
urls, nested_protocol, kw = ch
|
||||
if i == len(chain) - 1:
|
||||
inkwargs = dict(**kw, **inkwargs)
|
||||
continue
|
||||
inkwargs["target_options"] = dict(**kw, **inkwargs)
|
||||
inkwargs["target_protocol"] = nested_protocol
|
||||
inkwargs["fo"] = urls
|
||||
paths, protocol, _ = chain[0]
|
||||
fs = filesystem(protocol, **inkwargs)
|
||||
if isinstance(urlpath, (list, tuple, set)):
|
||||
pchains = [
|
||||
_un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath
|
||||
]
|
||||
if len({pc[1] for pc in pchains}) > 1:
|
||||
raise ValueError("Protocol mismatch getting fs from %s", urlpath)
|
||||
paths = [pc[0] for pc in pchains]
|
||||
else:
|
||||
paths = fs._strip_protocol(paths)
|
||||
if isinstance(paths, (list, tuple, set)):
|
||||
if expand:
|
||||
paths = expand_paths_if_needed(paths, mode, num, fs, name_function)
|
||||
elif not isinstance(paths, list):
|
||||
paths = list(paths)
|
||||
else:
|
||||
if ("w" in mode or "x" in mode) and expand:
|
||||
paths = _expand_paths(paths, name_function, num)
|
||||
elif "*" in paths:
|
||||
paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
|
||||
else:
|
||||
paths = [paths]
|
||||
|
||||
return fs, fs._fs_token, paths
|
||||
|
||||
|
||||
def _expand_paths(path, name_function, num):
|
||||
if isinstance(path, str):
|
||||
if path.count("*") > 1:
|
||||
raise ValueError("Output path spec must contain exactly one '*'.")
|
||||
elif "*" not in path:
|
||||
path = os.path.join(path, "*.part")
|
||||
|
||||
if name_function is None:
|
||||
name_function = build_name_function(num - 1)
|
||||
|
||||
paths = [path.replace("*", name_function(i)) for i in range(num)]
|
||||
if paths != sorted(paths):
|
||||
logger.warning(
|
||||
"In order to preserve order between partitions"
|
||||
" paths created with ``name_function`` should "
|
||||
"sort to partition order"
|
||||
)
|
||||
elif isinstance(path, (tuple, list)):
|
||||
assert len(path) == num
|
||||
paths = list(path)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Path should be either\n"
|
||||
"1. A list of paths: ['foo.json', 'bar.json', ...]\n"
|
||||
"2. A directory: 'foo/\n"
|
||||
"3. A path with a '*' in it: 'foo.*.json'"
|
||||
)
|
||||
return paths
|
||||
|
||||
|
||||
class PickleableTextIOWrapper(io.TextIOWrapper):
|
||||
"""TextIOWrapper cannot be pickled. This solves it.
|
||||
|
||||
Requires that ``buffer`` be pickleable, which all instances of
|
||||
AbstractBufferedFile are.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
buffer,
|
||||
encoding=None,
|
||||
errors=None,
|
||||
newline=None,
|
||||
line_buffering=False,
|
||||
write_through=False,
|
||||
):
|
||||
self.args = buffer, encoding, errors, newline, line_buffering, write_through
|
||||
super().__init__(*self.args)
|
||||
|
||||
def __reduce__(self):
|
||||
return PickleableTextIOWrapper, self.args
|
||||
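A hedged end-to-end sketch of the core helpers above, using only the built-in memory filesystem so no extra dependencies are assumed; the path is arbitrary.

import fsspec
from fsspec.core import url_to_fs

with fsspec.open("memory://demo/hello.txt", "wt", encoding="utf8") as f:
    f.write("hello")

fs, path = url_to_fs("memory://demo/hello.txt")  # filesystem instance + bare path
assert fs.cat(path) == b"hello"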
backend_service/venv/lib/python3.13/site-packages/fsspec/dircache.py
@@ -0,0 +1,98 @@
import time
from collections.abc import MutableMapping
from functools import lru_cache


class DirCache(MutableMapping):
    """
    Caching of directory listings, in a structure like::

        {"path0": [
            {"name": "path0/file0",
             "size": 123,
             "type": "file",
             ...
            },
            {"name": "path0/file1",
            },
            ...
            ],
         "path1": [...]
        }

    Parameters to this class control listing expiry or indeed turn
    caching off
    """

    def __init__(
        self,
        use_listings_cache=True,
        listings_expiry_time=None,
        max_paths=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        use_listings_cache: bool
            If False, this cache never returns items, but always reports KeyError,
            and setting items has no effect
        listings_expiry_time: int or float (optional)
            Time in seconds that a listing is considered valid. If None,
            listings do not expire.
        max_paths: int (optional)
            The number of most recent listings that are considered valid; 'recent'
            refers to when the entry was set.
        """
        self._cache = {}
        self._times = {}
        if max_paths:
            self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None))
        self.use_listings_cache = use_listings_cache
        self.listings_expiry_time = listings_expiry_time
        self.max_paths = max_paths

    def __getitem__(self, item):
        if self.listings_expiry_time is not None:
            if self._times.get(item, 0) - time.time() < -self.listings_expiry_time:
                del self._cache[item]
        if self.max_paths:
            self._q(item)
        return self._cache[item]  # maybe raises KeyError

    def clear(self):
        self._cache.clear()

    def __len__(self):
        return len(self._cache)

    def __contains__(self, item):
        try:
            self[item]
            return True
        except KeyError:
            return False

    def __setitem__(self, key, value):
        if not self.use_listings_cache:
            return
        if self.max_paths:
            self._q(key)
        self._cache[key] = value
        if self.listings_expiry_time is not None:
            self._times[key] = time.time()

    def __delitem__(self, key):
        del self._cache[key]

    def __iter__(self):
        entries = list(self._cache)

        return (k for k in entries if k in self)

    def __reduce__(self):
        return (
            DirCache,
            (self.use_listings_cache, self.listings_expiry_time, self.max_paths),
        )
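A hedged sketch of ``DirCache`` with an expiry time; the listing contents are invented.

import time

from fsspec.dircache import DirCache

cache = DirCache(listings_expiry_time=0.5)
cache["bucket/path"] = [{"name": "bucket/path/file0", "size": 123, "type": "file"}]
assert "bucket/path" in cache
time.sleep(0.6)
assert "bucket/path" not in cache  # expired entry is dropped on access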
backend_service/venv/lib/python3.13/site-packages/fsspec/exceptions.py
@@ -0,0 +1,18 @@
"""
fsspec user-defined exception classes
"""

import asyncio


class BlocksizeMismatchError(ValueError):
    """
    Raised when a cached file is opened with a different blocksize than it was
    written with
    """


class FSTimeoutError(asyncio.TimeoutError):
    """
    Raised when an fsspec function call times out
    """
backend_service/venv/lib/python3.13/site-packages/fsspec/fuse.py (new file, 324 lines)
@@ -0,0 +1,324 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import stat
|
||||
import threading
|
||||
import time
|
||||
from errno import EIO, ENOENT
|
||||
|
||||
from fuse import FUSE, FuseOSError, LoggingMixIn, Operations
|
||||
|
||||
from fsspec import __version__
|
||||
from fsspec.core import url_to_fs
|
||||
|
||||
logger = logging.getLogger("fsspec.fuse")
|
||||
|
||||
|
||||
class FUSEr(Operations):
|
||||
def __init__(self, fs, path, ready_file=False):
|
||||
self.fs = fs
|
||||
self.cache = {}
|
||||
self.root = path.rstrip("/") + "/"
|
||||
self.counter = 0
|
||||
logger.info("Starting FUSE at %s", path)
|
||||
self._ready_file = ready_file
|
||||
|
||||
def getattr(self, path, fh=None):
|
||||
logger.debug("getattr %s", path)
|
||||
if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
|
||||
return {"type": "file", "st_size": 5}
|
||||
|
||||
path = "".join([self.root, path.lstrip("/")]).rstrip("/")
|
||||
try:
|
||||
info = self.fs.info(path)
|
||||
except FileNotFoundError as exc:
|
||||
raise FuseOSError(ENOENT) from exc
|
||||
|
||||
data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)}
|
||||
perm = info.get("mode", 0o777)
|
||||
|
||||
if info["type"] != "file":
|
||||
data["st_mode"] = stat.S_IFDIR | perm
|
||||
data["st_size"] = 0
|
||||
data["st_blksize"] = 0
|
||||
else:
|
||||
data["st_mode"] = stat.S_IFREG | perm
|
||||
data["st_size"] = info["size"]
|
||||
data["st_blksize"] = 5 * 2**20
|
||||
data["st_nlink"] = 1
|
||||
data["st_atime"] = info["atime"] if "atime" in info else time.time()
|
||||
data["st_ctime"] = info["ctime"] if "ctime" in info else time.time()
|
||||
data["st_mtime"] = info["mtime"] if "mtime" in info else time.time()
|
||||
return data
|
||||
|
||||
def readdir(self, path, fh):
|
||||
logger.debug("readdir %s", path)
|
||||
path = "".join([self.root, path.lstrip("/")])
|
||||
files = self.fs.ls(path, False)
|
||||
files = [os.path.basename(f.rstrip("/")) for f in files]
|
||||
return [".", ".."] + files
|
||||
|
||||
def mkdir(self, path, mode):
|
||||
path = "".join([self.root, path.lstrip("/")])
|
||||
self.fs.mkdir(path)
|
||||
return 0
|
||||
|
||||
def rmdir(self, path):
|
||||
path = "".join([self.root, path.lstrip("/")])
|
||||
self.fs.rmdir(path)
|
||||
return 0
|
||||
|
||||
def read(self, path, size, offset, fh):
|
||||
logger.debug("read %s", (path, size, offset))
|
||||
if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
|
||||
# status indicator
|
||||
return b"ready"
|
||||
|
||||
f = self.cache[fh]
|
||||
f.seek(offset)
|
||||
out = f.read(size)
|
||||
return out
|
||||
|
||||
def write(self, path, data, offset, fh):
|
||||
logger.debug("write %s", (path, offset))
|
||||
f = self.cache[fh]
|
||||
f.seek(offset)
|
||||
f.write(data)
|
||||
return len(data)
|
||||
|
||||
def create(self, path, flags, fi=None):
|
||||
logger.debug("create %s", (path, flags))
|
||||
fn = "".join([self.root, path.lstrip("/")])
|
||||
self.fs.touch(fn) # OS will want to get attributes immediately
|
||||
f = self.fs.open(fn, "wb")
|
||||
self.cache[self.counter] = f
|
||||
self.counter += 1
|
||||
return self.counter - 1
|
||||
|
||||
def open(self, path, flags):
|
||||
logger.debug("open %s", (path, flags))
|
||||
fn = "".join([self.root, path.lstrip("/")])
|
||||
if flags % 2 == 0:
|
||||
# read
|
||||
mode = "rb"
|
||||
else:
|
||||
# write/create
|
||||
mode = "wb"
|
||||
self.cache[self.counter] = self.fs.open(fn, mode)
|
||||
self.counter += 1
|
||||
return self.counter - 1
|
||||
|
||||
def truncate(self, path, length, fh=None):
|
||||
fn = "".join([self.root, path.lstrip("/")])
|
||||
if length != 0:
|
||||
raise NotImplementedError
|
||||
# maybe should be no-op since open with write sets size to zero anyway
|
||||
self.fs.touch(fn)
|
||||
|
||||
def unlink(self, path):
|
||||
fn = "".join([self.root, path.lstrip("/")])
|
||||
try:
|
||||
self.fs.rm(fn, False)
|
||||
except (OSError, FileNotFoundError) as exc:
|
||||
raise FuseOSError(EIO) from exc
|
||||
|
||||
def release(self, path, fh):
|
||||
try:
|
||||
if fh in self.cache:
|
||||
f = self.cache[fh]
|
||||
f.close()
|
||||
self.cache.pop(fh)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return 0
|
||||
|
||||
def chmod(self, path, mode):
|
||||
if hasattr(self.fs, "chmod"):
|
||||
path = "".join([self.root, path.lstrip("/")])
|
||||
return self.fs.chmod(path, mode)
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def run(
|
||||
fs,
|
||||
path,
|
||||
mount_point,
|
||||
foreground=True,
|
||||
threads=False,
|
||||
ready_file=False,
|
||||
ops_class=FUSEr,
|
||||
):
|
||||
"""Mount stuff in a local directory
|
||||
|
||||
This uses fusepy to make it appear as if a given path on an fsspec
|
||||
instance is in fact resident within the local file-system.
|
||||
|
||||
This requires that fusepy be installed, and that FUSE be available on
|
||||
the system (typically requiring a package to be installed with
|
||||
apt, yum, brew, etc.).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fs: file-system instance
|
||||
From one of the compatible implementations
|
||||
path: str
|
||||
Location on that file-system to regard as the root directory to
|
||||
mount. Note that you typically should include the terminating "/"
|
||||
character.
|
||||
mount_point: str
|
||||
An empty directory on the local file-system where the contents of
|
||||
the remote path will appear.
|
||||
foreground: bool
|
||||
Whether or not calling this function will block. Operation will
|
||||
typically be more stable if True.
|
||||
threads: bool
|
||||
Whether or not to create threads when responding to file operations
|
||||
within the mounted directory. Operation will typically be more
|
||||
stable if False.
|
||||
ready_file: bool
|
||||
If True, a ``.fuse_ready`` file will be created in the ``mount_point``
|
||||
directory once the FUSE process is ready (for debugging purposes).
|
||||
ops_class: FUSEr or Subclass of FUSEr
|
||||
To override the default behavior of FUSEr. For example, logging
|
||||
to file.
|
||||
|
||||
"""
|
||||
func = lambda: FUSE(
|
||||
ops_class(fs, path, ready_file=ready_file),
|
||||
mount_point,
|
||||
nothreads=not threads,
|
||||
foreground=foreground,
|
||||
)
|
||||
if not foreground:
|
||||
th = threading.Thread(target=func)
|
||||
th.daemon = True
|
||||
th.start()
|
||||
return th
|
||||
else: # pragma: no cover
|
||||
try:
|
||||
func()
|
||||
except KeyboardInterrupt:
|
||||
pass
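# Illustrative sketch, not part of the vendored module: expose an in-memory
# filesystem at a local mount point (assumes fusepy and a system FUSE library
# are installed, and that /tmp/mem is an existing, empty directory).
#
#     import fsspec
#     from fsspec.fuse import run
#
#     fs = fsspec.filesystem("memory")
#     fs.pipe("/data/hello.txt", b"hello world")
#     run(fs, "/data/", "/tmp/mem")   # blocks; `ls /tmp/mem` now shows hello.txt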
|
||||
|
||||
|
||||
def main(args):
|
||||
"""Mount filesystem from chained URL to MOUNT_POINT.
|
||||
|
||||
Examples:
|
||||
|
||||
python3 -m fsspec.fuse memory /usr/share /tmp/mem
|
||||
|
||||
python3 -m fsspec.fuse local /tmp/source /tmp/local \\
|
||||
-l /tmp/fsspecfuse.log
|
||||
|
||||
You can also mount chained-URLs and use special settings:
|
||||
|
||||
python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\
|
||||
/ /tmp/zip \\
|
||||
-o 'filecache-cache_storage=/tmp/simplecache'
|
||||
|
||||
You can specify the type of the setting by using `[int]` or `[bool]`,
|
||||
(`true`, `yes`, `1` represents the Boolean value `True`):
|
||||
|
||||
python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\
|
||||
/historic/packages/RPMS /tmp/ftp \\
|
||||
-o 'simplecache-cache_storage=/tmp/simplecache' \\
|
||||
-o 'simplecache-check_files=false[bool]' \\
|
||||
-o 'ftp-listings_expiry_time=60[int]' \\
|
||||
-o 'ftp-username=anonymous' \\
|
||||
-o 'ftp-password=xieyanbo'
|
||||
"""
|
||||
|
||||
class RawDescriptionArgumentParser(argparse.ArgumentParser):
|
||||
def format_help(self):
|
||||
usage = super().format_help()
|
||||
parts = usage.split("\n\n")
|
||||
parts[1] = self.description.rstrip()
|
||||
return "\n\n".join(parts)
|
||||
|
||||
parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__)
|
||||
parser.add_argument("--version", action="version", version=__version__)
|
||||
parser.add_argument("url", type=str, help="fs url")
|
||||
parser.add_argument("source_path", type=str, help="source directory in fs")
|
||||
parser.add_argument("mount_point", type=str, help="local directory")
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
"--option",
|
||||
action="append",
|
||||
help="Any options of protocol included in the chained URL",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--foreground",
|
||||
action="store_false",
|
||||
help="Running in foreground or not (Default: False)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--threads",
|
||||
action="store_false",
|
||||
help="Running with threads support (Default: False)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--ready-file",
|
||||
action="store_false",
|
||||
help="The `.fuse_ready` file will exist after FUSE is ready. "
|
||||
"(Debugging purpose, Default: False)",
|
||||
)
|
||||
args = parser.parse_args(args)
|
||||
|
||||
kwargs = {}
|
||||
for item in args.option or []:
|
||||
key, sep, value = item.partition("=")
|
||||
if not sep:
|
||||
parser.error(message=f"Wrong option: {item!r}")
|
||||
val = value.lower()
|
||||
if val.endswith("[int]"):
|
||||
value = int(value[: -len("[int]")])
|
||||
elif val.endswith("[bool]"):
|
||||
value = val[: -len("[bool]")] in ["1", "yes", "true"]
|
||||
|
||||
if "-" in key:
|
||||
fs_name, setting_name = key.split("-", 1)
|
||||
if fs_name in kwargs:
|
||||
kwargs[fs_name][setting_name] = value
|
||||
else:
|
||||
kwargs[fs_name] = {setting_name: value}
|
||||
else:
|
||||
kwargs[key] = value
|
||||
|
||||
if args.log_file:
|
||||
logging.basicConfig(
|
||||
level=logging.DEBUG,
|
||||
filename=args.log_file,
|
||||
format="%(asctime)s %(message)s",
|
||||
)
|
||||
|
||||
class LoggingFUSEr(FUSEr, LoggingMixIn):
|
||||
pass
|
||||
|
||||
fuser = LoggingFUSEr
|
||||
else:
|
||||
fuser = FUSEr
|
||||
|
||||
fs, url_path = url_to_fs(args.url, **kwargs)
|
||||
logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
|
||||
run(
|
||||
fs,
|
||||
args.source_path,
|
||||
args.mount_point,
|
||||
foreground=args.foreground,
|
||||
threads=args.threads,
|
||||
ready_file=args.ready_file,
|
||||
ops_class=fuser,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
main(sys.argv[1:])
|
||||
@@ -0,0 +1,396 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
|
||||
from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
|
||||
from .callbacks import DEFAULT_CALLBACK
|
||||
from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs
|
||||
|
||||
_generic_fs = {}
|
||||
logger = logging.getLogger("fsspec.generic")
|
||||
|
||||
|
||||
def set_generic_fs(protocol, **storage_options):
|
||||
"""Populate the dict used for method=="generic" lookups"""
|
||||
_generic_fs[protocol] = filesystem(protocol, **storage_options)
|
||||
|
||||
|
||||
def _resolve_fs(url, method, protocol=None, storage_options=None):
|
||||
"""Pick instance of backend FS"""
|
||||
url = url[0] if isinstance(url, (list, tuple)) else url
|
||||
protocol = protocol or split_protocol(url)[0]
|
||||
storage_options = storage_options or {}
|
||||
if method == "default":
|
||||
return filesystem(protocol)
|
||||
if method == "generic":
|
||||
return _generic_fs[protocol]
|
||||
if method == "current":
|
||||
cls = get_filesystem_class(protocol)
|
||||
return cls.current()
|
||||
if method == "options":
|
||||
fs, _ = url_to_fs(url, **storage_options.get(protocol, {}))
|
||||
return fs
|
||||
raise ValueError(f"Unknown FS resolution method: {method}")
|
||||
|
||||
|
||||
def rsync(
|
||||
source,
|
||||
destination,
|
||||
delete_missing=False,
|
||||
source_field="size",
|
||||
dest_field="size",
|
||||
update_cond="different",
|
||||
inst_kwargs=None,
|
||||
fs=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Sync files between two directory trees
|
||||
|
||||
(experimental)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source: str
|
||||
Root of the directory tree to take files from. This must be a directory, but
|
||||
do not include any terminating "/" character
|
||||
destination: str
|
||||
Root path to copy into. The contents of this location should be
|
||||
identical to the contents of ``source`` when done. This will be made a
|
||||
directory, and the terminal "/" should not be included.
|
||||
delete_missing: bool
|
||||
If there are paths in the destination that don't exist in the
|
||||
source and this is True, delete them. Otherwise, leave them alone.
|
||||
source_field: str | callable
|
||||
If ``update_cond`` is "different", this is the key in the info
|
||||
of source files to consider for difference. May be a function of the
|
||||
info dict.
|
||||
dest_field: str | callable
|
||||
If ``update_cond`` is "different", this is the key in the info
|
||||
of destination files to consider for difference. May be a function of
|
||||
the info dict.
|
||||
update_cond: "different"|"always"|"never"
|
||||
If "always", every file is copied, regardless of whether it exists in
|
||||
the destination. If "never", files that exist in the destination are
|
||||
not copied again. If "different" (default), only copy if the info
|
||||
fields given by ``source_field`` and ``dest_field`` (usually "size")
|
||||
are different. Other comparisons may be added in the future.
|
||||
inst_kwargs: dict|None
|
||||
If ``fs`` is None, use this set of keyword arguments to make a
|
||||
GenericFileSystem instance
|
||||
fs: GenericFileSystem|None
|
||||
Instance to use if explicitly given. The instance defines how to
|
||||
make downstream file system instances from paths.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict of the copy operations that were performed, {source: destination}
|
||||
"""
|
||||
fs = fs or GenericFileSystem(**(inst_kwargs or {}))
|
||||
source = fs._strip_protocol(source)
|
||||
destination = fs._strip_protocol(destination)
|
||||
allfiles = fs.find(source, withdirs=True, detail=True)
|
||||
if not fs.isdir(source):
|
||||
raise ValueError("Can only rsync on a directory")
|
||||
otherfiles = fs.find(destination, withdirs=True, detail=True)
|
||||
dirs = [
|
||||
a
|
||||
for a, v in allfiles.items()
|
||||
if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
|
||||
]
|
||||
logger.debug(f"{len(dirs)} directories to create")
|
||||
if dirs:
|
||||
fs.make_many_dirs(
|
||||
[dirn.replace(source, destination) for dirn in dirs], exist_ok=True
|
||||
)
|
||||
allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
|
||||
logger.debug(f"{len(allfiles)} files to consider for copy")
|
||||
to_delete = [
|
||||
o
|
||||
for o, v in otherfiles.items()
|
||||
if o.replace(destination, source) not in allfiles and v["type"] == "file"
|
||||
]
|
||||
for k, v in allfiles.copy().items():
|
||||
otherfile = k.replace(source, destination)
|
||||
if otherfile in otherfiles:
|
||||
if update_cond == "always":
|
||||
allfiles[k] = otherfile
|
||||
elif update_cond == "never":
|
||||
allfiles.pop(k)
|
||||
elif update_cond == "different":
|
||||
inf1 = source_field(v) if callable(source_field) else v[source_field]
|
||||
v2 = otherfiles[otherfile]
|
||||
inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
|
||||
if inf1 != inf2:
|
||||
# details mismatch, make copy
|
||||
allfiles[k] = otherfile
|
||||
else:
|
||||
# details match, don't copy
|
||||
allfiles.pop(k)
|
||||
else:
|
||||
# file not in target yet
|
||||
allfiles[k] = otherfile
|
||||
logger.debug(f"{len(allfiles)} files to copy")
|
||||
if allfiles:
|
||||
source_files, target_files = zip(*allfiles.items())
|
||||
fs.cp(source_files, target_files, **kwargs)
|
||||
logger.debug(f"{len(to_delete)} files to delete")
|
||||
if delete_missing and to_delete:
|
||||
fs.rm(to_delete)
|
||||
return allfiles
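# Illustrative sketch, not part of the vendored module: one-way sync between two
# local directory trees, copying only files whose sizes differ and deleting
# files that no longer exist in the source.
#
#     from fsspec.generic import rsync
#
#     copied = rsync(
#         "file:///tmp/source",
#         "file:///tmp/backup",
#         delete_missing=True,
#     )
#     print(f"{len(copied)} files copied")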
|
||||
|
||||
|
||||
class GenericFileSystem(AsyncFileSystem):
|
||||
"""Wrapper over all other FS types
|
||||
|
||||
<experimental!>
|
||||
|
||||
This implementation is a single unified interface to be able to run FS operations
|
||||
over generic URLs, and dispatch to the specific implementations using the URL
|
||||
protocol prefix.
|
||||
|
||||
Note: instances of this FS are always async, even if you never use it with any async
|
||||
backend.
|
||||
"""
|
||||
|
||||
protocol = "generic" # there is no real reason to ever use a protocol with this FS
|
||||
|
||||
def __init__(self, default_method="default", storage_options=None, **kwargs):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
default_method: str (optional)
|
||||
Defines how to configure backend FS instances. Options are:
|
||||
- "default": instantiate like FSClass(), with no
|
||||
extra arguments; this is the default instance of that FS, and can be
|
||||
configured via the config system
|
||||
- "generic": takes instances from the `_generic_fs` dict in this module,
|
||||
which you must populate before use. Keys are by protocol
|
||||
- "options": expects storage_options, a dict mapping protocol to
|
||||
kwargs to use when constructing the filesystem
|
||||
- "current": takes the most recently instantiated version of each FS
|
||||
"""
|
||||
self.method = default_method
|
||||
self.st_opts = storage_options
|
||||
super().__init__(**kwargs)
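# Illustrative sketch, not part of the vendored module: pre-register configured
# backend instances and let the generic FS dispatch to them by protocol prefix.
#
#     from fsspec.generic import GenericFileSystem, set_generic_fs
#
#     set_generic_fs("memory")                 # keyed by protocol
#     set_generic_fs("file", auto_mkdir=True)
#     fs = GenericFileSystem(default_method="generic")
#     fs.pipe_file("memory://demo.txt", b"data")
#     print(fs.cat_file("memory://demo.txt"))  # b'data'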
|
||||
|
||||
def _parent(self, path):
|
||||
fs = _resolve_fs(path, self.method, storage_options=self.st_opts)
|
||||
return fs.unstrip_protocol(fs._parent(path))
|
||||
|
||||
def _strip_protocol(self, path):
|
||||
# normalization only
|
||||
fs = _resolve_fs(path, self.method, storage_options=self.st_opts)
|
||||
return fs.unstrip_protocol(fs._strip_protocol(path))
|
||||
|
||||
async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
|
||||
fs = _resolve_fs(path, self.method, storage_options=self.st_opts)
|
||||
if fs.async_impl:
|
||||
out = await fs._find(
|
||||
path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
|
||||
)
|
||||
else:
|
||||
out = fs.find(
|
||||
path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
|
||||
)
|
||||
result = {}
|
||||
for k, v in out.items():
|
||||
v = v.copy() # don't corrupt target FS dircache
|
||||
name = fs.unstrip_protocol(k)
|
||||
v["name"] = name
|
||||
result[name] = v
|
||||
if detail:
|
||||
return result
|
||||
return list(result)
|
||||
|
||||
async def _info(self, url, **kwargs):
|
||||
fs = _resolve_fs(url, self.method)
|
||||
if fs.async_impl:
|
||||
out = await fs._info(url, **kwargs)
|
||||
else:
|
||||
out = fs.info(url, **kwargs)
|
||||
out = out.copy() # don't edit originals
|
||||
out["name"] = fs.unstrip_protocol(out["name"])
|
||||
return out
|
||||
|
||||
async def _ls(
|
||||
self,
|
||||
url,
|
||||
detail=True,
|
||||
**kwargs,
|
||||
):
|
||||
fs = _resolve_fs(url, self.method)
|
||||
if fs.async_impl:
|
||||
out = await fs._ls(url, detail=True, **kwargs)
|
||||
else:
|
||||
out = fs.ls(url, detail=True, **kwargs)
|
||||
out = [o.copy() for o in out] # don't edit originals
|
||||
for o in out:
|
||||
o["name"] = fs.unstrip_protocol(o["name"])
|
||||
if detail:
|
||||
return out
|
||||
else:
|
||||
return [o["name"] for o in out]
|
||||
|
||||
async def _cat_file(
|
||||
self,
|
||||
url,
|
||||
**kwargs,
|
||||
):
|
||||
fs = _resolve_fs(url, self.method)
|
||||
if fs.async_impl:
|
||||
return await fs._cat_file(url, **kwargs)
|
||||
else:
|
||||
return fs.cat_file(url, **kwargs)
|
||||
|
||||
async def _pipe_file(
|
||||
self,
|
||||
path,
|
||||
value,
|
||||
**kwargs,
|
||||
):
|
||||
fs = _resolve_fs(path, self.method, storage_options=self.st_opts)
|
||||
if fs.async_impl:
|
||||
return await fs._pipe_file(path, value, **kwargs)
|
||||
else:
|
||||
return fs.pipe_file(path, value, **kwargs)
|
||||
|
||||
async def _rm(self, url, **kwargs):
|
||||
urls = url
|
||||
if isinstance(urls, str):
|
||||
urls = [urls]
|
||||
fs = _resolve_fs(urls[0], self.method)
|
||||
if fs.async_impl:
|
||||
await fs._rm(urls, **kwargs)
|
||||
else:
|
||||
fs.rm(url, **kwargs)
|
||||
|
||||
async def _makedirs(self, path, exist_ok=False):
|
||||
logger.debug("Make dir %s", path)
|
||||
fs = _resolve_fs(path, self.method, storage_options=self.st_opts)
|
||||
if fs.async_impl:
|
||||
await fs._makedirs(path, exist_ok=exist_ok)
|
||||
else:
|
||||
fs.makedirs(path, exist_ok=exist_ok)
|
||||
|
||||
def rsync(self, source, destination, **kwargs):
|
||||
"""Sync files between two directory trees
|
||||
|
||||
See `func:rsync` for more details.
|
||||
"""
|
||||
rsync(source, destination, fs=self, **kwargs)
|
||||
|
||||
async def _cp_file(
|
||||
self,
|
||||
url,
|
||||
url2,
|
||||
blocksize=2**20,
|
||||
callback=DEFAULT_CALLBACK,
|
||||
tempdir: str | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
fs = _resolve_fs(url, self.method)
|
||||
fs2 = _resolve_fs(url2, self.method)
|
||||
if fs is fs2:
|
||||
# pure remote
|
||||
if fs.async_impl:
|
||||
return await fs._copy(url, url2, **kwargs)
|
||||
else:
|
||||
return fs.copy(url, url2, **kwargs)
|
||||
await copy_file_op(fs, [url], fs2, [url2], tempdir, 1, on_error="raise")
|
||||
|
||||
async def _make_many_dirs(self, urls, exist_ok=True):
|
||||
fs = _resolve_fs(urls[0], self.method)
|
||||
if fs.async_impl:
|
||||
coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
|
||||
await _run_coros_in_chunks(coros)
|
||||
else:
|
||||
for u in urls:
|
||||
fs.makedirs(u, exist_ok=exist_ok)
|
||||
|
||||
make_many_dirs = sync_wrapper(_make_many_dirs)
|
||||
|
||||
async def _copy(
|
||||
self,
|
||||
path1: list[str],
|
||||
path2: list[str],
|
||||
recursive: bool = False,
|
||||
on_error: str = "ignore",
|
||||
maxdepth: int | None = None,
|
||||
batch_size: int | None = None,
|
||||
tempdir: str | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
# TODO: special case for one FS being local, which can use get/put
|
||||
# TODO: special case for one being memFS, which can use cat/pipe
|
||||
if recursive:
|
||||
raise NotImplementedError("Please use fsspec.generic.rsync")
|
||||
path1 = [path1] if isinstance(path1, str) else path1
|
||||
path2 = [path2] if isinstance(path2, str) else path2
|
||||
|
||||
fs = _resolve_fs(path1, self.method)
|
||||
fs2 = _resolve_fs(path2, self.method)
|
||||
|
||||
if fs is fs2:
|
||||
if fs.async_impl:
|
||||
return await fs._copy(path1, path2, **kwargs)
|
||||
else:
|
||||
return fs.copy(path1, path2, **kwargs)
|
||||
|
||||
await copy_file_op(
|
||||
fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
|
||||
)
|
||||
|
||||
|
||||
async def copy_file_op(
|
||||
fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
|
||||
):
|
||||
import tempfile
|
||||
|
||||
tempdir = tempdir or tempfile.mkdtemp()
|
||||
try:
|
||||
coros = [
|
||||
_copy_file_op(
|
||||
fs1,
|
||||
u1,
|
||||
fs2,
|
||||
u2,
|
||||
os.path.join(tempdir, uuid.uuid4().hex),
|
||||
)
|
||||
for u1, u2 in zip(url1, url2)
|
||||
]
|
||||
out = await _run_coros_in_chunks(
|
||||
coros, batch_size=batch_size, return_exceptions=True
|
||||
)
|
||||
finally:
|
||||
shutil.rmtree(tempdir)
|
||||
if on_error == "return":
|
||||
return out
|
||||
elif on_error == "raise":
|
||||
for o in out:
|
||||
if isinstance(o, Exception):
|
||||
raise o
|
||||
|
||||
|
||||
async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
|
||||
if fs1.async_impl:
|
||||
await fs1._get_file(url1, local)
|
||||
else:
|
||||
fs1.get_file(url1, local)
|
||||
if fs2.async_impl:
|
||||
await fs2._put_file(local, url2)
|
||||
else:
|
||||
fs2.put_file(local, url2)
|
||||
os.unlink(local)
|
||||
logger.debug("Copy %s -> %s; done", url1, url2)
|
||||
|
||||
|
||||
async def maybe_await(cor):
|
||||
if inspect.iscoroutine(cor):
|
||||
return await cor
|
||||
else:
|
||||
return cor
|
||||
417
backend_service/venv/lib/python3.13/site-packages/fsspec/gui.py
Normal file
@@ -0,0 +1,417 @@
|
||||
import ast
|
||||
import contextlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from collections.abc import Sequence
|
||||
from typing import ClassVar
|
||||
|
||||
import panel as pn
|
||||
|
||||
from .core import OpenFile, get_filesystem_class, split_protocol
|
||||
from .registry import known_implementations
|
||||
|
||||
pn.extension()
|
||||
logger = logging.getLogger("fsspec.gui")
|
||||
|
||||
|
||||
class SigSlot:
|
||||
"""Signal-slot mixin, for Panel event passing
|
||||
|
||||
Include this class in a widget manager's superclasses to be able to
|
||||
register events and callbacks on Panel widgets managed by that class.
|
||||
|
||||
The method ``_register`` should be called as widgets are added, and external
|
||||
code should call ``connect`` to associate callbacks.
|
||||
|
||||
By default, all signals emit a DEBUG logging statement.
|
||||
"""
|
||||
|
||||
# names of signals that this class may emit each of which must be
|
||||
# set by _register for any new instance
|
||||
signals: ClassVar[Sequence[str]] = []
|
||||
# names of actions that this class may respond to
|
||||
slots: ClassVar[Sequence[str]] = []
|
||||
|
||||
# each of which must be a method name
|
||||
|
||||
def __init__(self):
|
||||
self._ignoring_events = False
|
||||
self._sigs = {}
|
||||
self._map = {}
|
||||
self._setup()
|
||||
|
||||
def _setup(self):
|
||||
"""Create GUI elements and register signals"""
|
||||
self.panel = pn.pane.PaneBase()
|
||||
# no signals to set up in the base class
|
||||
|
||||
def _register(
|
||||
self, widget, name, thing="value", log_level=logging.DEBUG, auto=False
|
||||
):
|
||||
"""Watch the given attribute of a widget and assign it a named event
|
||||
|
||||
This is normally called at the time a widget is instantiated, in the
|
||||
class which owns it.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
widget : pn.layout.Panel or None
|
||||
Widget to watch. If None, an anonymous signal not associated with
|
||||
any widget.
|
||||
name : str
|
||||
Name of this event
|
||||
thing : str
|
||||
Attribute of the given widget to watch
|
||||
log_level : int
|
||||
When the signal is triggered, a logging event of the given level
|
||||
will be fired in the fsspec.gui logger.
|
||||
auto : bool
|
||||
If True, automatically connects with a method in this class of the
|
||||
same name.
|
||||
"""
|
||||
if name not in self.signals:
|
||||
raise ValueError(f"Attempt to assign an undeclared signal: {name}")
|
||||
self._sigs[name] = {
|
||||
"widget": widget,
|
||||
"callbacks": [],
|
||||
"thing": thing,
|
||||
"log": log_level,
|
||||
}
|
||||
wn = "-".join(
|
||||
[
|
||||
getattr(widget, "name", str(widget)) if widget is not None else "none",
|
||||
thing,
|
||||
]
|
||||
)
|
||||
self._map[wn] = name
|
||||
if widget is not None:
|
||||
widget.param.watch(self._signal, thing, onlychanged=True)
|
||||
if auto and hasattr(self, name):
|
||||
self.connect(name, getattr(self, name))
|
||||
|
||||
def _repr_mimebundle_(self, *args, **kwargs):
|
||||
"""Display in a notebook or a server"""
|
||||
try:
|
||||
return self.panel._repr_mimebundle_(*args, **kwargs)
|
||||
except (ValueError, AttributeError) as exc:
|
||||
raise NotImplementedError(
|
||||
"Panel does not seem to be set up properly"
|
||||
) from exc
|
||||
|
||||
def connect(self, signal, slot):
|
||||
"""Associate call back with given event
|
||||
|
||||
The callback must be a function which takes the "new" value of the
|
||||
watched attribute as the only parameter. If the callback returns False,
|
||||
this cancels any further processing of the given event.
|
||||
|
||||
Alternatively, the callback can be a string, in which case it means
|
||||
emitting the correspondingly-named event (i.e., connect to self)
|
||||
"""
|
||||
self._sigs[signal]["callbacks"].append(slot)
|
||||
|
||||
def _signal(self, event):
|
||||
"""This is called by a an action on a widget
|
||||
|
||||
Within an self.ignore_events context, nothing happens.
|
||||
|
||||
Tests can execute this method by directly changing the values of
|
||||
widget components.
|
||||
"""
|
||||
if not self._ignoring_events:
|
||||
wn = "-".join([event.obj.name, event.name])
|
||||
if wn in self._map and self._map[wn] in self._sigs:
|
||||
self._emit(self._map[wn], event.new)
|
||||
|
||||
@contextlib.contextmanager
|
||||
def ignore_events(self):
|
||||
"""Temporarily turn off events processing in this instance
|
||||
|
||||
(does not propagate to children)
|
||||
"""
|
||||
self._ignoring_events = True
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
self._ignoring_events = False
|
||||
|
||||
def _emit(self, sig, value=None):
|
||||
"""An event happened, call its callbacks
|
||||
|
||||
This method can be used in tests to simulate message passing without
|
||||
directly changing visual elements.
|
||||
|
||||
Calling of callbacks will halt whenever one returns False.
|
||||
"""
|
||||
logger.log(self._sigs[sig]["log"], f"{sig}: {value}")
|
||||
for callback in self._sigs[sig]["callbacks"]:
|
||||
if isinstance(callback, str):
|
||||
self._emit(callback)
|
||||
else:
|
||||
try:
|
||||
# running callbacks should not break the interface
|
||||
ret = callback(value)
|
||||
if ret is False:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.exception(
|
||||
"Exception (%s) while executing callback for signal: %s",
|
||||
e,
|
||||
sig,
|
||||
)
|
||||
|
||||
def show(self, threads=False):
|
||||
"""Open a new browser tab and display this instance's interface"""
|
||||
self.panel.show(threads=threads, verbose=False)
|
||||
return self
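# Illustrative sketch, not part of the vendored module: a minimal SigSlot user
# that declares one signal, registers a widget for it, and connects a callback.
#
#     class Echo(SigSlot):
#         signals = ["text_changed"]
#
#         def _setup(self):
#             self.box = pn.widgets.TextInput(name="text")
#             self._register(self.box, "text_changed", "value")
#             self.panel = pn.Row(self.box)
#
#     e = Echo()
#     e.connect("text_changed", lambda new: print("got:", new))
#     e.box.value = "hello"   # fires _signal -> _emit -> the lambda above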
|
||||
|
||||
|
||||
class SingleSelect(SigSlot):
|
||||
"""A multiselect which only allows you to select one item for an event"""
|
||||
|
||||
signals = ["_selected", "selected"] # the first is internal
|
||||
slots = ["set_options", "set_selection", "add", "clear", "select"]
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.kwargs = kwargs
|
||||
super().__init__()
|
||||
|
||||
def _setup(self):
|
||||
self.panel = pn.widgets.MultiSelect(**self.kwargs)
|
||||
self._register(self.panel, "_selected", "value")
|
||||
self._register(None, "selected")
|
||||
self.connect("_selected", self.select_one)
|
||||
|
||||
def _signal(self, *args, **kwargs):
|
||||
super()._signal(*args, **kwargs)
|
||||
|
||||
def select_one(self, *_):
|
||||
with self.ignore_events():
|
||||
val = [self.panel.value[-1]] if self.panel.value else []
|
||||
self.panel.value = val
|
||||
self._emit("selected", self.panel.value)
|
||||
|
||||
def set_options(self, options):
|
||||
self.panel.options = options
|
||||
|
||||
def clear(self):
|
||||
self.panel.options = []
|
||||
|
||||
@property
|
||||
def value(self):
|
||||
return self.panel.value
|
||||
|
||||
def set_selection(self, selection):
|
||||
self.panel.value = [selection]
|
||||
|
||||
|
||||
class FileSelector(SigSlot):
|
||||
"""Panel-based graphical file selector widget
|
||||
|
||||
Instances of this widget are interactive and can be displayed in jupyter by having
|
||||
them as the output of a cell, or in a separate browser tab using ``.show()``.
|
||||
"""
|
||||
|
||||
signals = [
|
||||
"protocol_changed",
|
||||
"selection_changed",
|
||||
"directory_entered",
|
||||
"home_clicked",
|
||||
"up_clicked",
|
||||
"go_clicked",
|
||||
"filters_changed",
|
||||
]
|
||||
slots = ["set_filters", "go_home"]
|
||||
|
||||
def __init__(self, url=None, filters=None, ignore=None, kwargs=None):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url : str (optional)
|
||||
Initial value of the URL to populate the dialog; should include protocol
|
||||
filters : list(str) (optional)
|
||||
File endings to include in the listings. If not included, all files are
|
||||
allowed. Does not affect directories.
|
||||
If given, the endings will appear as checkboxes in the interface
|
||||
ignore : list(str) (optional)
|
||||
Regex(s) of file basename patterns to ignore, e.g., "\\." for typical
|
||||
hidden files on posix
|
||||
kwargs : dict (optional)
|
||||
To pass to file system instance
|
||||
"""
|
||||
if url:
|
||||
self.init_protocol, url = split_protocol(url)
|
||||
else:
|
||||
self.init_protocol, url = "file", os.getcwd()
|
||||
self.init_url = url
|
||||
self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)) or "{}"
|
||||
self.filters = filters
|
||||
self.ignore = [re.compile(i) for i in ignore or []]
|
||||
self._fs = None
|
||||
super().__init__()
|
||||
|
||||
def _setup(self):
|
||||
self.url = pn.widgets.TextInput(
|
||||
name="url",
|
||||
value=self.init_url,
|
||||
align="end",
|
||||
sizing_mode="stretch_width",
|
||||
width_policy="max",
|
||||
)
|
||||
self.protocol = pn.widgets.Select(
|
||||
options=sorted(known_implementations),
|
||||
value=self.init_protocol,
|
||||
name="protocol",
|
||||
align="center",
|
||||
)
|
||||
self.kwargs = pn.widgets.TextInput(
|
||||
name="kwargs", value=self.init_kwargs, align="center"
|
||||
)
|
||||
self.go = pn.widgets.Button(name="⇨", align="end", width=45)
|
||||
self.main = SingleSelect(size=10)
|
||||
self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end")
|
||||
self.up = pn.widgets.Button(name="‹", width=30, height=30, align="end")
|
||||
|
||||
self._register(self.protocol, "protocol_changed", auto=True)
|
||||
self._register(self.go, "go_clicked", "clicks", auto=True)
|
||||
self._register(self.up, "up_clicked", "clicks", auto=True)
|
||||
self._register(self.home, "home_clicked", "clicks", auto=True)
|
||||
self._register(None, "selection_changed")
|
||||
self.main.connect("selected", self.selection_changed)
|
||||
self._register(None, "directory_entered")
|
||||
self.prev_protocol = self.protocol.value
|
||||
self.prev_kwargs = self.storage_options
|
||||
|
||||
self.filter_sel = pn.widgets.CheckBoxGroup(
|
||||
value=[], options=[], inline=False, align="end", width_policy="min"
|
||||
)
|
||||
self._register(self.filter_sel, "filters_changed", auto=True)
|
||||
|
||||
self.panel = pn.Column(
|
||||
pn.Row(self.protocol, self.kwargs),
|
||||
pn.Row(self.home, self.up, self.url, self.go, self.filter_sel),
|
||||
self.main.panel,
|
||||
)
|
||||
self.set_filters(self.filters)
|
||||
self.go_clicked()
|
||||
|
||||
def set_filters(self, filters=None):
|
||||
self.filters = filters
|
||||
if filters:
|
||||
self.filter_sel.options = filters
|
||||
self.filter_sel.value = filters
|
||||
else:
|
||||
self.filter_sel.options = []
|
||||
self.filter_sel.value = []
|
||||
|
||||
@property
|
||||
def storage_options(self):
|
||||
"""Value of the kwargs box as a dictionary"""
|
||||
return ast.literal_eval(self.kwargs.value) or {}
|
||||
|
||||
@property
|
||||
def fs(self):
|
||||
"""Current filesystem instance"""
|
||||
if self._fs is None:
|
||||
cls = get_filesystem_class(self.protocol.value)
|
||||
self._fs = cls(**self.storage_options)
|
||||
return self._fs
|
||||
|
||||
@property
|
||||
def urlpath(self):
|
||||
"""URL of currently selected item"""
|
||||
return (
|
||||
(f"{self.protocol.value}://{self.main.value[0]}")
|
||||
if self.main.value
|
||||
else None
|
||||
)
|
||||
|
||||
def open_file(self, mode="rb", compression=None, encoding=None):
|
||||
"""Create OpenFile instance for the currently selected item
|
||||
|
||||
For example, in a notebook you might do something like
|
||||
|
||||
.. code-block::
|
||||
|
||||
[ ]: sel = FileSelector(); sel
|
||||
|
||||
# user selects their file
|
||||
|
||||
[ ]: with sel.open_file('rb') as f:
|
||||
... out = f.read()
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mode: str (optional)
|
||||
Open mode for the file.
|
||||
compression: str (optional)
|
||||
Whether to interact with the file as compressed. Set to 'infer' to guess
|
||||
compression from the file ending
|
||||
encoding: str (optional)
|
||||
If using text mode, use this encoding; defaults to UTF8.
|
||||
"""
|
||||
if self.urlpath is None:
|
||||
raise ValueError("No file selected")
|
||||
return OpenFile(self.fs, self.urlpath, mode, compression, encoding)
|
||||
|
||||
def filters_changed(self, values):
|
||||
self.filters = values
|
||||
self.go_clicked()
|
||||
|
||||
def selection_changed(self, *_):
|
||||
if self.urlpath is None:
|
||||
return
|
||||
if self.fs.isdir(self.urlpath):
|
||||
self.url.value = self.fs._strip_protocol(self.urlpath)
|
||||
self.go_clicked()
|
||||
|
||||
def go_clicked(self, *_):
|
||||
if (
|
||||
self.prev_protocol != self.protocol.value
|
||||
or self.prev_kwargs != self.storage_options
|
||||
):
|
||||
self._fs = None # causes fs to be recreated
|
||||
self.prev_protocol = self.protocol.value
|
||||
self.prev_kwargs = self.storage_options
|
||||
listing = sorted(
|
||||
self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"]
|
||||
)
|
||||
listing = [
|
||||
l
|
||||
for l in listing
|
||||
if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore)
|
||||
]
|
||||
folders = {
|
||||
"📁 " + o["name"].rsplit("/", 1)[-1]: o["name"]
|
||||
for o in listing
|
||||
if o["type"] == "directory"
|
||||
}
|
||||
files = {
|
||||
"📄 " + o["name"].rsplit("/", 1)[-1]: o["name"]
|
||||
for o in listing
|
||||
if o["type"] == "file"
|
||||
}
|
||||
if self.filters:
|
||||
files = {
|
||||
k: v
|
||||
for k, v in files.items()
|
||||
if any(v.endswith(ext) for ext in self.filters)
|
||||
}
|
||||
self.main.set_options(dict(**folders, **files))
|
||||
|
||||
def protocol_changed(self, *_):
|
||||
self._fs = None
|
||||
self.main.options = []
|
||||
self.url.value = ""
|
||||
|
||||
def home_clicked(self, *_):
|
||||
self.protocol.value = self.init_protocol
|
||||
self.kwargs.value = self.init_kwargs
|
||||
self.url.value = self.init_url
|
||||
self.go_clicked()
|
||||
|
||||
def up_clicked(self, *_):
|
||||
self.url.value = self.fs._parent(self.url.value)
|
||||
self.go_clicked()
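# Illustrative sketch, not part of the vendored module: open a selector over the
# local filesystem restricted to CSV files, then read the chosen file.
#
#     sel = FileSelector("file://" + os.getcwd(), filters=[".csv"])
#     sel.show()                     # opens the panel UI in a browser tab
#     # ... user navigates and selects a file ...
#     with sel.open_file("rb") as f:
#         header = f.readline()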
|
||||
@@ -0,0 +1,307 @@
|
||||
import errno
|
||||
import io
|
||||
import os
|
||||
import secrets
|
||||
import shutil
|
||||
from contextlib import suppress
|
||||
from functools import cached_property, wraps
|
||||
from urllib.parse import parse_qs
|
||||
|
||||
from fsspec.spec import AbstractFileSystem
|
||||
from fsspec.utils import (
|
||||
get_package_version_without_import,
|
||||
infer_storage_options,
|
||||
mirror_from,
|
||||
tokenize,
|
||||
)
|
||||
|
||||
|
||||
def wrap_exceptions(func):
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
except OSError as exception:
|
||||
if not exception.args:
|
||||
raise
|
||||
|
||||
message, *args = exception.args
|
||||
if isinstance(message, str) and "does not exist" in message:
|
||||
raise FileNotFoundError(errno.ENOENT, message) from exception
|
||||
else:
|
||||
raise
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
PYARROW_VERSION = None
|
||||
|
||||
|
||||
class ArrowFSWrapper(AbstractFileSystem):
|
||||
"""FSSpec-compatible wrapper of pyarrow.fs.FileSystem.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fs : pyarrow.fs.FileSystem
|
||||
|
||||
"""
|
||||
|
||||
root_marker = "/"
|
||||
|
||||
def __init__(self, fs, **kwargs):
|
||||
global PYARROW_VERSION
|
||||
PYARROW_VERSION = get_package_version_without_import("pyarrow")
|
||||
self.fs = fs
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@property
|
||||
def protocol(self):
|
||||
return self.fs.type_name
|
||||
|
||||
@cached_property
|
||||
def fsid(self):
|
||||
return "hdfs_" + tokenize(self.fs.host, self.fs.port)
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
ops = infer_storage_options(path)
|
||||
path = ops["path"]
|
||||
if path.startswith("//"):
|
||||
# special case for "hdfs://path" (without the triple slash)
|
||||
path = path[1:]
|
||||
return path
|
||||
|
||||
def ls(self, path, detail=False, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
from pyarrow.fs import FileSelector
|
||||
|
||||
try:
|
||||
entries = [
|
||||
self._make_entry(entry)
|
||||
for entry in self.fs.get_file_info(FileSelector(path))
|
||||
]
|
||||
except (FileNotFoundError, NotADirectoryError):
|
||||
entries = [self.info(path, **kwargs)]
|
||||
if detail:
|
||||
return entries
|
||||
else:
|
||||
return [entry["name"] for entry in entries]
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
[info] = self.fs.get_file_info([path])
|
||||
return self._make_entry(info)
|
||||
|
||||
def exists(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
self.info(path)
|
||||
except FileNotFoundError:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def _make_entry(self, info):
|
||||
from pyarrow.fs import FileType
|
||||
|
||||
if info.type is FileType.Directory:
|
||||
kind = "directory"
|
||||
elif info.type is FileType.File:
|
||||
kind = "file"
|
||||
elif info.type is FileType.NotFound:
|
||||
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
|
||||
else:
|
||||
kind = "other"
|
||||
|
||||
return {
|
||||
"name": info.path,
|
||||
"size": info.size,
|
||||
"type": kind,
|
||||
"mtime": info.mtime,
|
||||
}
|
||||
|
||||
@wrap_exceptions
|
||||
def cp_file(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1).rstrip("/")
|
||||
path2 = self._strip_protocol(path2).rstrip("/")
|
||||
|
||||
with self._open(path1, "rb") as lstream:
|
||||
tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
|
||||
try:
|
||||
with self.open(tmp_fname, "wb") as rstream:
|
||||
shutil.copyfileobj(lstream, rstream)
|
||||
self.fs.move(tmp_fname, path2)
|
||||
except BaseException:
|
||||
with suppress(FileNotFoundError):
|
||||
self.fs.delete_file(tmp_fname)
|
||||
raise
|
||||
|
||||
@wrap_exceptions
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1).rstrip("/")
|
||||
path2 = self._strip_protocol(path2).rstrip("/")
|
||||
self.fs.move(path1, path2)
|
||||
|
||||
@wrap_exceptions
|
||||
def rm_file(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.fs.delete_file(path)
|
||||
|
||||
@wrap_exceptions
|
||||
def rm(self, path, recursive=False, maxdepth=None):
|
||||
path = self._strip_protocol(path).rstrip("/")
|
||||
if self.isdir(path):
|
||||
if recursive:
|
||||
self.fs.delete_dir(path)
|
||||
else:
|
||||
raise ValueError("Can't delete directories without recursive=False")
|
||||
else:
|
||||
self.fs.delete_file(path)
|
||||
|
||||
@wrap_exceptions
|
||||
def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
|
||||
if mode == "rb":
|
||||
if seekable:
|
||||
method = self.fs.open_input_file
|
||||
else:
|
||||
method = self.fs.open_input_stream
|
||||
elif mode == "wb":
|
||||
method = self.fs.open_output_stream
|
||||
elif mode == "ab":
|
||||
method = self.fs.open_append_stream
|
||||
else:
|
||||
raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")
|
||||
|
||||
_kwargs = {}
|
||||
if mode != "rb" or not seekable:
|
||||
if int(PYARROW_VERSION.split(".")[0]) >= 4:
|
||||
# disable compression auto-detection
|
||||
_kwargs["compression"] = None
|
||||
stream = method(path, **_kwargs)
|
||||
|
||||
return ArrowFile(self, stream, path, mode, block_size, **kwargs)
|
||||
|
||||
@wrap_exceptions
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if create_parents:
|
||||
self.makedirs(path, exist_ok=True)
|
||||
else:
|
||||
self.fs.create_dir(path, recursive=False)
|
||||
|
||||
@wrap_exceptions
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
path = self._strip_protocol(path)
|
||||
self.fs.create_dir(path, recursive=True)
|
||||
|
||||
@wrap_exceptions
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.fs.delete_dir(path)
|
||||
|
||||
@wrap_exceptions
|
||||
def modified(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
return self.fs.get_file_info(path).mtime
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
kwargs["seekable"] = start not in [None, 0]
|
||||
return super().cat_file(path, start=None, end=None, **kwargs)
|
||||
|
||||
def get_file(self, rpath, lpath, **kwargs):
|
||||
kwargs["seekable"] = False
|
||||
super().get_file(rpath, lpath, **kwargs)
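# Illustrative sketch, not part of the vendored module: wrap any pyarrow
# filesystem so it speaks the fsspec API (LocalFileSystem stands in here for
# HadoopFileSystem or S3FileSystem).
#
#     from pyarrow.fs import LocalFileSystem
#
#     fs = ArrowFSWrapper(LocalFileSystem())
#     fs.pipe_file("/tmp/arrow_demo.txt", b"hello")
#     print(fs.cat_file("/tmp/arrow_demo.txt"))   # b'hello'
#     print(fs.ls("/tmp", detail=False)[:3])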
|
||||
|
||||
|
||||
@mirror_from(
|
||||
"stream",
|
||||
[
|
||||
"read",
|
||||
"seek",
|
||||
"tell",
|
||||
"write",
|
||||
"readable",
|
||||
"writable",
|
||||
"close",
|
||||
"size",
|
||||
"seekable",
|
||||
],
|
||||
)
|
||||
class ArrowFile(io.IOBase):
|
||||
def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
|
||||
self.path = path
|
||||
self.mode = mode
|
||||
|
||||
self.fs = fs
|
||||
self.stream = stream
|
||||
|
||||
self.blocksize = self.block_size = block_size
|
||||
self.kwargs = kwargs
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, *args):
|
||||
return self.close()
|
||||
|
||||
|
||||
class HadoopFileSystem(ArrowFSWrapper):
|
||||
"""A wrapper on top of the pyarrow.fs.HadoopFileSystem
|
||||
to connect its interface with fsspec"""
|
||||
|
||||
protocol = "hdfs"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host="default",
|
||||
port=0,
|
||||
user=None,
|
||||
kerb_ticket=None,
|
||||
replication=3,
|
||||
extra_conf=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
Hostname, IP or "default" to try to read from Hadoop config
|
||||
port: int
|
||||
Port to connect on, or default from Hadoop config if 0
|
||||
user: str or None
|
||||
If given, connect as this username
|
||||
kerb_ticket: str or None
|
||||
If given, use this ticket for authentication
|
||||
replication: int
|
||||
Set the replication factor for write operations. The default value is 3.
|
||||
extra_conf: None or dict
|
||||
Passed on to HadoopFileSystem
|
||||
"""
|
||||
from pyarrow.fs import HadoopFileSystem
|
||||
|
||||
fs = HadoopFileSystem(
|
||||
host=host,
|
||||
port=port,
|
||||
user=user,
|
||||
kerb_ticket=kerb_ticket,
|
||||
replication=replication,
|
||||
extra_conf=extra_conf,
|
||||
)
|
||||
super().__init__(fs=fs, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
ops = infer_storage_options(path)
|
||||
out = {}
|
||||
if ops.get("host", None):
|
||||
out["host"] = ops["host"]
|
||||
if ops.get("username", None):
|
||||
out["user"] = ops["username"]
|
||||
if ops.get("port", None):
|
||||
out["port"] = ops["port"]
|
||||
if ops.get("url_query", None):
|
||||
queries = parse_qs(ops["url_query"])
|
||||
if queries.get("replication", None):
|
||||
out["replication"] = int(queries["replication"][0])
|
||||
return out
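# Illustrative sketch, not part of the vendored module: connection details can be
# carried in the URL itself and are translated into constructor kwargs above.
#
#     HadoopFileSystem._get_kwargs_from_urls(
#         "hdfs://alice@namenode:8020/data?replication=2"
#     )
#     # -> {'host': 'namenode', 'user': 'alice', 'port': 8020, 'replication': 2}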
|
||||
@@ -0,0 +1,122 @@
|
||||
import asyncio
|
||||
import functools
|
||||
import inspect
|
||||
|
||||
import fsspec
|
||||
from fsspec.asyn import AsyncFileSystem, running_async
|
||||
|
||||
|
||||
def async_wrapper(func, obj=None, semaphore=None):
|
||||
"""
|
||||
Wraps a synchronous function to make it awaitable.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
func : callable
|
||||
The synchronous function to wrap.
|
||||
obj : object, optional
|
||||
The instance to bind the function to, if applicable.
|
||||
semaphore : asyncio.Semaphore, optional
|
||||
A semaphore to limit concurrent calls.
|
||||
|
||||
Returns
|
||||
-------
|
||||
coroutine
|
||||
An awaitable version of the function.
|
||||
"""
|
||||
|
||||
@functools.wraps(func)
|
||||
async def wrapper(*args, **kwargs):
|
||||
if semaphore:
|
||||
async with semaphore:
|
||||
return await asyncio.to_thread(func, *args, **kwargs)
|
||||
return await asyncio.to_thread(func, *args, **kwargs)
|
||||
|
||||
return wrapper
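# Illustrative sketch, not part of the vendored module: turn a blocking function
# into an awaitable, optionally throttled by a semaphore.
#
#     import time
#
#     def slow_add(a, b):
#         time.sleep(0.1)
#         return a + b
#
#     async def demo():
#         sem = asyncio.Semaphore(2)    # at most two worker threads at a time
#         aadd = async_wrapper(slow_add, semaphore=sem)
#         return await asyncio.gather(*(aadd(i, i) for i in range(4)))
#
#     print(asyncio.run(demo()))        # [0, 2, 4, 6]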
|
||||
|
||||
|
||||
class AsyncFileSystemWrapper(AsyncFileSystem):
|
||||
"""
|
||||
A wrapper class to convert a synchronous filesystem into an asynchronous one.
|
||||
|
||||
This class takes an existing synchronous filesystem implementation and wraps all
|
||||
its methods to provide an asynchronous interface.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sync_fs : AbstractFileSystem
|
||||
The synchronous filesystem instance to wrap.
|
||||
"""
|
||||
|
||||
protocol = "asyncwrapper", "async_wrapper"
|
||||
cachable = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs=None,
|
||||
asynchronous=None,
|
||||
target_protocol=None,
|
||||
target_options=None,
|
||||
semaphore=None,
|
||||
max_concurrent_tasks=None,
|
||||
**kwargs,
|
||||
):
|
||||
if asynchronous is None:
|
||||
asynchronous = running_async()
|
||||
super().__init__(asynchronous=asynchronous, **kwargs)
|
||||
if fs is not None:
|
||||
self.sync_fs = fs
|
||||
else:
|
||||
self.sync_fs = fsspec.filesystem(target_protocol, **target_options)
|
||||
self.protocol = self.sync_fs.protocol
|
||||
self.semaphore = semaphore
|
||||
self._wrap_all_sync_methods()
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return f"async_{self.sync_fs.fsid}"
|
||||
|
||||
def _wrap_all_sync_methods(self):
|
||||
"""
|
||||
Wrap all synchronous methods of the underlying filesystem with asynchronous versions.
|
||||
"""
|
||||
excluded_methods = {"open"}
|
||||
for method_name in dir(self.sync_fs):
|
||||
if method_name.startswith("_") or method_name in excluded_methods:
|
||||
continue
|
||||
|
||||
attr = inspect.getattr_static(self.sync_fs, method_name)
|
||||
if isinstance(attr, property):
|
||||
continue
|
||||
|
||||
method = getattr(self.sync_fs, method_name)
|
||||
if callable(method) and not inspect.iscoroutinefunction(method):
|
||||
async_method = async_wrapper(method, obj=self, semaphore=self.semaphore)
|
||||
setattr(self, f"_{method_name}", async_method)
|
||||
|
||||
@classmethod
|
||||
def wrap_class(cls, sync_fs_class):
|
||||
"""
|
||||
Create a new class that can be used to instantiate an AsyncFileSystemWrapper
|
||||
with lazy instantiation of the underlying synchronous filesystem.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sync_fs_class : type
|
||||
The class of the synchronous filesystem to wrap.
|
||||
|
||||
Returns
|
||||
-------
|
||||
type
|
||||
A new class that wraps the provided synchronous filesystem class.
|
||||
"""
|
||||
|
||||
class GeneratedAsyncFileSystemWrapper(cls):
|
||||
def __init__(self, *args, **kwargs):
|
||||
sync_fs = sync_fs_class(*args, **kwargs)
|
||||
super().__init__(sync_fs)
|
||||
|
||||
GeneratedAsyncFileSystemWrapper.__name__ = (
|
||||
f"Async{sync_fs_class.__name__}Wrapper"
|
||||
)
|
||||
return GeneratedAsyncFileSystemWrapper
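# Illustrative sketch, not part of the vendored module: wrap the synchronous
# memory filesystem so its methods can be awaited.
#
#     import asyncio
#     from fsspec.implementations.memory import MemoryFileSystem
#
#     AsyncMemoryFS = AsyncFileSystemWrapper.wrap_class(MemoryFileSystem)
#
#     async def demo():
#         afs = AsyncMemoryFS()
#         await afs._pipe_file("/demo.txt", b"hello")
#         return await afs._cat_file("/demo.txt")
#
#     print(asyncio.run(demo()))   # b'hello'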
|
||||
@@ -0,0 +1,75 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import abc
|
||||
import hashlib
|
||||
|
||||
from fsspec.implementations.local import make_path_posix
|
||||
|
||||
|
||||
class AbstractCacheMapper(abc.ABC):
|
||||
"""Abstract super-class for mappers from remote URLs to local cached
|
||||
basenames.
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def __call__(self, path: str) -> str: ...
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
# Identity only depends on class. When derived classes have attributes
|
||||
# they will need to be included.
|
||||
return isinstance(other, type(self))
|
||||
|
||||
def __hash__(self) -> int:
|
||||
# Identity only depends on class. When derived classes have attributes
|
||||
# they will need to be included.
|
||||
return hash(type(self))
|
||||
|
||||
|
||||
class BasenameCacheMapper(AbstractCacheMapper):
|
||||
"""Cache mapper that uses the basename of the remote URL and a fixed number
|
||||
of directory levels above this.
|
||||
|
||||
The default is zero directory levels, meaning different paths with the same
|
||||
basename will have the same cached basename.
|
||||
"""
|
||||
|
||||
def __init__(self, directory_levels: int = 0):
|
||||
if directory_levels < 0:
|
||||
raise ValueError(
|
||||
"BasenameCacheMapper requires zero or positive directory_levels"
|
||||
)
|
||||
self.directory_levels = directory_levels
|
||||
|
||||
# Separator for directories when encoded as strings.
|
||||
self._separator = "_@_"
|
||||
|
||||
def __call__(self, path: str) -> str:
|
||||
path = make_path_posix(path)
|
||||
prefix, *bits = path.rsplit("/", self.directory_levels + 1)
|
||||
if bits:
|
||||
return self._separator.join(bits)
|
||||
else:
|
||||
return prefix # No separator found, simple filename
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
return super().__eq__(other) and self.directory_levels == other.directory_levels
|
||||
|
||||
def __hash__(self) -> int:
|
||||
return super().__hash__() ^ hash(self.directory_levels)
|
||||
|
||||
|
||||
class HashCacheMapper(AbstractCacheMapper):
|
||||
"""Cache mapper that uses a hash of the remote URL."""
|
||||
|
||||
def __call__(self, path: str) -> str:
|
||||
return hashlib.sha256(path.encode()).hexdigest()
|
||||
|
||||
|
||||
def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
|
||||
"""Factory method to create cache mapper for backward compatibility with
|
||||
``CachingFileSystem`` constructor using ``same_names`` kwarg.
|
||||
"""
|
||||
if same_names:
|
||||
return BasenameCacheMapper()
|
||||
else:
|
||||
return HashCacheMapper()
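# Illustrative sketch, not part of the vendored module: the two mappers produce
# very different local basenames for the same remote URL.
#
#     url = "s3://bucket/folder/data.csv"
#     BasenameCacheMapper()(url)                    # 'data.csv'
#     BasenameCacheMapper(directory_levels=1)(url)  # 'folder_@_data.csv'
#     HashCacheMapper()(url)                        # 64-character sha256 hex digest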
|
||||
@@ -0,0 +1,233 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import time
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from fsspec.utils import atomic_write
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
except ImportError:
|
||||
if not TYPE_CHECKING:
|
||||
import json
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterator
|
||||
from typing import Any, Literal
|
||||
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from .cached import CachingFileSystem
|
||||
|
||||
Detail: TypeAlias = dict[str, Any]
|
||||
|
||||
|
||||
class CacheMetadata:
|
||||
"""Cache metadata.
|
||||
|
||||
All reading and writing of cache metadata is performed by this class,
|
||||
accessing the cached files and blocks is not.
|
||||
|
||||
Metadata is stored in a single file per storage directory in JSON format.
|
||||
For backward compatibility, also reads metadata stored in pickle format
|
||||
which is converted to JSON when next saved.
|
||||
"""
|
||||
|
||||
def __init__(self, storage: list[str]):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
storage: list[str]
|
||||
Directories containing cached files, must be at least one. Metadata
|
||||
is stored in the last of these directories by convention.
|
||||
"""
|
||||
if not storage:
|
||||
raise ValueError("CacheMetadata expects at least one storage location")
|
||||
|
||||
self._storage = storage
|
||||
self.cached_files: list[Detail] = [{}]
|
||||
|
||||
# Private attribute to force saving of metadata in pickle format rather than
|
||||
# JSON for use in tests to confirm can read both pickle and JSON formats.
|
||||
self._force_save_pickle = False
|
||||
|
||||
def _load(self, fn: str) -> Detail:
|
||||
"""Low-level function to load metadata from specific file"""
|
||||
try:
|
||||
with open(fn, "r") as f:
|
||||
loaded = json.load(f)
|
||||
except ValueError:
|
||||
with open(fn, "rb") as f:
|
||||
loaded = pickle.load(f)
|
||||
for c in loaded.values():
|
||||
if isinstance(c.get("blocks"), list):
|
||||
c["blocks"] = set(c["blocks"])
|
||||
return loaded
|
||||
|
||||
def _save(self, metadata_to_save: Detail, fn: str) -> None:
|
||||
"""Low-level function to save metadata to specific file"""
|
||||
if self._force_save_pickle:
|
||||
with atomic_write(fn) as f:
|
||||
pickle.dump(metadata_to_save, f)
|
||||
else:
|
||||
with atomic_write(fn, mode="w") as f:
|
||||
json.dump(metadata_to_save, f)
|
||||
|
||||
def _scan_locations(
|
||||
self, writable_only: bool = False
|
||||
) -> Iterator[tuple[str, str, bool]]:
|
||||
"""Yield locations (filenames) where metadata is stored, and whether
|
||||
writable or not.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
writable_only: bool
|
||||
Set to True to only yield writable locations.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Yields (str, str, bool)
|
||||
"""
|
||||
n = len(self._storage)
|
||||
for i, storage in enumerate(self._storage):
|
||||
writable = i == n - 1
|
||||
if writable_only and not writable:
|
||||
continue
|
||||
yield os.path.join(storage, "cache"), storage, writable
|
||||
|
||||
def check_file(
|
||||
self, path: str, cfs: CachingFileSystem | None
|
||||
) -> Literal[False] | tuple[Detail, str]:
|
||||
"""If path is in cache return its details, otherwise return ``False``.
|
||||
|
||||
If the optional CachingFileSystem is specified then it is used to
|
||||
perform extra checks to reject possible matches, such as if they are
|
||||
too old.
|
||||
"""
|
||||
for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
|
||||
if path not in cache:
|
||||
continue
|
||||
detail = cache[path].copy()
|
||||
|
||||
if cfs is not None:
|
||||
if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
|
||||
# Wrong file as determined by hash of file properties
|
||||
continue
|
||||
if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
|
||||
# Cached file has expired
|
||||
continue
|
||||
|
||||
fn = os.path.join(base, detail["fn"])
|
||||
if os.path.exists(fn):
|
||||
return detail, fn
|
||||
return False
|
||||
|
||||
def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
|
||||
"""Remove expired metadata from the cache.
|
||||
|
||||
Returns names of files corresponding to expired metadata and a boolean
|
||||
flag indicating whether the writable cache is empty. Caller is
|
||||
responsible for deleting the expired files.
|
||||
"""
|
||||
expired_files = []
|
||||
for path, detail in self.cached_files[-1].copy().items():
|
||||
if time.time() - detail["time"] > expiry_time:
|
||||
fn = detail.get("fn", "")
|
||||
if not fn:
|
||||
raise RuntimeError(
|
||||
f"Cache metadata does not contain 'fn' for {path}"
|
||||
)
|
||||
fn = os.path.join(self._storage[-1], fn)
|
||||
expired_files.append(fn)
|
||||
self.cached_files[-1].pop(path)
|
||||
|
||||
if self.cached_files[-1]:
|
||||
cache_path = os.path.join(self._storage[-1], "cache")
|
||||
self._save(self.cached_files[-1], cache_path)
|
||||
|
||||
writable_cache_empty = not self.cached_files[-1]
|
||||
return expired_files, writable_cache_empty
|
||||
|
||||
def load(self) -> None:
|
||||
"""Load all metadata from disk and store in ``self.cached_files``"""
|
||||
cached_files = []
|
||||
for fn, _, _ in self._scan_locations():
|
||||
if os.path.exists(fn):
|
||||
# TODO: consolidate blocks here
|
||||
cached_files.append(self._load(fn))
|
||||
else:
|
||||
cached_files.append({})
|
||||
self.cached_files = cached_files or [{}]
|
||||
|
||||
def on_close_cached_file(self, f: Any, path: str) -> None:
|
||||
"""Perform side-effect actions on closing a cached file.
|
||||
|
||||
The actual closing of the file is the responsibility of the caller.
|
||||
"""
|
||||
        # File must be writable, so in self.cached_files[-1]
|
||||
c = self.cached_files[-1][path]
|
||||
if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
|
||||
c["blocks"] = True
|
||||
|
||||
def pop_file(self, path: str) -> str | None:
|
||||
"""Remove metadata of cached file.
|
||||
|
||||
If path is in the cache, return the filename of the cached file,
|
||||
otherwise return ``None``. Caller is responsible for deleting the
|
||||
cached file.
|
||||
"""
|
||||
details = self.check_file(path, None)
|
||||
if not details:
|
||||
return None
|
||||
_, fn = details
|
||||
if fn.startswith(self._storage[-1]):
|
||||
self.cached_files[-1].pop(path)
|
||||
self.save()
|
||||
else:
|
||||
raise PermissionError(
|
||||
"Can only delete cached file in last, writable cache location"
|
||||
)
|
||||
return fn
|
||||
|
||||
def save(self) -> None:
|
||||
"""Save metadata to disk"""
|
||||
for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
|
||||
if not writable:
|
||||
continue
|
||||
|
||||
if os.path.exists(fn):
|
||||
cached_files = self._load(fn)
|
||||
for k, c in cached_files.items():
|
||||
if k in cache:
|
||||
if c["blocks"] is True or cache[k]["blocks"] is True:
|
||||
c["blocks"] = True
|
||||
else:
|
||||
# self.cached_files[*][*]["blocks"] must continue to
|
||||
# point to the same set object so that updates
|
||||
# performed by MMapCache are propagated back to
|
||||
# self.cached_files.
|
||||
blocks = cache[k]["blocks"]
|
||||
blocks.update(c["blocks"])
|
||||
c["blocks"] = blocks
|
||||
c["time"] = max(c["time"], cache[k]["time"])
|
||||
c["uid"] = cache[k]["uid"]
|
||||
|
||||
                # Files can be added to the cache after it has been written once
|
||||
for k, c in cache.items():
|
||||
if k not in cached_files:
|
||||
cached_files[k] = c
|
||||
else:
|
||||
cached_files = cache
|
||||
cache = {k: v.copy() for k, v in cached_files.items()}
|
||||
for c in cache.values():
|
||||
if isinstance(c["blocks"], set):
|
||||
c["blocks"] = list(c["blocks"])
|
||||
self._save(cache, fn)
|
||||
self.cached_files[-1] = cached_files
|
||||
|
||||
def update_file(self, path: str, detail: Detail) -> None:
|
||||
"""Update metadata for specific file in memory, do not save"""
|
||||
self.cached_files[-1][path] = detail
|
||||
File diff suppressed because it is too large
@@ -0,0 +1,23 @@
|
||||
from typing import ClassVar
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
|
||||
__all__ = ("ChainedFileSystem",)
|
||||
|
||||
|
||||
class ChainedFileSystem(AbstractFileSystem):
|
||||
"""Chained filesystem base class.
|
||||
|
||||
A chained filesystem is designed to be layered over another FS.
|
||||
This is useful to implement things like caching.
|
||||
|
||||
This base class does very little on its own, but is used as a marker
|
||||
that the class is designed for chaining.
|
||||
|
||||
Right now this is only used in `url_to_fs` to provide the path argument
|
||||
(`fo`) to the chained filesystem from the underlying filesystem.
|
||||
|
||||
Additional functionality may be added in the future.
|
||||
"""
|
||||
|
||||
protocol: ClassVar[str] = "chained"
|
||||
@@ -0,0 +1,152 @@
|
||||
import dask
|
||||
from distributed.client import Client, _get_global_client
|
||||
from distributed.worker import Worker
|
||||
|
||||
from fsspec import filesystem
|
||||
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
|
||||
from fsspec.utils import infer_storage_options
|
||||
|
||||
|
||||
def _get_client(client):
|
||||
if client is None:
|
||||
return _get_global_client()
|
||||
elif isinstance(client, Client):
|
||||
return client
|
||||
else:
|
||||
# e.g., connection string
|
||||
return Client(client)
|
||||
|
||||
|
||||
def _in_worker():
|
||||
return bool(Worker._instances)
|
||||
|
||||
|
||||
class DaskWorkerFileSystem(AbstractFileSystem):
|
||||
"""View files accessible to a worker as any other remote file-system
|
||||
|
||||
When instances are run on the worker, uses the real filesystem. When
|
||||
run on the client, they call the worker to provide information or data.
|
||||
|
||||
**Warning** this implementation is experimental, and read-only for now.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
if not (fs is None) ^ (target_protocol is None):
|
||||
raise ValueError(
|
||||
"Please provide one of filesystem instance (fs) or"
|
||||
" target_protocol, not both"
|
||||
)
|
||||
self.target_protocol = target_protocol
|
||||
self.target_options = target_options
|
||||
self.worker = None
|
||||
self.client = client
|
||||
self.fs = fs
|
||||
self._determine_worker()
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
so = infer_storage_options(path)
|
||||
if "host" in so and "port" in so:
|
||||
return {"client": f"{so['host']}:{so['port']}"}
|
||||
else:
|
||||
return {}
|
||||
|
||||
def _determine_worker(self):
|
||||
if _in_worker():
|
||||
self.worker = True
|
||||
if self.fs is None:
|
||||
self.fs = filesystem(
|
||||
self.target_protocol, **(self.target_options or {})
|
||||
)
|
||||
else:
|
||||
self.worker = False
|
||||
self.client = _get_client(self.client)
|
||||
self.rfs = dask.delayed(self)
|
||||
|
||||
def mkdir(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
self.fs.mkdir(*args, **kwargs)
|
||||
else:
|
||||
self.rfs.mkdir(*args, **kwargs).compute()
|
||||
|
||||
def rm(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
self.fs.rm(*args, **kwargs)
|
||||
else:
|
||||
self.rfs.rm(*args, **kwargs).compute()
|
||||
|
||||
def copy(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
self.fs.copy(*args, **kwargs)
|
||||
else:
|
||||
self.rfs.copy(*args, **kwargs).compute()
|
||||
|
||||
def mv(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
self.fs.mv(*args, **kwargs)
|
||||
else:
|
||||
self.rfs.mv(*args, **kwargs).compute()
|
||||
|
||||
def ls(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
return self.fs.ls(*args, **kwargs)
|
||||
else:
|
||||
return self.rfs.ls(*args, **kwargs).compute()
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
if self.worker:
|
||||
return self.fs._open(
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
return DaskFile(
|
||||
fs=self,
|
||||
path=path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def fetch_range(self, path, mode, start, end):
|
||||
if self.worker:
|
||||
with self._open(path, mode) as f:
|
||||
f.seek(start)
|
||||
return f.read(end - start)
|
||||
else:
|
||||
return self.rfs.fetch_range(path, mode, start, end).compute()
|
||||
|
||||
|
||||
class DaskFile(AbstractBufferedFile):
|
||||
def __init__(self, mode="rb", **kwargs):
|
||||
if mode != "rb":
|
||||
raise ValueError('Remote dask files can only be opened in "rb" mode')
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
pass
|
||||
|
||||
def _initiate_upload(self):
|
||||
"""Create remote file/upload"""
|
||||
pass
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Get the specified set of bytes from remote"""
|
||||
return self.fs.fetch_range(self.path, self.mode, start, end)
|
||||
@@ -0,0 +1,58 @@
|
||||
import base64
|
||||
import io
|
||||
from typing import Optional
|
||||
from urllib.parse import unquote
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
|
||||
|
||||
class DataFileSystem(AbstractFileSystem):
|
||||
"""A handy decoder for data-URLs
|
||||
|
||||
Example
|
||||
-------
|
||||
>>> with fsspec.open("data:,Hello%2C%20World%21") as f:
|
||||
... print(f.read())
|
||||
b"Hello, World!"
|
||||
|
||||
See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
|
||||
"""
|
||||
|
||||
protocol = "data"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""No parameters for this filesystem"""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
pref, data = path.split(",", 1)
|
||||
if pref.endswith("base64"):
|
||||
return base64.b64decode(data)[start:end]
|
||||
return unquote(data).encode()[start:end]
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
pref, name = path.split(",", 1)
|
||||
data = self.cat_file(path)
|
||||
mime = pref.split(":", 1)[1].split(";", 1)[0]
|
||||
return {"name": name, "size": len(data), "type": "file", "mimetype": mime}
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
if "r" not in mode:
|
||||
raise ValueError("Read only filesystem")
|
||||
return io.BytesIO(self.cat_file(path))
|
||||
|
||||
@staticmethod
|
||||
def encode(data: bytes, mime: Optional[str] = None):
|
||||
"""Format the given data into data-URL syntax
|
||||
|
||||
This version always base64 encodes, even when the data is ascii/url-safe.
|
||||
"""
|
||||
return f"data:{mime or ''};base64,{base64.b64encode(data).decode()}"
|
||||
@@ -0,0 +1,496 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import urllib
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter, Retry
|
||||
from typing_extensions import override
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
from fsspec.spec import AbstractBufferedFile
|
||||
|
||||
|
||||
class DatabricksException(Exception):
|
||||
"""
|
||||
Helper class for exceptions raised in this module.
|
||||
"""
|
||||
|
||||
def __init__(self, error_code, message, details=None):
|
||||
"""Create a new DatabricksException"""
|
||||
super().__init__(message)
|
||||
|
||||
self.error_code = error_code
|
||||
self.message = message
|
||||
self.details = details
|
||||
|
||||
|
||||
class DatabricksFileSystem(AbstractFileSystem):
|
||||
"""
|
||||
Get access to the Databricks filesystem implementation over HTTP.
|
||||
Can be used inside and outside of a databricks cluster.
|
||||
"""
|
||||
|
||||
def __init__(self, instance, token, **kwargs):
|
||||
"""
|
||||
Create a new DatabricksFileSystem.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
instance: str
|
||||
The instance URL of the databricks cluster.
|
||||
For example for an Azure databricks cluster, this
|
||||
has the form adb-<some-number>.<two digits>.azuredatabricks.net.
|
||||
token: str
|
||||
Your personal token. Find out more
|
||||
here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
|
||||
"""
|
||||
self.instance = instance
|
||||
self.token = token
|
||||
self.session = requests.Session()
|
||||
self.retries = Retry(
|
||||
total=10,
|
||||
backoff_factor=0.05,
|
||||
status_forcelist=[408, 429, 500, 502, 503, 504],
|
||||
)
|
||||
|
||||
self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
|
||||
self.session.headers.update({"Authorization": f"Bearer {self.token}"})
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@override
|
||||
def _ls_from_cache(self, path) -> list[dict[str, str | int]] | None:
|
||||
"""Check cache for listing
|
||||
|
||||
Returns listing, if found (may be empty list for a directory that
|
||||
exists but contains nothing), None if not in cache.
|
||||
"""
|
||||
self.dircache.pop(path.rstrip("/"), None)
|
||||
|
||||
parent = self._parent(path)
|
||||
if parent in self.dircache:
|
||||
for entry in self.dircache[parent]:
|
||||
if entry["name"] == path.rstrip("/"):
|
||||
if entry["type"] != "directory":
|
||||
return [entry]
|
||||
return []
|
||||
raise FileNotFoundError(path)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
"""
|
||||
List the contents of the given path.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path
|
||||
detail: bool
|
||||
Return not only the list of filenames,
|
||||
but also additional information on file sizes
|
||||
and types.
|
||||
"""
|
||||
try:
|
||||
out = self._ls_from_cache(path)
|
||||
except FileNotFoundError:
|
||||
# This happens if the `path`'s parent was cached, but `path` is not
|
||||
# there. This suggests that `path` is new since the parent was
|
||||
# cached. Attempt to invalidate parent's cache before continuing.
|
||||
self.dircache.pop(self._parent(path), None)
|
||||
out = None
|
||||
|
||||
if not out:
|
||||
try:
|
||||
r = self._send_to_api(
|
||||
method="get", endpoint="list", json={"path": path}
|
||||
)
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
|
||||
raise
|
||||
files = r.get("files", [])
|
||||
out = [
|
||||
{
|
||||
"name": o["path"],
|
||||
"type": "directory" if o["is_dir"] else "file",
|
||||
"size": o["file_size"],
|
||||
}
|
||||
for o in files
|
||||
]
|
||||
self.dircache[path] = out
|
||||
|
||||
if detail:
|
||||
return out
|
||||
return [o["name"] for o in out]
|
||||
|
||||
def makedirs(self, path, exist_ok=True):
|
||||
"""
|
||||
Create a given absolute path and all of its parents.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path to create
|
||||
exist_ok: bool
|
||||
If false, checks if the folder
|
||||
exists before creating it (and raises an
|
||||
Exception if this is the case)
|
||||
"""
|
||||
if not exist_ok:
|
||||
try:
|
||||
# If the following succeeds, the path is already present
|
||||
self._send_to_api(
|
||||
method="get", endpoint="get-status", json={"path": path}
|
||||
)
|
||||
raise FileExistsError(f"Path {path} already exists")
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
pass
|
||||
|
||||
try:
|
||||
self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_ALREADY_EXISTS":
|
||||
raise FileExistsError(e.message) from e
|
||||
|
||||
raise
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
"""
|
||||
Create a given absolute path and all of its parents.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path to create
|
||||
create_parents: bool
|
||||
Whether to create all parents or not.
|
||||
"False" is not implemented so far.
|
||||
"""
|
||||
if not create_parents:
|
||||
raise NotImplementedError
|
||||
|
||||
self.mkdirs(path, **kwargs)
|
||||
|
||||
def rm(self, path, recursive=False, **kwargs):
|
||||
"""
|
||||
Remove the file or folder at the given absolute path.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path what to remove
|
||||
recursive: bool
|
||||
Recursively delete all files in a folder.
|
||||
"""
|
||||
try:
|
||||
self._send_to_api(
|
||||
method="post",
|
||||
endpoint="delete",
|
||||
json={"path": path, "recursive": recursive},
|
||||
)
|
||||
except DatabricksException as e:
|
||||
# This is not really an exception, it just means
|
||||
# not everything was deleted so far
|
||||
if e.error_code == "PARTIAL_DELETE":
|
||||
self.rm(path=path, recursive=recursive)
|
||||
elif e.error_code == "IO_ERROR":
|
||||
# Using the same exception as the os module would use here
|
||||
raise OSError(e.message) from e
|
||||
|
||||
raise
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def mv(
|
||||
self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
|
||||
):
|
||||
"""
|
||||
Move a source to a destination path.
|
||||
|
||||
A note from the original [databricks API manual]
|
||||
        (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move):
|
||||
|
||||
When moving a large number of files the API call will time out after
|
||||
approximately 60s, potentially resulting in partially moved data.
|
||||
Therefore, for operations that move more than 10k files, we strongly
|
||||
discourage using the DBFS REST API.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source_path: str
|
||||
From where to move (absolute path)
|
||||
destination_path: str
|
||||
To where to move (absolute path)
|
||||
recursive: bool
|
||||
            Not implemented so far.
|
||||
maxdepth:
|
||||
            Not implemented so far.
|
||||
"""
|
||||
if recursive:
|
||||
raise NotImplementedError
|
||||
if maxdepth:
|
||||
raise NotImplementedError
|
||||
|
||||
try:
|
||||
self._send_to_api(
|
||||
method="post",
|
||||
endpoint="move",
|
||||
json={"source_path": source_path, "destination_path": destination_path},
|
||||
)
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
elif e.error_code == "RESOURCE_ALREADY_EXISTS":
|
||||
raise FileExistsError(e.message) from e
|
||||
|
||||
raise
|
||||
self.invalidate_cache(self._parent(source_path))
|
||||
self.invalidate_cache(self._parent(destination_path))
|
||||
|
||||
def _open(self, path, mode="rb", block_size="default", **kwargs):
|
||||
"""
|
||||
        Override the base class method to make sure a DatabricksFile is created.
|
||||
All arguments are copied from the base method.
|
||||
|
||||
Only the default blocksize is allowed.
|
||||
"""
|
||||
return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)
|
||||
|
||||
def _send_to_api(self, method, endpoint, json):
|
||||
"""
|
||||
Send the given json to the DBFS API
|
||||
using a get or post request (specified by the argument `method`).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
method: str
|
||||
Which http method to use for communication; "get" or "post".
|
||||
endpoint: str
|
||||
Where to send the request to (last part of the API URL)
|
||||
json: dict
|
||||
Dictionary of information to send
|
||||
"""
|
||||
if method == "post":
|
||||
session_call = self.session.post
|
||||
elif method == "get":
|
||||
session_call = self.session.get
|
||||
else:
|
||||
raise ValueError(f"Do not understand method {method}")
|
||||
|
||||
url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)
|
||||
|
||||
r = session_call(url, json=json)
|
||||
|
||||
# The DBFS API will return a json, also in case of an exception.
|
||||
        # We want to preserve this information as well as possible.
|
||||
try:
|
||||
r.raise_for_status()
|
||||
except requests.HTTPError as e:
|
||||
# try to extract json error message
|
||||
# if that fails, fall back to the original exception
|
||||
try:
|
||||
exception_json = e.response.json()
|
||||
except Exception:
|
||||
raise e from None
|
||||
|
||||
raise DatabricksException(**exception_json) from e
|
||||
|
||||
return r.json()
|
||||
|
||||
def _create_handle(self, path, overwrite=True):
|
||||
"""
|
||||
Internal function to create a handle, which can be used to
|
||||
write blocks of a file to DBFS.
|
||||
A handle has a unique identifier which needs to be passed
|
||||
whenever written during this transaction.
|
||||
The handle is active for 10 minutes - after that a new
|
||||
write transaction needs to be created.
|
||||
Make sure to close the handle after you are finished.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path for this file.
|
||||
overwrite: bool
|
||||
If a file already exist at this location, either overwrite
|
||||
it or raise an exception.
|
||||
"""
|
||||
try:
|
||||
r = self._send_to_api(
|
||||
method="post",
|
||||
endpoint="create",
|
||||
json={"path": path, "overwrite": overwrite},
|
||||
)
|
||||
return r["handle"]
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_ALREADY_EXISTS":
|
||||
raise FileExistsError(e.message) from e
|
||||
|
||||
raise
|
||||
|
||||
def _close_handle(self, handle):
|
||||
"""
|
||||
Close a handle, which was opened by :func:`_create_handle`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
handle: str
|
||||
Which handle to close.
|
||||
"""
|
||||
try:
|
||||
self._send_to_api(method="post", endpoint="close", json={"handle": handle})
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
|
||||
raise
|
||||
|
||||
def _add_data(self, handle, data):
|
||||
"""
|
||||
Upload data to an already opened file handle
|
||||
(opened by :func:`_create_handle`).
|
||||
The maximal allowed data size is 1MB after
|
||||
conversion to base64.
|
||||
Remember to close the handle when you are finished.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
handle: str
|
||||
Which handle to upload data to.
|
||||
data: bytes
|
||||
Block of data to add to the handle.
|
||||
"""
|
||||
data = base64.b64encode(data).decode()
|
||||
try:
|
||||
self._send_to_api(
|
||||
method="post",
|
||||
endpoint="add-block",
|
||||
json={"handle": handle, "data": data},
|
||||
)
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
|
||||
raise ValueError(e.message) from e
|
||||
|
||||
raise
|
||||
|
||||
def _get_data(self, path, start, end):
|
||||
"""
|
||||
Download data in bytes from a given absolute path in a block
|
||||
        from the half-open range [start, end).
|
||||
The maximum number of allowed bytes to read is 1MB.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path to download data from
|
||||
start: int
|
||||
Start position of the block
|
||||
end: int
|
||||
End position of the block
|
||||
"""
|
||||
try:
|
||||
r = self._send_to_api(
|
||||
method="get",
|
||||
endpoint="read",
|
||||
json={"path": path, "offset": start, "length": end - start},
|
||||
)
|
||||
return base64.b64decode(r["data"])
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
|
||||
raise ValueError(e.message) from e
|
||||
|
||||
raise
|
||||
|
||||
def invalidate_cache(self, path=None):
|
||||
if path is None:
|
||||
self.dircache.clear()
|
||||
else:
|
||||
self.dircache.pop(path, None)
|
||||
super().invalidate_cache(path)
|
||||
|
||||
|
||||
class DatabricksFile(AbstractBufferedFile):
|
||||
"""
|
||||
Helper class for files referenced in the DatabricksFileSystem.
|
||||
"""
|
||||
|
||||
DEFAULT_BLOCK_SIZE = 1 * 2**20 # only allowed block size
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size="default",
|
||||
autocommit=True,
|
||||
cache_type="readahead",
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Create a new instance of the DatabricksFile.
|
||||
|
||||
The blocksize needs to be the default one.
|
||||
"""
|
||||
if block_size is None or block_size == "default":
|
||||
block_size = self.DEFAULT_BLOCK_SIZE
|
||||
|
||||
assert block_size == self.DEFAULT_BLOCK_SIZE, (
|
||||
f"Only the default block size is allowed, not {block_size}"
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
fs,
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_type=cache_type,
|
||||
cache_options=cache_options or {},
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _initiate_upload(self):
|
||||
"""Internal function to start a file upload"""
|
||||
self.handle = self.fs._create_handle(self.path)
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
"""Internal function to add a chunk of data to a started upload"""
|
||||
self.buffer.seek(0)
|
||||
data = self.buffer.getvalue()
|
||||
|
||||
data_chunks = [
|
||||
data[start:end] for start, end in self._to_sized_blocks(len(data))
|
||||
]
|
||||
|
||||
for data_chunk in data_chunks:
|
||||
self.fs._add_data(handle=self.handle, data=data_chunk)
|
||||
|
||||
if final:
|
||||
self.fs._close_handle(handle=self.handle)
|
||||
return True
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Internal function to download a block of data"""
|
||||
return_buffer = b""
|
||||
length = end - start
|
||||
for chunk_start, chunk_end in self._to_sized_blocks(length, start):
|
||||
return_buffer += self.fs._get_data(
|
||||
path=self.path, start=chunk_start, end=chunk_end
|
||||
)
|
||||
|
||||
return return_buffer
|
||||
|
||||
def _to_sized_blocks(self, length, start=0):
|
||||
"""Helper function to split a range from 0 to total_length into blocksizes"""
|
||||
end = start + length
|
||||
for data_chunk in range(start, end, self.blocksize):
|
||||
data_start = data_chunk
|
||||
data_end = min(end, data_chunk + self.blocksize)
|
||||
yield data_start, data_end
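        # Worked example (illustrative): with the 1 MiB default blocksize, a
        # 2.5 MiB range starting at offset 0 is split into
        #     (0, 1048576), (1048576, 2097152), (2097152, 2621440)
        # i.e. two full blocks followed by a final partial block.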
|
||||
@@ -0,0 +1,388 @@
|
||||
from .. import filesystem
|
||||
from ..asyn import AsyncFileSystem
|
||||
|
||||
|
||||
class DirFileSystem(AsyncFileSystem):
|
||||
"""Directory prefix filesystem
|
||||
|
||||
The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
|
||||
is relative to the `path`. After performing the necessary paths operation it
|
||||
delegates everything to the wrapped filesystem.
|
||||
"""
|
||||
|
||||
protocol = "dir"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path=None,
|
||||
fs=None,
|
||||
fo=None,
|
||||
target_protocol=None,
|
||||
target_options=None,
|
||||
**storage_options,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Path to the directory.
|
||||
fs: AbstractFileSystem
|
||||
An instantiated filesystem to wrap.
|
||||
target_protocol, target_options:
|
||||
if fs is none, construct it from these
|
||||
fo: str
|
||||
Alternate for path; do not provide both
|
||||
"""
|
||||
super().__init__(**storage_options)
|
||||
if fs is None:
|
||||
fs = filesystem(protocol=target_protocol, **(target_options or {}))
|
||||
path = path or fo
|
||||
|
||||
if self.asynchronous and not fs.async_impl:
|
||||
raise ValueError("can't use asynchronous with non-async fs")
|
||||
|
||||
if fs.async_impl and self.asynchronous != fs.asynchronous:
|
||||
raise ValueError("both dirfs and fs should be in the same sync/async mode")
|
||||
|
||||
self.path = fs._strip_protocol(path)
|
||||
self.fs = fs
|
||||
|
||||
def _join(self, path):
|
||||
if isinstance(path, str):
|
||||
if not self.path:
|
||||
return path
|
||||
if not path:
|
||||
return self.path
|
||||
return self.fs.sep.join((self.path, self._strip_protocol(path)))
|
||||
if isinstance(path, dict):
|
||||
return {self._join(_path): value for _path, value in path.items()}
|
||||
return [self._join(_path) for _path in path]
|
||||
|
||||
def _relpath(self, path):
|
||||
if isinstance(path, str):
|
||||
if not self.path:
|
||||
return path
|
||||
# We need to account for S3FileSystem returning paths that do not
|
||||
# start with a '/'
|
||||
if path == self.path or (
|
||||
self.path.startswith(self.fs.sep) and path == self.path[1:]
|
||||
):
|
||||
return ""
|
||||
prefix = self.path + self.fs.sep
|
||||
if self.path.startswith(self.fs.sep) and not path.startswith(self.fs.sep):
|
||||
prefix = prefix[1:]
|
||||
assert path.startswith(prefix)
|
||||
return path[len(prefix) :]
|
||||
return [self._relpath(_path) for _path in path]
|
||||
|
||||
# Wrappers below
|
||||
|
||||
@property
|
||||
def sep(self):
|
||||
return self.fs.sep
|
||||
|
||||
async def set_session(self, *args, **kwargs):
|
||||
return await self.fs.set_session(*args, **kwargs)
|
||||
|
||||
async def _rm_file(self, path, **kwargs):
|
||||
return await self.fs._rm_file(self._join(path), **kwargs)
|
||||
|
||||
def rm_file(self, path, **kwargs):
|
||||
return self.fs.rm_file(self._join(path), **kwargs)
|
||||
|
||||
async def _rm(self, path, *args, **kwargs):
|
||||
return await self.fs._rm(self._join(path), *args, **kwargs)
|
||||
|
||||
def rm(self, path, *args, **kwargs):
|
||||
return self.fs.rm(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _cp_file(self, path1, path2, **kwargs):
|
||||
return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)
|
||||
|
||||
def cp_file(self, path1, path2, **kwargs):
|
||||
return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)
|
||||
|
||||
async def _copy(
|
||||
self,
|
||||
path1,
|
||||
path2,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return await self.fs._copy(
|
||||
self._join(path1),
|
||||
self._join(path2),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def copy(self, path1, path2, *args, **kwargs):
|
||||
return self.fs.copy(
|
||||
self._join(path1),
|
||||
self._join(path2),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
async def _pipe(self, path, *args, **kwargs):
|
||||
return await self.fs._pipe(self._join(path), *args, **kwargs)
|
||||
|
||||
def pipe(self, path, *args, **kwargs):
|
||||
return self.fs.pipe(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _pipe_file(self, path, *args, **kwargs):
|
||||
return await self.fs._pipe_file(self._join(path), *args, **kwargs)
|
||||
|
||||
def pipe_file(self, path, *args, **kwargs):
|
||||
return self.fs.pipe_file(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _cat_file(self, path, *args, **kwargs):
|
||||
return await self.fs._cat_file(self._join(path), *args, **kwargs)
|
||||
|
||||
def cat_file(self, path, *args, **kwargs):
|
||||
return self.fs.cat_file(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _cat(self, path, *args, **kwargs):
|
||||
ret = await self.fs._cat(
|
||||
self._join(path),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if isinstance(ret, dict):
|
||||
return {self._relpath(key): value for key, value in ret.items()}
|
||||
|
||||
return ret
|
||||
|
||||
def cat(self, path, *args, **kwargs):
|
||||
ret = self.fs.cat(
|
||||
self._join(path),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if isinstance(ret, dict):
|
||||
return {self._relpath(key): value for key, value in ret.items()}
|
||||
|
||||
return ret
|
||||
|
||||
async def _put_file(self, lpath, rpath, **kwargs):
|
||||
return await self.fs._put_file(lpath, self._join(rpath), **kwargs)
|
||||
|
||||
def put_file(self, lpath, rpath, **kwargs):
|
||||
return self.fs.put_file(lpath, self._join(rpath), **kwargs)
|
||||
|
||||
async def _put(
|
||||
self,
|
||||
lpath,
|
||||
rpath,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return await self.fs._put(
|
||||
lpath,
|
||||
self._join(rpath),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def put(self, lpath, rpath, *args, **kwargs):
|
||||
return self.fs.put(
|
||||
lpath,
|
||||
self._join(rpath),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
async def _get_file(self, rpath, lpath, **kwargs):
|
||||
return await self.fs._get_file(self._join(rpath), lpath, **kwargs)
|
||||
|
||||
def get_file(self, rpath, lpath, **kwargs):
|
||||
return self.fs.get_file(self._join(rpath), lpath, **kwargs)
|
||||
|
||||
async def _get(self, rpath, *args, **kwargs):
|
||||
return await self.fs._get(self._join(rpath), *args, **kwargs)
|
||||
|
||||
def get(self, rpath, *args, **kwargs):
|
||||
return self.fs.get(self._join(rpath), *args, **kwargs)
|
||||
|
||||
async def _isfile(self, path):
|
||||
return await self.fs._isfile(self._join(path))
|
||||
|
||||
def isfile(self, path):
|
||||
return self.fs.isfile(self._join(path))
|
||||
|
||||
async def _isdir(self, path):
|
||||
return await self.fs._isdir(self._join(path))
|
||||
|
||||
def isdir(self, path):
|
||||
return self.fs.isdir(self._join(path))
|
||||
|
||||
async def _size(self, path):
|
||||
return await self.fs._size(self._join(path))
|
||||
|
||||
def size(self, path):
|
||||
return self.fs.size(self._join(path))
|
||||
|
||||
async def _exists(self, path):
|
||||
return await self.fs._exists(self._join(path))
|
||||
|
||||
def exists(self, path):
|
||||
return self.fs.exists(self._join(path))
|
||||
|
||||
async def _info(self, path, **kwargs):
|
||||
info = await self.fs._info(self._join(path), **kwargs)
|
||||
info = info.copy()
|
||||
info["name"] = self._relpath(info["name"])
|
||||
return info
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
info = self.fs.info(self._join(path), **kwargs)
|
||||
info = info.copy()
|
||||
info["name"] = self._relpath(info["name"])
|
||||
return info
|
||||
|
||||
async def _ls(self, path, detail=True, **kwargs):
|
||||
ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
|
||||
if detail:
|
||||
out = []
|
||||
for entry in ret:
|
||||
entry = entry.copy()
|
||||
entry["name"] = self._relpath(entry["name"])
|
||||
out.append(entry)
|
||||
return out
|
||||
|
||||
return self._relpath(ret)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
|
||||
if detail:
|
||||
out = []
|
||||
for entry in ret:
|
||||
entry = entry.copy()
|
||||
entry["name"] = self._relpath(entry["name"])
|
||||
out.append(entry)
|
||||
return out
|
||||
|
||||
return self._relpath(ret)
|
||||
|
||||
async def _walk(self, path, *args, **kwargs):
|
||||
async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
|
||||
yield self._relpath(root), dirs, files
|
||||
|
||||
def walk(self, path, *args, **kwargs):
|
||||
for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
|
||||
yield self._relpath(root), dirs, files
|
||||
|
||||
async def _glob(self, path, **kwargs):
|
||||
detail = kwargs.get("detail", False)
|
||||
ret = await self.fs._glob(self._join(path), **kwargs)
|
||||
if detail:
|
||||
return {self._relpath(path): info for path, info in ret.items()}
|
||||
return self._relpath(ret)
|
||||
|
||||
def glob(self, path, **kwargs):
|
||||
detail = kwargs.get("detail", False)
|
||||
ret = self.fs.glob(self._join(path), **kwargs)
|
||||
if detail:
|
||||
return {self._relpath(path): info for path, info in ret.items()}
|
||||
return self._relpath(ret)
|
||||
|
||||
async def _du(self, path, *args, **kwargs):
|
||||
total = kwargs.get("total", True)
|
||||
ret = await self.fs._du(self._join(path), *args, **kwargs)
|
||||
if total:
|
||||
return ret
|
||||
|
||||
return {self._relpath(path): size for path, size in ret.items()}
|
||||
|
||||
def du(self, path, *args, **kwargs):
|
||||
total = kwargs.get("total", True)
|
||||
ret = self.fs.du(self._join(path), *args, **kwargs)
|
||||
if total:
|
||||
return ret
|
||||
|
||||
return {self._relpath(path): size for path, size in ret.items()}
|
||||
|
||||
async def _find(self, path, *args, **kwargs):
|
||||
detail = kwargs.get("detail", False)
|
||||
ret = await self.fs._find(self._join(path), *args, **kwargs)
|
||||
if detail:
|
||||
return {self._relpath(path): info for path, info in ret.items()}
|
||||
return self._relpath(ret)
|
||||
|
||||
def find(self, path, *args, **kwargs):
|
||||
detail = kwargs.get("detail", False)
|
||||
ret = self.fs.find(self._join(path), *args, **kwargs)
|
||||
if detail:
|
||||
return {self._relpath(path): info for path, info in ret.items()}
|
||||
return self._relpath(ret)
|
||||
|
||||
async def _expand_path(self, path, *args, **kwargs):
|
||||
return self._relpath(
|
||||
await self.fs._expand_path(self._join(path), *args, **kwargs)
|
||||
)
|
||||
|
||||
def expand_path(self, path, *args, **kwargs):
|
||||
return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))
|
||||
|
||||
async def _mkdir(self, path, *args, **kwargs):
|
||||
return await self.fs._mkdir(self._join(path), *args, **kwargs)
|
||||
|
||||
def mkdir(self, path, *args, **kwargs):
|
||||
return self.fs.mkdir(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _makedirs(self, path, *args, **kwargs):
|
||||
return await self.fs._makedirs(self._join(path), *args, **kwargs)
|
||||
|
||||
def makedirs(self, path, *args, **kwargs):
|
||||
return self.fs.makedirs(self._join(path), *args, **kwargs)
|
||||
|
||||
def rmdir(self, path):
|
||||
return self.fs.rmdir(self._join(path))
|
||||
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
return self.fs.mv(
|
||||
self._join(path1),
|
||||
self._join(path2),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def touch(self, path, **kwargs):
|
||||
return self.fs.touch(self._join(path), **kwargs)
|
||||
|
||||
def created(self, path):
|
||||
return self.fs.created(self._join(path))
|
||||
|
||||
def modified(self, path):
|
||||
return self.fs.modified(self._join(path))
|
||||
|
||||
def sign(self, path, *args, **kwargs):
|
||||
return self.fs.sign(self._join(path), *args, **kwargs)
|
||||
|
||||
def __repr__(self):
|
||||
return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"
|
||||
|
||||
def open(
|
||||
self,
|
||||
path,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return self.fs.open(
|
||||
self._join(path),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
async def open_async(
|
||||
self,
|
||||
path,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return await self.fs.open_async(
|
||||
self._join(path),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
@@ -0,0 +1,387 @@
|
||||
import os
|
||||
import uuid
|
||||
from ftplib import FTP, FTP_TLS, Error, error_perm
|
||||
from typing import Any
|
||||
|
||||
from ..spec import AbstractBufferedFile, AbstractFileSystem
|
||||
from ..utils import infer_storage_options, isfilelike
|
||||
|
||||
|
||||
class FTPFileSystem(AbstractFileSystem):
|
||||
"""A filesystem over classic FTP"""
|
||||
|
||||
root_marker = "/"
|
||||
cachable = False
|
||||
protocol = "ftp"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host,
|
||||
port=21,
|
||||
username=None,
|
||||
password=None,
|
||||
acct=None,
|
||||
block_size=None,
|
||||
tempdir=None,
|
||||
timeout=30,
|
||||
encoding="utf-8",
|
||||
tls=False,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
You can use _get_kwargs_from_urls to get some kwargs from
|
||||
a reasonable FTP url.
|
||||
|
||||
Authentication will be anonymous if username/password are not
|
||||
given.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
The remote server name/ip to connect to
|
||||
port: int
|
||||
Port to connect with
|
||||
username: str or None
|
||||
If authenticating, the user's identifier
|
||||
        password: str or None
|
||||
User's password on the server, if using
|
||||
acct: str or None
|
||||
Some servers also need an "account" string for auth
|
||||
block_size: int or None
|
||||
If given, the read-ahead or write buffer size.
|
||||
tempdir: str
|
||||
Directory on remote to put temporary files when in a transaction
|
||||
timeout: int
|
||||
Timeout of the ftp connection in seconds
|
||||
encoding: str
|
||||
Encoding to use for directories and filenames in FTP connection
|
||||
tls: bool
|
||||
Use FTP-TLS, by default False
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.tempdir = tempdir or "/tmp"
|
||||
self.cred = username or "", password or "", acct or ""
|
||||
self.timeout = timeout
|
||||
self.encoding = encoding
|
||||
if block_size is not None:
|
||||
self.blocksize = block_size
|
||||
else:
|
||||
self.blocksize = 2**16
|
||||
self.tls = tls
|
||||
self._connect()
|
||||
if self.tls:
|
||||
self.ftp.prot_p()
|
||||
|
||||
def _connect(self):
|
||||
if self.tls:
|
||||
ftp_cls = FTP_TLS
|
||||
else:
|
||||
ftp_cls = FTP
|
||||
self.ftp = ftp_cls(timeout=self.timeout, encoding=self.encoding)
|
||||
self.ftp.connect(self.host, self.port)
|
||||
self.ftp.login(*self.cred)
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(urlpath):
|
||||
out = infer_storage_options(urlpath)
|
||||
out.pop("path", None)
|
||||
out.pop("protocol", None)
|
||||
return out
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
out = []
|
||||
if path not in self.dircache:
|
||||
try:
|
||||
try:
|
||||
out = [
|
||||
(fn, details)
|
||||
for (fn, details) in self.ftp.mlsd(path)
|
||||
if fn not in [".", ".."]
|
||||
and details["type"] not in ["pdir", "cdir"]
|
||||
]
|
||||
except error_perm:
|
||||
out = _mlsd2(self.ftp, path) # Not platform independent
|
||||
for fn, details in out:
|
||||
details["name"] = "/".join(
|
||||
["" if path == "/" else path, fn.lstrip("/")]
|
||||
)
|
||||
if details["type"] == "file":
|
||||
details["size"] = int(details["size"])
|
||||
else:
|
||||
details["size"] = 0
|
||||
if details["type"] == "dir":
|
||||
details["type"] = "directory"
|
||||
self.dircache[path] = out
|
||||
except Error:
|
||||
try:
|
||||
info = self.info(path)
|
||||
if info["type"] == "file":
|
||||
out = [(path, info)]
|
||||
except (Error, IndexError) as exc:
|
||||
raise FileNotFoundError(path) from exc
|
||||
files = self.dircache.get(path, out)
|
||||
if not detail:
|
||||
return sorted([fn for fn, details in files])
|
||||
return [details for fn, details in files]
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
# implement with direct method
|
||||
path = self._strip_protocol(path)
|
||||
if path == "/":
|
||||
# special case, since this dir has no real entry
|
||||
return {"name": "/", "size": 0, "type": "directory"}
|
||||
files = self.ls(self._parent(path).lstrip("/"), True)
|
||||
try:
|
||||
out = next(f for f in files if f["name"] == path)
|
||||
except StopIteration as exc:
|
||||
raise FileNotFoundError(path) from exc
|
||||
return out
|
||||
|
||||
def get_file(self, rpath, lpath, **kwargs):
|
||||
if self.isdir(rpath):
|
||||
if not os.path.exists(lpath):
|
||||
os.mkdir(lpath)
|
||||
return
|
||||
if isfilelike(lpath):
|
||||
outfile = lpath
|
||||
else:
|
||||
outfile = open(lpath, "wb")
|
||||
|
||||
def cb(x):
|
||||
outfile.write(x)
|
||||
|
||||
self.ftp.retrbinary(
|
||||
f"RETR {rpath}",
|
||||
blocksize=self.blocksize,
|
||||
callback=cb,
|
||||
)
|
||||
if not isfilelike(lpath):
|
||||
outfile.close()
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
if end is not None:
|
||||
return super().cat_file(path, start, end, **kwargs)
|
||||
out = []
|
||||
|
||||
def cb(x):
|
||||
out.append(x)
|
||||
|
||||
try:
|
||||
self.ftp.retrbinary(
|
||||
f"RETR {path}",
|
||||
blocksize=self.blocksize,
|
||||
rest=start,
|
||||
callback=cb,
|
||||
)
|
||||
except (Error, error_perm) as orig_exc:
|
||||
raise FileNotFoundError(path) from orig_exc
|
||||
return b"".join(out)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
cache_options=None,
|
||||
autocommit=True,
|
||||
**kwargs,
|
||||
):
|
||||
path = self._strip_protocol(path)
|
||||
block_size = block_size or self.blocksize
|
||||
return FTPFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
tempdir=self.tempdir,
|
||||
autocommit=autocommit,
|
||||
cache_options=cache_options,
|
||||
)
|
||||
|
||||
def _rm(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.ftp.delete(path)
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def rm(self, path, recursive=False, maxdepth=None):
|
||||
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
|
||||
for p in reversed(paths):
|
||||
if self.isfile(p):
|
||||
self.rm_file(p)
|
||||
else:
|
||||
self.rmdir(p)
|
||||
|
||||
def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
|
||||
path = self._strip_protocol(path)
|
||||
parent = self._parent(path)
|
||||
if parent != self.root_marker and not self.exists(parent) and create_parents:
|
||||
self.mkdir(parent, create_parents=create_parents)
|
||||
|
||||
self.ftp.mkd(path)
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def makedirs(self, path: str, exist_ok: bool = False) -> None:
|
||||
path = self._strip_protocol(path)
|
||||
if self.exists(path):
|
||||
# NB: "/" does not "exist" as it has no directory entry
|
||||
if not exist_ok:
|
||||
raise FileExistsError(f"{path} exists without `exist_ok`")
|
||||
            # exist_ok=True -> no-op
|
||||
else:
|
||||
self.mkdir(path, create_parents=True)
|
||||
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.ftp.rmd(path)
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1)
|
||||
path2 = self._strip_protocol(path2)
|
||||
self.ftp.rename(path1, path2)
|
||||
self.invalidate_cache(self._parent(path1))
|
||||
self.invalidate_cache(self._parent(path2))
|
||||
|
||||
def __del__(self):
|
||||
self.ftp.close()
|
||||
|
||||
def invalidate_cache(self, path=None):
|
||||
if path is None:
|
||||
self.dircache.clear()
|
||||
else:
|
||||
self.dircache.pop(path, None)
|
||||
super().invalidate_cache(path)
|
||||
|
||||
|
||||
class TransferDone(Exception):
|
||||
"""Internal exception to break out of transfer"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class FTPFile(AbstractBufferedFile):
|
||||
"""Interact with a remote FTP file with read/write buffering"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size="default",
|
||||
autocommit=True,
|
||||
cache_type="readahead",
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
fs,
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_type=cache_type,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
if not autocommit:
|
||||
self.target = self.path
|
||||
self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])
|
||||
|
||||
def commit(self):
|
||||
self.fs.mv(self.path, self.target)
|
||||
|
||||
def discard(self):
|
||||
self.fs.rm(self.path)
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Get bytes between given byte limits
|
||||
|
||||
Implemented by raising an exception in the fetch callback when the
|
||||
number of bytes received reaches the requested amount.
|
||||
|
||||
Will fail if the server does not respect the REST command on
|
||||
retrieve requests.
|
||||
"""
|
||||
out = []
|
||||
total = [0]
|
||||
|
||||
def callback(x):
|
||||
total[0] += len(x)
|
||||
if total[0] > end - start:
|
||||
out.append(x[: (end - start) - total[0]])
|
||||
if end < self.size:
|
||||
raise TransferDone
|
||||
else:
|
||||
out.append(x)
|
||||
|
||||
if total[0] == end - start and end < self.size:
|
||||
raise TransferDone
|
||||
|
||||
try:
|
||||
self.fs.ftp.retrbinary(
|
||||
f"RETR {self.path}",
|
||||
blocksize=self.blocksize,
|
||||
rest=start,
|
||||
callback=callback,
|
||||
)
|
||||
except TransferDone:
|
||||
try:
|
||||
# stop transfer, we got enough bytes for this block
|
||||
self.fs.ftp.abort()
|
||||
self.fs.ftp.getmultiline()
|
||||
except Error:
|
||||
self.fs._connect()
|
||||
|
||||
return b"".join(out)
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
self.buffer.seek(0)
|
||||
self.fs.ftp.storbinary(
|
||||
f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
def _mlsd2(ftp, path="."):
|
||||
"""
|
||||
Fall back to using `dir` instead of `mlsd` if not supported.
|
||||
|
||||
This parses a Linux style `ls -l` response to `dir`, but the response may
|
||||
be platform dependent.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ftp: ftplib.FTP
|
||||
path: str
|
||||
        Path to list; defaults to ".".
|
||||
"""
|
||||
lines = []
|
||||
minfo = []
|
||||
ftp.dir(path, lines.append)
|
||||
for line in lines:
|
||||
split_line = line.split()
|
||||
if len(split_line) < 9:
|
||||
continue
|
||||
this = (
|
||||
split_line[-1],
|
||||
{
|
||||
"modify": " ".join(split_line[5:8]),
|
||||
"unix.owner": split_line[2],
|
||||
"unix.group": split_line[3],
|
||||
"unix.mode": split_line[0],
|
||||
"size": split_line[4],
|
||||
},
|
||||
)
|
||||
if this[1]["unix.mode"][0] == "d":
|
||||
this[1]["type"] = "dir"
|
||||
else:
|
||||
this[1]["type"] = "file"
|
||||
minfo.append(this)
|
||||
return minfo
|
||||
@@ -0,0 +1,241 @@
|
||||
import requests
|
||||
|
||||
from ..spec import AbstractFileSystem
|
||||
from ..utils import infer_storage_options
|
||||
from .memory import MemoryFile
|
||||
|
||||
|
||||
class GistFileSystem(AbstractFileSystem):
|
||||
"""
|
||||
Interface to files in a single GitHub Gist.
|
||||
|
||||
Provides read-only access to a gist's files. Gists do not contain
|
||||
subdirectories, so file listing is straightforward.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
gist_id: str
|
||||
The ID of the gist you want to access (the long hex value from the URL).
|
||||
filenames: list[str] (optional)
|
||||
If provided, only make a file system representing these files, and do not fetch
|
||||
the list of all files for this gist.
|
||||
sha: str (optional)
|
||||
If provided, fetch a particular revision of the gist. If omitted,
|
||||
the latest revision is used.
|
||||
username: str (optional)
|
||||
GitHub username for authentication.
|
||||
token: str (optional)
|
||||
        GitHub personal access token (required if username is given).
|
||||
timeout: (float, float) or float, optional
|
||||
Connect and read timeouts for requests (default 60s each).
|
||||
kwargs: dict
|
||||
Stored on `self.request_kw` and passed to `requests.get` when fetching Gist
|
||||
metadata or reading ("opening") a file.
|
||||
"""
|
||||
|
||||
protocol = "gist"
|
||||
gist_url = "https://api.github.com/gists/{gist_id}"
|
||||
gist_rev_url = "https://api.github.com/gists/{gist_id}/{sha}"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
gist_id,
|
||||
filenames=None,
|
||||
sha=None,
|
||||
username=None,
|
||||
token=None,
|
||||
timeout=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__()
|
||||
self.gist_id = gist_id
|
||||
self.filenames = filenames
|
||||
self.sha = sha # revision of the gist (optional)
|
||||
if username is not None and token is None:
|
||||
raise ValueError("User auth requires a token")
|
||||
self.username = username
|
||||
self.token = token
|
||||
self.request_kw = kwargs
|
||||
# Default timeouts to 60s connect/read if none provided
|
||||
self.timeout = timeout if timeout is not None else (60, 60)
|
||||
|
||||
# We use a single-level "directory" cache, because a gist is essentially flat
|
||||
self.dircache[""] = self._fetch_file_list()
|
||||
|
||||
@property
|
||||
def kw(self):
|
||||
"""Auth parameters passed to 'requests' if we have username/token."""
|
||||
kw = {
|
||||
"headers": {
|
||||
"Accept": "application/vnd.github+json",
|
||||
"X-GitHub-Api-Version": "2022-11-28",
|
||||
}
|
||||
}
|
||||
kw.update(self.request_kw)
|
||||
if self.username and self.token:
|
||||
kw["auth"] = (self.username, self.token)
|
||||
elif self.token:
|
||||
kw["headers"]["Authorization"] = f"Bearer {self.token}"
|
||||
return kw
|
||||
|
||||
def _fetch_gist_metadata(self):
|
||||
"""
|
||||
Fetch the JSON metadata for this gist (possibly for a specific revision).
|
||||
"""
|
||||
if self.sha:
|
||||
url = self.gist_rev_url.format(gist_id=self.gist_id, sha=self.sha)
|
||||
else:
|
||||
url = self.gist_url.format(gist_id=self.gist_id)
|
||||
|
||||
r = requests.get(url, timeout=self.timeout, **self.kw)
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(
|
||||
f"Gist not found: {self.gist_id}@{self.sha or 'latest'}"
|
||||
)
|
||||
r.raise_for_status()
|
||||
return r.json()
|
||||
|
||||
def _fetch_file_list(self):
|
||||
"""
|
||||
Returns a list of dicts describing each file in the gist. These get stored
|
||||
in self.dircache[""].
|
||||
"""
|
||||
meta = self._fetch_gist_metadata()
|
||||
if self.filenames:
|
||||
available_files = meta.get("files", {})
|
||||
files = {}
|
||||
for fn in self.filenames:
|
||||
if fn not in available_files:
|
||||
raise FileNotFoundError(fn)
|
||||
files[fn] = available_files[fn]
|
||||
else:
|
||||
files = meta.get("files", {})
|
||||
|
||||
out = []
|
||||
for fname, finfo in files.items():
|
||||
if finfo is None:
|
||||
# Occasionally GitHub returns a file entry with null if it was deleted
|
||||
continue
|
||||
# Build a directory entry
|
||||
out.append(
|
||||
{
|
||||
"name": fname, # file's name
|
||||
"type": "file", # gists have no subdirectories
|
||||
"size": finfo.get("size", 0), # file size in bytes
|
||||
"raw_url": finfo.get("raw_url"),
|
||||
}
|
||||
)
|
||||
return out
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
"""
|
||||
Remove 'gist://' from the path, if present.
|
||||
"""
|
||||
# The default infer_storage_options can handle gist://username:token@id/file
|
||||
# or gist://id/file, but let's ensure we handle a normal usage too.
|
||||
# We'll just strip the protocol prefix if it exists.
|
||||
path = infer_storage_options(path).get("path", path)
|
||||
return path.lstrip("/")
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
"""
|
||||
Parse 'gist://' style URLs into GistFileSystem constructor kwargs.
|
||||
For example:
|
||||
gist://:TOKEN@<gist_id>/file.txt
|
||||
gist://username:TOKEN@<gist_id>/file.txt
|
||||
"""
|
||||
so = infer_storage_options(path)
|
||||
out = {}
|
||||
if "username" in so and so["username"]:
|
||||
out["username"] = so["username"]
|
||||
if "password" in so and so["password"]:
|
||||
out["token"] = so["password"]
|
||||
if "host" in so and so["host"]:
|
||||
# We interpret 'host' as the gist ID
|
||||
out["gist_id"] = so["host"]
|
||||
|
||||
# Extract SHA and filename from path
|
||||
if "path" in so and so["path"]:
|
||||
path_parts = so["path"].rsplit("/", 2)[-2:]
|
||||
if len(path_parts) == 2:
|
||||
if path_parts[0]: # SHA present
|
||||
out["sha"] = path_parts[0]
|
||||
if path_parts[1]: # filename also present
|
||||
out["filenames"] = [path_parts[1]]
|
||||
|
||||
return out
|
||||
|
||||
def ls(self, path="", detail=False, **kwargs):
|
||||
"""
|
||||
List files in the gist. Gists are single-level, so any 'path' is basically
|
||||
the filename, or empty for all files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str, optional
|
||||
The filename to list. If empty, returns all files in the gist.
|
||||
detail : bool, default False
|
||||
If True, return a list of dicts; if False, return a list of filenames.
|
||||
"""
|
||||
path = self._strip_protocol(path or "")
|
||||
# If path is empty, return all
|
||||
if path == "":
|
||||
results = self.dircache[""]
|
||||
else:
|
||||
# We want just the single file with this name
|
||||
all_files = self.dircache[""]
|
||||
results = [f for f in all_files if f["name"] == path]
|
||||
if not results:
|
||||
raise FileNotFoundError(path)
|
||||
if detail:
|
||||
return results
|
||||
else:
|
||||
return sorted(f["name"] for f in results)
|
||||
|
||||
def _open(self, path, mode="rb", block_size=None, **kwargs):
|
||||
"""
|
||||
Read a single file from the gist.
|
||||
"""
|
||||
if mode != "rb":
|
||||
raise NotImplementedError("GitHub Gist FS is read-only (no write).")
|
||||
|
||||
path = self._strip_protocol(path)
|
||||
# Find the file entry in our dircache
|
||||
matches = [f for f in self.dircache[""] if f["name"] == path]
|
||||
if not matches:
|
||||
raise FileNotFoundError(path)
|
||||
finfo = matches[0]
|
||||
|
||||
raw_url = finfo.get("raw_url")
|
||||
if not raw_url:
|
||||
raise FileNotFoundError(f"No raw_url for file: {path}")
|
||||
|
||||
r = requests.get(raw_url, timeout=self.timeout, **self.kw)
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
return MemoryFile(path, None, r.content)
|
||||
|
||||
def cat(self, path, recursive=False, on_error="raise", **kwargs):
|
||||
"""
|
||||
Return {path: contents} for the given file or files. If 'recursive' is True,
|
||||
and path is empty, returns all files in the gist.
|
||||
"""
|
||||
paths = self.expand_path(path, recursive=recursive)
|
||||
out = {}
|
||||
for p in paths:
|
||||
try:
|
||||
with self.open(p, "rb") as f:
|
||||
out[p] = f.read()
|
||||
except FileNotFoundError as e:
|
||||
if on_error == "raise":
|
||||
raise e
|
||||
elif on_error == "omit":
|
||||
pass # skip
|
||||
else:
|
||||
out[p] = e
|
||||
if len(paths) == 1 and paths[0] == path:
|
||||
return out[path]
|
||||
return out
|
||||
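# --- illustrative usage sketch, not part of the vendored module ---
# Reading a public gist through fsspec, assuming this class is registered
# under the "gist" protocol; the gist id and file name are placeholders.
import fsspec

fs = fsspec.filesystem("gist", gist_id="abc123")
print(fs.ls(""))                       # file names in the gist
with fs.open("notes.txt", "rb") as f:  # read-only; other modes raise
    data = f.read()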
@@ -0,0 +1,114 @@
|
||||
import os
|
||||
|
||||
import pygit2
|
||||
|
||||
from fsspec.spec import AbstractFileSystem
|
||||
|
||||
from .memory import MemoryFile
|
||||
|
||||
|
||||
class GitFileSystem(AbstractFileSystem):
|
||||
"""Browse the files of a local git repo at any hash/tag/branch
|
||||
|
||||
(experimental backend)
|
||||
"""
|
||||
|
||||
root_marker = ""
|
||||
cachable = True
|
||||
|
||||
def __init__(self, path=None, fo=None, ref=None, **kwargs):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str (optional)
|
||||
Local location of the repo (uses current directory if not given).
|
||||
May be deprecated in favour of ``fo``. When used with a higher
|
||||
level function such as fsspec.open(), may be of the form
|
||||
"git://[path-to-repo[:]][ref@]path/to/file" (but the actual
|
||||
file path should not contain "@" or ":").
|
||||
fo: str (optional)
|
||||
Same as ``path``, but passed as part of a chained URL. This one
|
||||
takes precedence if both are given.
|
||||
ref: str (optional)
|
||||
Reference to work with, could be a hash, tag or branch name. Defaults
|
||||
to current working tree. Note that ``ls`` and ``open`` also take hash,
|
||||
so this becomes the default for those operations
|
||||
kwargs
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.repo = pygit2.Repository(fo or path or os.getcwd())
|
||||
self.ref = ref or "master"
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
path = super()._strip_protocol(path).lstrip("/")
|
||||
if ":" in path:
|
||||
path = path.split(":", 1)[1]
|
||||
if "@" in path:
|
||||
path = path.split("@", 1)[1]
|
||||
return path.lstrip("/")
|
||||
|
||||
def _path_to_object(self, path, ref):
|
||||
comm, ref = self.repo.resolve_refish(ref or self.ref)
|
||||
parts = path.split("/")
|
||||
tree = comm.tree
|
||||
for part in parts:
|
||||
if part and isinstance(tree, pygit2.Tree):
|
||||
if part not in tree:
|
||||
raise FileNotFoundError(path)
|
||||
tree = tree[part]
|
||||
return tree
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
path = path.removeprefix("git://")
|
||||
out = {}
|
||||
if ":" in path:
|
||||
out["path"], path = path.split(":", 1)
|
||||
if "@" in path:
|
||||
out["ref"], path = path.split("@", 1)
|
||||
return out
|
||||
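# --- illustrative sketch, not part of the vendored module ---
# The "git://[path-to-repo[:]][ref@]path/to/file" form described in __init__
# is split into constructor kwargs by the static method above; the repo path,
# ref and file name are placeholders.
from fsspec.implementations.git import GitFileSystem

print(GitFileSystem._get_kwargs_from_urls("git:///tmp/myrepo:v1.0@src/module.py"))
# Expected: {"path": "/tmp/myrepo", "ref": "v1.0"}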
|
||||
@staticmethod
|
||||
def _object_to_info(obj, path=None):
|
||||
# obj.name and obj.filemode are None for the root tree!
|
||||
is_dir = isinstance(obj, pygit2.Tree)
|
||||
return {
|
||||
"type": "directory" if is_dir else "file",
|
||||
"name": (
|
||||
"/".join([path, obj.name or ""]).lstrip("/") if path else obj.name
|
||||
),
|
||||
"hex": str(obj.id),
|
||||
"mode": "100644" if obj.filemode is None else f"{obj.filemode:o}",
|
||||
"size": 0 if is_dir else obj.size,
|
||||
}
|
||||
|
||||
def ls(self, path, detail=True, ref=None, **kwargs):
|
||||
tree = self._path_to_object(self._strip_protocol(path), ref)
|
||||
return [
|
||||
GitFileSystem._object_to_info(obj, path)
|
||||
if detail
|
||||
else GitFileSystem._object_to_info(obj, path)["name"]
|
||||
for obj in (tree if isinstance(tree, pygit2.Tree) else [tree])
|
||||
]
|
||||
|
||||
def info(self, path, ref=None, **kwargs):
|
||||
tree = self._path_to_object(self._strip_protocol(path), ref)
|
||||
return GitFileSystem._object_to_info(tree, path)
|
||||
|
||||
def ukey(self, path, ref=None):
|
||||
return self.info(path, ref=ref)["hex"]
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
ref=None,
|
||||
**kwargs,
|
||||
):
|
||||
obj = self._path_to_object(path, ref or self.ref)
|
||||
return MemoryFile(data=obj.data)
|
||||
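# --- illustrative usage sketch, not part of the vendored module ---
# Browsing a local repository at a tag; requires pygit2 and assumes the class
# is registered under the "git" protocol. The repo path, ref and file name
# are placeholders.
import fsspec

fs = fsspec.filesystem("git", path="/tmp/myrepo", ref="v1.0")
print(fs.ls(""))                 # tree entries at that ref
with fs.open("README.md") as f:  # contents are served from a MemoryFile
    print(f.read()[:80])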
@@ -0,0 +1,333 @@
|
||||
import base64
|
||||
import re
|
||||
|
||||
import requests
|
||||
|
||||
from ..spec import AbstractFileSystem
|
||||
from ..utils import infer_storage_options
|
||||
from .memory import MemoryFile
|
||||
|
||||
|
||||
class GithubFileSystem(AbstractFileSystem):
|
||||
"""Interface to files in github
|
||||
|
||||
An instance of this class provides the files residing within a remote github
|
||||
repository. You may specify a point in the repo's history, by SHA, branch
|
||||
or tag (defaults to the repository's default branch).
|
||||
|
||||
For files less than 1 MB in size, file content is returned directly in a
|
||||
MemoryFile. For larger files, or for files tracked by git-lfs, file content
|
||||
is returned as an HTTPFile wrapping the ``download_url`` provided by the
|
||||
GitHub API.
|
||||
|
||||
When using fsspec.open, allows URIs of the form:
|
||||
|
||||
- "github://path/file", in which case you must specify org, repo and
|
||||
may specify sha in the extra args
|
||||
- 'github://org:repo@/precip/catalog.yml', where the org and repo are
|
||||
part of the URI
|
||||
- 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
|
||||
|
||||
``sha`` can be the full or abbreviated hex of the commit you want to fetch
|
||||
from, or a branch or tag name (so long as it doesn't contain special characters
|
||||
like "/", "?", which would have to be HTTP-encoded).
|
||||
|
||||
For authorised access, you must provide username and token, which can be made
|
||||
at https://github.com/settings/tokens
|
||||
"""
|
||||
|
||||
url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
|
||||
content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
|
||||
protocol = "github"
|
||||
timeout = (60, 60) # connect, read timeouts
|
||||
|
||||
def __init__(
|
||||
self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.org = org
|
||||
self.repo = repo
|
||||
if (username is None) ^ (token is None):
|
||||
raise ValueError("Auth required both username and token")
|
||||
self.username = username
|
||||
self.token = token
|
||||
if timeout is not None:
|
||||
self.timeout = timeout
|
||||
if sha is None:
|
||||
# look up default branch (not necessarily "master")
|
||||
u = "https://api.github.com/repos/{org}/{repo}"
|
||||
r = requests.get(
|
||||
u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
|
||||
)
|
||||
r.raise_for_status()
|
||||
sha = r.json()["default_branch"]
|
||||
|
||||
self.root = sha
|
||||
self.ls("")
|
||||
try:
|
||||
from .http import HTTPFileSystem
|
||||
|
||||
self.http_fs = HTTPFileSystem(**kwargs)
|
||||
except ImportError:
|
||||
self.http_fs = None
|
||||
|
||||
@property
|
||||
def kw(self):
|
||||
if self.username:
|
||||
return {"auth": (self.username, self.token)}
|
||||
return {}
|
||||
|
||||
@classmethod
|
||||
def repos(cls, org_or_user, is_org=True):
|
||||
"""List repo names for given org or user
|
||||
|
||||
This may become the top level of the FS
|
||||
|
||||
Parameters
|
||||
----------
|
||||
org_or_user: str
|
||||
Name of the github org or user to query
|
||||
is_org: bool (default True)
|
||||
Whether the name is an organisation (True) or user (False)
|
||||
|
||||
Returns
|
||||
-------
|
||||
List of string
|
||||
"""
|
||||
r = requests.get(
|
||||
f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
|
||||
timeout=cls.timeout,
|
||||
)
|
||||
r.raise_for_status()
|
||||
return [repo["name"] for repo in r.json()]
|
||||
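# --- illustrative sketch, not part of the vendored module ---
# Listing repositories for an organisation with the classmethod above; this
# performs a live, unauthenticated API call (rate limits apply), and "fsspec"
# is just an example organisation name.
from fsspec.implementations.github import GithubFileSystem

print(GithubFileSystem.repos("fsspec", is_org=True)[:5])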
|
||||
@property
|
||||
def tags(self):
|
||||
"""Names of tags in the repo"""
|
||||
r = requests.get(
|
||||
f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
|
||||
timeout=self.timeout,
|
||||
**self.kw,
|
||||
)
|
||||
r.raise_for_status()
|
||||
return [t["name"] for t in r.json()]
|
||||
|
||||
@property
|
||||
def branches(self):
|
||||
"""Names of branches in the repo"""
|
||||
r = requests.get(
|
||||
f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
|
||||
timeout=self.timeout,
|
||||
**self.kw,
|
||||
)
|
||||
r.raise_for_status()
|
||||
return [t["name"] for t in r.json()]
|
||||
|
||||
@property
|
||||
def refs(self):
|
||||
"""Named references, tags and branches"""
|
||||
return {"tags": self.tags, "branches": self.branches}
|
||||
|
||||
def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
|
||||
"""List files at given path
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Location to list, relative to repo root
|
||||
detail: bool
|
||||
If True, returns list of dicts, one per file; if False, returns
|
||||
list of full filenames only
|
||||
sha: str (optional)
|
||||
List at the given point in the repo history, branch or tag name or commit
|
||||
SHA
|
||||
_sha: str (optional)
|
||||
List this specific tree object (used internally to descend into trees)
|
||||
"""
|
||||
path = self._strip_protocol(path)
|
||||
if path == "":
|
||||
_sha = sha or self.root
|
||||
if _sha is None:
|
||||
parts = path.rstrip("/").split("/")
|
||||
so_far = ""
|
||||
_sha = sha or self.root
|
||||
for part in parts:
|
||||
out = self.ls(so_far, True, sha=sha, _sha=_sha)
|
||||
so_far += "/" + part if so_far else part
|
||||
out = [o for o in out if o["name"] == so_far]
|
||||
if not out:
|
||||
raise FileNotFoundError(path)
|
||||
out = out[0]
|
||||
if out["type"] == "file":
|
||||
if detail:
|
||||
return [out]
|
||||
else:
|
||||
return path
|
||||
_sha = out["sha"]
|
||||
if path not in self.dircache or sha not in [self.root, None]:
|
||||
r = requests.get(
|
||||
self.url.format(org=self.org, repo=self.repo, sha=_sha),
|
||||
timeout=self.timeout,
|
||||
**self.kw,
|
||||
)
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
types = {"blob": "file", "tree": "directory"}
|
||||
out = [
|
||||
{
|
||||
"name": path + "/" + f["path"] if path else f["path"],
|
||||
"mode": f["mode"],
|
||||
"type": types[f["type"]],
|
||||
"size": f.get("size", 0),
|
||||
"sha": f["sha"],
|
||||
}
|
||||
for f in r.json()["tree"]
|
||||
if f["type"] in types
|
||||
]
|
||||
if sha in [self.root, None]:
|
||||
self.dircache[path] = out
|
||||
else:
|
||||
out = self.dircache[path]
|
||||
if detail:
|
||||
return out
|
||||
else:
|
||||
return sorted([f["name"] for f in out])
|
||||
|
||||
def invalidate_cache(self, path=None):
|
||||
self.dircache.clear()
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
opts = infer_storage_options(path)
|
||||
if "username" not in opts:
|
||||
return super()._strip_protocol(path)
|
||||
return opts["path"].lstrip("/")
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
opts = infer_storage_options(path)
|
||||
if "username" not in opts:
|
||||
return {}
|
||||
out = {"org": opts["username"], "repo": opts["password"]}
|
||||
if opts["host"]:
|
||||
out["sha"] = opts["host"]
|
||||
return out
|
||||
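# --- illustrative sketch, not part of the vendored module ---
# The "github://org:repo@sha/path" form from the class docstring is parsed by
# the static method above roughly like this; org, repo, ref and path are
# placeholders.
from fsspec.implementations.github import GithubFileSystem

print(GithubFileSystem._get_kwargs_from_urls("github://myorg:myrepo@main/data/catalog.yml"))
# Expected: {"org": "myorg", "repo": "myrepo", "sha": "main"}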
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
cache_options=None,
|
||||
sha=None,
|
||||
**kwargs,
|
||||
):
|
||||
if mode != "rb":
|
||||
raise NotImplementedError
|
||||
|
||||
# construct a url to hit the GitHub API's repo contents API
|
||||
url = self.content_url.format(
|
||||
org=self.org, repo=self.repo, path=path, sha=sha or self.root
|
||||
)
|
||||
|
||||
# make a request to this API, and parse the response as JSON
|
||||
r = requests.get(url, timeout=self.timeout, **self.kw)
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
content_json = r.json()
|
||||
|
||||
# if the response's content key is not empty, try to parse it as base64
|
||||
if content_json["content"]:
|
||||
content = base64.b64decode(content_json["content"])
|
||||
|
||||
# as long as the content does not start with the string
|
||||
# "version https://git-lfs.github.com/"
|
||||
# then it is probably not a git-lfs pointer and we can just return
|
||||
# the content directly
|
||||
if not content.startswith(b"version https://git-lfs.github.com/"):
|
||||
return MemoryFile(None, None, content)
|
||||
|
||||
# we land here if the content was not present in the first response
|
||||
# (regular file over 1MB or git-lfs tracked file)
|
||||
# in this case, we let the HTTPFileSystem handle the download
|
||||
if self.http_fs is None:
|
||||
raise ImportError(
|
||||
"Please install fsspec[http] to access github files >1 MB "
|
||||
"or git-lfs tracked files."
|
||||
)
|
||||
return self.http_fs.open(
|
||||
content_json["download_url"],
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def rm(self, path, recursive=False, maxdepth=None, message=None):
|
||||
path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
|
||||
for p in reversed(path):
|
||||
self.rm_file(p, message=message)
|
||||
|
||||
def rm_file(self, path, message=None, **kwargs):
|
||||
"""
|
||||
Remove a file from a specified branch using a given commit message.
|
||||
|
||||
Since the GitHub DELETE operation requires a branch name, and we can't reliably
|
||||
determine whether the provided SHA refers to a branch, tag, or commit, we
|
||||
assume it's a branch. If it's not, the user will encounter an error when
|
||||
attempting to retrieve the file SHA or delete the file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
The file's location relative to the repository root.
|
||||
message: str, optional
|
||||
The commit message for the deletion.
|
||||
"""
|
||||
|
||||
if not self.username:
|
||||
raise ValueError("Authentication required")
|
||||
|
||||
path = self._strip_protocol(path)
|
||||
|
||||
# Attempt to get SHA from cache or Github API
|
||||
sha = self._get_sha_from_cache(path)
|
||||
if not sha:
|
||||
url = self.content_url.format(
|
||||
org=self.org, repo=self.repo, path=path.lstrip("/"), sha=self.root
|
||||
)
|
||||
r = requests.get(url, timeout=self.timeout, **self.kw)
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
sha = r.json()["sha"]
|
||||
|
||||
# Delete the file
|
||||
delete_url = self.content_url.format(
|
||||
org=self.org, repo=self.repo, path=path, sha=self.root
|
||||
)
|
||||
branch = self.root
|
||||
data = {
|
||||
"message": message or f"Delete {path}",
|
||||
"sha": sha,
|
||||
**({"branch": branch} if branch else {}),
|
||||
}
|
||||
|
||||
r = requests.delete(delete_url, json=data, timeout=self.timeout, **self.kw)
|
||||
error_message = r.json().get("message", "")
|
||||
if re.search(r"Branch .+ not found", error_message):
|
||||
error = "Remove only works when the filesystem is initialised from a branch or default (None)"
|
||||
raise ValueError(error)
|
||||
r.raise_for_status()
|
||||
|
||||
self.invalidate_cache(path)
|
||||
|
||||
def _get_sha_from_cache(self, path):
|
||||
for entries in self.dircache.values():
|
||||
for entry in entries:
|
||||
entry_path = entry.get("name")
|
||||
if entry_path and entry_path == path and "sha" in entry:
|
||||
return entry["sha"]
|
||||
return None
|
||||
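# --- illustrative usage sketch, not part of the vendored module ---
# Opening a file from a repository via the chained-URL form; org, repo and
# path are placeholders. Files over 1 MB or tracked by git-lfs additionally
# need the aiohttp-based HTTP implementation installed (fsspec[http]).
import fsspec

with fsspec.open("github://myorg:myrepo@/README.md", "rb") as f:
    print(f.read()[:100])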
@@ -0,0 +1,891 @@
|
||||
import asyncio
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import weakref
|
||||
from copy import copy
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
import yarl
|
||||
|
||||
from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
|
||||
from fsspec.callbacks import DEFAULT_CALLBACK
|
||||
from fsspec.exceptions import FSTimeoutError
|
||||
from fsspec.spec import AbstractBufferedFile
|
||||
from fsspec.utils import (
|
||||
DEFAULT_BLOCK_SIZE,
|
||||
glob_translate,
|
||||
isfilelike,
|
||||
nullcontext,
|
||||
tokenize,
|
||||
)
|
||||
|
||||
from ..caching import AllBytes
|
||||
|
||||
# https://stackoverflow.com/a/15926317/3821154
|
||||
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
|
||||
ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
|
||||
logger = logging.getLogger("fsspec.http")
|
||||
|
||||
|
||||
async def get_client(**kwargs):
|
||||
return aiohttp.ClientSession(**kwargs)
|
||||
|
||||
|
||||
class HTTPFileSystem(AsyncFileSystem):
|
||||
"""
|
||||
Simple File-System for fetching data via HTTP(S)
|
||||
|
||||
``ls()`` is implemented by loading the parent page and doing a regex
|
||||
match on the result. If simple_link=True, anything of the form
|
||||
"http(s)://server.com/stuff?thing=other"; otherwise only links within
|
||||
HTML href tags will be used.
|
||||
"""
|
||||
|
||||
protocol = ("http", "https")
|
||||
sep = "/"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
simple_links=True,
|
||||
block_size=None,
|
||||
same_scheme=True,
|
||||
size_policy=None,
|
||||
cache_type="bytes",
|
||||
cache_options=None,
|
||||
asynchronous=False,
|
||||
loop=None,
|
||||
client_kwargs=None,
|
||||
get_client=get_client,
|
||||
encoded=False,
|
||||
**storage_options,
|
||||
):
|
||||
"""
|
||||
NB: if this is called async, you must await set_session
|
||||
|
||||
Parameters
|
||||
----------
|
||||
block_size: int
|
||||
Blocks to read bytes; if 0, will default to raw requests file-like
|
||||
objects instead of HTTPFile instances
|
||||
simple_links: bool
|
||||
If True, will consider both HTML <a> tags and anything that looks
|
||||
like a URL; if False, will consider only the former.
|
||||
same_scheme: True
|
||||
When doing ls/glob, if this is True, only consider paths that have
|
||||
http/https matching the input URLs.
|
||||
size_policy: this argument is deprecated
|
||||
client_kwargs: dict
|
||||
Passed to aiohttp.ClientSession, see
|
||||
https://docs.aiohttp.org/en/stable/client_reference.html
|
||||
For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
|
||||
get_client: Callable[..., aiohttp.ClientSession]
|
||||
A callable, which takes keyword arguments and constructs
|
||||
an aiohttp.ClientSession. Its state will be managed by
|
||||
the HTTPFileSystem class.
|
||||
storage_options: key-value
|
||||
Any other parameters passed on to requests
|
||||
cache_type, cache_options: defaults used in open()
|
||||
"""
|
||||
super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
|
||||
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
|
||||
self.simple_links = simple_links
|
||||
self.same_schema = same_scheme
|
||||
self.cache_type = cache_type
|
||||
self.cache_options = cache_options
|
||||
self.client_kwargs = client_kwargs or {}
|
||||
self.get_client = get_client
|
||||
self.encoded = encoded
|
||||
self.kwargs = storage_options
|
||||
self._session = None
|
||||
|
||||
# Clean caching-related parameters from `storage_options`
|
||||
# before propagating them as `request_options` through `self.kwargs`.
|
||||
# TODO: Maybe rename `self.kwargs` to `self.request_options` to make
|
||||
# it clearer.
|
||||
request_options = copy(storage_options)
|
||||
self.use_listings_cache = request_options.pop("use_listings_cache", False)
|
||||
request_options.pop("listings_expiry_time", None)
|
||||
request_options.pop("max_paths", None)
|
||||
request_options.pop("skip_instance_cache", None)
|
||||
self.kwargs = request_options
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return "http"
|
||||
|
||||
def encode_url(self, url):
|
||||
return yarl.URL(url, encoded=self.encoded)
|
||||
|
||||
@staticmethod
|
||||
def close_session(loop, session):
|
||||
if loop is not None and loop.is_running():
|
||||
try:
|
||||
sync(loop, session.close, timeout=0.1)
|
||||
return
|
||||
except (TimeoutError, FSTimeoutError, NotImplementedError):
|
||||
pass
|
||||
connector = getattr(session, "_connector", None)
|
||||
if connector is not None:
|
||||
# close after loop is dead
|
||||
connector._close()
|
||||
|
||||
async def set_session(self):
|
||||
if self._session is None:
|
||||
self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
|
||||
if not self.asynchronous:
|
||||
weakref.finalize(self, self.close_session, self.loop, self._session)
|
||||
return self._session
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
"""For HTTP, we always want to keep the full URL"""
|
||||
return path
|
||||
|
||||
@classmethod
|
||||
def _parent(cls, path):
|
||||
# override, since _strip_protocol is different for URLs
|
||||
par = super()._parent(path)
|
||||
if len(par) > 7: # "http://..."
|
||||
return par
|
||||
return ""
|
||||
|
||||
async def _ls_real(self, url, detail=True, **kwargs):
|
||||
# ignoring URL-encoded arguments
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(url)
|
||||
session = await self.set_session()
|
||||
async with session.get(self.encode_url(url), **self.kwargs) as r:
|
||||
self._raise_not_found_for_status(r, url)
|
||||
|
||||
if "Content-Type" in r.headers:
|
||||
mimetype = r.headers["Content-Type"].partition(";")[0]
|
||||
else:
|
||||
mimetype = None
|
||||
|
||||
if mimetype in ("text/html", None):
|
||||
try:
|
||||
text = await r.text(errors="ignore")
|
||||
if self.simple_links:
|
||||
links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
|
||||
else:
|
||||
links = [u[2] for u in ex.findall(text)]
|
||||
except UnicodeDecodeError:
|
||||
links = [] # binary, not HTML
|
||||
else:
|
||||
links = []
|
||||
|
||||
out = set()
|
||||
parts = urlparse(url)
|
||||
for l in links:
|
||||
if isinstance(l, tuple):
|
||||
l = l[1]
|
||||
if l.startswith("/") and len(l) > 1:
|
||||
# absolute URL on this server
|
||||
l = f"{parts.scheme}://{parts.netloc}{l}"
|
||||
if l.startswith("http"):
|
||||
if self.same_schema and l.startswith(url.rstrip("/") + "/"):
|
||||
out.add(l)
|
||||
elif l.replace("https", "http").startswith(
|
||||
url.replace("https", "http").rstrip("/") + "/"
|
||||
):
|
||||
# allowed to cross http <-> https
|
||||
out.add(l)
|
||||
else:
|
||||
if l not in ["..", "../"]:
|
||||
# Ignore FTP-like "parent"
|
||||
out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
|
||||
if not out and url.endswith("/"):
|
||||
out = await self._ls_real(url.rstrip("/"), detail=False)
|
||||
if detail:
|
||||
return [
|
||||
{
|
||||
"name": u,
|
||||
"size": None,
|
||||
"type": "directory" if u.endswith("/") else "file",
|
||||
}
|
||||
for u in out
|
||||
]
|
||||
else:
|
||||
return sorted(out)
|
||||
|
||||
async def _ls(self, url, detail=True, **kwargs):
|
||||
if self.use_listings_cache and url in self.dircache:
|
||||
out = self.dircache[url]
|
||||
else:
|
||||
out = await self._ls_real(url, detail=detail, **kwargs)
|
||||
self.dircache[url] = out
|
||||
return out
|
||||
|
||||
ls = sync_wrapper(_ls)
|
||||
|
||||
def _raise_not_found_for_status(self, response, url):
|
||||
"""
|
||||
Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
|
||||
"""
|
||||
if response.status == 404:
|
||||
raise FileNotFoundError(url)
|
||||
response.raise_for_status()
|
||||
|
||||
async def _cat_file(self, url, start=None, end=None, **kwargs):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(url)
|
||||
|
||||
if start is not None or end is not None:
|
||||
if start == end:
|
||||
return b""
|
||||
headers = kw.pop("headers", {}).copy()
|
||||
|
||||
headers["Range"] = await self._process_limits(url, start, end)
|
||||
kw["headers"] = headers
|
||||
session = await self.set_session()
|
||||
async with session.get(self.encode_url(url), **kw) as r:
|
||||
out = await r.read()
|
||||
self._raise_not_found_for_status(r, url)
|
||||
return out
|
||||
|
||||
async def _get_file(
|
||||
self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs
|
||||
):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(rpath)
|
||||
session = await self.set_session()
|
||||
async with session.get(self.encode_url(rpath), **kw) as r:
|
||||
try:
|
||||
size = int(r.headers["content-length"])
|
||||
except (ValueError, KeyError):
|
||||
size = None
|
||||
|
||||
callback.set_size(size)
|
||||
self._raise_not_found_for_status(r, rpath)
|
||||
if isfilelike(lpath):
|
||||
outfile = lpath
|
||||
else:
|
||||
outfile = open(lpath, "wb") # noqa: ASYNC230
|
||||
|
||||
try:
|
||||
chunk = True
|
||||
while chunk:
|
||||
chunk = await r.content.read(chunk_size)
|
||||
outfile.write(chunk)
|
||||
callback.relative_update(len(chunk))
|
||||
finally:
|
||||
if not isfilelike(lpath):
|
||||
outfile.close()
|
||||
|
||||
async def _put_file(
|
||||
self,
|
||||
lpath,
|
||||
rpath,
|
||||
chunk_size=5 * 2**20,
|
||||
callback=DEFAULT_CALLBACK,
|
||||
method="post",
|
||||
mode="overwrite",
|
||||
**kwargs,
|
||||
):
|
||||
if mode != "overwrite":
|
||||
raise NotImplementedError("Exclusive write")
|
||||
|
||||
async def gen_chunks():
|
||||
# Support passing arbitrary file-like objects
|
||||
# and use them instead of streams.
|
||||
if isinstance(lpath, io.IOBase):
|
||||
context = nullcontext(lpath)
|
||||
use_seek = False # might not support seeking
|
||||
else:
|
||||
context = open(lpath, "rb") # noqa: ASYNC230
|
||||
use_seek = True
|
||||
|
||||
with context as f:
|
||||
if use_seek:
|
||||
callback.set_size(f.seek(0, 2))
|
||||
f.seek(0)
|
||||
else:
|
||||
callback.set_size(getattr(f, "size", None))
|
||||
|
||||
chunk = f.read(chunk_size)
|
||||
while chunk:
|
||||
yield chunk
|
||||
callback.relative_update(len(chunk))
|
||||
chunk = f.read(chunk_size)
|
||||
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
session = await self.set_session()
|
||||
|
||||
method = method.lower()
|
||||
if method not in ("post", "put"):
|
||||
raise ValueError(
|
||||
f"method has to be either 'post' or 'put', not: {method!r}"
|
||||
)
|
||||
|
||||
meth = getattr(session, method)
|
||||
async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
|
||||
self._raise_not_found_for_status(resp, rpath)
|
||||
|
||||
async def _exists(self, path, **kwargs):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
try:
|
||||
logger.debug(path)
|
||||
session = await self.set_session()
|
||||
r = await session.get(self.encode_url(path), **kw)
|
||||
async with r:
|
||||
return r.status < 400
|
||||
except aiohttp.ClientError:
|
||||
return False
|
||||
|
||||
async def _isfile(self, path, **kwargs):
|
||||
return await self._exists(path, **kwargs)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=None, # XXX: This differs from the base class.
|
||||
cache_type=None,
|
||||
cache_options=None,
|
||||
size=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Make a file-like object
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Full URL with protocol
|
||||
mode: string
|
||||
must be "rb"
|
||||
block_size: int or None
|
||||
Bytes to download in one request; use instance value if None. If
|
||||
zero, will return a streaming Requests file-like instance.
|
||||
kwargs: key-value
|
||||
Any other parameters, passed to requests calls
|
||||
"""
|
||||
if mode != "rb":
|
||||
raise NotImplementedError
|
||||
block_size = block_size if block_size is not None else self.block_size
|
||||
kw = self.kwargs.copy()
|
||||
kw["asynchronous"] = self.asynchronous
|
||||
kw.update(kwargs)
|
||||
info = {}
|
||||
size = size or info.update(self.info(path, **kwargs)) or info["size"]
|
||||
session = sync(self.loop, self.set_session)
|
||||
if block_size and size and info.get("partial", True):
|
||||
return HTTPFile(
|
||||
self,
|
||||
path,
|
||||
session=session,
|
||||
block_size=block_size,
|
||||
mode=mode,
|
||||
size=size,
|
||||
cache_type=cache_type or self.cache_type,
|
||||
cache_options=cache_options or self.cache_options,
|
||||
loop=self.loop,
|
||||
**kw,
|
||||
)
|
||||
else:
|
||||
return HTTPStreamFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
loop=self.loop,
|
||||
session=session,
|
||||
**kw,
|
||||
)
|
||||
|
||||
async def open_async(self, path, mode="rb", size=None, **kwargs):
|
||||
session = await self.set_session()
|
||||
if size is None:
|
||||
try:
|
||||
size = (await self._info(path, **kwargs))["size"]
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
return AsyncStreamFile(
|
||||
self,
|
||||
path,
|
||||
loop=self.loop,
|
||||
session=session,
|
||||
size=size,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def ukey(self, url):
|
||||
"""Unique identifier; assume HTTP files are static, unchanging"""
|
||||
return tokenize(url, self.kwargs, self.protocol)
|
||||
|
||||
async def _info(self, url, **kwargs):
|
||||
"""Get info of URL
|
||||
|
||||
Tries to access location via HEAD, and then GET methods, but does
|
||||
not fetch the data.
|
||||
|
||||
It is possible that the server does not supply any size information, in
|
||||
which case size will be given as None (and certain operations on the
|
||||
corresponding file will not work).
|
||||
"""
|
||||
info = {}
|
||||
session = await self.set_session()
|
||||
|
||||
for policy in ["head", "get"]:
|
||||
try:
|
||||
info.update(
|
||||
await _file_info(
|
||||
self.encode_url(url),
|
||||
size_policy=policy,
|
||||
session=session,
|
||||
**self.kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
)
|
||||
if info.get("size") is not None:
|
||||
break
|
||||
except Exception as exc:
|
||||
if policy == "get":
|
||||
# If get failed, then raise a FileNotFoundError
|
||||
raise FileNotFoundError(url) from exc
|
||||
logger.debug("", exc_info=exc)
|
||||
|
||||
return {"name": url, "size": None, **info, "type": "file"}
|
||||
|
||||
async def _glob(self, path, maxdepth=None, **kwargs):
|
||||
"""
|
||||
Find files by glob-matching.
|
||||
|
||||
This implementation is identical to the one in AbstractFileSystem,
|
||||
but "?" is not considered as a character for globbing, because it is
|
||||
so common in URLs, often identifying the "query" part.
|
||||
"""
|
||||
if maxdepth is not None and maxdepth < 1:
|
||||
raise ValueError("maxdepth must be at least 1")
|
||||
import re
|
||||
|
||||
ends_with_slash = path.endswith("/") # _strip_protocol strips trailing slash
|
||||
path = self._strip_protocol(path)
|
||||
append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*"))
|
||||
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
|
||||
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
|
||||
|
||||
min_idx = min(idx_star, idx_brace)
|
||||
|
||||
detail = kwargs.pop("detail", False)
|
||||
|
||||
if not has_magic(path):
|
||||
if await self._exists(path, **kwargs):
|
||||
if not detail:
|
||||
return [path]
|
||||
else:
|
||||
return {path: await self._info(path, **kwargs)}
|
||||
else:
|
||||
if not detail:
|
||||
return [] # glob of non-existent returns empty
|
||||
else:
|
||||
return {}
|
||||
elif "/" in path[:min_idx]:
|
||||
min_idx = path[:min_idx].rindex("/")
|
||||
root = path[: min_idx + 1]
|
||||
depth = path[min_idx + 1 :].count("/") + 1
|
||||
else:
|
||||
root = ""
|
||||
depth = path[min_idx + 1 :].count("/") + 1
|
||||
|
||||
if "**" in path:
|
||||
if maxdepth is not None:
|
||||
idx_double_stars = path.find("**")
|
||||
depth_double_stars = path[idx_double_stars:].count("/") + 1
|
||||
depth = depth - depth_double_stars + maxdepth
|
||||
else:
|
||||
depth = None
|
||||
|
||||
allpaths = await self._find(
|
||||
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
|
||||
)
|
||||
|
||||
pattern = glob_translate(path + ("/" if ends_with_slash else ""))
|
||||
pattern = re.compile(pattern)
|
||||
|
||||
out = {
|
||||
(
|
||||
p.rstrip("/")
|
||||
if not append_slash_to_dirname
|
||||
and info["type"] == "directory"
|
||||
and p.endswith("/")
|
||||
else p
|
||||
): info
|
||||
for p, info in sorted(allpaths.items())
|
||||
if pattern.match(p.rstrip("/"))
|
||||
}
|
||||
|
||||
if detail:
|
||||
return out
|
||||
else:
|
||||
return list(out)
|
||||
|
||||
async def _isdir(self, path):
|
||||
# override, since all URLs are (also) files
|
||||
try:
|
||||
return bool(await self._ls(path))
|
||||
except (FileNotFoundError, ValueError):
|
||||
return False
|
||||
|
||||
async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
|
||||
"""
|
||||
Write bytes to a remote file over HTTP.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Target URL where the data should be written
|
||||
value : bytes
|
||||
Data to be written
|
||||
mode : str
|
||||
How to write to the file - 'overwrite' or 'append'
|
||||
**kwargs : dict
|
||||
Additional parameters to pass to the HTTP request
|
||||
"""
|
||||
url = self._strip_protocol(path)
|
||||
headers = kwargs.pop("headers", {})
|
||||
headers["Content-Length"] = str(len(value))
|
||||
|
||||
session = await self.set_session()
|
||||
|
||||
async with session.put(url, data=value, headers=headers, **kwargs) as r:
|
||||
r.raise_for_status()
|
||||
|
||||
|
||||
class HTTPFile(AbstractBufferedFile):
|
||||
"""
|
||||
A file-like object pointing to a remote HTTP(S) resource
|
||||
|
||||
Supports only reading, with read-ahead of a predetermined block-size.
|
||||
|
||||
In the case that the server does not supply the filesize, only reading of
|
||||
the complete file in one go is supported.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
Full URL of the remote resource, including the protocol
|
||||
session: aiohttp.ClientSession or None
|
||||
All calls will be made within this session, to avoid restarting
|
||||
connections where the server allows this
|
||||
block_size: int or None
|
||||
The amount of read-ahead to do, in bytes. Default is 5MB, or the value
|
||||
configured for the FileSystem creating this file
|
||||
size: None or int
|
||||
If given, this is the size of the file in bytes, and we don't attempt
|
||||
to call the server to find the value.
|
||||
kwargs: all other key-values are passed to requests calls.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
url,
|
||||
session=None,
|
||||
block_size=None,
|
||||
mode="rb",
|
||||
cache_type="bytes",
|
||||
cache_options=None,
|
||||
size=None,
|
||||
loop=None,
|
||||
asynchronous=False,
|
||||
**kwargs,
|
||||
):
|
||||
if mode != "rb":
|
||||
raise NotImplementedError("File mode not supported")
|
||||
self.asynchronous = asynchronous
|
||||
self.loop = loop
|
||||
self.url = url
|
||||
self.session = session
|
||||
self.details = {"name": url, "size": size, "type": "file"}
|
||||
super().__init__(
|
||||
fs=fs,
|
||||
path=url,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
cache_type=cache_type,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def read(self, length=-1):
|
||||
"""Read bytes from file
|
||||
|
||||
Parameters
|
||||
----------
|
||||
length: int
|
||||
Read up to this many bytes. If negative, read all content to end of
|
||||
file. If the server has not supplied the filesize, attempting to
|
||||
read only part of the data will raise a ValueError.
|
||||
"""
|
||||
if (
|
||||
(length < 0 and self.loc == 0) # explicit read all
|
||||
# but not when the size is known and fits into a block anyways
|
||||
and not (self.size is not None and self.size <= self.blocksize)
|
||||
):
|
||||
self._fetch_all()
|
||||
if self.size is None:
|
||||
if length < 0:
|
||||
self._fetch_all()
|
||||
else:
|
||||
length = min(self.size - self.loc, length)
|
||||
return super().read(length)
|
||||
|
||||
async def async_fetch_all(self):
|
||||
"""Read whole file in one shot, without caching
|
||||
|
||||
This is only called when position is still at zero,
|
||||
and read() is called without a byte-count.
|
||||
"""
|
||||
logger.debug(f"Fetch all for {self}")
|
||||
if not isinstance(self.cache, AllBytes):
|
||||
r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs)
|
||||
async with r:
|
||||
r.raise_for_status()
|
||||
out = await r.read()
|
||||
self.cache = AllBytes(
|
||||
size=len(out), fetcher=None, blocksize=None, data=out
|
||||
)
|
||||
self.size = len(out)
|
||||
|
||||
_fetch_all = sync_wrapper(async_fetch_all)
|
||||
|
||||
def _parse_content_range(self, headers):
|
||||
"""Parse the Content-Range header"""
|
||||
s = headers.get("Content-Range", "")
|
||||
m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
|
||||
if not m:
|
||||
return None, None, None
|
||||
|
||||
if m[1] == "*":
|
||||
start = end = None
|
||||
else:
|
||||
start, end = [int(x) for x in m[1].split("-")]
|
||||
total = None if m[2] == "*" else int(m[2])
|
||||
return start, end, total
|
||||
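# --- illustrative sketch, not part of the vendored module ---
# Standalone restatement of _parse_content_range above; the header values
# used here are placeholders.
import re

def parse_content_range(header_value):
    m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", header_value or "")
    if not m:
        return None, None, None
    if m[1] == "*":
        start = end = None
    else:
        start, end = (int(x) for x in m[1].split("-"))
    total = None if m[2] == "*" else int(m[2])
    return start, end, total

assert parse_content_range("bytes 0-1023/4096") == (0, 1023, 4096)
assert parse_content_range("bytes */4096") == (None, None, 4096)
assert parse_content_range("") == (None, None, None)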
|
||||
async def async_fetch_range(self, start, end):
|
||||
"""Download a block of data
|
||||
|
||||
The expectation is that the server returns only the requested bytes,
|
||||
with HTTP code 206. If this is not the case, we first check the headers,
|
||||
and then stream the output - if the data size is bigger than we
|
||||
requested, an exception is raised.
|
||||
"""
|
||||
logger.debug(f"Fetch range for {self}: {start}-{end}")
|
||||
kwargs = self.kwargs.copy()
|
||||
headers = kwargs.pop("headers", {}).copy()
|
||||
headers["Range"] = f"bytes={start}-{end - 1}"
|
||||
logger.debug(f"{self.url} : {headers['Range']}")
|
||||
r = await self.session.get(
|
||||
self.fs.encode_url(self.url), headers=headers, **kwargs
|
||||
)
|
||||
async with r:
|
||||
if r.status == 416:
|
||||
# range request outside file
|
||||
return b""
|
||||
r.raise_for_status()
|
||||
|
||||
# If the server has handled the range request, it should reply
|
||||
# with status 206 (partial content). But we'll guess that a suitable
|
||||
# Content-Range header or a Content-Length no more than the
|
||||
requested range also means we have got the desired range.
|
||||
response_is_range = (
|
||||
r.status == 206
|
||||
or self._parse_content_range(r.headers)[0] == start
|
||||
or int(r.headers.get("Content-Length", end + 1)) <= end - start
|
||||
)
|
||||
|
||||
if response_is_range:
|
||||
# partial content, as expected
|
||||
out = await r.read()
|
||||
elif start > 0:
|
||||
raise ValueError(
|
||||
"The HTTP server doesn't appear to support range requests. "
|
||||
"Only reading this file from the beginning is supported. "
|
||||
"Open with block_size=0 for a streaming file interface."
|
||||
)
|
||||
else:
|
||||
# Response is not a range, but we want the start of the file,
|
||||
# so we can read the required amount anyway.
|
||||
cl = 0
|
||||
out = []
|
||||
while True:
|
||||
chunk = await r.content.read(2**20)
|
||||
# data size unknown, let's read until we have enough
|
||||
if chunk:
|
||||
out.append(chunk)
|
||||
cl += len(chunk)
|
||||
if cl > end - start:
|
||||
break
|
||||
else:
|
||||
break
|
||||
out = b"".join(out)[: end - start]
|
||||
return out
|
||||
|
||||
_fetch_range = sync_wrapper(async_fetch_range)
|
||||
|
||||
|
||||
magic_check = re.compile("([*[])")
|
||||
|
||||
|
||||
def has_magic(s):
|
||||
match = magic_check.search(s)
|
||||
return match is not None
|
||||
|
||||
|
||||
class HTTPStreamFile(AbstractBufferedFile):
|
||||
def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
|
||||
self.asynchronous = kwargs.pop("asynchronous", False)
|
||||
self.url = url
|
||||
self.loop = loop
|
||||
self.session = session
|
||||
if mode != "rb":
|
||||
raise ValueError
|
||||
self.details = {"name": url, "size": None}
|
||||
super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)
|
||||
|
||||
async def cor():
|
||||
r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__()
|
||||
self.fs._raise_not_found_for_status(r, url)
|
||||
return r
|
||||
|
||||
self.r = sync(self.loop, cor)
|
||||
self.loop = fs.loop
|
||||
|
||||
def seek(self, loc, whence=0):
|
||||
if loc == 0 and whence == 1:
|
||||
return
|
||||
if loc == self.loc and whence == 0:
|
||||
return
|
||||
raise ValueError("Cannot seek streaming HTTP file")
|
||||
|
||||
async def _read(self, num=-1):
|
||||
out = await self.r.content.read(num)
|
||||
self.loc += len(out)
|
||||
return out
|
||||
|
||||
read = sync_wrapper(_read)
|
||||
|
||||
async def _close(self):
|
||||
self.r.close()
|
||||
|
||||
def close(self):
|
||||
asyncio.run_coroutine_threadsafe(self._close(), self.loop)
|
||||
super().close()
|
||||
|
||||
|
||||
class AsyncStreamFile(AbstractAsyncStreamedFile):
|
||||
def __init__(
|
||||
self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
|
||||
):
|
||||
self.url = url
|
||||
self.session = session
|
||||
self.r = None
|
||||
if mode != "rb":
|
||||
raise ValueError
|
||||
self.details = {"name": url, "size": None}
|
||||
self.kwargs = kwargs
|
||||
super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
|
||||
self.size = size
|
||||
|
||||
async def read(self, num=-1):
|
||||
if self.r is None:
|
||||
r = await self.session.get(
|
||||
self.fs.encode_url(self.url), **self.kwargs
|
||||
).__aenter__()
|
||||
self.fs._raise_not_found_for_status(r, self.url)
|
||||
self.r = r
|
||||
out = await self.r.content.read(num)
|
||||
self.loc += len(out)
|
||||
return out
|
||||
|
||||
async def close(self):
|
||||
if self.r is not None:
|
||||
self.r.close()
|
||||
self.r = None
|
||||
await super().close()
|
||||
|
||||
|
||||
async def get_range(session, url, start, end, file=None, **kwargs):
|
||||
# explicitly get a range when we know it must be safe
|
||||
kwargs = kwargs.copy()
|
||||
headers = kwargs.pop("headers", {}).copy()
|
||||
headers["Range"] = f"bytes={start}-{end - 1}"
|
||||
r = await session.get(url, headers=headers, **kwargs)
|
||||
r.raise_for_status()
|
||||
async with r:
|
||||
out = await r.read()
|
||||
if file:
|
||||
with open(file, "r+b") as f: # noqa: ASYNC230
|
||||
f.seek(start)
|
||||
f.write(out)
|
||||
else:
|
||||
return out
|
||||
|
||||
|
||||
async def _file_info(url, session, size_policy="head", **kwargs):
|
||||
"""Call HEAD on the server to get details about the file (size/checksum etc.)
|
||||
|
||||
Default operation is to explicitly allow redirects and use encoding
|
||||
'identity' (no compression) to get the true size of the target.
|
||||
"""
|
||||
logger.debug("Retrieve file size for %s", url)
|
||||
kwargs = kwargs.copy()
|
||||
ar = kwargs.pop("allow_redirects", True)
|
||||
head = kwargs.get("headers", {}).copy()
|
||||
head["Accept-Encoding"] = "identity"
|
||||
kwargs["headers"] = head
|
||||
|
||||
info = {}
|
||||
if size_policy == "head":
|
||||
r = await session.head(url, allow_redirects=ar, **kwargs)
|
||||
elif size_policy == "get":
|
||||
r = await session.get(url, allow_redirects=ar, **kwargs)
|
||||
else:
|
||||
raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
|
||||
async with r:
|
||||
r.raise_for_status()
|
||||
|
||||
if "Content-Length" in r.headers:
|
||||
# Some servers may choose to ignore Accept-Encoding and return
|
||||
# compressed content, in which case the returned size is unreliable.
|
||||
if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
|
||||
"identity",
|
||||
"",
|
||||
]:
|
||||
info["size"] = int(r.headers["Content-Length"])
|
||||
elif "Content-Range" in r.headers:
|
||||
info["size"] = int(r.headers["Content-Range"].split("/")[1])
|
||||
|
||||
if "Content-Type" in r.headers:
|
||||
info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
|
||||
|
||||
if r.headers.get("Accept-Ranges") == "none":
|
||||
# Some servers may explicitly discourage partial content requests, but
|
||||
# the lack of "Accept-Ranges" does not always indicate they would fail
|
||||
info["partial"] = False
|
||||
|
||||
info["url"] = str(r.url)
|
||||
|
||||
for checksum_field in ["ETag", "Content-MD5", "Digest", "Last-Modified"]:
|
||||
if r.headers.get(checksum_field):
|
||||
info[checksum_field] = r.headers[checksum_field]
|
||||
|
||||
return info
|
||||
|
||||
|
||||
async def _file_size(url, session=None, *args, **kwargs):
|
||||
if session is None:
|
||||
session = await get_client()
|
||||
info = await _file_info(url, session=session, *args, **kwargs)
|
||||
return info.get("size")
|
||||
|
||||
|
||||
file_size = sync_wrapper(_file_size)
|
||||
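# --- illustrative usage sketch, not part of the vendored module ---
# Typical synchronous use of the filesystem defined above; the URL is a
# placeholder. block_size=0 selects the streaming (HTTPStreamFile) path.
import fsspec

fs = fsspec.filesystem("http")
info = fs.info("https://example.com/data.bin")  # HEAD, then GET fallback
first_kb = fs.cat_file("https://example.com/data.bin", start=0, end=1024)
with fs.open("https://example.com/data.bin", block_size=0) as f:
    chunk = f.read(4096)                        # sequential streaming read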
@@ -0,0 +1,931 @@
|
||||
"""This file is largely copied from http.py"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
from copy import copy
|
||||
from json import dumps, loads
|
||||
from urllib.parse import urlparse
|
||||
|
||||
try:
|
||||
import yarl
|
||||
except (ImportError, ModuleNotFoundError, OSError):
|
||||
yarl = False
|
||||
|
||||
from fsspec.callbacks import _DEFAULT_CALLBACK
|
||||
from fsspec.registry import register_implementation
|
||||
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
|
||||
from fsspec.utils import DEFAULT_BLOCK_SIZE, isfilelike, nullcontext, tokenize
|
||||
|
||||
from ..caching import AllBytes
|
||||
|
||||
# https://stackoverflow.com/a/15926317/3821154
|
||||
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
|
||||
ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
|
||||
logger = logging.getLogger("fsspec.http")
|
||||
|
||||
|
||||
class JsHttpException(urllib.error.HTTPError): ...
|
||||
|
||||
|
||||
class StreamIO(io.BytesIO):
|
||||
# fake class, so you can set attributes on it
|
||||
# will eventually actually stream
|
||||
...
|
||||
|
||||
|
||||
class ResponseProxy:
|
||||
"""Looks like a requests response"""
|
||||
|
||||
def __init__(self, req, stream=False):
|
||||
self.request = req
|
||||
self.stream = stream
|
||||
self._data = None
|
||||
self._headers = None
|
||||
|
||||
@property
|
||||
def raw(self):
|
||||
if self._data is None:
|
||||
b = self.request.response.to_bytes()
|
||||
if self.stream:
|
||||
self._data = StreamIO(b)
|
||||
else:
|
||||
self._data = b
|
||||
return self._data
|
||||
|
||||
def close(self):
|
||||
if hasattr(self, "_data"):
|
||||
del self._data
|
||||
|
||||
@property
|
||||
def headers(self):
|
||||
if self._headers is None:
|
||||
self._headers = dict(
|
||||
[
|
||||
_.split(": ")
|
||||
for _ in self.request.getAllResponseHeaders().strip().split("\r\n")
|
||||
]
|
||||
)
|
||||
return self._headers
|
||||
|
||||
@property
|
||||
def status_code(self):
|
||||
return int(self.request.status)
|
||||
|
||||
def raise_for_status(self):
|
||||
if not self.ok:
|
||||
raise JsHttpException(
|
||||
self.url, self.status_code, self.reason, self.headers, None
|
||||
)
|
||||
|
||||
def iter_content(self, chunksize, *_, **__):
|
||||
while True:
|
||||
out = self.raw.read(chunksize)
|
||||
if out:
|
||||
yield out
|
||||
else:
|
||||
break
|
||||
|
||||
@property
|
||||
def reason(self):
|
||||
return self.request.statusText
|
||||
|
||||
@property
|
||||
def ok(self):
|
||||
return self.status_code < 400
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
return self.request.response.responseURL
|
||||
|
||||
@property
|
||||
def text(self):
|
||||
# TODO: encoding from headers
|
||||
return self.content.decode()
|
||||
|
||||
@property
|
||||
def content(self):
|
||||
self.stream = False
|
||||
return self.raw
|
||||
|
||||
def json(self):
|
||||
return loads(self.text)
|
||||
|
||||
|
||||
class RequestsSessionShim:
|
||||
def __init__(self):
|
||||
self.headers = {}
|
||||
|
||||
def request(
|
||||
self,
|
||||
method,
|
||||
url,
|
||||
params=None,
|
||||
data=None,
|
||||
headers=None,
|
||||
cookies=None,
|
||||
files=None,
|
||||
auth=None,
|
||||
timeout=None,
|
||||
allow_redirects=None,
|
||||
proxies=None,
|
||||
hooks=None,
|
||||
stream=None,
|
||||
verify=None,
|
||||
cert=None,
|
||||
json=None,
|
||||
):
|
||||
from js import Blob, XMLHttpRequest
|
||||
|
||||
logger.debug("JS request: %s %s", method, url)
|
||||
|
||||
if cert or verify or proxies or files or cookies or hooks:
|
||||
raise NotImplementedError
|
||||
if data and json:
|
||||
raise ValueError("Use json= or data=, not both")
|
||||
req = XMLHttpRequest.new()
|
||||
extra = auth if auth else ()
|
||||
if params:
|
||||
url = f"{url}?{urllib.parse.urlencode(params)}"
|
||||
req.open(method, url, False, *extra)
|
||||
if timeout:
|
||||
req.timeout = timeout
|
||||
if headers:
|
||||
for k, v in headers.items():
|
||||
req.setRequestHeader(k, v)
|
||||
|
||||
req.setRequestHeader("Accept", "application/octet-stream")
|
||||
req.responseType = "arraybuffer"
|
||||
if json:
|
||||
blob = Blob.new([dumps(data)], {type: "application/json"})
|
||||
req.send(blob)
|
||||
elif data:
|
||||
if isinstance(data, io.IOBase):
|
||||
data = data.read()
|
||||
blob = Blob.new([data], {type: "application/octet-stream"})
|
||||
req.send(blob)
|
||||
else:
|
||||
req.send(None)
|
||||
return ResponseProxy(req, stream=stream)
|
||||
|
||||
def get(self, url, **kwargs):
|
||||
return self.request("GET", url, **kwargs)
|
||||
|
||||
def head(self, url, **kwargs):
|
||||
return self.request("HEAD", url, **kwargs)
|
||||
|
||||
def post(self, url, **kwargs):
|
||||
return self.request("POST}", url, **kwargs)
|
||||
|
||||
def put(self, url, **kwargs):
|
||||
return self.request("PUT", url, **kwargs)
|
||||
|
||||
def patch(self, url, **kwargs):
|
||||
return self.request("PATCH", url, **kwargs)
|
||||
|
||||
def delete(self, url, **kwargs):
|
||||
return self.request("DELETE", url, **kwargs)
|
||||
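# --- illustrative sketch, not part of the vendored module ---
# Outside pyodide the filesystem below falls back to a plain requests.Session,
# which exposes the same verb helpers the shim above mirrors; the URL is a
# placeholder.
import requests

session = requests.Session()
resp = session.get("https://example.com", timeout=10)
resp.raise_for_status()
print(resp.status_code, resp.headers.get("Content-Type"))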
|
||||
|
||||
class HTTPFileSystem(AbstractFileSystem):
|
||||
"""
|
||||
Simple File-System for fetching data via HTTP(S)
|
||||
|
||||
This is the BLOCKING version of the normal HTTPFileSystem. It uses
|
||||
requests in normal python and the JS runtime in pyodide.
|
||||
|
||||
***This implementation is extremely experimental, do not use unless
|
||||
you are testing pyodide/pyscript integration***
|
||||
"""
|
||||
|
||||
protocol = ("http", "https", "sync-http", "sync-https")
|
||||
sep = "/"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
simple_links=True,
|
||||
block_size=None,
|
||||
same_scheme=True,
|
||||
cache_type="readahead",
|
||||
cache_options=None,
|
||||
client_kwargs=None,
|
||||
encoded=False,
|
||||
**storage_options,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
block_size: int
|
||||
Blocks to read bytes; if 0, will default to raw requests file-like
|
||||
objects instead of HTTPFile instances
|
||||
simple_links: bool
|
||||
If True, will consider both HTML <a> tags and anything that looks
|
||||
like a URL; if False, will consider only the former.
|
||||
same_scheme: True
|
||||
When doing ls/glob, if this is True, only consider paths that have
|
||||
http/https matching the input URLs.
|
||||
size_policy: this argument is deprecated
|
||||
client_kwargs: dict
|
||||
Passed to aiohttp.ClientSession, see
|
||||
https://docs.aiohttp.org/en/stable/client_reference.html
|
||||
For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
|
||||
storage_options: key-value
|
||||
Any other parameters passed on to requests
|
||||
cache_type, cache_options: defaults used in open
|
||||
"""
|
||||
super().__init__(self, **storage_options)
|
||||
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
|
||||
self.simple_links = simple_links
|
||||
self.same_schema = same_scheme
|
||||
self.cache_type = cache_type
|
||||
self.cache_options = cache_options
|
||||
self.client_kwargs = client_kwargs or {}
|
||||
self.encoded = encoded
|
||||
self.kwargs = storage_options
|
||||
|
||||
try:
|
||||
import js # noqa: F401
|
||||
|
||||
logger.debug("Starting JS session")
|
||||
self.session = RequestsSessionShim()
|
||||
self.js = True
|
||||
except Exception as e:
|
||||
import requests
|
||||
|
||||
logger.debug("Starting cpython session because of: %s", e)
|
||||
self.session = requests.Session(**(client_kwargs or {}))
|
||||
self.js = False
|
||||
|
||||
request_options = copy(storage_options)
|
||||
self.use_listings_cache = request_options.pop("use_listings_cache", False)
|
||||
request_options.pop("listings_expiry_time", None)
|
||||
request_options.pop("max_paths", None)
|
||||
request_options.pop("skip_instance_cache", None)
|
||||
self.kwargs = request_options
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return "sync-http"
|
||||
|
||||
def encode_url(self, url):
|
||||
if yarl:
|
||||
return yarl.URL(url, encoded=self.encoded)
|
||||
return url
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path: str) -> str:
|
||||
"""For HTTP, we always want to keep the full URL"""
|
||||
path = path.replace("sync-http://", "http://").replace(
|
||||
"sync-https://", "https://"
|
||||
)
|
||||
return path
|
||||
|
||||
@classmethod
|
||||
def _parent(cls, path):
|
||||
# override, since _strip_protocol is different for URLs
|
||||
par = super()._parent(path)
|
||||
if len(par) > 7: # "http://..."
|
||||
return par
|
||||
return ""
|
||||
|
||||
def _ls_real(self, url, detail=True, **kwargs):
|
||||
# ignoring URL-encoded arguments
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(url)
|
||||
r = self.session.get(self.encode_url(url), **self.kwargs)
|
||||
self._raise_not_found_for_status(r, url)
|
||||
text = r.text
|
||||
if self.simple_links:
|
||||
links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
|
||||
else:
|
||||
links = [u[2] for u in ex.findall(text)]
|
||||
out = set()
|
||||
parts = urlparse(url)
|
||||
for l in links:
|
||||
if isinstance(l, tuple):
|
||||
l = l[1]
|
||||
if l.startswith("/") and len(l) > 1:
|
||||
# absolute URL on this server
|
||||
l = parts.scheme + "://" + parts.netloc + l
|
||||
if l.startswith("http"):
|
||||
if self.same_schema and l.startswith(url.rstrip("/") + "/"):
|
||||
out.add(l)
|
||||
elif l.replace("https", "http").startswith(
|
||||
url.replace("https", "http").rstrip("/") + "/"
|
||||
):
|
||||
# allowed to cross http <-> https
|
||||
out.add(l)
|
||||
else:
|
||||
if l not in ["..", "../"]:
|
||||
# Ignore FTP-like "parent"
|
||||
out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
|
||||
if not out and url.endswith("/"):
|
||||
out = self._ls_real(url.rstrip("/"), detail=False)
|
||||
if detail:
|
||||
return [
|
||||
{
|
||||
"name": u,
|
||||
"size": None,
|
||||
"type": "directory" if u.endswith("/") else "file",
|
||||
}
|
||||
for u in out
|
||||
]
|
||||
else:
|
||||
return sorted(out)
|
||||
|
||||
def ls(self, url, detail=True, **kwargs):
|
||||
if self.use_listings_cache and url in self.dircache:
|
||||
out = self.dircache[url]
|
||||
else:
|
||||
out = self._ls_real(url, detail=detail, **kwargs)
|
||||
self.dircache[url] = out
|
||||
return out
|
||||
|
||||
def _raise_not_found_for_status(self, response, url):
|
||||
"""
|
||||
Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
|
||||
"""
|
||||
if response.status_code == 404:
|
||||
raise FileNotFoundError(url)
|
||||
response.raise_for_status()
|
||||
|
||||
def cat_file(self, url, start=None, end=None, **kwargs):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(url)
|
||||
|
||||
if start is not None or end is not None:
|
||||
if start == end:
|
||||
return b""
|
||||
headers = kw.pop("headers", {}).copy()
|
||||
|
||||
headers["Range"] = self._process_limits(url, start, end)
|
||||
kw["headers"] = headers
|
||||
r = self.session.get(self.encode_url(url), **kw)
|
||||
self._raise_not_found_for_status(r, url)
|
||||
return r.content
|
||||
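# --- illustrative usage sketch, not part of the vendored module ---
# Direct use of this blocking implementation; the module path below is an
# assumption and the URL is a placeholder.
from fsspec.implementations.http_sync import HTTPFileSystem

fs = HTTPFileSystem()
data = fs.cat_file("https://example.com/data.bin", start=0, end=1024)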
|
||||
def get_file(
|
||||
self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
|
||||
):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(rpath)
|
||||
r = self.session.get(self.encode_url(rpath), **kw)
|
||||
try:
|
||||
size = int(
|
||||
r.headers.get("content-length", None)
|
||||
or r.headers.get("Content-Length", None)
|
||||
)
|
||||
except (ValueError, KeyError, TypeError):
|
||||
size = None
|
||||
|
||||
callback.set_size(size)
|
||||
self._raise_not_found_for_status(r, rpath)
|
||||
if not isfilelike(lpath):
|
||||
lpath = open(lpath, "wb")
|
||||
for chunk in r.iter_content(chunk_size, decode_unicode=False):
|
||||
lpath.write(chunk)
|
||||
callback.relative_update(len(chunk))
|
||||
|
||||
def put_file(
|
||||
self,
|
||||
lpath,
|
||||
rpath,
|
||||
chunk_size=5 * 2**20,
|
||||
callback=_DEFAULT_CALLBACK,
|
||||
method="post",
|
||||
**kwargs,
|
||||
):
|
||||
def gen_chunks():
|
||||
# Support passing arbitrary file-like objects
|
||||
# and use them instead of streams.
|
||||
if isinstance(lpath, io.IOBase):
|
||||
context = nullcontext(lpath)
|
||||
use_seek = False # might not support seeking
|
||||
else:
|
||||
context = open(lpath, "rb")
|
||||
use_seek = True
|
||||
|
||||
with context as f:
|
||||
if use_seek:
|
||||
callback.set_size(f.seek(0, 2))
|
||||
f.seek(0)
|
||||
else:
|
||||
callback.set_size(getattr(f, "size", None))
|
||||
|
||||
chunk = f.read(chunk_size)
|
||||
while chunk:
|
||||
yield chunk
|
||||
callback.relative_update(len(chunk))
|
||||
chunk = f.read(chunk_size)
|
||||
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
|
||||
method = method.lower()
|
||||
if method not in ("post", "put"):
|
||||
raise ValueError(
|
||||
f"method has to be either 'post' or 'put', not: {method!r}"
|
||||
)
|
||||
|
||||
meth = getattr(self.session, method)
|
||||
resp = meth(rpath, data=gen_chunks(), **kw)
|
||||
self._raise_not_found_for_status(resp, rpath)
|
||||
|
||||
def _process_limits(self, url, start, end):
|
||||
"""Helper for "Range"-based _cat_file"""
|
||||
size = None
|
||||
suff = False
|
||||
if start is not None and start < 0:
|
||||
# if start is negative and end None, end is the "suffix length"
|
||||
if end is None:
|
||||
end = -start
|
||||
start = ""
|
||||
suff = True
|
||||
else:
|
||||
size = size or self.info(url)["size"]
|
||||
start = size + start
|
||||
elif start is None:
|
||||
start = 0
|
||||
if not suff:
|
||||
if end is not None and end < 0:
|
||||
if start is not None:
|
||||
size = size or self.info(url)["size"]
|
||||
end = size + end
|
||||
elif end is None:
|
||||
end = ""
|
||||
if isinstance(end, int):
|
||||
end -= 1 # bytes range is inclusive
|
||||
return f"bytes={start}-{end}"
|
||||
|
||||
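# Illustrative examples (not from the vendored source) of the Range header
# that _process_limits builds from fsspec-style offsets:
#   start=10,   end=20    -> "bytes=10-19"  (fsspec's end is exclusive, HTTP's is inclusive)
#   start=0,    end=None  -> "bytes=0-"     (from the start to end of file)
#   start=None, end=None  -> "bytes=0-"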
def exists(self, path, **kwargs):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
try:
|
||||
logger.debug(path)
|
||||
r = self.session.get(self.encode_url(path), **kw)
|
||||
return r.status_code < 400
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def isfile(self, path, **kwargs):
|
||||
return self.exists(path, **kwargs)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=None, # XXX: This differs from the base class.
|
||||
cache_type=None,
|
||||
cache_options=None,
|
||||
size=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Make a file-like object
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Full URL with protocol
|
||||
mode: string
|
||||
must be "rb"
|
||||
block_size: int or None
|
||||
Bytes to download in one request; use instance value if None. If
|
||||
zero, will return a streaming Requests file-like instance.
|
||||
kwargs: key-value
|
||||
Any other parameters, passed to requests calls
|
||||
"""
|
||||
if mode != "rb":
|
||||
raise NotImplementedError
|
||||
block_size = block_size if block_size is not None else self.block_size
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
size = size or self.info(path, **kwargs)["size"]
|
||||
if block_size and size:
|
||||
return HTTPFile(
|
||||
self,
|
||||
path,
|
||||
session=self.session,
|
||||
block_size=block_size,
|
||||
mode=mode,
|
||||
size=size,
|
||||
cache_type=cache_type or self.cache_type,
|
||||
cache_options=cache_options or self.cache_options,
|
||||
**kw,
|
||||
)
|
||||
else:
|
||||
return HTTPStreamFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
session=self.session,
|
||||
**kw,
|
||||
)
|
||||
|
||||
def ukey(self, url):
|
||||
"""Unique identifier; assume HTTP files are static, unchanging"""
|
||||
return tokenize(url, self.kwargs, self.protocol)
|
||||
|
||||
def info(self, url, **kwargs):
|
||||
"""Get info of URL
|
||||
|
||||
Tries to access location via HEAD, and then GET methods, but does
|
||||
not fetch the data.
|
||||
|
||||
It is possible that the server does not supply any size information, in
|
||||
which case size will be given as None (and certain operations on the
|
||||
corresponding file will not work).
|
||||
"""
|
||||
info = {}
|
||||
for policy in ["head", "get"]:
|
||||
try:
|
||||
info.update(
|
||||
_file_info(
|
||||
self.encode_url(url),
|
||||
size_policy=policy,
|
||||
session=self.session,
|
||||
**self.kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
)
|
||||
if info.get("size") is not None:
|
||||
break
|
||||
except Exception as exc:
|
||||
if policy == "get":
|
||||
# If get failed, then raise a FileNotFoundError
|
||||
raise FileNotFoundError(url) from exc
|
||||
logger.debug(str(exc))
|
||||
|
||||
return {"name": url, "size": None, **info, "type": "file"}
|
||||
|
||||
def glob(self, path, maxdepth=None, **kwargs):
|
||||
"""
|
||||
Find files by glob-matching.
|
||||
|
||||
This implementation is identical to the one in AbstractFileSystem,
|
||||
but "?" is not considered as a character for globbing, because it is
|
||||
so common in URLs, often identifying the "query" part.
|
||||
"""
|
||||
import re
|
||||
|
||||
ends = path.endswith("/")
|
||||
path = self._strip_protocol(path)
|
||||
indstar = path.find("*") if path.find("*") >= 0 else len(path)
|
||||
indbrace = path.find("[") if path.find("[") >= 0 else len(path)
|
||||
|
||||
ind = min(indstar, indbrace)
|
||||
|
||||
detail = kwargs.pop("detail", False)
|
||||
|
||||
if not has_magic(path):
|
||||
root = path
|
||||
depth = 1
|
||||
if ends:
|
||||
path += "/*"
|
||||
elif self.exists(path):
|
||||
if not detail:
|
||||
return [path]
|
||||
else:
|
||||
return {path: self.info(path)}
|
||||
else:
|
||||
if not detail:
|
||||
return [] # glob of non-existent returns empty
|
||||
else:
|
||||
return {}
|
||||
elif "/" in path[:ind]:
|
||||
ind2 = path[:ind].rindex("/")
|
||||
root = path[: ind2 + 1]
|
||||
depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
|
||||
else:
|
||||
root = ""
|
||||
depth = None if "**" in path else path[ind + 1 :].count("/") + 1
|
||||
|
||||
allpaths = self.find(
|
||||
root, maxdepth=maxdepth or depth, withdirs=True, detail=True, **kwargs
|
||||
)
|
||||
# Escape characters special to python regex, leaving our supported
|
||||
# special characters in place.
|
||||
# See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
|
||||
# for shell globbing details.
|
||||
pattern = (
|
||||
"^"
|
||||
+ (
|
||||
path.replace("\\", r"\\")
|
||||
.replace(".", r"\.")
|
||||
.replace("+", r"\+")
|
||||
.replace("//", "/")
|
||||
.replace("(", r"\(")
|
||||
.replace(")", r"\)")
|
||||
.replace("|", r"\|")
|
||||
.replace("^", r"\^")
|
||||
.replace("$", r"\$")
|
||||
.replace("{", r"\{")
|
||||
.replace("}", r"\}")
|
||||
.rstrip("/")
|
||||
)
|
||||
+ "$"
|
||||
)
|
||||
pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
|
||||
pattern = re.sub("[*]", "[^/]*", pattern)
|
||||
pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
|
||||
out = {
|
||||
p: allpaths[p]
|
||||
for p in sorted(allpaths)
|
||||
if pattern.match(p.replace("//", "/").rstrip("/"))
|
||||
}
|
||||
if detail:
|
||||
return out
|
||||
else:
|
||||
return list(out)
|
||||
|
||||
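# Illustrative examples (not from the vendored source) of the glob translation above:
#   "http://host/data/*.csv"    -> "*" becomes "[^/]*", so only direct children match
#   "http://host/data/**/*.csv" -> "**" becomes ".*" and may cross "/" boundaries
#   "?" is never treated as a wildcard here, because it usually starts a URL query string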
def isdir(self, path):
|
||||
# override, since all URLs are (also) files
|
||||
try:
|
||||
return bool(self.ls(path))
|
||||
except (FileNotFoundError, ValueError):
|
||||
return False
|
||||
|
||||
|
||||
class HTTPFile(AbstractBufferedFile):
|
||||
"""
|
||||
A file-like object pointing to a remote HTTP(S) resource
|
||||
|
||||
Supports only reading, with read-ahead of a predetermined block-size.
|
||||
|
||||
In the case that the server does not supply the filesize, only reading of
|
||||
the complete file in one go is supported.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
Full URL of the remote resource, including the protocol
|
||||
session: requests.Session or None
|
||||
All calls will be made within this session, to avoid restarting
|
||||
connections where the server allows this
|
||||
block_size: int or None
|
||||
The amount of read-ahead to do, in bytes. Default is 5MB, or the value
|
||||
configured for the FileSystem creating this file
|
||||
size: None or int
|
||||
If given, this is the size of the file in bytes, and we don't attempt
|
||||
to call the server to find the value.
|
||||
kwargs: all other key-values are passed to requests calls.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
url,
|
||||
session=None,
|
||||
block_size=None,
|
||||
mode="rb",
|
||||
cache_type="bytes",
|
||||
cache_options=None,
|
||||
size=None,
|
||||
**kwargs,
|
||||
):
|
||||
if mode != "rb":
|
||||
raise NotImplementedError("File mode not supported")
|
||||
self.url = url
|
||||
self.session = session
|
||||
self.details = {"name": url, "size": size, "type": "file"}
|
||||
super().__init__(
|
||||
fs=fs,
|
||||
path=url,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
cache_type=cache_type,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def read(self, length=-1):
|
||||
"""Read bytes from file
|
||||
|
||||
Parameters
|
||||
----------
|
||||
length: int
|
||||
Read up to this many bytes. If negative, read all content to end of
|
||||
file. If the server has not supplied the filesize, attempting to
|
||||
read only part of the data will raise a ValueError.
|
||||
"""
|
||||
if (
|
||||
(length < 0 and self.loc == 0) # explicit read all
|
||||
# but not when the size is known and fits into a block anyway
|
||||
and not (self.size is not None and self.size <= self.blocksize)
|
||||
):
|
||||
self._fetch_all()
|
||||
if self.size is None:
|
||||
if length < 0:
|
||||
self._fetch_all()
|
||||
else:
|
||||
length = min(self.size - self.loc, length)
|
||||
return super().read(length)
|
||||
|
||||
def _fetch_all(self):
|
||||
"""Read whole file in one shot, without caching
|
||||
|
||||
This is only called when position is still at zero,
|
||||
and read() is called without a byte-count.
|
||||
"""
|
||||
logger.debug(f"Fetch all for {self}")
|
||||
if not isinstance(self.cache, AllBytes):
|
||||
r = self.session.get(self.fs.encode_url(self.url), **self.kwargs)
|
||||
r.raise_for_status()
|
||||
out = r.content
|
||||
self.cache = AllBytes(size=len(out), fetcher=None, blocksize=None, data=out)
|
||||
self.size = len(out)
|
||||
|
||||
def _parse_content_range(self, headers):
|
||||
"""Parse the Content-Range header"""
|
||||
s = headers.get("Content-Range", "")
|
||||
m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
|
||||
if not m:
|
||||
return None, None, None
|
||||
|
||||
if m[1] == "*":
|
||||
start = end = None
|
||||
else:
|
||||
start, end = [int(x) for x in m[1].split("-")]
|
||||
total = None if m[2] == "*" else int(m[2])
|
||||
return start, end, total
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Download a block of data
|
||||
|
||||
The expectation is that the server returns only the requested bytes,
|
||||
with HTTP code 206. If this is not the case, we first check the headers,
|
||||
and then stream the output - if the data size is bigger than we
|
||||
requested, an exception is raised.
|
||||
"""
|
||||
logger.debug(f"Fetch range for {self}: {start}-{end}")
|
||||
kwargs = self.kwargs.copy()
|
||||
headers = kwargs.pop("headers", {}).copy()
|
||||
headers["Range"] = f"bytes={start}-{end - 1}"
|
||||
logger.debug("%s : %s", self.url, headers["Range"])
|
||||
r = self.session.get(self.fs.encode_url(self.url), headers=headers, **kwargs)
|
||||
if r.status_code == 416:
|
||||
# range request outside file
|
||||
return b""
|
||||
r.raise_for_status()
|
||||
|
||||
# If the server has handled the range request, it should reply
|
||||
# with status 206 (partial content). But we'll guess that a suitable
|
||||
# Content-Range header or a Content-Length no more than the
|
||||
requested range also means we have got the desired range.
|
||||
cl = r.headers.get("Content-Length", r.headers.get("content-length", end + 1))
|
||||
response_is_range = (
|
||||
r.status_code == 206
|
||||
or self._parse_content_range(r.headers)[0] == start
|
||||
or int(cl) <= end - start
|
||||
)
|
||||
|
||||
if response_is_range:
|
||||
# partial content, as expected
|
||||
out = r.content
|
||||
elif start > 0:
|
||||
raise ValueError(
|
||||
"The HTTP server doesn't appear to support range requests. "
|
||||
"Only reading this file from the beginning is supported. "
|
||||
"Open with block_size=0 for a streaming file interface."
|
||||
)
|
||||
else:
|
||||
# Response is not a range, but we want the start of the file,
|
||||
# so we can read the required amount anyway.
|
||||
cl = 0
|
||||
out = []
|
||||
for chunk in r.iter_content(2**20, False):
|
||||
out.append(chunk)
|
||||
cl += len(chunk)
|
||||
out = b"".join(out)[: end - start]
|
||||
return out
|
||||
|
||||
|
||||
magic_check = re.compile("([*[])")
|
||||
|
||||
|
||||
def has_magic(s):
|
||||
match = magic_check.search(s)
|
||||
return match is not None
|
||||
|
||||
|
||||
class HTTPStreamFile(AbstractBufferedFile):
|
||||
def __init__(self, fs, url, mode="rb", session=None, **kwargs):
|
||||
self.url = url
|
||||
self.session = session
|
||||
if mode != "rb":
|
||||
raise ValueError
|
||||
self.details = {"name": url, "size": None}
|
||||
super().__init__(fs=fs, path=url, mode=mode, cache_type="readahead", **kwargs)
|
||||
|
||||
r = self.session.get(self.fs.encode_url(url), stream=True, **kwargs)
|
||||
self.fs._raise_not_found_for_status(r, url)
|
||||
self.it = r.iter_content(1024, False)
|
||||
self.leftover = b""
|
||||
|
||||
self.r = r
|
||||
|
||||
def seek(self, *args, **kwargs):
|
||||
raise ValueError("Cannot seek streaming HTTP file")
|
||||
|
||||
def read(self, num=-1):
|
||||
bufs = [self.leftover]
|
||||
leng = len(self.leftover)
|
||||
while leng < num or num < 0:
|
||||
try:
|
||||
out = self.it.__next__()
|
||||
except StopIteration:
|
||||
break
|
||||
if out:
|
||||
bufs.append(out)
|
||||
else:
|
||||
break
|
||||
leng += len(out)
|
||||
out = b"".join(bufs)
|
||||
if num >= 0:
|
||||
self.leftover = out[num:]
|
||||
out = out[:num]
|
||||
else:
|
||||
self.leftover = b""
|
||||
self.loc += len(out)
|
||||
return out
|
||||
|
||||
def close(self):
|
||||
self.r.close()
|
||||
self.closed = True
|
||||
|
||||
|
||||
def get_range(session, url, start, end, **kwargs):
|
||||
# explicitly get a range when we know it must be safe
|
||||
kwargs = kwargs.copy()
|
||||
headers = kwargs.pop("headers", {}).copy()
|
||||
headers["Range"] = f"bytes={start}-{end - 1}"
|
||||
r = session.get(url, headers=headers, **kwargs)
|
||||
r.raise_for_status()
|
||||
return r.content
|
||||
|
||||
|
||||
def _file_info(url, session, size_policy="head", **kwargs):
|
||||
"""Call HEAD on the server to get details about the file (size/checksum etc.)
|
||||
|
||||
Default operation is to explicitly allow redirects and use encoding
|
||||
'identity' (no compression) to get the true size of the target.
|
||||
"""
|
||||
logger.debug("Retrieve file size for %s", url)
|
||||
kwargs = kwargs.copy()
|
||||
ar = kwargs.pop("allow_redirects", True)
|
||||
head = kwargs.get("headers", {}).copy()
|
||||
# TODO: not allowed in JS
|
||||
# head["Accept-Encoding"] = "identity"
|
||||
kwargs["headers"] = head
|
||||
|
||||
info = {}
|
||||
if size_policy == "head":
|
||||
r = session.head(url, allow_redirects=ar, **kwargs)
|
||||
elif size_policy == "get":
|
||||
r = session.get(url, allow_redirects=ar, **kwargs)
|
||||
else:
|
||||
raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
|
||||
r.raise_for_status()
|
||||
|
||||
# TODO:
|
||||
# recognise lack of 'Accept-Ranges',
|
||||
# or 'Accept-Ranges': 'none' (not 'bytes')
|
||||
# to mean streaming only, no random access => return None
|
||||
if "Content-Length" in r.headers:
|
||||
info["size"] = int(r.headers["Content-Length"])
|
||||
elif "Content-Range" in r.headers:
|
||||
info["size"] = int(r.headers["Content-Range"].split("/")[1])
|
||||
elif "content-length" in r.headers:
|
||||
info["size"] = int(r.headers["content-length"])
|
||||
elif "content-range" in r.headers:
|
||||
info["size"] = int(r.headers["content-range"].split("/")[1])
|
||||
|
||||
for checksum_field in ["ETag", "Content-MD5", "Digest"]:
|
||||
if r.headers.get(checksum_field):
|
||||
info[checksum_field] = r.headers[checksum_field]
|
||||
|
||||
return info
|
||||
|
||||
|
||||
# importing this is enough to register it
|
||||
def register():
|
||||
register_implementation("http", HTTPFileSystem, clobber=True)
|
||||
register_implementation("https", HTTPFileSystem, clobber=True)
|
||||
register_implementation("sync-http", HTTPFileSystem, clobber=True)
|
||||
register_implementation("sync-https", HTTPFileSystem, clobber=True)
|
||||
|
||||
|
||||
register()
|
||||
|
||||
|
||||
def unregister():
|
||||
from fsspec.implementations.http import HTTPFileSystem
|
||||
|
||||
register_implementation("http", HTTPFileSystem, clobber=True)
|
||||
register_implementation("https", HTTPFileSystem, clobber=True)
|
||||
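A minimal usage sketch for the synchronous HTTP filesystem registered above; the snippet is illustrative only and the URL is a placeholder, not part of the vendored module.

import fsspec

# "sync-https" resolves to the requests-based HTTPFileSystem registered above
fs = fsspec.filesystem("sync-https")
# block_size=0 returns a streaming HTTPStreamFile rather than a random-access HTTPFile
with fs.open("https://example.com/data.bin", mode="rb", block_size=0) as f:
    first_kb = f.read(1024)
print(len(first_kb))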
@@ -0,0 +1,129 @@
|
||||
import base64
|
||||
import io
|
||||
import re
|
||||
|
||||
import requests
|
||||
|
||||
import fsspec
|
||||
|
||||
|
||||
class JupyterFileSystem(fsspec.AbstractFileSystem):
|
||||
"""View of the files as seen by a Jupyter server (notebook or lab)"""
|
||||
|
||||
protocol = ("jupyter", "jlab")
|
||||
|
||||
def __init__(self, url, tok=None, **kwargs):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url : str
|
||||
Base URL of the server, like "http://127.0.0.1:8888". May include
|
||||
token in the string, which is given by the process when starting up
|
||||
tok : str
|
||||
If the token is obtained separately, can be given here
|
||||
kwargs
|
||||
"""
|
||||
if "?" in url:
|
||||
if tok is None:
|
||||
try:
|
||||
tok = re.findall("token=([a-z0-9]+)", url)[0]
|
||||
except IndexError as e:
|
||||
raise ValueError("Could not determine token") from e
|
||||
url = url.split("?", 1)[0]
|
||||
self.url = url.rstrip("/") + "/api/contents"
|
||||
self.session = requests.Session()
|
||||
if tok:
|
||||
self.session.headers["Authorization"] = f"token {tok}"
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
r = self.session.get(f"{self.url}/{path}")
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
out = r.json()
|
||||
|
||||
if out["type"] == "directory":
|
||||
out = out["content"]
|
||||
else:
|
||||
out = [out]
|
||||
for o in out:
|
||||
o["name"] = o.pop("path")
|
||||
o.pop("content")
|
||||
if o["type"] == "notebook":
|
||||
o["type"] = "file"
|
||||
if detail:
|
||||
return out
|
||||
return [o["name"] for o in out]
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
r = self.session.get(f"{self.url}/{path}")
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
out = r.json()
|
||||
if out["format"] == "text":
|
||||
# data should be binary
|
||||
b = out["content"].encode()
|
||||
else:
|
||||
b = base64.b64decode(out["content"])
|
||||
return b[start:end]
|
||||
|
||||
def pipe_file(self, path, value, **_):
|
||||
path = self._strip_protocol(path)
|
||||
json = {
|
||||
"name": path.rsplit("/", 1)[-1],
|
||||
"path": path,
|
||||
"size": len(value),
|
||||
"content": base64.b64encode(value).decode(),
|
||||
"format": "base64",
|
||||
"type": "file",
|
||||
}
|
||||
self.session.put(f"{self.url}/{path}", json=json)
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if create_parents and "/" in path:
|
||||
self.mkdir(path.rsplit("/", 1)[0], True)
|
||||
json = {
|
||||
"name": path.rsplit("/", 1)[-1],
|
||||
"path": path,
|
||||
"size": None,
|
||||
"content": None,
|
||||
"type": "directory",
|
||||
}
|
||||
self.session.put(f"{self.url}/{path}", json=json)
|
||||
|
||||
def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
|
||||
if path1 == path2:
|
||||
return
|
||||
self.session.patch(f"{self.url}/{path1}", json={"path": path2})
|
||||
|
||||
def _rm(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.session.delete(f"{self.url}/{path}")
|
||||
|
||||
def _open(self, path, mode="rb", **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if mode == "rb":
|
||||
data = self.cat_file(path)
|
||||
return io.BytesIO(data)
|
||||
else:
|
||||
return SimpleFileWriter(self, path, mode="wb")
|
||||
|
||||
|
||||
class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
|
||||
def _upload_chunk(self, final=False):
|
||||
"""Never uploads a chunk until file is done
|
||||
|
||||
Not suitable for large files
|
||||
"""
|
||||
if final is False:
|
||||
return False
|
||||
self.buffer.seek(0)
|
||||
data = self.buffer.read()
|
||||
self.fs.pipe_file(self.path, data)
|
||||
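A hedged usage sketch for the Jupyter contents-API filesystem above; the server URL and token are placeholders, not values taken from this change.

import fsspec

fs = fsspec.filesystem("jupyter", url="http://127.0.0.1:8888", tok="<token>")
print(fs.ls("/", detail=False))        # top-level files and notebooks
fs.pipe_file("scratch.txt", b"hello")  # written via a PUT to the contents API
print(fs.cat_file("scratch.txt"))      # b'hello'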
@@ -0,0 +1,213 @@
|
||||
from contextlib import contextmanager
|
||||
from ctypes import (
|
||||
CFUNCTYPE,
|
||||
POINTER,
|
||||
c_int,
|
||||
c_longlong,
|
||||
c_void_p,
|
||||
cast,
|
||||
create_string_buffer,
|
||||
)
|
||||
|
||||
import libarchive
|
||||
import libarchive.ffi as ffi
|
||||
|
||||
from fsspec import open_files
|
||||
from fsspec.archive import AbstractArchiveFileSystem
|
||||
from fsspec.implementations.memory import MemoryFile
|
||||
from fsspec.utils import DEFAULT_BLOCK_SIZE
|
||||
|
||||
# Libarchive requires seekable files or memory only for certain archive
|
||||
# types. However, since we read the directory first to cache the contents
|
||||
# and also allow random access to any file, the file-like object needs
|
||||
# to be seekable no matter what.
|
||||
|
||||
# Seek call-backs (not provided in the libarchive python wrapper)
|
||||
SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
|
||||
read_set_seek_callback = ffi.ffi(
|
||||
"read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
|
||||
)
|
||||
new_api = hasattr(ffi, "NO_OPEN_CB")
|
||||
|
||||
|
||||
@contextmanager
|
||||
def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
|
||||
"""Read an archive from a seekable file-like object.
|
||||
|
||||
The `file` object must support the standard `readinto` and `seek` methods.
|
||||
"""
|
||||
buf = create_string_buffer(block_size)
|
||||
buf_p = cast(buf, c_void_p)
|
||||
|
||||
def read_func(archive_p, context, ptrptr):
|
||||
# readinto the buffer, returns number of bytes read
|
||||
length = file.readinto(buf)
|
||||
# write the address of the buffer into the pointer
|
||||
ptrptr = cast(ptrptr, POINTER(c_void_p))
|
||||
ptrptr[0] = buf_p
|
||||
# tell libarchive how much data was written into the buffer
|
||||
return length
|
||||
|
||||
def seek_func(archive_p, context, offset, whence):
|
||||
file.seek(offset, whence)
|
||||
# tell libarchive the current position
|
||||
return file.tell()
|
||||
|
||||
read_cb = ffi.READ_CALLBACK(read_func)
|
||||
seek_cb = SEEK_CALLBACK(seek_func)
|
||||
|
||||
if new_api:
|
||||
open_cb = ffi.NO_OPEN_CB
|
||||
close_cb = ffi.NO_CLOSE_CB
|
||||
else:
|
||||
open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
|
||||
close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)
|
||||
|
||||
with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
|
||||
read_set_seek_callback(archive_p, seek_cb)
|
||||
ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
|
||||
yield libarchive.read.ArchiveRead(archive_p)
|
||||
|
||||
|
||||
class LibArchiveFileSystem(AbstractArchiveFileSystem):
|
||||
"""Compressed archives as a file-system (read-only)
|
||||
|
||||
Supports the following formats:
|
||||
tar, pax, cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar,
|
||||
Microsoft CAB, 7-Zip, WARC
|
||||
|
||||
See the libarchive documentation for further restrictions.
|
||||
https://www.libarchive.org/
|
||||
|
||||
Keeps the file object open while the instance lives. It only works with seekable
|
||||
file-like objects. In case the filesystem does not support this kind of
|
||||
file object, it is recommended to cache locally.
|
||||
|
||||
This class is pickleable, but not necessarily thread-safe (depends on the
|
||||
platform). See libarchive documentation for details.
|
||||
"""
|
||||
|
||||
root_marker = ""
|
||||
protocol = "libarchive"
|
||||
cachable = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fo="",
|
||||
mode="r",
|
||||
target_protocol=None,
|
||||
target_options=None,
|
||||
block_size=DEFAULT_BLOCK_SIZE,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
fo: str or file-like
|
||||
Contains the archive, and must exist. If a str, will fetch file using
|
||||
:meth:`~fsspec.open_files`, which must return one file exactly.
|
||||
mode: str
|
||||
Currently, only 'r' accepted
|
||||
target_protocol: str (optional)
|
||||
If ``fo`` is a string, this value can be used to override the
|
||||
FS protocol inferred from a URL
|
||||
target_options: dict (optional)
|
||||
Kwargs passed when instantiating the target FS, if ``fo`` is
|
||||
a string.
|
||||
"""
|
||||
super().__init__(self, **kwargs)
|
||||
if mode != "r":
|
||||
raise ValueError("Only read from archive files accepted")
|
||||
if isinstance(fo, str):
|
||||
files = open_files(fo, protocol=target_protocol, **(target_options or {}))
|
||||
if len(files) != 1:
|
||||
raise ValueError(
|
||||
f'Path "{fo}" did not resolve to exactly one file: "{files}"'
|
||||
)
|
||||
fo = files[0]
|
||||
self.of = fo
|
||||
self.fo = fo.__enter__() # the whole instance is a context
|
||||
self.block_size = block_size
|
||||
self.dir_cache = None
|
||||
|
||||
@contextmanager
|
||||
def _open_archive(self):
|
||||
self.fo.seek(0)
|
||||
with custom_reader(self.fo, block_size=self.block_size) as arc:
|
||||
yield arc
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
# file paths are always relative to the archive root
|
||||
return super()._strip_protocol(path).lstrip("/")
|
||||
|
||||
def _get_dirs(self):
|
||||
fields = {
|
||||
"name": "pathname",
|
||||
"size": "size",
|
||||
"created": "ctime",
|
||||
"mode": "mode",
|
||||
"uid": "uid",
|
||||
"gid": "gid",
|
||||
"mtime": "mtime",
|
||||
}
|
||||
|
||||
if self.dir_cache is not None:
|
||||
return
|
||||
|
||||
self.dir_cache = {}
|
||||
list_names = []
|
||||
with self._open_archive() as arc:
|
||||
for entry in arc:
|
||||
if not entry.isdir and not entry.isfile:
|
||||
# Skip symbolic links, fifo entries, etc.
|
||||
continue
|
||||
self.dir_cache.update(
|
||||
{
|
||||
dirname: {"name": dirname, "size": 0, "type": "directory"}
|
||||
for dirname in self._all_dirnames(set(entry.name))
|
||||
}
|
||||
)
|
||||
f = {key: getattr(entry, fields[key]) for key in fields}
|
||||
f["type"] = "directory" if entry.isdir else "file"
|
||||
list_names.append(entry.name)
|
||||
|
||||
self.dir_cache[f["name"]] = f
|
||||
# libarchive does not seem to return an entry for the directories (at least
|
||||
# not in all formats), so get the directories names from the files names
|
||||
self.dir_cache.update(
|
||||
{
|
||||
dirname: {"name": dirname, "size": 0, "type": "directory"}
|
||||
for dirname in self._all_dirnames(list_names)
|
||||
}
|
||||
)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
path = self._strip_protocol(path)
|
||||
if mode != "rb":
|
||||
raise NotImplementedError
|
||||
|
||||
data = bytes()
|
||||
with self._open_archive() as arc:
|
||||
for entry in arc:
|
||||
if entry.pathname != path:
|
||||
continue
|
||||
|
||||
if entry.size == 0:
|
||||
# empty file, so there are no blocks
|
||||
break
|
||||
|
||||
for block in entry.get_blocks(entry.size):
|
||||
data = block
|
||||
break
|
||||
else:
|
||||
raise ValueError
|
||||
return MemoryFile(fs=self, path=path, data=data)
|
||||
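A hedged usage sketch of the libarchive-backed filesystem above; the archive path and member name are placeholders.

import fsspec

# any archive format supported by libarchive, reachable as a local or remote path
fs = fsspec.filesystem("libarchive", fo="example.7z")
print(fs.ls("", detail=False))               # member names cached by _get_dirs()
with fs.open("docs/readme.txt", "rb") as f:  # contents returned as a MemoryFile
    text = f.read()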
@@ -0,0 +1,514 @@
|
||||
import datetime
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import os.path as osp
|
||||
import shutil
|
||||
import stat
|
||||
import tempfile
|
||||
from functools import lru_cache
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
from fsspec.compression import compr
|
||||
from fsspec.core import get_compression
|
||||
from fsspec.utils import isfilelike, stringify_path
|
||||
|
||||
logger = logging.getLogger("fsspec.local")
|
||||
|
||||
|
||||
class LocalFileSystem(AbstractFileSystem):
|
||||
"""Interface to files on local storage
|
||||
|
||||
Parameters
|
||||
----------
|
||||
auto_mkdir: bool
|
||||
Whether, when opening a file, the directory containing it should
|
||||
be created (if it doesn't already exist). This is assumed by pyarrow
|
||||
code.
|
||||
"""
|
||||
|
||||
root_marker = "/"
|
||||
protocol = "file", "local"
|
||||
local_file = True
|
||||
|
||||
def __init__(self, auto_mkdir=False, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.auto_mkdir = auto_mkdir
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return "local"
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if self.exists(path):
|
||||
raise FileExistsError(path)
|
||||
if create_parents:
|
||||
self.makedirs(path, exist_ok=True)
|
||||
else:
|
||||
os.mkdir(path, **kwargs)
|
||||
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
path = self._strip_protocol(path)
|
||||
os.makedirs(path, exist_ok=exist_ok)
|
||||
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
os.rmdir(path)
|
||||
|
||||
def ls(self, path, detail=False, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
path_info = self.info(path)
|
||||
infos = []
|
||||
if path_info["type"] == "directory":
|
||||
with os.scandir(path) as it:
|
||||
for f in it:
|
||||
try:
|
||||
# Only get the info if requested since it is a bit expensive (the stat call inside)
|
||||
# The strip_protocol is also used in info() and calls make_path_posix to always return posix paths
|
||||
info = self.info(f) if detail else self._strip_protocol(f.path)
|
||||
infos.append(info)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
else:
|
||||
infos = [path_info] if detail else [path_info["name"]]
|
||||
|
||||
return infos
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
if isinstance(path, os.DirEntry):
|
||||
# scandir DirEntry
|
||||
out = path.stat(follow_symlinks=False)
|
||||
link = path.is_symlink()
|
||||
if path.is_dir(follow_symlinks=False):
|
||||
t = "directory"
|
||||
elif path.is_file(follow_symlinks=False):
|
||||
t = "file"
|
||||
else:
|
||||
t = "other"
|
||||
|
||||
size = out.st_size
|
||||
if link:
|
||||
try:
|
||||
out2 = path.stat(follow_symlinks=True)
|
||||
size = out2.st_size
|
||||
except OSError:
|
||||
size = 0
|
||||
path = self._strip_protocol(path.path)
|
||||
else:
|
||||
# str or path-like
|
||||
path = self._strip_protocol(path)
|
||||
out = os.stat(path, follow_symlinks=False)
|
||||
link = stat.S_ISLNK(out.st_mode)
|
||||
if link:
|
||||
out = os.stat(path, follow_symlinks=True)
|
||||
size = out.st_size
|
||||
if stat.S_ISDIR(out.st_mode):
|
||||
t = "directory"
|
||||
elif stat.S_ISREG(out.st_mode):
|
||||
t = "file"
|
||||
else:
|
||||
t = "other"
|
||||
|
||||
# Check for the 'st_birthtime' attribute, which is not always present; fallback to st_ctime
|
||||
created_time = getattr(out, "st_birthtime", out.st_ctime)
|
||||
|
||||
result = {
|
||||
"name": path,
|
||||
"size": size,
|
||||
"type": t,
|
||||
"created": created_time,
|
||||
"islink": link,
|
||||
}
|
||||
for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
|
||||
result[field] = getattr(out, f"st_{field}")
|
||||
if link:
|
||||
result["destination"] = os.readlink(path)
|
||||
return result
|
||||
|
||||
def lexists(self, path, **kwargs):
|
||||
return osp.lexists(path)
|
||||
|
||||
def cp_file(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1)
|
||||
path2 = self._strip_protocol(path2)
|
||||
if self.auto_mkdir:
|
||||
self.makedirs(self._parent(path2), exist_ok=True)
|
||||
if self.isfile(path1):
|
||||
shutil.copyfile(path1, path2)
|
||||
elif self.isdir(path1):
|
||||
self.mkdirs(path2, exist_ok=True)
|
||||
else:
|
||||
raise FileNotFoundError(path1)
|
||||
|
||||
def isfile(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
return os.path.isfile(path)
|
||||
|
||||
def isdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
return os.path.isdir(path)
|
||||
|
||||
def get_file(self, path1, path2, callback=None, **kwargs):
|
||||
if isfilelike(path2):
|
||||
with open(path1, "rb") as f:
|
||||
shutil.copyfileobj(f, path2)
|
||||
else:
|
||||
return self.cp_file(path1, path2, **kwargs)
|
||||
|
||||
def put_file(self, path1, path2, callback=None, **kwargs):
|
||||
return self.cp_file(path1, path2, **kwargs)
|
||||
|
||||
def mv(self, path1, path2, recursive: bool = True, **kwargs):
|
||||
"""Move files/directories
|
||||
For the specific case of local, all ops on directories are recursive and
|
||||
the recursive= kwarg is ignored.
|
||||
"""
|
||||
path1 = self._strip_protocol(path1)
|
||||
path2 = self._strip_protocol(path2)
|
||||
shutil.move(path1, path2)
|
||||
|
||||
def link(self, src, dst, **kwargs):
|
||||
src = self._strip_protocol(src)
|
||||
dst = self._strip_protocol(dst)
|
||||
os.link(src, dst, **kwargs)
|
||||
|
||||
def symlink(self, src, dst, **kwargs):
|
||||
src = self._strip_protocol(src)
|
||||
dst = self._strip_protocol(dst)
|
||||
os.symlink(src, dst, **kwargs)
|
||||
|
||||
def islink(self, path) -> bool:
|
||||
return os.path.islink(self._strip_protocol(path))
|
||||
|
||||
def rm_file(self, path):
|
||||
os.remove(self._strip_protocol(path))
|
||||
|
||||
def rm(self, path, recursive=False, maxdepth=None):
|
||||
if not isinstance(path, list):
|
||||
path = [path]
|
||||
|
||||
for p in path:
|
||||
p = self._strip_protocol(p)
|
||||
if self.isdir(p):
|
||||
if not recursive:
|
||||
raise ValueError("Cannot delete directory, set recursive=True")
|
||||
if osp.abspath(p) == os.getcwd():
|
||||
raise ValueError("Cannot delete current working directory")
|
||||
shutil.rmtree(p)
|
||||
else:
|
||||
os.remove(p)
|
||||
|
||||
def unstrip_protocol(self, name):
|
||||
name = self._strip_protocol(name) # normalise for local/win/...
|
||||
return f"file://{name}"
|
||||
|
||||
def _open(self, path, mode="rb", block_size=None, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if self.auto_mkdir and "w" in mode:
|
||||
self.makedirs(self._parent(path), exist_ok=True)
|
||||
return LocalFileOpener(path, mode, fs=self, **kwargs)
|
||||
|
||||
def touch(self, path, truncate=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if self.auto_mkdir:
|
||||
self.makedirs(self._parent(path), exist_ok=True)
|
||||
if self.exists(path):
|
||||
os.utime(path, None)
|
||||
else:
|
||||
open(path, "a").close()
|
||||
if truncate:
|
||||
os.truncate(path, 0)
|
||||
|
||||
def created(self, path):
|
||||
info = self.info(path=path)
|
||||
return datetime.datetime.fromtimestamp(
|
||||
info["created"], tz=datetime.timezone.utc
|
||||
)
|
||||
|
||||
def modified(self, path):
|
||||
info = self.info(path=path)
|
||||
return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
|
||||
|
||||
@classmethod
|
||||
def _parent(cls, path):
|
||||
path = cls._strip_protocol(path)
|
||||
if os.sep == "/":
|
||||
# posix native
|
||||
return path.rsplit("/", 1)[0] or "/"
|
||||
else:
|
||||
# NT
|
||||
path_ = path.rsplit("/", 1)[0]
|
||||
if len(path_) <= 3:
|
||||
if path_[1:2] == ":":
|
||||
# nt root (something like c:/)
|
||||
return path_[0] + ":/"
|
||||
# More cases may be required here
|
||||
return path_
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
path = stringify_path(path)
|
||||
if path.startswith("file://"):
|
||||
path = path[7:]
|
||||
elif path.startswith("file:"):
|
||||
path = path[5:]
|
||||
elif path.startswith("local://"):
|
||||
path = path[8:]
|
||||
elif path.startswith("local:"):
|
||||
path = path[6:]
|
||||
|
||||
path = make_path_posix(path)
|
||||
if os.sep != "/":
|
||||
# This code-path is a stripped down version of
|
||||
# > drive, path = ntpath.splitdrive(path)
|
||||
if path[1:2] == ":":
|
||||
# Absolute drive-letter path, e.g. X:\Windows
|
||||
# Relative path with drive, e.g. X:Windows
|
||||
drive, path = path[:2], path[2:]
|
||||
elif path[:2] == "//":
|
||||
# UNC drives, e.g. \\server\share or \\?\UNC\server\share
|
||||
# Device drives, e.g. \\.\device or \\?\device
|
||||
if (index1 := path.find("/", 2)) == -1 or (
|
||||
index2 := path.find("/", index1 + 1)
|
||||
) == -1:
|
||||
drive, path = path, ""
|
||||
else:
|
||||
drive, path = path[:index2], path[index2:]
|
||||
else:
|
||||
# Relative path, e.g. Windows
|
||||
drive = ""
|
||||
|
||||
path = path.rstrip("/") or cls.root_marker
|
||||
return drive + path
|
||||
|
||||
else:
|
||||
return path.rstrip("/") or cls.root_marker
|
||||
|
||||
def _isfilestore(self):
|
||||
# Inheriting from DaskFileSystem makes this False (S3, etc. were
|
||||
# the original motivation). But we are a posix-like file system.
|
||||
# See https://github.com/dask/dask/issues/5526
|
||||
return True
|
||||
|
||||
def chmod(self, path, mode):
|
||||
path = stringify_path(path)
|
||||
return os.chmod(path, mode)
|
||||
|
||||
|
||||
def make_path_posix(path):
|
||||
"""Make path generic and absolute for current OS"""
|
||||
if not isinstance(path, str):
|
||||
if isinstance(path, (list, set, tuple)):
|
||||
return type(path)(make_path_posix(p) for p in path)
|
||||
else:
|
||||
path = stringify_path(path)
|
||||
if not isinstance(path, str):
|
||||
raise TypeError(f"could not convert {path!r} to string")
|
||||
if os.sep == "/":
|
||||
# Native posix
|
||||
if path.startswith("/"):
|
||||
# most common fast case for posix
|
||||
return path
|
||||
elif path.startswith("~"):
|
||||
return osp.expanduser(path)
|
||||
elif path.startswith("./"):
|
||||
path = path[2:]
|
||||
elif path == ".":
|
||||
path = ""
|
||||
return f"{os.getcwd()}/{path}"
|
||||
else:
|
||||
# NT handling
|
||||
if path[0:1] == "/" and path[2:3] == ":":
|
||||
# path is like "/c:/local/path"
|
||||
path = path[1:]
|
||||
if path[1:2] == ":":
|
||||
# windows full path like "C:\\local\\path"
|
||||
if len(path) <= 3:
|
||||
# nt root (something like c:/)
|
||||
return path[0] + ":/"
|
||||
path = path.replace("\\", "/")
|
||||
return path
|
||||
elif path[0:1] == "~":
|
||||
return make_path_posix(osp.expanduser(path))
|
||||
elif path.startswith(("\\\\", "//")):
|
||||
# windows UNC/DFS-style paths
|
||||
return "//" + path[2:].replace("\\", "/")
|
||||
elif path.startswith(("\\", "/")):
|
||||
# windows relative path with root
|
||||
path = path.replace("\\", "/")
|
||||
return f"{osp.splitdrive(os.getcwd())[0]}{path}"
|
||||
else:
|
||||
path = path.replace("\\", "/")
|
||||
if path.startswith("./"):
|
||||
path = path[2:]
|
||||
elif path == ".":
|
||||
path = ""
|
||||
return f"{make_path_posix(os.getcwd())}/{path}"
|
||||
|
||||
|
||||
def trailing_sep(path):
|
||||
"""Return True if the path ends with a path separator.
|
||||
|
||||
A forward slash is always considered a path separator, even on Operating
|
||||
Systems that normally use a backslash.
|
||||
"""
|
||||
# TODO: if all incoming paths were posix-compliant then separator would
|
||||
# always be a forward slash, simplifying this function.
|
||||
# See https://github.com/fsspec/filesystem_spec/pull/1250
|
||||
return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def get_umask(mask: int = 0o666) -> int:
|
||||
"""Get the current umask.
|
||||
|
||||
Follows https://stackoverflow.com/a/44130549 to get the umask.
|
||||
Temporarily sets the umask to the given value, and then resets it to the
|
||||
original value.
|
||||
"""
|
||||
value = os.umask(mask)
|
||||
os.umask(value)
|
||||
return value
|
||||
|
||||
|
||||
class LocalFileOpener(io.IOBase):
|
||||
def __init__(
|
||||
self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
|
||||
):
|
||||
logger.debug("open file: %s", path)
|
||||
self.path = path
|
||||
self.mode = mode
|
||||
self.fs = fs
|
||||
self.f = None
|
||||
self.autocommit = autocommit
|
||||
self.compression = get_compression(path, compression)
|
||||
self.blocksize = io.DEFAULT_BUFFER_SIZE
|
||||
self._open()
|
||||
|
||||
def _open(self):
|
||||
if self.f is None or self.f.closed:
|
||||
if self.autocommit or "w" not in self.mode:
|
||||
self.f = open(self.path, mode=self.mode)
|
||||
if self.compression:
|
||||
compress = compr[self.compression]
|
||||
self.f = compress(self.f, mode=self.mode)
|
||||
else:
|
||||
# TODO: check if path is writable?
|
||||
i, name = tempfile.mkstemp()
|
||||
os.close(i) # we want normal open and normal buffered file
|
||||
self.temp = name
|
||||
self.f = open(name, mode=self.mode)
|
||||
if "w" not in self.mode:
|
||||
self.size = self.f.seek(0, 2)
|
||||
self.f.seek(0)
|
||||
self.f.size = self.size
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
# probably only used by cached FS
|
||||
if "r" not in self.mode:
|
||||
raise ValueError
|
||||
self._open()
|
||||
self.f.seek(start)
|
||||
return self.f.read(end - start)
|
||||
|
||||
def __setstate__(self, state):
|
||||
self.f = None
|
||||
loc = state.pop("loc", None)
|
||||
self.__dict__.update(state)
|
||||
if "r" in state["mode"]:
|
||||
self.f = None
|
||||
self._open()
|
||||
self.f.seek(loc)
|
||||
|
||||
def __getstate__(self):
|
||||
d = self.__dict__.copy()
|
||||
d.pop("f")
|
||||
if "r" in self.mode:
|
||||
d["loc"] = self.f.tell()
|
||||
else:
|
||||
if not self.f.closed:
|
||||
raise ValueError("Cannot serialise open write-mode local file")
|
||||
return d
|
||||
|
||||
def commit(self):
|
||||
if self.autocommit:
|
||||
raise RuntimeError("Can only commit if not already set to autocommit")
|
||||
try:
|
||||
shutil.move(self.temp, self.path)
|
||||
except PermissionError as e:
|
||||
# shutil.move raises PermissionError if os.rename
|
||||
# and the default copy2 fallback with shutil.copystats fail.
|
||||
# The file should be there nonetheless, but without copied permissions.
|
||||
# If it doesn't exist, there was no permission to create the file.
|
||||
if not os.path.exists(self.path):
|
||||
raise e
|
||||
else:
|
||||
# If PermissionError is not raised, permissions can be set.
|
||||
try:
|
||||
mask = 0o666
|
||||
os.chmod(self.path, mask & ~get_umask(mask))
|
||||
except RuntimeError:
|
||||
pass
|
||||
|
||||
def discard(self):
|
||||
if self.autocommit:
|
||||
raise RuntimeError("Cannot discard if set to autocommit")
|
||||
os.remove(self.temp)
|
||||
|
||||
def readable(self) -> bool:
|
||||
return True
|
||||
|
||||
def writable(self) -> bool:
|
||||
return "r" not in self.mode
|
||||
|
||||
def read(self, *args, **kwargs):
|
||||
return self.f.read(*args, **kwargs)
|
||||
|
||||
def write(self, *args, **kwargs):
|
||||
return self.f.write(*args, **kwargs)
|
||||
|
||||
def tell(self, *args, **kwargs):
|
||||
return self.f.tell(*args, **kwargs)
|
||||
|
||||
def seek(self, *args, **kwargs):
|
||||
return self.f.seek(*args, **kwargs)
|
||||
|
||||
def seekable(self, *args, **kwargs):
|
||||
return self.f.seekable(*args, **kwargs)
|
||||
|
||||
def readline(self, *args, **kwargs):
|
||||
return self.f.readline(*args, **kwargs)
|
||||
|
||||
def readlines(self, *args, **kwargs):
|
||||
return self.f.readlines(*args, **kwargs)
|
||||
|
||||
def close(self):
|
||||
return self.f.close()
|
||||
|
||||
def truncate(self, size=None) -> int:
|
||||
return self.f.truncate(size)
|
||||
|
||||
@property
|
||||
def closed(self):
|
||||
return self.f.closed
|
||||
|
||||
def fileno(self):
|
||||
return self.raw.fileno()
|
||||
|
||||
def flush(self) -> None:
|
||||
self.f.flush()
|
||||
|
||||
def __iter__(self):
|
||||
return self.f.__iter__()
|
||||
|
||||
def __getattr__(self, item):
|
||||
return getattr(self.f, item)
|
||||
|
||||
def __enter__(self):
|
||||
self._incontext = True
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self._incontext = False
|
||||
self.f.__exit__(exc_type, exc_value, traceback)
|
||||
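A hedged sketch of the local filesystem's auto_mkdir behaviour and of the path normalisation helper defined above; the path under /tmp is a placeholder.

from fsspec.implementations.local import LocalFileSystem, make_path_posix

fs = LocalFileSystem(auto_mkdir=True)
with fs.open("/tmp/fsspec-demo/nested/out.txt", "wb") as f:  # parent dirs created on write
    f.write(b"hello")
print(fs.info("/tmp/fsspec-demo/nested/out.txt")["size"])    # 5
print(make_path_posix("./relative/file"))                    # absolute, forward-slash path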
@@ -0,0 +1,311 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from errno import ENOTEMPTY
|
||||
from io import BytesIO
|
||||
from pathlib import PurePath, PureWindowsPath
|
||||
from typing import Any, ClassVar
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
from fsspec.implementations.local import LocalFileSystem
|
||||
from fsspec.utils import stringify_path
|
||||
|
||||
logger = logging.getLogger("fsspec.memoryfs")
|
||||
|
||||
|
||||
class MemoryFileSystem(AbstractFileSystem):
|
||||
"""A filesystem based on a dict of BytesIO objects
|
||||
|
||||
This is a global filesystem so instances of this class all point to the same
|
||||
in memory filesystem.
|
||||
"""
|
||||
|
||||
store: ClassVar[dict[str, Any]] = {} # global, do not overwrite!
|
||||
pseudo_dirs = [""] # global, do not overwrite!
|
||||
protocol = "memory"
|
||||
root_marker = "/"
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
if isinstance(path, PurePath):
|
||||
if isinstance(path, PureWindowsPath):
|
||||
return LocalFileSystem._strip_protocol(path)
|
||||
else:
|
||||
path = stringify_path(path)
|
||||
|
||||
path = path.removeprefix("memory://")
|
||||
if "::" in path or "://" in path:
|
||||
return path.rstrip("/")
|
||||
path = path.lstrip("/").rstrip("/")
|
||||
return "/" + path if path else ""
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if path in self.store:
|
||||
# there is a key with this exact name
|
||||
if not detail:
|
||||
return [path]
|
||||
return [
|
||||
{
|
||||
"name": path,
|
||||
"size": self.store[path].size,
|
||||
"type": "file",
|
||||
"created": self.store[path].created.timestamp(),
|
||||
}
|
||||
]
|
||||
paths = set()
|
||||
starter = path + "/"
|
||||
out = []
|
||||
for p2 in tuple(self.store):
|
||||
if p2.startswith(starter):
|
||||
if "/" not in p2[len(starter) :]:
|
||||
# exact child
|
||||
out.append(
|
||||
{
|
||||
"name": p2,
|
||||
"size": self.store[p2].size,
|
||||
"type": "file",
|
||||
"created": self.store[p2].created.timestamp(),
|
||||
}
|
||||
)
|
||||
elif len(p2) > len(starter):
|
||||
# implied child directory
|
||||
ppath = starter + p2[len(starter) :].split("/", 1)[0]
|
||||
if ppath not in paths:
|
||||
out = out or []
|
||||
out.append(
|
||||
{
|
||||
"name": ppath,
|
||||
"size": 0,
|
||||
"type": "directory",
|
||||
}
|
||||
)
|
||||
paths.add(ppath)
|
||||
for p2 in self.pseudo_dirs:
|
||||
if p2.startswith(starter):
|
||||
if "/" not in p2[len(starter) :]:
|
||||
# exact child pdir
|
||||
if p2 not in paths:
|
||||
out.append({"name": p2, "size": 0, "type": "directory"})
|
||||
paths.add(p2)
|
||||
else:
|
||||
# directory implied by deeper pdir
|
||||
ppath = starter + p2[len(starter) :].split("/", 1)[0]
|
||||
if ppath not in paths:
|
||||
out.append({"name": ppath, "size": 0, "type": "directory"})
|
||||
paths.add(ppath)
|
||||
if not out:
|
||||
if path in self.pseudo_dirs:
|
||||
# empty dir
|
||||
return []
|
||||
raise FileNotFoundError(path)
|
||||
if detail:
|
||||
return out
|
||||
return sorted([f["name"] for f in out])
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if path in self.store or path in self.pseudo_dirs:
|
||||
raise FileExistsError(path)
|
||||
if self._parent(path).strip("/") and self.isfile(self._parent(path)):
|
||||
raise NotADirectoryError(self._parent(path))
|
||||
if create_parents and self._parent(path).strip("/"):
|
||||
try:
|
||||
self.mkdir(self._parent(path), create_parents, **kwargs)
|
||||
except FileExistsError:
|
||||
pass
|
||||
if path and path not in self.pseudo_dirs:
|
||||
self.pseudo_dirs.append(path)
|
||||
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
try:
|
||||
self.mkdir(path, create_parents=True)
|
||||
except FileExistsError:
|
||||
if not exist_ok:
|
||||
raise
|
||||
|
||||
def pipe_file(self, path, value, mode="overwrite", **kwargs):
|
||||
"""Set the bytes of given file
|
||||
|
||||
Avoids copies of the data if possible
|
||||
"""
|
||||
mode = "xb" if mode == "create" else "wb"
|
||||
self.open(path, mode=mode, data=value)
|
||||
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
if path == "":
|
||||
# silently avoid deleting FS root
|
||||
return
|
||||
if path in self.pseudo_dirs:
|
||||
if not self.ls(path):
|
||||
self.pseudo_dirs.remove(path)
|
||||
else:
|
||||
raise OSError(ENOTEMPTY, "Directory not empty", path)
|
||||
else:
|
||||
raise FileNotFoundError(path)
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
logger.debug("info: %s", path)
|
||||
path = self._strip_protocol(path)
|
||||
if path in self.pseudo_dirs or any(
|
||||
p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs
|
||||
):
|
||||
return {
|
||||
"name": path,
|
||||
"size": 0,
|
||||
"type": "directory",
|
||||
}
|
||||
elif path in self.store:
|
||||
filelike = self.store[path]
|
||||
return {
|
||||
"name": path,
|
||||
"size": filelike.size,
|
||||
"type": "file",
|
||||
"created": getattr(filelike, "created", None),
|
||||
}
|
||||
else:
|
||||
raise FileNotFoundError(path)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
path = self._strip_protocol(path)
|
||||
if "x" in mode and self.exists(path):
|
||||
raise FileExistsError
|
||||
if path in self.pseudo_dirs:
|
||||
raise IsADirectoryError(path)
|
||||
parent = path
|
||||
while len(parent) > 1:
|
||||
parent = self._parent(parent)
|
||||
if self.isfile(parent):
|
||||
raise FileExistsError(parent)
|
||||
if mode in ["rb", "ab", "r+b", "a+b"]:
|
||||
if path in self.store:
|
||||
f = self.store[path]
|
||||
if "a" in mode:
|
||||
# position at the end of file
|
||||
f.seek(0, 2)
|
||||
else:
|
||||
# position at the beginning of file
|
||||
f.seek(0)
|
||||
return f
|
||||
else:
|
||||
raise FileNotFoundError(path)
|
||||
elif mode in {"wb", "w+b", "xb", "x+b"}:
|
||||
if "x" in mode and self.exists(path):
|
||||
raise FileExistsError
|
||||
m = MemoryFile(self, path, kwargs.get("data"))
|
||||
if not self._intrans:
|
||||
m.commit()
|
||||
return m
|
||||
else:
|
||||
name = self.__class__.__name__
|
||||
raise ValueError(f"unsupported file mode for {name}: {mode!r}")
|
||||
|
||||
def cp_file(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1)
|
||||
path2 = self._strip_protocol(path2)
|
||||
if self.isfile(path1):
|
||||
self.store[path2] = MemoryFile(
|
||||
self, path2, self.store[path1].getvalue()
|
||||
) # implicit copy
|
||||
elif self.isdir(path1):
|
||||
if path2 not in self.pseudo_dirs:
|
||||
self.pseudo_dirs.append(path2)
|
||||
else:
|
||||
raise FileNotFoundError(path1)
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
logger.debug("cat: %s", path)
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
return bytes(self.store[path].getbuffer()[start:end])
|
||||
except KeyError as e:
|
||||
raise FileNotFoundError(path) from e
|
||||
|
||||
def _rm(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
del self.store[path]
|
||||
except KeyError as e:
|
||||
raise FileNotFoundError(path) from e
|
||||
|
||||
def modified(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
return self.store[path].modified
|
||||
except KeyError as e:
|
||||
raise FileNotFoundError(path) from e
|
||||
|
||||
def created(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
return self.store[path].created
|
||||
except KeyError as e:
|
||||
raise FileNotFoundError(path) from e
|
||||
|
||||
def isfile(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
return path in self.store
|
||||
|
||||
def rm(self, path, recursive=False, maxdepth=None):
|
||||
if isinstance(path, str):
|
||||
path = self._strip_protocol(path)
|
||||
else:
|
||||
path = [self._strip_protocol(p) for p in path]
|
||||
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
|
||||
for p in reversed(paths):
|
||||
if self.isfile(p):
|
||||
self.rm_file(p)
|
||||
# If the expanded path doesn't exist, it is only because the expanded
|
||||
# path was a directory that does not exist in self.pseudo_dirs. This
|
||||
# is possible if you directly create files without making the
|
||||
# directories first.
|
||||
elif not self.exists(p):
|
||||
continue
|
||||
else:
|
||||
self.rmdir(p)
|
||||
|
||||
|
||||
class MemoryFile(BytesIO):
|
||||
"""A BytesIO which can't close and works as a context manager
|
||||
|
||||
Can initialise with data. Each path should only be active once at any moment.
|
||||
|
||||
No need to provide fs, path if auto-committing (default)
|
||||
"""
|
||||
|
||||
def __init__(self, fs=None, path=None, data=None):
|
||||
logger.debug("open file %s", path)
|
||||
self.fs = fs
|
||||
self.path = path
|
||||
self.created = datetime.now(tz=timezone.utc)
|
||||
self.modified = datetime.now(tz=timezone.utc)
|
||||
if data:
|
||||
super().__init__(data)
|
||||
self.seek(0)
|
||||
|
||||
@property
|
||||
def size(self):
|
||||
return self.getbuffer().nbytes
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
def discard(self):
|
||||
pass
|
||||
|
||||
def commit(self):
|
||||
self.fs.store[self.path] = self
|
||||
self.modified = datetime.now(tz=timezone.utc)
|
||||
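A short sketch of the global in-memory filesystem above; illustrative only.

import fsspec

fs = fsspec.filesystem("memory")                 # every instance shares one global store
fs.pipe_file("/project/data.bin", b"\x00" * 16)
print(fs.ls("/project", detail=False))           # ['/project/data.bin']
print(fs.cat_file("/project/data.bin"))          # 16 zero bytes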
File diff suppressed because it is too large
@@ -0,0 +1,187 @@
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import types
|
||||
import uuid
|
||||
from stat import S_ISDIR, S_ISLNK
|
||||
|
||||
import paramiko
|
||||
|
||||
from .. import AbstractFileSystem
|
||||
from ..utils import infer_storage_options
|
||||
|
||||
logger = logging.getLogger("fsspec.sftp")
|
||||
|
||||
|
||||
class SFTPFileSystem(AbstractFileSystem):
|
||||
"""Files over SFTP/SSH
|
||||
|
||||
Peer-to-peer filesystem over SSH using paramiko.
|
||||
|
||||
Note: if using this with the ``open`` or ``open_files``, with full URLs,
|
||||
there is no way to tell if a path is relative, so all paths are assumed
|
||||
to be absolute.
|
||||
"""
|
||||
|
||||
protocol = "sftp", "ssh"
|
||||
|
||||
def __init__(self, host, **ssh_kwargs):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
Hostname or IP as a string
|
||||
temppath: str
|
||||
Location on the server to put files, when within a transaction
|
||||
ssh_kwargs: dict
|
||||
Parameters passed on to connection. See details in
|
||||
https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
|
||||
May include port, username, password...
|
||||
"""
|
||||
if self._cached:
|
||||
return
|
||||
super().__init__(**ssh_kwargs)
|
||||
self.temppath = ssh_kwargs.pop("temppath", "/tmp") # remote temp directory
|
||||
self.host = host
|
||||
self.ssh_kwargs = ssh_kwargs
|
||||
self._connect()
|
||||
|
||||
def _connect(self):
|
||||
logger.debug("Connecting to SFTP server %s", self.host)
|
||||
self.client = paramiko.SSHClient()
|
||||
self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
self.client.connect(self.host, **self.ssh_kwargs)
|
||||
self.ftp = self.client.open_sftp()
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return infer_storage_options(path)["path"]
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(urlpath):
|
||||
out = infer_storage_options(urlpath)
|
||||
out.pop("path", None)
|
||||
out.pop("protocol", None)
|
||||
return out
|
||||
|
||||
def mkdir(self, path, create_parents=True, mode=511):
|
||||
path = self._strip_protocol(path)
|
||||
logger.debug("Creating folder %s", path)
|
||||
if self.exists(path):
|
||||
raise FileExistsError(f"File exists: {path}")
|
||||
|
||||
if create_parents:
|
||||
self.makedirs(path)
|
||||
else:
|
||||
self.ftp.mkdir(path, mode)
|
||||
|
||||
def makedirs(self, path, exist_ok=False, mode=511):
|
||||
if self.exists(path) and not exist_ok:
|
||||
raise FileExistsError(f"File exists: {path}")
|
||||
|
||||
parts = path.split("/")
|
||||
new_path = "/" if path[:1] == "/" else ""
|
||||
|
||||
for part in parts:
|
||||
if part:
|
||||
new_path = f"{new_path}/{part}" if new_path else part
|
||||
if not self.exists(new_path):
|
||||
self.ftp.mkdir(new_path, mode)
|
||||
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
logger.debug("Removing folder %s", path)
|
||||
self.ftp.rmdir(path)
|
||||
|
||||
def info(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
stat = self._decode_stat(self.ftp.stat(path))
|
||||
stat["name"] = path
|
||||
return stat
|
||||
|
||||
@staticmethod
|
||||
def _decode_stat(stat, parent_path=None):
|
||||
if S_ISDIR(stat.st_mode):
|
||||
t = "directory"
|
||||
elif S_ISLNK(stat.st_mode):
|
||||
t = "link"
|
||||
else:
|
||||
t = "file"
|
||||
out = {
|
||||
"name": "",
|
||||
"size": stat.st_size,
|
||||
"type": t,
|
||||
"uid": stat.st_uid,
|
||||
"gid": stat.st_gid,
|
||||
"time": datetime.datetime.fromtimestamp(
|
||||
stat.st_atime, tz=datetime.timezone.utc
|
||||
),
|
||||
"mtime": datetime.datetime.fromtimestamp(
|
||||
stat.st_mtime, tz=datetime.timezone.utc
|
||||
),
|
||||
}
|
||||
if parent_path:
|
||||
out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
|
||||
return out
|
||||
|
||||
def ls(self, path, detail=False):
|
||||
path = self._strip_protocol(path)
|
||||
logger.debug("Listing folder %s", path)
|
||||
stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
|
||||
if detail:
|
||||
return stats
|
||||
else:
|
||||
paths = [stat["name"] for stat in stats]
|
||||
return sorted(paths)
|
||||
|
||||
def put(self, lpath, rpath, callback=None, **kwargs):
|
||||
rpath = self._strip_protocol(rpath)
|
||||
logger.debug("Put file %s into %s", lpath, rpath)
|
||||
self.ftp.put(lpath, rpath)
|
||||
|
||||
def get_file(self, rpath, lpath, **kwargs):
|
||||
if self.isdir(rpath):
|
||||
os.makedirs(lpath, exist_ok=True)
|
||||
else:
|
||||
self.ftp.get(self._strip_protocol(rpath), lpath)
|
||||
|
||||
def _open(self, path, mode="rb", block_size=None, **kwargs):
|
||||
"""
|
||||
block_size: int or None
|
||||
If 0, no buffering, if 1, line buffering, if >1, buffer that many
|
||||
bytes, if None use default from paramiko.
|
||||
"""
|
||||
logger.debug("Opening file %s", path)
|
||||
if kwargs.get("autocommit", True) is False:
|
||||
# writes to temporary file, move on commit
|
||||
path2 = "/".join([self.temppath, str(uuid.uuid4())])
|
||||
f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
|
||||
f.temppath = path2
|
||||
f.targetpath = path
|
||||
f.fs = self
|
||||
f.commit = types.MethodType(commit_a_file, f)
|
||||
f.discard = types.MethodType(discard_a_file, f)
|
||||
else:
|
||||
f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
|
||||
return f
|
||||
|
||||
def _rm(self, path):
|
||||
if self.isdir(path):
|
||||
self.ftp.rmdir(path)
|
||||
else:
|
||||
self.ftp.remove(path)
|
||||
|
||||
def mv(self, old, new):
|
||||
new = self._strip_protocol(new)
|
||||
old = self._strip_protocol(old)
|
||||
logger.debug("Renaming %s into %s", old, new)
|
||||
self.ftp.posix_rename(old, new)
|
||||
|
||||
|
||||
def commit_a_file(self):
|
||||
self.fs.mv(self.temppath, self.targetpath)
|
||||
|
||||
|
||||
def discard_a_file(self):
|
||||
self.fs._rm(self.temppath)
|
||||
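A usage sketch for the SFTPFileSystem defined above; the host, credentials and remote paths are placeholders, and the keyword arguments are simply forwarded to paramiko's SSHClient.connect as the docstring describes.

import fsspec

# Placeholder connection details; any SSHClient.connect kwargs may be passed.
fs = fsspec.filesystem(
    "sftp", host="sftp.example.com", username="demo", password="secret"
)
print(fs.ls("/upload"))                        # list a remote directory
fs.put("local.csv", "/upload/local.csv")       # upload a local file
with fs.open("/upload/local.csv", "rb") as f:  # stream it back
    head = f.read(1024)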
@@ -0,0 +1,416 @@
|
||||
"""
|
||||
This module contains the SMBFileSystem class, responsible for handling access to
|
||||
Windows and Samba network shares using the smbprotocol package
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import re
|
||||
import uuid
|
||||
from stat import S_ISDIR, S_ISLNK
|
||||
|
||||
import smbclient
|
||||
import smbprotocol.exceptions
|
||||
|
||||
from .. import AbstractFileSystem
|
||||
from ..utils import infer_storage_options
|
||||
|
||||
# ! pylint: disable=bad-continuation
|
||||
|
||||
|
||||
class SMBFileSystem(AbstractFileSystem):
|
||||
"""Allow reading and writing to Windows and Samba network shares.
|
||||
|
||||
When using `fsspec.open()` for getting a file-like object the URI
|
||||
should be specified as this format:
|
||||
``smb://workgroup;user:password@server:port/share/folder/file.csv``.
|
||||
|
||||
Example::
|
||||
|
||||
>>> import fsspec
|
||||
>>> with fsspec.open(
|
||||
... 'smb://myuser:mypassword@myserver.com/' 'share/folder/file.csv'
|
||||
... ) as smbfile:
|
||||
... df = pd.read_csv(smbfile, sep='|', header=None)
|
||||
|
||||
Note that you need to pass in a valid hostname or IP address for the host
|
||||
component of the URL. Do not use the Windows/NetBIOS machine name for the
|
||||
host component.
|
||||
|
||||
The first component of the path in the URL points to the name of the shared
|
||||
folder. Subsequent path components will point to the directory/folder/file.
|
||||
|
||||
The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be
|
||||
optional.
|
||||
|
||||
.. note::
|
||||
|
||||
This backend requires `smbprotocol`_ to be installed, e.g.::
|
||||
|
||||
$ pip install smbprotocol
|
||||
# or
|
||||
# pip install smbprotocol[kerberos]
|
||||
|
||||
.. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements
|
||||
|
||||
Note: if using this with the ``open`` or ``open_files``, with full URLs,
|
||||
there is no way to tell if a path is relative, so all paths are assumed
|
||||
to be absolute.
|
||||
"""
|
||||
|
||||
protocol = "smb"
|
||||
|
||||
# pylint: disable=too-many-arguments
|
||||
def __init__(
|
||||
self,
|
||||
host,
|
||||
port=None,
|
||||
username=None,
|
||||
password=None,
|
||||
timeout=60,
|
||||
encrypt=None,
|
||||
share_access=None,
|
||||
register_session_retries=4,
|
||||
register_session_retry_wait=1,
|
||||
register_session_retry_factor=10,
|
||||
auto_mkdir=False,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
You can use _get_kwargs_from_urls to get some kwargs from
|
||||
a reasonable SMB url.
|
||||
|
||||
Authentication will be anonymous or integrated if username/password are not
|
||||
given.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
The remote server name/ip to connect to
|
||||
port: int or None
|
||||
Port to connect with. Usually 445, sometimes 139.
|
||||
username: str or None
|
||||
Username to connect with. Required if Kerberos auth is not being used.
|
||||
password: str or None
|
||||
User's password on the server, if using username
|
||||
timeout: int
|
||||
Connection timeout in seconds
|
||||
encrypt: bool
|
||||
Whether to force encryption or not, once this has been set to True
|
||||
the session cannot be changed back to False.
|
||||
share_access: str or None
|
||||
Specifies the default access applied to file open operations
|
||||
performed with this file system object.
|
||||
This affects whether other processes can concurrently open a handle
|
||||
to the same file.
|
||||
|
||||
- None (the default): exclusively locks the file until closed.
|
||||
- 'r': Allow other handles to be opened with read access.
|
||||
- 'w': Allow other handles to be opened with write access.
|
||||
- 'd': Allow other handles to be opened with delete access.
|
||||
register_session_retries: int
|
||||
Number of retries to register a session with the server. Retries are not performed
|
||||
for authentication errors, as they are considered invalid credentials rather than network
|
||||
issues. If set to a negative value, no registration attempts will be performed.
|
||||
register_session_retry_wait: int
|
||||
Time in seconds to wait between each retry. Number must be non-negative.
|
||||
register_session_retry_factor: int
|
||||
Base factor for the wait time between each retry. The wait time
|
||||
is calculated using exponential function. For factor=1 all wait times
|
||||
will be equal to `register_session_retry_wait`. For any number of retries,
|
||||
the last wait time will be equal to `register_session_retry_wait` and for retries>1
|
||||
the first wait time will be equal to `register_session_retry_wait / factor`.
|
||||
Number must be equal to or greater than 1. Optimal factor is 10.
|
||||
auto_mkdir: bool
|
||||
Whether, when opening a file, the directory containing it should
|
||||
be created (if it doesn't already exist). This is assumed by pyarrow
|
||||
and zarr-python code.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.timeout = timeout
|
||||
self.encrypt = encrypt
|
||||
self.temppath = kwargs.pop("temppath", "")
|
||||
self.share_access = share_access
|
||||
self.register_session_retries = register_session_retries
|
||||
if register_session_retry_wait < 0:
|
||||
raise ValueError(
|
||||
"register_session_retry_wait must be a non-negative integer"
|
||||
)
|
||||
self.register_session_retry_wait = register_session_retry_wait
|
||||
if register_session_retry_factor < 1:
|
||||
raise ValueError(
|
||||
"register_session_retry_factor must be a positive "
|
||||
"integer equal to or greater than 1"
|
||||
)
|
||||
self.register_session_retry_factor = register_session_retry_factor
|
||||
self.auto_mkdir = auto_mkdir
|
||||
self._connect()
|
||||
|
||||
@property
|
||||
def _port(self):
|
||||
return 445 if self.port is None else self.port
|
||||
|
||||
def _connect(self):
|
||||
import time
|
||||
|
||||
if self.register_session_retries <= -1:
|
||||
return
|
||||
|
||||
retried_errors = []
|
||||
|
||||
wait_time = self.register_session_retry_wait
|
||||
n_waits = (
|
||||
self.register_session_retries - 1
|
||||
) # -1 = No wait time after the last retry
|
||||
factor = self.register_session_retry_factor
|
||||
|
||||
# Generate wait times for each retry attempt.
|
||||
# Wait times are calculated using an exponential function. For factor=1 all wait times
|
||||
# will be equal to `wait`. For any number of retries the last wait time will be
|
||||
# equal to `wait` and for retries>1 the first wait time will be equal to `wait / factor`.
|
||||
wait_times = iter(
|
||||
factor ** (n / n_waits - 1) * wait_time for n in range(0, n_waits + 1)
|
||||
)
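# For example, with the defaults above (retries=4, wait=1, factor=10) this
# yields wait times of roughly 0.1, 0.22, 0.46 and 1.0 seconds between attempts.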
|
||||
|
||||
for attempt in range(self.register_session_retries + 1):
|
||||
try:
|
||||
smbclient.register_session(
|
||||
self.host,
|
||||
username=self.username,
|
||||
password=self.password,
|
||||
port=self._port,
|
||||
encrypt=self.encrypt,
|
||||
connection_timeout=self.timeout,
|
||||
)
|
||||
return
|
||||
except (
|
||||
smbprotocol.exceptions.SMBAuthenticationError,
|
||||
smbprotocol.exceptions.LogonFailure,
|
||||
):
|
||||
# These exceptions should not be repeated, as they clearly indicate
|
||||
# that the credentials are invalid and not a network issue.
|
||||
raise
|
||||
except ValueError as exc:
|
||||
if re.findall(r"\[Errno -\d+]", str(exc)):
|
||||
# This exception is raised by the smbprotocol.transport:Tcp.connect
|
||||
# and originates from socket.gaierror (OSError). These exceptions might
|
||||
# be raised due to network instability. We will retry to connect.
|
||||
retried_errors.append(exc)
|
||||
else:
|
||||
# All other ValueError exceptions should be raised, as they are not
|
||||
# related to network issues.
|
||||
raise
|
||||
except Exception as exc:
|
||||
# Save the exception and retry the connection. This except clause might be removed
|
||||
# in the future, once all exceptions suited for retry are identified.
|
||||
retried_errors.append(exc)
|
||||
|
||||
if attempt < self.register_session_retries:
|
||||
time.sleep(next(wait_times))
|
||||
|
||||
# Raise last exception to inform user about the connection issues.
|
||||
# Note: Should we use ExceptionGroup to raise all exceptions?
|
||||
raise retried_errors[-1]
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return infer_storage_options(path)["path"]
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
# smb://workgroup;user:password@host:port/share/folder/file.csv
|
||||
out = infer_storage_options(path)
|
||||
out.pop("path", None)
|
||||
out.pop("protocol", None)
|
||||
return out
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
if create_parents:
|
||||
smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
|
||||
else:
|
||||
smbclient.mkdir(wpath, port=self._port, **kwargs)
|
||||
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
if _share_has_path(path):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port)
|
||||
|
||||
def rmdir(self, path):
|
||||
if _share_has_path(path):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
smbclient.rmdir(wpath, port=self._port)
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
stats = smbclient.stat(wpath, port=self._port, **kwargs)
|
||||
if S_ISDIR(stats.st_mode):
|
||||
stype = "directory"
|
||||
elif S_ISLNK(stats.st_mode):
|
||||
stype = "link"
|
||||
else:
|
||||
stype = "file"
|
||||
res = {
|
||||
"name": path + "/" if stype == "directory" else path,
|
||||
"size": stats.st_size,
|
||||
"type": stype,
|
||||
"uid": stats.st_uid,
|
||||
"gid": stats.st_gid,
|
||||
"time": stats.st_atime,
|
||||
"mtime": stats.st_mtime,
|
||||
}
|
||||
return res
|
||||
|
||||
def created(self, path):
|
||||
"""Return the created timestamp of a file as a datetime.datetime"""
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
stats = smbclient.stat(wpath, port=self._port)
|
||||
return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc)
|
||||
|
||||
def modified(self, path):
|
||||
"""Return the modified timestamp of a file as a datetime.datetime"""
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
stats = smbclient.stat(wpath, port=self._port)
|
||||
return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
unc = _as_unc_path(self.host, path)
|
||||
listed = smbclient.listdir(unc, port=self._port, **kwargs)
|
||||
dirs = ["/".join([path.rstrip("/"), p]) for p in listed]
|
||||
if detail:
|
||||
dirs = [self.info(d) for d in dirs]
|
||||
return dirs
|
||||
|
||||
# pylint: disable=too-many-arguments
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=-1,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
block_size: int or None
|
||||
If 0, no buffering, 1, line buffering, >1, buffer that many bytes
|
||||
|
||||
Notes
|
||||
-----
|
||||
By specifying 'share_access' in 'kwargs' it is possible to override the
|
||||
default shared access setting applied in the constructor of this object.
|
||||
"""
|
||||
if self.auto_mkdir and "w" in mode:
|
||||
self.makedirs(self._parent(path), exist_ok=True)
|
||||
bls = block_size if block_size is not None and block_size >= 0 else -1
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
share_access = kwargs.pop("share_access", self.share_access)
|
||||
if "w" in mode and autocommit is False:
|
||||
temp = _as_temp_path(self.host, path, self.temppath)
|
||||
return SMBFileOpener(
|
||||
wpath, temp, mode, port=self._port, block_size=bls, **kwargs
|
||||
)
|
||||
return smbclient.open_file(
|
||||
wpath,
|
||||
mode,
|
||||
buffering=bls,
|
||||
share_access=share_access,
|
||||
port=self._port,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def copy(self, path1, path2, **kwargs):
|
||||
"""Copy within two locations in the same filesystem"""
|
||||
wpath1 = _as_unc_path(self.host, path1)
|
||||
wpath2 = _as_unc_path(self.host, path2)
|
||||
if self.auto_mkdir:
|
||||
self.makedirs(self._parent(path2), exist_ok=True)
|
||||
smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs)
|
||||
|
||||
def _rm(self, path):
|
||||
if _share_has_path(path):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
stats = smbclient.stat(wpath, port=self._port)
|
||||
if S_ISDIR(stats.st_mode):
|
||||
smbclient.rmdir(wpath, port=self._port)
|
||||
else:
|
||||
smbclient.remove(wpath, port=self._port)
|
||||
|
||||
def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
|
||||
wpath1 = _as_unc_path(self.host, path1)
|
||||
wpath2 = _as_unc_path(self.host, path2)
|
||||
smbclient.rename(wpath1, wpath2, port=self._port, **kwargs)
|
||||
|
||||
|
||||
def _as_unc_path(host, path):
|
||||
rpath = path.replace("/", "\\")
|
||||
unc = f"\\\\{host}{rpath}"
|
||||
return unc
|
||||
|
||||
|
||||
def _as_temp_path(host, path, temppath):
|
||||
share = path.split("/")[1]
|
||||
temp_file = f"/{share}{temppath}/{uuid.uuid4()}"
|
||||
unc = _as_unc_path(host, temp_file)
|
||||
return unc
|
||||
|
||||
|
||||
def _share_has_path(path):
|
||||
parts = path.count("/")
|
||||
if path.endswith("/"):
|
||||
return parts > 2
|
||||
return parts > 1
|
||||
|
||||
|
||||
class SMBFileOpener:
|
||||
"""writes to remote temporary file, move on commit"""
|
||||
|
||||
def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs):
|
||||
self.path = path
|
||||
self.temp = temp
|
||||
self.mode = mode
|
||||
self.block_size = block_size
|
||||
self.kwargs = kwargs
|
||||
self.smbfile = None
|
||||
self._incontext = False
|
||||
self.port = port
|
||||
self._open()
|
||||
|
||||
def _open(self):
|
||||
if self.smbfile is None or self.smbfile.closed:
|
||||
self.smbfile = smbclient.open_file(
|
||||
self.temp,
|
||||
self.mode,
|
||||
port=self.port,
|
||||
buffering=self.block_size,
|
||||
**self.kwargs,
|
||||
)
|
||||
|
||||
def commit(self):
|
||||
"""Move temp file to definitive on success."""
|
||||
# TODO: use transaction support in SMB protocol
|
||||
smbclient.replace(self.temp, self.path, port=self.port)
|
||||
|
||||
def discard(self):
|
||||
"""Remove the temp file on failure."""
|
||||
smbclient.remove(self.temp, port=self.port)
|
||||
|
||||
def __fspath__(self):
|
||||
return self.path
|
||||
|
||||
def __iter__(self):
|
||||
return self.smbfile.__iter__()
|
||||
|
||||
def __getattr__(self, item):
|
||||
return getattr(self.smbfile, item)
|
||||
|
||||
def __enter__(self):
|
||||
self._incontext = True
|
||||
return self.smbfile.__enter__()
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self._incontext = False
|
||||
self.smbfile.__exit__(exc_type, exc_value, traceback)
|
||||
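A usage sketch for the SMBFileSystem defined above; the server, share and credentials are placeholders. As the class docstring notes, the first path component must be the share name.

import fsspec

# Placeholder server and credentials; auto_mkdir creates parent folders on write.
fs = fsspec.filesystem(
    "smb", host="fileserver.example.com", username="demo",
    password="secret", auto_mkdir=True,
)
print(fs.ls("/share/folder"))                        # "/<share>/<folder>"
with fs.open("/share/folder/report.csv", "wb") as f:
    f.write(b"col1,col2\n1,2\n")
print(fs.info("/share/folder/report.csv")["size"])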
@@ -0,0 +1,124 @@
|
||||
import logging
|
||||
import tarfile
|
||||
|
||||
import fsspec
|
||||
from fsspec.archive import AbstractArchiveFileSystem
|
||||
from fsspec.compression import compr
|
||||
from fsspec.utils import infer_compression
|
||||
|
||||
typemap = {b"0": "file", b"5": "directory"}
|
||||
|
||||
logger = logging.getLogger("tar")
|
||||
|
||||
|
||||
class TarFileSystem(AbstractArchiveFileSystem):
|
||||
"""Compressed Tar archives as a file-system (read-only)
|
||||
|
||||
Supports the following formats:
|
||||
tar.gz, tar.bz2, tar.xz
|
||||
"""
|
||||
|
||||
root_marker = ""
|
||||
protocol = "tar"
|
||||
cachable = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fo="",
|
||||
index_store=None,
|
||||
target_options=None,
|
||||
target_protocol=None,
|
||||
compression=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
target_options = target_options or {}
|
||||
|
||||
if isinstance(fo, str):
|
||||
self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
|
||||
fo = self.of.open() # keep the reference
|
||||
|
||||
# Try to infer compression.
|
||||
if compression is None:
|
||||
name = None
|
||||
|
||||
# Try different ways to get hold of the filename. `fo` might either
|
||||
# be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
|
||||
# `fsspec.AbstractFileSystem` instance.
|
||||
try:
|
||||
# Amended io.BufferedReader or similar.
|
||||
# This uses a "protocol extension" where original filenames are
|
||||
# propagated to archive-like filesystems in order to let them
|
||||
# infer the right compression appropriately.
|
||||
if hasattr(fo, "original"):
|
||||
name = fo.original
|
||||
|
||||
# fsspec.LocalFileOpener
|
||||
elif hasattr(fo, "path"):
|
||||
name = fo.path
|
||||
|
||||
# io.BufferedReader
|
||||
elif hasattr(fo, "name"):
|
||||
name = fo.name
|
||||
|
||||
# fsspec.AbstractFileSystem
|
||||
elif hasattr(fo, "info"):
|
||||
name = fo.info()["name"]
|
||||
|
||||
except Exception as ex:
|
||||
logger.warning(
|
||||
f"Unable to determine file name, not inferring compression: {ex}"
|
||||
)
|
||||
|
||||
if name is not None:
|
||||
compression = infer_compression(name)
|
||||
logger.info(f"Inferred compression {compression} from file name {name}")
|
||||
|
||||
if compression is not None:
|
||||
# TODO: tarfile already implements compression with modes like "'r:gz'",
|
||||
# but would seeking to an offset within the file still work then?
|
||||
fo = compr[compression](fo)
|
||||
|
||||
self._fo_ref = fo
|
||||
self.fo = fo # the whole instance is a context
|
||||
self.tar = tarfile.TarFile(fileobj=self.fo)
|
||||
self.dir_cache = None
|
||||
|
||||
self.index_store = index_store
|
||||
self.index = None
|
||||
self._index()
|
||||
|
||||
def _index(self):
|
||||
# TODO: load and set saved index, if exists
|
||||
out = {}
|
||||
for ti in self.tar:
|
||||
info = ti.get_info()
|
||||
info["type"] = typemap.get(info["type"], "file")
|
||||
name = ti.get_info()["name"].rstrip("/")
|
||||
out[name] = (info, ti.offset_data)
|
||||
|
||||
self.index = out
|
||||
# TODO: save index to self.index_store here, if set
|
||||
|
||||
def _get_dirs(self):
|
||||
if self.dir_cache is not None:
|
||||
return
|
||||
|
||||
# This enables ls to get directories as children as well as files
|
||||
self.dir_cache = {
|
||||
dirname: {"name": dirname, "size": 0, "type": "directory"}
|
||||
for dirname in self._all_dirnames(self.tar.getnames())
|
||||
}
|
||||
for member in self.tar.getmembers():
|
||||
info = member.get_info()
|
||||
info["name"] = info["name"].rstrip("/")
|
||||
info["type"] = typemap.get(info["type"], "file")
|
||||
self.dir_cache[info["name"]] = info
|
||||
|
||||
def _open(self, path, mode="rb", **kwargs):
|
||||
if mode != "rb":
|
||||
raise ValueError("Read-only filesystem implementation")
|
||||
details, offset = self.index[path]
|
||||
if details["type"] != "file":
|
||||
raise ValueError("Can only handle regular files")
|
||||
return self.tar.extractfile(path)
|
||||
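A read-only usage sketch for the TarFileSystem defined above; the archive name and member path are placeholders, and compression is inferred from the file name by the constructor logic shown.

import fsspec

# Placeholder archive; the .tar.gz suffix triggers compression inference.
fs = fsspec.filesystem("tar", fo="data/archive.tar.gz")
print(fs.ls("/"))                      # top-level members and directories
with fs.open("inner/file.txt") as f:   # read-only; "rb" is the only mode
    payload = f.read()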
@@ -0,0 +1,485 @@
|
||||
# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
|
||||
|
||||
import logging
|
||||
import os
|
||||
import secrets
|
||||
import shutil
|
||||
import tempfile
|
||||
import uuid
|
||||
from contextlib import suppress
|
||||
from urllib.parse import quote
|
||||
|
||||
import requests
|
||||
|
||||
from ..spec import AbstractBufferedFile, AbstractFileSystem
|
||||
from ..utils import infer_storage_options, tokenize
|
||||
|
||||
logger = logging.getLogger("webhdfs")
|
||||
|
||||
|
||||
class WebHDFS(AbstractFileSystem):
|
||||
"""
|
||||
Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways.
|
||||
|
||||
Four auth mechanisms are supported:
|
||||
|
||||
insecure: no auth is done, and the user is assumed to be whoever they
|
||||
say they are (parameter ``user``), or a predefined value such as
|
||||
"dr.who" if not given
|
||||
spnego: when kerberos authentication is enabled, auth is negotiated by
|
||||
requests_kerberos https://github.com/requests/requests-kerberos .
|
||||
This establishes a session based on existing kinit login and/or
|
||||
specified principal/password; parameters are passed with ``kerb_kwargs``
|
||||
token: uses an existing Hadoop delegation token from another secured
|
||||
service. Indeed, this client can also generate such tokens when
|
||||
not insecure. Note that tokens expire, but can be renewed (by a
|
||||
previously specified user) and may allow for proxying.
|
||||
basic-auth: used when both parameter ``user`` and parameter ``password``
|
||||
are provided.
|
||||
|
||||
"""
|
||||
|
||||
tempdir = str(tempfile.gettempdir())
|
||||
protocol = "webhdfs", "webHDFS"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host,
|
||||
port=50070,
|
||||
kerberos=False,
|
||||
token=None,
|
||||
user=None,
|
||||
password=None,
|
||||
proxy_to=None,
|
||||
kerb_kwargs=None,
|
||||
data_proxy=None,
|
||||
use_https=False,
|
||||
session_cert=None,
|
||||
session_verify=True,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
Name-node address
|
||||
port: int
|
||||
Port for webHDFS
|
||||
kerberos: bool
|
||||
Whether to authenticate with kerberos for this connection
|
||||
token: str or None
|
||||
If given, use this token on every call to authenticate. A user
|
||||
and user-proxy may be encoded in the token and should not be also
|
||||
given
|
||||
user: str or None
|
||||
If given, assert the user name to connect with
|
||||
password: str or None
|
||||
If given, assert the password to use for basic auth. If password
|
||||
is provided, user must be provided also
|
||||
proxy_to: str or None
|
||||
If given, the user has the authority to proxy, and this value is
|
||||
the user in whose name actions are taken
|
||||
kerb_kwargs: dict
|
||||
Any extra arguments for HTTPKerberosAuth, see
|
||||
`<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
|
||||
data_proxy: dict, callable or None
|
||||
If given, map data-node addresses. This can be necessary if the
|
||||
HDFS cluster is behind a proxy, running on Docker or otherwise has
|
||||
a mismatch between the host-names given by the name-node and the
|
||||
address by which to refer to them from the client. If a dict,
|
||||
maps host names ``host->data_proxy[host]``; if a callable, full
|
||||
URLs are passed, and function must conform to
|
||||
``url->data_proxy(url)``.
|
||||
use_https: bool
|
||||
Whether to connect to the Name-node using HTTPS instead of HTTP
|
||||
session_cert: str or Tuple[str, str] or None
|
||||
Path to a certificate file, or tuple of (cert, key) files to use
|
||||
for the requests.Session
|
||||
session_verify: str, bool or None
|
||||
Path to a certificate file to use for verifying the requests.Session.
|
||||
kwargs
|
||||
"""
|
||||
if self._cached:
|
||||
return
|
||||
super().__init__(**kwargs)
|
||||
self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
|
||||
self.kerb = kerberos
|
||||
self.kerb_kwargs = kerb_kwargs or {}
|
||||
self.pars = {}
|
||||
self.proxy = data_proxy or {}
|
||||
if token is not None:
|
||||
if user is not None or proxy_to is not None:
|
||||
raise ValueError(
|
||||
"If passing a delegation token, must not set "
|
||||
"user or proxy_to, as these are encoded in the"
|
||||
" token"
|
||||
)
|
||||
self.pars["delegation"] = token
|
||||
self.user = user
|
||||
self.password = password
|
||||
|
||||
if password is not None:
|
||||
if user is None:
|
||||
raise ValueError(
|
||||
"If passing a password, the user must also be"
|
||||
"set in order to set up the basic-auth"
|
||||
)
|
||||
else:
|
||||
if user is not None:
|
||||
self.pars["user.name"] = user
|
||||
|
||||
if proxy_to is not None:
|
||||
self.pars["doas"] = proxy_to
|
||||
if kerberos and user is not None:
|
||||
raise ValueError(
|
||||
"If using Kerberos auth, do not specify the "
|
||||
"user, this is handled by kinit."
|
||||
)
|
||||
|
||||
self.session_cert = session_cert
|
||||
self.session_verify = session_verify
|
||||
|
||||
self._connect()
|
||||
|
||||
self._fsid = f"webhdfs_{tokenize(host, port)}"
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return self._fsid
|
||||
|
||||
def _connect(self):
|
||||
self.session = requests.Session()
|
||||
|
||||
if self.session_cert:
|
||||
self.session.cert = self.session_cert
|
||||
|
||||
self.session.verify = self.session_verify
|
||||
|
||||
if self.kerb:
|
||||
from requests_kerberos import HTTPKerberosAuth
|
||||
|
||||
self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
|
||||
|
||||
if self.user is not None and self.password is not None:
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
self.session.auth = HTTPBasicAuth(self.user, self.password)
|
||||
|
||||
def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
|
||||
path = self._strip_protocol(path) if path is not None else ""
|
||||
url = self._apply_proxy(self.url + quote(path, safe="/="))
|
||||
args = kwargs.copy()
|
||||
args.update(self.pars)
|
||||
args["op"] = op.upper()
|
||||
logger.debug("sending %s with %s", url, method)
|
||||
out = self.session.request(
|
||||
method=method.upper(),
|
||||
url=url,
|
||||
params=args,
|
||||
data=data,
|
||||
allow_redirects=redirect,
|
||||
)
|
||||
if out.status_code in [400, 401, 403, 404, 500]:
|
||||
try:
|
||||
err = out.json()
|
||||
msg = err["RemoteException"]["message"]
|
||||
exp = err["RemoteException"]["exception"]
|
||||
except (ValueError, KeyError):
|
||||
pass
|
||||
else:
|
||||
if exp in ["IllegalArgumentException", "UnsupportedOperationException"]:
|
||||
raise ValueError(msg)
|
||||
elif exp in ["SecurityException", "AccessControlException"]:
|
||||
raise PermissionError(msg)
|
||||
elif exp in ["FileNotFoundException"]:
|
||||
raise FileNotFoundError(msg)
|
||||
else:
|
||||
raise RuntimeError(msg)
|
||||
out.raise_for_status()
|
||||
return out
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
replication=None,
|
||||
permissions=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
File location
|
||||
mode: str
|
||||
'rb', 'wb', etc.
|
||||
block_size: int
|
||||
Client buffer size for read-ahead or write buffer
|
||||
autocommit: bool
|
||||
If False, writes to temporary file that only gets put in final
|
||||
location upon commit
|
||||
replication: int
|
||||
Number of copies of file on the cluster, write mode only
|
||||
permissions: str or int
|
||||
posix permissions, write mode only
|
||||
kwargs
|
||||
|
||||
Returns
|
||||
-------
|
||||
WebHDFile instance
|
||||
"""
|
||||
block_size = block_size or self.blocksize
|
||||
return WebHDFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
tempdir=self.tempdir,
|
||||
autocommit=autocommit,
|
||||
replication=replication,
|
||||
permissions=permissions,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _process_info(info):
|
||||
info["type"] = info["type"].lower()
|
||||
info["size"] = info["length"]
|
||||
return info
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return infer_storage_options(path)["path"]
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(urlpath):
|
||||
out = infer_storage_options(urlpath)
|
||||
out.pop("path", None)
|
||||
out.pop("protocol", None)
|
||||
if "username" in out:
|
||||
out["user"] = out.pop("username")
|
||||
return out
|
||||
|
||||
def info(self, path):
|
||||
out = self._call("GETFILESTATUS", path=path)
|
||||
info = out.json()["FileStatus"]
|
||||
info["name"] = path
|
||||
return self._process_info(info)
|
||||
|
||||
def ls(self, path, detail=False, **kwargs):
|
||||
out = self._call("LISTSTATUS", path=path)
|
||||
infos = out.json()["FileStatuses"]["FileStatus"]
|
||||
for info in infos:
|
||||
self._process_info(info)
|
||||
info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
|
||||
if detail:
|
||||
return sorted(infos, key=lambda i: i["name"])
|
||||
else:
|
||||
return sorted(info["name"] for info in infos)
|
||||
|
||||
def content_summary(self, path):
|
||||
"""Total numbers of files, directories and bytes under path"""
|
||||
out = self._call("GETCONTENTSUMMARY", path=path)
|
||||
return out.json()["ContentSummary"]
|
||||
|
||||
def ukey(self, path):
|
||||
"""Checksum info of file, giving method and result"""
|
||||
out = self._call("GETFILECHECKSUM", path=path, redirect=False)
|
||||
if "Location" in out.headers:
|
||||
location = self._apply_proxy(out.headers["Location"])
|
||||
out2 = self.session.get(location)
|
||||
out2.raise_for_status()
|
||||
return out2.json()["FileChecksum"]
|
||||
else:
|
||||
out.raise_for_status()
|
||||
return out.json()["FileChecksum"]
|
||||
|
||||
def home_directory(self):
|
||||
"""Get user's home directory"""
|
||||
out = self._call("GETHOMEDIRECTORY")
|
||||
return out.json()["Path"]
|
||||
|
||||
def get_delegation_token(self, renewer=None):
|
||||
"""Retrieve token which can give the same authority to other uses
|
||||
|
||||
Parameters
|
||||
----------
|
||||
renewer: str or None
|
||||
User who may use this token; if None, will be current user
|
||||
"""
|
||||
if renewer:
|
||||
out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
|
||||
else:
|
||||
out = self._call("GETDELEGATIONTOKEN")
|
||||
t = out.json()["Token"]
|
||||
if t is None:
|
||||
raise ValueError("No token available for this user/security context")
|
||||
return t["urlString"]
|
||||
|
||||
def renew_delegation_token(self, token):
|
||||
"""Make token live longer. Returns new expiry time"""
|
||||
out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
|
||||
return out.json()["long"]
|
||||
|
||||
def cancel_delegation_token(self, token):
|
||||
"""Stop the token from being useful"""
|
||||
self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
|
||||
|
||||
def chmod(self, path, mod):
|
||||
"""Set the permission at path
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
location to set (file or directory)
|
||||
mod: str or int
|
||||
posix representation of permission, given as an octal string, e.g. '777'
|
||||
or 0o777
|
||||
"""
|
||||
self._call("SETPERMISSION", method="put", path=path, permission=mod)
|
||||
|
||||
def chown(self, path, owner=None, group=None):
|
||||
"""Change owning user and/or group"""
|
||||
kwargs = {}
|
||||
if owner is not None:
|
||||
kwargs["owner"] = owner
|
||||
if group is not None:
|
||||
kwargs["group"] = group
|
||||
self._call("SETOWNER", method="put", path=path, **kwargs)
|
||||
|
||||
def set_replication(self, path, replication):
|
||||
"""
|
||||
Set file replication factor
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
File location (not for directories)
|
||||
replication: int
|
||||
Number of copies of file on the cluster. Should be smaller than
|
||||
number of data nodes; normally 3 on most systems.
|
||||
"""
|
||||
self._call("SETREPLICATION", path=path, method="put", replication=replication)
|
||||
|
||||
def mkdir(self, path, **kwargs):
|
||||
self._call("MKDIRS", method="put", path=path)
|
||||
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
if exist_ok is False and self.exists(path):
|
||||
raise FileExistsError(path)
|
||||
self.mkdir(path)
|
||||
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
self._call("RENAME", method="put", path=path1, destination=path2)
|
||||
|
||||
def rm(self, path, recursive=False, **kwargs):
|
||||
self._call(
|
||||
"DELETE",
|
||||
method="delete",
|
||||
path=path,
|
||||
recursive="true" if recursive else "false",
|
||||
)
|
||||
|
||||
def rm_file(self, path, **kwargs):
|
||||
self.rm(path)
|
||||
|
||||
def cp_file(self, lpath, rpath, **kwargs):
|
||||
with self.open(lpath) as lstream:
|
||||
tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
|
||||
# Perform an atomic copy (stream to a temporary file and
|
||||
# move it to the actual destination).
|
||||
try:
|
||||
with self.open(tmp_fname, "wb") as rstream:
|
||||
shutil.copyfileobj(lstream, rstream)
|
||||
self.mv(tmp_fname, rpath)
|
||||
except BaseException:
|
||||
with suppress(FileNotFoundError):
|
||||
self.rm(tmp_fname)
|
||||
raise
|
||||
|
||||
def _apply_proxy(self, location):
|
||||
if self.proxy and callable(self.proxy):
|
||||
location = self.proxy(location)
|
||||
elif self.proxy:
|
||||
# as a dict
|
||||
for k, v in self.proxy.items():
|
||||
location = location.replace(k, v, 1)
|
||||
return location
|
||||
|
||||
|
||||
class WebHDFile(AbstractBufferedFile):
|
||||
"""A file living in HDFS over webHDFS"""
|
||||
|
||||
def __init__(self, fs, path, **kwargs):
|
||||
super().__init__(fs, path, **kwargs)
|
||||
kwargs = kwargs.copy()
|
||||
if kwargs.get("permissions", None) is None:
|
||||
kwargs.pop("permissions", None)
|
||||
if kwargs.get("replication", None) is None:
|
||||
kwargs.pop("replication", None)
|
||||
self.permissions = kwargs.pop("permissions", 511)
|
||||
tempdir = kwargs.pop("tempdir")
|
||||
if kwargs.pop("autocommit", False) is False:
|
||||
self.target = self.path
|
||||
self.path = os.path.join(tempdir, str(uuid.uuid4()))
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
"""Write one part of a multi-block file upload
|
||||
|
||||
Parameters
|
||||
==========
|
||||
final: bool
|
||||
This is the last block, so it should complete the file, if
|
||||
self.autocommit is True.
|
||||
"""
|
||||
out = self.fs.session.post(
|
||||
self.location,
|
||||
data=self.buffer.getvalue(),
|
||||
headers={"content-type": "application/octet-stream"},
|
||||
)
|
||||
out.raise_for_status()
|
||||
return True
|
||||
|
||||
def _initiate_upload(self):
|
||||
"""Create remote file/upload"""
|
||||
kwargs = self.kwargs.copy()
|
||||
if "a" in self.mode:
|
||||
op, method = "APPEND", "POST"
|
||||
else:
|
||||
op, method = "CREATE", "PUT"
|
||||
kwargs["overwrite"] = "true"
|
||||
out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
|
||||
location = self.fs._apply_proxy(out.headers["Location"])
|
||||
if "w" in self.mode:
|
||||
# create empty file to append to
|
||||
out2 = self.fs.session.put(
|
||||
location, headers={"content-type": "application/octet-stream"}
|
||||
)
|
||||
out2.raise_for_status()
|
||||
# after creating empty file, change location to append to
|
||||
out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
|
||||
self.location = self.fs._apply_proxy(out2.headers["Location"])
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
start = max(start, 0)
|
||||
end = min(self.size, end)
|
||||
if start >= end or start >= self.size:
|
||||
return b""
|
||||
out = self.fs._call(
|
||||
"OPEN", path=self.path, offset=start, length=end - start, redirect=False
|
||||
)
|
||||
out.raise_for_status()
|
||||
if "Location" in out.headers:
|
||||
location = out.headers["Location"]
|
||||
out2 = self.fs.session.get(self.fs._apply_proxy(location))
|
||||
return out2.content
|
||||
else:
|
||||
return out.content
|
||||
|
||||
def commit(self):
|
||||
self.fs.mv(self.path, self.target)
|
||||
|
||||
def discard(self):
|
||||
self.fs.rm(self.path)
|
||||
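A usage sketch for the WebHDFS class defined above; the name-node address, port and HDFS paths are placeholders, and the insecure ``user`` mechanism from the docstring is assumed.

import fsspec

# Placeholder name-node; passing "user" selects the insecure auth mechanism.
fs = fsspec.filesystem("webhdfs", host="namenode.example.com",
                       port=9870, user="hadoop")
print(fs.ls("/user/hadoop"))
print(fs.content_summary("/user/hadoop"))       # file/dir/byte counts
with fs.open("/user/hadoop/new.txt", "wb") as f:
    f.write(b"written via WebHDFS")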
@@ -0,0 +1,177 @@
|
||||
import os
|
||||
import zipfile
|
||||
|
||||
import fsspec
|
||||
from fsspec.archive import AbstractArchiveFileSystem
|
||||
|
||||
|
||||
class ZipFileSystem(AbstractArchiveFileSystem):
|
||||
"""Read/Write contents of ZIP archive as a file-system
|
||||
|
||||
Keeps file object open while instance lives.
|
||||
|
||||
This class is pickleable, but not necessarily thread-safe
|
||||
"""
|
||||
|
||||
root_marker = ""
|
||||
protocol = "zip"
|
||||
cachable = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fo="",
|
||||
mode="r",
|
||||
target_protocol=None,
|
||||
target_options=None,
|
||||
compression=zipfile.ZIP_STORED,
|
||||
allowZip64=True,
|
||||
compresslevel=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
fo: str or file-like
|
||||
Contains ZIP, and must exist. If a str, will fetch file using
|
||||
:meth:`~fsspec.open_files`, which must return one file exactly.
|
||||
mode: str
|
||||
Accept: "r", "w", "a"
|
||||
target_protocol: str (optional)
|
||||
If ``fo`` is a string, this value can be used to override the
|
||||
FS protocol inferred from a URL
|
||||
target_options: dict (optional)
|
||||
Kwargs passed when instantiating the target FS, if ``fo`` is
|
||||
a string.
|
||||
compression, allowZip64, compresslevel: passed to ZipFile
|
||||
Only relevant when creating a ZIP
|
||||
"""
|
||||
super().__init__(self, **kwargs)
|
||||
if mode not in set("rwa"):
|
||||
raise ValueError(f"mode '{mode}' no understood")
|
||||
self.mode = mode
|
||||
if isinstance(fo, (str, os.PathLike)):
|
||||
if mode == "a":
|
||||
m = "r+b"
|
||||
else:
|
||||
m = mode + "b"
|
||||
fo = fsspec.open(
|
||||
fo, mode=m, protocol=target_protocol, **(target_options or {})
|
||||
)
|
||||
self.force_zip_64 = allowZip64
|
||||
self.of = fo
|
||||
self.fo = fo.__enter__() # the whole instance is a context
|
||||
self.zip = zipfile.ZipFile(
|
||||
self.fo,
|
||||
mode=mode,
|
||||
compression=compression,
|
||||
allowZip64=allowZip64,
|
||||
compresslevel=compresslevel,
|
||||
)
|
||||
self.dir_cache = None
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
# zip file paths are always relative to the archive root
|
||||
return super()._strip_protocol(path).lstrip("/")
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, "zip"):
|
||||
self.close()
|
||||
del self.zip
|
||||
|
||||
def close(self):
|
||||
"""Commits any write changes to the file. Done on ``del`` too."""
|
||||
self.zip.close()
|
||||
|
||||
def _get_dirs(self):
|
||||
if self.dir_cache is None or self.mode in set("wa"):
|
||||
# when writing, dir_cache is always in the ZipFile's attributes,
|
||||
# not read from the file.
|
||||
files = self.zip.infolist()
|
||||
self.dir_cache = {
|
||||
dirname.rstrip("/"): {
|
||||
"name": dirname.rstrip("/"),
|
||||
"size": 0,
|
||||
"type": "directory",
|
||||
}
|
||||
for dirname in self._all_dirnames(self.zip.namelist())
|
||||
}
|
||||
for z in files:
|
||||
f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
|
||||
f.update(
|
||||
{
|
||||
"name": z.filename.rstrip("/"),
|
||||
"size": z.file_size,
|
||||
"type": ("directory" if z.is_dir() else "file"),
|
||||
}
|
||||
)
|
||||
self.dir_cache[f["name"]] = f
|
||||
|
||||
def pipe_file(self, path, value, **kwargs):
|
||||
# override upstream, because we know the exact file size in this case
|
||||
self.zip.writestr(path, value, **kwargs)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
path = self._strip_protocol(path)
|
||||
if "r" in mode and self.mode in set("wa"):
|
||||
if self.exists(path):
|
||||
raise OSError("ZipFS can only be open for reading or writing, not both")
|
||||
raise FileNotFoundError(path)
|
||||
if "r" in self.mode and "w" in mode:
|
||||
raise OSError("ZipFS can only be open for reading or writing, not both")
|
||||
out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64)
|
||||
if "r" in mode:
|
||||
info = self.info(path)
|
||||
out.size = info["size"]
|
||||
out.name = info["name"]
|
||||
return out
|
||||
|
||||
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
|
||||
if maxdepth is not None and maxdepth < 1:
|
||||
raise ValueError("maxdepth must be at least 1")
|
||||
|
||||
# Remove the leading slash, as the zip file paths are always
|
||||
# given without a leading slash
|
||||
path = path.lstrip("/")
|
||||
path_parts = list(filter(lambda s: bool(s), path.split("/")))
|
||||
|
||||
def _matching_starts(file_path):
|
||||
file_parts = filter(lambda s: bool(s), file_path.split("/"))
|
||||
return all(a == b for a, b in zip(path_parts, file_parts))
|
||||
|
||||
self._get_dirs()
|
||||
|
||||
result = {}
|
||||
# To match posix find, if an exact file name is given, we should
|
||||
# return only that file
|
||||
if path in self.dir_cache and self.dir_cache[path]["type"] == "file":
|
||||
result[path] = self.dir_cache[path]
|
||||
return result if detail else [path]
|
||||
|
||||
for file_path, file_info in self.dir_cache.items():
|
||||
if not (path == "" or _matching_starts(file_path)):
|
||||
continue
|
||||
|
||||
if file_info["type"] == "directory":
|
||||
if withdirs:
|
||||
if file_path not in result:
|
||||
result[file_path.strip("/")] = file_info
|
||||
continue
|
||||
|
||||
if file_path not in result:
|
||||
result[file_path] = file_info if detail else None
|
||||
|
||||
if maxdepth:
|
||||
path_depth = path.count("/")
|
||||
result = {
|
||||
k: v for k, v in result.items() if k.count("/") - path_depth < maxdepth
|
||||
}
|
||||
return result if detail else sorted(result)
|
||||
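A usage sketch for the ZipFileSystem defined above; the archive path is a placeholder. An archive can be opened either for writing or for reading, not both at once, as enforced in _open.

import fsspec
from fsspec.implementations.zip import ZipFileSystem

# Write a new archive (placeholder path), then reopen it read-only.
zfs = ZipFileSystem("demo.zip", mode="w")
zfs.pipe_file("data/a.txt", b"hello zip")   # exact size is known up front
zfs.close()                                 # commits the central directory

zfs = fsspec.filesystem("zip", fo="demo.zip")
print(zfs.find("/", detail=False))          # ['data/a.txt']
print(zfs.cat("data/a.txt"))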
backend_service/venv/lib/python3.13/site-packages/fsspec/json.py (117 lines, new file)
@@ -0,0 +1,117 @@
|
||||
import json
|
||||
from collections.abc import Mapping, Sequence
|
||||
from contextlib import suppress
|
||||
from pathlib import PurePath
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
ClassVar,
|
||||
Optional,
|
||||
)
|
||||
|
||||
from .registry import _import_class, get_filesystem_class
|
||||
from .spec import AbstractFileSystem
|
||||
|
||||
|
||||
class FilesystemJSONEncoder(json.JSONEncoder):
|
||||
include_password: ClassVar[bool] = True
|
||||
|
||||
def default(self, o: Any) -> Any:
|
||||
if isinstance(o, AbstractFileSystem):
|
||||
return o.to_dict(include_password=self.include_password)
|
||||
if isinstance(o, PurePath):
|
||||
cls = type(o)
|
||||
return {"cls": f"{cls.__module__}.{cls.__name__}", "str": str(o)}
|
||||
|
||||
return super().default(o)
|
||||
|
||||
def make_serializable(self, obj: Any) -> Any:
|
||||
"""
|
||||
Recursively converts an object so that it can be JSON serialized via
|
||||
:func:`json.dumps` and :func:`json.dump`, without actually calling
|
||||
said functions.
|
||||
"""
|
||||
if isinstance(obj, (str, int, float, bool)):
|
||||
return obj
|
||||
if isinstance(obj, Mapping):
|
||||
return {k: self.make_serializable(v) for k, v in obj.items()}
|
||||
if isinstance(obj, Sequence):
|
||||
return [self.make_serializable(v) for v in obj]
|
||||
|
||||
return self.default(obj)
|
||||
|
||||
|
||||
class FilesystemJSONDecoder(json.JSONDecoder):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
object_hook: Optional[Callable[[dict[str, Any]], Any]] = None,
|
||||
parse_float: Optional[Callable[[str], Any]] = None,
|
||||
parse_int: Optional[Callable[[str], Any]] = None,
|
||||
parse_constant: Optional[Callable[[str], Any]] = None,
|
||||
strict: bool = True,
|
||||
object_pairs_hook: Optional[Callable[[list[tuple[str, Any]]], Any]] = None,
|
||||
) -> None:
|
||||
self.original_object_hook = object_hook
|
||||
|
||||
super().__init__(
|
||||
object_hook=self.custom_object_hook,
|
||||
parse_float=parse_float,
|
||||
parse_int=parse_int,
|
||||
parse_constant=parse_constant,
|
||||
strict=strict,
|
||||
object_pairs_hook=object_pairs_hook,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def try_resolve_path_cls(cls, dct: dict[str, Any]):
|
||||
with suppress(Exception):
|
||||
fqp = dct["cls"]
|
||||
|
||||
path_cls = _import_class(fqp)
|
||||
|
||||
if issubclass(path_cls, PurePath):
|
||||
return path_cls
|
||||
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def try_resolve_fs_cls(cls, dct: dict[str, Any]):
|
||||
with suppress(Exception):
|
||||
if "cls" in dct:
|
||||
try:
|
||||
fs_cls = _import_class(dct["cls"])
|
||||
if issubclass(fs_cls, AbstractFileSystem):
|
||||
return fs_cls
|
||||
except Exception:
|
||||
if "protocol" in dct: # Fallback if cls cannot be imported
|
||||
return get_filesystem_class(dct["protocol"])
|
||||
|
||||
raise
|
||||
|
||||
return None
|
||||
|
||||
def custom_object_hook(self, dct: dict[str, Any]):
|
||||
if "cls" in dct:
|
||||
if (obj_cls := self.try_resolve_fs_cls(dct)) is not None:
|
||||
return AbstractFileSystem.from_dict(dct)
|
||||
if (obj_cls := self.try_resolve_path_cls(dct)) is not None:
|
||||
return obj_cls(dct["str"])
|
||||
|
||||
if self.original_object_hook is not None:
|
||||
return self.original_object_hook(dct)
|
||||
|
||||
return dct
|
||||
|
||||
def unmake_serializable(self, obj: Any) -> Any:
|
||||
"""
|
||||
Inverse function of :meth:`FilesystemJSONEncoder.make_serializable`.
|
||||
"""
|
||||
if isinstance(obj, dict):
|
||||
obj = self.custom_object_hook(obj)
|
||||
if isinstance(obj, dict):
|
||||
return {k: self.unmake_serializable(v) for k, v in obj.items()}
|
||||
if isinstance(obj, (list, tuple)):
|
||||
return [self.unmake_serializable(v) for v in obj]
|
||||
|
||||
return obj
|
||||
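A round-trip sketch for the JSON helpers defined above; it assumes an AbstractFileSystem instance (here the built-in memory filesystem) whose to_dict/from_dict methods back the encoder default and the decoder object hook.

import json
import fsspec
from fsspec.json import FilesystemJSONDecoder, FilesystemJSONEncoder

fs = fsspec.filesystem("memory")
payload = json.dumps({"fs": fs, "root": "/data"}, cls=FilesystemJSONEncoder)
restored = json.loads(payload, cls=FilesystemJSONDecoder)
print(type(restored["fs"]))   # the filesystem class, rebuilt via from_dict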
@@ -0,0 +1,251 @@
|
||||
import array
|
||||
import logging
|
||||
import posixpath
|
||||
import warnings
|
||||
from collections.abc import MutableMapping
|
||||
from functools import cached_property
|
||||
|
||||
from fsspec.core import url_to_fs
|
||||
|
||||
logger = logging.getLogger("fsspec.mapping")
|
||||
|
||||
|
||||
class FSMap(MutableMapping):
|
||||
"""Wrap a FileSystem instance as a mutable wrapping.
|
||||
|
||||
The keys of the mapping become files under the given root, and the
|
||||
values (which must be bytes) the contents of those files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
root: string
|
||||
prefix for all the files
|
||||
fs: FileSystem instance
|
||||
check: bool (=True)
|
||||
performs a touch at the location, to check for write access.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> fs = FileSystem(**parameters) # doctest: +SKIP
|
||||
>>> d = FSMap('my-data/path/', fs) # doctest: +SKIP
|
||||
or, more likely
|
||||
>>> d = fs.get_mapper('my-data/path/')
|
||||
|
||||
>>> d['loc1'] = b'Hello World' # doctest: +SKIP
|
||||
>>> list(d.keys()) # doctest: +SKIP
|
||||
['loc1']
|
||||
>>> d['loc1'] # doctest: +SKIP
|
||||
b'Hello World'
|
||||
"""
|
||||
|
||||
def __init__(self, root, fs, check=False, create=False, missing_exceptions=None):
|
||||
self.fs = fs
|
||||
self.root = fs._strip_protocol(root)
|
||||
self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1]
|
||||
if missing_exceptions is None:
|
||||
missing_exceptions = (
|
||||
FileNotFoundError,
|
||||
IsADirectoryError,
|
||||
NotADirectoryError,
|
||||
)
|
||||
self.missing_exceptions = missing_exceptions
|
||||
self.check = check
|
||||
self.create = create
|
||||
if create:
|
||||
if not self.fs.exists(root):
|
||||
self.fs.mkdir(root)
|
||||
if check:
|
||||
if not self.fs.exists(root):
|
||||
raise ValueError(
|
||||
f"Path {root} does not exist. Create "
|
||||
f" with the ``create=True`` keyword"
|
||||
)
|
||||
self.fs.touch(root + "/a")
|
||||
self.fs.rm(root + "/a")
|
||||
|
||||
@cached_property
|
||||
def dirfs(self):
|
||||
"""dirfs instance that can be used with the same keys as the mapper"""
|
||||
from .implementations.dirfs import DirFileSystem
|
||||
|
||||
return DirFileSystem(path=self._root_key_to_str, fs=self.fs)
|
||||
|
||||
def clear(self):
|
||||
"""Remove all keys below root - empties out mapping"""
|
||||
logger.info("Clear mapping at %s", self.root)
|
||||
try:
|
||||
self.fs.rm(self.root, True)
|
||||
self.fs.mkdir(self.root)
|
||||
except: # noqa: E722
|
||||
pass
|
||||
|
||||
def getitems(self, keys, on_error="raise"):
|
||||
"""Fetch multiple items from the store
|
||||
|
||||
If the backend is async-able, this might proceed concurrently
|
||||
|
||||
Parameters
|
||||
----------
|
||||
keys: list(str)
|
||||
The keys to be fetched
|
||||
on_error : "raise", "omit", "return"
|
||||
If raise, an underlying exception will be raised (converted to KeyError
|
||||
if the type is in self.missing_exceptions); if omit, keys with exception
|
||||
will simply not be included in the output; if "return", all keys are
|
||||
included in the output, but the value will be bytes or an exception
|
||||
instance.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict(key, bytes|exception)
|
||||
"""
|
||||
keys2 = [self._key_to_str(k) for k in keys]
|
||||
oe = on_error if on_error == "raise" else "return"
|
||||
try:
|
||||
out = self.fs.cat(keys2, on_error=oe)
|
||||
if isinstance(out, bytes):
|
||||
out = {keys2[0]: out}
|
||||
except self.missing_exceptions as e:
|
||||
raise KeyError from e
|
||||
out = {
|
||||
k: (KeyError() if isinstance(v, self.missing_exceptions) else v)
|
||||
for k, v in out.items()
|
||||
}
|
||||
return {
|
||||
key: out[k2] if on_error == "raise" else out.get(k2, KeyError(k2))
|
||||
for key, k2 in zip(keys, keys2)
|
||||
if on_error == "return" or not isinstance(out[k2], BaseException)
|
||||
}
|
||||
|
||||
def setitems(self, values_dict):
|
||||
"""Set the values of multiple items in the store
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values_dict: dict(str, bytes)
|
||||
"""
|
||||
values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()}
|
||||
self.fs.pipe(values)
|
||||
|
||||
def delitems(self, keys):
|
||||
"""Remove multiple keys from the store"""
|
||||
self.fs.rm([self._key_to_str(k) for k in keys])
|
||||
|
||||
def _key_to_str(self, key):
|
||||
"""Generate full path for the key"""
|
||||
if not isinstance(key, str):
|
||||
# raise TypeError("key must be of type `str`, got `{type(key).__name__}`"
|
||||
warnings.warn(
|
||||
"from fsspec 2023.5 onward FSMap non-str keys will raise TypeError",
|
||||
DeprecationWarning,
|
||||
)
|
||||
if isinstance(key, list):
|
||||
key = tuple(key)
|
||||
key = str(key)
|
||||
return f"{self._root_key_to_str}{key}".rstrip("/")
|
||||
|
||||
def _str_to_key(self, s):
|
||||
"""Strip path of to leave key name"""
|
||||
return s[len(self.root) :].lstrip("/")
|
||||
|
||||
def __getitem__(self, key, default=None):
|
||||
"""Retrieve data"""
|
||||
k = self._key_to_str(key)
|
||||
try:
|
||||
result = self.fs.cat(k)
|
||||
except self.missing_exceptions as exc:
|
||||
if default is not None:
|
||||
return default
|
||||
raise KeyError(key) from exc
|
||||
return result
|
||||
|
||||
def pop(self, key, default=None):
|
||||
"""Pop data"""
|
||||
result = self.__getitem__(key, default)
|
||||
try:
|
||||
del self[key]
|
||||
except KeyError:
|
||||
pass
|
||||
return result
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
"""Store value in key"""
|
||||
key = self._key_to_str(key)
|
||||
self.fs.mkdirs(self.fs._parent(key), exist_ok=True)
|
||||
self.fs.pipe_file(key, maybe_convert(value))
|
||||
|
||||
def __iter__(self):
|
||||
return (self._str_to_key(x) for x in self.fs.find(self.root))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.fs.find(self.root))
|
||||
|
||||
def __delitem__(self, key):
|
||||
"""Remove key"""
|
||||
try:
|
||||
self.fs.rm(self._key_to_str(key))
|
||||
except Exception as exc:
|
||||
raise KeyError from exc
|
||||
|
||||
def __contains__(self, key):
|
||||
"""Does key exist in mapping?"""
|
||||
path = self._key_to_str(key)
|
||||
return self.fs.isfile(path)
|
||||
|
||||
def __reduce__(self):
|
||||
return FSMap, (self.root, self.fs, False, False, self.missing_exceptions)
|
||||
|
||||
|
||||
def maybe_convert(value):
|
||||
if isinstance(value, array.array) or hasattr(value, "__array__"):
|
||||
# bytes-like things
|
||||
if hasattr(value, "dtype") and value.dtype.kind in "Mm":
|
||||
# The buffer interface doesn't support datetime64/timedelta64 numpy
|
||||
# arrays
|
||||
value = value.view("int64")
|
||||
value = bytes(memoryview(value))
|
||||
return value
|
||||
|
||||
|
||||
def get_mapper(
|
||||
url="",
|
||||
check=False,
|
||||
create=False,
|
||||
missing_exceptions=None,
|
||||
alternate_root=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Create key-value interface for given URL and options
|
||||
|
||||
The URL will be of the form "protocol://location" and point to the root
|
||||
of the mapper required. All keys will be file-names below this location,
|
||||
and their values the contents of each key.
|
||||
|
||||
Also accepts compound URLs like zip::s3://bucket/file.zip , see ``fsspec.open``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
Root URL of mapping
|
||||
check: bool
|
||||
Whether to attempt to read from the location before instantiation, to
|
||||
check that the mapping does exist
|
||||
create: bool
|
||||
Whether to make the directory corresponding to the root before
|
||||
instantiating
|
||||
missing_exceptions: None or tuple
|
||||
If given, these exception types will be regarded as missing keys and
|
||||
return KeyError when trying to read data. By default, you get
|
||||
(FileNotFoundError, IsADirectoryError, NotADirectoryError)
|
||||
alternate_root: None or str
|
||||
In cases of complex URLs, the parser may fail to pick the correct part
|
||||
for the mapper root, so this arg can override
|
||||
|
||||
Returns
|
||||
-------
|
||||
``FSMap`` instance, the dict-like key-value store.
|
||||
"""
|
||||
# Removing protocol here - could defer to each open() on the backend
|
||||
fs, urlpath = url_to_fs(url, **kwargs)
|
||||
root = alternate_root if alternate_root is not None else urlpath
|
||||
return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions)
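def _get_mapper_example():
    # Hedged usage sketch (not part of fsspec): a dict-like view over an
    # in-memory filesystem. Keys are file names below the root; values are
    # the file contents as bytes. The root name is an illustrative assumption.
    m = get_mapper("memory://example-root")
    m["a/b"] = b"hello"            # writes memory://example-root/a/b
    assert m["a/b"] == b"hello"
    assert "a/b" in m
    assert sorted(m) == ["a/b"]
    del m["a/b"]
    return m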
|
||||
@@ -0,0 +1,541 @@
|
||||
import io
|
||||
import json
|
||||
import warnings
|
||||
|
||||
from .core import url_to_fs
|
||||
from .utils import merge_offset_ranges
|
||||
|
||||
# Parquet-Specific Utilities for fsspec
|
||||
#
|
||||
# Most of the functions defined in this module are NOT
|
||||
# intended for public consumption. The only exception
|
||||
# to this is `open_parquet_file`, which should be used
|
||||
# in place of `fs.open()` to open parquet-formatted files
|
||||
# on remote file systems.
|
||||
|
||||
|
||||
def open_parquet_file(
|
||||
path,
|
||||
mode="rb",
|
||||
fs=None,
|
||||
metadata=None,
|
||||
columns=None,
|
||||
row_groups=None,
|
||||
storage_options=None,
|
||||
strict=False,
|
||||
engine="auto",
|
||||
max_gap=64_000,
|
||||
max_block=256_000_000,
|
||||
footer_sample_size=1_000_000,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Return a file-like object for a single Parquet file.
|
||||
|
||||
The specified parquet `engine` will be used to parse the
|
||||
footer metadata, and determine the required byte ranges
|
||||
from the file. The target path will then be opened with
|
||||
the "parts" (`KnownPartsOfAFile`) caching strategy.
|
||||
|
||||
Note that this method is intended for usage with remote
|
||||
file systems, and is unlikely to improve parquet-read
|
||||
performance on local file systems.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Target file path.
|
||||
mode: str, optional
|
||||
Mode option to be passed through to `fs.open`. Default is "rb".
|
||||
metadata: Any, optional
|
||||
Parquet metadata object. Object type must be supported
|
||||
by the backend parquet engine. For now, only the "fastparquet"
|
||||
engine supports an explicit `ParquetFile` metadata object.
|
||||
If a metadata object is supplied, the remote footer metadata
|
||||
will not need to be transferred into local memory.
|
||||
fs: AbstractFileSystem, optional
|
||||
Filesystem object to use for opening the file. If nothing is
|
||||
specified, an `AbstractFileSystem` object will be inferred.
|
||||
engine : str, default "auto"
|
||||
Parquet engine to use for metadata parsing. Allowed options
|
||||
include "fastparquet", "pyarrow", and "auto". The specified
|
||||
engine must be installed in the current environment. If
|
||||
"auto" is specified, and both engines are installed,
|
||||
"fastparquet" will take precedence over "pyarrow".
|
||||
columns: list, optional
|
||||
List of all column names that may be read from the file.
|
||||
row_groups : list, optional
|
||||
List of all row-groups that may be read from the file. This
|
||||
may be a list of row-group indices (integers), or it may be
|
||||
a list of `RowGroup` metadata objects (if the "fastparquet"
|
||||
engine is used).
|
||||
storage_options : dict, optional
|
||||
Used to generate an `AbstractFileSystem` object if `fs` was
|
||||
not specified.
|
||||
strict : bool, optional
|
||||
Whether the resulting `KnownPartsOfAFile` cache should
|
||||
fetch reads that go beyond a known byte-range boundary.
|
||||
If `False` (the default), any read that ends outside a
|
||||
known part will be zero padded. Note that using
|
||||
`strict=True` may be useful for debugging.
|
||||
max_gap : int, optional
|
||||
Neighboring byte ranges will only be merged when their
|
||||
inter-range gap is <= `max_gap`. Default is 64KB.
|
||||
max_block : int, optional
|
||||
Neighboring byte ranges will only be merged when the size of
|
||||
the aggregated range is <= `max_block`. Default is 256MB.
|
||||
footer_sample_size : int, optional
|
||||
Number of bytes to read from the end of the path to look
|
||||
for the footer metadata. If the sampled bytes do not contain
|
||||
the footer, a second read request will be required, and
|
||||
performance will suffer. Default is 1MB.
|
||||
**kwargs :
|
||||
Optional key-word arguments to pass to `fs.open`
|
||||
"""
|
||||
|
||||
# Make sure we have an `AbstractFileSystem` object
|
||||
# to work with
|
||||
if fs is None:
|
||||
fs = url_to_fs(path, **(storage_options or {}))[0]
|
||||
|
||||
# For now, `columns == []` not supported. Just use
|
||||
# default `open` command with `path` input
|
||||
if columns is not None and len(columns) == 0:
|
||||
return fs.open(path, mode=mode)
|
||||
|
||||
# Set the engine
|
||||
engine = _set_engine(engine)
|
||||
|
||||
# Fetch the known byte ranges needed to read
|
||||
# `columns` and/or `row_groups`
|
||||
data = _get_parquet_byte_ranges(
|
||||
[path],
|
||||
fs,
|
||||
metadata=metadata,
|
||||
columns=columns,
|
||||
row_groups=row_groups,
|
||||
engine=engine,
|
||||
max_gap=max_gap,
|
||||
max_block=max_block,
|
||||
footer_sample_size=footer_sample_size,
|
||||
)
|
||||
|
||||
# Extract file name from `data`
|
||||
fn = next(iter(data)) if data else path
|
||||
|
||||
# Call self.open with "parts" caching
|
||||
options = kwargs.pop("cache_options", {}).copy()
|
||||
return fs.open(
|
||||
fn,
|
||||
mode=mode,
|
||||
cache_type="parts",
|
||||
cache_options={
|
||||
**options,
|
||||
"data": data.get(fn, {}),
|
||||
"strict": strict,
|
||||
},
|
||||
**kwargs,
|
||||
)
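def _open_parquet_file_example():
    # Hedged usage sketch (not part of fsspec): open a remote parquet file so
    # that only the byte ranges needed for the selected columns/row-groups are
    # transferred. The bucket, path, and column names are illustrative
    # assumptions; s3fs and a parquet engine would be required to run this.
    with open_parquet_file(
        "s3://my-bucket/data.parquet",      # hypothetical path
        columns=["x", "y"],
        row_groups=[0],
        engine="auto",
        storage_options={"anon": True},
    ) as f:
        header = f.read(4)                  # served from the cached parts
    return header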
|
||||
|
||||
|
||||
def _get_parquet_byte_ranges(
|
||||
paths,
|
||||
fs,
|
||||
metadata=None,
|
||||
columns=None,
|
||||
row_groups=None,
|
||||
max_gap=64_000,
|
||||
max_block=256_000_000,
|
||||
footer_sample_size=1_000_000,
|
||||
engine="auto",
|
||||
):
|
||||
"""Get a dictionary of the known byte ranges needed
|
||||
to read a specific column/row-group selection from a
|
||||
Parquet dataset. Each value in the output dictionary
|
||||
is intended for use as the `data` argument for the
|
||||
`KnownPartsOfAFile` caching strategy of a single path.
|
||||
"""
|
||||
|
||||
# Set engine if necessary
|
||||
if isinstance(engine, str):
|
||||
engine = _set_engine(engine)
|
||||
|
||||
# Pass to specialized function if metadata is defined
|
||||
if metadata is not None:
|
||||
# Use the provided parquet metadata object
|
||||
# to avoid transferring/parsing footer metadata
|
||||
return _get_parquet_byte_ranges_from_metadata(
|
||||
metadata,
|
||||
fs,
|
||||
engine,
|
||||
columns=columns,
|
||||
row_groups=row_groups,
|
||||
max_gap=max_gap,
|
||||
max_block=max_block,
|
||||
)
|
||||
|
||||
# Get file sizes asynchronously
|
||||
file_sizes = fs.sizes(paths)
|
||||
|
||||
# Populate global paths, starts, & ends
|
||||
result = {}
|
||||
data_paths = []
|
||||
data_starts = []
|
||||
data_ends = []
|
||||
add_header_magic = True
|
||||
if columns is None and row_groups is None:
|
||||
# We are NOT selecting specific columns or row-groups.
|
||||
#
|
||||
# We can avoid sampling the footers, and just transfer
|
||||
# all file data with cat_ranges
|
||||
for i, path in enumerate(paths):
|
||||
result[path] = {}
|
||||
for b in range(0, file_sizes[i], max_block):
|
||||
data_paths.append(path)
|
||||
data_starts.append(b)
|
||||
data_ends.append(min(b + max_block, file_sizes[i]))
|
||||
add_header_magic = False # "Magic" should already be included
|
||||
else:
|
||||
# We ARE selecting specific columns or row-groups.
|
||||
#
|
||||
# Gather file footers.
|
||||
# We just take the last `footer_sample_size` bytes of each
|
||||
# file (or the entire file if it is smaller than that)
|
||||
footer_starts = []
|
||||
footer_ends = []
|
||||
for i, path in enumerate(paths):
|
||||
footer_ends.append(file_sizes[i])
|
||||
sample_size = max(0, file_sizes[i] - footer_sample_size)
|
||||
footer_starts.append(sample_size)
|
||||
footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends)
|
||||
|
||||
# Check our footer samples and re-sample if necessary.
|
||||
missing_footer_starts = footer_starts.copy()
|
||||
large_footer = 0
|
||||
for i, path in enumerate(paths):
|
||||
footer_size = int.from_bytes(footer_samples[i][-8:-4], "little")
|
||||
real_footer_start = file_sizes[i] - (footer_size + 8)
|
||||
if real_footer_start < footer_starts[i]:
|
||||
missing_footer_starts[i] = real_footer_start
|
||||
large_footer = max(large_footer, (footer_size + 8))
|
||||
if large_footer:
|
||||
warnings.warn(
|
||||
f"Not enough data was used to sample the parquet footer. "
|
||||
f"Try setting footer_sample_size >= {large_footer}."
|
||||
)
|
||||
for i, block in enumerate(
|
||||
fs.cat_ranges(
|
||||
paths,
|
||||
missing_footer_starts,
|
||||
footer_starts,
|
||||
)
|
||||
):
|
||||
footer_samples[i] = block + footer_samples[i]
|
||||
footer_starts[i] = missing_footer_starts[i]
|
||||
|
||||
# Calculate required byte ranges for each path
|
||||
for i, path in enumerate(paths):
|
||||
# Deal with small-file case.
|
||||
# Just include all remaining bytes of the file
|
||||
# in a single range.
|
||||
if file_sizes[i] < max_block:
|
||||
if footer_starts[i] > 0:
|
||||
# Only need to transfer the data if the
|
||||
# footer sample isn't already the whole file
|
||||
data_paths.append(path)
|
||||
data_starts.append(0)
|
||||
data_ends.append(footer_starts[i])
|
||||
continue
|
||||
|
||||
# Use "engine" to collect data byte ranges
|
||||
path_data_starts, path_data_ends = engine._parquet_byte_ranges(
|
||||
columns,
|
||||
row_groups=row_groups,
|
||||
footer=footer_samples[i],
|
||||
footer_start=footer_starts[i],
|
||||
)
|
||||
|
||||
data_paths += [path] * len(path_data_starts)
|
||||
data_starts += path_data_starts
|
||||
data_ends += path_data_ends
|
||||
|
||||
# Merge adjacent offset ranges
|
||||
data_paths, data_starts, data_ends = merge_offset_ranges(
|
||||
data_paths,
|
||||
data_starts,
|
||||
data_ends,
|
||||
max_gap=max_gap,
|
||||
max_block=max_block,
|
||||
sort=False, # Should already be sorted
|
||||
)
|
||||
|
||||
# Start by populating `result` with footer samples
|
||||
for i, path in enumerate(paths):
|
||||
result[path] = {(footer_starts[i], footer_ends[i]): footer_samples[i]}
|
||||
|
||||
# Transfer the data byte-ranges into local memory
|
||||
_transfer_ranges(fs, result, data_paths, data_starts, data_ends)
|
||||
|
||||
# Add b"PAR1" to header if necessary
|
||||
if add_header_magic:
|
||||
_add_header_magic(result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _get_parquet_byte_ranges_from_metadata(
|
||||
metadata,
|
||||
fs,
|
||||
engine,
|
||||
columns=None,
|
||||
row_groups=None,
|
||||
max_gap=64_000,
|
||||
max_block=256_000_000,
|
||||
):
|
||||
"""Simplified version of `_get_parquet_byte_ranges` for
|
||||
the case that an engine-specific `metadata` object is
|
||||
provided, and the remote footer metadata does not need to
|
||||
be transferred before calculating the required byte ranges.
|
||||
"""
|
||||
|
||||
# Use "engine" to collect data byte ranges
|
||||
data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
|
||||
columns,
|
||||
row_groups=row_groups,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
# Merge adjacent offset ranges
|
||||
data_paths, data_starts, data_ends = merge_offset_ranges(
|
||||
data_paths,
|
||||
data_starts,
|
||||
data_ends,
|
||||
max_gap=max_gap,
|
||||
max_block=max_block,
|
||||
sort=False, # Should be sorted
|
||||
)
|
||||
|
||||
# Transfer the data byte-ranges into local memory
|
||||
result = {fn: {} for fn in list(set(data_paths))}
|
||||
_transfer_ranges(fs, result, data_paths, data_starts, data_ends)
|
||||
|
||||
# Add b"PAR1" to header
|
||||
_add_header_magic(result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _transfer_ranges(fs, blocks, paths, starts, ends):
|
||||
# Use cat_ranges to gather the data byte_ranges
|
||||
ranges = (paths, starts, ends)
|
||||
for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)):
|
||||
blocks[path][(start, stop)] = data
|
||||
|
||||
|
||||
def _add_header_magic(data):
|
||||
# Add b"PAR1" to file headers
|
||||
for path in list(data.keys()):
|
||||
add_magic = True
|
||||
for k in data[path]:
|
||||
if k[0] == 0 and k[1] >= 4:
|
||||
add_magic = False
|
||||
break
|
||||
if add_magic:
|
||||
data[path][(0, 4)] = b"PAR1"
|
||||
|
||||
|
||||
def _set_engine(engine_str):
|
||||
# Define a list of parquet engines to try
|
||||
if engine_str == "auto":
|
||||
try_engines = ("fastparquet", "pyarrow")
|
||||
elif not isinstance(engine_str, str):
|
||||
raise ValueError(
|
||||
"Failed to set parquet engine! "
|
||||
"Please pass 'fastparquet', 'pyarrow', or 'auto'"
|
||||
)
|
||||
elif engine_str not in ("fastparquet", "pyarrow"):
|
||||
raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`")
|
||||
else:
|
||||
try_engines = [engine_str]
|
||||
|
||||
# Try importing the engines in `try_engines`,
|
||||
# and choose the first one that succeeds
|
||||
for engine in try_engines:
|
||||
try:
|
||||
if engine == "fastparquet":
|
||||
return FastparquetEngine()
|
||||
elif engine == "pyarrow":
|
||||
return PyarrowEngine()
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Raise an error if a supported parquet engine
|
||||
# was not found
|
||||
raise ImportError(
|
||||
f"The following parquet engines are not installed "
|
||||
f"in your python environment: {try_engines}."
|
||||
f"Please install 'fastparquert' or 'pyarrow' to "
|
||||
f"utilize the `fsspec.parquet` module."
|
||||
)
|
||||
|
||||
|
||||
class FastparquetEngine:
|
||||
# The purpose of the FastparquetEngine class is
|
||||
# to check if fastparquet can be imported (on initialization)
|
||||
# and to define a `_parquet_byte_ranges` method. In the
|
||||
# future, this class may also be used to define other
|
||||
# methods/logic that are specific to fastparquet.
|
||||
|
||||
def __init__(self):
|
||||
import fastparquet as fp
|
||||
|
||||
self.fp = fp
|
||||
|
||||
def _row_group_filename(self, row_group, pf):
|
||||
return pf.row_group_filename(row_group)
|
||||
|
||||
def _parquet_byte_ranges(
|
||||
self,
|
||||
columns,
|
||||
row_groups=None,
|
||||
metadata=None,
|
||||
footer=None,
|
||||
footer_start=None,
|
||||
):
|
||||
# Initialize offset ranges and define ParquetFile metadata
|
||||
pf = metadata
|
||||
data_paths, data_starts, data_ends = [], [], []
|
||||
if pf is None:
|
||||
pf = self.fp.ParquetFile(io.BytesIO(footer))
|
||||
|
||||
# Convert columns to a set and add any index columns
|
||||
# specified in the pandas metadata (just in case)
|
||||
column_set = None if columns is None else set(columns)
|
||||
if column_set is not None and hasattr(pf, "pandas_metadata"):
|
||||
md_index = [
|
||||
ind
|
||||
for ind in pf.pandas_metadata.get("index_columns", [])
|
||||
# Ignore RangeIndex information
|
||||
if not isinstance(ind, dict)
|
||||
]
|
||||
column_set |= set(md_index)
|
||||
|
||||
# Check if row_groups is a list of integers
|
||||
# or a list of row-group metadata
|
||||
if row_groups and not isinstance(row_groups[0], int):
|
||||
# Input row_groups contains row-group metadata
|
||||
row_group_indices = None
|
||||
else:
|
||||
# Input row_groups contains row-group indices
|
||||
row_group_indices = row_groups
|
||||
row_groups = pf.row_groups
|
||||
|
||||
# Loop through column chunks to add required byte ranges
|
||||
for r, row_group in enumerate(row_groups):
|
||||
# Skip this row-group if we are targeting
|
||||
# specific row-groups
|
||||
if row_group_indices is None or r in row_group_indices:
|
||||
# Find the target parquet-file path for `row_group`
|
||||
fn = self._row_group_filename(row_group, pf)
|
||||
|
||||
for column in row_group.columns:
|
||||
name = column.meta_data.path_in_schema[0]
|
||||
# Skip this column if we are targeting a
|
||||
# specific set of columns
|
||||
if column_set is None or name in column_set:
|
||||
file_offset0 = column.meta_data.dictionary_page_offset
|
||||
if file_offset0 is None:
|
||||
file_offset0 = column.meta_data.data_page_offset
|
||||
num_bytes = column.meta_data.total_compressed_size
|
||||
if footer_start is None or file_offset0 < footer_start:
|
||||
data_paths.append(fn)
|
||||
data_starts.append(file_offset0)
|
||||
data_ends.append(
|
||||
min(
|
||||
file_offset0 + num_bytes,
|
||||
footer_start or (file_offset0 + num_bytes),
|
||||
)
|
||||
)
|
||||
|
||||
if metadata:
|
||||
# The metadata in this call may map to multiple
|
||||
# file paths. Need to include `data_paths`
|
||||
return data_paths, data_starts, data_ends
|
||||
return data_starts, data_ends
|
||||
|
||||
|
||||
class PyarrowEngine:
|
||||
# The purpose of the PyarrowEngine class is
|
||||
# to check if pyarrow can be imported (on initialization)
|
||||
# and to define a `_parquet_byte_ranges` method. In the
|
||||
# future, this class may also be used to define other
|
||||
# methods/logic that are specific to pyarrow.
|
||||
|
||||
def __init__(self):
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
self.pq = pq
|
||||
|
||||
def _row_group_filename(self, row_group, metadata):
|
||||
raise NotImplementedError
|
||||
|
||||
def _parquet_byte_ranges(
|
||||
self,
|
||||
columns,
|
||||
row_groups=None,
|
||||
metadata=None,
|
||||
footer=None,
|
||||
footer_start=None,
|
||||
):
|
||||
if metadata is not None:
|
||||
raise ValueError("metadata input not supported for PyarrowEngine")
|
||||
|
||||
data_starts, data_ends = [], []
|
||||
md = self.pq.ParquetFile(io.BytesIO(footer)).metadata
|
||||
|
||||
# Convert columns to a set and add any index columns
|
||||
# specified in the pandas metadata (just in case)
|
||||
column_set = None if columns is None else set(columns)
|
||||
if column_set is not None:
|
||||
schema = md.schema.to_arrow_schema()
|
||||
has_pandas_metadata = (
|
||||
schema.metadata is not None and b"pandas" in schema.metadata
|
||||
)
|
||||
if has_pandas_metadata:
|
||||
md_index = [
|
||||
ind
|
||||
for ind in json.loads(
|
||||
schema.metadata[b"pandas"].decode("utf8")
|
||||
).get("index_columns", [])
|
||||
# Ignore RangeIndex information
|
||||
if not isinstance(ind, dict)
|
||||
]
|
||||
column_set |= set(md_index)
|
||||
|
||||
# Loop through column chunks to add required byte ranges
|
||||
for r in range(md.num_row_groups):
|
||||
# Skip this row-group if we are targeting
|
||||
# specific row-groups
|
||||
if row_groups is None or r in row_groups:
|
||||
row_group = md.row_group(r)
|
||||
for c in range(row_group.num_columns):
|
||||
column = row_group.column(c)
|
||||
name = column.path_in_schema
|
||||
# Skip this column if we are targeting a
|
||||
# specific set of columns
|
||||
split_name = name.split(".")[0]
|
||||
if (
|
||||
column_set is None
|
||||
or name in column_set
|
||||
or split_name in column_set
|
||||
):
|
||||
file_offset0 = column.dictionary_page_offset
|
||||
if file_offset0 is None:
|
||||
file_offset0 = column.data_page_offset
|
||||
num_bytes = column.total_compressed_size
|
||||
if file_offset0 < footer_start:
|
||||
data_starts.append(file_offset0)
|
||||
data_ends.append(
|
||||
min(file_offset0 + num_bytes, footer_start)
|
||||
)
|
||||
return data_starts, data_ends
|
||||
@@ -0,0 +1,330 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import types
|
||||
import warnings
|
||||
|
||||
__all__ = ["registry", "get_filesystem_class", "default"]
|
||||
|
||||
# internal, mutable
|
||||
_registry: dict[str, type] = {}
|
||||
|
||||
# external, immutable
|
||||
registry = types.MappingProxyType(_registry)
|
||||
default = "file"
|
||||
|
||||
|
||||
def register_implementation(name, cls, clobber=False, errtxt=None):
|
||||
"""Add implementation class to the registry
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name: str
|
||||
Protocol name to associate with the class
|
||||
cls: class or str
|
||||
if a class: fsspec-compliant implementation class (normally inherits from
|
||||
``fsspec.AbstractFileSystem``), it gets added straight to the registry. If a
|
||||
str, the full path to an implementation class like package.module.class,
|
||||
which gets added to known_implementations,
|
||||
so the import is deferred until the filesystem is actually used.
|
||||
clobber: bool (optional)
|
||||
Whether to overwrite a protocol with the same name; if False, will raise
|
||||
instead.
|
||||
errtxt: str (optional)
|
||||
If given, then a failure to import the given class will result in this
|
||||
text being used as the error message.
|
||||
"""
|
||||
if isinstance(cls, str):
|
||||
if name in known_implementations and clobber is False:
|
||||
if cls != known_implementations[name]["class"]:
|
||||
raise ValueError(
|
||||
f"Name ({name}) already in the known_implementations and clobber "
|
||||
f"is False"
|
||||
)
|
||||
else:
|
||||
known_implementations[name] = {
|
||||
"class": cls,
|
||||
"err": errtxt or f"{cls} import failed for protocol {name}",
|
||||
}
|
||||
|
||||
else:
|
||||
if name in registry and clobber is False:
|
||||
if _registry[name] is not cls:
|
||||
raise ValueError(
|
||||
f"Name ({name}) already in the registry and clobber is False"
|
||||
)
|
||||
else:
|
||||
_registry[name] = cls
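def _register_implementation_example():
    # Hedged sketch (not part of fsspec): register a protocol by string path so
    # the import is deferred until the filesystem is first used. The protocol
    # name and class path below are illustrative assumptions.
    register_implementation(
        "myfs",
        "mypackage.myfs.MyFileSystem",
        clobber=True,
        errtxt="Install mypackage to use the 'myfs' protocol",
    )
    return known_implementations["myfs"]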
|
||||
|
||||
|
||||
# protocols mapped to the class which implements them. This dict can be
|
||||
# updated with register_implementation
|
||||
known_implementations = {
|
||||
"abfs": {
|
||||
"class": "adlfs.AzureBlobFileSystem",
|
||||
"err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
|
||||
},
|
||||
"adl": {
|
||||
"class": "adlfs.AzureDatalakeFileSystem",
|
||||
"err": "Install adlfs to access Azure Datalake Gen1",
|
||||
},
|
||||
"arrow_hdfs": {
|
||||
"class": "fsspec.implementations.arrow.HadoopFileSystem",
|
||||
"err": "pyarrow and local java libraries required for HDFS",
|
||||
},
|
||||
"asynclocal": {
|
||||
"class": "morefs.asyn_local.AsyncLocalFileSystem",
|
||||
"err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
|
||||
},
|
||||
"asyncwrapper": {
|
||||
"class": "fsspec.implementations.asyn_wrapper.AsyncFileSystemWrapper",
|
||||
},
|
||||
"az": {
|
||||
"class": "adlfs.AzureBlobFileSystem",
|
||||
"err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
|
||||
},
|
||||
"blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"},
|
||||
"box": {
|
||||
"class": "boxfs.BoxFileSystem",
|
||||
"err": "Please install boxfs to access BoxFileSystem",
|
||||
},
|
||||
"cached": {"class": "fsspec.implementations.cached.CachingFileSystem"},
|
||||
"dask": {
|
||||
"class": "fsspec.implementations.dask.DaskWorkerFileSystem",
|
||||
"err": "Install dask distributed to access worker file system",
|
||||
},
|
||||
"data": {"class": "fsspec.implementations.data.DataFileSystem"},
|
||||
"dbfs": {
|
||||
"class": "fsspec.implementations.dbfs.DatabricksFileSystem",
|
||||
"err": "Install the requests package to use the DatabricksFileSystem",
|
||||
},
|
||||
"dir": {"class": "fsspec.implementations.dirfs.DirFileSystem"},
|
||||
"dropbox": {
|
||||
"class": "dropboxdrivefs.DropboxDriveFileSystem",
|
||||
"err": (
|
||||
'DropboxFileSystem requires "dropboxdrivefs","requests" and "'
|
||||
'"dropbox" to be installed'
|
||||
),
|
||||
},
|
||||
"dvc": {
|
||||
"class": "dvc.api.DVCFileSystem",
|
||||
"err": "Install dvc to access DVCFileSystem",
|
||||
},
|
||||
"file": {"class": "fsspec.implementations.local.LocalFileSystem"},
|
||||
"filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"},
|
||||
"ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"},
|
||||
"gcs": {
|
||||
"class": "gcsfs.GCSFileSystem",
|
||||
"err": "Please install gcsfs to access Google Storage",
|
||||
},
|
||||
"gdrive": {
|
||||
"class": "gdrive_fsspec.GoogleDriveFileSystem",
|
||||
"err": "Please install gdrive_fs for access to Google Drive",
|
||||
},
|
||||
"generic": {"class": "fsspec.generic.GenericFileSystem"},
|
||||
"gist": {
|
||||
"class": "fsspec.implementations.gist.GistFileSystem",
|
||||
"err": "Install the requests package to use the gist FS",
|
||||
},
|
||||
"git": {
|
||||
"class": "fsspec.implementations.git.GitFileSystem",
|
||||
"err": "Install pygit2 to browse local git repos",
|
||||
},
|
||||
"github": {
|
||||
"class": "fsspec.implementations.github.GithubFileSystem",
|
||||
"err": "Install the requests package to use the github FS",
|
||||
},
|
||||
"gs": {
|
||||
"class": "gcsfs.GCSFileSystem",
|
||||
"err": "Please install gcsfs to access Google Storage",
|
||||
},
|
||||
"hdfs": {
|
||||
"class": "fsspec.implementations.arrow.HadoopFileSystem",
|
||||
"err": "pyarrow and local java libraries required for HDFS",
|
||||
},
|
||||
"hf": {
|
||||
"class": "huggingface_hub.HfFileSystem",
|
||||
"err": "Install huggingface_hub to access HfFileSystem",
|
||||
},
|
||||
"http": {
|
||||
"class": "fsspec.implementations.http.HTTPFileSystem",
|
||||
"err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
|
||||
},
|
||||
"https": {
|
||||
"class": "fsspec.implementations.http.HTTPFileSystem",
|
||||
"err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
|
||||
},
|
||||
"jlab": {
|
||||
"class": "fsspec.implementations.jupyter.JupyterFileSystem",
|
||||
"err": "Jupyter FS requires requests to be installed",
|
||||
},
|
||||
"jupyter": {
|
||||
"class": "fsspec.implementations.jupyter.JupyterFileSystem",
|
||||
"err": "Jupyter FS requires requests to be installed",
|
||||
},
|
||||
"lakefs": {
|
||||
"class": "lakefs_spec.LakeFSFileSystem",
|
||||
"err": "Please install lakefs-spec to access LakeFSFileSystem",
|
||||
},
|
||||
"libarchive": {
|
||||
"class": "fsspec.implementations.libarchive.LibArchiveFileSystem",
|
||||
"err": "LibArchive requires to be installed",
|
||||
},
|
||||
"local": {"class": "fsspec.implementations.local.LocalFileSystem"},
|
||||
"memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
|
||||
"oci": {
|
||||
"class": "ocifs.OCIFileSystem",
|
||||
"err": "Install ocifs to access OCI Object Storage",
|
||||
},
|
||||
"ocilake": {
|
||||
"class": "ocifs.OCIFileSystem",
|
||||
"err": "Install ocifs to access OCI Data Lake",
|
||||
},
|
||||
"oss": {
|
||||
"class": "ossfs.OSSFileSystem",
|
||||
"err": "Install ossfs to access Alibaba Object Storage System",
|
||||
},
|
||||
"pyscript": {
|
||||
"class": "pyscript_fsspec_client.client.PyscriptFileSystem",
|
||||
"err": "Install requests (cpython) or run in pyscript",
|
||||
},
|
||||
"reference": {"class": "fsspec.implementations.reference.ReferenceFileSystem"},
|
||||
"root": {
|
||||
"class": "fsspec_xrootd.XRootDFileSystem",
|
||||
"err": (
|
||||
"Install fsspec-xrootd to access xrootd storage system. "
|
||||
"Note: 'root' is the protocol name for xrootd storage systems, "
|
||||
"not referring to root directories"
|
||||
),
|
||||
},
|
||||
"s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
|
||||
"s3a": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
|
||||
"sftp": {
|
||||
"class": "fsspec.implementations.sftp.SFTPFileSystem",
|
||||
"err": 'SFTPFileSystem requires "paramiko" to be installed',
|
||||
},
|
||||
"simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"},
|
||||
"smb": {
|
||||
"class": "fsspec.implementations.smb.SMBFileSystem",
|
||||
"err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed',
|
||||
},
|
||||
"ssh": {
|
||||
"class": "fsspec.implementations.sftp.SFTPFileSystem",
|
||||
"err": 'SFTPFileSystem requires "paramiko" to be installed',
|
||||
},
|
||||
"tar": {"class": "fsspec.implementations.tar.TarFileSystem"},
|
||||
"tos": {
|
||||
"class": "tosfs.TosFileSystem",
|
||||
"err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage",
|
||||
},
|
||||
"tosfs": {
|
||||
"class": "tosfs.TosFileSystem",
|
||||
"err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage",
|
||||
},
|
||||
"wandb": {"class": "wandbfs.WandbFS", "err": "Install wandbfs to access wandb"},
|
||||
"webdav": {
|
||||
"class": "webdav4.fsspec.WebdavFileSystem",
|
||||
"err": "Install webdav4 to access WebDAV",
|
||||
},
|
||||
"webhdfs": {
|
||||
"class": "fsspec.implementations.webhdfs.WebHDFS",
|
||||
"err": 'webHDFS access requires "requests" to be installed',
|
||||
},
|
||||
"zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
|
||||
}
|
||||
|
||||
assert list(known_implementations) == sorted(known_implementations), (
|
||||
"Not in alphabetical order"
|
||||
)
|
||||
|
||||
|
||||
def get_filesystem_class(protocol):
|
||||
"""Fetch named protocol implementation from the registry
|
||||
|
||||
The dict ``known_implementations`` maps protocol names to the locations
|
||||
of classes implementing the corresponding file-system. When used for the
|
||||
first time, appropriate imports will happen and the class will be placed in
|
||||
the registry. All subsequent calls will fetch directly from the registry.
|
||||
|
||||
Some protocol implementations require additional dependencies, and so the
|
||||
import may fail. In this case, the string in the "err" field of the
|
||||
``known_implementations`` will be given as the error message.
|
||||
"""
|
||||
if not protocol:
|
||||
protocol = default
|
||||
|
||||
if protocol not in registry:
|
||||
if protocol not in known_implementations:
|
||||
raise ValueError(f"Protocol not known: {protocol}")
|
||||
bit = known_implementations[protocol]
|
||||
try:
|
||||
register_implementation(protocol, _import_class(bit["class"]))
|
||||
except ImportError as e:
|
||||
raise ImportError(bit.get("err")) from e
|
||||
cls = registry[protocol]
|
||||
if getattr(cls, "protocol", None) in ("abstract", None):
|
||||
cls.protocol = protocol
|
||||
|
||||
return cls
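def _get_filesystem_class_example():
    # Hedged sketch (not part of fsspec): the first lookup imports the
    # implementation and caches it in the registry; subsequent lookups are
    # plain dictionary hits returning the same class object.
    cls = get_filesystem_class("memory")
    assert "memory" in registry
    assert cls is get_filesystem_class("memory")
    return cls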
|
||||
|
||||
|
||||
s3_msg = """Your installed version of s3fs is very old and known to cause
|
||||
severe performance issues, see also https://github.com/dask/dask/issues/10276
|
||||
|
||||
To fix, you should specify a lower version bound on s3fs, or
|
||||
update the current installation.
|
||||
"""
|
||||
|
||||
|
||||
def _import_class(fqp: str):
|
||||
"""Take a fully-qualified path and return the imported class or identifier.
|
||||
|
||||
``fqp`` is of the form "package.module.klass" or
|
||||
"package.module:subobject.klass".
|
||||
|
||||
Warnings
|
||||
--------
|
||||
This can import arbitrary modules. Make sure you haven't installed any modules
|
||||
that may execute malicious code at import time.
|
||||
"""
|
||||
if ":" in fqp:
|
||||
mod, name = fqp.rsplit(":", 1)
|
||||
else:
|
||||
mod, name = fqp.rsplit(".", 1)
|
||||
|
||||
is_s3 = mod == "s3fs"
|
||||
mod = importlib.import_module(mod)
|
||||
if is_s3 and mod.__version__.split(".") < ["0", "5"]:
|
||||
warnings.warn(s3_msg)
|
||||
for part in name.split("."):
|
||||
mod = getattr(mod, part)
|
||||
|
||||
if not isinstance(mod, type):
|
||||
raise TypeError(f"{fqp} is not a class")
|
||||
|
||||
return mod
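def _import_class_example():
    # Hedged sketch (not part of fsspec): both accepted path forms resolve to
    # the same class; the ":" form separates the module from nested attributes.
    a = _import_class("fsspec.implementations.memory.MemoryFileSystem")
    b = _import_class("fsspec.implementations.memory:MemoryFileSystem")
    assert a is b
    return a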
|
||||
|
||||
|
||||
def filesystem(protocol, **storage_options):
|
||||
"""Instantiate filesystems for given protocol and arguments
|
||||
|
||||
``storage_options`` are specific to the protocol being chosen, and are
|
||||
passed directly to the class.
|
||||
"""
|
||||
if protocol == "arrow_hdfs":
|
||||
warnings.warn(
|
||||
"The 'arrow_hdfs' protocol has been deprecated and will be "
|
||||
"removed in the future. Specify it as 'hdfs'.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
cls = get_filesystem_class(protocol)
|
||||
return cls(**storage_options)
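def _filesystem_example():
    # Hedged usage sketch (not part of fsspec): instantiate a filesystem by
    # protocol name; storage_options are forwarded to the class constructor.
    fs = filesystem("memory")
    fs.makedirs("/demo", exist_ok=True)
    fs.pipe_file("/demo/hello.txt", b"hi")
    assert fs.cat_file("/demo/hello.txt") == b"hi"
    return fs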
|
||||
|
||||
|
||||
def available_protocols():
|
||||
"""Return a list of the implemented protocols.
|
||||
|
||||
Note that any given protocol may require extra packages to be importable.
|
||||
"""
|
||||
return list(known_implementations)
|
||||
2281 backend_service/venv/lib/python3.13/site-packages/fsspec/spec.py (Normal file)
File diff suppressed because it is too large
@@ -0,0 +1,289 @@
|
||||
import os
|
||||
from hashlib import md5
|
||||
|
||||
import pytest
|
||||
|
||||
from fsspec.implementations.local import LocalFileSystem
|
||||
from fsspec.tests.abstract.copy import AbstractCopyTests # noqa: F401
|
||||
from fsspec.tests.abstract.get import AbstractGetTests # noqa: F401
|
||||
from fsspec.tests.abstract.open import AbstractOpenTests # noqa: F401
|
||||
from fsspec.tests.abstract.pipe import AbstractPipeTests # noqa: F401
|
||||
from fsspec.tests.abstract.put import AbstractPutTests # noqa: F401
|
||||
|
||||
|
||||
class BaseAbstractFixtures:
|
||||
"""
|
||||
Abstract base class containing fixtures that are used by but never need to
|
||||
be overridden in derived filesystem-specific classes to run the abstract
|
||||
tests on such filesystems.
|
||||
"""
|
||||
|
||||
@pytest.fixture
|
||||
def fs_bulk_operations_scenario_0(self, fs, fs_join, fs_path):
|
||||
"""
|
||||
Scenario on remote filesystem that is used for many cp/get/put tests.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._bulk_operations_scenario_0(fs, fs_join, fs_path)
|
||||
yield source
|
||||
fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def fs_glob_edge_cases_files(self, fs, fs_join, fs_path):
|
||||
"""
|
||||
Scenario on remote filesystem that is used for glob edge cases cp/get/put tests.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._glob_edge_cases_files(fs, fs_join, fs_path)
|
||||
yield source
|
||||
fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def fs_dir_and_file_with_same_name_prefix(self, fs, fs_join, fs_path):
|
||||
"""
|
||||
Scenario on remote filesystem that is used to check cp/get/put on directory
|
||||
and file with the same name prefixes.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._dir_and_file_with_same_name_prefix(fs, fs_join, fs_path)
|
||||
yield source
|
||||
fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def fs_10_files_with_hashed_names(self, fs, fs_join, fs_path):
|
||||
"""
|
||||
Scenario on remote filesystem that is used to check cp/get/put files order
|
||||
when source and destination are lists.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._10_files_with_hashed_names(fs, fs_join, fs_path)
|
||||
yield source
|
||||
fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def fs_target(self, fs, fs_join, fs_path):
|
||||
"""
|
||||
Return name of remote directory that does not yet exist to copy into.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
target = fs_join(fs_path, "target")
|
||||
yield target
|
||||
if fs.exists(target):
|
||||
fs.rm(target, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def local_bulk_operations_scenario_0(self, local_fs, local_join, local_path):
|
||||
"""
|
||||
Scenario on local filesystem that is used for many cp/get/put tests.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._bulk_operations_scenario_0(local_fs, local_join, local_path)
|
||||
yield source
|
||||
local_fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def local_glob_edge_cases_files(self, local_fs, local_join, local_path):
|
||||
"""
|
||||
Scenario on local filesystem that is used for glob edge cases cp/get/put tests.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._glob_edge_cases_files(local_fs, local_join, local_path)
|
||||
yield source
|
||||
local_fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def local_dir_and_file_with_same_name_prefix(
|
||||
self, local_fs, local_join, local_path
|
||||
):
|
||||
"""
|
||||
Scenario on local filesystem that is used to check cp/get/put on directory
|
||||
and file with the same name prefixes.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._dir_and_file_with_same_name_prefix(
|
||||
local_fs, local_join, local_path
|
||||
)
|
||||
yield source
|
||||
local_fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def local_10_files_with_hashed_names(self, local_fs, local_join, local_path):
|
||||
"""
|
||||
Scenario on local filesystem that is used to check cp/get/put files order
|
||||
when source and destination are lists.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._10_files_with_hashed_names(local_fs, local_join, local_path)
|
||||
yield source
|
||||
local_fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def local_target(self, local_fs, local_join, local_path):
|
||||
"""
|
||||
Return name of local directory that does not yet exist to copy into.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
target = local_join(local_path, "target")
|
||||
yield target
|
||||
if local_fs.exists(target):
|
||||
local_fs.rm(target, recursive=True)
|
||||
|
||||
def _glob_edge_cases_files(self, some_fs, some_join, some_path):
|
||||
"""
|
||||
Scenario that is used for glob edge cases cp/get/put tests.
|
||||
Creates the following directory and file structure:
|
||||
|
||||
📁 source
|
||||
├── 📄 file1
|
||||
├── 📄 file2
|
||||
├── 📁 subdir0
|
||||
│ ├── 📄 subfile1
|
||||
│ ├── 📄 subfile2
|
||||
│ └── 📁 nesteddir
|
||||
│ └── 📄 nestedfile
|
||||
└── 📁 subdir1
|
||||
├── 📄 subfile1
|
||||
├── 📄 subfile2
|
||||
└── 📁 nesteddir
|
||||
└── 📄 nestedfile
|
||||
"""
|
||||
source = some_join(some_path, "source")
|
||||
some_fs.touch(some_join(source, "file1"))
|
||||
some_fs.touch(some_join(source, "file2"))
|
||||
|
||||
for subdir_idx in range(2):
|
||||
subdir = some_join(source, f"subdir{subdir_idx}")
|
||||
nesteddir = some_join(subdir, "nesteddir")
|
||||
some_fs.makedirs(nesteddir)
|
||||
some_fs.touch(some_join(subdir, "subfile1"))
|
||||
some_fs.touch(some_join(subdir, "subfile2"))
|
||||
some_fs.touch(some_join(nesteddir, "nestedfile"))
|
||||
|
||||
return source
|
||||
|
||||
def _bulk_operations_scenario_0(self, some_fs, some_join, some_path):
|
||||
"""
|
||||
Scenario that is used for many cp/get/put tests. Creates the following
|
||||
directory and file structure:
|
||||
|
||||
📁 source
|
||||
├── 📄 file1
|
||||
├── 📄 file2
|
||||
└── 📁 subdir
|
||||
├── 📄 subfile1
|
||||
├── 📄 subfile2
|
||||
└── 📁 nesteddir
|
||||
└── 📄 nestedfile
|
||||
"""
|
||||
source = some_join(some_path, "source")
|
||||
subdir = some_join(source, "subdir")
|
||||
nesteddir = some_join(subdir, "nesteddir")
|
||||
some_fs.makedirs(nesteddir)
|
||||
some_fs.touch(some_join(source, "file1"))
|
||||
some_fs.touch(some_join(source, "file2"))
|
||||
some_fs.touch(some_join(subdir, "subfile1"))
|
||||
some_fs.touch(some_join(subdir, "subfile2"))
|
||||
some_fs.touch(some_join(nesteddir, "nestedfile"))
|
||||
return source
|
||||
|
||||
def _dir_and_file_with_same_name_prefix(self, some_fs, some_join, some_path):
|
||||
"""
|
||||
Scenario that is used to check cp/get/put on directory and file with
|
||||
the same name prefixes. Creates the following directory and file structure:
|
||||
|
||||
📁 source
|
||||
├── 📄 subdir.txt
|
||||
└── 📁 subdir
|
||||
└── 📄 subfile.txt
|
||||
"""
|
||||
source = some_join(some_path, "source")
|
||||
subdir = some_join(source, "subdir")
|
||||
file = some_join(source, "subdir.txt")
|
||||
subfile = some_join(subdir, "subfile.txt")
|
||||
some_fs.makedirs(subdir)
|
||||
some_fs.touch(file)
|
||||
some_fs.touch(subfile)
|
||||
return source
|
||||
|
||||
def _10_files_with_hashed_names(self, some_fs, some_join, some_path):
|
||||
"""
|
||||
Scenario that is used to check cp/get/put files order when source and
|
||||
destination are lists. Creates the following directory and file structure:
|
||||
|
||||
📁 source
|
||||
└── 📄 {hashed([0-9])}.txt
|
||||
"""
|
||||
source = some_join(some_path, "source")
|
||||
for i in range(10):
|
||||
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
|
||||
path = some_join(source, f"{hashed_i}.txt")
|
||||
some_fs.pipe(path=path, value=f"{i}".encode())
|
||||
return source
|
||||
|
||||
|
||||
class AbstractFixtures(BaseAbstractFixtures):
|
||||
"""
|
||||
Abstract base class containing fixtures that may be overridden in derived
|
||||
filesystem-specific classes to run the abstract tests on such filesystems.
|
||||
|
||||
For any particular filesystem some of these fixtures must be overridden,
|
||||
such as ``fs`` and ``fs_path``, and others may be overridden if the
|
||||
default functions here are not appropriate, such as ``fs_join``.
|
||||
"""
|
||||
|
||||
@pytest.fixture
|
||||
def fs(self):
|
||||
raise NotImplementedError("This function must be overridden in derived classes")
|
||||
|
||||
@pytest.fixture
|
||||
def fs_join(self):
|
||||
"""
|
||||
Return a function that joins its arguments together into a path.
|
||||
|
||||
Most fsspec implementations join paths in a platform-dependent way,
|
||||
but some will override this to always use a forward slash.
|
||||
"""
|
||||
return os.path.join
|
||||
|
||||
@pytest.fixture
|
||||
def fs_path(self):
|
||||
raise NotImplementedError("This function must be overridden in derived classes")
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
def local_fs(self):
|
||||
# Maybe need an option for auto_mkdir=False? This is only relevant
|
||||
# for certain implementations.
|
||||
return LocalFileSystem(auto_mkdir=True)
|
||||
|
||||
@pytest.fixture
|
||||
def local_join(self):
|
||||
"""
|
||||
Return a function that joins its arguments together into a path, on
|
||||
the local filesystem.
|
||||
"""
|
||||
return os.path.join
|
||||
|
||||
@pytest.fixture
|
||||
def local_path(self, tmpdir):
|
||||
return tmpdir
|
||||
|
||||
@pytest.fixture
|
||||
def supports_empty_directories(self):
|
||||
"""
|
||||
Return whether this implementation supports empty directories.
|
||||
"""
|
||||
return True
|
||||
|
||||
@pytest.fixture
|
||||
def fs_sanitize_path(self):
|
||||
return lambda x: x
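class _ExampleMemoryFixtures(AbstractFixtures):
    # Hedged sketch (not part of fsspec): a derived fixture class for an
    # in-memory filesystem. A real suite would also inherit the Abstract*Tests
    # mix-ins (e.g. AbstractCopyTests) to pick up the shared test bodies; the
    # root path used here is an illustrative assumption.

    @pytest.fixture
    def fs(self):
        import fsspec

        return fsspec.filesystem("memory")

    @pytest.fixture
    def fs_path(self):
        return "/example"

    @pytest.fixture
    def fs_join(self):
        # Memory paths always use forward slashes.
        return lambda *parts: "/".join(parts)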
|
||||
@@ -0,0 +1,175 @@
|
||||
GLOB_EDGE_CASES_TESTS = {
|
||||
"argnames": ("path", "recursive", "maxdepth", "expected"),
|
||||
"argvalues": [
|
||||
("fil?1", False, None, ["file1"]),
|
||||
("fil?1", True, None, ["file1"]),
|
||||
("file[1-2]", False, None, ["file1", "file2"]),
|
||||
("file[1-2]", True, None, ["file1", "file2"]),
|
||||
("*", False, None, ["file1", "file2"]),
|
||||
(
|
||||
"*",
|
||||
True,
|
||||
None,
|
||||
[
|
||||
"file1",
|
||||
"file2",
|
||||
"subdir0/subfile1",
|
||||
"subdir0/subfile2",
|
||||
"subdir0/nesteddir/nestedfile",
|
||||
"subdir1/subfile1",
|
||||
"subdir1/subfile2",
|
||||
"subdir1/nesteddir/nestedfile",
|
||||
],
|
||||
),
|
||||
("*", True, 1, ["file1", "file2"]),
|
||||
(
|
||||
"*",
|
||||
True,
|
||||
2,
|
||||
[
|
||||
"file1",
|
||||
"file2",
|
||||
"subdir0/subfile1",
|
||||
"subdir0/subfile2",
|
||||
"subdir1/subfile1",
|
||||
"subdir1/subfile2",
|
||||
],
|
||||
),
|
||||
("*1", False, None, ["file1"]),
|
||||
(
|
||||
"*1",
|
||||
True,
|
||||
None,
|
||||
[
|
||||
"file1",
|
||||
"subdir1/subfile1",
|
||||
"subdir1/subfile2",
|
||||
"subdir1/nesteddir/nestedfile",
|
||||
],
|
||||
),
|
||||
("*1", True, 2, ["file1", "subdir1/subfile1", "subdir1/subfile2"]),
|
||||
(
|
||||
"**",
|
||||
False,
|
||||
None,
|
||||
[
|
||||
"file1",
|
||||
"file2",
|
||||
"subdir0/subfile1",
|
||||
"subdir0/subfile2",
|
||||
"subdir0/nesteddir/nestedfile",
|
||||
"subdir1/subfile1",
|
||||
"subdir1/subfile2",
|
||||
"subdir1/nesteddir/nestedfile",
|
||||
],
|
||||
),
|
||||
(
|
||||
"**",
|
||||
True,
|
||||
None,
|
||||
[
|
||||
"file1",
|
||||
"file2",
|
||||
"subdir0/subfile1",
|
||||
"subdir0/subfile2",
|
||||
"subdir0/nesteddir/nestedfile",
|
||||
"subdir1/subfile1",
|
||||
"subdir1/subfile2",
|
||||
"subdir1/nesteddir/nestedfile",
|
||||
],
|
||||
),
|
||||
("**", True, 1, ["file1", "file2"]),
|
||||
(
|
||||
"**",
|
||||
True,
|
||||
2,
|
||||
[
|
||||
"file1",
|
||||
"file2",
|
||||
"subdir0/subfile1",
|
||||
"subdir0/subfile2",
|
||||
"subdir0/nesteddir/nestedfile",
|
||||
"subdir1/subfile1",
|
||||
"subdir1/subfile2",
|
||||
"subdir1/nesteddir/nestedfile",
|
||||
],
|
||||
),
|
||||
(
|
||||
"**",
|
||||
False,
|
||||
2,
|
||||
[
|
||||
"file1",
|
||||
"file2",
|
||||
"subdir0/subfile1",
|
||||
"subdir0/subfile2",
|
||||
"subdir1/subfile1",
|
||||
"subdir1/subfile2",
|
||||
],
|
||||
),
|
||||
("**/*1", False, None, ["file1", "subdir0/subfile1", "subdir1/subfile1"]),
|
||||
(
|
||||
"**/*1",
|
||||
True,
|
||||
None,
|
||||
[
|
||||
"file1",
|
||||
"subdir0/subfile1",
|
||||
"subdir1/subfile1",
|
||||
"subdir1/subfile2",
|
||||
"subdir1/nesteddir/nestedfile",
|
||||
],
|
||||
),
|
||||
("**/*1", True, 1, ["file1"]),
|
||||
(
|
||||
"**/*1",
|
||||
True,
|
||||
2,
|
||||
["file1", "subdir0/subfile1", "subdir1/subfile1", "subdir1/subfile2"],
|
||||
),
|
||||
("**/*1", False, 2, ["file1", "subdir0/subfile1", "subdir1/subfile1"]),
|
||||
("**/subdir0", False, None, []),
|
||||
("**/subdir0", True, None, ["subfile1", "subfile2", "nesteddir/nestedfile"]),
|
||||
("**/subdir0/nested*", False, 2, []),
|
||||
("**/subdir0/nested*", True, 2, ["nestedfile"]),
|
||||
("subdir[1-2]", False, None, []),
|
||||
("subdir[1-2]", True, None, ["subfile1", "subfile2", "nesteddir/nestedfile"]),
|
||||
("subdir[1-2]", True, 2, ["subfile1", "subfile2"]),
|
||||
("subdir[0-1]", False, None, []),
|
||||
(
|
||||
"subdir[0-1]",
|
||||
True,
|
||||
None,
|
||||
[
|
||||
"subdir0/subfile1",
|
||||
"subdir0/subfile2",
|
||||
"subdir0/nesteddir/nestedfile",
|
||||
"subdir1/subfile1",
|
||||
"subdir1/subfile2",
|
||||
"subdir1/nesteddir/nestedfile",
|
||||
],
|
||||
),
|
||||
(
|
||||
"subdir[0-1]/*fil[e]*",
|
||||
False,
|
||||
None,
|
||||
[
|
||||
"subdir0/subfile1",
|
||||
"subdir0/subfile2",
|
||||
"subdir1/subfile1",
|
||||
"subdir1/subfile2",
|
||||
],
|
||||
),
|
||||
(
|
||||
"subdir[0-1]/*fil[e]*",
|
||||
True,
|
||||
None,
|
||||
[
|
||||
"subdir0/subfile1",
|
||||
"subdir0/subfile2",
|
||||
"subdir1/subfile1",
|
||||
"subdir1/subfile2",
|
||||
],
|
||||
),
|
||||
],
|
||||
}
|
||||
@@ -0,0 +1,557 @@
|
||||
from hashlib import md5
|
||||
from itertools import product
|
||||
|
||||
import pytest
|
||||
|
||||
from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
|
||||
|
||||
|
||||
class AbstractCopyTests:
|
||||
def test_copy_file_to_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
fs_target,
|
||||
supports_empty_directories,
|
||||
):
|
||||
# Copy scenario 1a
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
if not supports_empty_directories:
|
||||
# Force target directory to exist by adding a dummy file
|
||||
fs.touch(fs_join(target, "dummy"))
|
||||
assert fs.isdir(target)
|
||||
|
||||
target_file2 = fs_join(target, "file2")
|
||||
target_subfile1 = fs_join(target, "subfile1")
|
||||
|
||||
# Copy from source directory
|
||||
fs.cp(fs_join(source, "file2"), target)
|
||||
assert fs.isfile(target_file2)
|
||||
|
||||
# Copy from sub directory
|
||||
fs.cp(fs_join(source, "subdir", "subfile1"), target)
|
||||
assert fs.isfile(target_subfile1)
|
||||
|
||||
# Remove copied files
|
||||
fs.rm([target_file2, target_subfile1])
|
||||
assert not fs.exists(target_file2)
|
||||
assert not fs.exists(target_subfile1)
|
||||
|
||||
# Repeat with trailing slash on target
|
||||
fs.cp(fs_join(source, "file2"), target + "/")
|
||||
assert fs.isdir(target)
|
||||
assert fs.isfile(target_file2)
|
||||
|
||||
fs.cp(fs_join(source, "subdir", "subfile1"), target + "/")
|
||||
assert fs.isfile(target_subfile1)
|
||||
|
||||
def test_copy_file_to_new_directory(
|
||||
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
|
||||
):
|
||||
# Copy scenario 1b
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
|
||||
fs.cp(
|
||||
fs_join(source, "subdir", "subfile1"), fs_join(target, "newdir/")
|
||||
) # Note trailing slash
|
||||
assert fs.isdir(target)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
|
||||
def test_copy_file_to_file_in_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
fs_target,
|
||||
supports_empty_directories,
|
||||
):
|
||||
# Copy scenario 1c
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
if not supports_empty_directories:
|
||||
# Force target directory to exist by adding a dummy file
|
||||
fs.touch(fs_join(target, "dummy"))
|
||||
assert fs.isdir(target)
|
||||
|
||||
fs.cp(fs_join(source, "subdir", "subfile1"), fs_join(target, "newfile"))
|
||||
assert fs.isfile(fs_join(target, "newfile"))
|
||||
|
||||
def test_copy_file_to_file_in_new_directory(
|
||||
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
|
||||
):
|
||||
# Copy scenario 1d
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
|
||||
fs.cp(
|
||||
fs_join(source, "subdir", "subfile1"), fs_join(target, "newdir", "newfile")
|
||||
)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "newfile"))
|
||||
|
||||
def test_copy_directory_to_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
fs_target,
|
||||
supports_empty_directories,
|
||||
):
|
||||
# Copy scenario 1e
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
if not supports_empty_directories:
|
||||
# Force target directory to exist by adding a dummy file
|
||||
dummy = fs_join(target, "dummy")
|
||||
fs.touch(dummy)
|
||||
assert fs.isdir(target)
|
||||
|
||||
for source_slash, target_slash in zip([False, True], [False, True]):
|
||||
s = fs_join(source, "subdir")
|
||||
if source_slash:
|
||||
s += "/"
|
||||
t = target + "/" if target_slash else target
|
||||
|
||||
# Without recursive does nothing
|
||||
fs.cp(s, t)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
# With recursive
|
||||
fs.cp(s, t, recursive=True)
|
||||
if source_slash:
|
||||
assert fs.isfile(fs_join(target, "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subfile2"))
|
||||
assert fs.isdir(fs_join(target, "nesteddir"))
|
||||
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(
|
||||
[
|
||||
fs_join(target, "subfile1"),
|
||||
fs_join(target, "subfile2"),
|
||||
fs_join(target, "nesteddir"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
else:
|
||||
assert fs.isdir(fs_join(target, "subdir"))
|
||||
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
|
||||
assert fs.isdir(fs_join(target, "subdir", "nesteddir"))
|
||||
assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile"))
|
||||
|
||||
fs.rm(fs_join(target, "subdir"), recursive=True)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
# Limit recursive by maxdepth
|
||||
fs.cp(s, t, recursive=True, maxdepth=1)
|
||||
if source_slash:
|
||||
assert fs.isfile(fs_join(target, "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subfile2"))
|
||||
assert not fs.exists(fs_join(target, "nesteddir"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(
|
||||
[
|
||||
fs_join(target, "subfile1"),
|
||||
fs_join(target, "subfile2"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
else:
|
||||
assert fs.isdir(fs_join(target, "subdir"))
|
||||
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
|
||||
assert not fs.exists(fs_join(target, "subdir", "nesteddir"))
|
||||
|
||||
fs.rm(fs_join(target, "subdir"), recursive=True)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
def test_copy_directory_to_new_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
fs_target,
|
||||
supports_empty_directories,
|
||||
):
|
||||
# Copy scenario 1f
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
|
||||
for source_slash, target_slash in zip([False, True], [False, True]):
|
||||
s = fs_join(source, "subdir")
|
||||
if source_slash:
|
||||
s += "/"
|
||||
t = fs_join(target, "newdir")
|
||||
if target_slash:
|
||||
t += "/"
|
||||
|
||||
# Without recursive does nothing
|
||||
fs.cp(s, t)
|
||||
if supports_empty_directories:
|
||||
assert fs.ls(target) == []
|
||||
else:
|
||||
with pytest.raises(FileNotFoundError):
|
||||
fs.ls(target)
|
||||
|
||||
# With recursive
|
||||
fs.cp(s, t, recursive=True)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
||||
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(fs_join(target, "newdir"), recursive=True)
|
||||
assert not fs.exists(fs_join(target, "newdir"))
|
||||
|
||||
# Limit recursive by maxdepth
|
||||
fs.cp(s, t, recursive=True, maxdepth=1)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(fs_join(target, "newdir"), recursive=True)
|
||||
assert not fs.exists(fs_join(target, "newdir"))
|
||||
|
||||
def test_copy_glob_to_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
fs_target,
|
||||
supports_empty_directories,
|
||||
):
|
||||
# Copy scenario 1g
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
if not supports_empty_directories:
|
||||
# Force target directory to exist by adding a dummy file
|
||||
dummy = fs_join(target, "dummy")
|
||||
fs.touch(dummy)
|
||||
assert fs.isdir(target)
|
||||
|
||||
for target_slash in [False, True]:
|
||||
t = target + "/" if target_slash else target
|
||||
|
||||
# Without recursive
|
||||
fs.cp(fs_join(source, "subdir", "*"), t)
|
||||
assert fs.isfile(fs_join(target, "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subfile2"))
|
||||
assert not fs.isdir(fs_join(target, "nesteddir"))
|
||||
assert not fs.exists(fs_join(target, "nesteddir", "nestedfile"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(
|
||||
[
|
||||
fs_join(target, "subfile1"),
|
||||
fs_join(target, "subfile2"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
# With recursive
|
||||
for glob, recursive in zip(["*", "**"], [True, False]):
|
||||
fs.cp(fs_join(source, "subdir", glob), t, recursive=recursive)
|
||||
assert fs.isfile(fs_join(target, "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subfile2"))
|
||||
assert fs.isdir(fs_join(target, "nesteddir"))
|
||||
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(
|
||||
[
|
||||
fs_join(target, "subfile1"),
|
||||
fs_join(target, "subfile2"),
|
||||
fs_join(target, "nesteddir"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
# Limit recursive by maxdepth
|
||||
fs.cp(
|
||||
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
|
||||
)
|
||||
assert fs.isfile(fs_join(target, "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subfile2"))
|
||||
assert not fs.exists(fs_join(target, "nesteddir"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(
|
||||
[
|
||||
fs_join(target, "subfile1"),
|
||||
fs_join(target, "subfile2"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
def test_copy_glob_to_new_directory(
|
||||
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
|
||||
):
|
||||
# Copy scenario 1h
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
|
||||
for target_slash in [False, True]:
|
||||
t = fs_join(target, "newdir")
|
||||
if target_slash:
|
||||
t += "/"
|
||||
|
||||
# Without recursive
|
||||
fs.cp(fs_join(source, "subdir", "*"), t)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "subdir"))
|
||||
|
||||
fs.rm(fs_join(target, "newdir"), recursive=True)
|
||||
assert not fs.exists(fs_join(target, "newdir"))
|
||||
|
||||
# With recursive
|
||||
for glob, recursive in zip(["*", "**"], [True, False]):
|
||||
fs.cp(fs_join(source, "subdir", glob), t, recursive=recursive)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
||||
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "subdir"))
|
||||
|
||||
fs.rm(fs_join(target, "newdir"), recursive=True)
|
||||
assert not fs.exists(fs_join(target, "newdir"))
|
||||
|
||||
# Limit recursive by maxdepth
|
||||
fs.cp(
|
||||
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
|
||||
)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "subdir"))
|
||||
|
||||
fs.rm(fs_join(target, "newdir"), recursive=True)
|
||||
assert not fs.exists(fs_join(target, "newdir"))
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
GLOB_EDGE_CASES_TESTS["argnames"],
|
||||
GLOB_EDGE_CASES_TESTS["argvalues"],
|
||||
)
|
||||
def test_copy_glob_edge_cases(
|
||||
self,
|
||||
path,
|
||||
recursive,
|
||||
maxdepth,
|
||||
expected,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_glob_edge_cases_files,
|
||||
fs_target,
|
||||
fs_sanitize_path,
|
||||
):
|
||||
# Copy scenario 1g
|
||||
source = fs_glob_edge_cases_files
|
||||
|
||||
target = fs_target
|
||||
|
||||
for new_dir, target_slash in product([True, False], [True, False]):
|
||||
fs.mkdir(target)
|
||||
|
||||
t = fs_join(target, "newdir") if new_dir else target
|
||||
t = t + "/" if target_slash else t
|
||||
|
||||
fs.copy(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
|
||||
|
||||
output = fs.find(target)
|
||||
if new_dir:
|
||||
prefixed_expected = [
|
||||
fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected
|
||||
]
|
||||
else:
|
||||
prefixed_expected = [
|
||||
fs_sanitize_path(fs_join(target, p)) for p in expected
|
||||
]
|
||||
assert sorted(output) == sorted(prefixed_expected)
|
||||
|
||||
try:
|
||||
fs.rm(target, recursive=True)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
|
||||
def test_copy_list_of_files_to_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
fs_target,
|
||||
supports_empty_directories,
|
||||
):
|
||||
# Copy scenario 2a
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
if not supports_empty_directories:
|
||||
# Force target directory to exist by adding a dummy file
|
||||
dummy = fs_join(target, "dummy")
|
||||
fs.touch(dummy)
|
||||
assert fs.isdir(target)
|
||||
|
||||
source_files = [
|
||||
fs_join(source, "file1"),
|
||||
fs_join(source, "file2"),
|
||||
fs_join(source, "subdir", "subfile1"),
|
||||
]
|
||||
|
||||
for target_slash in [False, True]:
|
||||
t = target + "/" if target_slash else target
|
||||
|
||||
fs.cp(source_files, t)
|
||||
assert fs.isfile(fs_join(target, "file1"))
|
||||
assert fs.isfile(fs_join(target, "file2"))
|
||||
assert fs.isfile(fs_join(target, "subfile1"))
|
||||
|
||||
fs.rm(
|
||||
[
|
||||
fs_join(target, "file1"),
|
||||
fs_join(target, "file2"),
|
||||
fs_join(target, "subfile1"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
    def test_copy_list_of_files_to_new_directory(
        self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
    ):
        # Copy scenario 2b
        source = fs_bulk_operations_scenario_0

        target = fs_target
        fs.mkdir(target)

        source_files = [
            fs_join(source, "file1"),
            fs_join(source, "file2"),
            fs_join(source, "subdir", "subfile1"),
        ]

        fs.cp(source_files, fs_join(target, "newdir") + "/")  # Note trailing slash
        assert fs.isdir(fs_join(target, "newdir"))
        assert fs.isfile(fs_join(target, "newdir", "file1"))
        assert fs.isfile(fs_join(target, "newdir", "file2"))
        assert fs.isfile(fs_join(target, "newdir", "subfile1"))

    def test_copy_two_files_new_directory(
        self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
    ):
        # This is a duplicate of test_copy_list_of_files_to_new_directory and
        # can eventually be removed.
        source = fs_bulk_operations_scenario_0

        target = fs_target
        assert not fs.exists(target)
        fs.cp([fs_join(source, "file1"), fs_join(source, "file2")], target)

        assert fs.isdir(target)
        assert fs.isfile(fs_join(target, "file1"))
        assert fs.isfile(fs_join(target, "file2"))

    def test_copy_directory_without_files_with_same_name_prefix(
        self,
        fs,
        fs_join,
        fs_target,
        fs_dir_and_file_with_same_name_prefix,
        supports_empty_directories,
    ):
        # Create the test dirs
        source = fs_dir_and_file_with_same_name_prefix
        target = fs_target

        # Test without glob
        fs.cp(fs_join(source, "subdir"), target, recursive=True)

        assert fs.isfile(fs_join(target, "subfile.txt"))
        assert not fs.isfile(fs_join(target, "subdir.txt"))

        fs.rm([fs_join(target, "subfile.txt")])
        if supports_empty_directories:
            assert fs.ls(target) == []
        else:
            assert not fs.exists(target)

        # Test with glob
        fs.cp(fs_join(source, "subdir*"), target, recursive=True)

        assert fs.isdir(fs_join(target, "subdir"))
        assert fs.isfile(fs_join(target, "subdir", "subfile.txt"))
        assert fs.isfile(fs_join(target, "subdir.txt"))

    def test_copy_with_source_and_destination_as_list(
        self, fs, fs_target, fs_join, fs_10_files_with_hashed_names
    ):
        # Create the test dir
        source = fs_10_files_with_hashed_names
        target = fs_target

        # Create list of files for source and destination
        source_files = []
        destination_files = []
        for i in range(10):
            hashed_i = md5(str(i).encode("utf-8")).hexdigest()
            source_files.append(fs_join(source, f"{hashed_i}.txt"))
            destination_files.append(fs_join(target, f"{hashed_i}.txt"))

        # Copy and assert order was kept
        fs.copy(path1=source_files, path2=destination_files)

        for i in range(10):
            file_content = fs.cat(destination_files[i]).decode("utf-8")
            assert file_content == str(i)
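# Illustrative sketch (not part of the vendored module): the paired-list form of
# fs.copy exercised by the test above copies source_files[i] to
# destination_files[i], keeping the order. The "memory" filesystem and the paths
# below are assumptions chosen so the snippet is self-contained.
import fsspec

fs = fsspec.filesystem("memory")
fs.pipe("/src/a.txt", b"a")
fs.pipe("/src/b.txt", b"b")
fs.copy(["/src/a.txt", "/src/b.txt"], ["/dst/a.txt", "/dst/b.txt"])
assert fs.cat("/dst/a.txt") == b"a"
assert fs.cat("/dst/b.txt") == b"b"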
@@ -0,0 +1,587 @@
|
||||
from hashlib import md5
|
||||
from itertools import product
|
||||
|
||||
import pytest
|
||||
|
||||
from fsspec.implementations.local import make_path_posix
|
||||
from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
|
||||
|
||||
|
||||
class AbstractGetTests:
|
||||
def test_get_file_to_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
):
|
||||
# Copy scenario 1a
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = local_target
|
||||
local_fs.mkdir(target)
|
||||
assert local_fs.isdir(target)
|
||||
|
||||
target_file2 = local_join(target, "file2")
|
||||
target_subfile1 = local_join(target, "subfile1")
|
||||
|
||||
# Copy from source directory
|
||||
fs.get(fs_join(source, "file2"), target)
|
||||
assert local_fs.isfile(target_file2)
|
||||
|
||||
# Copy from sub directory
|
||||
fs.get(fs_join(source, "subdir", "subfile1"), target)
|
||||
assert local_fs.isfile(target_subfile1)
|
||||
|
||||
# Remove copied files
|
||||
local_fs.rm([target_file2, target_subfile1])
|
||||
assert not local_fs.exists(target_file2)
|
||||
assert not local_fs.exists(target_subfile1)
|
||||
|
||||
# Repeat with trailing slash on target
|
||||
fs.get(fs_join(source, "file2"), target + "/")
|
||||
assert local_fs.isdir(target)
|
||||
assert local_fs.isfile(target_file2)
|
||||
|
||||
fs.get(fs_join(source, "subdir", "subfile1"), target + "/")
|
||||
assert local_fs.isfile(target_subfile1)
|
||||
|
||||
def test_get_file_to_new_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
):
|
||||
# Copy scenario 1b
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = local_target
|
||||
local_fs.mkdir(target)
|
||||
|
||||
fs.get(
|
||||
fs_join(source, "subdir", "subfile1"), local_join(target, "newdir/")
|
||||
) # Note trailing slash
|
||||
|
||||
assert local_fs.isdir(target)
|
||||
assert local_fs.isdir(local_join(target, "newdir"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
||||
|
||||
def test_get_file_to_file_in_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
):
|
||||
# Copy scenario 1c
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = local_target
|
||||
local_fs.mkdir(target)
|
||||
|
||||
fs.get(fs_join(source, "subdir", "subfile1"), local_join(target, "newfile"))
|
||||
assert local_fs.isfile(local_join(target, "newfile"))
|
||||
|
||||
def test_get_file_to_file_in_new_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
):
|
||||
# Copy scenario 1d
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = local_target
|
||||
local_fs.mkdir(target)
|
||||
|
||||
fs.get(
|
||||
fs_join(source, "subdir", "subfile1"),
|
||||
local_join(target, "newdir", "newfile"),
|
||||
)
|
||||
assert local_fs.isdir(local_join(target, "newdir"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "newfile"))
|
||||
|
||||
def test_get_directory_to_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
):
|
||||
# Copy scenario 1e
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = local_target
|
||||
local_fs.mkdir(target)
|
||||
assert local_fs.isdir(target)
|
||||
|
||||
for source_slash, target_slash in zip([False, True], [False, True]):
|
||||
s = fs_join(source, "subdir")
|
||||
if source_slash:
|
||||
s += "/"
|
||||
t = target + "/" if target_slash else target
|
||||
|
||||
# Without recursive does nothing
|
||||
fs.get(s, t)
|
||||
assert local_fs.ls(target) == []
|
||||
|
||||
# With recursive
|
||||
fs.get(s, t, recursive=True)
|
||||
if source_slash:
|
||||
assert local_fs.isfile(local_join(target, "subfile1"))
|
||||
assert local_fs.isfile(local_join(target, "subfile2"))
|
||||
assert local_fs.isdir(local_join(target, "nesteddir"))
|
||||
assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
|
||||
assert not local_fs.exists(local_join(target, "subdir"))
|
||||
|
||||
local_fs.rm(
|
||||
[
|
||||
local_join(target, "subfile1"),
|
||||
local_join(target, "subfile2"),
|
||||
local_join(target, "nesteddir"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
else:
|
||||
assert local_fs.isdir(local_join(target, "subdir"))
|
||||
assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
|
||||
assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
|
||||
assert local_fs.isdir(local_join(target, "subdir", "nesteddir"))
|
||||
assert local_fs.isfile(
|
||||
local_join(target, "subdir", "nesteddir", "nestedfile")
|
||||
)
|
||||
|
||||
local_fs.rm(local_join(target, "subdir"), recursive=True)
|
||||
assert local_fs.ls(target) == []
|
||||
|
||||
# Limit recursive by maxdepth
|
||||
fs.get(s, t, recursive=True, maxdepth=1)
|
||||
if source_slash:
|
||||
assert local_fs.isfile(local_join(target, "subfile1"))
|
||||
assert local_fs.isfile(local_join(target, "subfile2"))
|
||||
assert not local_fs.exists(local_join(target, "nesteddir"))
|
||||
assert not local_fs.exists(local_join(target, "subdir"))
|
||||
|
||||
local_fs.rm(
|
||||
[
|
||||
local_join(target, "subfile1"),
|
||||
local_join(target, "subfile2"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
else:
|
||||
assert local_fs.isdir(local_join(target, "subdir"))
|
||||
assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
|
||||
assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
|
||||
assert not local_fs.exists(local_join(target, "subdir", "nesteddir"))
|
||||
|
||||
local_fs.rm(local_join(target, "subdir"), recursive=True)
|
||||
assert local_fs.ls(target) == []
|
||||
|
||||
def test_get_directory_to_new_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
):
|
||||
# Copy scenario 1f
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = local_target
|
||||
local_fs.mkdir(target)
|
||||
|
||||
for source_slash, target_slash in zip([False, True], [False, True]):
|
||||
s = fs_join(source, "subdir")
|
||||
if source_slash:
|
||||
s += "/"
|
||||
t = local_join(target, "newdir")
|
||||
if target_slash:
|
||||
t += "/"
|
||||
|
||||
# Without recursive does nothing
|
||||
fs.get(s, t)
|
||||
assert local_fs.ls(target) == []
|
||||
|
||||
# With recursive
|
||||
fs.get(s, t, recursive=True)
|
||||
assert local_fs.isdir(local_join(target, "newdir"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
|
||||
assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
|
||||
assert local_fs.isfile(
|
||||
local_join(target, "newdir", "nesteddir", "nestedfile")
|
||||
)
|
||||
assert not local_fs.exists(local_join(target, "subdir"))
|
||||
|
||||
local_fs.rm(local_join(target, "newdir"), recursive=True)
|
||||
assert local_fs.ls(target) == []
|
||||
|
||||
# Limit recursive by maxdepth
|
||||
fs.get(s, t, recursive=True, maxdepth=1)
|
||||
assert local_fs.isdir(local_join(target, "newdir"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
|
||||
assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
|
||||
assert not local_fs.exists(local_join(target, "subdir"))
|
||||
|
||||
local_fs.rm(local_join(target, "newdir"), recursive=True)
|
||||
assert not local_fs.exists(local_join(target, "newdir"))
|
||||
|
||||
def test_get_glob_to_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
):
|
||||
# Copy scenario 1g
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = local_target
|
||||
local_fs.mkdir(target)
|
||||
|
||||
for target_slash in [False, True]:
|
||||
t = target + "/" if target_slash else target
|
||||
|
||||
# Without recursive
|
||||
fs.get(fs_join(source, "subdir", "*"), t)
|
||||
assert local_fs.isfile(local_join(target, "subfile1"))
|
||||
assert local_fs.isfile(local_join(target, "subfile2"))
|
||||
assert not local_fs.isdir(local_join(target, "nesteddir"))
|
||||
assert not local_fs.exists(local_join(target, "nesteddir", "nestedfile"))
|
||||
assert not local_fs.exists(local_join(target, "subdir"))
|
||||
|
||||
local_fs.rm(
|
||||
[
|
||||
local_join(target, "subfile1"),
|
||||
local_join(target, "subfile2"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert local_fs.ls(target) == []
|
||||
|
||||
# With recursive
|
||||
for glob, recursive in zip(["*", "**"], [True, False]):
|
||||
fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
|
||||
assert local_fs.isfile(local_join(target, "subfile1"))
|
||||
assert local_fs.isfile(local_join(target, "subfile2"))
|
||||
assert local_fs.isdir(local_join(target, "nesteddir"))
|
||||
assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
|
||||
assert not local_fs.exists(local_join(target, "subdir"))
|
||||
|
||||
local_fs.rm(
|
||||
[
|
||||
local_join(target, "subfile1"),
|
||||
local_join(target, "subfile2"),
|
||||
local_join(target, "nesteddir"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert local_fs.ls(target) == []
|
||||
|
||||
# Limit recursive by maxdepth
|
||||
fs.get(
|
||||
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
|
||||
)
|
||||
assert local_fs.isfile(local_join(target, "subfile1"))
|
||||
assert local_fs.isfile(local_join(target, "subfile2"))
|
||||
assert not local_fs.exists(local_join(target, "nesteddir"))
|
||||
assert not local_fs.exists(local_join(target, "subdir"))
|
||||
|
||||
local_fs.rm(
|
||||
[
|
||||
local_join(target, "subfile1"),
|
||||
local_join(target, "subfile2"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert local_fs.ls(target) == []
|
||||
|
||||
def test_get_glob_to_new_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
):
|
||||
# Copy scenario 1h
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = local_target
|
||||
local_fs.mkdir(target)
|
||||
|
||||
for target_slash in [False, True]:
|
||||
t = fs_join(target, "newdir")
|
||||
if target_slash:
|
||||
t += "/"
|
||||
|
||||
# Without recursive
|
||||
fs.get(fs_join(source, "subdir", "*"), t)
|
||||
assert local_fs.isdir(local_join(target, "newdir"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
|
||||
assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
|
||||
assert not local_fs.exists(
|
||||
local_join(target, "newdir", "nesteddir", "nestedfile")
|
||||
)
|
||||
assert not local_fs.exists(local_join(target, "subdir"))
|
||||
assert not local_fs.exists(local_join(target, "newdir", "subdir"))
|
||||
|
||||
local_fs.rm(local_join(target, "newdir"), recursive=True)
|
||||
assert local_fs.ls(target) == []
|
||||
|
||||
# With recursive
|
||||
for glob, recursive in zip(["*", "**"], [True, False]):
|
||||
fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
|
||||
assert local_fs.isdir(local_join(target, "newdir"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
|
||||
assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
|
||||
assert local_fs.isfile(
|
||||
local_join(target, "newdir", "nesteddir", "nestedfile")
|
||||
)
|
||||
assert not local_fs.exists(local_join(target, "subdir"))
|
||||
assert not local_fs.exists(local_join(target, "newdir", "subdir"))
|
||||
|
||||
local_fs.rm(local_join(target, "newdir"), recursive=True)
|
||||
assert not local_fs.exists(local_join(target, "newdir"))
|
||||
|
||||
# Limit recursive by maxdepth
|
||||
fs.get(
|
||||
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
|
||||
)
|
||||
assert local_fs.isdir(local_join(target, "newdir"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
|
||||
assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
|
||||
assert not local_fs.exists(local_join(target, "subdir"))
|
||||
assert not local_fs.exists(local_join(target, "newdir", "subdir"))
|
||||
|
||||
local_fs.rm(local_fs.ls(target, detail=False), recursive=True)
|
||||
assert not local_fs.exists(local_join(target, "newdir"))
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
GLOB_EDGE_CASES_TESTS["argnames"],
|
||||
GLOB_EDGE_CASES_TESTS["argvalues"],
|
||||
)
|
||||
def test_get_glob_edge_cases(
|
||||
self,
|
||||
path,
|
||||
recursive,
|
||||
maxdepth,
|
||||
expected,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_glob_edge_cases_files,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
):
|
||||
# Copy scenario 1g
|
||||
source = fs_glob_edge_cases_files
|
||||
|
||||
target = local_target
|
||||
|
||||
for new_dir, target_slash in product([True, False], [True, False]):
|
||||
local_fs.mkdir(target)
|
||||
|
||||
t = local_join(target, "newdir") if new_dir else target
|
||||
t = t + "/" if target_slash else t
|
||||
|
||||
fs.get(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
|
||||
|
||||
output = local_fs.find(target)
|
||||
if new_dir:
|
||||
prefixed_expected = [
|
||||
make_path_posix(local_join(target, "newdir", p)) for p in expected
|
||||
]
|
||||
else:
|
||||
prefixed_expected = [
|
||||
make_path_posix(local_join(target, p)) for p in expected
|
||||
]
|
||||
assert sorted(output) == sorted(prefixed_expected)
|
||||
|
||||
try:
|
||||
local_fs.rm(target, recursive=True)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
|
||||
def test_get_list_of_files_to_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
):
|
||||
# Copy scenario 2a
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = local_target
|
||||
local_fs.mkdir(target)
|
||||
|
||||
source_files = [
|
||||
fs_join(source, "file1"),
|
||||
fs_join(source, "file2"),
|
||||
fs_join(source, "subdir", "subfile1"),
|
||||
]
|
||||
|
||||
for target_slash in [False, True]:
|
||||
t = target + "/" if target_slash else target
|
||||
|
||||
fs.get(source_files, t)
|
||||
assert local_fs.isfile(local_join(target, "file1"))
|
||||
assert local_fs.isfile(local_join(target, "file2"))
|
||||
assert local_fs.isfile(local_join(target, "subfile1"))
|
||||
|
||||
local_fs.rm(
|
||||
[
|
||||
local_join(target, "file1"),
|
||||
local_join(target, "file2"),
|
||||
local_join(target, "subfile1"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert local_fs.ls(target) == []
|
||||
|
||||
def test_get_list_of_files_to_new_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_bulk_operations_scenario_0,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
):
|
||||
# Copy scenario 2b
|
||||
source = fs_bulk_operations_scenario_0
|
||||
|
||||
target = local_target
|
||||
local_fs.mkdir(target)
|
||||
|
||||
source_files = [
|
||||
fs_join(source, "file1"),
|
||||
fs_join(source, "file2"),
|
||||
fs_join(source, "subdir", "subfile1"),
|
||||
]
|
||||
|
||||
fs.get(source_files, local_join(target, "newdir") + "/") # Note trailing slash
|
||||
assert local_fs.isdir(local_join(target, "newdir"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "file1"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "file2"))
|
||||
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
||||
|
||||
def test_get_directory_recursive(
|
||||
self, fs, fs_join, fs_path, local_fs, local_join, local_target
|
||||
):
|
||||
# https://github.com/fsspec/filesystem_spec/issues/1062
|
||||
# Recursive cp/get/put of source directory into non-existent target directory.
|
||||
src = fs_join(fs_path, "src")
|
||||
src_file = fs_join(src, "file")
|
||||
fs.mkdir(src)
|
||||
fs.touch(src_file)
|
||||
|
||||
target = local_target
|
||||
|
||||
# get without slash
|
||||
assert not local_fs.exists(target)
|
||||
for loop in range(2):
|
||||
fs.get(src, target, recursive=True)
|
||||
assert local_fs.isdir(target)
|
||||
|
||||
if loop == 0:
|
||||
assert local_fs.isfile(local_join(target, "file"))
|
||||
assert not local_fs.exists(local_join(target, "src"))
|
||||
else:
|
||||
assert local_fs.isfile(local_join(target, "file"))
|
||||
assert local_fs.isdir(local_join(target, "src"))
|
||||
assert local_fs.isfile(local_join(target, "src", "file"))
|
||||
|
||||
local_fs.rm(target, recursive=True)
|
||||
|
||||
# get with slash
|
||||
assert not local_fs.exists(target)
|
||||
for loop in range(2):
|
||||
fs.get(src + "/", target, recursive=True)
|
||||
assert local_fs.isdir(target)
|
||||
assert local_fs.isfile(local_join(target, "file"))
|
||||
assert not local_fs.exists(local_join(target, "src"))
|
||||
|
||||
def test_get_directory_without_files_with_same_name_prefix(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
fs_dir_and_file_with_same_name_prefix,
|
||||
):
|
||||
# Create the test dirs
|
||||
source = fs_dir_and_file_with_same_name_prefix
|
||||
target = local_target
|
||||
|
||||
# Test without glob
|
||||
fs.get(fs_join(source, "subdir"), target, recursive=True)
|
||||
|
||||
assert local_fs.isfile(local_join(target, "subfile.txt"))
|
||||
assert not local_fs.isfile(local_join(target, "subdir.txt"))
|
||||
|
||||
local_fs.rm([local_join(target, "subfile.txt")])
|
||||
assert local_fs.ls(target) == []
|
||||
|
||||
# Test with glob
|
||||
fs.get(fs_join(source, "subdir*"), target, recursive=True)
|
||||
|
||||
assert local_fs.isdir(local_join(target, "subdir"))
|
||||
assert local_fs.isfile(local_join(target, "subdir", "subfile.txt"))
|
||||
assert local_fs.isfile(local_join(target, "subdir.txt"))
|
||||
|
||||
def test_get_with_source_and_destination_as_list(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
local_fs,
|
||||
local_join,
|
||||
local_target,
|
||||
fs_10_files_with_hashed_names,
|
||||
):
|
||||
# Create the test dir
|
||||
source = fs_10_files_with_hashed_names
|
||||
target = local_target
|
||||
|
||||
# Create list of files for source and destination
|
||||
source_files = []
|
||||
destination_files = []
|
||||
for i in range(10):
|
||||
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
|
||||
source_files.append(fs_join(source, f"{hashed_i}.txt"))
|
||||
destination_files.append(
|
||||
make_path_posix(local_join(target, f"{hashed_i}.txt"))
|
||||
)
|
||||
|
||||
# Copy and assert order was kept
|
||||
fs.get(rpath=source_files, lpath=destination_files)
|
||||
|
||||
for i in range(10):
|
||||
file_content = local_fs.cat(destination_files[i]).decode("utf-8")
|
||||
assert file_content == str(i)
|
||||
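# Illustrative sketch (not part of the vendored module): the recursive-get tests
# above distinguish "/src" from "/src/". This hedged example uses the "memory"
# filesystem and a temporary local directory, both assumptions for illustration.
import os
import tempfile

import fsspec

fs = fsspec.filesystem("memory")
fs.pipe("/src/file", b"x")

with tempfile.TemporaryDirectory() as d:
    target = os.path.join(d, "out")
    # Target does not exist yet: the contents of /src land directly in out/
    fs.get("/src", target, recursive=True)
    assert os.path.isfile(os.path.join(target, "file"))
    # A trailing slash copies the *contents* of /src even when out/ already exists
    fs.get("/src/", target, recursive=True)
    assert not os.path.exists(os.path.join(target, "src"))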
@@ -0,0 +1,57 @@
import os

import pytest

import fsspec


def test_move_raises_error_with_tmpdir(tmpdir):
    # Create a file in the temporary directory
    source = tmpdir.join("source_file.txt")
    source.write("content")

    # Define a destination that simulates a protected or invalid path
    destination = tmpdir.join("non_existent_directory/destination_file.txt")

    # Instantiate the filesystem (assuming the local file system interface)
    fs = fsspec.filesystem("file")

    # Use the actual file paths as string
    with pytest.raises(FileNotFoundError):
        fs.mv(str(source), str(destination))


@pytest.mark.parametrize("recursive", (True, False))
def test_move_raises_error_with_tmpdir_permission(recursive, tmpdir):
    # Create a file in the temporary directory
    source = tmpdir.join("source_file.txt")
    source.write("content")

    # Create a protected directory (non-writable)
    protected_dir = tmpdir.mkdir("protected_directory")
    protected_path = str(protected_dir)

    # Set the directory to read-only
    if os.name == "nt":
        os.system(f'icacls "{protected_path}" /deny Everyone:(W)')
    else:
        os.chmod(protected_path, 0o555)  # Sets the directory to read-only

    # Define a destination inside the protected directory
    destination = protected_dir.join("destination_file.txt")

    # Instantiate the filesystem (assuming the local file system interface)
    fs = fsspec.filesystem("file")

    # Try to move the file to the read-only directory, expecting a permission error
    with pytest.raises(PermissionError):
        fs.mv(str(source), str(destination), recursive=recursive)

    # Assert the file was not created in the destination
    assert not os.path.exists(destination)

    # Cleanup: Restore permissions so the directory can be cleaned up
    if os.name == "nt":
        os.system(f'icacls "{protected_path}" /remove:d Everyone')
    else:
        os.chmod(protected_path, 0o755)  # Restore write permission for cleanup
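# Illustrative sketch (not part of the vendored tests): the tests above show that
# fs.mv does not create missing parent directories. Creating the parent first is
# expected to let the move succeed; the temporary paths below are assumptions.
import os
import tempfile

import fsspec

fs = fsspec.filesystem("file")
with tempfile.TemporaryDirectory() as d:
    src = os.path.join(d, "source.txt")
    with open(src, "w") as f:
        f.write("content")
    dst_dir = os.path.join(d, "nested", "dir")
    fs.makedirs(dst_dir, exist_ok=True)  # create the destination parent first
    fs.mv(src, os.path.join(dst_dir, "moved.txt"))
    assert os.path.isfile(os.path.join(dst_dir, "moved.txt"))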
@@ -0,0 +1,11 @@
import pytest


class AbstractOpenTests:
    def test_open_exclusive(self, fs, fs_target):
        with fs.open(fs_target, "wb") as f:
            f.write(b"data")
        with fs.open(fs_target, "rb") as f:
            assert f.read() == b"data"
        with pytest.raises(FileExistsError):
            fs.open(fs_target, "xb")
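# Illustrative sketch (not part of the vendored module): exclusive-create mode as
# exercised by test_open_exclusive above, shown here with the "memory" filesystem
# (an assumption made only so the snippet runs standalone).
import fsspec
import pytest

fs = fsspec.filesystem("memory")
with fs.open("/exclusive.bin", "xb") as f:  # succeeds: file does not exist yet
    f.write(b"data")
with pytest.raises(FileExistsError):
    fs.open("/exclusive.bin", "xb")  # a second exclusive create fails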
@@ -0,0 +1,11 @@
import pytest


class AbstractPipeTests:
    def test_pipe_exclusive(self, fs, fs_target):
        fs.pipe_file(fs_target, b"data")
        assert fs.cat_file(fs_target) == b"data"
        with pytest.raises(FileExistsError):
            fs.pipe_file(fs_target, b"data", mode="create")
        fs.pipe_file(fs_target, b"new data", mode="overwrite")
        assert fs.cat_file(fs_target) == b"new data"
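# Illustrative sketch (not part of the vendored module): the mode argument of
# pipe_file used in test_pipe_exclusive above, shown here on the "memory"
# filesystem (an assumption for a self-contained run).
import fsspec
import pytest

fs = fsspec.filesystem("memory")
fs.pipe_file("/data.bin", b"data")                     # write the file
with pytest.raises(FileExistsError):
    fs.pipe_file("/data.bin", b"data", mode="create")  # exclusive create fails
fs.pipe_file("/data.bin", b"new data", mode="overwrite")
assert fs.cat_file("/data.bin") == b"new data"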
@@ -0,0 +1,591 @@
|
||||
from hashlib import md5
|
||||
from itertools import product
|
||||
|
||||
import pytest
|
||||
|
||||
from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
|
||||
|
||||
|
||||
class AbstractPutTests:
|
||||
def test_put_file_to_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_target,
|
||||
local_join,
|
||||
local_bulk_operations_scenario_0,
|
||||
supports_empty_directories,
|
||||
):
|
||||
# Copy scenario 1a
|
||||
source = local_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
if not supports_empty_directories:
|
||||
# Force target directory to exist by adding a dummy file
|
||||
fs.touch(fs_join(target, "dummy"))
|
||||
assert fs.isdir(target)
|
||||
|
||||
target_file2 = fs_join(target, "file2")
|
||||
target_subfile1 = fs_join(target, "subfile1")
|
||||
|
||||
# Copy from source directory
|
||||
fs.put(local_join(source, "file2"), target)
|
||||
assert fs.isfile(target_file2)
|
||||
|
||||
# Copy from sub directory
|
||||
fs.put(local_join(source, "subdir", "subfile1"), target)
|
||||
assert fs.isfile(target_subfile1)
|
||||
|
||||
# Remove copied files
|
||||
fs.rm([target_file2, target_subfile1])
|
||||
assert not fs.exists(target_file2)
|
||||
assert not fs.exists(target_subfile1)
|
||||
|
||||
# Repeat with trailing slash on target
|
||||
fs.put(local_join(source, "file2"), target + "/")
|
||||
assert fs.isdir(target)
|
||||
assert fs.isfile(target_file2)
|
||||
|
||||
fs.put(local_join(source, "subdir", "subfile1"), target + "/")
|
||||
assert fs.isfile(target_subfile1)
|
||||
|
||||
def test_put_file_to_new_directory(
|
||||
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
|
||||
):
|
||||
# Copy scenario 1b
|
||||
source = local_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
|
||||
fs.put(
|
||||
local_join(source, "subdir", "subfile1"), fs_join(target, "newdir/")
|
||||
) # Note trailing slash
|
||||
assert fs.isdir(target)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
|
||||
def test_put_file_to_file_in_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_target,
|
||||
local_join,
|
||||
supports_empty_directories,
|
||||
local_bulk_operations_scenario_0,
|
||||
):
|
||||
# Copy scenario 1c
|
||||
source = local_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
if not supports_empty_directories:
|
||||
# Force target directory to exist by adding a dummy file
|
||||
fs.touch(fs_join(target, "dummy"))
|
||||
assert fs.isdir(target)
|
||||
|
||||
fs.put(local_join(source, "subdir", "subfile1"), fs_join(target, "newfile"))
|
||||
assert fs.isfile(fs_join(target, "newfile"))
|
||||
|
||||
def test_put_file_to_file_in_new_directory(
|
||||
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
|
||||
):
|
||||
# Copy scenario 1d
|
||||
source = local_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
|
||||
fs.put(
|
||||
local_join(source, "subdir", "subfile1"),
|
||||
fs_join(target, "newdir", "newfile"),
|
||||
)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "newfile"))
|
||||
|
||||
def test_put_directory_to_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_target,
|
||||
local_bulk_operations_scenario_0,
|
||||
supports_empty_directories,
|
||||
):
|
||||
# Copy scenario 1e
|
||||
source = local_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
if not supports_empty_directories:
|
||||
# Force target directory to exist by adding a dummy file
|
||||
dummy = fs_join(target, "dummy")
|
||||
fs.touch(dummy)
|
||||
assert fs.isdir(target)
|
||||
|
||||
for source_slash, target_slash in zip([False, True], [False, True]):
|
||||
s = fs_join(source, "subdir")
|
||||
if source_slash:
|
||||
s += "/"
|
||||
t = target + "/" if target_slash else target
|
||||
|
||||
# Without recursive does nothing
|
||||
fs.put(s, t)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
# With recursive
|
||||
fs.put(s, t, recursive=True)
|
||||
if source_slash:
|
||||
assert fs.isfile(fs_join(target, "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subfile2"))
|
||||
assert fs.isdir(fs_join(target, "nesteddir"))
|
||||
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(
|
||||
[
|
||||
fs_join(target, "subfile1"),
|
||||
fs_join(target, "subfile2"),
|
||||
fs_join(target, "nesteddir"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
else:
|
||||
assert fs.isdir(fs_join(target, "subdir"))
|
||||
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
|
||||
assert fs.isdir(fs_join(target, "subdir", "nesteddir"))
|
||||
assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile"))
|
||||
|
||||
fs.rm(fs_join(target, "subdir"), recursive=True)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
# Limit recursive by maxdepth
|
||||
fs.put(s, t, recursive=True, maxdepth=1)
|
||||
if source_slash:
|
||||
assert fs.isfile(fs_join(target, "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subfile2"))
|
||||
assert not fs.exists(fs_join(target, "nesteddir"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(
|
||||
[
|
||||
fs_join(target, "subfile1"),
|
||||
fs_join(target, "subfile2"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
else:
|
||||
assert fs.isdir(fs_join(target, "subdir"))
|
||||
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
|
||||
assert not fs.exists(fs_join(target, "subdir", "nesteddir"))
|
||||
|
||||
fs.rm(fs_join(target, "subdir"), recursive=True)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
def test_put_directory_to_new_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_target,
|
||||
local_bulk_operations_scenario_0,
|
||||
supports_empty_directories,
|
||||
):
|
||||
# Copy scenario 1f
|
||||
source = local_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
|
||||
for source_slash, target_slash in zip([False, True], [False, True]):
|
||||
s = fs_join(source, "subdir")
|
||||
if source_slash:
|
||||
s += "/"
|
||||
t = fs_join(target, "newdir")
|
||||
if target_slash:
|
||||
t += "/"
|
||||
|
||||
# Without recursive does nothing
|
||||
fs.put(s, t)
|
||||
if supports_empty_directories:
|
||||
assert fs.ls(target) == []
|
||||
else:
|
||||
with pytest.raises(FileNotFoundError):
|
||||
fs.ls(target)
|
||||
|
||||
# With recursive
|
||||
fs.put(s, t, recursive=True)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
||||
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(fs_join(target, "newdir"), recursive=True)
|
||||
assert not fs.exists(fs_join(target, "newdir"))
|
||||
|
||||
# Limit recursive by maxdepth
|
||||
fs.put(s, t, recursive=True, maxdepth=1)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(fs_join(target, "newdir"), recursive=True)
|
||||
assert not fs.exists(fs_join(target, "newdir"))
|
||||
|
||||
def test_put_glob_to_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_target,
|
||||
local_join,
|
||||
supports_empty_directories,
|
||||
local_bulk_operations_scenario_0,
|
||||
):
|
||||
# Copy scenario 1g
|
||||
source = local_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
if not supports_empty_directories:
|
||||
# Force target directory to exist by adding a dummy file
|
||||
dummy = fs_join(target, "dummy")
|
||||
fs.touch(dummy)
|
||||
assert fs.isdir(target)
|
||||
|
||||
for target_slash in [False, True]:
|
||||
t = target + "/" if target_slash else target
|
||||
|
||||
# Without recursive
|
||||
fs.put(local_join(source, "subdir", "*"), t)
|
||||
assert fs.isfile(fs_join(target, "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subfile2"))
|
||||
assert not fs.isdir(fs_join(target, "nesteddir"))
|
||||
assert not fs.exists(fs_join(target, "nesteddir", "nestedfile"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(
|
||||
[
|
||||
fs_join(target, "subfile1"),
|
||||
fs_join(target, "subfile2"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
# With recursive
|
||||
for glob, recursive in zip(["*", "**"], [True, False]):
|
||||
fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
|
||||
assert fs.isfile(fs_join(target, "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subfile2"))
|
||||
assert fs.isdir(fs_join(target, "nesteddir"))
|
||||
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(
|
||||
[
|
||||
fs_join(target, "subfile1"),
|
||||
fs_join(target, "subfile2"),
|
||||
fs_join(target, "nesteddir"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
# Limit recursive by maxdepth
|
||||
fs.put(
|
||||
local_join(source, "subdir", glob),
|
||||
t,
|
||||
recursive=recursive,
|
||||
maxdepth=1,
|
||||
)
|
||||
assert fs.isfile(fs_join(target, "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "subfile2"))
|
||||
assert not fs.exists(fs_join(target, "nesteddir"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
|
||||
fs.rm(
|
||||
[
|
||||
fs_join(target, "subfile1"),
|
||||
fs_join(target, "subfile2"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
def test_put_glob_to_new_directory(
|
||||
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
|
||||
):
|
||||
# Copy scenario 1h
|
||||
source = local_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
|
||||
for target_slash in [False, True]:
|
||||
t = fs_join(target, "newdir")
|
||||
if target_slash:
|
||||
t += "/"
|
||||
|
||||
# Without recursive
|
||||
fs.put(local_join(source, "subdir", "*"), t)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "subdir"))
|
||||
|
||||
fs.rm(fs_join(target, "newdir"), recursive=True)
|
||||
assert not fs.exists(fs_join(target, "newdir"))
|
||||
|
||||
# With recursive
|
||||
for glob, recursive in zip(["*", "**"], [True, False]):
|
||||
fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
||||
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "subdir"))
|
||||
|
||||
fs.rm(fs_join(target, "newdir"), recursive=True)
|
||||
assert not fs.exists(fs_join(target, "newdir"))
|
||||
|
||||
# Limit recursive by maxdepth
|
||||
fs.put(
|
||||
local_join(source, "subdir", glob),
|
||||
t,
|
||||
recursive=recursive,
|
||||
maxdepth=1,
|
||||
)
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
|
||||
assert not fs.exists(fs_join(target, "subdir"))
|
||||
assert not fs.exists(fs_join(target, "newdir", "subdir"))
|
||||
|
||||
fs.rm(fs_join(target, "newdir"), recursive=True)
|
||||
assert not fs.exists(fs_join(target, "newdir"))
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
GLOB_EDGE_CASES_TESTS["argnames"],
|
||||
GLOB_EDGE_CASES_TESTS["argvalues"],
|
||||
)
|
||||
def test_put_glob_edge_cases(
|
||||
self,
|
||||
path,
|
||||
recursive,
|
||||
maxdepth,
|
||||
expected,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_target,
|
||||
local_glob_edge_cases_files,
|
||||
local_join,
|
||||
fs_sanitize_path,
|
||||
):
|
||||
# Copy scenario 1g
|
||||
source = local_glob_edge_cases_files
|
||||
|
||||
target = fs_target
|
||||
|
||||
for new_dir, target_slash in product([True, False], [True, False]):
|
||||
fs.mkdir(target)
|
||||
|
||||
t = fs_join(target, "newdir") if new_dir else target
|
||||
t = t + "/" if target_slash else t
|
||||
|
||||
fs.put(local_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
|
||||
|
||||
output = fs.find(target)
|
||||
if new_dir:
|
||||
prefixed_expected = [
|
||||
fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected
|
||||
]
|
||||
else:
|
||||
prefixed_expected = [
|
||||
fs_sanitize_path(fs_join(target, p)) for p in expected
|
||||
]
|
||||
assert sorted(output) == sorted(prefixed_expected)
|
||||
|
||||
try:
|
||||
fs.rm(target, recursive=True)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
|
||||
def test_put_list_of_files_to_existing_directory(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_target,
|
||||
local_join,
|
||||
local_bulk_operations_scenario_0,
|
||||
supports_empty_directories,
|
||||
):
|
||||
# Copy scenario 2a
|
||||
source = local_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
if not supports_empty_directories:
|
||||
# Force target directory to exist by adding a dummy file
|
||||
dummy = fs_join(target, "dummy")
|
||||
fs.touch(dummy)
|
||||
assert fs.isdir(target)
|
||||
|
||||
source_files = [
|
||||
local_join(source, "file1"),
|
||||
local_join(source, "file2"),
|
||||
local_join(source, "subdir", "subfile1"),
|
||||
]
|
||||
|
||||
for target_slash in [False, True]:
|
||||
t = target + "/" if target_slash else target
|
||||
|
||||
fs.put(source_files, t)
|
||||
assert fs.isfile(fs_join(target, "file1"))
|
||||
assert fs.isfile(fs_join(target, "file2"))
|
||||
assert fs.isfile(fs_join(target, "subfile1"))
|
||||
|
||||
fs.rm(
|
||||
[
|
||||
fs_join(target, "file1"),
|
||||
fs_join(target, "file2"),
|
||||
fs_join(target, "subfile1"),
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert fs.ls(target, detail=False) == (
|
||||
[] if supports_empty_directories else [dummy]
|
||||
)
|
||||
|
||||
def test_put_list_of_files_to_new_directory(
|
||||
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
|
||||
):
|
||||
# Copy scenario 2b
|
||||
source = local_bulk_operations_scenario_0
|
||||
|
||||
target = fs_target
|
||||
fs.mkdir(target)
|
||||
|
||||
source_files = [
|
||||
local_join(source, "file1"),
|
||||
local_join(source, "file2"),
|
||||
local_join(source, "subdir", "subfile1"),
|
||||
]
|
||||
|
||||
fs.put(source_files, fs_join(target, "newdir") + "/") # Note trailing slash
|
||||
assert fs.isdir(fs_join(target, "newdir"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "file1"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "file2"))
|
||||
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
||||
|
||||
def test_put_directory_recursive(
|
||||
self, fs, fs_join, fs_target, local_fs, local_join, local_path
|
||||
):
|
||||
# https://github.com/fsspec/filesystem_spec/issues/1062
|
||||
# Recursive cp/get/put of source directory into non-existent target directory.
|
||||
src = local_join(local_path, "src")
|
||||
src_file = local_join(src, "file")
|
||||
local_fs.mkdir(src)
|
||||
local_fs.touch(src_file)
|
||||
|
||||
target = fs_target
|
||||
|
||||
# put without slash
|
||||
assert not fs.exists(target)
|
||||
for loop in range(2):
|
||||
fs.put(src, target, recursive=True)
|
||||
assert fs.isdir(target)
|
||||
|
||||
if loop == 0:
|
||||
assert fs.isfile(fs_join(target, "file"))
|
||||
assert not fs.exists(fs_join(target, "src"))
|
||||
else:
|
||||
assert fs.isfile(fs_join(target, "file"))
|
||||
assert fs.isdir(fs_join(target, "src"))
|
||||
assert fs.isfile(fs_join(target, "src", "file"))
|
||||
|
||||
fs.rm(target, recursive=True)
|
||||
|
||||
# put with slash
|
||||
assert not fs.exists(target)
|
||||
for loop in range(2):
|
||||
fs.put(src + "/", target, recursive=True)
|
||||
assert fs.isdir(target)
|
||||
assert fs.isfile(fs_join(target, "file"))
|
||||
assert not fs.exists(fs_join(target, "src"))
|
||||
|
||||
def test_put_directory_without_files_with_same_name_prefix(
|
||||
self,
|
||||
fs,
|
||||
fs_join,
|
||||
fs_target,
|
||||
local_join,
|
||||
local_dir_and_file_with_same_name_prefix,
|
||||
supports_empty_directories,
|
||||
):
|
||||
# Create the test dirs
|
||||
source = local_dir_and_file_with_same_name_prefix
|
||||
target = fs_target
|
||||
|
||||
# Test without glob
|
||||
fs.put(local_join(source, "subdir"), fs_target, recursive=True)
|
||||
|
||||
assert fs.isfile(fs_join(fs_target, "subfile.txt"))
|
||||
assert not fs.isfile(fs_join(fs_target, "subdir.txt"))
|
||||
|
||||
fs.rm([fs_join(target, "subfile.txt")])
|
||||
if supports_empty_directories:
|
||||
assert fs.ls(target) == []
|
||||
else:
|
||||
assert not fs.exists(target)
|
||||
|
||||
# Test with glob
|
||||
fs.put(local_join(source, "subdir*"), fs_target, recursive=True)
|
||||
|
||||
assert fs.isdir(fs_join(fs_target, "subdir"))
|
||||
assert fs.isfile(fs_join(fs_target, "subdir", "subfile.txt"))
|
||||
assert fs.isfile(fs_join(fs_target, "subdir.txt"))
|
||||
|
||||
def test_copy_with_source_and_destination_as_list(
|
||||
self, fs, fs_target, fs_join, local_join, local_10_files_with_hashed_names
|
||||
):
|
||||
# Create the test dir
|
||||
source = local_10_files_with_hashed_names
|
||||
target = fs_target
|
||||
|
||||
# Create list of files for source and destination
|
||||
source_files = []
|
||||
destination_files = []
|
||||
for i in range(10):
|
||||
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
|
||||
source_files.append(local_join(source, f"{hashed_i}.txt"))
|
||||
destination_files.append(fs_join(target, f"{hashed_i}.txt"))
|
||||
|
||||
# Copy and assert order was kept
|
||||
fs.put(lpath=source_files, rpath=destination_files)
|
||||
|
||||
for i in range(10):
|
||||
file_content = fs.cat(destination_files[i]).decode("utf-8")
|
||||
assert file_content == str(i)
|
||||
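# Illustrative sketch (not part of the vendored tests): the put/get counterparts
# exercised above move data between a local path and a remote-style filesystem.
# The "memory" filesystem and the temporary paths are assumptions.
import os
import tempfile

import fsspec

fs = fsspec.filesystem("memory")
with tempfile.TemporaryDirectory() as d:
    local_file = os.path.join(d, "data.txt")
    with open(local_file, "w") as f:
        f.write("hello")
    fs.put(local_file, "/uploaded.txt")                        # local -> filesystem
    fs.get("/uploaded.txt", os.path.join(d, "download.txt"))   # filesystem -> local
    with open(os.path.join(d, "download.txt")) as f:
        assert f.read() == "hello"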
@@ -0,0 +1,90 @@
from collections import deque


class Transaction:
    """Filesystem transaction write context

    Gathers files for deferred commit or discard, so that several write
    operations can be finalized semi-atomically. This works by having this
    instance as the ``.transaction`` attribute of the given filesystem
    """

    def __init__(self, fs, **kwargs):
        """
        Parameters
        ----------
        fs: FileSystem instance
        """
        self.fs = fs
        self.files = deque()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """End transaction and commit, if exit is not due to exception"""
        # only commit if there was no exception
        self.complete(commit=exc_type is None)
        if self.fs:
            self.fs._intrans = False
            self.fs._transaction = None
            self.fs = None

    def start(self):
        """Start a transaction on this FileSystem"""
        self.files = deque()  # clean up after previous failed completions
        self.fs._intrans = True

    def complete(self, commit=True):
        """Finish transaction: commit or discard all deferred files"""
        while self.files:
            f = self.files.popleft()
            if commit:
                f.commit()
            else:
                f.discard()
        self.fs._intrans = False
        self.fs._transaction = None
        self.fs = None


class FileActor:
    def __init__(self):
        self.files = []

    def commit(self):
        for f in self.files:
            f.commit()
        self.files.clear()

    def discard(self):
        for f in self.files:
            f.discard()
        self.files.clear()

    def append(self, f):
        self.files.append(f)


class DaskTransaction(Transaction):
    def __init__(self, fs):
        """
        Parameters
        ----------
        fs: FileSystem instance
        """
        import distributed

        super().__init__(fs)
        client = distributed.default_client()
        self.files = client.submit(FileActor, actor=True).result()

    def complete(self, commit=True):
        """Finish transaction: commit or discard all deferred files"""
        if commit:
            self.files.commit().result()
        else:
            self.files.discard().result()
        self.fs._intrans = False
        self.fs = None
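# Illustrative sketch (not part of the vendored module): a filesystem exposes the
# Transaction class above through its ``transaction`` attribute, so several writes
# can be grouped and committed on a clean exit, as the docstrings describe. The
# "memory" filesystem and the path below are assumptions for illustration.
import fsspec

fs = fsspec.filesystem("memory")
with fs.transaction:
    with fs.open("/staged.txt", "wb") as f:
        f.write(b"data")            # finalized when the transaction commits
assert fs.cat("/staged.txt") == b"data"  # visible after a clean exit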
@@ -0,0 +1,745 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import tempfile
|
||||
from collections.abc import Iterable, Iterator, Sequence
|
||||
from functools import partial
|
||||
from hashlib import md5
|
||||
from importlib.metadata import version
|
||||
from typing import (
|
||||
IO,
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
TypeVar,
|
||||
)
|
||||
from urllib.parse import urlsplit
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import pathlib
|
||||
|
||||
from typing_extensions import TypeGuard
|
||||
|
||||
from fsspec.spec import AbstractFileSystem
|
||||
|
||||
|
||||
DEFAULT_BLOCK_SIZE = 5 * 2**20
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
def infer_storage_options(
|
||||
urlpath: str, inherit_storage_options: dict[str, Any] | None = None
|
||||
) -> dict[str, Any]:
|
||||
"""Infer storage options from URL path and merge it with existing storage
|
||||
options.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
urlpath: str or unicode
|
||||
Either local absolute file path or URL (hdfs://namenode:8020/file.csv)
|
||||
inherit_storage_options: dict (optional)
|
||||
Its contents will get merged with the inferred information from the
|
||||
given path
|
||||
|
||||
Returns
|
||||
-------
|
||||
Storage options dict.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> infer_storage_options('/mnt/datasets/test.csv') # doctest: +SKIP
|
||||
{"protocol": "file", "path", "/mnt/datasets/test.csv"}
|
||||
>>> infer_storage_options(
|
||||
... 'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1',
|
||||
... inherit_storage_options={'extra': 'value'},
|
||||
... ) # doctest: +SKIP
|
||||
{"protocol": "hdfs", "username": "username", "password": "pwd",
|
||||
"host": "node", "port": 123, "path": "/mnt/datasets/test.csv",
|
||||
"url_query": "q=1", "extra": "value"}
|
||||
"""
|
||||
# Handle Windows paths including disk name in this special case
|
||||
if (
|
||||
re.match(r"^[a-zA-Z]:[\\/]", urlpath)
|
||||
or re.match(r"^[a-zA-Z0-9]+://", urlpath) is None
|
||||
):
|
||||
return {"protocol": "file", "path": urlpath}
|
||||
|
||||
parsed_path = urlsplit(urlpath)
|
||||
protocol = parsed_path.scheme or "file"
|
||||
if parsed_path.fragment:
|
||||
path = "#".join([parsed_path.path, parsed_path.fragment])
|
||||
else:
|
||||
path = parsed_path.path
|
||||
if protocol == "file":
|
||||
# Special case parsing file protocol URL on Windows according to:
|
||||
# https://msdn.microsoft.com/en-us/library/jj710207.aspx
|
||||
windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path)
|
||||
if windows_path:
|
||||
drive, path = windows_path.groups()
|
||||
path = f"{drive}:{path}"
|
||||
|
||||
if protocol in ["http", "https"]:
|
||||
# for HTTP, we don't want to parse, as requests will anyway
|
||||
return {"protocol": protocol, "path": urlpath}
|
||||
|
||||
options: dict[str, Any] = {"protocol": protocol, "path": path}
|
||||
|
||||
if parsed_path.netloc:
|
||||
# Parse `hostname` from netloc manually because `parsed_path.hostname`
|
||||
# lowercases the hostname which is not always desirable (e.g. in S3):
|
||||
# https://github.com/dask/dask/issues/1417
|
||||
options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0]
|
||||
|
||||
if protocol in ("s3", "s3a", "gcs", "gs"):
|
||||
options["path"] = options["host"] + options["path"]
|
||||
else:
|
||||
options["host"] = options["host"]
|
||||
if parsed_path.port:
|
||||
options["port"] = parsed_path.port
|
||||
if parsed_path.username:
|
||||
options["username"] = parsed_path.username
|
||||
if parsed_path.password:
|
||||
options["password"] = parsed_path.password
|
||||
|
||||
if parsed_path.query:
|
||||
options["url_query"] = parsed_path.query
|
||||
if parsed_path.fragment:
|
||||
options["url_fragment"] = parsed_path.fragment
|
||||
|
||||
if inherit_storage_options:
|
||||
update_storage_options(options, inherit_storage_options)
|
||||
|
||||
return options
|
||||
|
||||
|
||||


def update_storage_options(
    options: dict[str, Any], inherited: dict[str, Any] | None = None
) -> None:
    if not inherited:
        inherited = {}
    collisions = set(options) & set(inherited)
    if collisions:
        for collision in collisions:
            if options.get(collision) != inherited.get(collision):
                raise KeyError(
                    f"Collision between inferred and specified storage "
                    f"option:\n{collision}"
                )
    options.update(inherited)
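

# Editor's note: the sketch below is not part of upstream fsspec; it is a
# small, hedged illustration of how update_storage_options merges inherited
# options and raises KeyError on a conflicting value for an already-set key.
# The helper name `_example_update_storage_options` is invented for this
# illustration and is never called by the library.
def _example_update_storage_options() -> None:
    opts = {"protocol": "s3", "path": "bucket/key"}
    # New keys are merged in untouched.
    update_storage_options(opts, {"anon": True})
    assert opts["anon"] is True
    # A different value for an existing key is treated as a collision.
    try:
        update_storage_options(opts, {"protocol": "gcs"})
    except KeyError:
        pass  # expected: inferred "s3" conflicts with specified "gcs"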


# Compression extensions registered via fsspec.compression.register_compression
compressions: dict[str, str] = {}


def infer_compression(filename: str) -> str | None:
    """Infer compression, if available, from filename.

    Infer a named compression type, if registered and available, from filename
    extension. This includes builtin (gz, bz2, zip) compressions, as well as
    optional compressions. See fsspec.compression.register_compression.
    """
    extension = os.path.splitext(filename)[-1].strip(".").lower()
    if extension in compressions:
        return compressions[extension]
    return None
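

# Editor's note: a hedged sketch, not part of upstream fsspec. It shows that
# lookups are case-insensitive on the extension and that unregistered
# extensions yield None; the registry itself is filled in by
# fsspec.compression.register_compression when fsspec is imported.
def _example_infer_compression() -> None:
    # Identical results regardless of extension case.
    assert infer_compression("DATA.CSV.GZ") == infer_compression("data.csv.gz")
    # ".parquet" is a data format, not a registered compression.
    assert infer_compression("data.parquet") is None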


def build_name_function(max_int: float) -> Callable[[int], str]:
    """Returns a function that receives a single integer
    and returns it as a string padded by enough zero characters
    to align with maximum possible integer

    >>> name_f = build_name_function(57)

    >>> name_f(7)
    '07'
    >>> name_f(31)
    '31'
    >>> build_name_function(1000)(42)
    '0042'
    >>> build_name_function(999)(42)
    '042'
    >>> build_name_function(0)(0)
    '0'
    """
    # handle corner cases max_int is 0 or exact power of 10
    max_int += 1e-8

    pad_length = int(math.ceil(math.log10(max_int)))

    def name_function(i: int) -> str:
        return str(i).zfill(pad_length)

    return name_function


def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) -> bool:
    r"""Seek current file to file start, file end, or byte after delimiter seq.

    Seeks file to next chunk delimiter, where chunks are defined on file start,
    a delimiting sequence, and file end. Use file.tell() to see location afterwards.
    Note that file start is a valid split, so must be at offset > 0 to seek for
    delimiter.

    Parameters
    ----------
    file: a file
    delimiter: bytes
        a delimiter like ``b'\n'`` or message sentinel, matching file .read() type
    blocksize: int
        Number of bytes to read from the file at once.

    Returns
    -------
    Returns True if a delimiter was found, False if at file start or end.
    """

    if file.tell() == 0:
        # beginning-of-file, return without seek
        return False

    # Interface is for binary IO, with delimiter as bytes, but initialize last
    # with result of file.read to preserve compatibility with text IO.
    last: bytes | None = None
    while True:
        current = file.read(blocksize)
        if not current:
            # end-of-file without delimiter
            return False
        full = last + current if last else current
        try:
            if delimiter in full:
                i = full.index(delimiter)
                file.seek(file.tell() - (len(full) - i) + len(delimiter))
                return True
            elif len(current) < blocksize:
                # end-of-file without delimiter
                return False
        except (OSError, ValueError):
            pass
        last = full[-len(delimiter) :]
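

# Editor's note: a hedged sketch, not part of upstream fsspec, showing the
# typical use of seek_delimiter: land the cursor just past the next b"\n"
# after an arbitrary offset so that a subsequent read starts on a record
# boundary. The helper name is invented for this illustration.
def _example_seek_delimiter() -> None:
    from io import BytesIO

    f = BytesIO(b"Alice, 100\nBob, 200\nCharlie, 300")
    f.seek(3)  # somewhere inside the first record
    assert seek_delimiter(f, b"\n", blocksize=4) is True
    assert f.read(8) == b"Bob, 200"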


def read_block(
    f: IO[bytes],
    offset: int,
    length: int | None,
    delimiter: bytes | None = None,
    split_before: bool = False,
) -> bytes:
    """Read a block of bytes from a file

    Parameters
    ----------
    f: File
        Open file
    offset: int
        Byte offset to start read
    length: int
        Number of bytes to read, read through end of file if None
    delimiter: bytes (optional)
        Ensure reading starts and stops at delimiter bytestring
    split_before: bool (optional)
        Start/stop read *before* delimiter bytestring.


    If using the ``delimiter=`` keyword argument we ensure that the read
    starts and stops at delimiter boundaries that follow the locations
    ``offset`` and ``offset + length``. If ``offset`` is zero then we
    start at zero, regardless of delimiter. The bytestring returned WILL
    include the terminating delimiter string.

    Examples
    --------

    >>> from io import BytesIO  # doctest: +SKIP
    >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300')  # doctest: +SKIP
    >>> read_block(f, 0, 13)  # doctest: +SKIP
    b'Alice, 100\\nBo'

    >>> read_block(f, 0, 13, delimiter=b'\\n')  # doctest: +SKIP
    b'Alice, 100\\nBob, 200\\n'

    >>> read_block(f, 10, 10, delimiter=b'\\n')  # doctest: +SKIP
    b'Bob, 200\\nCharlie, 300'
    """
    if delimiter:
        f.seek(offset)
        found_start_delim = seek_delimiter(f, delimiter, 2**16)
        if length is None:
            return f.read()
        start = f.tell()
        length -= start - offset

        f.seek(start + length)
        found_end_delim = seek_delimiter(f, delimiter, 2**16)
        end = f.tell()

        # Adjust split location to before delimiter if seek found the
        # delimiter sequence, not start or end of file.
        if found_start_delim and split_before:
            start -= len(delimiter)

        if found_end_delim and split_before:
            end -= len(delimiter)

        offset = start
        length = end - start

    f.seek(offset)

    # TODO: allow length to be None and read to the end of the file?
    assert length is not None
    b = f.read(length)
    return b


def tokenize(*args: Any, **kwargs: Any) -> str:
    """Deterministic token

    (modified from dask.base)

    >>> tokenize([1, 2, '3'])
    '9d71491b50023b06fc76928e6eddb952'

    >>> tokenize('Hello') == tokenize('Hello')
    True
    """
    if kwargs:
        args += (kwargs,)
    try:
        h = md5(str(args).encode())
    except ValueError:
        # FIPS systems: https://github.com/fsspec/filesystem_spec/issues/380
        h = md5(str(args).encode(), usedforsecurity=False)
    return h.hexdigest()


def stringify_path(filepath: str | os.PathLike[str] | pathlib.Path) -> str:
    """Attempt to convert a path-like object to a string.

    Parameters
    ----------
    filepath: object to be converted

    Returns
    -------
    filepath_str: maybe a string version of the object

    Notes
    -----
    Objects supporting the fspath protocol are coerced according to its
    __fspath__ method.

    For backwards compatibility with older Python versions, pathlib.Path
    objects are specially coerced.

    Any other object is passed through unchanged, which includes bytes,
    strings, buffers, or anything else that's not even path-like.
    """
    if isinstance(filepath, str):
        return filepath
    elif hasattr(filepath, "__fspath__"):
        return filepath.__fspath__()
    elif hasattr(filepath, "path"):
        return filepath.path
    else:
        return filepath  # type: ignore[return-value]
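

# Editor's note: a hedged sketch, not part of upstream fsspec, illustrating
# the branches above: plain strings pass through, os.PathLike objects are
# converted via __fspath__, and objects exposing a ``.path`` attribute (here
# a tiny stand-in class invented for the example) use that attribute.
def _example_stringify_path() -> None:
    assert stringify_path("s3://bucket/key") == "s3://bucket/key"
    assert stringify_path(pathlib.PurePosixPath("/tmp/data.csv")) == "/tmp/data.csv"

    class _HasPath:
        path = "memory://x"

    assert stringify_path(_HasPath()) == "memory://x"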


def make_instance(
    cls: Callable[..., T], args: Sequence[Any], kwargs: dict[str, Any]
) -> T:
    inst = cls(*args, **kwargs)
    inst._determine_worker()  # type: ignore[attr-defined]
    return inst


def common_prefix(paths: Iterable[str]) -> str:
    """For a list of paths, find the shortest prefix common to all"""
    parts = [p.split("/") for p in paths]
    lmax = min(len(p) for p in parts)
    end = 0
    for i in range(lmax):
        end = all(p[i] == parts[0][i] for p in parts)
        if not end:
            break
    i += end
    return "/".join(parts[0][:i])
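

# Editor's note: a hedged sketch, not part of upstream fsspec. It shows that
# the common prefix is computed on "/"-separated components rather than on
# raw characters, which is what other_paths below relies on.
def _example_common_prefix() -> None:
    assert common_prefix(["bucket/a/x.csv", "bucket/a/y.csv"]) == "bucket/a"
    assert common_prefix(["bucket/a/x.csv", "bucket/b/y.csv"]) == "bucket"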


def other_paths(
    paths: list[str],
    path2: str | list[str],
    exists: bool = False,
    flatten: bool = False,
) -> list[str]:
    """In bulk file operations, construct a new file tree from a list of files

    Parameters
    ----------
    paths: list of str
        The input file tree
    path2: str or list of str
        Root to construct the new list in. If this is already a list of str, we just
        assert it has the right number of elements.
    exists: bool (optional)
        For a str destination, if it already exists (and is a directory), files
        should end up inside it.
    flatten: bool (optional)
        Whether to flatten the input directory tree structure so that the output files
        are in the same directory.

    Returns
    -------
    list of str
    """

    if isinstance(path2, str):
        path2 = path2.rstrip("/")

        if flatten:
            path2 = ["/".join((path2, p.split("/")[-1])) for p in paths]
        else:
            cp = common_prefix(paths)
            if exists:
                cp = cp.rsplit("/", 1)[0]
            if not cp and all(not s.startswith("/") for s in paths):
                path2 = ["/".join([path2, p]) for p in paths]
            else:
                path2 = [p.replace(cp, path2, 1) for p in paths]
    else:
        assert len(paths) == len(path2)
    return path2
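

# Editor's note: a hedged sketch, not part of upstream fsspec, showing the two
# main modes of other_paths for a string destination: keep the layout below
# the common prefix, or flatten everything into the destination directory.
def _example_other_paths() -> None:
    src = ["data/a/x.csv", "data/b/y.csv"]
    assert other_paths(src, "out") == ["out/a/x.csv", "out/b/y.csv"]
    assert other_paths(src, "out", flatten=True) == ["out/x.csv", "out/y.csv"]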


def is_exception(obj: Any) -> bool:
    return isinstance(obj, BaseException)


def isfilelike(f: Any) -> TypeGuard[IO[bytes]]:
    return all(hasattr(f, attr) for attr in ["read", "close", "tell"])


def get_protocol(url: str) -> str:
    url = stringify_path(url)
    parts = re.split(r"(\:\:|\://)", url, maxsplit=1)
    if len(parts) > 1:
        return parts[0]
    return "file"


def get_file_extension(url: str) -> str:
    url = stringify_path(url)
    ext_parts = url.rsplit(".", 1)
    if len(ext_parts) > 1:
        return ext_parts[-1]
    return ""
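

# Editor's note: a hedged sketch, not part of upstream fsspec. The protocol is
# whatever precedes the first "://" (or "::" for chained URLs such as
# "simplecache::s3://..."), and bare paths fall back to "file".
def _example_get_protocol() -> None:
    assert get_protocol("s3://bucket/key.parquet") == "s3"
    assert get_protocol("simplecache::s3://bucket/key.parquet") == "simplecache"
    assert get_protocol("/tmp/key.parquet") == "file"
    assert get_file_extension("s3://bucket/key.parquet") == "parquet"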


def can_be_local(path: str) -> bool:
    """Can the given URL be used with open_local?"""
    from fsspec import get_filesystem_class

    try:
        return getattr(get_filesystem_class(get_protocol(path)), "local_file", False)
    except (ValueError, ImportError):
        # not in registry or import failed
        return False


def get_package_version_without_import(name: str) -> str | None:
    """For given package name, try to find the version without importing it

    Import and package.__version__ is still the backup here, so an import
    *might* happen.

    Returns either the version string, or None if the package
    or the version was not readily found.
    """
    if name in sys.modules:
        mod = sys.modules[name]
        if hasattr(mod, "__version__"):
            return mod.__version__
    try:
        return version(name)
    except:  # noqa: E722
        pass
    try:
        import importlib

        mod = importlib.import_module(name)
        return mod.__version__
    except (ImportError, AttributeError):
        return None
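

# Editor's note: a hedged sketch, not part of upstream fsspec. Assuming the
# code runs in an environment where fsspec itself is installed, a known
# package yields a version string while an unknown name yields None instead
# of raising.
def _example_get_package_version_without_import() -> None:
    assert isinstance(get_package_version_without_import("fsspec"), str)
    assert get_package_version_without_import("definitely-not-a-package") is None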


def setup_logging(
    logger: logging.Logger | None = None,
    logger_name: str | None = None,
    level: str = "DEBUG",
    clear: bool = True,
) -> logging.Logger:
    if logger is None and logger_name is None:
        raise ValueError("Provide either logger object or logger name")
    logger = logger or logging.getLogger(logger_name)
    handle = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s -- %(message)s"
    )
    handle.setFormatter(formatter)
    if clear:
        logger.handlers.clear()
    logger.addHandler(handle)
    logger.setLevel(level)
    return logger
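

# Editor's note: a hedged sketch, not part of upstream fsspec, showing the
# usual call pattern: name a logger (the "fsspec.http" name here is just an
# example) and get back a logger with a StreamHandler attached and the
# requested level set.
def _example_setup_logging() -> None:
    logger = setup_logging(logger_name="fsspec.http", level="DEBUG")
    assert logger.level == logging.DEBUG
    assert any(isinstance(h, logging.StreamHandler) for h in logger.handlers)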


def _unstrip_protocol(name: str, fs: AbstractFileSystem) -> str:
    return fs.unstrip_protocol(name)


def mirror_from(
    origin_name: str, methods: Iterable[str]
) -> Callable[[type[T]], type[T]]:
    """Mirror attributes and methods from the given
    origin_name attribute of the instance to the
    decorated class"""

    def origin_getter(method: str, self: Any) -> Any:
        origin = getattr(self, origin_name)
        return getattr(origin, method)

    def wrapper(cls: type[T]) -> type[T]:
        for method in methods:
            wrapped_method = partial(origin_getter, method)
            setattr(cls, method, property(wrapped_method))
        return cls

    return wrapper
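

# Editor's note: a hedged sketch, not part of upstream fsspec. The decorator
# turns each listed name into a read-only property that forwards to the
# attribute named by ``origin_name``; the `_Inner`/`_Wrapper` classes are
# invented for this illustration.
def _example_mirror_from() -> None:
    class _Inner:
        name = "inner"

        def hello(self) -> str:
            return "hi"

    @mirror_from("inner", ["name", "hello"])
    class _Wrapper:
        def __init__(self) -> None:
            self.inner = _Inner()

    w = _Wrapper()
    assert w.name == "inner"
    assert w.hello() == "hi"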


@contextlib.contextmanager
def nullcontext(obj: T) -> Iterator[T]:
    yield obj


def merge_offset_ranges(
    paths: list[str],
    starts: list[int] | int,
    ends: list[int] | int,
    max_gap: int = 0,
    max_block: int | None = None,
    sort: bool = True,
) -> tuple[list[str], list[int], list[int]]:
    """Merge adjacent byte-offset ranges when the inter-range
    gap is <= `max_gap`, and when the merged byte range does not
    exceed `max_block` (if specified). By default, this function
    will re-order the input paths and byte ranges to ensure sorted
    order. If the user can guarantee that the inputs are already
    sorted, passing `sort=False` will skip the re-ordering.
    """
    # Check input
    if not isinstance(paths, list):
        raise TypeError
    if not isinstance(starts, list):
        starts = [starts] * len(paths)
    if not isinstance(ends, list):
        ends = [ends] * len(paths)
    if len(starts) != len(paths) or len(ends) != len(paths):
        raise ValueError

    # Early Return
    if len(starts) <= 1:
        return paths, starts, ends

    starts = [s or 0 for s in starts]
    # Sort by paths and then ranges if `sort=True`
    if sort:
        paths, starts, ends = (
            list(v)
            for v in zip(
                *sorted(
                    zip(paths, starts, ends),
                )
            )
        )

    if paths:
        # Loop through the coupled `paths`, `starts`, and
        # `ends`, and merge adjacent blocks when appropriate
        new_paths = paths[:1]
        new_starts = starts[:1]
        new_ends = ends[:1]
        for i in range(1, len(paths)):
            if paths[i] == paths[i - 1] and new_ends[-1] is None:
                continue
            elif (
                paths[i] != paths[i - 1]
                or ((starts[i] - new_ends[-1]) > max_gap)
                or (max_block is not None and (ends[i] - new_starts[-1]) > max_block)
            ):
                # Cannot merge with previous block.
                # Add new `paths`, `starts`, and `ends` elements
                new_paths.append(paths[i])
                new_starts.append(starts[i])
                new_ends.append(ends[i])
            else:
                # Merge with previous block by updating the
                # last element of `ends`
                new_ends[-1] = ends[i]
        return new_paths, new_starts, new_ends

    # `paths` is empty. Just return input lists
    return paths, starts, ends
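

# Editor's note: a hedged sketch, not part of upstream fsspec. Two ranges in
# the same file separated by a gap no larger than ``max_gap`` are coalesced
# into a single request, while ranges in different files stay separate. The
# file names are made up for the example.
def _example_merge_offset_ranges() -> None:
    paths, starts, ends = merge_offset_ranges(
        ["a.parquet", "a.parquet", "b.parquet"],
        [0, 110, 0],
        [100, 200, 50],
        max_gap=16,
    )
    assert paths == ["a.parquet", "b.parquet"]
    assert starts == [0, 0]
    assert ends == [200, 50]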


def file_size(filelike: IO[bytes]) -> int:
    """Find length of any open read-mode file-like"""
    pos = filelike.tell()
    try:
        return filelike.seek(0, 2)
    finally:
        filelike.seek(pos)


@contextlib.contextmanager
def atomic_write(path: str, mode: str = "wb"):
    """
    A context manager that opens a temporary file next to `path` and, on exit,
    replaces `path` with the temporary file, thereby updating `path`
    atomically.
    """
    fd, fn = tempfile.mkstemp(
        dir=os.path.dirname(path), prefix=os.path.basename(path) + "-"
    )
    try:
        with open(fd, mode) as fp:
            yield fp
    except BaseException:
        with contextlib.suppress(FileNotFoundError):
            os.unlink(fn)
        raise
    else:
        os.replace(fn, path)
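

# Editor's note: a hedged sketch, not part of upstream fsspec. The target file
# only appears (or is replaced) once the ``with`` block exits cleanly, so
# readers never observe a partially written file; the temporary directory and
# file name are created just for the example.
def _example_atomic_write() -> None:
    target = os.path.join(tempfile.mkdtemp(), "config.json")
    with atomic_write(target, mode="wb") as f:
        f.write(b'{"ok": true}')
    with open(target, "rb") as f:
        assert f.read() == b'{"ok": true}'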


def _translate(pat, STAR, QUESTION_MARK):
    # Copied from: https://github.com/python/cpython/pull/106703.
    res: list[str] = []
    add = res.append
    i, n = 0, len(pat)
    while i < n:
        c = pat[i]
        i = i + 1
        if c == "*":
            # compress consecutive `*` into one
            if (not res) or res[-1] is not STAR:
                add(STAR)
        elif c == "?":
            add(QUESTION_MARK)
        elif c == "[":
            j = i
            if j < n and pat[j] == "!":
                j = j + 1
            if j < n and pat[j] == "]":
                j = j + 1
            while j < n and pat[j] != "]":
                j = j + 1
            if j >= n:
                add("\\[")
            else:
                stuff = pat[i:j]
                if "-" not in stuff:
                    stuff = stuff.replace("\\", r"\\")
                else:
                    chunks = []
                    k = i + 2 if pat[i] == "!" else i + 1
                    while True:
                        k = pat.find("-", k, j)
                        if k < 0:
                            break
                        chunks.append(pat[i:k])
                        i = k + 1
                        k = k + 3
                    chunk = pat[i:j]
                    if chunk:
                        chunks.append(chunk)
                    else:
                        chunks[-1] += "-"
                    # Remove empty ranges -- invalid in RE.
                    for k in range(len(chunks) - 1, 0, -1):
                        if chunks[k - 1][-1] > chunks[k][0]:
                            chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]
                            del chunks[k]
                    # Escape backslashes and hyphens for set difference (--).
                    # Hyphens that create ranges shouldn't be escaped.
                    stuff = "-".join(
                        s.replace("\\", r"\\").replace("-", r"\-") for s in chunks
                    )
                # Escape set operations (&&, ~~ and ||).
                stuff = re.sub(r"([&~|])", r"\\\1", stuff)
                i = j + 1
                if not stuff:
                    # Empty range: never match.
                    add("(?!)")
                elif stuff == "!":
                    # Negated empty range: match any character.
                    add(".")
                else:
                    if stuff[0] == "!":
                        stuff = "^" + stuff[1:]
                    elif stuff[0] in ("^", "["):
                        stuff = "\\" + stuff
                    add(f"[{stuff}]")
        else:
            add(re.escape(c))
    assert i == n
    return res


def glob_translate(pat):
    # Copied from: https://github.com/python/cpython/pull/106703.
    # The keyword parameters' values are fixed to:
    # recursive=True, include_hidden=True, seps=None
    """Translate a pathname with shell wildcards to a regular expression."""
    if os.path.altsep:
        seps = os.path.sep + os.path.altsep
    else:
        seps = os.path.sep
    escaped_seps = "".join(map(re.escape, seps))
    any_sep = f"[{escaped_seps}]" if len(seps) > 1 else escaped_seps
    not_sep = f"[^{escaped_seps}]"
    one_last_segment = f"{not_sep}+"
    one_segment = f"{one_last_segment}{any_sep}"
    any_segments = f"(?:.+{any_sep})?"
    any_last_segments = ".*"
    results = []
    parts = re.split(any_sep, pat)
    last_part_idx = len(parts) - 1
    for idx, part in enumerate(parts):
        if part == "*":
            results.append(one_segment if idx < last_part_idx else one_last_segment)
            continue
        if part == "**":
            results.append(any_segments if idx < last_part_idx else any_last_segments)
            continue
        elif "**" in part:
            raise ValueError(
                "Invalid pattern: '**' can only be an entire path component"
            )
        if part:
            results.extend(_translate(part, f"{not_sep}*", not_sep))
        if idx < last_part_idx:
            results.append(any_sep)
    res = "".join(results)
    return rf"(?s:{res})\Z"
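

# Editor's note: a hedged sketch, not part of upstream fsspec, showing the
# translated regexes in action: "**" may span directory separators while "*"
# stays within a single path component. The patterns and paths are made up
# for the example and assume "/" is a valid separator on the platform.
def _example_glob_translate() -> None:
    deep = re.compile(glob_translate("data/**/*.csv"))
    assert deep.match("data/2024/01/part.csv")
    assert not deep.match("data/part.txt")

    shallow = re.compile(glob_translate("data/*.csv"))
    assert shallow.match("data/part.csv")
    assert not shallow.match("data/2024/part.csv")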