chore: add virtual environment to the repository

- Add the backend_service/venv virtual environment
- Include all Python dependency packages
- Note: the virtual environment is about 393 MB and contains 12655 files
2025-12-03 10:19:25 +08:00
parent a6c2027caa
commit c4f851d387
12655 changed files with 3009376 additions and 0 deletions

View File

@@ -0,0 +1,71 @@
from . import caching
from ._version import __version__ # noqa: F401
from .callbacks import Callback
from .compression import available_compressions
from .core import get_fs_token_paths, open, open_files, open_local, url_to_fs
from .exceptions import FSTimeoutError
from .mapping import FSMap, get_mapper
from .registry import (
available_protocols,
filesystem,
get_filesystem_class,
register_implementation,
registry,
)
from .spec import AbstractFileSystem
__all__ = [
"AbstractFileSystem",
"FSTimeoutError",
"FSMap",
"filesystem",
"register_implementation",
"get_filesystem_class",
"get_fs_token_paths",
"get_mapper",
"open",
"open_files",
"open_local",
"registry",
"caching",
"Callback",
"available_protocols",
"available_compressions",
"url_to_fs",
]
def process_entries():
try:
from importlib.metadata import entry_points
except ImportError:
return
if entry_points is not None:
try:
eps = entry_points()
except TypeError:
pass # importlib-metadata < 0.8
else:
if hasattr(eps, "select"): # Python 3.10+ / importlib_metadata >= 3.9.0
specs = eps.select(group="fsspec.specs")
else:
specs = eps.get("fsspec.specs", [])
registered_names = {}
for spec in specs:
err_msg = f"Unable to load filesystem from {spec}"
name = spec.name
if name in registered_names:
continue
registered_names[name] = True
register_implementation(
name,
spec.value.replace(":", "."),
errtxt=err_msg,
# We take our implementations as the ones to overload with if
# for some reason we encounter some, may be the same, already
# registered
clobber=True,
)
process_entries()
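
A minimal sketch (editor's illustration, not part of this diff) of how a third-party backend hooks into the registry that process_entries() populates above; the package and class names are hypothetical.

# Entry-point form, declared by the hypothetical package in its pyproject.toml:
#
#   [project.entry-points."fsspec.specs"]
#   myfs = "myfs.core:MyFileSystem"
#
# Equivalent runtime registration (lazy: the class is only imported on first use):
import fsspec

fsspec.register_implementation(
    "myfs",
    "myfs.core.MyFileSystem",  # dotted path to the (hypothetical) implementation
    errtxt="please install the myfs package",
    clobber=True,
)
print("myfs" in fsspec.available_protocols())  # True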

View File

@@ -0,0 +1,34 @@
# file generated by setuptools-scm
# don't change, don't track in version control
__all__ = [
"__version__",
"__version_tuple__",
"version",
"version_tuple",
"__commit_id__",
"commit_id",
]
TYPE_CHECKING = False
if TYPE_CHECKING:
from typing import Tuple
from typing import Union
VERSION_TUPLE = Tuple[Union[int, str], ...]
COMMIT_ID = Union[str, None]
else:
VERSION_TUPLE = object
COMMIT_ID = object
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID
__version__ = version = '2025.10.0'
__version_tuple__ = version_tuple = (2025, 10, 0)
__commit_id__ = commit_id = None

View File

@@ -0,0 +1,75 @@
import operator
from fsspec import AbstractFileSystem
from fsspec.utils import tokenize
class AbstractArchiveFileSystem(AbstractFileSystem):
"""
A generic superclass for implementing Archive-based filesystems.
Currently, it is shared amongst
:class:`~fsspec.implementations.zip.ZipFileSystem`,
:class:`~fsspec.implementations.libarchive.LibArchiveFileSystem` and
:class:`~fsspec.implementations.tar.TarFileSystem`.
"""
def __str__(self):
return f"<Archive-like object {type(self).__name__} at {id(self)}>"
__repr__ = __str__
def ukey(self, path):
return tokenize(path, self.fo, self.protocol)
def _all_dirnames(self, paths):
"""Returns *all* directory names for each path in paths, including intermediate
ones.
Parameters
----------
paths: Iterable of path strings
"""
if len(paths) == 0:
return set()
dirnames = {self._parent(path) for path in paths} - {self.root_marker}
return dirnames | self._all_dirnames(dirnames)
def info(self, path, **kwargs):
self._get_dirs()
path = self._strip_protocol(path)
if path in {"", "/"} and self.dir_cache:
return {"name": "", "type": "directory", "size": 0}
if path in self.dir_cache:
return self.dir_cache[path]
elif path + "/" in self.dir_cache:
return self.dir_cache[path + "/"]
else:
raise FileNotFoundError(path)
def ls(self, path, detail=True, **kwargs):
self._get_dirs()
paths = {}
for p, f in self.dir_cache.items():
p = p.rstrip("/")
if "/" in p:
root = p.rsplit("/", 1)[0]
else:
root = ""
if root == path.rstrip("/"):
paths[p] = f
elif all(
(a == b)
for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
):
# root directory entry
ppath = p.rstrip("/").split("/", 1)[0]
if ppath not in paths:
out = {"name": ppath, "size": 0, "type": "directory"}
paths[ppath] = out
if detail:
out = sorted(paths.values(), key=operator.itemgetter("name"))
return out
else:
return sorted(paths)
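
A brief, self-contained sketch of using an archive filesystem built on the class above, via the ZipFileSystem named in the docstring; the archive and member names are invented.

import zipfile

import fsspec

# build a tiny archive so the example stands alone
with zipfile.ZipFile("example.zip", "w") as z:
    z.writestr("inner/data.txt", "hello")

fs = fsspec.filesystem("zip", fo="example.zip")  # an AbstractArchiveFileSystem subclass
print(fs.ls("/", detail=False))                  # listing synthesised by ls() above
with fs.open("inner/data.txt", "rb") as f:
    assert f.read() == b"hello"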

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,324 @@
from functools import wraps
class Callback:
"""
Base class and interface for callback mechanism
This class can be used directly for monitoring file transfers by
providing ``callback=Callback(hooks=...)`` (see the ``hooks`` argument,
below), or subclassed for more specialised behaviour.
Parameters
----------
size: int (optional)
Nominal quantity for the value that corresponds to a complete
transfer, e.g., total number of tiles or total number of
bytes
value: int (0)
Starting internal counter value
hooks: dict or None
A dict of named functions to be called on each update. The signature
of these must be ``f(size, value, **kwargs)``
"""
def __init__(self, size=None, value=0, hooks=None, **kwargs):
self.size = size
self.value = value
self.hooks = hooks or {}
self.kw = kwargs
def __enter__(self):
return self
def __exit__(self, *exc_args):
self.close()
def close(self):
"""Close callback."""
def branched(self, path_1, path_2, **kwargs):
"""
Return callback for child transfers
If this callback is operating at a higher level, e.g., put, it may
trigger transfers that can also be monitored. The function returns a callback
that must be passed to the child method, e.g., put_file,
as the `callback=` argument.
The implementation uses `callback.branch` for compatibility.
When implementing callbacks, it is recommended to override this function instead
of `branch` and avoid calling `super().branched(...)`.
Prefer using this function over `branch`.
Parameters
----------
path_1: str
Child's source path
path_2: str
Child's destination path
**kwargs:
Arbitrary keyword arguments
Returns
-------
callback: Callback
A callback instance to be passed to the child method
"""
self.branch(path_1, path_2, kwargs)
# mutate kwargs so that we can force the caller to pass "callback=" explicitly
return kwargs.pop("callback", DEFAULT_CALLBACK)
def branch_coro(self, fn):
"""
Wraps a coroutine and passes a new child callback to it.
"""
@wraps(fn)
async def func(path1, path2: str, **kwargs):
with self.branched(path1, path2, **kwargs) as child:
return await fn(path1, path2, callback=child, **kwargs)
return func
def set_size(self, size):
"""
Set the internal maximum size attribute
Usually called if not initially set at instantiation. Note that this
triggers a ``call()``.
Parameters
----------
size: int
"""
self.size = size
self.call()
def absolute_update(self, value):
"""
Set the internal value state
Triggers ``call()``
Parameters
----------
value: int
"""
self.value = value
self.call()
def relative_update(self, inc=1):
"""
Delta increment the internal counter
Triggers ``call()``
Parameters
----------
inc: int
"""
self.value += inc
self.call()
def call(self, hook_name=None, **kwargs):
"""
Execute hook(s) with current state
Each function is passed the internal size and current value
Parameters
----------
hook_name: str or None
If given, execute on this hook
kwargs: passed on to (all) hook(s)
"""
if not self.hooks:
return
kw = self.kw.copy()
kw.update(kwargs)
if hook_name:
if hook_name not in self.hooks:
return
return self.hooks[hook_name](self.size, self.value, **kw)
for hook in self.hooks.values() or []:
hook(self.size, self.value, **kw)
def wrap(self, iterable):
"""
Wrap an iterable to call ``relative_update`` on each iteration
Parameters
----------
iterable: Iterable
The iterable that is being wrapped
"""
for item in iterable:
self.relative_update()
yield item
def branch(self, path_1, path_2, kwargs):
"""
Set callbacks for child transfers
If this callback is operating at a higher level, e.g., put, it may
trigger transfers that can also be monitored. The passed kwargs are
to be *mutated* to add ``callback=``, if this class supports branching
to children.
Parameters
----------
path_1: str
Child's source path
path_2: str
Child's destination path
kwargs: dict
arguments passed to child method, e.g., put_file.
Returns
-------
"""
return None
def no_op(self, *_, **__):
pass
def __getattr__(self, item):
"""
If undefined methods are called on this class, nothing happens
"""
return self.no_op
@classmethod
def as_callback(cls, maybe_callback=None):
"""Transform callback=... into Callback instance
For the special value of ``None``, return the global instance of
``NoOpCallback``. This is an alternative to including
``callback=DEFAULT_CALLBACK`` directly in a method signature.
"""
if maybe_callback is None:
return DEFAULT_CALLBACK
return maybe_callback
class NoOpCallback(Callback):
"""
This implementation of Callback does exactly nothing
"""
def call(self, *args, **kwargs):
return None
class DotPrinterCallback(Callback):
"""
Simple example Callback implementation
Almost identical to Callback with a hook that prints a char; here we
demonstrate how the outer layer may print "#" and the inner layer "."
"""
def __init__(self, chr_to_print="#", **kwargs):
self.chr = chr_to_print
super().__init__(**kwargs)
def branch(self, path_1, path_2, kwargs):
"""Mutate kwargs to add new instance with different print char"""
kwargs["callback"] = DotPrinterCallback(".")
def call(self, **kwargs):
"""Just outputs a character"""
print(self.chr, end="")
class TqdmCallback(Callback):
"""
A callback to display a progress bar using tqdm
Parameters
----------
tqdm_kwargs : dict, (optional)
Any argument accepted by the tqdm constructor.
See the `tqdm doc <https://tqdm.github.io/docs/tqdm/#__init__>`_.
Will be forwarded to `tqdm_cls`.
tqdm_cls: (optional)
subclass of `tqdm.tqdm`. If not passed, it will default to `tqdm.tqdm`.
Examples
--------
>>> import fsspec
>>> from fsspec.callbacks import TqdmCallback
>>> fs = fsspec.filesystem("memory")
>>> path2distant_data = "/your-path"
>>> fs.upload(
".",
path2distant_data,
recursive=True,
callback=TqdmCallback(),
)
You can forward args to tqdm using the ``tqdm_kwargs`` parameter.
>>> fs.upload(
".",
path2distant_data,
recursive=True,
callback=TqdmCallback(tqdm_kwargs={"desc": "Your tqdm description"}),
)
You can also customize the progress bar by passing a subclass of `tqdm`.
.. code-block:: python
class TqdmFormat(tqdm):
'''Provides a `total_time` format parameter'''
@property
def format_dict(self):
d = super().format_dict
total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1)
d.update(total_time=self.format_interval(total_time) + " in total")
return d
>>> with TqdmCallback(
tqdm_kwargs={
"desc": "desc",
"bar_format": "{total_time}: {percentage:.0f}%|{bar}{r_bar}",
},
tqdm_cls=TqdmFormat,
) as callback:
fs.upload(".", path2distant_data, recursive=True, callback=callback)
"""
def __init__(self, tqdm_kwargs=None, *args, **kwargs):
try:
from tqdm import tqdm
except ImportError as exce:
raise ImportError(
"Using TqdmCallback requires tqdm to be installed"
) from exce
self._tqdm_cls = kwargs.pop("tqdm_cls", tqdm)
self._tqdm_kwargs = tqdm_kwargs or {}
self.tqdm = None
super().__init__(*args, **kwargs)
def call(self, *args, **kwargs):
if self.tqdm is None:
self.tqdm = self._tqdm_cls(total=self.size, **self._tqdm_kwargs)
self.tqdm.total = self.size
self.tqdm.update(self.value - self.tqdm.n)
def close(self):
if self.tqdm is not None:
self.tqdm.close()
self.tqdm = None
def __del__(self):
return self.close()
DEFAULT_CALLBACK = _DEFAULT_CALLBACK = NoOpCallback()
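
A short usage sketch of the hook mechanism described in the Callback docstring above; the hook name and size are arbitrary.

from fsspec.callbacks import Callback

def report(size, value, **kwargs):
    # hooks receive the callback's current size and value
    print(f"{value}/{size}")

cb = Callback(hooks={"report": report})
cb.set_size(3)               # triggers call() -> prints "0/3"
for _ in cb.wrap(range(3)):  # relative_update() once per item
    pass                     # prints "1/3", "2/3", "3/3"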

View File

@@ -0,0 +1,182 @@
"""Helper functions for a standard streaming compression API"""
from zipfile import ZipFile
import fsspec.utils
from fsspec.spec import AbstractBufferedFile
def noop_file(file, mode, **kwargs):
return file
# TODO: files should also be available as contexts
# should be functions of the form func(infile, mode=, **kwargs) -> file-like
compr = {None: noop_file}
def register_compression(name, callback, extensions, force=False):
"""Register an "inferable" file compression type.
Registers transparent file compression type for use with fsspec.open.
Compression can be specified by name in open, or "infer"-ed for any files
ending with the given extensions.
Args:
name: (str) The compression type name. Eg. "gzip".
callback: A callable of form (infile, mode, **kwargs) -> file-like.
Accepts an input file-like object, the target mode and kwargs.
Returns a wrapped file-like object.
extensions: (str, Iterable[str]) A file extension, or list of file
extensions for which to infer this compression scheme. Eg. "gz".
force: (bool) Force re-registration of compression type or extensions.
Raises:
ValueError: If name or extensions already registered, and not force.
"""
if isinstance(extensions, str):
extensions = [extensions]
# Validate registration
if name in compr and not force:
raise ValueError(f"Duplicate compression registration: {name}")
for ext in extensions:
if ext in fsspec.utils.compressions and not force:
raise ValueError(f"Duplicate compression file extension: {ext} ({name})")
compr[name] = callback
for ext in extensions:
fsspec.utils.compressions[ext] = name
def unzip(infile, mode="rb", filename=None, **kwargs):
if "r" not in mode:
filename = filename or "file"
z = ZipFile(infile, mode="w", **kwargs)
fo = z.open(filename, mode="w")
fo.close = lambda closer=fo.close: closer() or z.close()
return fo
z = ZipFile(infile)
if filename is None:
filename = z.namelist()[0]
return z.open(filename, mode="r", **kwargs)
register_compression("zip", unzip, "zip")
try:
from bz2 import BZ2File
except ImportError:
pass
else:
register_compression("bz2", BZ2File, "bz2")
try: # pragma: no cover
from isal import igzip
def isal(infile, mode="rb", **kwargs):
return igzip.IGzipFile(fileobj=infile, mode=mode, **kwargs)
register_compression("gzip", isal, "gz")
except ImportError:
from gzip import GzipFile
register_compression(
"gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz"
)
try:
from lzma import LZMAFile
register_compression("lzma", LZMAFile, "lzma")
register_compression("xz", LZMAFile, "xz")
except ImportError:
pass
try:
import lzmaffi
register_compression("lzma", lzmaffi.LZMAFile, "lzma", force=True)
register_compression("xz", lzmaffi.LZMAFile, "xz", force=True)
except ImportError:
pass
class SnappyFile(AbstractBufferedFile):
def __init__(self, infile, mode, **kwargs):
import snappy
super().__init__(
fs=None, path="snappy", mode=mode.strip("b") + "b", size=999999999, **kwargs
)
self.infile = infile
if "r" in mode:
self.codec = snappy.StreamDecompressor()
else:
self.codec = snappy.StreamCompressor()
def _upload_chunk(self, final=False):
self.buffer.seek(0)
out = self.codec.add_chunk(self.buffer.read())
self.infile.write(out)
return True
def seek(self, loc, whence=0):
raise NotImplementedError("SnappyFile is not seekable")
def seekable(self):
return False
def _fetch_range(self, start, end):
"""Get the specified set of bytes from remote"""
data = self.infile.read(end - start)
return self.codec.decompress(data)
try:
import snappy
snappy.compress(b"")
# Snappy may use the .sz file extension, but this is not part of the
# standard implementation.
register_compression("snappy", SnappyFile, [])
except (ImportError, NameError, AttributeError):
pass
try:
import lz4.frame
register_compression("lz4", lz4.frame.open, "lz4")
except ImportError:
pass
try:
# zstd in the standard library for python >= 3.14
from compression.zstd import ZstdFile
register_compression("zstd", ZstdFile, "zst")
except ImportError:
try:
import zstandard as zstd
def zstandard_file(infile, mode="rb"):
if "r" in mode:
cctx = zstd.ZstdDecompressor()
return cctx.stream_reader(infile)
else:
cctx = zstd.ZstdCompressor(level=10)
return cctx.stream_writer(infile)
register_compression("zstd", zstandard_file, "zst")
except ImportError:
pass
def available_compressions():
"""Return a list of the implemented compressions."""
return list(compr)
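
A hedged sketch of register_compression() for a custom codec; the "identity" name and ".raw" extension are invented for illustration.

import fsspec
from fsspec.compression import available_compressions, register_compression

# a pass-through "codec": the callback simply returns the underlying file object
register_compression("identity", lambda f, mode="rb", **kw: f, "raw", force=True)
assert "identity" in available_compressions()

with fsspec.open("memory://demo/data.raw", "wb", compression="infer") as f:
    f.write(b"hello")  # "infer" resolves the .raw extension to "identity"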

View File

@@ -0,0 +1,131 @@
from __future__ import annotations
import configparser
import json
import os
import warnings
from typing import Any
conf: dict[str, dict[str, Any]] = {}
default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec")
conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir)
def set_conf_env(conf_dict, envdict=os.environ):
"""Set config values from environment variables
Looks for variables of the form ``FSSPEC_<protocol>`` and
``FSSPEC_<protocol>_<kwarg>``. For ``FSSPEC_<protocol>`` the value is parsed
as a json dictionary and used to ``update`` the config of the
corresponding protocol. For ``FSSPEC_<protocol>_<kwarg>`` there is no
attempt to convert the string value, but the kwarg keys will be lower-cased.
The ``FSSPEC_<protocol>_<kwarg>`` variables are applied after the
``FSSPEC_<protocol>`` ones.
Parameters
----------
conf_dict : dict(str, dict)
This dict will be mutated
envdict : dict-like(str, str)
Source for the values - usually the real environment
"""
kwarg_keys = []
for key in envdict:
if key.startswith("FSSPEC_") and len(key) > 7 and key[7] != "_":
if key.count("_") > 1:
kwarg_keys.append(key)
continue
try:
value = json.loads(envdict[key])
except json.decoder.JSONDecodeError as ex:
warnings.warn(
f"Ignoring environment variable {key} due to a parse failure: {ex}"
)
else:
if isinstance(value, dict):
_, proto = key.split("_", 1)
conf_dict.setdefault(proto.lower(), {}).update(value)
else:
warnings.warn(
f"Ignoring environment variable {key} due to not being a dict:"
f" {type(value)}"
)
elif key.startswith("FSSPEC"):
warnings.warn(
f"Ignoring environment variable {key} due to having an unexpected name"
)
for key in kwarg_keys:
_, proto, kwarg = key.split("_", 2)
conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[key]
def set_conf_files(cdir, conf_dict):
"""Set config values from files
Scans for INI and JSON files in the given directory, and uses their
contents to set the config. In case of repeated values, later values
win.
In the case of INI files, all values are strings, and these will not
be converted.
Parameters
----------
cdir : str
Directory to search
conf_dict : dict(str, dict)
This dict will be mutated
"""
if not os.path.isdir(cdir):
return
allfiles = sorted(os.listdir(cdir))
for fn in allfiles:
if fn.endswith(".ini"):
ini = configparser.ConfigParser()
ini.read(os.path.join(cdir, fn))
for key in ini:
if key == "DEFAULT":
continue
conf_dict.setdefault(key, {}).update(dict(ini[key]))
if fn.endswith(".json"):
with open(os.path.join(cdir, fn)) as f:
js = json.load(f)
for key in js:
conf_dict.setdefault(key, {}).update(dict(js[key]))
def apply_config(cls, kwargs, conf_dict=None):
"""Supply default values for kwargs when instantiating class
Augments the passed kwargs, by finding entries in the config dict
which match the class's ``.protocol`` attribute (one or more str)
Parameters
----------
cls : file system implementation
kwargs : dict
conf_dict : dict of dict
Typically this is the global configuration
Returns
-------
dict : the modified set of kwargs
"""
if conf_dict is None:
conf_dict = conf
protos = cls.protocol if isinstance(cls.protocol, (tuple, list)) else [cls.protocol]
kw = {}
for proto in protos:
# default kwargs from the current state of the config
if proto in conf_dict:
kw.update(conf_dict[proto])
# explicit kwargs always win
kw.update(**kwargs)
kwargs = kw
return kwargs
set_conf_files(conf_dir, conf)
set_conf_env(conf)
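
A small sketch of the FSSPEC_* environment-variable convention handled by set_conf_env() above; the protocol and values are illustrative, and a fresh dict is used instead of the global conf.

import json

from fsspec.config import set_conf_env

env = {
    "FSSPEC_GCS": json.dumps({"project": "demo"}),  # JSON-dict form
    "FSSPEC_GCS_TOKEN": "anon",                     # per-kwarg form, applied afterwards
}
cfg = {}
set_conf_env(cfg, envdict=env)
assert cfg["gcs"] == {"project": "demo", "token": "anon"}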

View File

@@ -0,0 +1,125 @@
import os
import shutil
import subprocess
import sys
import time
from collections import deque
from collections.abc import Generator, Sequence
import pytest
import fsspec
@pytest.fixture()
def m():
"""
Fixture providing a memory filesystem.
"""
m = fsspec.filesystem("memory")
m.store.clear()
m.pseudo_dirs.clear()
m.pseudo_dirs.append("")
try:
yield m
finally:
m.store.clear()
m.pseudo_dirs.clear()
m.pseudo_dirs.append("")
class InstanceCacheInspector:
"""
Helper class to inspect instance caches of filesystem classes in tests.
"""
def clear(self) -> None:
"""
Clear instance caches of all currently imported filesystem classes.
"""
classes = deque([fsspec.spec.AbstractFileSystem])
while classes:
cls = classes.popleft()
cls.clear_instance_cache()
classes.extend(cls.__subclasses__())
def gather_counts(self, *, omit_zero: bool = True) -> dict[str, int]:
"""
Gather counts of filesystem instances in the instance caches
of all currently imported filesystem classes.
Parameters
----------
omit_zero:
Whether to omit instance types with no cached instances.
"""
out: dict[str, int] = {}
classes = deque([fsspec.spec.AbstractFileSystem])
while classes:
cls = classes.popleft()
count = len(cls._cache) # there is no public interface for the cache
# note: skip intermediate AbstractFileSystem subclasses
# if they proxy the protocol attribute via a property.
if isinstance(cls.protocol, (Sequence, str)):
key = cls.protocol if isinstance(cls.protocol, str) else cls.protocol[0]
if count or not omit_zero:
out[key] = count
classes.extend(cls.__subclasses__())
return out
@pytest.fixture(scope="function", autouse=True)
def instance_caches() -> Generator[InstanceCacheInspector, None, None]:
"""
Fixture to ensure empty filesystem instance caches before and after a test.
Used by default for all tests.
Clears caches of all imported filesystem classes.
Can be used to write test assertions about instance caches.
Usage:
def test_something(instance_caches):
# Test code here
fsspec.open("file://abc")
fsspec.open("memory://foo/bar")
# Test assertion
assert instance_caches.gather_counts() == {"file": 1, "memory": 1}
Returns
-------
instance_caches: An instance cache inspector for clearing and inspecting caches.
"""
ic = InstanceCacheInspector()
ic.clear()
try:
yield ic
finally:
ic.clear()
@pytest.fixture(scope="function")
def ftp_writable(tmpdir):
"""
Fixture providing a writable FTP filesystem.
"""
pytest.importorskip("pyftpdlib")
d = str(tmpdir)
with open(os.path.join(d, "out"), "wb") as f:
f.write(b"hello" * 10000)
P = subprocess.Popen(
[sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"]
)
try:
time.sleep(1)
yield "localhost", 2121, "user", "pass"
finally:
P.terminate()
P.wait()
try:
shutil.rmtree(tmpdir)
except Exception:
pass
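
A sketch of how a test module would consume the fixtures defined above; the test name is hypothetical.

import fsspec

def test_memory_roundtrip(m, instance_caches):
    # "m" is the cleaned MemoryFileSystem provided by the fixture above
    m.pipe("/data.txt", b"payload")
    assert m.cat("/data.txt") == b"payload"
    # the autouse instance_caches fixture can be used for cache assertions
    fsspec.filesystem("memory")
    assert instance_caches.gather_counts().get("memory") == 1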

View File

@@ -0,0 +1,743 @@
from __future__ import annotations
import io
import logging
import os
import re
from glob import has_magic
from pathlib import Path
# for backwards compat, we export cache things from here too
from fsspec.caching import ( # noqa: F401
BaseCache,
BlockCache,
BytesCache,
MMapCache,
ReadAheadCache,
caches,
)
from fsspec.compression import compr
from fsspec.config import conf
from fsspec.registry import filesystem, get_filesystem_class
from fsspec.utils import (
_unstrip_protocol,
build_name_function,
infer_compression,
stringify_path,
)
logger = logging.getLogger("fsspec")
class OpenFile:
"""
File-like object to be used in a context
Can layer (buffered) text-mode and compression over any file-system, which
are typically binary-only.
These instances are safe to serialize, as the low-level file object
is not created until invoked using ``with``.
Parameters
----------
fs: FileSystem
The file system to use for opening the file. Should be a subclass or duck-type
with ``fsspec.spec.AbstractFileSystem``
path: str
Location to open
mode: str like 'rb', optional
Mode of the opened file
compression: str or None, optional
Compression to apply
encoding: str or None, optional
The encoding to use if opened in text mode.
errors: str or None, optional
How to handle encoding errors if opened in text mode.
newline: None or str
Passed to TextIOWrapper in text mode, how to handle line endings.
autoopen: bool
If True, calls open() immediately. Mostly used by pickle
pos: int
If given and autoopen is True, seek to this location immediately
"""
def __init__(
self,
fs,
path,
mode="rb",
compression=None,
encoding=None,
errors=None,
newline=None,
):
self.fs = fs
self.path = path
self.mode = mode
self.compression = get_compression(path, compression)
self.encoding = encoding
self.errors = errors
self.newline = newline
self.fobjects = []
def __reduce__(self):
return (
OpenFile,
(
self.fs,
self.path,
self.mode,
self.compression,
self.encoding,
self.errors,
self.newline,
),
)
def __repr__(self):
return f"<OpenFile '{self.path}'>"
def __enter__(self):
mode = self.mode.replace("t", "").replace("b", "") + "b"
try:
f = self.fs.open(self.path, mode=mode)
except FileNotFoundError as e:
if has_magic(self.path):
raise FileNotFoundError(
"%s not found. The URL contains glob characters: you maybe needed\n"
"to pass expand=True in fsspec.open() or the storage_options of \n"
"your library. You can also set the config value 'open_expand'\n"
"before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True.",
self.path,
) from e
raise
self.fobjects = [f]
if self.compression is not None:
compress = compr[self.compression]
f = compress(f, mode=mode[0])
self.fobjects.append(f)
if "b" not in self.mode:
# assume, for example, that 'r' is equivalent to 'rt' as in builtin
f = PickleableTextIOWrapper(
f, encoding=self.encoding, errors=self.errors, newline=self.newline
)
self.fobjects.append(f)
return self.fobjects[-1]
def __exit__(self, *args):
self.close()
@property
def full_name(self):
return _unstrip_protocol(self.path, self.fs)
def open(self):
"""Materialise this as a real open file without context
The OpenFile object should be explicitly closed to avoid enclosed file
instances persisting. You must, therefore, keep a reference to the OpenFile
during the life of the file-like it generates.
"""
return self.__enter__()
def close(self):
"""Close all encapsulated file objects"""
for f in reversed(self.fobjects):
if "r" not in self.mode and not f.closed:
f.flush()
f.close()
self.fobjects.clear()
class OpenFiles(list):
"""List of OpenFile instances
Can be used in a single context, which opens and closes all of the
contained files. Normal list access to get the elements works as
normal.
A special case is made for caching filesystems - the files will
be down/uploaded together at the start or end of the context, and
this may happen concurrently, if the target filesystem supports it.
"""
def __init__(self, *args, mode="rb", fs=None):
self.mode = mode
self.fs = fs
self.files = []
super().__init__(*args)
def __enter__(self):
if self.fs is None:
raise ValueError("Context has already been used")
fs = self.fs
while True:
if hasattr(fs, "open_many"):
# check for concurrent cache download; or set up for upload
self.files = fs.open_many(self)
return self.files
if hasattr(fs, "fs") and fs.fs is not None:
fs = fs.fs
else:
break
return [s.__enter__() for s in self]
def __exit__(self, *args):
fs = self.fs
[s.__exit__(*args) for s in self]
if "r" not in self.mode:
while True:
if hasattr(fs, "open_many"):
# check for concurrent cache upload
fs.commit_many(self.files)
return
if hasattr(fs, "fs") and fs.fs is not None:
fs = fs.fs
else:
break
def __getitem__(self, item):
out = super().__getitem__(item)
if isinstance(item, slice):
return OpenFiles(out, mode=self.mode, fs=self.fs)
return out
def __repr__(self):
return f"<List of {len(self)} OpenFile instances>"
def open_files(
urlpath,
mode="rb",
compression=None,
encoding="utf8",
errors=None,
name_function=None,
num=1,
protocol=None,
newline=None,
auto_mkdir=True,
expand=True,
**kwargs,
):
"""Given a path or paths, return a list of ``OpenFile`` objects.
For writing, a str path must contain the "*" character, which will be filled
in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2.
For either reading or writing, can instead provide explicit list of paths.
Parameters
----------
urlpath: string or list
Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
to read from alternative filesystems. To read from multiple files you
can pass a globstring or a list of paths, with the caveat that they
must all have the same protocol.
mode: 'rb', 'wt', etc.
compression: string or None
If given, open file using compression codec. Can either be a compression
name (a key in ``fsspec.compression.compr``) or "infer" to guess the
compression from the filename suffix.
encoding: str
For text mode only
errors: None or str
Passed to TextIOWrapper in text mode
name_function: function or None
if opening a set of files for writing, those files do not yet exist,
so we need to generate their names by formatting the urlpath for
each sequence number
num: int [1]
if writing mode, number of files we expect to create (passed to
name_function)
protocol: str or None
If given, overrides the protocol found in the URL.
newline: bytes or None
Used for line terminator in text mode. If None, uses system default;
if blank, uses no translation.
auto_mkdir: bool (True)
If in write mode, this will ensure the target directory exists before
writing, by calling ``fs.mkdirs(exist_ok=True)``.
expand: bool
Whether to expand paths: glob patterns when reading, or the ``*`` template
via ``name_function`` when writing (see ``open``).
**kwargs: dict
Extra options that make sense to a particular storage connection, e.g.
host, port, username, password, etc.
Examples
--------
>>> files = open_files('2015-*-*.csv') # doctest: +SKIP
>>> files = open_files(
... 's3://bucket/2015-*-*.csv.gz', compression='gzip'
... ) # doctest: +SKIP
Returns
-------
An ``OpenFiles`` instance, which is a list of ``OpenFile`` objects that can
be used as a single context
Notes
-----
For a full list of the available protocols and the implementations that
they map across to, see the latest online documentation:
- For implementations built into ``fsspec`` see
https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
- For implementations in separate packages see
https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
"""
fs, fs_token, paths = get_fs_token_paths(
urlpath,
mode,
num=num,
name_function=name_function,
storage_options=kwargs,
protocol=protocol,
expand=expand,
)
if fs.protocol == "file":
fs.auto_mkdir = auto_mkdir
elif "r" not in mode and auto_mkdir:
parents = {fs._parent(path) for path in paths}
for parent in parents:
try:
fs.makedirs(parent, exist_ok=True)
except PermissionError:
pass
return OpenFiles(
[
OpenFile(
fs,
path,
mode=mode,
compression=compression,
encoding=encoding,
errors=errors,
newline=newline,
)
for path in paths
],
mode=mode,
fs=fs,
)
def _un_chain(path, kwargs):
# Avoid a circular import
from fsspec.implementations.chained import ChainedFileSystem
if "::" in path:
x = re.compile(".*[^a-z]+.*") # test for non protocol-like single word
bits = []
for p in path.split("::"):
if "://" in p or x.match(p):
bits.append(p)
else:
bits.append(p + "://")
else:
bits = [path]
# [[url, protocol, kwargs], ...]
out = []
previous_bit = None
kwargs = kwargs.copy()
for bit in reversed(bits):
protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
cls = get_filesystem_class(protocol)
extra_kwargs = cls._get_kwargs_from_urls(bit)
kws = kwargs.pop(protocol, {})
if bit is bits[0]:
kws.update(kwargs)
kw = dict(
**{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
**kws,
)
bit = cls._strip_protocol(bit)
if "target_protocol" not in kw and issubclass(cls, ChainedFileSystem):
bit = previous_bit
out.append((bit, protocol, kw))
previous_bit = bit
out.reverse()
return out
def url_to_fs(url, **kwargs):
"""
Turn fully-qualified and potentially chained URL into filesystem instance
Parameters
----------
url : str
The fsspec-compatible URL
**kwargs: dict
Extra options that make sense to a particular storage connection, e.g.
host, port, username, password, etc.
Returns
-------
filesystem : FileSystem
The new filesystem discovered from ``url`` and created with
``**kwargs``.
urlpath : str
The filesystem-specific URL for ``url``.
"""
url = stringify_path(url)
# non-FS arguments that appear in fsspec.open()
# inspect could keep this in sync with open()'s signature
known_kwargs = {
"compression",
"encoding",
"errors",
"expand",
"mode",
"name_function",
"newline",
"num",
}
kwargs = {k: v for k, v in kwargs.items() if k not in known_kwargs}
chain = _un_chain(url, kwargs)
inkwargs = {}
# Reverse iterate the chain, creating a nested target_* structure
for i, ch in enumerate(reversed(chain)):
urls, protocol, kw = ch
if i == len(chain) - 1:
inkwargs = dict(**kw, **inkwargs)
continue
inkwargs["target_options"] = dict(**kw, **inkwargs)
inkwargs["target_protocol"] = protocol
inkwargs["fo"] = urls
urlpath, protocol, _ = chain[0]
fs = filesystem(protocol, **inkwargs)
return fs, urlpath
DEFAULT_EXPAND = conf.get("open_expand", False)
def open(
urlpath,
mode="rb",
compression=None,
encoding="utf8",
errors=None,
protocol=None,
newline=None,
expand=None,
**kwargs,
):
"""Given a path or paths, return one ``OpenFile`` object.
Parameters
----------
urlpath: string or list
Absolute or relative filepath. Prefix with a protocol like ``s3://``
to read from alternative filesystems. Should not include glob
character(s).
mode: 'rb', 'wt', etc.
compression: string or None
If given, open file using compression codec. Can either be a compression
name (a key in ``fsspec.compression.compr``) or "infer" to guess the
compression from the filename suffix.
encoding: str
For text mode only
errors: None or str
Passed to TextIOWrapper in text mode
protocol: str or None
If given, overrides the protocol found in the URL.
newline: bytes or None
Used for line terminator in text mode. If None, uses system default;
if blank, uses no translation.
expand: bool or None
Whether to regard file paths containing special glob characters as needing
expansion (finding the first match) or as literal paths. Setting False allows using
paths which do embed such characters. If None (default), this argument
takes its value from the DEFAULT_EXPAND module variable, which takes
its initial value from the "open_expand" config value at startup, which will
be False if not set.
**kwargs: dict
Extra options that make sense to a particular storage connection, e.g.
host, port, username, password, etc.
Examples
--------
>>> openfile = open('2015-01-01.csv') # doctest: +SKIP
>>> openfile = open(
... 's3://bucket/2015-01-01.csv.gz', compression='gzip'
... ) # doctest: +SKIP
>>> with openfile as f:
... df = pd.read_csv(f) # doctest: +SKIP
...
Returns
-------
``OpenFile`` object.
Notes
-----
For a full list of the available protocols and the implementations that
they map across to, see the latest online documentation:
- For implementations built into ``fsspec`` see
https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
- For implementations in separate packages see
https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
"""
expand = DEFAULT_EXPAND if expand is None else expand
out = open_files(
urlpath=[urlpath],
mode=mode,
compression=compression,
encoding=encoding,
errors=errors,
protocol=protocol,
newline=newline,
expand=expand,
**kwargs,
)
if not out:
raise FileNotFoundError(urlpath)
return out[0]
def open_local(
url: str | list[str] | Path | list[Path],
mode: str = "rb",
**storage_options: dict,
) -> str | list[str]:
"""Open file(s) which can be resolved to local
For files which either are local, or get downloaded upon open
(e.g., by file caching)
Parameters
----------
url: str or list(str)
mode: str
Must be read mode
storage_options:
passed on to the FS constructor or used by open_files (e.g., compression)
"""
if "r" not in mode:
raise ValueError("Can only ensure local files when reading")
of = open_files(url, mode=mode, **storage_options)
if not getattr(of[0].fs, "local_file", False):
raise ValueError(
"open_local can only be used on a filesystem which"
" has attribute local_file=True"
)
with of as files:
paths = [f.name for f in files]
if (isinstance(url, str) and not has_magic(url)) or isinstance(url, Path):
return paths[0]
return paths
def get_compression(urlpath, compression):
if compression == "infer":
compression = infer_compression(urlpath)
if compression is not None and compression not in compr:
raise ValueError(f"Compression type {compression} not supported")
return compression
def split_protocol(urlpath):
"""Return protocol, path pair"""
urlpath = stringify_path(urlpath)
if "://" in urlpath:
protocol, path = urlpath.split("://", 1)
if len(protocol) > 1:
# excludes Windows paths
return protocol, path
if urlpath.startswith("data:"):
return urlpath.split(":", 1)
return None, urlpath
def strip_protocol(urlpath):
"""Return only path part of full URL, according to appropriate backend"""
protocol, _ = split_protocol(urlpath)
cls = get_filesystem_class(protocol)
return cls._strip_protocol(urlpath)
def expand_paths_if_needed(paths, mode, num, fs, name_function):
"""Expand paths if they have a ``*`` in them (write mode) or any of ``*?[]``
in them (read mode).
Parameters
----------
paths: list of str
List of paths to expand.
mode: str
Mode in which to open files.
num: int
If opening in writing mode, number of files we expect to create.
fs: filesystem object
name_function: callable
If opening in writing mode, this callable is used to generate path
names. Names are generated for each partition by
``urlpath.replace('*', name_function(partition_index))``.
Returns
-------
list of str
The expanded list of paths.
"""
expanded_paths = []
paths = list(paths)
if "w" in mode: # read mode
if sum(1 for p in paths if "*" in p) > 1:
raise ValueError(
"When writing data, only one filename mask can be specified."
)
num = max(num, len(paths))
for curr_path in paths:
if "*" in curr_path:
# expand using name_function
expanded_paths.extend(_expand_paths(curr_path, name_function, num))
else:
expanded_paths.append(curr_path)
# if we generated more paths than asked for, trim the list
if len(expanded_paths) > num:
expanded_paths = expanded_paths[:num]
else: # read mode
for curr_path in paths:
if has_magic(curr_path):
# expand using glob
expanded_paths.extend(fs.glob(curr_path))
else:
expanded_paths.append(curr_path)
return expanded_paths
def get_fs_token_paths(
urlpath,
mode="rb",
num=1,
name_function=None,
storage_options=None,
protocol=None,
expand=True,
):
"""Filesystem, deterministic token, and paths from a urlpath and options.
Parameters
----------
urlpath: string or iterable
Absolute or relative filepath, URL (may include protocols like
``s3://``), or globstring pointing to data.
mode: str, optional
Mode in which to open files.
num: int, optional
If opening in writing mode, number of files we expect to create.
name_function: callable, optional
If opening in writing mode, this callable is used to generate path
names. Names are generated for each partition by
``urlpath.replace('*', name_function(partition_index))``.
storage_options: dict, optional
Additional keywords to pass to the filesystem class.
protocol: str or None
To override the protocol specifier in the URL
expand: bool
Expand string paths for writing, assuming the path is a directory
"""
if isinstance(urlpath, (list, tuple, set)):
if not urlpath:
raise ValueError("empty urlpath sequence")
urlpath0 = stringify_path(next(iter(urlpath)))
else:
urlpath0 = stringify_path(urlpath)
storage_options = storage_options or {}
if protocol:
storage_options["protocol"] = protocol
chain = _un_chain(urlpath0, storage_options or {})
inkwargs = {}
# Reverse iterate the chain, creating a nested target_* structure
for i, ch in enumerate(reversed(chain)):
urls, nested_protocol, kw = ch
if i == len(chain) - 1:
inkwargs = dict(**kw, **inkwargs)
continue
inkwargs["target_options"] = dict(**kw, **inkwargs)
inkwargs["target_protocol"] = nested_protocol
inkwargs["fo"] = urls
paths, protocol, _ = chain[0]
fs = filesystem(protocol, **inkwargs)
if isinstance(urlpath, (list, tuple, set)):
pchains = [
_un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath
]
if len({pc[1] for pc in pchains}) > 1:
raise ValueError("Protocol mismatch getting fs from %s", urlpath)
paths = [pc[0] for pc in pchains]
else:
paths = fs._strip_protocol(paths)
if isinstance(paths, (list, tuple, set)):
if expand:
paths = expand_paths_if_needed(paths, mode, num, fs, name_function)
elif not isinstance(paths, list):
paths = list(paths)
else:
if ("w" in mode or "x" in mode) and expand:
paths = _expand_paths(paths, name_function, num)
elif "*" in paths:
paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
else:
paths = [paths]
return fs, fs._fs_token, paths
def _expand_paths(path, name_function, num):
if isinstance(path, str):
if path.count("*") > 1:
raise ValueError("Output path spec must contain exactly one '*'.")
elif "*" not in path:
path = os.path.join(path, "*.part")
if name_function is None:
name_function = build_name_function(num - 1)
paths = [path.replace("*", name_function(i)) for i in range(num)]
if paths != sorted(paths):
logger.warning(
"In order to preserve order between partitions"
" paths created with ``name_function`` should "
"sort to partition order"
)
elif isinstance(path, (tuple, list)):
assert len(path) == num
paths = list(path)
else:
raise ValueError(
"Path should be either\n"
"1. A list of paths: ['foo.json', 'bar.json', ...]\n"
"2. A directory: 'foo/\n"
"3. A path with a '*' in it: 'foo.*.json'"
)
return paths
class PickleableTextIOWrapper(io.TextIOWrapper):
"""TextIOWrapper cannot be pickled. This solves it.
Requires that ``buffer`` be pickleable, which all instances of
AbstractBufferedFile are.
"""
def __init__(
self,
buffer,
encoding=None,
errors=None,
newline=None,
line_buffering=False,
write_through=False,
):
self.args = buffer, encoding, errors, newline, line_buffering, write_through
super().__init__(*self.args)
def __reduce__(self):
return PickleableTextIOWrapper, self.args
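
A compact sketch of the public entry points defined in this module; the memory filesystem keeps the example self-contained, and the paths are invented.

import fsspec
from fsspec.core import url_to_fs

# url_to_fs resolves a (possibly chained) URL to a filesystem plus a stripped path
fs, path = url_to_fs("memory://bucket/part0.txt")
print(type(fs).__name__, path)

# fsspec.open returns an OpenFile; text mode and compression are layered on __enter__
with fsspec.open("memory://bucket/part0.txt", "wt", encoding="utf8") as f:
    f.write("hello")

# open_files expands a glob into an OpenFiles list usable as a single context
files = fsspec.open_files("memory://bucket/part*.txt", "rt")
with files as handles:
    print([h.read() for h in handles])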

View File

@@ -0,0 +1,98 @@
import time
from collections.abc import MutableMapping
from functools import lru_cache
class DirCache(MutableMapping):
"""
Caching of directory listings, in a structure like::
{"path0": [
{"name": "path0/file0",
"size": 123,
"type": "file",
...
},
{"name": "path0/file1",
},
...
],
"path1": [...]
}
Parameters to this class control listing expiry or indeed turn
caching off
"""
def __init__(
self,
use_listings_cache=True,
listings_expiry_time=None,
max_paths=None,
**kwargs,
):
"""
Parameters
----------
use_listings_cache: bool
If False, this cache never returns items, but always reports KeyError,
and setting items has no effect
listings_expiry_time: int or float (optional)
Time in seconds that a listing is considered valid. If None,
listings do not expire.
max_paths: int (optional)
The number of most recent listings that are considered valid; 'recent'
refers to when the entry was set.
"""
self._cache = {}
self._times = {}
if max_paths:
self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None))
self.use_listings_cache = use_listings_cache
self.listings_expiry_time = listings_expiry_time
self.max_paths = max_paths
def __getitem__(self, item):
if self.listings_expiry_time is not None:
if self._times.get(item, 0) - time.time() < -self.listings_expiry_time:
del self._cache[item]
if self.max_paths:
self._q(item)
return self._cache[item] # maybe raises KeyError
def clear(self):
self._cache.clear()
def __len__(self):
return len(self._cache)
def __contains__(self, item):
try:
self[item]
return True
except KeyError:
return False
def __setitem__(self, key, value):
if not self.use_listings_cache:
return
if self.max_paths:
self._q(key)
self._cache[key] = value
if self.listings_expiry_time is not None:
self._times[key] = time.time()
def __delitem__(self, key):
del self._cache[key]
def __iter__(self):
entries = list(self._cache)
return (k for k in entries if k in self)
def __reduce__(self):
return (
DirCache,
(self.use_listings_cache, self.listings_expiry_time, self.max_paths),
)
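
A quick sketch exercising the DirCache expiry behaviour described above; the expiry time and listing entry are illustrative.

import time

from fsspec.dircache import DirCache

cache = DirCache(listings_expiry_time=0.1)
cache["bucket/prefix"] = [{"name": "bucket/prefix/file0", "size": 123, "type": "file"}]
assert "bucket/prefix" in cache
time.sleep(0.2)
assert "bucket/prefix" not in cache  # the expired entry is evicted on access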

View File

@@ -0,0 +1,18 @@
"""
fsspec user-defined exception classes
"""
import asyncio
class BlocksizeMismatchError(ValueError):
"""
Raised when a cached file is opened with a different blocksize than it was
written with
"""
class FSTimeoutError(asyncio.TimeoutError):
"""
Raised when an fsspec operation times out
"""

View File

@@ -0,0 +1,324 @@
import argparse
import logging
import os
import stat
import threading
import time
from errno import EIO, ENOENT
from fuse import FUSE, FuseOSError, LoggingMixIn, Operations
from fsspec import __version__
from fsspec.core import url_to_fs
logger = logging.getLogger("fsspec.fuse")
class FUSEr(Operations):
def __init__(self, fs, path, ready_file=False):
self.fs = fs
self.cache = {}
self.root = path.rstrip("/") + "/"
self.counter = 0
logger.info("Starting FUSE at %s", path)
self._ready_file = ready_file
def getattr(self, path, fh=None):
logger.debug("getattr %s", path)
if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
return {"type": "file", "st_size": 5}
path = "".join([self.root, path.lstrip("/")]).rstrip("/")
try:
info = self.fs.info(path)
except FileNotFoundError as exc:
raise FuseOSError(ENOENT) from exc
data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)}
perm = info.get("mode", 0o777)
if info["type"] != "file":
data["st_mode"] = stat.S_IFDIR | perm
data["st_size"] = 0
data["st_blksize"] = 0
else:
data["st_mode"] = stat.S_IFREG | perm
data["st_size"] = info["size"]
data["st_blksize"] = 5 * 2**20
data["st_nlink"] = 1
data["st_atime"] = info["atime"] if "atime" in info else time.time()
data["st_ctime"] = info["ctime"] if "ctime" in info else time.time()
data["st_mtime"] = info["mtime"] if "mtime" in info else time.time()
return data
def readdir(self, path, fh):
logger.debug("readdir %s", path)
path = "".join([self.root, path.lstrip("/")])
files = self.fs.ls(path, False)
files = [os.path.basename(f.rstrip("/")) for f in files]
return [".", ".."] + files
def mkdir(self, path, mode):
path = "".join([self.root, path.lstrip("/")])
self.fs.mkdir(path)
return 0
def rmdir(self, path):
path = "".join([self.root, path.lstrip("/")])
self.fs.rmdir(path)
return 0
def read(self, path, size, offset, fh):
logger.debug("read %s", (path, size, offset))
if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
# status indicator
return b"ready"
f = self.cache[fh]
f.seek(offset)
out = f.read(size)
return out
def write(self, path, data, offset, fh):
logger.debug("write %s", (path, offset))
f = self.cache[fh]
f.seek(offset)
f.write(data)
return len(data)
def create(self, path, flags, fi=None):
logger.debug("create %s", (path, flags))
fn = "".join([self.root, path.lstrip("/")])
self.fs.touch(fn) # OS will want to get attributes immediately
f = self.fs.open(fn, "wb")
self.cache[self.counter] = f
self.counter += 1
return self.counter - 1
def open(self, path, flags):
logger.debug("open %s", (path, flags))
fn = "".join([self.root, path.lstrip("/")])
if flags % 2 == 0:
# read
mode = "rb"
else:
# write/create
mode = "wb"
self.cache[self.counter] = self.fs.open(fn, mode)
self.counter += 1
return self.counter - 1
def truncate(self, path, length, fh=None):
fn = "".join([self.root, path.lstrip("/")])
if length != 0:
raise NotImplementedError
# maybe should be no-op since open with write sets size to zero anyway
self.fs.touch(fn)
def unlink(self, path):
fn = "".join([self.root, path.lstrip("/")])
try:
self.fs.rm(fn, False)
except (OSError, FileNotFoundError) as exc:
raise FuseOSError(EIO) from exc
def release(self, path, fh):
try:
if fh in self.cache:
f = self.cache[fh]
f.close()
self.cache.pop(fh)
except Exception as e:
print(e)
return 0
def chmod(self, path, mode):
if hasattr(self.fs, "chmod"):
path = "".join([self.root, path.lstrip("/")])
return self.fs.chmod(path, mode)
raise NotImplementedError
def run(
fs,
path,
mount_point,
foreground=True,
threads=False,
ready_file=False,
ops_class=FUSEr,
):
"""Mount stuff in a local directory
This uses fusepy to make it appear as if a given path on an fsspec
instance is in fact resident within the local file-system.
This requires that fusepy be installed, and that FUSE be available on
the system (typically requiring a package to be installed with
apt, yum, brew, etc.).
Parameters
----------
fs: file-system instance
From one of the compatible implementations
path: str
Location on that file-system to regard as the root directory to
mount. Note that you typically should include the terminating "/"
character.
mount_point: str
An empty directory on the local file-system where the contents of
the remote path will appear.
foreground: bool
Whether or not calling this function will block. Operation will
typically be more stable if True.
threads: bool
Whether or not to create threads when responding to file operations
within the mounted directory. Operation will typically be more
stable if False.
ready_file: bool
If True, a ``.fuse_ready`` file will exist in the ``mount_point``
directory once the FUSE process is ready (for debugging purposes).
ops_class: FUSEr or Subclass of FUSEr
To override the default behavior of FUSEr. For example, logging
to a file.
"""
func = lambda: FUSE(
ops_class(fs, path, ready_file=ready_file),
mount_point,
nothreads=not threads,
foreground=foreground,
)
if not foreground:
th = threading.Thread(target=func)
th.daemon = True
th.start()
return th
else: # pragma: no cover
try:
func()
except KeyboardInterrupt:
pass
def main(args):
"""Mount filesystem from chained URL to MOUNT_POINT.
Examples:
python3 -m fsspec.fuse memory /usr/share /tmp/mem
python3 -m fsspec.fuse local /tmp/source /tmp/local \\
-l /tmp/fsspecfuse.log
You can also mount chained-URLs and use special settings:
python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\
/ /tmp/zip \\
-o 'filecache-cache_storage=/tmp/simplecache'
You can specify the type of a setting by appending `[int]` or `[bool]`
(`true`, `yes`, `1` represent the Boolean value `True`):
python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\
/historic/packages/RPMS /tmp/ftp \\
-o 'simplecache-cache_storage=/tmp/simplecache' \\
-o 'simplecache-check_files=false[bool]' \\
-o 'ftp-listings_expiry_time=60[int]' \\
-o 'ftp-username=anonymous' \\
-o 'ftp-password=xieyanbo'
"""
class RawDescriptionArgumentParser(argparse.ArgumentParser):
def format_help(self):
usage = super().format_help()
parts = usage.split("\n\n")
parts[1] = self.description.rstrip()
return "\n\n".join(parts)
parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__)
parser.add_argument("--version", action="version", version=__version__)
parser.add_argument("url", type=str, help="fs url")
parser.add_argument("source_path", type=str, help="source directory in fs")
parser.add_argument("mount_point", type=str, help="local directory")
parser.add_argument(
"-o",
"--option",
action="append",
help="Any options of protocol included in the chained URL",
)
parser.add_argument(
"-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')"
)
parser.add_argument(
"-f",
"--foreground",
action="store_false",
help="Running in foreground or not (Default: False)",
)
parser.add_argument(
"-t",
"--threads",
action="store_false",
help="Running with threads support (Default: False)",
)
parser.add_argument(
"-r",
"--ready-file",
action="store_false",
help="The `.fuse_ready` file will exist after FUSE is ready. "
"(Debugging purpose, Default: False)",
)
args = parser.parse_args(args)
kwargs = {}
for item in args.option or []:
key, sep, value = item.partition("=")
if not sep:
parser.error(message=f"Wrong option: {item!r}")
val = value.lower()
if val.endswith("[int]"):
value = int(value[: -len("[int]")])
elif val.endswith("[bool]"):
value = val[: -len("[bool]")] in ["1", "yes", "true"]
if "-" in key:
fs_name, setting_name = key.split("-", 1)
if fs_name in kwargs:
kwargs[fs_name][setting_name] = value
else:
kwargs[fs_name] = {setting_name: value}
else:
kwargs[key] = value
if args.log_file:
logging.basicConfig(
level=logging.DEBUG,
filename=args.log_file,
format="%(asctime)s %(message)s",
)
class LoggingFUSEr(FUSEr, LoggingMixIn):
pass
fuser = LoggingFUSEr
else:
fuser = FUSEr
fs, url_path = url_to_fs(args.url, **kwargs)
logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
run(
fs,
args.source_path,
args.mount_point,
foreground=args.foreground,
threads=args.threads,
ready_file=args.ready_file,
ops_class=fuser,
)
if __name__ == "__main__":
import sys
main(sys.argv[1:])
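
A hedged sketch of the Python API above (run()); it assumes fusepy and system FUSE are installed and that an empty mount directory such as /tmp/fsspec_mount (hypothetical) already exists.

import fsspec
from fsspec.fuse import run

fs = fsspec.filesystem("memory")
fs.pipe("/hello.txt", b"hi")

# foreground=False starts FUSE in a daemon thread and returns that thread
thread = run(fs, "/", "/tmp/fsspec_mount", foreground=False, ready_file=True)
# the mount point then shows hello.txt; a .fuse_ready marker appears once ready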

View File

@@ -0,0 +1,396 @@
from __future__ import annotations
import inspect
import logging
import os
import shutil
import uuid
from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
from .callbacks import DEFAULT_CALLBACK
from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs
_generic_fs = {}
logger = logging.getLogger("fsspec.generic")
def set_generic_fs(protocol, **storage_options):
"""Populate the dict used for method=="generic" lookups"""
_generic_fs[protocol] = filesystem(protocol, **storage_options)
def _resolve_fs(url, method, protocol=None, storage_options=None):
"""Pick instance of backend FS"""
url = url[0] if isinstance(url, (list, tuple)) else url
protocol = protocol or split_protocol(url)[0]
storage_options = storage_options or {}
if method == "default":
return filesystem(protocol)
if method == "generic":
return _generic_fs[protocol]
if method == "current":
cls = get_filesystem_class(protocol)
return cls.current()
if method == "options":
fs, _ = url_to_fs(url, **storage_options.get(protocol, {}))
return fs
raise ValueError(f"Unknown FS resolution method: {method}")
def rsync(
source,
destination,
delete_missing=False,
source_field="size",
dest_field="size",
update_cond="different",
inst_kwargs=None,
fs=None,
**kwargs,
):
"""Sync files between two directory trees
(experimental)
Parameters
----------
source: str
Root of the directory tree to take files from. This must be a directory, but
do not include any terminating "/" character
destination: str
Root path to copy into. The contents of this location should be
identical to the contents of ``source`` when done. This will be made a
directory, and the terminal "/" should not be included.
delete_missing: bool
If there are paths in the destination that don't exist in the
source and this is True, delete them. Otherwise, leave them alone.
source_field: str | callable
If ``update_cond`` is "different", this is the key in the info
of source files to consider for difference. May be a function of the
info dict.
dest_field: str | callable
If ``update_cond`` is "different", this is the key in the info
of destination files to consider for difference. May be a function of
the info dict.
update_cond: "different"|"always"|"never"
If "always", every file is copied, regardless of whether it exists in
the destination. If "never", files that exist in the destination are
not copied again. If "different" (default), only copy if the info
fields given by ``source_field`` and ``dest_field`` (usually "size")
are different. Other comparisons may be added in the future.
inst_kwargs: dict|None
If ``fs`` is None, use this set of keyword arguments to make a
GenericFileSystem instance
fs: GenericFileSystem|None
Instance to use if explicitly given. The instance defines how to
make downstream file system instances from paths.
Returns
-------
dict of the copy operations that were performed, {source: destination}
"""
fs = fs or GenericFileSystem(**(inst_kwargs or {}))
source = fs._strip_protocol(source)
destination = fs._strip_protocol(destination)
allfiles = fs.find(source, withdirs=True, detail=True)
if not fs.isdir(source):
raise ValueError("Can only rsync on a directory")
otherfiles = fs.find(destination, withdirs=True, detail=True)
dirs = [
a
for a, v in allfiles.items()
if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
]
logger.debug(f"{len(dirs)} directories to create")
if dirs:
fs.make_many_dirs(
[dirn.replace(source, destination) for dirn in dirs], exist_ok=True
)
allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
logger.debug(f"{len(allfiles)} files to consider for copy")
to_delete = [
o
for o, v in otherfiles.items()
if o.replace(destination, source) not in allfiles and v["type"] == "file"
]
for k, v in allfiles.copy().items():
otherfile = k.replace(source, destination)
if otherfile in otherfiles:
if update_cond == "always":
allfiles[k] = otherfile
elif update_cond == "never":
allfiles.pop(k)
elif update_cond == "different":
inf1 = source_field(v) if callable(source_field) else v[source_field]
v2 = otherfiles[otherfile]
inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
if inf1 != inf2:
# details mismatch, make copy
allfiles[k] = otherfile
else:
# details match, don't copy
allfiles.pop(k)
else:
# file not in target yet
allfiles[k] = otherfile
logger.debug(f"{len(allfiles)} files to copy")
if allfiles:
source_files, target_files = zip(*allfiles.items())
fs.cp(source_files, target_files, **kwargs)
logger.debug(f"{len(to_delete)} files to delete")
if delete_missing and to_delete:
fs.rm(to_delete)
return allfiles
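# --- Editor's sketch (not module code): calling rsync() between two
# --- memory-filesystem trees; the paths are invented for illustration.
#
#   import fsspec
#   from fsspec.generic import rsync
#
#   fs = fsspec.filesystem("memory")
#   fs.pipe("/src/a.txt", b"one")
#   fs.pipe("/src/sub/b.txt", b"two")
#   copied = rsync("memory://src", "memory://dst", delete_missing=True)
#   # "copied" maps each copied source URL to its destination URL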
class GenericFileSystem(AsyncFileSystem):
"""Wrapper over all other FS types
<experimental!>
This implementation is a single unified interface to be able to run FS operations
over generic URLs, and dispatch to the specific implementations using the URL
protocol prefix.
Note: instances of this FS are always async, even if you never use it with any async
backend.
"""
protocol = "generic" # there is no real reason to ever use a protocol with this FS
def __init__(self, default_method="default", storage_options=None, **kwargs):
"""
Parameters
----------
default_method: str (optional)
Defines how to configure backend FS instances. Options are:
- "default": instantiate like FSClass(), with no
extra arguments; this is the default instance of that FS, and can be
configured via the config system
- "generic": takes instances from the `_generic_fs` dict in this module,
which you must populate before use. Keys are by protocol
- "options": expects storage_options, a dict mapping protocol to
kwargs to use when constructing the filesystem
- "current": takes the most recently instantiated version of each FS
"""
self.method = default_method
self.st_opts = storage_options
super().__init__(**kwargs)
def _parent(self, path):
fs = _resolve_fs(path, self.method, storage_options=self.st_opts)
return fs.unstrip_protocol(fs._parent(path))
def _strip_protocol(self, path):
# normalization only
fs = _resolve_fs(path, self.method, storage_options=self.st_opts)
return fs.unstrip_protocol(fs._strip_protocol(path))
async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
fs = _resolve_fs(path, self.method, storage_options=self.st_opts)
if fs.async_impl:
out = await fs._find(
path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
)
else:
out = fs.find(
path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
)
result = {}
for k, v in out.items():
v = v.copy() # don't corrupt target FS dircache
name = fs.unstrip_protocol(k)
v["name"] = name
result[name] = v
if detail:
return result
return list(result)
async def _info(self, url, **kwargs):
fs = _resolve_fs(url, self.method)
if fs.async_impl:
out = await fs._info(url, **kwargs)
else:
out = fs.info(url, **kwargs)
out = out.copy() # don't edit originals
out["name"] = fs.unstrip_protocol(out["name"])
return out
async def _ls(
self,
url,
detail=True,
**kwargs,
):
fs = _resolve_fs(url, self.method)
if fs.async_impl:
out = await fs._ls(url, detail=True, **kwargs)
else:
out = fs.ls(url, detail=True, **kwargs)
out = [o.copy() for o in out] # don't edit originals
for o in out:
o["name"] = fs.unstrip_protocol(o["name"])
if detail:
return out
else:
return [o["name"] for o in out]
async def _cat_file(
self,
url,
**kwargs,
):
fs = _resolve_fs(url, self.method)
if fs.async_impl:
return await fs._cat_file(url, **kwargs)
else:
return fs.cat_file(url, **kwargs)
async def _pipe_file(
self,
path,
value,
**kwargs,
):
fs = _resolve_fs(path, self.method, storage_options=self.st_opts)
if fs.async_impl:
return await fs._pipe_file(path, value, **kwargs)
else:
return fs.pipe_file(path, value, **kwargs)
async def _rm(self, url, **kwargs):
urls = url
if isinstance(urls, str):
urls = [urls]
fs = _resolve_fs(urls[0], self.method)
if fs.async_impl:
await fs._rm(urls, **kwargs)
else:
fs.rm(url, **kwargs)
async def _makedirs(self, path, exist_ok=False):
logger.debug("Make dir %s", path)
fs = _resolve_fs(path, self.method, storage_options=self.st_opts)
if fs.async_impl:
await fs._makedirs(path, exist_ok=exist_ok)
else:
fs.makedirs(path, exist_ok=exist_ok)
def rsync(self, source, destination, **kwargs):
"""Sync files between two directory trees
See :func:`rsync` for more details.
"""
rsync(source, destination, fs=self, **kwargs)
async def _cp_file(
self,
url,
url2,
blocksize=2**20,
callback=DEFAULT_CALLBACK,
tempdir: str | None = None,
**kwargs,
):
fs = _resolve_fs(url, self.method)
fs2 = _resolve_fs(url2, self.method)
if fs is fs2:
# pure remote
if fs.async_impl:
return await fs._copy(url, url2, **kwargs)
else:
return fs.copy(url, url2, **kwargs)
await copy_file_op(fs, [url], fs2, [url2], tempdir, 1, on_error="raise")
async def _make_many_dirs(self, urls, exist_ok=True):
fs = _resolve_fs(urls[0], self.method)
if fs.async_impl:
coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
await _run_coros_in_chunks(coros)
else:
for u in urls:
fs.makedirs(u, exist_ok=exist_ok)
make_many_dirs = sync_wrapper(_make_many_dirs)
async def _copy(
self,
path1: list[str],
path2: list[str],
recursive: bool = False,
on_error: str = "ignore",
maxdepth: int | None = None,
batch_size: int | None = None,
tempdir: str | None = None,
**kwargs,
):
# TODO: special case for one FS being local, which can use get/put
# TODO: special case for one being memFS, which can use cat/pipe
if recursive:
raise NotImplementedError("Please use fsspec.generic.rsync")
path1 = [path1] if isinstance(path1, str) else path1
path2 = [path2] if isinstance(path2, str) else path2
fs = _resolve_fs(path1, self.method)
fs2 = _resolve_fs(path2, self.method)
if fs is fs2:
if fs.async_impl:
return await fs._copy(path1, path2, **kwargs)
else:
return fs.copy(path1, path2, **kwargs)
await copy_file_op(
fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
)
async def copy_file_op(
fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
):
import tempfile
tempdir = tempdir or tempfile.mkdtemp()
try:
coros = [
_copy_file_op(
fs1,
u1,
fs2,
u2,
os.path.join(tempdir, uuid.uuid4().hex),
)
for u1, u2 in zip(url1, url2)
]
out = await _run_coros_in_chunks(
coros, batch_size=batch_size, return_exceptions=True
)
finally:
shutil.rmtree(tempdir)
if on_error == "return":
return out
elif on_error == "raise":
for o in out:
if isinstance(o, Exception):
raise o
async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
if fs1.async_impl:
await fs1._get_file(url1, local)
else:
fs1.get_file(url1, local)
if fs2.async_impl:
await fs2._put_file(local, url2)
else:
fs2.put_file(local, url2)
os.unlink(local)
logger.debug("Copy %s -> %s; done", url1, url2)
async def maybe_await(cor):
if inspect.iscoroutine(cor):
return await cor
else:
return cor
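Usage sketch (illustrative, not part of the file above): driving rsync through a GenericFileSystem with a small tree in the in-memory filesystem; the paths are hypothetical and any registered protocols would work the same way.

import fsspec
from fsspec.generic import GenericFileSystem, rsync

# build a small source tree in the in-memory filesystem
mem = fsspec.filesystem("memory")
mem.pipe_file("/src/a.txt", b"hello")
mem.pipe_file("/src/sub/b.txt", b"world")

# copy everything whose size differs into the target tree
fs = GenericFileSystem(default_method="default")
copied = rsync("memory://src", "memory://dst", fs=fs)
print(copied)  # mapping of source URL -> destination URL for files copied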

View File

@@ -0,0 +1,417 @@
import ast
import contextlib
import logging
import os
import re
from collections.abc import Sequence
from typing import ClassVar
import panel as pn
from .core import OpenFile, get_filesystem_class, split_protocol
from .registry import known_implementations
pn.extension()
logger = logging.getLogger("fsspec.gui")
class SigSlot:
"""Signal-slot mixin, for Panel event passing
Include this class in a widget manager's superclasses to be able to
register events and callbacks on Panel widgets managed by that class.
The method ``_register`` should be called as widgets are added, and external
code should call ``connect`` to associate callbacks.
By default, all signals emit a DEBUG logging statement.
"""
# names of signals that this class may emit, each of which must be
# set by _register for any new instance
signals: ClassVar[Sequence[str]] = []
# names of actions that this class may respond to,
# each of which must be a method name
slots: ClassVar[Sequence[str]] = []
def __init__(self):
self._ignoring_events = False
self._sigs = {}
self._map = {}
self._setup()
def _setup(self):
"""Create GUI elements and register signals"""
self.panel = pn.pane.PaneBase()
# no signals to set up in the base class
def _register(
self, widget, name, thing="value", log_level=logging.DEBUG, auto=False
):
"""Watch the given attribute of a widget and assign it a named event
This is normally called at the time a widget is instantiated, in the
class which owns it.
Parameters
----------
widget : pn.layout.Panel or None
Widget to watch. If None, an anonymous signal not associated with
any widget.
name : str
Name of this event
thing : str
Attribute of the given widget to watch
log_level : int
When the signal is triggered, a logging event of the given level
will be fired in the fsspec.gui logger.
auto : bool
If True, automatically connects with a method in this class of the
same name.
"""
if name not in self.signals:
raise ValueError(f"Attempt to assign an undeclared signal: {name}")
self._sigs[name] = {
"widget": widget,
"callbacks": [],
"thing": thing,
"log": log_level,
}
wn = "-".join(
[
getattr(widget, "name", str(widget)) if widget is not None else "none",
thing,
]
)
self._map[wn] = name
if widget is not None:
widget.param.watch(self._signal, thing, onlychanged=True)
if auto and hasattr(self, name):
self.connect(name, getattr(self, name))
def _repr_mimebundle_(self, *args, **kwargs):
"""Display in a notebook or a server"""
try:
return self.panel._repr_mimebundle_(*args, **kwargs)
except (ValueError, AttributeError) as exc:
raise NotImplementedError(
"Panel does not seem to be set up properly"
) from exc
def connect(self, signal, slot):
"""Associate call back with given event
The callback must be a function which takes the "new" value of the
watched attribute as the only parameter. If the callback returns False,
this cancels any further processing of the given event.
Alternatively, the callback can be a string, in which case it means
emitting the correspondingly-named event (i.e., connect to self)
"""
self._sigs[signal]["callbacks"].append(slot)
def _signal(self, event):
"""This is called by a an action on a widget
Within an self.ignore_events context, nothing happens.
Tests can execute this method by directly changing the values of
widget components.
"""
if not self._ignoring_events:
wn = "-".join([event.obj.name, event.name])
if wn in self._map and self._map[wn] in self._sigs:
self._emit(self._map[wn], event.new)
@contextlib.contextmanager
def ignore_events(self):
"""Temporarily turn off events processing in this instance
(does not propagate to children)
"""
self._ignoring_events = True
try:
yield
finally:
self._ignoring_events = False
def _emit(self, sig, value=None):
"""An event happened, call its callbacks
This method can be used in tests to simulate message passing without
directly changing visual elements.
Calling of callbacks will halt whenever one returns False.
"""
logger.log(self._sigs[sig]["log"], f"{sig}: {value}")
for callback in self._sigs[sig]["callbacks"]:
if isinstance(callback, str):
self._emit(callback)
else:
try:
# running callbacks should not break the interface
ret = callback(value)
if ret is False:
break
except Exception as e:
logger.exception(
"Exception (%s) while executing callback for signal: %s",
e,
sig,
)
def show(self, threads=False):
"""Open a new browser tab and display this instance's interface"""
self.panel.show(threads=threads, verbose=False)
return self
class SingleSelect(SigSlot):
"""A multiselect which only allows you to select one item for an event"""
signals = ["_selected", "selected"] # the first is internal
slots = ["set_options", "set_selection", "add", "clear", "select"]
def __init__(self, **kwargs):
self.kwargs = kwargs
super().__init__()
def _setup(self):
self.panel = pn.widgets.MultiSelect(**self.kwargs)
self._register(self.panel, "_selected", "value")
self._register(None, "selected")
self.connect("_selected", self.select_one)
def _signal(self, *args, **kwargs):
super()._signal(*args, **kwargs)
def select_one(self, *_):
with self.ignore_events():
val = [self.panel.value[-1]] if self.panel.value else []
self.panel.value = val
self._emit("selected", self.panel.value)
def set_options(self, options):
self.panel.options = options
def clear(self):
self.panel.options = []
@property
def value(self):
return self.panel.value
def set_selection(self, selection):
self.panel.value = [selection]
class FileSelector(SigSlot):
"""Panel-based graphical file selector widget
Instances of this widget are interactive and can be displayed in jupyter by having
them as the output of a cell, or in a separate browser tab using ``.show()``.
"""
signals = [
"protocol_changed",
"selection_changed",
"directory_entered",
"home_clicked",
"up_clicked",
"go_clicked",
"filters_changed",
]
slots = ["set_filters", "go_home"]
def __init__(self, url=None, filters=None, ignore=None, kwargs=None):
"""
Parameters
----------
url : str (optional)
Initial value of the URL to populate the dialog; should include protocol
filters : list(str) (optional)
File endings to include in the listings. If not included, all files are
allowed. Does not affect directories.
If given, the endings will appear as checkboxes in the interface
ignore : list(str) (optional)
Regex(s) of file basename patterns to ignore, e.g., "\\." for typical
hidden files on posix
kwargs : dict (optional)
To pass to file system instance
"""
if url:
self.init_protocol, url = split_protocol(url)
else:
self.init_protocol, url = "file", os.getcwd()
self.init_url = url
self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)) or "{}"
self.filters = filters
self.ignore = [re.compile(i) for i in ignore or []]
self._fs = None
super().__init__()
def _setup(self):
self.url = pn.widgets.TextInput(
name="url",
value=self.init_url,
align="end",
sizing_mode="stretch_width",
width_policy="max",
)
self.protocol = pn.widgets.Select(
options=sorted(known_implementations),
value=self.init_protocol,
name="protocol",
align="center",
)
self.kwargs = pn.widgets.TextInput(
name="kwargs", value=self.init_kwargs, align="center"
)
self.go = pn.widgets.Button(name="", align="end", width=45)
self.main = SingleSelect(size=10)
self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end")
self.up = pn.widgets.Button(name="", width=30, height=30, align="end")
self._register(self.protocol, "protocol_changed", auto=True)
self._register(self.go, "go_clicked", "clicks", auto=True)
self._register(self.up, "up_clicked", "clicks", auto=True)
self._register(self.home, "home_clicked", "clicks", auto=True)
self._register(None, "selection_changed")
self.main.connect("selected", self.selection_changed)
self._register(None, "directory_entered")
self.prev_protocol = self.protocol.value
self.prev_kwargs = self.storage_options
self.filter_sel = pn.widgets.CheckBoxGroup(
value=[], options=[], inline=False, align="end", width_policy="min"
)
self._register(self.filter_sel, "filters_changed", auto=True)
self.panel = pn.Column(
pn.Row(self.protocol, self.kwargs),
pn.Row(self.home, self.up, self.url, self.go, self.filter_sel),
self.main.panel,
)
self.set_filters(self.filters)
self.go_clicked()
def set_filters(self, filters=None):
self.filters = filters
if filters:
self.filter_sel.options = filters
self.filter_sel.value = filters
else:
self.filter_sel.options = []
self.filter_sel.value = []
@property
def storage_options(self):
"""Value of the kwargs box as a dictionary"""
return ast.literal_eval(self.kwargs.value) or {}
@property
def fs(self):
"""Current filesystem instance"""
if self._fs is None:
cls = get_filesystem_class(self.protocol.value)
self._fs = cls(**self.storage_options)
return self._fs
@property
def urlpath(self):
"""URL of currently selected item"""
return (
(f"{self.protocol.value}://{self.main.value[0]}")
if self.main.value
else None
)
def open_file(self, mode="rb", compression=None, encoding=None):
"""Create OpenFile instance for the currently selected item
For example, in a notebook you might do something like
.. code-block::
[ ]: sel = FileSelector(); sel
# user selects their file
[ ]: with sel.open_file('rb') as f:
... out = f.read()
Parameters
----------
mode: str (optional)
Open mode for the file.
compression: str (optional)
Whether to treat the file as compressed. Set to 'infer' to guess
compression from the file ending
encoding: str (optional)
If using text mode, use this encoding; defaults to UTF8.
"""
if self.urlpath is None:
raise ValueError("No file selected")
return OpenFile(self.fs, self.urlpath, mode, compression, encoding)
def filters_changed(self, values):
self.filters = values
self.go_clicked()
def selection_changed(self, *_):
if self.urlpath is None:
return
if self.fs.isdir(self.urlpath):
self.url.value = self.fs._strip_protocol(self.urlpath)
self.go_clicked()
def go_clicked(self, *_):
if (
self.prev_protocol != self.protocol.value
or self.prev_kwargs != self.storage_options
):
self._fs = None # causes fs to be recreated
self.prev_protocol = self.protocol.value
self.prev_kwargs = self.storage_options
listing = sorted(
self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"]
)
listing = [
l
for l in listing
if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore)
]
folders = {
"📁 " + o["name"].rsplit("/", 1)[-1]: o["name"]
for o in listing
if o["type"] == "directory"
}
files = {
"📄 " + o["name"].rsplit("/", 1)[-1]: o["name"]
for o in listing
if o["type"] == "file"
}
if self.filters:
files = {
k: v
for k, v in files.items()
if any(v.endswith(ext) for ext in self.filters)
}
self.main.set_options(dict(**folders, **files))
def protocol_changed(self, *_):
self._fs = None
self.main.options = []
self.url.value = ""
def home_clicked(self, *_):
self.protocol.value = self.init_protocol
self.kwargs.value = self.init_kwargs
self.url.value = self.init_url
self.go_clicked()
def up_clicked(self, *_):
self.url.value = self.fs._parent(self.url.value)
self.go_clicked()
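Usage sketch (assuming a Jupyter session with panel installed; the chosen file is whatever the user picks in the widget):

from fsspec.gui import FileSelector

sel = FileSelector()   # defaults to the local filesystem and os.getcwd()
sel                    # displaying the object in a notebook cell renders the widget
# ... after the user picks a file in the interface ...
with sel.open_file("rb") as f:
    head = f.read(100)
print(sel.urlpath, len(head))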

View File

@@ -0,0 +1,307 @@
import errno
import io
import os
import secrets
import shutil
from contextlib import suppress
from functools import cached_property, wraps
from urllib.parse import parse_qs
from fsspec.spec import AbstractFileSystem
from fsspec.utils import (
get_package_version_without_import,
infer_storage_options,
mirror_from,
tokenize,
)
def wrap_exceptions(func):
@wraps(func)
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except OSError as exception:
if not exception.args:
raise
message, *args = exception.args
if isinstance(message, str) and "does not exist" in message:
raise FileNotFoundError(errno.ENOENT, message) from exception
else:
raise
return wrapper
PYARROW_VERSION = None
class ArrowFSWrapper(AbstractFileSystem):
"""FSSpec-compatible wrapper of pyarrow.fs.FileSystem.
Parameters
----------
fs : pyarrow.fs.FileSystem
"""
root_marker = "/"
def __init__(self, fs, **kwargs):
global PYARROW_VERSION
PYARROW_VERSION = get_package_version_without_import("pyarrow")
self.fs = fs
super().__init__(**kwargs)
@property
def protocol(self):
return self.fs.type_name
@cached_property
def fsid(self):
return "hdfs_" + tokenize(self.fs.host, self.fs.port)
@classmethod
def _strip_protocol(cls, path):
ops = infer_storage_options(path)
path = ops["path"]
if path.startswith("//"):
# special case for "hdfs://path" (without the triple slash)
path = path[1:]
return path
def ls(self, path, detail=False, **kwargs):
path = self._strip_protocol(path)
from pyarrow.fs import FileSelector
try:
entries = [
self._make_entry(entry)
for entry in self.fs.get_file_info(FileSelector(path))
]
except (FileNotFoundError, NotADirectoryError):
entries = [self.info(path, **kwargs)]
if detail:
return entries
else:
return [entry["name"] for entry in entries]
def info(self, path, **kwargs):
path = self._strip_protocol(path)
[info] = self.fs.get_file_info([path])
return self._make_entry(info)
def exists(self, path):
path = self._strip_protocol(path)
try:
self.info(path)
except FileNotFoundError:
return False
else:
return True
def _make_entry(self, info):
from pyarrow.fs import FileType
if info.type is FileType.Directory:
kind = "directory"
elif info.type is FileType.File:
kind = "file"
elif info.type is FileType.NotFound:
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
else:
kind = "other"
return {
"name": info.path,
"size": info.size,
"type": kind,
"mtime": info.mtime,
}
@wrap_exceptions
def cp_file(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1).rstrip("/")
path2 = self._strip_protocol(path2).rstrip("/")
with self._open(path1, "rb") as lstream:
tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
try:
with self.open(tmp_fname, "wb") as rstream:
shutil.copyfileobj(lstream, rstream)
self.fs.move(tmp_fname, path2)
except BaseException:
with suppress(FileNotFoundError):
self.fs.delete_file(tmp_fname)
raise
@wrap_exceptions
def mv(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1).rstrip("/")
path2 = self._strip_protocol(path2).rstrip("/")
self.fs.move(path1, path2)
@wrap_exceptions
def rm_file(self, path):
path = self._strip_protocol(path)
self.fs.delete_file(path)
@wrap_exceptions
def rm(self, path, recursive=False, maxdepth=None):
path = self._strip_protocol(path).rstrip("/")
if self.isdir(path):
if recursive:
self.fs.delete_dir(path)
else:
raise ValueError("Can't delete directories without recursive=False")
else:
self.fs.delete_file(path)
@wrap_exceptions
def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
if mode == "rb":
if seekable:
method = self.fs.open_input_file
else:
method = self.fs.open_input_stream
elif mode == "wb":
method = self.fs.open_output_stream
elif mode == "ab":
method = self.fs.open_append_stream
else:
raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")
_kwargs = {}
if mode != "rb" or not seekable:
if int(PYARROW_VERSION.split(".")[0]) >= 4:
# disable compression auto-detection
_kwargs["compression"] = None
stream = method(path, **_kwargs)
return ArrowFile(self, stream, path, mode, block_size, **kwargs)
@wrap_exceptions
def mkdir(self, path, create_parents=True, **kwargs):
path = self._strip_protocol(path)
if create_parents:
self.makedirs(path, exist_ok=True)
else:
self.fs.create_dir(path, recursive=False)
@wrap_exceptions
def makedirs(self, path, exist_ok=False):
path = self._strip_protocol(path)
self.fs.create_dir(path, recursive=True)
@wrap_exceptions
def rmdir(self, path):
path = self._strip_protocol(path)
self.fs.delete_dir(path)
@wrap_exceptions
def modified(self, path):
path = self._strip_protocol(path)
return self.fs.get_file_info(path).mtime
def cat_file(self, path, start=None, end=None, **kwargs):
kwargs["seekable"] = start not in [None, 0]
return super().cat_file(path, start=None, end=None, **kwargs)
def get_file(self, rpath, lpath, **kwargs):
kwargs["seekable"] = False
super().get_file(rpath, lpath, **kwargs)
@mirror_from(
"stream",
[
"read",
"seek",
"tell",
"write",
"readable",
"writable",
"close",
"size",
"seekable",
],
)
class ArrowFile(io.IOBase):
def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
self.path = path
self.mode = mode
self.fs = fs
self.stream = stream
self.blocksize = self.block_size = block_size
self.kwargs = kwargs
def __enter__(self):
return self
def __exit__(self, *args):
return self.close()
class HadoopFileSystem(ArrowFSWrapper):
"""A wrapper on top of the pyarrow.fs.HadoopFileSystem
to connect its interface with fsspec"""
protocol = "hdfs"
def __init__(
self,
host="default",
port=0,
user=None,
kerb_ticket=None,
replication=3,
extra_conf=None,
**kwargs,
):
"""
Parameters
----------
host: str
Hostname, IP or "default" to try to read from Hadoop config
port: int
Port to connect on, or default from Hadoop config if 0
user: str or None
If given, connect as this username
kerb_ticket: str or None
If given, use this ticket for authentication
replication: int
set replication factor of file for write operations. default value is 3.
extra_conf: None or dict
Passed on to HadoopFileSystem
"""
from pyarrow.fs import HadoopFileSystem
fs = HadoopFileSystem(
host=host,
port=port,
user=user,
kerb_ticket=kerb_ticket,
replication=replication,
extra_conf=extra_conf,
)
super().__init__(fs=fs, **kwargs)
@staticmethod
def _get_kwargs_from_urls(path):
ops = infer_storage_options(path)
out = {}
if ops.get("host", None):
out["host"] = ops["host"]
if ops.get("username", None):
out["user"] = ops["username"]
if ops.get("port", None):
out["port"] = ops["port"]
if ops.get("url_query", None):
queries = parse_qs(ops["url_query"])
if queries.get("replication", None):
out["replication"] = int(queries["replication"][0])
return out
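Usage sketch (assuming pyarrow is installed; module path assumed to be fsspec.implementations.arrow): wrapping a pyarrow LocalFileSystem gives it the fsspec interface, and HadoopFileSystem above works the same way once a namenode is reachable.

import os
import tempfile

from pyarrow.fs import LocalFileSystem

from fsspec.implementations.arrow import ArrowFSWrapper

fs = ArrowFSWrapper(LocalFileSystem())
d = tempfile.mkdtemp()
with fs.open(os.path.join(d, "demo.txt"), "wb") as f:
    f.write(b"hello")
print(fs.ls(d, detail=False))
print(fs.cat_file(os.path.join(d, "demo.txt")))  # b'hello'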

View File

@@ -0,0 +1,122 @@
import asyncio
import functools
import inspect
import fsspec
from fsspec.asyn import AsyncFileSystem, running_async
def async_wrapper(func, obj=None, semaphore=None):
"""
Wraps a synchronous function to make it awaitable.
Parameters
----------
func : callable
The synchronous function to wrap.
obj : object, optional
The instance to bind the function to, if applicable.
semaphore : asyncio.Semaphore, optional
A semaphore to limit concurrent calls.
Returns
-------
coroutine
An awaitable version of the function.
"""
@functools.wraps(func)
async def wrapper(*args, **kwargs):
if semaphore:
async with semaphore:
return await asyncio.to_thread(func, *args, **kwargs)
return await asyncio.to_thread(func, *args, **kwargs)
return wrapper
class AsyncFileSystemWrapper(AsyncFileSystem):
"""
A wrapper class to convert a synchronous filesystem into an asynchronous one.
This class takes an existing synchronous filesystem implementation and wraps all
its methods to provide an asynchronous interface.
Parameters
----------
sync_fs : AbstractFileSystem
The synchronous filesystem instance to wrap.
"""
protocol = "asyncwrapper", "async_wrapper"
cachable = False
def __init__(
self,
fs=None,
asynchronous=None,
target_protocol=None,
target_options=None,
semaphore=None,
max_concurrent_tasks=None,
**kwargs,
):
if asynchronous is None:
asynchronous = running_async()
super().__init__(asynchronous=asynchronous, **kwargs)
if fs is not None:
self.sync_fs = fs
else:
self.sync_fs = fsspec.filesystem(target_protocol, **target_options)
self.protocol = self.sync_fs.protocol
self.semaphore = semaphore
self._wrap_all_sync_methods()
@property
def fsid(self):
return f"async_{self.sync_fs.fsid}"
def _wrap_all_sync_methods(self):
"""
Wrap all synchronous methods of the underlying filesystem with asynchronous versions.
"""
excluded_methods = {"open"}
for method_name in dir(self.sync_fs):
if method_name.startswith("_") or method_name in excluded_methods:
continue
attr = inspect.getattr_static(self.sync_fs, method_name)
if isinstance(attr, property):
continue
method = getattr(self.sync_fs, method_name)
if callable(method) and not inspect.iscoroutinefunction(method):
async_method = async_wrapper(method, obj=self, semaphore=self.semaphore)
setattr(self, f"_{method_name}", async_method)
@classmethod
def wrap_class(cls, sync_fs_class):
"""
Create a new class that can be used to instantiate an AsyncFileSystemWrapper
with lazy instantiation of the underlying synchronous filesystem.
Parameters
----------
sync_fs_class : type
The class of the synchronous filesystem to wrap.
Returns
-------
type
A new class that wraps the provided synchronous filesystem class.
"""
class GeneratedAsyncFileSystemWrapper(cls):
def __init__(self, *args, **kwargs):
sync_fs = sync_fs_class(*args, **kwargs)
super().__init__(sync_fs)
GeneratedAsyncFileSystemWrapper.__name__ = (
f"Async{sync_fs_class.__name__}Wrapper"
)
return GeneratedAsyncFileSystemWrapper
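Usage sketch: wrapping the synchronous memory filesystem so that its methods can be awaited (module path assumed to be fsspec.implementations.asyn_wrapper).

import asyncio

import fsspec
from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper  # path assumed

async def main():
    afs = AsyncFileSystemWrapper(fsspec.filesystem("memory"), asynchronous=True)
    await afs._pipe_file("/demo.txt", b"payload")   # sync pipe_file run in a thread
    print(await afs._cat_file("/demo.txt"))         # b'payload'

asyncio.run(main())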

View File

@@ -0,0 +1,75 @@
from __future__ import annotations
import abc
import hashlib
from fsspec.implementations.local import make_path_posix
class AbstractCacheMapper(abc.ABC):
"""Abstract super-class for mappers from remote URLs to local cached
basenames.
"""
@abc.abstractmethod
def __call__(self, path: str) -> str: ...
def __eq__(self, other: object) -> bool:
# Identity only depends on class. When derived classes have attributes
# they will need to be included.
return isinstance(other, type(self))
def __hash__(self) -> int:
# Identity only depends on class. When derived classes have attributes
# they will need to be included.
return hash(type(self))
class BasenameCacheMapper(AbstractCacheMapper):
"""Cache mapper that uses the basename of the remote URL and a fixed number
of directory levels above this.
The default is zero directory levels, meaning different paths with the same
basename will have the same cached basename.
"""
def __init__(self, directory_levels: int = 0):
if directory_levels < 0:
raise ValueError(
"BasenameCacheMapper requires zero or positive directory_levels"
)
self.directory_levels = directory_levels
# Separator for directories when encoded as strings.
self._separator = "_@_"
def __call__(self, path: str) -> str:
path = make_path_posix(path)
prefix, *bits = path.rsplit("/", self.directory_levels + 1)
if bits:
return self._separator.join(bits)
else:
return prefix # No separator found, simple filename
def __eq__(self, other: object) -> bool:
return super().__eq__(other) and self.directory_levels == other.directory_levels
def __hash__(self) -> int:
return super().__hash__() ^ hash(self.directory_levels)
class HashCacheMapper(AbstractCacheMapper):
"""Cache mapper that uses a hash of the remote URL."""
def __call__(self, path: str) -> str:
return hashlib.sha256(path.encode()).hexdigest()
def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
"""Factory method to create cache mapper for backward compatibility with
``CachingFileSystem`` constructor using ``same_names`` kwarg.
"""
if same_names:
return BasenameCacheMapper()
else:
return HashCacheMapper()
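Usage sketch showing how the mappers turn a remote URL into a local cache basename (module path assumed to be fsspec.implementations.cache_mapper):

from fsspec.implementations.cache_mapper import BasenameCacheMapper, create_cache_mapper

hashed = create_cache_mapper(same_names=False)
named = create_cache_mapper(same_names=True)
print(hashed("s3://bucket/dir/data.csv"))                  # 64-character sha256 hex digest
print(named("s3://bucket/dir/data.csv"))                   # "data.csv"
print(BasenameCacheMapper(1)("s3://bucket/dir/data.csv"))  # "dir_@_data.csv"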

View File

@@ -0,0 +1,233 @@
from __future__ import annotations
import os
import pickle
import time
from typing import TYPE_CHECKING
from fsspec.utils import atomic_write
try:
import ujson as json
except ImportError:
if not TYPE_CHECKING:
import json
if TYPE_CHECKING:
from collections.abc import Iterator
from typing import Any, Literal
from typing_extensions import TypeAlias
from .cached import CachingFileSystem
Detail: TypeAlias = dict[str, Any]
class CacheMetadata:
"""Cache metadata.
All reading and writing of cache metadata is performed by this class,
accessing the cached files and blocks is not.
Metadata is stored in a single file per storage directory in JSON format.
For backward compatibility, also reads metadata stored in pickle format
which is converted to JSON when next saved.
"""
def __init__(self, storage: list[str]):
"""
Parameters
----------
storage: list[str]
Directories containing cached files; there must be at least one. Metadata
is stored in the last of these directories by convention.
"""
if not storage:
raise ValueError("CacheMetadata expects at least one storage location")
self._storage = storage
self.cached_files: list[Detail] = [{}]
# Private attribute to force saving of metadata in pickle format rather than
# JSON for use in tests to confirm can read both pickle and JSON formats.
self._force_save_pickle = False
def _load(self, fn: str) -> Detail:
"""Low-level function to load metadata from specific file"""
try:
with open(fn, "r") as f:
loaded = json.load(f)
except ValueError:
with open(fn, "rb") as f:
loaded = pickle.load(f)
for c in loaded.values():
if isinstance(c.get("blocks"), list):
c["blocks"] = set(c["blocks"])
return loaded
def _save(self, metadata_to_save: Detail, fn: str) -> None:
"""Low-level function to save metadata to specific file"""
if self._force_save_pickle:
with atomic_write(fn) as f:
pickle.dump(metadata_to_save, f)
else:
with atomic_write(fn, mode="w") as f:
json.dump(metadata_to_save, f)
def _scan_locations(
self, writable_only: bool = False
) -> Iterator[tuple[str, str, bool]]:
"""Yield locations (filenames) where metadata is stored, and whether
writable or not.
Parameters
----------
writable_only: bool
Set to True to only yield writable locations.
Returns
-------
Yields (str, str, bool)
"""
n = len(self._storage)
for i, storage in enumerate(self._storage):
writable = i == n - 1
if writable_only and not writable:
continue
yield os.path.join(storage, "cache"), storage, writable
def check_file(
self, path: str, cfs: CachingFileSystem | None
) -> Literal[False] | tuple[Detail, str]:
"""If path is in cache return its details, otherwise return ``False``.
If the optional CachingFileSystem is specified then it is used to
perform extra checks to reject possible matches, such as if they are
too old.
"""
for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
if path not in cache:
continue
detail = cache[path].copy()
if cfs is not None:
if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
# Wrong file as determined by hash of file properties
continue
if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
# Cached file has expired
continue
fn = os.path.join(base, detail["fn"])
if os.path.exists(fn):
return detail, fn
return False
def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
"""Remove expired metadata from the cache.
Returns names of files corresponding to expired metadata and a boolean
flag indicating whether the writable cache is empty. Caller is
responsible for deleting the expired files.
"""
expired_files = []
for path, detail in self.cached_files[-1].copy().items():
if time.time() - detail["time"] > expiry_time:
fn = detail.get("fn", "")
if not fn:
raise RuntimeError(
f"Cache metadata does not contain 'fn' for {path}"
)
fn = os.path.join(self._storage[-1], fn)
expired_files.append(fn)
self.cached_files[-1].pop(path)
if self.cached_files[-1]:
cache_path = os.path.join(self._storage[-1], "cache")
self._save(self.cached_files[-1], cache_path)
writable_cache_empty = not self.cached_files[-1]
return expired_files, writable_cache_empty
def load(self) -> None:
"""Load all metadata from disk and store in ``self.cached_files``"""
cached_files = []
for fn, _, _ in self._scan_locations():
if os.path.exists(fn):
# TODO: consolidate blocks here
cached_files.append(self._load(fn))
else:
cached_files.append({})
self.cached_files = cached_files or [{}]
def on_close_cached_file(self, f: Any, path: str) -> None:
"""Perform side-effect actions on closing a cached file.
The actual closing of the file is the responsibility of the caller.
"""
# File must be writable, so in self.cached_files[-1]
c = self.cached_files[-1][path]
if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
c["blocks"] = True
def pop_file(self, path: str) -> str | None:
"""Remove metadata of cached file.
If path is in the cache, return the filename of the cached file,
otherwise return ``None``. Caller is responsible for deleting the
cached file.
"""
details = self.check_file(path, None)
if not details:
return None
_, fn = details
if fn.startswith(self._storage[-1]):
self.cached_files[-1].pop(path)
self.save()
else:
raise PermissionError(
"Can only delete cached file in last, writable cache location"
)
return fn
def save(self) -> None:
"""Save metadata to disk"""
for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
if not writable:
continue
if os.path.exists(fn):
cached_files = self._load(fn)
for k, c in cached_files.items():
if k in cache:
if c["blocks"] is True or cache[k]["blocks"] is True:
c["blocks"] = True
else:
# self.cached_files[*][*]["blocks"] must continue to
# point to the same set object so that updates
# performed by MMapCache are propagated back to
# self.cached_files.
blocks = cache[k]["blocks"]
blocks.update(c["blocks"])
c["blocks"] = blocks
c["time"] = max(c["time"], cache[k]["time"])
c["uid"] = cache[k]["uid"]
# Files can be added to cache after it was written once
for k, c in cache.items():
if k not in cached_files:
cached_files[k] = c
else:
cached_files = cache
cache = {k: v.copy() for k, v in cached_files.items()}
for c in cache.values():
if isinstance(c["blocks"], set):
c["blocks"] = list(c["blocks"])
self._save(cache, fn)
self.cached_files[-1] = cached_files
def update_file(self, path: str, detail: Detail) -> None:
"""Update metadata for specific file in memory, do not save"""
self.cached_files[-1][path] = detail
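Internal usage sketch (CachingFileSystem normally drives this class; the URL and cached basename below are hypothetical, and the module path is assumed to be fsspec.implementations.cache_metadata):

import os
import tempfile

from fsspec.implementations.cache_metadata import CacheMetadata

storage = tempfile.mkdtemp()
md = CacheMetadata([storage])
md.update_file(
    "s3://bucket/key.bin",
    {"fn": "abc123", "blocks": True, "time": 0, "uid": "dummy-uid"},
)
md.save()                                             # written as JSON to <storage>/cache
open(os.path.join(storage, "abc123"), "wb").close()   # pretend the cached data file exists
detail, fn = md.check_file("s3://bucket/key.bin", None)
print(fn)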

View File

@@ -0,0 +1,23 @@
from typing import ClassVar
from fsspec import AbstractFileSystem
__all__ = ("ChainedFileSystem",)
class ChainedFileSystem(AbstractFileSystem):
"""Chained filesystem base class.
A chained filesystem is designed to be layered over another FS.
This is useful to implement things like caching.
This base class does very little on its own, but is used as a marker
that the class is designed for chaining.
Right now this is only used in `url_to_fs` to provide the path argument
(`fo`) to the chained filesystem from the underlying filesystem.
Additional functionality may be added in the future.
"""
protocol: ClassVar[str] = "chained"

View File

@@ -0,0 +1,152 @@
import dask
from distributed.client import Client, _get_global_client
from distributed.worker import Worker
from fsspec import filesystem
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
from fsspec.utils import infer_storage_options
def _get_client(client):
if client is None:
return _get_global_client()
elif isinstance(client, Client):
return client
else:
# e.g., connection string
return Client(client)
def _in_worker():
return bool(Worker._instances)
class DaskWorkerFileSystem(AbstractFileSystem):
"""View files accessible to a worker as any other remote file-system
When instances are run on the worker, uses the real filesystem. When
run on the client, they call the worker to provide information or data.
**Warning** this implementation is experimental, and read-only for now.
"""
def __init__(
self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
):
super().__init__(**kwargs)
if not (fs is None) ^ (target_protocol is None):
raise ValueError(
"Please provide one of filesystem instance (fs) or"
" target_protocol, not both"
)
self.target_protocol = target_protocol
self.target_options = target_options
self.worker = None
self.client = client
self.fs = fs
self._determine_worker()
@staticmethod
def _get_kwargs_from_urls(path):
so = infer_storage_options(path)
if "host" in so and "port" in so:
return {"client": f"{so['host']}:{so['port']}"}
else:
return {}
def _determine_worker(self):
if _in_worker():
self.worker = True
if self.fs is None:
self.fs = filesystem(
self.target_protocol, **(self.target_options or {})
)
else:
self.worker = False
self.client = _get_client(self.client)
self.rfs = dask.delayed(self)
def mkdir(self, *args, **kwargs):
if self.worker:
self.fs.mkdir(*args, **kwargs)
else:
self.rfs.mkdir(*args, **kwargs).compute()
def rm(self, *args, **kwargs):
if self.worker:
self.fs.rm(*args, **kwargs)
else:
self.rfs.rm(*args, **kwargs).compute()
def copy(self, *args, **kwargs):
if self.worker:
self.fs.copy(*args, **kwargs)
else:
self.rfs.copy(*args, **kwargs).compute()
def mv(self, *args, **kwargs):
if self.worker:
self.fs.mv(*args, **kwargs)
else:
self.rfs.mv(*args, **kwargs).compute()
def ls(self, *args, **kwargs):
if self.worker:
return self.fs.ls(*args, **kwargs)
else:
return self.rfs.ls(*args, **kwargs).compute()
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
if self.worker:
return self.fs._open(
path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_options=cache_options,
**kwargs,
)
else:
return DaskFile(
fs=self,
path=path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_options=cache_options,
**kwargs,
)
def fetch_range(self, path, mode, start, end):
if self.worker:
with self._open(path, mode) as f:
f.seek(start)
return f.read(end - start)
else:
return self.rfs.fetch_range(path, mode, start, end).compute()
class DaskFile(AbstractBufferedFile):
def __init__(self, mode="rb", **kwargs):
if mode != "rb":
raise ValueError('Remote dask files can only be opened in "rb" mode')
super().__init__(**kwargs)
def _upload_chunk(self, final=False):
pass
def _initiate_upload(self):
"""Create remote file/upload"""
pass
def _fetch_range(self, start, end):
"""Get the specified set of bytes from remote"""
return self.fs.fetch_range(self.path, self.mode, start, end)
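Usage sketch (assuming a running distributed cluster; the scheduler address and paths are hypothetical): on the client, calls are forwarded to a worker that can see the files locally.

import fsspec

fs = fsspec.filesystem(
    "dask",
    target_protocol="file",
    client="tcp://scheduler.example.com:8786",
)
print(fs.ls("/data/shared"))
with fs.open("/data/shared/part-0.csv", "rb") as f:
    print(f.read(64))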

View File

@@ -0,0 +1,58 @@
import base64
import io
from typing import Optional
from urllib.parse import unquote
from fsspec import AbstractFileSystem
class DataFileSystem(AbstractFileSystem):
"""A handy decoder for data-URLs
Example
-------
>>> with fsspec.open("data:,Hello%2C%20World%21") as f:
... print(f.read())
b"Hello, World!"
See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
"""
protocol = "data"
def __init__(self, **kwargs):
"""No parameters for this filesystem"""
super().__init__(**kwargs)
def cat_file(self, path, start=None, end=None, **kwargs):
pref, data = path.split(",", 1)
if pref.endswith("base64"):
return base64.b64decode(data)[start:end]
return unquote(data).encode()[start:end]
def info(self, path, **kwargs):
pref, name = path.split(",", 1)
data = self.cat_file(path)
mime = pref.split(":", 1)[1].split(";", 1)[0]
return {"name": name, "size": len(data), "type": "file", "mimetype": mime}
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
if "r" not in mode:
raise ValueError("Read only filesystem")
return io.BytesIO(self.cat_file(path))
@staticmethod
def encode(data: bytes, mime: Optional[str] = None):
"""Format the given data into data-URL syntax
This version always base64 encodes, even when the data is ascii/url-safe.
"""
return f"data:{mime or ''};base64,{base64.b64encode(data).decode()}"

View File

@@ -0,0 +1,496 @@
from __future__ import annotations
import base64
import urllib
import requests
from requests.adapters import HTTPAdapter, Retry
from typing_extensions import override
from fsspec import AbstractFileSystem
from fsspec.spec import AbstractBufferedFile
class DatabricksException(Exception):
"""
Helper class for exceptions raised in this module.
"""
def __init__(self, error_code, message, details=None):
"""Create a new DatabricksException"""
super().__init__(message)
self.error_code = error_code
self.message = message
self.details = details
class DatabricksFileSystem(AbstractFileSystem):
"""
Get access to the Databricks filesystem implementation over HTTP.
Can be used inside and outside of a databricks cluster.
"""
def __init__(self, instance, token, **kwargs):
"""
Create a new DatabricksFileSystem.
Parameters
----------
instance: str
The instance URL of the databricks cluster.
For example for an Azure databricks cluster, this
has the form adb-<some-number>.<two digits>.azuredatabricks.net.
token: str
Your personal token. Find out more
here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
"""
self.instance = instance
self.token = token
self.session = requests.Session()
self.retries = Retry(
total=10,
backoff_factor=0.05,
status_forcelist=[408, 429, 500, 502, 503, 504],
)
self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
self.session.headers.update({"Authorization": f"Bearer {self.token}"})
super().__init__(**kwargs)
@override
def _ls_from_cache(self, path) -> list[dict[str, str | int]] | None:
"""Check cache for listing
Returns listing, if found (may be empty list for a directory that
exists but contains nothing), None if not in cache.
"""
self.dircache.pop(path.rstrip("/"), None)
parent = self._parent(path)
if parent in self.dircache:
for entry in self.dircache[parent]:
if entry["name"] == path.rstrip("/"):
if entry["type"] != "directory":
return [entry]
return []
raise FileNotFoundError(path)
def ls(self, path, detail=True, **kwargs):
"""
List the contents of the given path.
Parameters
----------
path: str
Absolute path
detail: bool
Return not only the list of filenames,
but also additional information on file sizes
and types.
"""
try:
out = self._ls_from_cache(path)
except FileNotFoundError:
# This happens if the `path`'s parent was cached, but `path` is not
# there. This suggests that `path` is new since the parent was
# cached. Attempt to invalidate parent's cache before continuing.
self.dircache.pop(self._parent(path), None)
out = None
if not out:
try:
r = self._send_to_api(
method="get", endpoint="list", json={"path": path}
)
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
raise
files = r.get("files", [])
out = [
{
"name": o["path"],
"type": "directory" if o["is_dir"] else "file",
"size": o["file_size"],
}
for o in files
]
self.dircache[path] = out
if detail:
return out
return [o["name"] for o in out]
def makedirs(self, path, exist_ok=True):
"""
Create a given absolute path and all of its parents.
Parameters
----------
path: str
Absolute path to create
exist_ok: bool
If False, check whether the folder already exists before creating it,
and raise a FileExistsError if it does.
"""
if not exist_ok:
try:
# If the following succeeds, the path is already present
self._send_to_api(
method="get", endpoint="get-status", json={"path": path}
)
raise FileExistsError(f"Path {path} already exists")
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
pass
try:
self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
except DatabricksException as e:
if e.error_code == "RESOURCE_ALREADY_EXISTS":
raise FileExistsError(e.message) from e
raise
self.invalidate_cache(self._parent(path))
def mkdir(self, path, create_parents=True, **kwargs):
"""
Create a given absolute path and all of its parents.
Parameters
----------
path: str
Absolute path to create
create_parents: bool
Whether to create all parents or not.
"False" is not implemented so far.
"""
if not create_parents:
raise NotImplementedError
self.mkdirs(path, **kwargs)
def rm(self, path, recursive=False, **kwargs):
"""
Remove the file or folder at the given absolute path.
Parameters
----------
path: str
Absolute path to remove
recursive: bool
Recursively delete all files in a folder.
"""
try:
self._send_to_api(
method="post",
endpoint="delete",
json={"path": path, "recursive": recursive},
)
except DatabricksException as e:
# This is not really an exception, it just means
# not everything was deleted so far
if e.error_code == "PARTIAL_DELETE":
self.rm(path=path, recursive=recursive)
elif e.error_code == "IO_ERROR":
# Using the same exception as the os module would use here
raise OSError(e.message) from e
raise
self.invalidate_cache(self._parent(path))
def mv(
self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
):
"""
Move a source to a destination path.
A note from the original [databricks API manual]
(https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).
When moving a large number of files the API call will time out after
approximately 60s, potentially resulting in partially moved data.
Therefore, for operations that move more than 10k files, we strongly
discourage using the DBFS REST API.
Parameters
----------
source_path: str
From where to move (absolute path)
destination_path: str
To where to move (absolute path)
recursive: bool
Not implemented so far.
maxdepth:
Not implemented so far.
"""
if recursive:
raise NotImplementedError
if maxdepth:
raise NotImplementedError
try:
self._send_to_api(
method="post",
endpoint="move",
json={"source_path": source_path, "destination_path": destination_path},
)
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
elif e.error_code == "RESOURCE_ALREADY_EXISTS":
raise FileExistsError(e.message) from e
raise
self.invalidate_cache(self._parent(source_path))
self.invalidate_cache(self._parent(destination_path))
def _open(self, path, mode="rb", block_size="default", **kwargs):
"""
Overwrite the base class method to make sure to create a DBFile.
All arguments are copied from the base method.
Only the default blocksize is allowed.
"""
return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)
def _send_to_api(self, method, endpoint, json):
"""
Send the given json to the DBFS API
using a get or post request (specified by the argument `method`).
Parameters
----------
method: str
Which http method to use for communication; "get" or "post".
endpoint: str
Where to send the request to (last part of the API URL)
json: dict
Dictionary of information to send
"""
if method == "post":
session_call = self.session.post
elif method == "get":
session_call = self.session.get
else:
raise ValueError(f"Do not understand method {method}")
url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)
r = session_call(url, json=json)
# The DBFS API will return a json, also in case of an exception.
# We want to preserve this information as well as possible.
try:
r.raise_for_status()
except requests.HTTPError as e:
# try to extract json error message
# if that fails, fall back to the original exception
try:
exception_json = e.response.json()
except Exception:
raise e from None
raise DatabricksException(**exception_json) from e
return r.json()
def _create_handle(self, path, overwrite=True):
"""
Internal function to create a handle, which can be used to
write blocks of a file to DBFS.
A handle has a unique identifier which needs to be passed
whenever written during this transaction.
The handle is active for 10 minutes - after that a new
write transaction needs to be created.
Make sure to close the handle after you are finished.
Parameters
----------
path: str
Absolute path for this file.
overwrite: bool
If a file already exist at this location, either overwrite
it or raise an exception.
"""
try:
r = self._send_to_api(
method="post",
endpoint="create",
json={"path": path, "overwrite": overwrite},
)
return r["handle"]
except DatabricksException as e:
if e.error_code == "RESOURCE_ALREADY_EXISTS":
raise FileExistsError(e.message) from e
raise
def _close_handle(self, handle):
"""
Close a handle, which was opened by :func:`_create_handle`.
Parameters
----------
handle: str
Which handle to close.
"""
try:
self._send_to_api(method="post", endpoint="close", json={"handle": handle})
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
raise
def _add_data(self, handle, data):
"""
Upload data to an already opened file handle
(opened by :func:`_create_handle`).
The maximum allowed data size is 1MB after
conversion to base64.
Remember to close the handle when you are finished.
Parameters
----------
handle: str
Which handle to upload data to.
data: bytes
Block of data to add to the handle.
"""
data = base64.b64encode(data).decode()
try:
self._send_to_api(
method="post",
endpoint="add-block",
json={"handle": handle, "data": data},
)
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
raise ValueError(e.message) from e
raise
def _get_data(self, path, start, end):
"""
Download a block of data in bytes from the given absolute path, covering
the byte range [start, end).
The maximum number of allowed bytes to read is 1MB.
Parameters
----------
path: str
Absolute path to download data from
start: int
Start position of the block
end: int
End position of the block
"""
try:
r = self._send_to_api(
method="get",
endpoint="read",
json={"path": path, "offset": start, "length": end - start},
)
return base64.b64decode(r["data"])
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
raise ValueError(e.message) from e
raise
def invalidate_cache(self, path=None):
if path is None:
self.dircache.clear()
else:
self.dircache.pop(path, None)
super().invalidate_cache(path)
class DatabricksFile(AbstractBufferedFile):
"""
Helper class for files referenced in the DatabricksFileSystem.
"""
DEFAULT_BLOCK_SIZE = 1 * 2**20 # only allowed block size
def __init__(
self,
fs,
path,
mode="rb",
block_size="default",
autocommit=True,
cache_type="readahead",
cache_options=None,
**kwargs,
):
"""
Create a new instance of the DatabricksFile.
The blocksize needs to be the default one.
"""
if block_size is None or block_size == "default":
block_size = self.DEFAULT_BLOCK_SIZE
assert block_size == self.DEFAULT_BLOCK_SIZE, (
f"Only the default block size is allowed, not {block_size}"
)
super().__init__(
fs,
path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_type=cache_type,
cache_options=cache_options or {},
**kwargs,
)
def _initiate_upload(self):
"""Internal function to start a file upload"""
self.handle = self.fs._create_handle(self.path)
def _upload_chunk(self, final=False):
"""Internal function to add a chunk of data to a started upload"""
self.buffer.seek(0)
data = self.buffer.getvalue()
data_chunks = [
data[start:end] for start, end in self._to_sized_blocks(len(data))
]
for data_chunk in data_chunks:
self.fs._add_data(handle=self.handle, data=data_chunk)
if final:
self.fs._close_handle(handle=self.handle)
return True
def _fetch_range(self, start, end):
"""Internal function to download a block of data"""
return_buffer = b""
length = end - start
for chunk_start, chunk_end in self._to_sized_blocks(length, start):
return_buffer += self.fs._get_data(
path=self.path, start=chunk_start, end=chunk_end
)
return return_buffer
def _to_sized_blocks(self, length, start=0):
"""Helper function to split a range from 0 to total_length into blocksizes"""
end = start + length
for data_chunk in range(start, end, self.blocksize):
data_start = data_chunk
data_end = min(end, data_chunk + self.blocksize)
yield data_start, data_end
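Usage sketch (the workspace instance and token are placeholders): listing and writing through the DBFS REST API.

import fsspec

fs = fsspec.filesystem(
    "dbfs",
    instance="adb-1234567890123456.7.azuredatabricks.net",
    token="dapiXXXXXXXXXXXXXXXX",
)
print(fs.ls("/FileStore"))
with fs.open("/FileStore/fsspec-demo.txt", "wb") as f:
    f.write(b"hello from fsspec")
print(fs.cat_file("/FileStore/fsspec-demo.txt"))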

View File

@@ -0,0 +1,388 @@
from .. import filesystem
from ..asyn import AsyncFileSystem
class DirFileSystem(AsyncFileSystem):
"""Directory prefix filesystem
The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
is relative to the `path`. After performing the necessary paths operation it
delegates everything to the wrapped filesystem.
"""
protocol = "dir"
def __init__(
self,
path=None,
fs=None,
fo=None,
target_protocol=None,
target_options=None,
**storage_options,
):
"""
Parameters
----------
path: str
Path to the directory.
fs: AbstractFileSystem
An instantiated filesystem to wrap.
target_protocol, target_options:
if fs is none, construct it from these
fo: str
Alternate for path; do not provide both
"""
super().__init__(**storage_options)
if fs is None:
fs = filesystem(protocol=target_protocol, **(target_options or {}))
path = path or fo
if self.asynchronous and not fs.async_impl:
raise ValueError("can't use asynchronous with non-async fs")
if fs.async_impl and self.asynchronous != fs.asynchronous:
raise ValueError("both dirfs and fs should be in the same sync/async mode")
self.path = fs._strip_protocol(path)
self.fs = fs
def _join(self, path):
if isinstance(path, str):
if not self.path:
return path
if not path:
return self.path
return self.fs.sep.join((self.path, self._strip_protocol(path)))
if isinstance(path, dict):
return {self._join(_path): value for _path, value in path.items()}
return [self._join(_path) for _path in path]
def _relpath(self, path):
if isinstance(path, str):
if not self.path:
return path
# We need to account for S3FileSystem returning paths that do not
# start with a '/'
if path == self.path or (
self.path.startswith(self.fs.sep) and path == self.path[1:]
):
return ""
prefix = self.path + self.fs.sep
if self.path.startswith(self.fs.sep) and not path.startswith(self.fs.sep):
prefix = prefix[1:]
assert path.startswith(prefix)
return path[len(prefix) :]
return [self._relpath(_path) for _path in path]
# Wrappers below
@property
def sep(self):
return self.fs.sep
async def set_session(self, *args, **kwargs):
return await self.fs.set_session(*args, **kwargs)
async def _rm_file(self, path, **kwargs):
return await self.fs._rm_file(self._join(path), **kwargs)
def rm_file(self, path, **kwargs):
return self.fs.rm_file(self._join(path), **kwargs)
async def _rm(self, path, *args, **kwargs):
return await self.fs._rm(self._join(path), *args, **kwargs)
def rm(self, path, *args, **kwargs):
return self.fs.rm(self._join(path), *args, **kwargs)
async def _cp_file(self, path1, path2, **kwargs):
return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)
def cp_file(self, path1, path2, **kwargs):
return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)
async def _copy(
self,
path1,
path2,
*args,
**kwargs,
):
return await self.fs._copy(
self._join(path1),
self._join(path2),
*args,
**kwargs,
)
def copy(self, path1, path2, *args, **kwargs):
return self.fs.copy(
self._join(path1),
self._join(path2),
*args,
**kwargs,
)
async def _pipe(self, path, *args, **kwargs):
return await self.fs._pipe(self._join(path), *args, **kwargs)
def pipe(self, path, *args, **kwargs):
return self.fs.pipe(self._join(path), *args, **kwargs)
async def _pipe_file(self, path, *args, **kwargs):
return await self.fs._pipe_file(self._join(path), *args, **kwargs)
def pipe_file(self, path, *args, **kwargs):
return self.fs.pipe_file(self._join(path), *args, **kwargs)
async def _cat_file(self, path, *args, **kwargs):
return await self.fs._cat_file(self._join(path), *args, **kwargs)
def cat_file(self, path, *args, **kwargs):
return self.fs.cat_file(self._join(path), *args, **kwargs)
async def _cat(self, path, *args, **kwargs):
ret = await self.fs._cat(
self._join(path),
*args,
**kwargs,
)
if isinstance(ret, dict):
return {self._relpath(key): value for key, value in ret.items()}
return ret
def cat(self, path, *args, **kwargs):
ret = self.fs.cat(
self._join(path),
*args,
**kwargs,
)
if isinstance(ret, dict):
return {self._relpath(key): value for key, value in ret.items()}
return ret
async def _put_file(self, lpath, rpath, **kwargs):
return await self.fs._put_file(lpath, self._join(rpath), **kwargs)
def put_file(self, lpath, rpath, **kwargs):
return self.fs.put_file(lpath, self._join(rpath), **kwargs)
async def _put(
self,
lpath,
rpath,
*args,
**kwargs,
):
return await self.fs._put(
lpath,
self._join(rpath),
*args,
**kwargs,
)
def put(self, lpath, rpath, *args, **kwargs):
return self.fs.put(
lpath,
self._join(rpath),
*args,
**kwargs,
)
async def _get_file(self, rpath, lpath, **kwargs):
return await self.fs._get_file(self._join(rpath), lpath, **kwargs)
def get_file(self, rpath, lpath, **kwargs):
return self.fs.get_file(self._join(rpath), lpath, **kwargs)
async def _get(self, rpath, *args, **kwargs):
return await self.fs._get(self._join(rpath), *args, **kwargs)
def get(self, rpath, *args, **kwargs):
return self.fs.get(self._join(rpath), *args, **kwargs)
async def _isfile(self, path):
return await self.fs._isfile(self._join(path))
def isfile(self, path):
return self.fs.isfile(self._join(path))
async def _isdir(self, path):
return await self.fs._isdir(self._join(path))
def isdir(self, path):
return self.fs.isdir(self._join(path))
async def _size(self, path):
return await self.fs._size(self._join(path))
def size(self, path):
return self.fs.size(self._join(path))
async def _exists(self, path):
return await self.fs._exists(self._join(path))
def exists(self, path):
return self.fs.exists(self._join(path))
async def _info(self, path, **kwargs):
info = await self.fs._info(self._join(path), **kwargs)
info = info.copy()
info["name"] = self._relpath(info["name"])
return info
def info(self, path, **kwargs):
info = self.fs.info(self._join(path), **kwargs)
info = info.copy()
info["name"] = self._relpath(info["name"])
return info
async def _ls(self, path, detail=True, **kwargs):
ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
if detail:
out = []
for entry in ret:
entry = entry.copy()
entry["name"] = self._relpath(entry["name"])
out.append(entry)
return out
return self._relpath(ret)
def ls(self, path, detail=True, **kwargs):
ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
if detail:
out = []
for entry in ret:
entry = entry.copy()
entry["name"] = self._relpath(entry["name"])
out.append(entry)
return out
return self._relpath(ret)
async def _walk(self, path, *args, **kwargs):
async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
yield self._relpath(root), dirs, files
def walk(self, path, *args, **kwargs):
for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
yield self._relpath(root), dirs, files
async def _glob(self, path, **kwargs):
detail = kwargs.get("detail", False)
ret = await self.fs._glob(self._join(path), **kwargs)
if detail:
return {self._relpath(path): info for path, info in ret.items()}
return self._relpath(ret)
def glob(self, path, **kwargs):
detail = kwargs.get("detail", False)
ret = self.fs.glob(self._join(path), **kwargs)
if detail:
return {self._relpath(path): info for path, info in ret.items()}
return self._relpath(ret)
async def _du(self, path, *args, **kwargs):
total = kwargs.get("total", True)
ret = await self.fs._du(self._join(path), *args, **kwargs)
if total:
return ret
return {self._relpath(path): size for path, size in ret.items()}
def du(self, path, *args, **kwargs):
total = kwargs.get("total", True)
ret = self.fs.du(self._join(path), *args, **kwargs)
if total:
return ret
return {self._relpath(path): size for path, size in ret.items()}
async def _find(self, path, *args, **kwargs):
detail = kwargs.get("detail", False)
ret = await self.fs._find(self._join(path), *args, **kwargs)
if detail:
return {self._relpath(path): info for path, info in ret.items()}
return self._relpath(ret)
def find(self, path, *args, **kwargs):
detail = kwargs.get("detail", False)
ret = self.fs.find(self._join(path), *args, **kwargs)
if detail:
return {self._relpath(path): info for path, info in ret.items()}
return self._relpath(ret)
async def _expand_path(self, path, *args, **kwargs):
return self._relpath(
await self.fs._expand_path(self._join(path), *args, **kwargs)
)
def expand_path(self, path, *args, **kwargs):
return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))
async def _mkdir(self, path, *args, **kwargs):
return await self.fs._mkdir(self._join(path), *args, **kwargs)
def mkdir(self, path, *args, **kwargs):
return self.fs.mkdir(self._join(path), *args, **kwargs)
async def _makedirs(self, path, *args, **kwargs):
return await self.fs._makedirs(self._join(path), *args, **kwargs)
def makedirs(self, path, *args, **kwargs):
return self.fs.makedirs(self._join(path), *args, **kwargs)
def rmdir(self, path):
return self.fs.rmdir(self._join(path))
def mv(self, path1, path2, **kwargs):
return self.fs.mv(
self._join(path1),
self._join(path2),
**kwargs,
)
def touch(self, path, **kwargs):
return self.fs.touch(self._join(path), **kwargs)
def created(self, path):
return self.fs.created(self._join(path))
def modified(self, path):
return self.fs.modified(self._join(path))
def sign(self, path, *args, **kwargs):
return self.fs.sign(self._join(path), *args, **kwargs)
def __repr__(self):
return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"
def open(
self,
path,
*args,
**kwargs,
):
return self.fs.open(
self._join(path),
*args,
**kwargs,
)
async def open_async(
self,
path,
*args,
**kwargs,
):
return await self.fs.open_async(
self._join(path),
*args,
**kwargs,
)
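# Example (a minimal sketch; assumes this class is fsspec's DirFileSystem, registered
# under the "dir" protocol, and that the placeholder directory "/tmp/data" exists):
if __name__ == "__main__":
    import fsspec

    dfs = fsspec.filesystem("dir", path="/tmp/data", target_protocol="file")
    # Paths are taken relative to the prefix, so this lists "/tmp/data" on the
    # wrapped local filesystem and returns prefix-relative names.
    print(dfs.ls(""))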

View File

@@ -0,0 +1,387 @@
import os
import uuid
from ftplib import FTP, FTP_TLS, Error, error_perm
from typing import Any
from ..spec import AbstractBufferedFile, AbstractFileSystem
from ..utils import infer_storage_options, isfilelike
class FTPFileSystem(AbstractFileSystem):
"""A filesystem over classic FTP"""
root_marker = "/"
cachable = False
protocol = "ftp"
def __init__(
self,
host,
port=21,
username=None,
password=None,
acct=None,
block_size=None,
tempdir=None,
timeout=30,
encoding="utf-8",
tls=False,
**kwargs,
):
"""
You can use _get_kwargs_from_urls to get some kwargs from
a reasonable FTP url.
Authentication will be anonymous if username/password are not
given.
Parameters
----------
host: str
The remote server name/ip to connect to
port: int
Port to connect with
username: str or None
If authenticating, the user's identifier
password: str or None
User's password on the server, if using
acct: str or None
Some servers also need an "account" string for auth
block_size: int or None
If given, the read-ahead or write buffer size.
tempdir: str
Directory on remote to put temporary files when in a transaction
timeout: int
Timeout of the ftp connection in seconds
encoding: str
Encoding to use for directories and filenames in FTP connection
tls: bool
Use FTP-TLS, by default False
"""
super().__init__(**kwargs)
self.host = host
self.port = port
self.tempdir = tempdir or "/tmp"
self.cred = username or "", password or "", acct or ""
self.timeout = timeout
self.encoding = encoding
if block_size is not None:
self.blocksize = block_size
else:
self.blocksize = 2**16
self.tls = tls
self._connect()
if self.tls:
self.ftp.prot_p()
def _connect(self):
if self.tls:
ftp_cls = FTP_TLS
else:
ftp_cls = FTP
self.ftp = ftp_cls(timeout=self.timeout, encoding=self.encoding)
self.ftp.connect(self.host, self.port)
self.ftp.login(*self.cred)
@classmethod
def _strip_protocol(cls, path):
return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
@staticmethod
def _get_kwargs_from_urls(urlpath):
out = infer_storage_options(urlpath)
out.pop("path", None)
out.pop("protocol", None)
return out
def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
out = []
if path not in self.dircache:
try:
try:
out = [
(fn, details)
for (fn, details) in self.ftp.mlsd(path)
if fn not in [".", ".."]
and details["type"] not in ["pdir", "cdir"]
]
except error_perm:
out = _mlsd2(self.ftp, path) # Not platform independent
for fn, details in out:
details["name"] = "/".join(
["" if path == "/" else path, fn.lstrip("/")]
)
if details["type"] == "file":
details["size"] = int(details["size"])
else:
details["size"] = 0
if details["type"] == "dir":
details["type"] = "directory"
self.dircache[path] = out
except Error:
try:
info = self.info(path)
if info["type"] == "file":
out = [(path, info)]
except (Error, IndexError) as exc:
raise FileNotFoundError(path) from exc
files = self.dircache.get(path, out)
if not detail:
return sorted([fn for fn, details in files])
return [details for fn, details in files]
def info(self, path, **kwargs):
# implement with direct method
path = self._strip_protocol(path)
if path == "/":
# special case, since this dir has no real entry
return {"name": "/", "size": 0, "type": "directory"}
files = self.ls(self._parent(path).lstrip("/"), True)
try:
out = next(f for f in files if f["name"] == path)
except StopIteration as exc:
raise FileNotFoundError(path) from exc
return out
def get_file(self, rpath, lpath, **kwargs):
if self.isdir(rpath):
if not os.path.exists(lpath):
os.mkdir(lpath)
return
if isfilelike(lpath):
outfile = lpath
else:
outfile = open(lpath, "wb")
def cb(x):
outfile.write(x)
self.ftp.retrbinary(
f"RETR {rpath}",
blocksize=self.blocksize,
callback=cb,
)
if not isfilelike(lpath):
outfile.close()
def cat_file(self, path, start=None, end=None, **kwargs):
if end is not None:
return super().cat_file(path, start, end, **kwargs)
out = []
def cb(x):
out.append(x)
try:
self.ftp.retrbinary(
f"RETR {path}",
blocksize=self.blocksize,
rest=start,
callback=cb,
)
except (Error, error_perm) as orig_exc:
raise FileNotFoundError(path) from orig_exc
return b"".join(out)
def _open(
self,
path,
mode="rb",
block_size=None,
cache_options=None,
autocommit=True,
**kwargs,
):
path = self._strip_protocol(path)
block_size = block_size or self.blocksize
return FTPFile(
self,
path,
mode=mode,
block_size=block_size,
tempdir=self.tempdir,
autocommit=autocommit,
cache_options=cache_options,
)
def _rm(self, path):
path = self._strip_protocol(path)
self.ftp.delete(path)
self.invalidate_cache(self._parent(path))
def rm(self, path, recursive=False, maxdepth=None):
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
for p in reversed(paths):
if self.isfile(p):
self.rm_file(p)
else:
self.rmdir(p)
def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
path = self._strip_protocol(path)
parent = self._parent(path)
if parent != self.root_marker and not self.exists(parent) and create_parents:
self.mkdir(parent, create_parents=create_parents)
self.ftp.mkd(path)
self.invalidate_cache(self._parent(path))
def makedirs(self, path: str, exist_ok: bool = False) -> None:
path = self._strip_protocol(path)
if self.exists(path):
# NB: "/" does not "exist" as it has no directory entry
if not exist_ok:
raise FileExistsError(f"{path} exists without `exist_ok`")
# exist_ok=True -> no-op
else:
self.mkdir(path, create_parents=True)
def rmdir(self, path):
path = self._strip_protocol(path)
self.ftp.rmd(path)
self.invalidate_cache(self._parent(path))
def mv(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
self.ftp.rename(path1, path2)
self.invalidate_cache(self._parent(path1))
self.invalidate_cache(self._parent(path2))
def __del__(self):
self.ftp.close()
def invalidate_cache(self, path=None):
if path is None:
self.dircache.clear()
else:
self.dircache.pop(path, None)
super().invalidate_cache(path)
class TransferDone(Exception):
"""Internal exception to break out of transfer"""
pass
class FTPFile(AbstractBufferedFile):
"""Interact with a remote FTP file with read/write buffering"""
def __init__(
self,
fs,
path,
mode="rb",
block_size="default",
autocommit=True,
cache_type="readahead",
cache_options=None,
**kwargs,
):
super().__init__(
fs,
path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_type=cache_type,
cache_options=cache_options,
**kwargs,
)
if not autocommit:
self.target = self.path
self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])
def commit(self):
self.fs.mv(self.path, self.target)
def discard(self):
self.fs.rm(self.path)
def _fetch_range(self, start, end):
"""Get bytes between given byte limits
Implemented by raising an exception in the fetch callback when the
number of bytes received reaches the requested amount.
Will fail if the server does not respect the REST command on
retrieve requests.
"""
out = []
total = [0]
def callback(x):
total[0] += len(x)
if total[0] > end - start:
out.append(x[: (end - start) - total[0]])
if end < self.size:
raise TransferDone
else:
out.append(x)
if total[0] == end - start and end < self.size:
raise TransferDone
try:
self.fs.ftp.retrbinary(
f"RETR {self.path}",
blocksize=self.blocksize,
rest=start,
callback=callback,
)
except TransferDone:
try:
# stop transfer, we got enough bytes for this block
self.fs.ftp.abort()
self.fs.ftp.getmultiline()
except Error:
self.fs._connect()
return b"".join(out)
def _upload_chunk(self, final=False):
self.buffer.seek(0)
self.fs.ftp.storbinary(
f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset
)
return True
def _mlsd2(ftp, path="."):
"""
Fall back to using `dir` instead of `mlsd` if not supported.
This parses a Linux style `ls -l` response to `dir`, but the response may
be platform dependent.
Parameters
----------
ftp: ftplib.FTP
path: str
Expects to be given path, but defaults to ".".
"""
lines = []
minfo = []
ftp.dir(path, lines.append)
for line in lines:
split_line = line.split()
if len(split_line) < 9:
continue
this = (
split_line[-1],
{
"modify": " ".join(split_line[5:8]),
"unix.owner": split_line[2],
"unix.group": split_line[3],
"unix.mode": split_line[0],
"size": split_line[4],
},
)
if this[1]["unix.mode"][0] == "d":
this[1]["type"] = "dir"
else:
this[1]["type"] = "file"
minfo.append(this)
return minfo
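# Example (a minimal sketch; the host and remote path are placeholders and assume
# an anonymous FTP server is reachable):
if __name__ == "__main__":
    fs = FTPFileSystem(host="ftp.example.com")
    print(fs.ls("/"))  # listing via MLSD, falling back to _mlsd2 over `dir`
    with fs.open("/pub/README", "rb") as f:  # buffered, read-ahead FTPFile
        print(f.read(100))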

View File

@@ -0,0 +1,241 @@
import requests
from ..spec import AbstractFileSystem
from ..utils import infer_storage_options
from .memory import MemoryFile
class GistFileSystem(AbstractFileSystem):
"""
Interface to files in a single GitHub Gist.
Provides read-only access to a gist's files. Gists do not contain
subdirectories, so file listing is straightforward.
Parameters
----------
gist_id: str
The ID of the gist you want to access (the long hex value from the URL).
filenames: list[str] (optional)
If provided, only make a file system representing these files, and do not fetch
the list of all files for this gist.
sha: str (optional)
If provided, fetch a particular revision of the gist. If omitted,
the latest revision is used.
username: str (optional)
GitHub username for authentication.
token: str (optional)
GitHub personal access token (required if username is given).
timeout: (float, float) or float, optional
Connect and read timeouts for requests (default 60s each).
kwargs: dict
Stored on `self.request_kw` and passed to `requests.get` when fetching Gist
metadata or reading ("opening") a file.
"""
protocol = "gist"
gist_url = "https://api.github.com/gists/{gist_id}"
gist_rev_url = "https://api.github.com/gists/{gist_id}/{sha}"
def __init__(
self,
gist_id,
filenames=None,
sha=None,
username=None,
token=None,
timeout=None,
**kwargs,
):
super().__init__()
self.gist_id = gist_id
self.filenames = filenames
self.sha = sha # revision of the gist (optional)
if username is not None and token is None:
raise ValueError("User auth requires a token")
self.username = username
self.token = token
self.request_kw = kwargs
# Default timeouts to 60s connect/read if none provided
self.timeout = timeout if timeout is not None else (60, 60)
# We use a single-level "directory" cache, because a gist is essentially flat
self.dircache[""] = self._fetch_file_list()
@property
def kw(self):
"""Auth parameters passed to 'requests' if we have username/token."""
kw = {
"headers": {
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
}
}
kw.update(self.request_kw)
if self.username and self.token:
kw["auth"] = (self.username, self.token)
elif self.token:
kw["headers"]["Authorization"] = f"Bearer {self.token}"
return kw
def _fetch_gist_metadata(self):
"""
Fetch the JSON metadata for this gist (possibly for a specific revision).
"""
if self.sha:
url = self.gist_rev_url.format(gist_id=self.gist_id, sha=self.sha)
else:
url = self.gist_url.format(gist_id=self.gist_id)
r = requests.get(url, timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(
f"Gist not found: {self.gist_id}@{self.sha or 'latest'}"
)
r.raise_for_status()
return r.json()
def _fetch_file_list(self):
"""
Returns a list of dicts describing each file in the gist. These get stored
in self.dircache[""].
"""
meta = self._fetch_gist_metadata()
if self.filenames:
available_files = meta.get("files", {})
files = {}
for fn in self.filenames:
if fn not in available_files:
raise FileNotFoundError(fn)
files[fn] = available_files[fn]
else:
files = meta.get("files", {})
out = []
for fname, finfo in files.items():
if finfo is None:
# Occasionally GitHub returns a file entry with null if it was deleted
continue
# Build a directory entry
out.append(
{
"name": fname, # file's name
"type": "file", # gists have no subdirectories
"size": finfo.get("size", 0), # file size in bytes
"raw_url": finfo.get("raw_url"),
}
)
return out
@classmethod
def _strip_protocol(cls, path):
"""
Remove 'gist://' from the path, if present.
"""
# The default infer_storage_options can handle gist://username:token@id/file
# or gist://id/file, but let's ensure we handle a normal usage too.
# We'll just strip the protocol prefix if it exists.
path = infer_storage_options(path).get("path", path)
return path.lstrip("/")
@staticmethod
def _get_kwargs_from_urls(path):
"""
Parse 'gist://' style URLs into GistFileSystem constructor kwargs.
For example:
gist://:TOKEN@<gist_id>/file.txt
gist://username:TOKEN@<gist_id>/file.txt
"""
so = infer_storage_options(path)
out = {}
if "username" in so and so["username"]:
out["username"] = so["username"]
if "password" in so and so["password"]:
out["token"] = so["password"]
if "host" in so and so["host"]:
# We interpret 'host' as the gist ID
out["gist_id"] = so["host"]
# Extract SHA and filename from path
if "path" in so and so["path"]:
path_parts = so["path"].rsplit("/", 2)[-2:]
if len(path_parts) == 2:
if path_parts[0]: # SHA present
out["sha"] = path_parts[0]
if path_parts[1]: # filename also present
out["filenames"] = [path_parts[1]]
return out
def ls(self, path="", detail=False, **kwargs):
"""
List files in the gist. Gists are single-level, so any 'path' is basically
the filename, or empty for all files.
Parameters
----------
path : str, optional
The filename to list. If empty, returns all files in the gist.
detail : bool, default False
If True, return a list of dicts; if False, return a list of filenames.
"""
path = self._strip_protocol(path or "")
# If path is empty, return all
if path == "":
results = self.dircache[""]
else:
# We want just the single file with this name
all_files = self.dircache[""]
results = [f for f in all_files if f["name"] == path]
if not results:
raise FileNotFoundError(path)
if detail:
return results
else:
return sorted(f["name"] for f in results)
def _open(self, path, mode="rb", block_size=None, **kwargs):
"""
Read a single file from the gist.
"""
if mode != "rb":
raise NotImplementedError("GitHub Gist FS is read-only (no write).")
path = self._strip_protocol(path)
# Find the file entry in our dircache
matches = [f for f in self.dircache[""] if f["name"] == path]
if not matches:
raise FileNotFoundError(path)
finfo = matches[0]
raw_url = finfo.get("raw_url")
if not raw_url:
raise FileNotFoundError(f"No raw_url for file: {path}")
r = requests.get(raw_url, timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
return MemoryFile(path, None, r.content)
def cat(self, path, recursive=False, on_error="raise", **kwargs):
"""
Return {path: contents} for the given file or files. If 'recursive' is True,
and path is empty, returns all files in the gist.
"""
paths = self.expand_path(path, recursive=recursive)
out = {}
for p in paths:
try:
with self.open(p, "rb") as f:
out[p] = f.read()
except FileNotFoundError as e:
if on_error == "raise":
raise e
elif on_error == "omit":
pass # skip
else:
out[p] = e
if len(paths) == 1 and paths[0] == path:
return out[path]
return out
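# Example (a minimal sketch; the gist ID is a placeholder, and unauthenticated
# requests are subject to GitHub API rate limits):
if __name__ == "__main__":
    fs = GistFileSystem(gist_id="0123456789abcdef0123456789abcdef")
    names = fs.ls("")        # every file in the (flat) gist
    print(names)
    print(fs.cat(names[0]))  # raw bytes of the first file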

View File

@@ -0,0 +1,114 @@
import os
import pygit2
from fsspec.spec import AbstractFileSystem
from .memory import MemoryFile
class GitFileSystem(AbstractFileSystem):
"""Browse the files of a local git repo at any hash/tag/branch
(experimental backend)
"""
root_marker = ""
cachable = True
def __init__(self, path=None, fo=None, ref=None, **kwargs):
"""
Parameters
----------
path: str (optional)
Local location of the repo (uses current directory if not given).
May be deprecated in favour of ``fo``. When used with a higher
level function such as fsspec.open(), may be of the form
"git://[path-to-repo[:]][ref@]path/to/file" (but the actual
file path should not contain "@" or ":").
fo: str (optional)
Same as ``path``, but passed as part of a chained URL. This one
takes precedence if both are given.
ref: str (optional)
Reference to work with, could be a hash, tag or branch name. Defaults
to current working tree. Note that ``ls`` and ``open`` also take hash,
so this becomes the default for those operations
kwargs
"""
super().__init__(**kwargs)
self.repo = pygit2.Repository(fo or path or os.getcwd())
self.ref = ref or "master"
@classmethod
def _strip_protocol(cls, path):
path = super()._strip_protocol(path).lstrip("/")
if ":" in path:
path = path.split(":", 1)[1]
if "@" in path:
path = path.split("@", 1)[1]
return path.lstrip("/")
def _path_to_object(self, path, ref):
comm, ref = self.repo.resolve_refish(ref or self.ref)
parts = path.split("/")
tree = comm.tree
for part in parts:
if part and isinstance(tree, pygit2.Tree):
if part not in tree:
raise FileNotFoundError(path)
tree = tree[part]
return tree
@staticmethod
def _get_kwargs_from_urls(path):
path = path.removeprefix("git://")
out = {}
if ":" in path:
out["path"], path = path.split(":", 1)
if "@" in path:
out["ref"], path = path.split("@", 1)
return out
@staticmethod
def _object_to_info(obj, path=None):
# obj.name and obj.filemode are None for the root tree!
is_dir = isinstance(obj, pygit2.Tree)
return {
"type": "directory" if is_dir else "file",
"name": (
"/".join([path, obj.name or ""]).lstrip("/") if path else obj.name
),
"hex": str(obj.id),
"mode": "100644" if obj.filemode is None else f"{obj.filemode:o}",
"size": 0 if is_dir else obj.size,
}
def ls(self, path, detail=True, ref=None, **kwargs):
tree = self._path_to_object(self._strip_protocol(path), ref)
return [
GitFileSystem._object_to_info(obj, path)
if detail
else GitFileSystem._object_to_info(obj, path)["name"]
for obj in (tree if isinstance(tree, pygit2.Tree) else [tree])
]
def info(self, path, ref=None, **kwargs):
tree = self._path_to_object(self._strip_protocol(path), ref)
return GitFileSystem._object_to_info(tree, path)
def ukey(self, path, ref=None):
return self.info(path, ref=ref)["hex"]
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
ref=None,
**kwargs,
):
obj = self._path_to_object(path, ref or self.ref)
return MemoryFile(data=obj.data)
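# Example (a minimal sketch; assumes a local clone at the placeholder path
# /path/to/repo with a "master" branch, which is the default `ref` above):
if __name__ == "__main__":
    fs = GitFileSystem(path="/path/to/repo", ref="master")
    print(fs.ls(""))  # entries of the root tree at that ref
    with fs.open("README.md", "rb") as f:  # contents are served from an in-memory MemoryFile
        print(f.read(80))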

View File

@@ -0,0 +1,333 @@
import base64
import re
import requests
from ..spec import AbstractFileSystem
from ..utils import infer_storage_options
from .memory import MemoryFile
class GithubFileSystem(AbstractFileSystem):
"""Interface to files in github
An instance of this class provides the files residing within a remote github
repository. You may specify a point in the repo's history, by SHA, branch
or tag (the default is the repository's default branch).
For files less than 1 MB in size, file content is returned directly in a
MemoryFile. For larger files, or for files tracked by git-lfs, file content
is returned as an HTTPFile wrapping the ``download_url`` provided by the
GitHub API.
When using fsspec.open, allows URIs of the form:
- "github://path/file", in which case you must specify org, repo and
may specify sha in the extra args
- 'github://org:repo@/precip/catalog.yml', where the org and repo are
part of the URI
- 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
``sha`` can be the full or abbreviated hex of the commit you want to fetch
from, or a branch or tag name (so long as it doesn't contain special characters
like "/", "?", which would have to be HTTP-encoded).
For authorised access, you must provide username and token, which can be made
at https://github.com/settings/tokens
"""
url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
protocol = "github"
timeout = (60, 60) # connect, read timeouts
def __init__(
self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
):
super().__init__(**kwargs)
self.org = org
self.repo = repo
if (username is None) ^ (token is None):
raise ValueError("Auth required both username and token")
self.username = username
self.token = token
if timeout is not None:
self.timeout = timeout
if sha is None:
# look up default branch (not necessarily "master")
u = "https://api.github.com/repos/{org}/{repo}"
r = requests.get(
u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
)
r.raise_for_status()
sha = r.json()["default_branch"]
self.root = sha
self.ls("")
try:
from .http import HTTPFileSystem
self.http_fs = HTTPFileSystem(**kwargs)
except ImportError:
self.http_fs = None
@property
def kw(self):
if self.username:
return {"auth": (self.username, self.token)}
return {}
@classmethod
def repos(cls, org_or_user, is_org=True):
"""List repo names for given org or user
This may become the top level of the FS
Parameters
----------
org_or_user: str
Name of the github org or user to query
is_org: bool (default True)
Whether the name is an organisation (True) or user (False)
Returns
-------
List of string
"""
r = requests.get(
f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
timeout=cls.timeout,
)
r.raise_for_status()
return [repo["name"] for repo in r.json()]
@property
def tags(self):
"""Names of tags in the repo"""
r = requests.get(
f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
timeout=self.timeout,
**self.kw,
)
r.raise_for_status()
return [t["name"] for t in r.json()]
@property
def branches(self):
"""Names of branches in the repo"""
r = requests.get(
f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
timeout=self.timeout,
**self.kw,
)
r.raise_for_status()
return [t["name"] for t in r.json()]
@property
def refs(self):
"""Named references, tags and branches"""
return {"tags": self.tags, "branches": self.branches}
def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
"""List files at given path
Parameters
----------
path: str
Location to list, relative to repo root
detail: bool
If True, returns list of dicts, one per file; if False, returns
list of full filenames only
sha: str (optional)
List at the given point in the repo history, branch or tag name or commit
SHA
_sha: str (optional)
List this specific tree object (used internally to descend into trees)
"""
path = self._strip_protocol(path)
if path == "":
_sha = sha or self.root
if _sha is None:
parts = path.rstrip("/").split("/")
so_far = ""
_sha = sha or self.root
for part in parts:
out = self.ls(so_far, True, sha=sha, _sha=_sha)
so_far += "/" + part if so_far else part
out = [o for o in out if o["name"] == so_far]
if not out:
raise FileNotFoundError(path)
out = out[0]
if out["type"] == "file":
if detail:
return [out]
else:
return path
_sha = out["sha"]
if path not in self.dircache or sha not in [self.root, None]:
r = requests.get(
self.url.format(org=self.org, repo=self.repo, sha=_sha),
timeout=self.timeout,
**self.kw,
)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
types = {"blob": "file", "tree": "directory"}
out = [
{
"name": path + "/" + f["path"] if path else f["path"],
"mode": f["mode"],
"type": types[f["type"]],
"size": f.get("size", 0),
"sha": f["sha"],
}
for f in r.json()["tree"]
if f["type"] in types
]
if sha in [self.root, None]:
self.dircache[path] = out
else:
out = self.dircache[path]
if detail:
return out
else:
return sorted([f["name"] for f in out])
def invalidate_cache(self, path=None):
self.dircache.clear()
@classmethod
def _strip_protocol(cls, path):
opts = infer_storage_options(path)
if "username" not in opts:
return super()._strip_protocol(path)
return opts["path"].lstrip("/")
@staticmethod
def _get_kwargs_from_urls(path):
opts = infer_storage_options(path)
if "username" not in opts:
return {}
out = {"org": opts["username"], "repo": opts["password"]}
if opts["host"]:
out["sha"] = opts["host"]
return out
def _open(
self,
path,
mode="rb",
block_size=None,
cache_options=None,
sha=None,
**kwargs,
):
if mode != "rb":
raise NotImplementedError
# construct a url to hit the GitHub API's repo contents API
url = self.content_url.format(
org=self.org, repo=self.repo, path=path, sha=sha or self.root
)
# make a request to this API, and parse the response as JSON
r = requests.get(url, timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
content_json = r.json()
# if the response's content key is not empty, try to parse it as base64
if content_json["content"]:
content = base64.b64decode(content_json["content"])
# as long as the content does not start with the string
# "version https://git-lfs.github.com/"
# then it is probably not a git-lfs pointer and we can just return
# the content directly
if not content.startswith(b"version https://git-lfs.github.com/"):
return MemoryFile(None, None, content)
# we land here if the content was not present in the first response
# (regular file over 1MB or git-lfs tracked file)
# in this case, we let the HTTPFileSystem handle the download
if self.http_fs is None:
raise ImportError(
"Please install fsspec[http] to access github files >1 MB "
"or git-lfs tracked files."
)
return self.http_fs.open(
content_json["download_url"],
mode=mode,
block_size=block_size,
cache_options=cache_options,
**kwargs,
)
def rm(self, path, recursive=False, maxdepth=None, message=None):
path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
for p in reversed(path):
self.rm_file(p, message=message)
def rm_file(self, path, message=None, **kwargs):
"""
Remove a file from a specified branch using a given commit message.
Since the GitHub DELETE operation requires a branch name, and we can't reliably
determine whether the provided SHA refers to a branch, tag, or commit, we
assume it's a branch. If it's not, the user will encounter an error when
attempting to retrieve the file SHA or delete the file.
Parameters
----------
path: str
The file's location relative to the repository root.
message: str, optional
The commit message for the deletion.
"""
if not self.username:
raise ValueError("Authentication required")
path = self._strip_protocol(path)
# Attempt to get SHA from cache or Github API
sha = self._get_sha_from_cache(path)
if not sha:
url = self.content_url.format(
org=self.org, repo=self.repo, path=path.lstrip("/"), sha=self.root
)
r = requests.get(url, timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
sha = r.json()["sha"]
# Delete the file
delete_url = self.content_url.format(
org=self.org, repo=self.repo, path=path, sha=self.root
)
branch = self.root
data = {
"message": message or f"Delete {path}",
"sha": sha,
**({"branch": branch} if branch else {}),
}
r = requests.delete(delete_url, json=data, timeout=self.timeout, **self.kw)
error_message = r.json().get("message", "")
if re.search(r"Branch .+ not found", error_message):
error = "Remove only works when the filesystem is initialised from a branch or default (None)"
raise ValueError(error)
r.raise_for_status()
self.invalidate_cache(path)
def _get_sha_from_cache(self, path):
for entries in self.dircache.values():
for entry in entries:
entry_path = entry.get("name")
if entry_path and entry_path == path and "sha" in entry:
return entry["sha"]
return None
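# Example (a minimal sketch; org and repo are placeholders, and unauthenticated
# requests are limited by the GitHub API rate limits):
if __name__ == "__main__":
    fs = GithubFileSystem(org="some-org", repo="some-repo")  # default branch resolved via the API
    print(fs.ls(""))            # top-level tree of that branch
    print(fs.cat("README.md"))  # small files are returned directly from the contents API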

View File

@@ -0,0 +1,891 @@
import asyncio
import io
import logging
import re
import weakref
from copy import copy
from urllib.parse import urlparse
import aiohttp
import yarl
from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
from fsspec.callbacks import DEFAULT_CALLBACK
from fsspec.exceptions import FSTimeoutError
from fsspec.spec import AbstractBufferedFile
from fsspec.utils import (
DEFAULT_BLOCK_SIZE,
glob_translate,
isfilelike,
nullcontext,
tokenize,
)
from ..caching import AllBytes
# https://stackoverflow.com/a/15926317/3821154
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
logger = logging.getLogger("fsspec.http")
async def get_client(**kwargs):
return aiohttp.ClientSession(**kwargs)
class HTTPFileSystem(AsyncFileSystem):
"""
Simple File-System for fetching data via HTTP(S)
``ls()`` is implemented by loading the parent page and doing a regex
match on the result. If simple_links=True, anything of the form
"http(s)://server.com/stuff?thing=other" is considered a link; otherwise only links within
HTML href tags will be used.
"""
protocol = ("http", "https")
sep = "/"
def __init__(
self,
simple_links=True,
block_size=None,
same_scheme=True,
size_policy=None,
cache_type="bytes",
cache_options=None,
asynchronous=False,
loop=None,
client_kwargs=None,
get_client=get_client,
encoded=False,
**storage_options,
):
"""
NB: if this is called async, you must await set_client
Parameters
----------
block_size: int
Blocks to read bytes; if 0, will default to raw requests file-like
objects instead of HTTPFile instances
simple_links: bool
If True, will consider both HTML <a> tags and anything that looks
like a URL; if False, will consider only the former.
same_scheme: True
When doing ls/glob, if this is True, only consider paths that have
http/https matching the input URLs.
size_policy: this argument is deprecated
client_kwargs: dict
Passed to aiohttp.ClientSession, see
https://docs.aiohttp.org/en/stable/client_reference.html
For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
get_client: Callable[..., aiohttp.ClientSession]
A callable, which takes keyword arguments and constructs
an aiohttp.ClientSession. Its state will be managed by
the HTTPFileSystem class.
storage_options: key-value
Any other parameters passed on to requests
cache_type, cache_options: defaults used in open()
"""
super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
self.simple_links = simple_links
self.same_schema = same_scheme
self.cache_type = cache_type
self.cache_options = cache_options
self.client_kwargs = client_kwargs or {}
self.get_client = get_client
self.encoded = encoded
self.kwargs = storage_options
self._session = None
# Clean caching-related parameters from `storage_options`
# before propagating them as `request_options` through `self.kwargs`.
# TODO: Maybe rename `self.kwargs` to `self.request_options` to make
# it clearer.
request_options = copy(storage_options)
self.use_listings_cache = request_options.pop("use_listings_cache", False)
request_options.pop("listings_expiry_time", None)
request_options.pop("max_paths", None)
request_options.pop("skip_instance_cache", None)
self.kwargs = request_options
@property
def fsid(self):
return "http"
def encode_url(self, url):
return yarl.URL(url, encoded=self.encoded)
@staticmethod
def close_session(loop, session):
if loop is not None and loop.is_running():
try:
sync(loop, session.close, timeout=0.1)
return
except (TimeoutError, FSTimeoutError, NotImplementedError):
pass
connector = getattr(session, "_connector", None)
if connector is not None:
# close after loop is dead
connector._close()
async def set_session(self):
if self._session is None:
self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
if not self.asynchronous:
weakref.finalize(self, self.close_session, self.loop, self._session)
return self._session
@classmethod
def _strip_protocol(cls, path):
"""For HTTP, we always want to keep the full URL"""
return path
@classmethod
def _parent(cls, path):
# override, since _strip_protocol is different for URLs
par = super()._parent(path)
if len(par) > 7: # "http://..."
return par
return ""
async def _ls_real(self, url, detail=True, **kwargs):
# ignoring URL-encoded arguments
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(url)
session = await self.set_session()
async with session.get(self.encode_url(url), **self.kwargs) as r:
self._raise_not_found_for_status(r, url)
if "Content-Type" in r.headers:
mimetype = r.headers["Content-Type"].partition(";")[0]
else:
mimetype = None
if mimetype in ("text/html", None):
try:
text = await r.text(errors="ignore")
if self.simple_links:
links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
else:
links = [u[2] for u in ex.findall(text)]
except UnicodeDecodeError:
links = [] # binary, not HTML
else:
links = []
out = set()
parts = urlparse(url)
for l in links:
if isinstance(l, tuple):
l = l[1]
if l.startswith("/") and len(l) > 1:
# absolute URL on this server
l = f"{parts.scheme}://{parts.netloc}{l}"
if l.startswith("http"):
if self.same_schema and l.startswith(url.rstrip("/") + "/"):
out.add(l)
elif l.replace("https", "http").startswith(
url.replace("https", "http").rstrip("/") + "/"
):
# allowed to cross http <-> https
out.add(l)
else:
if l not in ["..", "../"]:
# Ignore FTP-like "parent"
out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
if not out and url.endswith("/"):
out = await self._ls_real(url.rstrip("/"), detail=False)
if detail:
return [
{
"name": u,
"size": None,
"type": "directory" if u.endswith("/") else "file",
}
for u in out
]
else:
return sorted(out)
async def _ls(self, url, detail=True, **kwargs):
if self.use_listings_cache and url in self.dircache:
out = self.dircache[url]
else:
out = await self._ls_real(url, detail=detail, **kwargs)
self.dircache[url] = out
return out
ls = sync_wrapper(_ls)
def _raise_not_found_for_status(self, response, url):
"""
Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
"""
if response.status == 404:
raise FileNotFoundError(url)
response.raise_for_status()
async def _cat_file(self, url, start=None, end=None, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(url)
if start is not None or end is not None:
if start == end:
return b""
headers = kw.pop("headers", {}).copy()
headers["Range"] = await self._process_limits(url, start, end)
kw["headers"] = headers
session = await self.set_session()
async with session.get(self.encode_url(url), **kw) as r:
out = await r.read()
self._raise_not_found_for_status(r, url)
return out
async def _get_file(
self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs
):
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(rpath)
session = await self.set_session()
async with session.get(self.encode_url(rpath), **kw) as r:
try:
size = int(r.headers["content-length"])
except (ValueError, KeyError):
size = None
callback.set_size(size)
self._raise_not_found_for_status(r, rpath)
if isfilelike(lpath):
outfile = lpath
else:
outfile = open(lpath, "wb") # noqa: ASYNC230
try:
chunk = True
while chunk:
chunk = await r.content.read(chunk_size)
outfile.write(chunk)
callback.relative_update(len(chunk))
finally:
if not isfilelike(lpath):
outfile.close()
async def _put_file(
self,
lpath,
rpath,
chunk_size=5 * 2**20,
callback=DEFAULT_CALLBACK,
method="post",
mode="overwrite",
**kwargs,
):
if mode != "overwrite":
raise NotImplementedError("Exclusive write")
async def gen_chunks():
# Support passing arbitrary file-like objects
# and use them instead of streams.
if isinstance(lpath, io.IOBase):
context = nullcontext(lpath)
use_seek = False # might not support seeking
else:
context = open(lpath, "rb") # noqa: ASYNC230
use_seek = True
with context as f:
if use_seek:
callback.set_size(f.seek(0, 2))
f.seek(0)
else:
callback.set_size(getattr(f, "size", None))
chunk = f.read(chunk_size)
while chunk:
yield chunk
callback.relative_update(len(chunk))
chunk = f.read(chunk_size)
kw = self.kwargs.copy()
kw.update(kwargs)
session = await self.set_session()
method = method.lower()
if method not in ("post", "put"):
raise ValueError(
f"method has to be either 'post' or 'put', not: {method!r}"
)
meth = getattr(session, method)
async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
self._raise_not_found_for_status(resp, rpath)
async def _exists(self, path, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
try:
logger.debug(path)
session = await self.set_session()
r = await session.get(self.encode_url(path), **kw)
async with r:
return r.status < 400
except aiohttp.ClientError:
return False
async def _isfile(self, path, **kwargs):
return await self._exists(path, **kwargs)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=None, # XXX: This differs from the base class.
cache_type=None,
cache_options=None,
size=None,
**kwargs,
):
"""Make a file-like object
Parameters
----------
path: str
Full URL with protocol
mode: string
must be "rb"
block_size: int or None
Bytes to download in one request; use instance value if None. If
zero, will return a streaming Requests file-like instance.
kwargs: key-value
Any other parameters, passed to requests calls
"""
if mode != "rb":
raise NotImplementedError
block_size = block_size if block_size is not None else self.block_size
kw = self.kwargs.copy()
kw["asynchronous"] = self.asynchronous
kw.update(kwargs)
info = {}
size = size or info.update(self.info(path, **kwargs)) or info["size"]
session = sync(self.loop, self.set_session)
if block_size and size and info.get("partial", True):
return HTTPFile(
self,
path,
session=session,
block_size=block_size,
mode=mode,
size=size,
cache_type=cache_type or self.cache_type,
cache_options=cache_options or self.cache_options,
loop=self.loop,
**kw,
)
else:
return HTTPStreamFile(
self,
path,
mode=mode,
loop=self.loop,
session=session,
**kw,
)
async def open_async(self, path, mode="rb", size=None, **kwargs):
session = await self.set_session()
if size is None:
try:
size = (await self._info(path, **kwargs))["size"]
except FileNotFoundError:
pass
return AsyncStreamFile(
self,
path,
loop=self.loop,
session=session,
size=size,
**kwargs,
)
def ukey(self, url):
"""Unique identifier; assume HTTP files are static, unchanging"""
return tokenize(url, self.kwargs, self.protocol)
async def _info(self, url, **kwargs):
"""Get info of URL
Tries to access location via HEAD, and then GET methods, but does
not fetch the data.
It is possible that the server does not supply any size information, in
which case size will be given as None (and certain operations on the
corresponding file will not work).
"""
info = {}
session = await self.set_session()
for policy in ["head", "get"]:
try:
info.update(
await _file_info(
self.encode_url(url),
size_policy=policy,
session=session,
**self.kwargs,
**kwargs,
)
)
if info.get("size") is not None:
break
except Exception as exc:
if policy == "get":
# If get failed, then raise a FileNotFoundError
raise FileNotFoundError(url) from exc
logger.debug("", exc_info=exc)
return {"name": url, "size": None, **info, "type": "file"}
async def _glob(self, path, maxdepth=None, **kwargs):
"""
Find files by glob-matching.
This implementation is identical to the one in AbstractFileSystem,
but "?" is not considered as a character for globbing, because it is
so common in URLs, often identifying the "query" part.
"""
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")
import re
ends_with_slash = path.endswith("/") # _strip_protocol strips trailing slash
path = self._strip_protocol(path)
append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*"))
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
min_idx = min(idx_star, idx_brace)
detail = kwargs.pop("detail", False)
if not has_magic(path):
if await self._exists(path, **kwargs):
if not detail:
return [path]
else:
return {path: await self._info(path, **kwargs)}
else:
if not detail:
return [] # glob of non-existent returns empty
else:
return {}
elif "/" in path[:min_idx]:
min_idx = path[:min_idx].rindex("/")
root = path[: min_idx + 1]
depth = path[min_idx + 1 :].count("/") + 1
else:
root = ""
depth = path[min_idx + 1 :].count("/") + 1
if "**" in path:
if maxdepth is not None:
idx_double_stars = path.find("**")
depth_double_stars = path[idx_double_stars:].count("/") + 1
depth = depth - depth_double_stars + maxdepth
else:
depth = None
allpaths = await self._find(
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
)
pattern = glob_translate(path + ("/" if ends_with_slash else ""))
pattern = re.compile(pattern)
out = {
(
p.rstrip("/")
if not append_slash_to_dirname
and info["type"] == "directory"
and p.endswith("/")
else p
): info
for p, info in sorted(allpaths.items())
if pattern.match(p.rstrip("/"))
}
if detail:
return out
else:
return list(out)
async def _isdir(self, path):
# override, since all URLs are (also) files
try:
return bool(await self._ls(path))
except (FileNotFoundError, ValueError):
return False
async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
"""
Write bytes to a remote file over HTTP.
Parameters
----------
path : str
Target URL where the data should be written
value : bytes
Data to be written
mode : str
How to write to the file - 'overwrite' or 'append'
**kwargs : dict
Additional parameters to pass to the HTTP request
"""
url = self._strip_protocol(path)
headers = kwargs.pop("headers", {})
headers["Content-Length"] = str(len(value))
session = await self.set_session()
async with session.put(url, data=value, headers=headers, **kwargs) as r:
r.raise_for_status()
class HTTPFile(AbstractBufferedFile):
"""
A file-like object pointing to a remote HTTP(S) resource
Supports only reading, with read-ahead of a predetermined block-size.
In the case that the server does not supply the filesize, only reading of
the complete file in one go is supported.
Parameters
----------
url: str
Full URL of the remote resource, including the protocol
session: aiohttp.ClientSession or None
All calls will be made within this session, to avoid restarting
connections where the server allows this
block_size: int or None
The amount of read-ahead to do, in bytes. Default is 5MB, or the value
configured for the FileSystem creating this file
size: None or int
If given, this is the size of the file in bytes, and we don't attempt
to call the server to find the value.
kwargs: all other key-values are passed to requests calls.
"""
def __init__(
self,
fs,
url,
session=None,
block_size=None,
mode="rb",
cache_type="bytes",
cache_options=None,
size=None,
loop=None,
asynchronous=False,
**kwargs,
):
if mode != "rb":
raise NotImplementedError("File mode not supported")
self.asynchronous = asynchronous
self.loop = loop
self.url = url
self.session = session
self.details = {"name": url, "size": size, "type": "file"}
super().__init__(
fs=fs,
path=url,
mode=mode,
block_size=block_size,
cache_type=cache_type,
cache_options=cache_options,
**kwargs,
)
def read(self, length=-1):
"""Read bytes from file
Parameters
----------
length: int
Read up to this many bytes. If negative, read all content to end of
file. If the server has not supplied the filesize, attempting to
read only part of the data will raise a ValueError.
"""
if (
(length < 0 and self.loc == 0) # explicit read all
# but not when the size is known and fits into a block anyways
and not (self.size is not None and self.size <= self.blocksize)
):
self._fetch_all()
if self.size is None:
if length < 0:
self._fetch_all()
else:
length = min(self.size - self.loc, length)
return super().read(length)
async def async_fetch_all(self):
"""Read whole file in one shot, without caching
This is only called when position is still at zero,
and read() is called without a byte-count.
"""
logger.debug(f"Fetch all for {self}")
if not isinstance(self.cache, AllBytes):
r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs)
async with r:
r.raise_for_status()
out = await r.read()
self.cache = AllBytes(
size=len(out), fetcher=None, blocksize=None, data=out
)
self.size = len(out)
_fetch_all = sync_wrapper(async_fetch_all)
def _parse_content_range(self, headers):
"""Parse the Content-Range header"""
s = headers.get("Content-Range", "")
m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
if not m:
return None, None, None
if m[1] == "*":
start = end = None
else:
start, end = [int(x) for x in m[1].split("-")]
total = None if m[2] == "*" else int(m[2])
return start, end, total
async def async_fetch_range(self, start, end):
"""Download a block of data
The expectation is that the server returns only the requested bytes,
with HTTP code 206. If this is not the case, we first check the headers,
and then stream the output - if the data size is bigger than we
requested, an exception is raised.
"""
logger.debug(f"Fetch range for {self}: {start}-{end}")
kwargs = self.kwargs.copy()
headers = kwargs.pop("headers", {}).copy()
headers["Range"] = f"bytes={start}-{end - 1}"
logger.debug(f"{self.url} : {headers['Range']}")
r = await self.session.get(
self.fs.encode_url(self.url), headers=headers, **kwargs
)
async with r:
if r.status == 416:
# range request outside file
return b""
r.raise_for_status()
# If the server has handled the range request, it should reply
# with status 206 (partial content). But we'll guess that a suitable
# Content-Range header or a Content-Length no more than the
# requested range also mean we have got the desired range.
response_is_range = (
r.status == 206
or self._parse_content_range(r.headers)[0] == start
or int(r.headers.get("Content-Length", end + 1)) <= end - start
)
if response_is_range:
# partial content, as expected
out = await r.read()
elif start > 0:
raise ValueError(
"The HTTP server doesn't appear to support range requests. "
"Only reading this file from the beginning is supported. "
"Open with block_size=0 for a streaming file interface."
)
else:
# Response is not a range, but we want the start of the file,
# so we can read the required amount anyway.
cl = 0
out = []
while True:
chunk = await r.content.read(2**20)
# data size unknown, let's read until we have enough
if chunk:
out.append(chunk)
cl += len(chunk)
if cl > end - start:
break
else:
break
out = b"".join(out)[: end - start]
return out
_fetch_range = sync_wrapper(async_fetch_range)
magic_check = re.compile("([*[])")
def has_magic(s):
match = magic_check.search(s)
return match is not None
class HTTPStreamFile(AbstractBufferedFile):
def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
self.asynchronous = kwargs.pop("asynchronous", False)
self.url = url
self.loop = loop
self.session = session
if mode != "rb":
raise ValueError
self.details = {"name": url, "size": None}
super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)
async def cor():
r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__()
self.fs._raise_not_found_for_status(r, url)
return r
self.r = sync(self.loop, cor)
self.loop = fs.loop
def seek(self, loc, whence=0):
if loc == 0 and whence == 1:
return
if loc == self.loc and whence == 0:
return
raise ValueError("Cannot seek streaming HTTP file")
async def _read(self, num=-1):
out = await self.r.content.read(num)
self.loc += len(out)
return out
read = sync_wrapper(_read)
async def _close(self):
self.r.close()
def close(self):
asyncio.run_coroutine_threadsafe(self._close(), self.loop)
super().close()
class AsyncStreamFile(AbstractAsyncStreamedFile):
def __init__(
self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
):
self.url = url
self.session = session
self.r = None
if mode != "rb":
raise ValueError
self.details = {"name": url, "size": None}
self.kwargs = kwargs
super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
self.size = size
async def read(self, num=-1):
if self.r is None:
r = await self.session.get(
self.fs.encode_url(self.url), **self.kwargs
).__aenter__()
self.fs._raise_not_found_for_status(r, self.url)
self.r = r
out = await self.r.content.read(num)
self.loc += len(out)
return out
async def close(self):
if self.r is not None:
self.r.close()
self.r = None
await super().close()
async def get_range(session, url, start, end, file=None, **kwargs):
# explicitly get a range when we know it must be safe
kwargs = kwargs.copy()
headers = kwargs.pop("headers", {}).copy()
headers["Range"] = f"bytes={start}-{end - 1}"
r = await session.get(url, headers=headers, **kwargs)
r.raise_for_status()
async with r:
out = await r.read()
if file:
with open(file, "r+b") as f: # noqa: ASYNC230
f.seek(start)
f.write(out)
else:
return out
async def _file_info(url, session, size_policy="head", **kwargs):
"""Call HEAD on the server to get details about the file (size/checksum etc.)
Default operation is to explicitly allow redirects and use encoding
'identity' (no compression) to get the true size of the target.
"""
logger.debug("Retrieve file size for %s", url)
kwargs = kwargs.copy()
ar = kwargs.pop("allow_redirects", True)
head = kwargs.get("headers", {}).copy()
head["Accept-Encoding"] = "identity"
kwargs["headers"] = head
info = {}
if size_policy == "head":
r = await session.head(url, allow_redirects=ar, **kwargs)
elif size_policy == "get":
r = await session.get(url, allow_redirects=ar, **kwargs)
else:
raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
async with r:
r.raise_for_status()
if "Content-Length" in r.headers:
# Some servers may choose to ignore Accept-Encoding and return
# compressed content, in which case the returned size is unreliable.
if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
"identity",
"",
]:
info["size"] = int(r.headers["Content-Length"])
elif "Content-Range" in r.headers:
info["size"] = int(r.headers["Content-Range"].split("/")[1])
if "Content-Type" in r.headers:
info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
if r.headers.get("Accept-Ranges") == "none":
# Some servers may explicitly discourage partial content requests, but
# the lack of "Accept-Ranges" does not always indicate they would fail
info["partial"] = False
info["url"] = str(r.url)
for checksum_field in ["ETag", "Content-MD5", "Digest", "Last-Modified"]:
if r.headers.get(checksum_field):
info[checksum_field] = r.headers[checksum_field]
return info
async def _file_size(url, session=None, *args, **kwargs):
if session is None:
session = await get_client()
info = await _file_info(url, session=session, *args, **kwargs)
return info.get("size")
file_size = sync_wrapper(_file_size)
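# Example (a minimal sketch; the URL is a placeholder and must point at a reachable
# server for the calls to succeed):
if __name__ == "__main__":
    import fsspec

    with fsspec.open("https://example.com/data.csv", "rb") as f:
        # HTTPFile issues ranged GETs when the server advertises support; pass
        # block_size=0 to get a plain streaming file instead.
        print(f.read(1024))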

View File

@@ -0,0 +1,931 @@
"""This file is largely copied from http.py"""
import io
import logging
import re
import urllib.error
import urllib.parse
from copy import copy
from json import dumps, loads
from urllib.parse import urlparse
try:
import yarl
except (ImportError, ModuleNotFoundError, OSError):
yarl = False
from fsspec.callbacks import _DEFAULT_CALLBACK
from fsspec.registry import register_implementation
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
from fsspec.utils import DEFAULT_BLOCK_SIZE, isfilelike, nullcontext, tokenize
from ..caching import AllBytes
# https://stackoverflow.com/a/15926317/3821154
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
logger = logging.getLogger("fsspec.http")
class JsHttpException(urllib.error.HTTPError): ...
class StreamIO(io.BytesIO):
# fake class, so you can set attributes on it
# will eventually actually stream
...
class ResponseProxy:
"""Looks like a requests response"""
def __init__(self, req, stream=False):
self.request = req
self.stream = stream
self._data = None
self._headers = None
@property
def raw(self):
if self._data is None:
b = self.request.response.to_bytes()
if self.stream:
self._data = StreamIO(b)
else:
self._data = b
return self._data
def close(self):
if hasattr(self, "_data"):
del self._data
@property
def headers(self):
if self._headers is None:
self._headers = dict(
[
_.split(": ")
for _ in self.request.getAllResponseHeaders().strip().split("\r\n")
]
)
return self._headers
@property
def status_code(self):
return int(self.request.status)
def raise_for_status(self):
if not self.ok:
raise JsHttpException(
self.url, self.status_code, self.reason, self.headers, None
)
def iter_content(self, chunksize, *_, **__):
while True:
out = self.raw.read(chunksize)
if out:
yield out
else:
break
@property
def reason(self):
return self.request.statusText
@property
def ok(self):
return self.status_code < 400
@property
def url(self):
return self.request.response.responseURL
@property
def text(self):
# TODO: encoding from headers
return self.content.decode()
@property
def content(self):
self.stream = False
return self.raw
def json(self):
return loads(self.text)
class RequestsSessionShim:
def __init__(self):
self.headers = {}
def request(
self,
method,
url,
params=None,
data=None,
headers=None,
cookies=None,
files=None,
auth=None,
timeout=None,
allow_redirects=None,
proxies=None,
hooks=None,
stream=None,
verify=None,
cert=None,
json=None,
):
from js import Blob, XMLHttpRequest
logger.debug("JS request: %s %s", method, url)
if cert or verify or proxies or files or cookies or hooks:
raise NotImplementedError
if data and json:
raise ValueError("Use json= or data=, not both")
req = XMLHttpRequest.new()
extra = auth if auth else ()
if params:
url = f"{url}?{urllib.parse.urlencode(params)}"
req.open(method, url, False, *extra)
if timeout:
req.timeout = timeout
if headers:
for k, v in headers.items():
req.setRequestHeader(k, v)
req.setRequestHeader("Accept", "application/octet-stream")
req.responseType = "arraybuffer"
if json:
blob = Blob.new([dumps(data)], {type: "application/json"})
req.send(blob)
elif data:
if isinstance(data, io.IOBase):
data = data.read()
blob = Blob.new([data], {type: "application/octet-stream"})
req.send(blob)
else:
req.send(None)
return ResponseProxy(req, stream=stream)
def get(self, url, **kwargs):
return self.request("GET", url, **kwargs)
def head(self, url, **kwargs):
return self.request("HEAD", url, **kwargs)
def post(self, url, **kwargs):
return self.request("POST", url, **kwargs)
def put(self, url, **kwargs):
return self.request("PUT", url, **kwargs)
def patch(self, url, **kwargs):
return self.request("PATCH", url, **kwargs)
def delete(self, url, **kwargs):
return self.request("DELETE", url, **kwargs)
class HTTPFileSystem(AbstractFileSystem):
"""
Simple File-System for fetching data via HTTP(S)
This is the BLOCKING version of the normal HTTPFileSystem. It uses
requests in normal Python and the JS runtime in Pyodide.
***This implementation is extremely experimental, do not use unless
you are testing pyodide/pyscript integration***
"""
protocol = ("http", "https", "sync-http", "sync-https")
sep = "/"
def __init__(
self,
simple_links=True,
block_size=None,
same_scheme=True,
cache_type="readahead",
cache_options=None,
client_kwargs=None,
encoded=False,
**storage_options,
):
"""
Parameters
----------
block_size: int
Blocks to read bytes; if 0, will default to raw requests file-like
objects instead of HTTPFile instances
simple_links: bool
If True, will consider both HTML <a> tags and anything that looks
like a URL; if False, will consider only the former.
same_scheme: bool
When doing ls/glob, if this is True, only consider paths that have
http/https matching the input URLs.
size_policy: this argument is deprecated
client_kwargs: dict
Passed to aiohttp.ClientSession, see
https://docs.aiohttp.org/en/stable/client_reference.html
For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
storage_options: key-value
Any other parameters passed on to requests
cache_type, cache_options: defaults used in open
"""
super().__init__(self, **storage_options)
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
self.simple_links = simple_links
self.same_schema = same_scheme
self.cache_type = cache_type
self.cache_options = cache_options
self.client_kwargs = client_kwargs or {}
self.encoded = encoded
self.kwargs = storage_options
try:
import js # noqa: F401
logger.debug("Starting JS session")
self.session = RequestsSessionShim()
self.js = True
except Exception as e:
import requests
logger.debug("Starting cpython session because of: %s", e)
self.session = requests.Session(**(client_kwargs or {}))
self.js = False
request_options = copy(storage_options)
self.use_listings_cache = request_options.pop("use_listings_cache", False)
request_options.pop("listings_expiry_time", None)
request_options.pop("max_paths", None)
request_options.pop("skip_instance_cache", None)
self.kwargs = request_options
@property
def fsid(self):
return "sync-http"
def encode_url(self, url):
if yarl:
return yarl.URL(url, encoded=self.encoded)
return url
@classmethod
def _strip_protocol(cls, path: str) -> str:
"""For HTTP, we always want to keep the full URL"""
path = path.replace("sync-http://", "http://").replace(
"sync-https://", "https://"
)
return path
@classmethod
def _parent(cls, path):
# override, since _strip_protocol is different for URLs
par = super()._parent(path)
if len(par) > 7: # "http://..."
return par
return ""
def _ls_real(self, url, detail=True, **kwargs):
# ignoring URL-encoded arguments
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(url)
r = self.session.get(self.encode_url(url), **self.kwargs)
self._raise_not_found_for_status(r, url)
text = r.text
if self.simple_links:
links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
else:
links = [u[2] for u in ex.findall(text)]
out = set()
parts = urlparse(url)
for l in links:
if isinstance(l, tuple):
l = l[1]
if l.startswith("/") and len(l) > 1:
# absolute URL on this server
l = parts.scheme + "://" + parts.netloc + l
if l.startswith("http"):
if self.same_schema and l.startswith(url.rstrip("/") + "/"):
out.add(l)
elif l.replace("https", "http").startswith(
url.replace("https", "http").rstrip("/") + "/"
):
# allowed to cross http <-> https
out.add(l)
else:
if l not in ["..", "../"]:
# Ignore FTP-like "parent"
out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
if not out and url.endswith("/"):
out = self._ls_real(url.rstrip("/"), detail=False)
if detail:
return [
{
"name": u,
"size": None,
"type": "directory" if u.endswith("/") else "file",
}
for u in out
]
else:
return sorted(out)
def ls(self, url, detail=True, **kwargs):
if self.use_listings_cache and url in self.dircache:
out = self.dircache[url]
else:
out = self._ls_real(url, detail=detail, **kwargs)
self.dircache[url] = out
return out
def _raise_not_found_for_status(self, response, url):
"""
Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
"""
if response.status_code == 404:
raise FileNotFoundError(url)
response.raise_for_status()
def cat_file(self, url, start=None, end=None, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(url)
if start is not None or end is not None:
if start == end:
return b""
headers = kw.pop("headers", {}).copy()
headers["Range"] = self._process_limits(url, start, end)
kw["headers"] = headers
r = self.session.get(self.encode_url(url), **kw)
self._raise_not_found_for_status(r, url)
return r.content
def get_file(
self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
):
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(rpath)
r = self.session.get(self.encode_url(rpath), **kw)
try:
size = int(
r.headers.get("content-length", None)
or r.headers.get("Content-Length", None)
)
except (ValueError, KeyError, TypeError):
size = None
callback.set_size(size)
self._raise_not_found_for_status(r, rpath)
if not isfilelike(lpath):
lpath = open(lpath, "wb")
for chunk in r.iter_content(chunk_size, decode_unicode=False):
lpath.write(chunk)
callback.relative_update(len(chunk))
def put_file(
self,
lpath,
rpath,
chunk_size=5 * 2**20,
callback=_DEFAULT_CALLBACK,
method="post",
**kwargs,
):
def gen_chunks():
# Support passing arbitrary file-like objects
# and use them instead of streams.
if isinstance(lpath, io.IOBase):
context = nullcontext(lpath)
use_seek = False # might not support seeking
else:
context = open(lpath, "rb")
use_seek = True
with context as f:
if use_seek:
callback.set_size(f.seek(0, 2))
f.seek(0)
else:
callback.set_size(getattr(f, "size", None))
chunk = f.read(chunk_size)
while chunk:
yield chunk
callback.relative_update(len(chunk))
chunk = f.read(chunk_size)
kw = self.kwargs.copy()
kw.update(kwargs)
method = method.lower()
if method not in ("post", "put"):
raise ValueError(
f"method has to be either 'post' or 'put', not: {method!r}"
)
meth = getattr(self.session, method)
resp = meth(rpath, data=gen_chunks(), **kw)
self._raise_not_found_for_status(resp, rpath)
def _process_limits(self, url, start, end):
"""Helper for "Range"-based _cat_file"""
size = None
suff = False
if start is not None and start < 0:
# if start is negative and end None, end is the "suffix length"
if end is None:
end = -start
start = ""
suff = True
else:
size = size or self.info(url)["size"]
start = size + start
elif start is None:
start = 0
if not suff:
if end is not None and end < 0:
if start is not None:
size = size or self.info(url)["size"]
end = size + end
elif end is None:
end = ""
if isinstance(end, int):
end -= 1 # bytes range is inclusive
return f"bytes={start}-{end}"
def exists(self, path, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
try:
logger.debug(path)
r = self.session.get(self.encode_url(path), **kw)
return r.status_code < 400
except Exception:
return False
def isfile(self, path, **kwargs):
return self.exists(path, **kwargs)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=None, # XXX: This differs from the base class.
cache_type=None,
cache_options=None,
size=None,
**kwargs,
):
"""Make a file-like object
Parameters
----------
path: str
Full URL with protocol
mode: string
must be "rb"
block_size: int or None
Bytes to download in one request; use instance value if None. If
zero, will return a streaming Requests file-like instance.
kwargs: key-value
Any other parameters, passed to requests calls
"""
if mode != "rb":
raise NotImplementedError
block_size = block_size if block_size is not None else self.block_size
kw = self.kwargs.copy()
kw.update(kwargs)
size = size or self.info(path, **kwargs)["size"]
if block_size and size:
return HTTPFile(
self,
path,
session=self.session,
block_size=block_size,
mode=mode,
size=size,
cache_type=cache_type or self.cache_type,
cache_options=cache_options or self.cache_options,
**kw,
)
else:
return HTTPStreamFile(
self,
path,
mode=mode,
session=self.session,
**kw,
)
def ukey(self, url):
"""Unique identifier; assume HTTP files are static, unchanging"""
return tokenize(url, self.kwargs, self.protocol)
def info(self, url, **kwargs):
"""Get info of URL
Tries to access location via HEAD, and then GET methods, but does
not fetch the data.
It is possible that the server does not supply any size information, in
which case size will be given as None (and certain operations on the
corresponding file will not work).
"""
info = {}
for policy in ["head", "get"]:
try:
info.update(
_file_info(
self.encode_url(url),
size_policy=policy,
session=self.session,
**self.kwargs,
**kwargs,
)
)
if info.get("size") is not None:
break
except Exception as exc:
if policy == "get":
# If get failed, then raise a FileNotFoundError
raise FileNotFoundError(url) from exc
logger.debug(str(exc))
return {"name": url, "size": None, **info, "type": "file"}
def glob(self, path, maxdepth=None, **kwargs):
"""
Find files by glob-matching.
This implementation is identical to the one in AbstractFileSystem,
but "?" is not considered as a character for globbing, because it is
so common in URLs, often identifying the "query" part.
"""
import re
ends = path.endswith("/")
path = self._strip_protocol(path)
indstar = path.find("*") if path.find("*") >= 0 else len(path)
indbrace = path.find("[") if path.find("[") >= 0 else len(path)
ind = min(indstar, indbrace)
detail = kwargs.pop("detail", False)
if not has_magic(path):
root = path
depth = 1
if ends:
path += "/*"
elif self.exists(path):
if not detail:
return [path]
else:
return {path: self.info(path)}
else:
if not detail:
return [] # glob of non-existent returns empty
else:
return {}
elif "/" in path[:ind]:
ind2 = path[:ind].rindex("/")
root = path[: ind2 + 1]
depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
else:
root = ""
depth = None if "**" in path else path[ind + 1 :].count("/") + 1
allpaths = self.find(
root, maxdepth=maxdepth or depth, withdirs=True, detail=True, **kwargs
)
# Escape characters special to python regex, leaving our supported
# special characters in place.
# See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
# for shell globbing details.
pattern = (
"^"
+ (
path.replace("\\", r"\\")
.replace(".", r"\.")
.replace("+", r"\+")
.replace("//", "/")
.replace("(", r"\(")
.replace(")", r"\)")
.replace("|", r"\|")
.replace("^", r"\^")
.replace("$", r"\$")
.replace("{", r"\{")
.replace("}", r"\}")
.rstrip("/")
)
+ "$"
)
pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
pattern = re.sub("[*]", "[^/]*", pattern)
pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
out = {
p: allpaths[p]
for p in sorted(allpaths)
if pattern.match(p.replace("//", "/").rstrip("/"))
}
if detail:
return out
else:
return list(out)
def isdir(self, path):
# override, since all URLs are (also) files
try:
return bool(self.ls(path))
except (FileNotFoundError, ValueError):
return False
class HTTPFile(AbstractBufferedFile):
"""
A file-like object pointing to a remote HTTP(S) resource
Supports only reading, with read-ahead of a predetermined block-size.
In the case that the server does not supply the filesize, only reading of
the complete file in one go is supported.
Parameters
----------
url: str
Full URL of the remote resource, including the protocol
session: requests.Session or None
All calls will be made within this session, to avoid restarting
connections where the server allows this
block_size: int or None
The amount of read-ahead to do, in bytes. Default is 5MB, or the value
configured for the FileSystem creating this file
size: None or int
If given, this is the size of the file in bytes, and we don't attempt
to call the server to find the value.
kwargs: all other key-values are passed to requests calls.
"""
def __init__(
self,
fs,
url,
session=None,
block_size=None,
mode="rb",
cache_type="bytes",
cache_options=None,
size=None,
**kwargs,
):
if mode != "rb":
raise NotImplementedError("File mode not supported")
self.url = url
self.session = session
self.details = {"name": url, "size": size, "type": "file"}
super().__init__(
fs=fs,
path=url,
mode=mode,
block_size=block_size,
cache_type=cache_type,
cache_options=cache_options,
**kwargs,
)
def read(self, length=-1):
"""Read bytes from file
Parameters
----------
length: int
Read up to this many bytes. If negative, read all content to end of
file. If the server has not supplied the filesize, attempting to
read only part of the data will raise a ValueError.
"""
if (
(length < 0 and self.loc == 0) # explicit read all
# but not when the size is known and fits into a block anyways
and not (self.size is not None and self.size <= self.blocksize)
):
self._fetch_all()
if self.size is None:
if length < 0:
self._fetch_all()
else:
length = min(self.size - self.loc, length)
return super().read(length)
def _fetch_all(self):
"""Read whole file in one shot, without caching
This is only called when position is still at zero,
and read() is called without a byte-count.
"""
logger.debug(f"Fetch all for {self}")
if not isinstance(self.cache, AllBytes):
r = self.session.get(self.fs.encode_url(self.url), **self.kwargs)
r.raise_for_status()
out = r.content
self.cache = AllBytes(size=len(out), fetcher=None, blocksize=None, data=out)
self.size = len(out)
def _parse_content_range(self, headers):
"""Parse the Content-Range header"""
s = headers.get("Content-Range", "")
m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
if not m:
return None, None, None
if m[1] == "*":
start = end = None
else:
start, end = [int(x) for x in m[1].split("-")]
total = None if m[2] == "*" else int(m[2])
return start, end, total
def _fetch_range(self, start, end):
"""Download a block of data
The expectation is that the server returns only the requested bytes,
with HTTP code 206. If this is not the case, we first check the headers,
and then stream the output - if the data size is bigger than we
requested, an exception is raised.
"""
logger.debug(f"Fetch range for {self}: {start}-{end}")
kwargs = self.kwargs.copy()
headers = kwargs.pop("headers", {}).copy()
headers["Range"] = f"bytes={start}-{end - 1}"
logger.debug("%s : %s", self.url, headers["Range"])
r = self.session.get(self.fs.encode_url(self.url), headers=headers, **kwargs)
if r.status_code == 416:
# range request outside file
return b""
r.raise_for_status()
# If the server has handled the range request, it should reply
# with status 206 (partial content). But we'll guess that a suitable
# Content-Range header or a Content-Length no more than the
# requested range also mean we have got the desired range.
cl = r.headers.get("Content-Length", r.headers.get("content-length", end + 1))
response_is_range = (
r.status_code == 206
or self._parse_content_range(r.headers)[0] == start
or int(cl) <= end - start
)
if response_is_range:
# partial content, as expected
out = r.content
elif start > 0:
raise ValueError(
"The HTTP server doesn't appear to support range requests. "
"Only reading this file from the beginning is supported. "
"Open with block_size=0 for a streaming file interface."
)
else:
# Response is not a range, but we want the start of the file,
# so we can read the required amount anyway.
cl = 0
out = []
for chunk in r.iter_content(2**20, False):
out.append(chunk)
cl += len(chunk)
out = b"".join(out)[: end - start]
return out
magic_check = re.compile("([*[])")
def has_magic(s):
match = magic_check.search(s)
return match is not None
class HTTPStreamFile(AbstractBufferedFile):
def __init__(self, fs, url, mode="rb", session=None, **kwargs):
self.url = url
self.session = session
if mode != "rb":
raise ValueError
self.details = {"name": url, "size": None}
super().__init__(fs=fs, path=url, mode=mode, cache_type="readahead", **kwargs)
r = self.session.get(self.fs.encode_url(url), stream=True, **kwargs)
self.fs._raise_not_found_for_status(r, url)
self.it = r.iter_content(1024, False)
self.leftover = b""
self.r = r
def seek(self, *args, **kwargs):
raise ValueError("Cannot seek streaming HTTP file")
def read(self, num=-1):
bufs = [self.leftover]
leng = len(self.leftover)
while leng < num or num < 0:
try:
out = self.it.__next__()
except StopIteration:
break
if out:
bufs.append(out)
else:
break
leng += len(out)
out = b"".join(bufs)
if num >= 0:
self.leftover = out[num:]
out = out[:num]
else:
self.leftover = b""
self.loc += len(out)
return out
def close(self):
self.r.close()
self.closed = True
def get_range(session, url, start, end, **kwargs):
# explicit get a range when we know it must be safe
kwargs = kwargs.copy()
headers = kwargs.pop("headers", {}).copy()
headers["Range"] = f"bytes={start}-{end - 1}"
r = session.get(url, headers=headers, **kwargs)
r.raise_for_status()
return r.content
def _file_info(url, session, size_policy="head", **kwargs):
"""Call HEAD on the server to get details about the file (size/checksum etc.)
Default operation is to explicitly allow redirects and use encoding
'identity' (no compression) to get the true size of the target.
"""
logger.debug("Retrieve file size for %s", url)
kwargs = kwargs.copy()
ar = kwargs.pop("allow_redirects", True)
head = kwargs.get("headers", {}).copy()
# TODO: not allowed in JS
# head["Accept-Encoding"] = "identity"
kwargs["headers"] = head
info = {}
if size_policy == "head":
r = session.head(url, allow_redirects=ar, **kwargs)
elif size_policy == "get":
r = session.get(url, allow_redirects=ar, **kwargs)
else:
raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
r.raise_for_status()
# TODO:
# recognise lack of 'Accept-Ranges',
# or 'Accept-Ranges': 'none' (not 'bytes')
# to mean streaming only, no random access => return None
if "Content-Length" in r.headers:
info["size"] = int(r.headers["Content-Length"])
elif "Content-Range" in r.headers:
info["size"] = int(r.headers["Content-Range"].split("/")[1])
elif "content-length" in r.headers:
info["size"] = int(r.headers["content-length"])
elif "content-range" in r.headers:
info["size"] = int(r.headers["content-range"].split("/")[1])
for checksum_field in ["ETag", "Content-MD5", "Digest"]:
if r.headers.get(checksum_field):
info[checksum_field] = r.headers[checksum_field]
return info
# importing this is enough to register it
def register():
register_implementation("http", HTTPFileSystem, clobber=True)
register_implementation("https", HTTPFileSystem, clobber=True)
register_implementation("sync-http", HTTPFileSystem, clobber=True)
register_implementation("sync-https", HTTPFileSystem, clobber=True)
register()
def unregister():
from fsspec.implementations.http import HTTPFileSystem
register_implementation("http", HTTPFileSystem, clobber=True)
register_implementation("https", HTTPFileSystem, clobber=True)

View File

@@ -0,0 +1,129 @@
import base64
import io
import re
import requests
import fsspec
class JupyterFileSystem(fsspec.AbstractFileSystem):
"""View of the files as seen by a Jupyter server (notebook or lab)"""
protocol = ("jupyter", "jlab")
def __init__(self, url, tok=None, **kwargs):
"""
Parameters
----------
url : str
Base URL of the server, like "http://127.0.0.1:8888". May include
token in the string, which is given by the process when starting up
tok : str
If the token is obtained separately, can be given here
kwargs
"""
if "?" in url:
if tok is None:
try:
tok = re.findall("token=([a-z0-9]+)", url)[0]
except IndexError as e:
raise ValueError("Could not determine token") from e
url = url.split("?", 1)[0]
self.url = url.rstrip("/") + "/api/contents"
self.session = requests.Session()
if tok:
self.session.headers["Authorization"] = f"token {tok}"
super().__init__(**kwargs)
def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
r = self.session.get(f"{self.url}/{path}")
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
out = r.json()
if out["type"] == "directory":
out = out["content"]
else:
out = [out]
for o in out:
o["name"] = o.pop("path")
o.pop("content")
if o["type"] == "notebook":
o["type"] = "file"
if detail:
return out
return [o["name"] for o in out]
def cat_file(self, path, start=None, end=None, **kwargs):
path = self._strip_protocol(path)
r = self.session.get(f"{self.url}/{path}")
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
out = r.json()
if out["format"] == "text":
# data should be binary
b = out["content"].encode()
else:
b = base64.b64decode(out["content"])
return b[start:end]
def pipe_file(self, path, value, **_):
path = self._strip_protocol(path)
json = {
"name": path.rsplit("/", 1)[-1],
"path": path,
"size": len(value),
"content": base64.b64encode(value).decode(),
"format": "base64",
"type": "file",
}
self.session.put(f"{self.url}/{path}", json=json)
def mkdir(self, path, create_parents=True, **kwargs):
path = self._strip_protocol(path)
if create_parents and "/" in path:
self.mkdir(path.rsplit("/", 1)[0], True)
json = {
"name": path.rsplit("/", 1)[-1],
"path": path,
"size": None,
"content": None,
"type": "directory",
}
self.session.put(f"{self.url}/{path}", json=json)
def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
if path1 == path2:
return
self.session.patch(f"{self.url}/{path1}", json={"path": path2})
def _rm(self, path):
path = self._strip_protocol(path)
self.session.delete(f"{self.url}/{path}")
def _open(self, path, mode="rb", **kwargs):
path = self._strip_protocol(path)
if mode == "rb":
data = self.cat_file(path)
return io.BytesIO(data)
else:
return SimpleFileWriter(self, path, mode="wb")
class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
def _upload_chunk(self, final=False):
"""Never uploads a chunk until file is done
Not suitable for large files
"""
if final is False:
return False
self.buffer.seek(0)
data = self.buffer.read()
self.fs.pipe_file(self.path, data)
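A hypothetical round trip against a running Jupyter server (URL and token are placeholders):

import fsspec

fs = fsspec.filesystem("jupyter", url="http://127.0.0.1:8888", tok="mytoken")
fs.mkdir("demo")                               # PUT to api/contents with type "directory"
with fs.open("demo/hello.txt", "wb") as f:     # buffered via SimpleFileWriter
    f.write(b"hello from fsspec")
print(fs.cat_file("demo/hello.txt"))           # content is base64-decoded from the API
print(fs.ls("demo", detail=False))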

View File

@@ -0,0 +1,213 @@
from contextlib import contextmanager
from ctypes import (
CFUNCTYPE,
POINTER,
c_int,
c_longlong,
c_void_p,
cast,
create_string_buffer,
)
import libarchive
import libarchive.ffi as ffi
from fsspec import open_files
from fsspec.archive import AbstractArchiveFileSystem
from fsspec.implementations.memory import MemoryFile
from fsspec.utils import DEFAULT_BLOCK_SIZE
# Libarchive requires seekable files or memory only for certain archive
# types. However, since we read the directory first to cache the contents
# and also allow random access to any file, the file-like object needs
# to be seekable no matter what.
# Seek call-backs (not provided in the libarchive python wrapper)
SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
read_set_seek_callback = ffi.ffi(
"read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
)
new_api = hasattr(ffi, "NO_OPEN_CB")
@contextmanager
def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
"""Read an archive from a seekable file-like object.
The `file` object must support the standard `readinto` and 'seek' methods.
"""
buf = create_string_buffer(block_size)
buf_p = cast(buf, c_void_p)
def read_func(archive_p, context, ptrptr):
# readinto the buffer, returns number of bytes read
length = file.readinto(buf)
# write the address of the buffer into the pointer
ptrptr = cast(ptrptr, POINTER(c_void_p))
ptrptr[0] = buf_p
# tell libarchive how much data was written into the buffer
return length
def seek_func(archive_p, context, offset, whence):
file.seek(offset, whence)
# tell libarchive the current position
return file.tell()
read_cb = ffi.READ_CALLBACK(read_func)
seek_cb = SEEK_CALLBACK(seek_func)
if new_api:
open_cb = ffi.NO_OPEN_CB
close_cb = ffi.NO_CLOSE_CB
else:
open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)
with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
read_set_seek_callback(archive_p, seek_cb)
ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
yield libarchive.read.ArchiveRead(archive_p)
class LibArchiveFileSystem(AbstractArchiveFileSystem):
"""Compressed archives as a file-system (read-only)
Supports the following formats:
tar, pax, cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar,
Microsoft CAB, 7-Zip, WARC
See the libarchive documentation for further restrictions.
https://www.libarchive.org/
Keeps the file object open while the instance lives. It only works with seekable
file-like objects. If the filesystem does not support this kind of file
object, it is recommended to cache the file locally.
This class is pickleable, but not necessarily thread-safe (depends on the
platform). See libarchive documentation for details.
"""
root_marker = ""
protocol = "libarchive"
cachable = False
def __init__(
self,
fo="",
mode="r",
target_protocol=None,
target_options=None,
block_size=DEFAULT_BLOCK_SIZE,
**kwargs,
):
"""
Parameters
----------
fo: str or file-like
Contains the archive, and must exist. If a str, will fetch the file using
:meth:`~fsspec.open_files`, which must return one file exactly.
mode: str
Currently, only 'r' is accepted
target_protocol: str (optional)
If ``fo`` is a string, this value can be used to override the
FS protocol inferred from a URL
target_options: dict (optional)
Kwargs passed when instantiating the target FS, if ``fo`` is
a string.
"""
super().__init__(self, **kwargs)
if mode != "r":
raise ValueError("Only read from archive files accepted")
if isinstance(fo, str):
files = open_files(fo, protocol=target_protocol, **(target_options or {}))
if len(files) != 1:
raise ValueError(
f'Path "{fo}" did not resolve to exactly one file: "{files}"'
)
fo = files[0]
self.of = fo
self.fo = fo.__enter__() # the whole instance is a context
self.block_size = block_size
self.dir_cache = None
@contextmanager
def _open_archive(self):
self.fo.seek(0)
with custom_reader(self.fo, block_size=self.block_size) as arc:
yield arc
@classmethod
def _strip_protocol(cls, path):
# file paths are always relative to the archive root
return super()._strip_protocol(path).lstrip("/")
def _get_dirs(self):
fields = {
"name": "pathname",
"size": "size",
"created": "ctime",
"mode": "mode",
"uid": "uid",
"gid": "gid",
"mtime": "mtime",
}
if self.dir_cache is not None:
return
self.dir_cache = {}
list_names = []
with self._open_archive() as arc:
for entry in arc:
if not entry.isdir and not entry.isfile:
# Skip symbolic links, fifo entries, etc.
continue
self.dir_cache.update(
{
dirname: {"name": dirname, "size": 0, "type": "directory"}
for dirname in self._all_dirnames(set(entry.name))
}
)
f = {key: getattr(entry, fields[key]) for key in fields}
f["type"] = "directory" if entry.isdir else "file"
list_names.append(entry.name)
self.dir_cache[f["name"]] = f
# libarchive does not seem to return an entry for the directories (at least
# not in all formats), so get the directories names from the files names
self.dir_cache.update(
{
dirname: {"name": dirname, "size": 0, "type": "directory"}
for dirname in self._all_dirnames(list_names)
}
)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
path = self._strip_protocol(path)
if mode != "rb":
raise NotImplementedError
data = bytes()
with self._open_archive() as arc:
for entry in arc:
if entry.pathname != path:
continue
if entry.size == 0:
# empty file, so there are no blocks
break
for block in entry.get_blocks(entry.size):
data = block
break
else:
raise ValueError
return MemoryFile(fs=self, path=path, data=data)
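A short read-only sketch, assuming libarchive-c is installed; the archive path and member name are placeholders:

import fsspec

fs = fsspec.filesystem("libarchive", fo="archive.7z")   # placeholder path
print(fs.ls("", detail=False))                          # listing built by _get_dirs()
with fs.open("inner/file.txt") as f:                    # member fully loaded into a MemoryFile
    print(f.read())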

View File

@@ -0,0 +1,514 @@
import datetime
import io
import logging
import os
import os.path as osp
import shutil
import stat
import tempfile
from functools import lru_cache
from fsspec import AbstractFileSystem
from fsspec.compression import compr
from fsspec.core import get_compression
from fsspec.utils import isfilelike, stringify_path
logger = logging.getLogger("fsspec.local")
class LocalFileSystem(AbstractFileSystem):
"""Interface to files on local storage
Parameters
----------
auto_mkdir: bool
Whether, when opening a file, the directory containing it should
be created (if it doesn't already exist). This is assumed by pyarrow
code.
"""
root_marker = "/"
protocol = "file", "local"
local_file = True
def __init__(self, auto_mkdir=False, **kwargs):
super().__init__(**kwargs)
self.auto_mkdir = auto_mkdir
@property
def fsid(self):
return "local"
def mkdir(self, path, create_parents=True, **kwargs):
path = self._strip_protocol(path)
if self.exists(path):
raise FileExistsError(path)
if create_parents:
self.makedirs(path, exist_ok=True)
else:
os.mkdir(path, **kwargs)
def makedirs(self, path, exist_ok=False):
path = self._strip_protocol(path)
os.makedirs(path, exist_ok=exist_ok)
def rmdir(self, path):
path = self._strip_protocol(path)
os.rmdir(path)
def ls(self, path, detail=False, **kwargs):
path = self._strip_protocol(path)
path_info = self.info(path)
infos = []
if path_info["type"] == "directory":
with os.scandir(path) as it:
for f in it:
try:
# Only get the info if requested since it is a bit expensive (the stat call inside)
# The strip_protocol is also used in info() and calls make_path_posix to always return posix paths
info = self.info(f) if detail else self._strip_protocol(f.path)
infos.append(info)
except FileNotFoundError:
pass
else:
infos = [path_info] if detail else [path_info["name"]]
return infos
def info(self, path, **kwargs):
if isinstance(path, os.DirEntry):
# scandir DirEntry
out = path.stat(follow_symlinks=False)
link = path.is_symlink()
if path.is_dir(follow_symlinks=False):
t = "directory"
elif path.is_file(follow_symlinks=False):
t = "file"
else:
t = "other"
size = out.st_size
if link:
try:
out2 = path.stat(follow_symlinks=True)
size = out2.st_size
except OSError:
size = 0
path = self._strip_protocol(path.path)
else:
# str or path-like
path = self._strip_protocol(path)
out = os.stat(path, follow_symlinks=False)
link = stat.S_ISLNK(out.st_mode)
if link:
out = os.stat(path, follow_symlinks=True)
size = out.st_size
if stat.S_ISDIR(out.st_mode):
t = "directory"
elif stat.S_ISREG(out.st_mode):
t = "file"
else:
t = "other"
# Check for the 'st_birthtime' attribute, which is not always present; fallback to st_ctime
created_time = getattr(out, "st_birthtime", out.st_ctime)
result = {
"name": path,
"size": size,
"type": t,
"created": created_time,
"islink": link,
}
for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
result[field] = getattr(out, f"st_{field}")
if link:
result["destination"] = os.readlink(path)
return result
def lexists(self, path, **kwargs):
return osp.lexists(path)
def cp_file(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
if self.auto_mkdir:
self.makedirs(self._parent(path2), exist_ok=True)
if self.isfile(path1):
shutil.copyfile(path1, path2)
elif self.isdir(path1):
self.mkdirs(path2, exist_ok=True)
else:
raise FileNotFoundError(path1)
def isfile(self, path):
path = self._strip_protocol(path)
return os.path.isfile(path)
def isdir(self, path):
path = self._strip_protocol(path)
return os.path.isdir(path)
def get_file(self, path1, path2, callback=None, **kwargs):
if isfilelike(path2):
with open(path1, "rb") as f:
shutil.copyfileobj(f, path2)
else:
return self.cp_file(path1, path2, **kwargs)
def put_file(self, path1, path2, callback=None, **kwargs):
return self.cp_file(path1, path2, **kwargs)
def mv(self, path1, path2, recursive: bool = True, **kwargs):
"""Move files/directories
For the specific case of local, all ops on directories are recursive and
the recursive= kwarg is ignored.
"""
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
shutil.move(path1, path2)
def link(self, src, dst, **kwargs):
src = self._strip_protocol(src)
dst = self._strip_protocol(dst)
os.link(src, dst, **kwargs)
def symlink(self, src, dst, **kwargs):
src = self._strip_protocol(src)
dst = self._strip_protocol(dst)
os.symlink(src, dst, **kwargs)
def islink(self, path) -> bool:
return os.path.islink(self._strip_protocol(path))
def rm_file(self, path):
os.remove(self._strip_protocol(path))
def rm(self, path, recursive=False, maxdepth=None):
if not isinstance(path, list):
path = [path]
for p in path:
p = self._strip_protocol(p)
if self.isdir(p):
if not recursive:
raise ValueError("Cannot delete directory, set recursive=True")
if osp.abspath(p) == os.getcwd():
raise ValueError("Cannot delete current working directory")
shutil.rmtree(p)
else:
os.remove(p)
def unstrip_protocol(self, name):
name = self._strip_protocol(name) # normalise for local/win/...
return f"file://{name}"
def _open(self, path, mode="rb", block_size=None, **kwargs):
path = self._strip_protocol(path)
if self.auto_mkdir and "w" in mode:
self.makedirs(self._parent(path), exist_ok=True)
return LocalFileOpener(path, mode, fs=self, **kwargs)
def touch(self, path, truncate=True, **kwargs):
path = self._strip_protocol(path)
if self.auto_mkdir:
self.makedirs(self._parent(path), exist_ok=True)
if self.exists(path):
os.utime(path, None)
else:
open(path, "a").close()
if truncate:
os.truncate(path, 0)
def created(self, path):
info = self.info(path=path)
return datetime.datetime.fromtimestamp(
info["created"], tz=datetime.timezone.utc
)
def modified(self, path):
info = self.info(path=path)
return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
@classmethod
def _parent(cls, path):
path = cls._strip_protocol(path)
if os.sep == "/":
# posix native
return path.rsplit("/", 1)[0] or "/"
else:
# NT
path_ = path.rsplit("/", 1)[0]
if len(path_) <= 3:
if path_[1:2] == ":":
# nt root (something like c:/)
return path_[0] + ":/"
# More cases may be required here
return path_
@classmethod
def _strip_protocol(cls, path):
path = stringify_path(path)
if path.startswith("file://"):
path = path[7:]
elif path.startswith("file:"):
path = path[5:]
elif path.startswith("local://"):
path = path[8:]
elif path.startswith("local:"):
path = path[6:]
path = make_path_posix(path)
if os.sep != "/":
# This code-path is a stripped down version of
# > drive, path = ntpath.splitdrive(path)
if path[1:2] == ":":
# Absolute drive-letter path, e.g. X:\Windows
# Relative path with drive, e.g. X:Windows
drive, path = path[:2], path[2:]
elif path[:2] == "//":
# UNC drives, e.g. \\server\share or \\?\UNC\server\share
# Device drives, e.g. \\.\device or \\?\device
if (index1 := path.find("/", 2)) == -1 or (
index2 := path.find("/", index1 + 1)
) == -1:
drive, path = path, ""
else:
drive, path = path[:index2], path[index2:]
else:
# Relative path, e.g. Windows
drive = ""
path = path.rstrip("/") or cls.root_marker
return drive + path
else:
return path.rstrip("/") or cls.root_marker
def _isfilestore(self):
# Inheriting from DaskFileSystem makes this False (S3, etc. were)
# the original motivation. But we are a posix-like file system.
# See https://github.com/dask/dask/issues/5526
return True
def chmod(self, path, mode):
path = stringify_path(path)
return os.chmod(path, mode)
def make_path_posix(path):
"""Make path generic and absolute for current OS"""
if not isinstance(path, str):
if isinstance(path, (list, set, tuple)):
return type(path)(make_path_posix(p) for p in path)
else:
path = stringify_path(path)
if not isinstance(path, str):
raise TypeError(f"could not convert {path!r} to string")
if os.sep == "/":
# Native posix
if path.startswith("/"):
# most common fast case for posix
return path
elif path.startswith("~"):
return osp.expanduser(path)
elif path.startswith("./"):
path = path[2:]
elif path == ".":
path = ""
return f"{os.getcwd()}/{path}"
else:
# NT handling
if path[0:1] == "/" and path[2:3] == ":":
# path is like "/c:/local/path"
path = path[1:]
if path[1:2] == ":":
# windows full path like "C:\\local\\path"
if len(path) <= 3:
# nt root (something like c:/)
return path[0] + ":/"
path = path.replace("\\", "/")
return path
elif path[0:1] == "~":
return make_path_posix(osp.expanduser(path))
elif path.startswith(("\\\\", "//")):
# windows UNC/DFS-style paths
return "//" + path[2:].replace("\\", "/")
elif path.startswith(("\\", "/")):
# windows relative path with root
path = path.replace("\\", "/")
return f"{osp.splitdrive(os.getcwd())[0]}{path}"
else:
path = path.replace("\\", "/")
if path.startswith("./"):
path = path[2:]
elif path == ".":
path = ""
return f"{make_path_posix(os.getcwd())}/{path}"
def trailing_sep(path):
"""Return True if the path ends with a path separator.
A forward slash is always considered a path separator, even on Operating
Systems that normally use a backslash.
"""
# TODO: if all incoming paths were posix-compliant then separator would
# always be a forward slash, simplifying this function.
# See https://github.com/fsspec/filesystem_spec/pull/1250
return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
@lru_cache(maxsize=1)
def get_umask(mask: int = 0o666) -> int:
"""Get the current umask.
Follows https://stackoverflow.com/a/44130549 to get the umask.
Temporarily sets the umask to the given value, and then resets it to the
original value.
"""
value = os.umask(mask)
os.umask(value)
return value
class LocalFileOpener(io.IOBase):
def __init__(
self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
):
logger.debug("open file: %s", path)
self.path = path
self.mode = mode
self.fs = fs
self.f = None
self.autocommit = autocommit
self.compression = get_compression(path, compression)
self.blocksize = io.DEFAULT_BUFFER_SIZE
self._open()
def _open(self):
if self.f is None or self.f.closed:
if self.autocommit or "w" not in self.mode:
self.f = open(self.path, mode=self.mode)
if self.compression:
compress = compr[self.compression]
self.f = compress(self.f, mode=self.mode)
else:
# TODO: check if path is writable?
i, name = tempfile.mkstemp()
os.close(i) # we want normal open and normal buffered file
self.temp = name
self.f = open(name, mode=self.mode)
if "w" not in self.mode:
self.size = self.f.seek(0, 2)
self.f.seek(0)
self.f.size = self.size
def _fetch_range(self, start, end):
# probably only used by cached FS
if "r" not in self.mode:
raise ValueError
self._open()
self.f.seek(start)
return self.f.read(end - start)
def __setstate__(self, state):
self.f = None
loc = state.pop("loc", None)
self.__dict__.update(state)
if "r" in state["mode"]:
self.f = None
self._open()
self.f.seek(loc)
def __getstate__(self):
d = self.__dict__.copy()
d.pop("f")
if "r" in self.mode:
d["loc"] = self.f.tell()
else:
if not self.f.closed:
raise ValueError("Cannot serialise open write-mode local file")
return d
def commit(self):
if self.autocommit:
raise RuntimeError("Can only commit if not already set to autocommit")
try:
shutil.move(self.temp, self.path)
except PermissionError as e:
# shutil.move raises PermissionError if os.rename
# and the default copy2 fallback with shutil.copystats fail.
# The file should be there nonetheless, but without copied permissions.
# If it doesn't exist, there was no permission to create the file.
if not os.path.exists(self.path):
raise e
else:
# If PermissionError is not raised, permissions can be set.
try:
mask = 0o666
os.chmod(self.path, mask & ~get_umask(mask))
except RuntimeError:
pass
def discard(self):
if self.autocommit:
raise RuntimeError("Cannot discard if set to autocommit")
os.remove(self.temp)
def readable(self) -> bool:
return True
def writable(self) -> bool:
return "r" not in self.mode
def read(self, *args, **kwargs):
return self.f.read(*args, **kwargs)
def write(self, *args, **kwargs):
return self.f.write(*args, **kwargs)
def tell(self, *args, **kwargs):
return self.f.tell(*args, **kwargs)
def seek(self, *args, **kwargs):
return self.f.seek(*args, **kwargs)
def seekable(self, *args, **kwargs):
return self.f.seekable(*args, **kwargs)
def readline(self, *args, **kwargs):
return self.f.readline(*args, **kwargs)
def readlines(self, *args, **kwargs):
return self.f.readlines(*args, **kwargs)
def close(self):
return self.f.close()
def truncate(self, size=None) -> int:
return self.f.truncate(size)
@property
def closed(self):
return self.f.closed
def fileno(self):
return self.raw.fileno()
def flush(self) -> None:
self.f.flush()
def __iter__(self):
return self.f.__iter__()
def __getattr__(self, item):
return getattr(self.f, item)
def __enter__(self):
self._incontext = True
return self
def __exit__(self, exc_type, exc_value, traceback):
self._incontext = False
self.f.__exit__(exc_type, exc_value, traceback)
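A brief sketch of the local filesystem defined above (paths are placeholders):

import fsspec

fs = fsspec.filesystem("file", auto_mkdir=True)          # LocalFileSystem
with fs.open("/tmp/fsspec-demo/out.txt", "wb") as f:     # parent directory auto-created
    f.write(b"hello")
print(fs.info("/tmp/fsspec-demo/out.txt")["size"])       # os.stat-backed info()
print(fs.ls("/tmp/fsspec-demo", detail=False))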

View File

@@ -0,0 +1,311 @@
from __future__ import annotations
import logging
from datetime import datetime, timezone
from errno import ENOTEMPTY
from io import BytesIO
from pathlib import PurePath, PureWindowsPath
from typing import Any, ClassVar
from fsspec import AbstractFileSystem
from fsspec.implementations.local import LocalFileSystem
from fsspec.utils import stringify_path
logger = logging.getLogger("fsspec.memoryfs")
class MemoryFileSystem(AbstractFileSystem):
"""A filesystem based on a dict of BytesIO objects
This is a global filesystem so instances of this class all point to the same
in memory filesystem.
"""
store: ClassVar[dict[str, Any]] = {} # global, do not overwrite!
pseudo_dirs = [""] # global, do not overwrite!
protocol = "memory"
root_marker = "/"
@classmethod
def _strip_protocol(cls, path):
if isinstance(path, PurePath):
if isinstance(path, PureWindowsPath):
return LocalFileSystem._strip_protocol(path)
else:
path = stringify_path(path)
path = path.removeprefix("memory://")
if "::" in path or "://" in path:
return path.rstrip("/")
path = path.lstrip("/").rstrip("/")
return "/" + path if path else ""
def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
if path in self.store:
# there is a key with this exact name
if not detail:
return [path]
return [
{
"name": path,
"size": self.store[path].size,
"type": "file",
"created": self.store[path].created.timestamp(),
}
]
paths = set()
starter = path + "/"
out = []
for p2 in tuple(self.store):
if p2.startswith(starter):
if "/" not in p2[len(starter) :]:
# exact child
out.append(
{
"name": p2,
"size": self.store[p2].size,
"type": "file",
"created": self.store[p2].created.timestamp(),
}
)
elif len(p2) > len(starter):
# implied child directory
ppath = starter + p2[len(starter) :].split("/", 1)[0]
if ppath not in paths:
out = out or []
out.append(
{
"name": ppath,
"size": 0,
"type": "directory",
}
)
paths.add(ppath)
for p2 in self.pseudo_dirs:
if p2.startswith(starter):
if "/" not in p2[len(starter) :]:
# exact child pdir
if p2 not in paths:
out.append({"name": p2, "size": 0, "type": "directory"})
paths.add(p2)
else:
# directory implied by deeper pdir
ppath = starter + p2[len(starter) :].split("/", 1)[0]
if ppath not in paths:
out.append({"name": ppath, "size": 0, "type": "directory"})
paths.add(ppath)
if not out:
if path in self.pseudo_dirs:
# empty dir
return []
raise FileNotFoundError(path)
if detail:
return out
return sorted([f["name"] for f in out])
def mkdir(self, path, create_parents=True, **kwargs):
path = self._strip_protocol(path)
if path in self.store or path in self.pseudo_dirs:
raise FileExistsError(path)
if self._parent(path).strip("/") and self.isfile(self._parent(path)):
raise NotADirectoryError(self._parent(path))
if create_parents and self._parent(path).strip("/"):
try:
self.mkdir(self._parent(path), create_parents, **kwargs)
except FileExistsError:
pass
if path and path not in self.pseudo_dirs:
self.pseudo_dirs.append(path)
def makedirs(self, path, exist_ok=False):
try:
self.mkdir(path, create_parents=True)
except FileExistsError:
if not exist_ok:
raise
def pipe_file(self, path, value, mode="overwrite", **kwargs):
"""Set the bytes of given file
Avoids copies of the data if possible
"""
mode = "xb" if mode == "create" else "wb"
self.open(path, mode=mode, data=value)
def rmdir(self, path):
path = self._strip_protocol(path)
if path == "":
# silently avoid deleting FS root
return
if path in self.pseudo_dirs:
if not self.ls(path):
self.pseudo_dirs.remove(path)
else:
raise OSError(ENOTEMPTY, "Directory not empty", path)
else:
raise FileNotFoundError(path)
def info(self, path, **kwargs):
logger.debug("info: %s", path)
path = self._strip_protocol(path)
if path in self.pseudo_dirs or any(
p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs
):
return {
"name": path,
"size": 0,
"type": "directory",
}
elif path in self.store:
filelike = self.store[path]
return {
"name": path,
"size": filelike.size,
"type": "file",
"created": getattr(filelike, "created", None),
}
else:
raise FileNotFoundError(path)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
path = self._strip_protocol(path)
if "x" in mode and self.exists(path):
raise FileExistsError
if path in self.pseudo_dirs:
raise IsADirectoryError(path)
parent = path
while len(parent) > 1:
parent = self._parent(parent)
if self.isfile(parent):
raise FileExistsError(parent)
if mode in ["rb", "ab", "r+b", "a+b"]:
if path in self.store:
f = self.store[path]
if "a" in mode:
# position at the end of file
f.seek(0, 2)
else:
# position at the beginning of file
f.seek(0)
return f
else:
raise FileNotFoundError(path)
elif mode in {"wb", "w+b", "xb", "x+b"}:
if "x" in mode and self.exists(path):
raise FileExistsError
m = MemoryFile(self, path, kwargs.get("data"))
if not self._intrans:
m.commit()
return m
else:
name = self.__class__.__name__
raise ValueError(f"unsupported file mode for {name}: {mode!r}")
def cp_file(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
if self.isfile(path1):
self.store[path2] = MemoryFile(
self, path2, self.store[path1].getvalue()
) # implicit copy
elif self.isdir(path1):
if path2 not in self.pseudo_dirs:
self.pseudo_dirs.append(path2)
else:
raise FileNotFoundError(path1)
def cat_file(self, path, start=None, end=None, **kwargs):
logger.debug("cat: %s", path)
path = self._strip_protocol(path)
try:
return bytes(self.store[path].getbuffer()[start:end])
except KeyError as e:
raise FileNotFoundError(path) from e
def _rm(self, path):
path = self._strip_protocol(path)
try:
del self.store[path]
except KeyError as e:
raise FileNotFoundError(path) from e
def modified(self, path):
path = self._strip_protocol(path)
try:
return self.store[path].modified
except KeyError as e:
raise FileNotFoundError(path) from e
def created(self, path):
path = self._strip_protocol(path)
try:
return self.store[path].created
except KeyError as e:
raise FileNotFoundError(path) from e
def isfile(self, path):
path = self._strip_protocol(path)
return path in self.store
def rm(self, path, recursive=False, maxdepth=None):
if isinstance(path, str):
path = self._strip_protocol(path)
else:
path = [self._strip_protocol(p) for p in path]
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
for p in reversed(paths):
if self.isfile(p):
self.rm_file(p)
# If the expanded path doesn't exist, it is only because the expanded
# path was a directory that does not exist in self.pseudo_dirs. This
# is possible if you directly create files without making the
# directories first.
elif not self.exists(p):
continue
else:
self.rmdir(p)
class MemoryFile(BytesIO):
"""A BytesIO which can't close and works as a context manager
Can initialise with data. Each path should only be active once at any moment.
No need to provide fs, path if auto-committing (default)
"""
def __init__(self, fs=None, path=None, data=None):
logger.debug("open file %s", path)
self.fs = fs
self.path = path
self.created = datetime.now(tz=timezone.utc)
self.modified = datetime.now(tz=timezone.utc)
if data:
super().__init__(data)
self.seek(0)
@property
def size(self):
return self.getbuffer().nbytes
def __enter__(self):
return self
def close(self):
pass
def discard(self):
pass
def commit(self):
self.fs.store[self.path] = self
self.modified = datetime.now(tz=timezone.utc)
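Because store is a class-level dict, all MemoryFileSystem instances share the same contents; a minimal sketch:

import fsspec

fs = fsspec.filesystem("memory")
fs.pipe_file("/bucket/a.txt", b"alpha")        # written into the shared class-level store
fs2 = fsspec.filesystem("memory")
print(fs2.cat_file("/bucket/a.txt"))           # b"alpha": same global store
print(fs.ls("/bucket", detail=False))
fs.rm("/bucket", recursive=True)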

View File

@@ -0,0 +1,187 @@
import datetime
import logging
import os
import types
import uuid
from stat import S_ISDIR, S_ISLNK
import paramiko
from .. import AbstractFileSystem
from ..utils import infer_storage_options
logger = logging.getLogger("fsspec.sftp")
class SFTPFileSystem(AbstractFileSystem):
"""Files over SFTP/SSH
Peer-to-peer filesystem over SSH using paramiko.
Note: if using this with the ``open`` or ``open_files``, with full URLs,
there is no way to tell if a path is relative, so all paths are assumed
to be absolute.
"""
protocol = "sftp", "ssh"
def __init__(self, host, **ssh_kwargs):
"""
Parameters
----------
host: str
Hostname or IP as a string
temppath: str
Location on the server to put files, when within a transaction
ssh_kwargs: dict
Parameters passed on to connection. See details in
https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
May include port, username, password...
"""
if self._cached:
return
super().__init__(**ssh_kwargs)
self.temppath = ssh_kwargs.pop("temppath", "/tmp") # remote temp directory
self.host = host
self.ssh_kwargs = ssh_kwargs
self._connect()
def _connect(self):
logger.debug("Connecting to SFTP server %s", self.host)
self.client = paramiko.SSHClient()
self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
self.client.connect(self.host, **self.ssh_kwargs)
self.ftp = self.client.open_sftp()
@classmethod
def _strip_protocol(cls, path):
return infer_storage_options(path)["path"]
@staticmethod
def _get_kwargs_from_urls(urlpath):
out = infer_storage_options(urlpath)
out.pop("path", None)
out.pop("protocol", None)
return out
def mkdir(self, path, create_parents=True, mode=511):
path = self._strip_protocol(path)
logger.debug("Creating folder %s", path)
if self.exists(path):
raise FileExistsError(f"File exists: {path}")
if create_parents:
self.makedirs(path)
else:
self.ftp.mkdir(path, mode)
def makedirs(self, path, exist_ok=False, mode=511):
if self.exists(path) and not exist_ok:
raise FileExistsError(f"File exists: {path}")
parts = path.split("/")
new_path = "/" if path[:1] == "/" else ""
for part in parts:
if part:
new_path = f"{new_path}/{part}" if new_path else part
if not self.exists(new_path):
self.ftp.mkdir(new_path, mode)
def rmdir(self, path):
path = self._strip_protocol(path)
logger.debug("Removing folder %s", path)
self.ftp.rmdir(path)
def info(self, path):
path = self._strip_protocol(path)
stat = self._decode_stat(self.ftp.stat(path))
stat["name"] = path
return stat
@staticmethod
def _decode_stat(stat, parent_path=None):
if S_ISDIR(stat.st_mode):
t = "directory"
elif S_ISLNK(stat.st_mode):
t = "link"
else:
t = "file"
out = {
"name": "",
"size": stat.st_size,
"type": t,
"uid": stat.st_uid,
"gid": stat.st_gid,
"time": datetime.datetime.fromtimestamp(
stat.st_atime, tz=datetime.timezone.utc
),
"mtime": datetime.datetime.fromtimestamp(
stat.st_mtime, tz=datetime.timezone.utc
),
}
if parent_path:
out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
return out
def ls(self, path, detail=False):
path = self._strip_protocol(path)
logger.debug("Listing folder %s", path)
stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
if detail:
return stats
else:
paths = [stat["name"] for stat in stats]
return sorted(paths)
def put(self, lpath, rpath, callback=None, **kwargs):
rpath = self._strip_protocol(rpath)
logger.debug("Put file %s into %s", lpath, rpath)
self.ftp.put(lpath, rpath)
def get_file(self, rpath, lpath, **kwargs):
if self.isdir(rpath):
os.makedirs(lpath, exist_ok=True)
else:
self.ftp.get(self._strip_protocol(rpath), lpath)
def _open(self, path, mode="rb", block_size=None, **kwargs):
"""
block_size: int or None
If 0, no buffering, if 1, line buffering, if >1, buffer that many
bytes, if None use default from paramiko.
"""
logger.debug("Opening file %s", path)
if kwargs.get("autocommit", True) is False:
# writes to temporary file, move on commit
path2 = "/".join([self.temppath, str(uuid.uuid4())])
f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
f.temppath = path2
f.targetpath = path
f.fs = self
f.commit = types.MethodType(commit_a_file, f)
f.discard = types.MethodType(discard_a_file, f)
else:
f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
return f
def _rm(self, path):
if self.isdir(path):
self.ftp.rmdir(path)
else:
self.ftp.remove(path)
def mv(self, old, new):
new = self._strip_protocol(new)
old = self._strip_protocol(old)
logger.debug("Renaming %s into %s", old, new)
self.ftp.posix_rename(old, new)
def commit_a_file(self):
self.fs.mv(self.temppath, self.targetpath)
def discard_a_file(self):
self.fs._rm(self.temppath)
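A hypothetical connection sketch (host, credentials and paths are placeholders; paramiko must be installed):

import fsspec

fs = fsspec.filesystem("sftp", host="example.com", username="user", password="secret")
print(fs.ls("/home/user", detail=False))
fs.put("local.txt", "/home/user/remote.txt")   # thin wrapper around paramiko's SFTP put
with fs.open("/home/user/remote.txt", "rb") as f:
    print(f.read())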

View File

@@ -0,0 +1,416 @@
"""
This module contains SMBFileSystem class responsible for handling access to
Windows Samba network shares by using package smbprotocol
"""
import datetime
import re
import uuid
from stat import S_ISDIR, S_ISLNK
import smbclient
import smbprotocol.exceptions
from .. import AbstractFileSystem
from ..utils import infer_storage_options
# ! pylint: disable=bad-continuation
class SMBFileSystem(AbstractFileSystem):
"""Allow reading and writing to Windows and Samba network shares.
When using `fsspec.open()` for getting a file-like object the URI
should be specified as this format:
``smb://workgroup;user:password@server:port/share/folder/file.csv``.
Example::
>>> import fsspec
>>> with fsspec.open(
... 'smb://myuser:mypassword@myserver.com/' 'share/folder/file.csv'
... ) as smbfile:
... df = pd.read_csv(smbfile, sep='|', header=None)
Note that you need to pass in a valid hostname or IP address for the host
component of the URL. Do not use the Windows/NetBIOS machine name for the
host component.
The first component of the path in the URL points to the name of the shared
folder. Subsequent path components will point to the directory/folder/file.
The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be
optional.
.. note::
This implementation requires `smbprotocol`_ to be installed, e.g.::
$ pip install smbprotocol
# or
# pip install smbprotocol[kerberos]
.. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements
Note: if using this with the ``open`` or ``open_files``, with full URLs,
there is no way to tell if a path is relative, so all paths are assumed
to be absolute.
"""
protocol = "smb"
# pylint: disable=too-many-arguments
def __init__(
self,
host,
port=None,
username=None,
password=None,
timeout=60,
encrypt=None,
share_access=None,
register_session_retries=4,
register_session_retry_wait=1,
register_session_retry_factor=10,
auto_mkdir=False,
**kwargs,
):
"""
You can use _get_kwargs_from_urls to get some kwargs from
a reasonable SMB url.
Authentication will be anonymous or integrated if username/password are not
given.
Parameters
----------
host: str
The remote server name/ip to connect to
port: int or None
Port to connect with. Usually 445, sometimes 139.
username: str or None
Username to connect with. Required if Kerberos auth is not being used.
password: str or None
User's password on the server, if using username
timeout: int
Connection timeout in seconds
encrypt: bool
Whether to force encryption or not, once this has been set to True
the session cannot be changed back to False.
share_access: str or None
Specifies the default access applied to file open operations
performed with this file system object.
This affects whether other processes can concurrently open a handle
to the same file.
- None (the default): exclusively locks the file until closed.
- 'r': Allow other handles to be opened with read access.
- 'w': Allow other handles to be opened with write access.
- 'd': Allow other handles to be opened with delete access.
register_session_retries: int
Number of retries to register a session with the server. Retries are not performed
for authentication errors, as these indicate invalid credentials rather than
network issues. If set to a negative value, no registration attempts will be performed.
register_session_retry_wait: int
Time in seconds to wait between each retry. Number must be non-negative.
register_session_retry_factor: int
Base factor for the wait time between each retry. The wait time
is calculated using an exponential function. For factor=1 all wait times
will be equal to `register_session_retry_wait`. For any number of retries,
the last wait time will be equal to `register_session_retry_wait` and for retries>1
the first wait time will be equal to `register_session_retry_wait / factor`.
Number must be equal to or greater than 1. A factor of 10 is usually a good choice.
auto_mkdir: bool
Whether, when opening a file, the directory containing it should
be created (if it doesn't already exist). This is assumed by pyarrow
and zarr-python code.
"""
super().__init__(**kwargs)
self.host = host
self.port = port
self.username = username
self.password = password
self.timeout = timeout
self.encrypt = encrypt
self.temppath = kwargs.pop("temppath", "")
self.share_access = share_access
self.register_session_retries = register_session_retries
if register_session_retry_wait < 0:
raise ValueError(
"register_session_retry_wait must be a non-negative integer"
)
self.register_session_retry_wait = register_session_retry_wait
if register_session_retry_factor < 1:
raise ValueError(
"register_session_retry_factor must be a positive "
"integer equal to or greater than 1"
)
self.register_session_retry_factor = register_session_retry_factor
self.auto_mkdir = auto_mkdir
self._connect()
@property
def _port(self):
return 445 if self.port is None else self.port
def _connect(self):
import time
if self.register_session_retries <= -1:
return
retried_errors = []
wait_time = self.register_session_retry_wait
n_waits = (
self.register_session_retries - 1
) # -1 = No wait time after the last retry
factor = self.register_session_retry_factor
# Generate wait times for each retry attempt.
# Wait times are calculated using an exponential function. For factor=1 all wait
# times will be equal to `wait_time`. For any number of retries the last wait time
# will be equal to `wait_time` and for retries>1 the first wait time will be equal
# to `wait_time / factor`.
wait_times = iter(
factor ** (n / n_waits - 1) * wait_time for n in range(0, n_waits + 1)
)
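        # Worked example with the default parameters: register_session_retries=4,
        # register_session_retry_wait=1 and register_session_retry_factor=10 give
        # n_waits = 3, so the generated wait times are 10**(0/3 - 1), 10**(1/3 - 1),
        # 10**(2/3 - 1) and 10**(3/3 - 1) seconds, i.e. roughly 0.10, 0.22, 0.46
        # and 1.0 seconds between successive connection attempts.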
for attempt in range(self.register_session_retries + 1):
try:
smbclient.register_session(
self.host,
username=self.username,
password=self.password,
port=self._port,
encrypt=self.encrypt,
connection_timeout=self.timeout,
)
return
except (
smbprotocol.exceptions.SMBAuthenticationError,
smbprotocol.exceptions.LogonFailure,
):
# These exceptions should not be repeated, as they clearly indicate
# that the credentials are invalid and not a network issue.
raise
except ValueError as exc:
if re.findall(r"\[Errno -\d+]", str(exc)):
# This exception is raised by the smbprotocol.transport:Tcp.connect
# and originates from socket.gaierror (OSError). These exceptions might
# be raised due to network instability. We will retry to connect.
retried_errors.append(exc)
else:
# All other ValueError exceptions should be raised, as they are not
# related to network issues.
raise
except Exception as exc:
# Save the exception and retry the connection. This except clause might be dropped
# in the future, once all exceptions suited for retry are identified.
retried_errors.append(exc)
if attempt < self.register_session_retries:
time.sleep(next(wait_times))
# Raise last exception to inform user about the connection issues.
# Note: Should we use ExceptionGroup to raise all exceptions?
raise retried_errors[-1]
@classmethod
def _strip_protocol(cls, path):
return infer_storage_options(path)["path"]
@staticmethod
def _get_kwargs_from_urls(path):
# smb://workgroup;user:password@host:port/share/folder/file.csv
out = infer_storage_options(path)
out.pop("path", None)
out.pop("protocol", None)
return out
def mkdir(self, path, create_parents=True, **kwargs):
wpath = _as_unc_path(self.host, path)
if create_parents:
smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
else:
smbclient.mkdir(wpath, port=self._port, **kwargs)
def makedirs(self, path, exist_ok=False):
if _share_has_path(path):
wpath = _as_unc_path(self.host, path)
smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port)
def rmdir(self, path):
if _share_has_path(path):
wpath = _as_unc_path(self.host, path)
smbclient.rmdir(wpath, port=self._port)
def info(self, path, **kwargs):
wpath = _as_unc_path(self.host, path)
stats = smbclient.stat(wpath, port=self._port, **kwargs)
if S_ISDIR(stats.st_mode):
stype = "directory"
elif S_ISLNK(stats.st_mode):
stype = "link"
else:
stype = "file"
res = {
"name": path + "/" if stype == "directory" else path,
"size": stats.st_size,
"type": stype,
"uid": stats.st_uid,
"gid": stats.st_gid,
"time": stats.st_atime,
"mtime": stats.st_mtime,
}
return res
def created(self, path):
"""Return the created timestamp of a file as a datetime.datetime"""
wpath = _as_unc_path(self.host, path)
stats = smbclient.stat(wpath, port=self._port)
return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc)
def modified(self, path):
"""Return the modified timestamp of a file as a datetime.datetime"""
wpath = _as_unc_path(self.host, path)
stats = smbclient.stat(wpath, port=self._port)
return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc)
def ls(self, path, detail=True, **kwargs):
unc = _as_unc_path(self.host, path)
listed = smbclient.listdir(unc, port=self._port, **kwargs)
dirs = ["/".join([path.rstrip("/"), p]) for p in listed]
if detail:
dirs = [self.info(d) for d in dirs]
return dirs
# pylint: disable=too-many-arguments
def _open(
self,
path,
mode="rb",
block_size=-1,
autocommit=True,
cache_options=None,
**kwargs,
):
"""
block_size: int or None
If 0, no buffering; if 1, line buffering; if >1, use a buffer of roughly that many bytes
Notes
-----
By specifying 'share_access' in 'kwargs' it is possible to override the
default shared access setting applied in the constructor of this object.
"""
if self.auto_mkdir and "w" in mode:
self.makedirs(self._parent(path), exist_ok=True)
bls = block_size if block_size is not None and block_size >= 0 else -1
wpath = _as_unc_path(self.host, path)
share_access = kwargs.pop("share_access", self.share_access)
if "w" in mode and autocommit is False:
temp = _as_temp_path(self.host, path, self.temppath)
return SMBFileOpener(
wpath, temp, mode, port=self._port, block_size=bls, **kwargs
)
return smbclient.open_file(
wpath,
mode,
buffering=bls,
share_access=share_access,
port=self._port,
**kwargs,
)
def copy(self, path1, path2, **kwargs):
"""Copy within two locations in the same filesystem"""
wpath1 = _as_unc_path(self.host, path1)
wpath2 = _as_unc_path(self.host, path2)
if self.auto_mkdir:
self.makedirs(self._parent(path2), exist_ok=True)
smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs)
def _rm(self, path):
if _share_has_path(path):
wpath = _as_unc_path(self.host, path)
stats = smbclient.stat(wpath, port=self._port)
if S_ISDIR(stats.st_mode):
smbclient.rmdir(wpath, port=self._port)
else:
smbclient.remove(wpath, port=self._port)
def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
wpath1 = _as_unc_path(self.host, path1)
wpath2 = _as_unc_path(self.host, path2)
smbclient.rename(wpath1, wpath2, port=self._port, **kwargs)
def _as_unc_path(host, path):
rpath = path.replace("/", "\\")
unc = f"\\\\{host}{rpath}"
return unc
def _as_temp_path(host, path, temppath):
share = path.split("/")[1]
temp_file = f"/{share}{temppath}/{uuid.uuid4()}"
unc = _as_unc_path(host, temp_file)
return unc
def _share_has_path(path):
parts = path.count("/")
if path.endswith("/"):
return parts > 2
return parts > 1
class SMBFileOpener:
"""writes to remote temporary file, move on commit"""
def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs):
self.path = path
self.temp = temp
self.mode = mode
self.block_size = block_size
self.kwargs = kwargs
self.smbfile = None
self._incontext = False
self.port = port
self._open()
def _open(self):
if self.smbfile is None or self.smbfile.closed:
self.smbfile = smbclient.open_file(
self.temp,
self.mode,
port=self.port,
buffering=self.block_size,
**self.kwargs,
)
def commit(self):
"""Move temp file to definitive on success."""
# TODO: use transaction support in SMB protocol
smbclient.replace(self.temp, self.path, port=self.port)
def discard(self):
"""Remove the temp file on failure."""
smbclient.remove(self.temp, port=self.port)
def __fspath__(self):
return self.path
def __iter__(self):
return self.smbfile.__iter__()
def __getattr__(self, item):
return getattr(self.smbfile, item)
def __enter__(self):
self._incontext = True
return self.smbfile.__enter__()
def __exit__(self, exc_type, exc_value, traceback):
self._incontext = False
self.smbfile.__exit__(exc_type, exc_value, traceback)
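# A minimal usage sketch, assuming an SMB share named "share" on a reachable
# host "myserver.com" and placeholder credentials; paths are absolute and the
# first path component is the share name.
def _example_smb_usage():
    import fsspec

    fs = fsspec.filesystem(
        "smb", host="myserver.com", username="myuser", password="mypassword"
    )
    print(fs.ls("/share/folder"))  # list a folder on the share
    with fs.open("/share/folder/file.csv", "rb") as f:
        return f.readline()  # first line of the file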

View File

@@ -0,0 +1,124 @@
import logging
import tarfile
import fsspec
from fsspec.archive import AbstractArchiveFileSystem
from fsspec.compression import compr
from fsspec.utils import infer_compression
typemap = {b"0": "file", b"5": "directory"}
logger = logging.getLogger("tar")
class TarFileSystem(AbstractArchiveFileSystem):
"""Compressed Tar archives as a file-system (read-only)
Supports the following formats:
tar.gz, tar.bz2, tar.xz
"""
root_marker = ""
protocol = "tar"
cachable = False
def __init__(
self,
fo="",
index_store=None,
target_options=None,
target_protocol=None,
compression=None,
**kwargs,
):
super().__init__(**kwargs)
target_options = target_options or {}
if isinstance(fo, str):
self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
fo = self.of.open() # keep the reference
# Try to infer compression.
if compression is None:
name = None
# Try different ways to get hold of the filename. `fo` might either
# be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
# `fsspec.AbstractFileSystem` instance.
try:
# Amended io.BufferedReader or similar.
# This uses a "protocol extension" where original filenames are
# propagated to archive-like filesystems in order to let them
# infer the right compression appropriately.
if hasattr(fo, "original"):
name = fo.original
# fsspec.LocalFileOpener
elif hasattr(fo, "path"):
name = fo.path
# io.BufferedReader
elif hasattr(fo, "name"):
name = fo.name
# fsspec.AbstractFileSystem
elif hasattr(fo, "info"):
name = fo.info()["name"]
except Exception as ex:
logger.warning(
f"Unable to determine file name, not inferring compression: {ex}"
)
if name is not None:
compression = infer_compression(name)
logger.info(f"Inferred compression {compression} from file name {name}")
if compression is not None:
# TODO: tarfile already implements compression with modes like "r:gz",
# but would seeking to an offset in the file still work then?
fo = compr[compression](fo)
self._fo_ref = fo
self.fo = fo # the whole instance is a context
self.tar = tarfile.TarFile(fileobj=self.fo)
self.dir_cache = None
self.index_store = index_store
self.index = None
self._index()
def _index(self):
# TODO: load and set saved index, if exists
out = {}
for ti in self.tar:
info = ti.get_info()
info["type"] = typemap.get(info["type"], "file")
name = ti.get_info()["name"].rstrip("/")
out[name] = (info, ti.offset_data)
self.index = out
# TODO: save index to self.index_store here, if set
def _get_dirs(self):
if self.dir_cache is not None:
return
# This enables ls to get directories as children as well as files
self.dir_cache = {
dirname: {"name": dirname, "size": 0, "type": "directory"}
for dirname in self._all_dirnames(self.tar.getnames())
}
for member in self.tar.getmembers():
info = member.get_info()
info["name"] = info["name"].rstrip("/")
info["type"] = typemap.get(info["type"], "file")
self.dir_cache[info["name"]] = info
def _open(self, path, mode="rb", **kwargs):
if mode != "rb":
raise ValueError("Read-only filesystem implementation")
details, offset = self.index[path]
if details["type"] != "file":
raise ValueError("Can only handle regular files")
return self.tar.extractfile(path)
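# A minimal usage sketch, assuming a local archive "archive.tar.gz"
# (placeholder path); compression is inferred from the file name and the
# members are exposed read-only.
def _example_tar_usage():
    import fsspec

    fs = fsspec.filesystem("tar", fo="archive.tar.gz")
    print(fs.ls("/"))  # top-level members and directories
    with fs.open("data/part-0.csv", "rb") as f:  # placeholder member name
        return f.read()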

View File

@@ -0,0 +1,485 @@
# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
import logging
import os
import secrets
import shutil
import tempfile
import uuid
from contextlib import suppress
from urllib.parse import quote
import requests
from ..spec import AbstractBufferedFile, AbstractFileSystem
from ..utils import infer_storage_options, tokenize
logger = logging.getLogger("webhdfs")
class WebHDFS(AbstractFileSystem):
"""
Interface to HDFS over HTTP using the WebHDFS API. Also supports HttpFS gateways.
Four auth mechanisms are supported:
insecure: no auth is done, and the user is assumed to be whoever they
say they are (parameter ``user``), or a predefined value such as
"dr.who" if not given
spnego: when kerberos authentication is enabled, auth is negotiated by
requests_kerberos https://github.com/requests/requests-kerberos .
This establishes a session based on existing kinit login and/or
specified principal/password; parameters are passed with ``kerb_kwargs``
token: uses an existing Hadoop delegation token from another secured
service. This client can also generate such tokens when not running in
insecure mode. Note that tokens expire, but can be renewed (by a
previously specified user) and may allow for proxying.
basic-auth: used when both parameter ``user`` and parameter ``password``
are provided.
"""
tempdir = str(tempfile.gettempdir())
protocol = "webhdfs", "webHDFS"
def __init__(
self,
host,
port=50070,
kerberos=False,
token=None,
user=None,
password=None,
proxy_to=None,
kerb_kwargs=None,
data_proxy=None,
use_https=False,
session_cert=None,
session_verify=True,
**kwargs,
):
"""
Parameters
----------
host: str
Name-node address
port: int
Port for webHDFS
kerberos: bool
Whether to authenticate with kerberos for this connection
token: str or None
If given, use this token on every call to authenticate. A user
and user-proxy may be encoded in the token and should not be also
given
user: str or None
If given, assert the user name to connect with
password: str or None
If given, assert the password to use for basic auth. If password
is provided, user must be provided also
proxy_to: str or None
If given, the user has the authority to proxy, and this value is
the user in whose name actions are taken
kerb_kwargs: dict
Any extra arguments for HTTPKerberosAuth, see
`<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
data_proxy: dict, callable or None
If given, map data-node addresses. This can be necessary if the
HDFS cluster is behind a proxy, running on Docker or otherwise has
a mismatch between the host-names given by the name-node and the
address by which to refer to them from the client. If a dict,
maps host names ``host->data_proxy[host]``; if a callable, full
URLs are passed, and function must conform to
``url->data_proxy(url)``.
use_https: bool
Whether to connect to the Name-node using HTTPS instead of HTTP
session_cert: str or Tuple[str, str] or None
Path to a certificate file, or tuple of (cert, key) files to use
for the requests.Session
session_verify: str, bool or None
Path to a certificate file to use for verifying the requests.Session.
kwargs
"""
if self._cached:
return
super().__init__(**kwargs)
self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
self.kerb = kerberos
self.kerb_kwargs = kerb_kwargs or {}
self.pars = {}
self.proxy = data_proxy or {}
if token is not None:
if user is not None or proxy_to is not None:
raise ValueError(
"If passing a delegation token, must not set "
"user or proxy_to, as these are encoded in the"
" token"
)
self.pars["delegation"] = token
self.user = user
self.password = password
if password is not None:
if user is None:
raise ValueError(
"If passing a password, the user must also be"
"set in order to set up the basic-auth"
)
else:
if user is not None:
self.pars["user.name"] = user
if proxy_to is not None:
self.pars["doas"] = proxy_to
if kerberos and user is not None:
raise ValueError(
"If using Kerberos auth, do not specify the "
"user, this is handled by kinit."
)
self.session_cert = session_cert
self.session_verify = session_verify
self._connect()
self._fsid = f"webhdfs_{tokenize(host, port)}"
@property
def fsid(self):
return self._fsid
def _connect(self):
self.session = requests.Session()
if self.session_cert:
self.session.cert = self.session_cert
self.session.verify = self.session_verify
if self.kerb:
from requests_kerberos import HTTPKerberosAuth
self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
if self.user is not None and self.password is not None:
from requests.auth import HTTPBasicAuth
self.session.auth = HTTPBasicAuth(self.user, self.password)
def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
path = self._strip_protocol(path) if path is not None else ""
url = self._apply_proxy(self.url + quote(path, safe="/="))
args = kwargs.copy()
args.update(self.pars)
args["op"] = op.upper()
logger.debug("sending %s with %s", url, method)
out = self.session.request(
method=method.upper(),
url=url,
params=args,
data=data,
allow_redirects=redirect,
)
if out.status_code in [400, 401, 403, 404, 500]:
try:
err = out.json()
msg = err["RemoteException"]["message"]
exp = err["RemoteException"]["exception"]
except (ValueError, KeyError):
pass
else:
if exp in ["IllegalArgumentException", "UnsupportedOperationException"]:
raise ValueError(msg)
elif exp in ["SecurityException", "AccessControlException"]:
raise PermissionError(msg)
elif exp in ["FileNotFoundException"]:
raise FileNotFoundError(msg)
else:
raise RuntimeError(msg)
out.raise_for_status()
return out
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
replication=None,
permissions=None,
**kwargs,
):
"""
Parameters
----------
path: str
File location
mode: str
'rb', 'wb', etc.
block_size: int
Client buffer size for read-ahead or write buffer
autocommit: bool
If False, writes to temporary file that only gets put in final
location upon commit
replication: int
Number of copies of file on the cluster, write mode only
permissions: str or int
posix permissions, write mode only
kwargs
Returns
-------
WebHDFile instance
"""
block_size = block_size or self.blocksize
return WebHDFile(
self,
path,
mode=mode,
block_size=block_size,
tempdir=self.tempdir,
autocommit=autocommit,
replication=replication,
permissions=permissions,
)
@staticmethod
def _process_info(info):
info["type"] = info["type"].lower()
info["size"] = info["length"]
return info
@classmethod
def _strip_protocol(cls, path):
return infer_storage_options(path)["path"]
@staticmethod
def _get_kwargs_from_urls(urlpath):
out = infer_storage_options(urlpath)
out.pop("path", None)
out.pop("protocol", None)
if "username" in out:
out["user"] = out.pop("username")
return out
def info(self, path):
out = self._call("GETFILESTATUS", path=path)
info = out.json()["FileStatus"]
info["name"] = path
return self._process_info(info)
def ls(self, path, detail=False, **kwargs):
out = self._call("LISTSTATUS", path=path)
infos = out.json()["FileStatuses"]["FileStatus"]
for info in infos:
self._process_info(info)
info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
if detail:
return sorted(infos, key=lambda i: i["name"])
else:
return sorted(info["name"] for info in infos)
def content_summary(self, path):
"""Total numbers of files, directories and bytes under path"""
out = self._call("GETCONTENTSUMMARY", path=path)
return out.json()["ContentSummary"]
def ukey(self, path):
"""Checksum info of file, giving method and result"""
out = self._call("GETFILECHECKSUM", path=path, redirect=False)
if "Location" in out.headers:
location = self._apply_proxy(out.headers["Location"])
out2 = self.session.get(location)
out2.raise_for_status()
return out2.json()["FileChecksum"]
else:
out.raise_for_status()
return out.json()["FileChecksum"]
def home_directory(self):
"""Get user's home directory"""
out = self._call("GETHOMEDIRECTORY")
return out.json()["Path"]
def get_delegation_token(self, renewer=None):
"""Retrieve token which can give the same authority to other uses
Parameters
----------
renewer: str or None
User who may use this token; if None, will be current user
"""
if renewer:
out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
else:
out = self._call("GETDELEGATIONTOKEN")
t = out.json()["Token"]
if t is None:
raise ValueError("No token available for this user/security context")
return t["urlString"]
def renew_delegation_token(self, token):
"""Make token live longer. Returns new expiry time"""
out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
return out.json()["long"]
def cancel_delegation_token(self, token):
"""Stop the token from being useful"""
self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
def chmod(self, path, mod):
"""Set the permission at path
Parameters
----------
path: str
location to set (file or directory)
mod: str or int
posix representation of the permission, given as an octal string, e.g. '777',
or as an int, e.g. 0o777
"""
self._call("SETPERMISSION", method="put", path=path, permission=mod)
def chown(self, path, owner=None, group=None):
"""Change owning user and/or group"""
kwargs = {}
if owner is not None:
kwargs["owner"] = owner
if group is not None:
kwargs["group"] = group
self._call("SETOWNER", method="put", path=path, **kwargs)
def set_replication(self, path, replication):
"""
Set file replication factor
Parameters
----------
path: str
File location (not for directories)
replication: int
Number of copies of file on the cluster. Should be smaller than
number of data nodes; normally 3 on most systems.
"""
self._call("SETREPLICATION", path=path, method="put", replication=replication)
def mkdir(self, path, **kwargs):
self._call("MKDIRS", method="put", path=path)
def makedirs(self, path, exist_ok=False):
if exist_ok is False and self.exists(path):
raise FileExistsError(path)
self.mkdir(path)
def mv(self, path1, path2, **kwargs):
self._call("RENAME", method="put", path=path1, destination=path2)
def rm(self, path, recursive=False, **kwargs):
self._call(
"DELETE",
method="delete",
path=path,
recursive="true" if recursive else "false",
)
def rm_file(self, path, **kwargs):
self.rm(path)
def cp_file(self, lpath, rpath, **kwargs):
with self.open(lpath) as lstream:
tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
# Perform an atomic copy (stream to a temporary file and
# move it to the actual destination).
try:
with self.open(tmp_fname, "wb") as rstream:
shutil.copyfileobj(lstream, rstream)
self.mv(tmp_fname, rpath)
except BaseException:
with suppress(FileNotFoundError):
self.rm(tmp_fname)
raise
def _apply_proxy(self, location):
if self.proxy and callable(self.proxy):
location = self.proxy(location)
elif self.proxy:
# as a dict
for k, v in self.proxy.items():
location = location.replace(k, v, 1)
return location
class WebHDFile(AbstractBufferedFile):
"""A file living in HDFS over webHDFS"""
def __init__(self, fs, path, **kwargs):
super().__init__(fs, path, **kwargs)
kwargs = kwargs.copy()
if kwargs.get("permissions", None) is None:
kwargs.pop("permissions", None)
if kwargs.get("replication", None) is None:
kwargs.pop("replication", None)
self.permissions = kwargs.pop("permissions", 511)
tempdir = kwargs.pop("tempdir")
if kwargs.pop("autocommit", False) is False:
self.target = self.path
self.path = os.path.join(tempdir, str(uuid.uuid4()))
def _upload_chunk(self, final=False):
"""Write one part of a multi-block file upload
Parameters
==========
final: bool
This is the last block, so should complete file, if
self.autocommit is True.
"""
out = self.fs.session.post(
self.location,
data=self.buffer.getvalue(),
headers={"content-type": "application/octet-stream"},
)
out.raise_for_status()
return True
def _initiate_upload(self):
"""Create remote file/upload"""
kwargs = self.kwargs.copy()
if "a" in self.mode:
op, method = "APPEND", "POST"
else:
op, method = "CREATE", "PUT"
kwargs["overwrite"] = "true"
out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
location = self.fs._apply_proxy(out.headers["Location"])
if "w" in self.mode:
# create empty file to append to
out2 = self.fs.session.put(
location, headers={"content-type": "application/octet-stream"}
)
out2.raise_for_status()
# after creating empty file, change location to append to
out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
self.location = self.fs._apply_proxy(out2.headers["Location"])
def _fetch_range(self, start, end):
start = max(start, 0)
end = min(self.size, end)
if start >= end or start >= self.size:
return b""
out = self.fs._call(
"OPEN", path=self.path, offset=start, length=end - start, redirect=False
)
out.raise_for_status()
if "Location" in out.headers:
location = out.headers["Location"]
out2 = self.fs.session.get(self.fs._apply_proxy(location))
return out2.content
else:
return out.content
def commit(self):
self.fs.mv(self.path, self.target)
def discard(self):
self.fs.rm(self.path)
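# A minimal usage sketch, assuming a name-node reachable at
# "namenode.example.com" on the default WebHDFS port with an insecure user
# name (all placeholders); the data_proxy mapping is only needed when
# data-node host names are not resolvable from the client.
def _example_webhdfs_usage():
    import fsspec

    fs = fsspec.filesystem(
        "webhdfs",
        host="namenode.example.com",
        port=50070,
        user="hadoop",
        data_proxy={"worker.internal": "namenode.example.com"},
    )
    fs.mkdirs("/tmp/demo", exist_ok=True)
    with fs.open("/tmp/demo/hello.txt", "wb") as f:
        f.write(b"hello webhdfs\n")
    return fs.cat_file("/tmp/demo/hello.txt")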

View File

@@ -0,0 +1,177 @@
import os
import zipfile
import fsspec
from fsspec.archive import AbstractArchiveFileSystem
class ZipFileSystem(AbstractArchiveFileSystem):
"""Read/Write contents of ZIP archive as a file-system
Keeps file object open while instance lives.
This class is pickleable, but not necessarily thread-safe
"""
root_marker = ""
protocol = "zip"
cachable = False
def __init__(
self,
fo="",
mode="r",
target_protocol=None,
target_options=None,
compression=zipfile.ZIP_STORED,
allowZip64=True,
compresslevel=None,
**kwargs,
):
"""
Parameters
----------
fo: str or file-like
Contains ZIP, and must exist. If a str, will fetch file using
:meth:`~fsspec.open_files`, which must return one file exactly.
mode: str
Accept: "r", "w", "a"
target_protocol: str (optional)
If ``fo`` is a string, this value can be used to override the
FS protocol inferred from a URL
target_options: dict (optional)
Kwargs passed when instantiating the target FS, if ``fo`` is
a string.
compression, allowZip64, compresslevel: passed to ZipFile
Only relevant when creating a ZIP
"""
super().__init__(self, **kwargs)
if mode not in set("rwa"):
raise ValueError(f"mode '{mode}' no understood")
self.mode = mode
if isinstance(fo, (str, os.PathLike)):
if mode == "a":
m = "r+b"
else:
m = mode + "b"
fo = fsspec.open(
fo, mode=m, protocol=target_protocol, **(target_options or {})
)
self.force_zip_64 = allowZip64
self.of = fo
self.fo = fo.__enter__() # the whole instance is a context
self.zip = zipfile.ZipFile(
self.fo,
mode=mode,
compression=compression,
allowZip64=allowZip64,
compresslevel=compresslevel,
)
self.dir_cache = None
@classmethod
def _strip_protocol(cls, path):
# zip file paths are always relative to the archive root
return super()._strip_protocol(path).lstrip("/")
def __del__(self):
if hasattr(self, "zip"):
self.close()
del self.zip
def close(self):
"""Commits any write changes to the file. Done on ``del`` too."""
self.zip.close()
def _get_dirs(self):
if self.dir_cache is None or self.mode in set("wa"):
# when writing, dir_cache is always in the ZipFile's attributes,
# not read from the file.
files = self.zip.infolist()
self.dir_cache = {
dirname.rstrip("/"): {
"name": dirname.rstrip("/"),
"size": 0,
"type": "directory",
}
for dirname in self._all_dirnames(self.zip.namelist())
}
for z in files:
f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
f.update(
{
"name": z.filename.rstrip("/"),
"size": z.file_size,
"type": ("directory" if z.is_dir() else "file"),
}
)
self.dir_cache[f["name"]] = f
def pipe_file(self, path, value, **kwargs):
# override upstream, because we know the exact file size in this case
self.zip.writestr(path, value, **kwargs)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
path = self._strip_protocol(path)
if "r" in mode and self.mode in set("wa"):
if self.exists(path):
raise OSError("ZipFS can only be open for reading or writing, not both")
raise FileNotFoundError(path)
if "r" in self.mode and "w" in mode:
raise OSError("ZipFS can only be open for reading or writing, not both")
out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64)
if "r" in mode:
info = self.info(path)
out.size = info["size"]
out.name = info["name"]
return out
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")
# Remove the leading slash, as the zip file paths are always
# given without a leading slash
path = path.lstrip("/")
path_parts = list(filter(lambda s: bool(s), path.split("/")))
def _matching_starts(file_path):
file_parts = filter(lambda s: bool(s), file_path.split("/"))
return all(a == b for a, b in zip(path_parts, file_parts))
self._get_dirs()
result = {}
# To match posix find, if an exact file name is given, we should
# return only that file
if path in self.dir_cache and self.dir_cache[path]["type"] == "file":
result[path] = self.dir_cache[path]
return result if detail else [path]
for file_path, file_info in self.dir_cache.items():
if not (path == "" or _matching_starts(file_path)):
continue
if file_info["type"] == "directory":
if withdirs:
if file_path not in result:
result[file_path.strip("/")] = file_info
continue
if file_path not in result:
result[file_path] = file_info if detail else None
if maxdepth:
path_depth = path.count("/")
result = {
k: v for k, v in result.items() if k.count("/") - path_depth < maxdepth
}
return result if detail else sorted(result)
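# A minimal usage sketch, assuming write access to "example.zip" in the
# current working directory (placeholder path); the archive is only finalised
# when the filesystem is closed.
def _example_zip_usage():
    import fsspec

    fs = fsspec.filesystem("zip", fo="example.zip", mode="w")
    fs.pipe_file("folder/hello.txt", b"hello zip\n")
    fs.close()  # writes the central directory

    fs = fsspec.filesystem("zip", fo="example.zip")  # reopen for reading
    with fs.open("folder/hello.txt", "rb") as f:
        return f.read()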

View File

@@ -0,0 +1,117 @@
import json
from collections.abc import Mapping, Sequence
from contextlib import suppress
from pathlib import PurePath
from typing import (
Any,
Callable,
ClassVar,
Optional,
)
from .registry import _import_class, get_filesystem_class
from .spec import AbstractFileSystem
class FilesystemJSONEncoder(json.JSONEncoder):
include_password: ClassVar[bool] = True
def default(self, o: Any) -> Any:
if isinstance(o, AbstractFileSystem):
return o.to_dict(include_password=self.include_password)
if isinstance(o, PurePath):
cls = type(o)
return {"cls": f"{cls.__module__}.{cls.__name__}", "str": str(o)}
return super().default(o)
def make_serializable(self, obj: Any) -> Any:
"""
Recursively converts an object so that it can be JSON serialized via
:func:`json.dumps` and :func:`json.dump`, without actually calling
said functions.
"""
if isinstance(obj, (str, int, float, bool)):
return obj
if isinstance(obj, Mapping):
return {k: self.make_serializable(v) for k, v in obj.items()}
if isinstance(obj, Sequence):
return [self.make_serializable(v) for v in obj]
return self.default(obj)
class FilesystemJSONDecoder(json.JSONDecoder):
def __init__(
self,
*,
object_hook: Optional[Callable[[dict[str, Any]], Any]] = None,
parse_float: Optional[Callable[[str], Any]] = None,
parse_int: Optional[Callable[[str], Any]] = None,
parse_constant: Optional[Callable[[str], Any]] = None,
strict: bool = True,
object_pairs_hook: Optional[Callable[[list[tuple[str, Any]]], Any]] = None,
) -> None:
self.original_object_hook = object_hook
super().__init__(
object_hook=self.custom_object_hook,
parse_float=parse_float,
parse_int=parse_int,
parse_constant=parse_constant,
strict=strict,
object_pairs_hook=object_pairs_hook,
)
@classmethod
def try_resolve_path_cls(cls, dct: dict[str, Any]):
with suppress(Exception):
fqp = dct["cls"]
path_cls = _import_class(fqp)
if issubclass(path_cls, PurePath):
return path_cls
return None
@classmethod
def try_resolve_fs_cls(cls, dct: dict[str, Any]):
with suppress(Exception):
if "cls" in dct:
try:
fs_cls = _import_class(dct["cls"])
if issubclass(fs_cls, AbstractFileSystem):
return fs_cls
except Exception:
if "protocol" in dct: # Fallback if cls cannot be imported
return get_filesystem_class(dct["protocol"])
raise
return None
def custom_object_hook(self, dct: dict[str, Any]):
if "cls" in dct:
if (obj_cls := self.try_resolve_fs_cls(dct)) is not None:
return AbstractFileSystem.from_dict(dct)
if (obj_cls := self.try_resolve_path_cls(dct)) is not None:
return obj_cls(dct["str"])
if self.original_object_hook is not None:
return self.original_object_hook(dct)
return dct
def unmake_serializable(self, obj: Any) -> Any:
"""
Inverse function of :meth:`FilesystemJSONEncoder.make_serializable`.
"""
if isinstance(obj, dict):
obj = self.custom_object_hook(obj)
if isinstance(obj, dict):
return {k: self.unmake_serializable(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
return [self.unmake_serializable(v) for v in obj]
return obj
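# A minimal round-trip sketch using the encoder/decoder above with the
# in-memory filesystem; the same pattern applies to any registered protocol.
def _example_json_roundtrip():
    import fsspec  # local import to keep the sketch self-contained

    fs = fsspec.filesystem("memory")
    payload = json.dumps(fs, cls=FilesystemJSONEncoder)
    restored = json.loads(payload, cls=FilesystemJSONDecoder)
    assert isinstance(restored, AbstractFileSystem)
    return payload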

View File

@@ -0,0 +1,251 @@
import array
import logging
import posixpath
import warnings
from collections.abc import MutableMapping
from functools import cached_property
from fsspec.core import url_to_fs
logger = logging.getLogger("fsspec.mapping")
class FSMap(MutableMapping):
"""Wrap a FileSystem instance as a mutable wrapping.
The keys of the mapping become files under the given root, and the
values (which must be bytes) the contents of those files.
Parameters
----------
root: string
prefix for all the files
fs: FileSystem instance
check: bool (=True)
performs a touch at the location, to check for write access.
Examples
--------
>>> fs = FileSystem(**parameters) # doctest: +SKIP
>>> d = FSMap('my-data/path/', fs) # doctest: +SKIP
or, more likely
>>> d = fs.get_mapper('my-data/path/')
>>> d['loc1'] = b'Hello World' # doctest: +SKIP
>>> list(d.keys()) # doctest: +SKIP
['loc1']
>>> d['loc1'] # doctest: +SKIP
b'Hello World'
"""
def __init__(self, root, fs, check=False, create=False, missing_exceptions=None):
self.fs = fs
self.root = fs._strip_protocol(root)
self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1]
if missing_exceptions is None:
missing_exceptions = (
FileNotFoundError,
IsADirectoryError,
NotADirectoryError,
)
self.missing_exceptions = missing_exceptions
self.check = check
self.create = create
if create:
if not self.fs.exists(root):
self.fs.mkdir(root)
if check:
if not self.fs.exists(root):
raise ValueError(
f"Path {root} does not exist. Create "
f" with the ``create=True`` keyword"
)
self.fs.touch(root + "/a")
self.fs.rm(root + "/a")
@cached_property
def dirfs(self):
"""dirfs instance that can be used with the same keys as the mapper"""
from .implementations.dirfs import DirFileSystem
return DirFileSystem(path=self._root_key_to_str, fs=self.fs)
def clear(self):
"""Remove all keys below root - empties out mapping"""
logger.info("Clear mapping at %s", self.root)
try:
self.fs.rm(self.root, True)
self.fs.mkdir(self.root)
except: # noqa: E722
pass
def getitems(self, keys, on_error="raise"):
"""Fetch multiple items from the store
If the backend is async-able, this might proceed concurrently
Parameters
----------
keys: list(str)
The keys to be fetched
on_error : "raise", "omit", "return"
If raise, an underlying exception will be raised (converted to KeyError
if the type is in self.missing_exceptions); if omit, keys with exception
will simply not be included in the output; if "return", all keys are
included in the output, but the value will be bytes or an exception
instance.
Returns
-------
dict(key, bytes|exception)
"""
keys2 = [self._key_to_str(k) for k in keys]
oe = on_error if on_error == "raise" else "return"
try:
out = self.fs.cat(keys2, on_error=oe)
if isinstance(out, bytes):
out = {keys2[0]: out}
except self.missing_exceptions as e:
raise KeyError from e
out = {
k: (KeyError() if isinstance(v, self.missing_exceptions) else v)
for k, v in out.items()
}
return {
key: out[k2] if on_error == "raise" else out.get(k2, KeyError(k2))
for key, k2 in zip(keys, keys2)
if on_error == "return" or not isinstance(out[k2], BaseException)
}
def setitems(self, values_dict):
"""Set the values of multiple items in the store
Parameters
----------
values_dict: dict(str, bytes)
"""
values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()}
self.fs.pipe(values)
def delitems(self, keys):
"""Remove multiple keys from the store"""
self.fs.rm([self._key_to_str(k) for k in keys])
def _key_to_str(self, key):
"""Generate full path for the key"""
if not isinstance(key, str):
# raise TypeError("key must be of type `str`, got `{type(key).__name__}`"
warnings.warn(
"from fsspec 2023.5 onward FSMap non-str keys will raise TypeError",
DeprecationWarning,
)
if isinstance(key, list):
key = tuple(key)
key = str(key)
return f"{self._root_key_to_str}{key}".rstrip("/")
def _str_to_key(self, s):
"""Strip path of to leave key name"""
return s[len(self.root) :].lstrip("/")
def __getitem__(self, key, default=None):
"""Retrieve data"""
k = self._key_to_str(key)
try:
result = self.fs.cat(k)
except self.missing_exceptions as exc:
if default is not None:
return default
raise KeyError(key) from exc
return result
def pop(self, key, default=None):
"""Pop data"""
result = self.__getitem__(key, default)
try:
del self[key]
except KeyError:
pass
return result
def __setitem__(self, key, value):
"""Store value in key"""
key = self._key_to_str(key)
self.fs.mkdirs(self.fs._parent(key), exist_ok=True)
self.fs.pipe_file(key, maybe_convert(value))
def __iter__(self):
return (self._str_to_key(x) for x in self.fs.find(self.root))
def __len__(self):
return len(self.fs.find(self.root))
def __delitem__(self, key):
"""Remove key"""
try:
self.fs.rm(self._key_to_str(key))
except Exception as exc:
raise KeyError from exc
def __contains__(self, key):
"""Does key exist in mapping?"""
path = self._key_to_str(key)
return self.fs.isfile(path)
def __reduce__(self):
return FSMap, (self.root, self.fs, False, False, self.missing_exceptions)
def maybe_convert(value):
if isinstance(value, array.array) or hasattr(value, "__array__"):
# bytes-like things
if hasattr(value, "dtype") and value.dtype.kind in "Mm":
# The buffer interface doesn't support datetime64/timedelta64 numpy
# arrays
value = value.view("int64")
value = bytes(memoryview(value))
return value
def get_mapper(
url="",
check=False,
create=False,
missing_exceptions=None,
alternate_root=None,
**kwargs,
):
"""Create key-value interface for given URL and options
The URL will be of the form "protocol://location" and point to the root
of the mapper required. All keys will be file-names below this location,
and their values the contents of each key.
Also accepts compound URLs like zip::s3://bucket/file.zip , see ``fsspec.open``.
Parameters
----------
url: str
Root URL of mapping
check: bool
Whether to attempt to read from the location before instantiation, to
check that the mapping does exist
create: bool
Whether to make the directory corresponding to the root before
instantiating
missing_exceptions: None or tuple
If given, these exception types will be regarded as missing keys and
return KeyError when trying to read data. By default, you get
(FileNotFoundError, IsADirectoryError, NotADirectoryError)
alternate_root: None or str
In cases of complex URLs, the parser may fail to pick the correct part
for the mapper root, so this arg can override
Returns
-------
``FSMap`` instance, the dict-like key-value store.
"""
# Removing protocol here - could defer to each open() on the backend
fs, urlpath = url_to_fs(url, **kwargs)
root = alternate_root if alternate_root is not None else urlpath
return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions)
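# A minimal usage sketch with the in-memory filesystem; any protocol
# understood by ``url_to_fs`` works the same way.
def _example_mapper_usage():
    m = get_mapper("memory://example-root", create=True)
    m["group/key1"] = b"some bytes"
    print(list(m))  # -> ['group/key1']
    return m["group/key1"]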

View File

@@ -0,0 +1,541 @@
import io
import json
import warnings
from .core import url_to_fs
from .utils import merge_offset_ranges
# Parquet-Specific Utilities for fsspec
#
# Most of the functions defined in this module are NOT
# intended for public consumption. The only exception
# to this is `open_parquet_file`, which should be used in
# place of `fs.open()` to open parquet-formatted files
# on remote file systems.
def open_parquet_file(
path,
mode="rb",
fs=None,
metadata=None,
columns=None,
row_groups=None,
storage_options=None,
strict=False,
engine="auto",
max_gap=64_000,
max_block=256_000_000,
footer_sample_size=1_000_000,
**kwargs,
):
"""
Return a file-like object for a single Parquet file.
The specified parquet `engine` will be used to parse the
footer metadata, and determine the required byte ranges
from the file. The target path will then be opened with
the "parts" (`KnownPartsOfAFile`) caching strategy.
Note that this method is intended for usage with remote
file systems, and is unlikely to improve parquet-read
performance on local file systems.
Parameters
----------
path: str
Target file path.
mode: str, optional
Mode option to be passed through to `fs.open`. Default is "rb".
metadata: Any, optional
Parquet metadata object. Object type must be supported
by the backend parquet engine. For now, only the "fastparquet"
engine supports an explicit `ParquetFile` metadata object.
If a metadata object is supplied, the remote footer metadata
will not need to be transferred into local memory.
fs: AbstractFileSystem, optional
Filesystem object to use for opening the file. If nothing is
specified, an `AbstractFileSystem` object will be inferred.
engine : str, default "auto"
Parquet engine to use for metadata parsing. Allowed options
include "fastparquet", "pyarrow", and "auto". The specified
engine must be installed in the current environment. If
"auto" is specified, and both engines are installed,
"fastparquet" will take precedence over "pyarrow".
columns: list, optional
List of all column names that may be read from the file.
row_groups : list, optional
List of all row-groups that may be read from the file. This
may be a list of row-group indices (integers), or it may be
a list of `RowGroup` metadata objects (if the "fastparquet"
engine is used).
storage_options : dict, optional
Used to generate an `AbstractFileSystem` object if `fs` was
not specified.
strict : bool, optional
Whether the resulting `KnownPartsOfAFile` cache should
fetch reads that go beyond a known byte-range boundary.
If `False` (the default), any read that ends outside a
known part will be zero padded. Note that using
`strict=True` may be useful for debugging.
max_gap : int, optional
Neighboring byte ranges will only be merged when their
inter-range gap is <= `max_gap`. Default is 64KB.
max_block : int, optional
Neighboring byte ranges will only be merged when the size of
the aggregated range is <= `max_block`. Default is 256MB.
footer_sample_size : int, optional
Number of bytes to read from the end of the path to look
for the footer metadata. If the sampled bytes do not contain
the footer, a second read request will be required, and
performance will suffer. Default is 1MB.
**kwargs :
Optional key-word arguments to pass to `fs.open`
"""
# Make sure we have an `AbstractFileSystem` object
# to work with
if fs is None:
fs = url_to_fs(path, **(storage_options or {}))[0]
# For now, `columns == []` not supported. Just use
# default `open` command with `path` input
if columns is not None and len(columns) == 0:
return fs.open(path, mode=mode)
# Set the engine
engine = _set_engine(engine)
# Fetch the known byte ranges needed to read
# `columns` and/or `row_groups`
data = _get_parquet_byte_ranges(
[path],
fs,
metadata=metadata,
columns=columns,
row_groups=row_groups,
engine=engine,
max_gap=max_gap,
max_block=max_block,
footer_sample_size=footer_sample_size,
)
# Extract file name from `data`
fn = next(iter(data)) if data else path
# Call self.open with "parts" caching
options = kwargs.pop("cache_options", {}).copy()
return fs.open(
fn,
mode=mode,
cache_type="parts",
cache_options={
**options,
"data": data.get(fn, {}),
"strict": strict,
},
**kwargs,
)
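# A minimal usage sketch, assuming a remote object "s3://bucket/data.parquet"
# (placeholder URL), that s3fs and one of the supported engines are installed,
# and that the listed column names exist; only the byte ranges needed for the
# selected columns are transferred.
def _example_open_parquet():
    import pandas as pd

    with open_parquet_file(
        "s3://bucket/data.parquet",
        columns=["col1", "col2"],
        storage_options={"anon": True},
    ) as f:
        return pd.read_parquet(f, columns=["col1", "col2"])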
def _get_parquet_byte_ranges(
paths,
fs,
metadata=None,
columns=None,
row_groups=None,
max_gap=64_000,
max_block=256_000_000,
footer_sample_size=1_000_000,
engine="auto",
):
"""Get a dictionary of the known byte ranges needed
to read a specific column/row-group selection from a
Parquet dataset. Each value in the output dictionary
is intended for use as the `data` argument for the
`KnownPartsOfAFile` caching strategy of a single path.
"""
# Set engine if necessary
if isinstance(engine, str):
engine = _set_engine(engine)
# Pass to specialized function if metadata is defined
if metadata is not None:
# Use the provided parquet metadata object
# to avoid transferring/parsing footer metadata
return _get_parquet_byte_ranges_from_metadata(
metadata,
fs,
engine,
columns=columns,
row_groups=row_groups,
max_gap=max_gap,
max_block=max_block,
)
# Get file sizes asynchronously
file_sizes = fs.sizes(paths)
# Populate global paths, starts, & ends
result = {}
data_paths = []
data_starts = []
data_ends = []
add_header_magic = True
if columns is None and row_groups is None:
# We are NOT selecting specific columns or row-groups.
#
# We can avoid sampling the footers, and just transfer
# all file data with cat_ranges
for i, path in enumerate(paths):
result[path] = {}
for b in range(0, file_sizes[i], max_block):
data_paths.append(path)
data_starts.append(b)
data_ends.append(min(b + max_block, file_sizes[i]))
add_header_magic = False # "Magic" should already be included
else:
# We ARE selecting specific columns or row-groups.
#
# Gather file footers.
# We just take the last `footer_sample_size` bytes of each
# file (or the entire file if it is smaller than that)
footer_starts = []
footer_ends = []
for i, path in enumerate(paths):
footer_ends.append(file_sizes[i])
sample_size = max(0, file_sizes[i] - footer_sample_size)
footer_starts.append(sample_size)
footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends)
# Check our footer samples and re-sample if necessary.
missing_footer_starts = footer_starts.copy()
large_footer = 0
for i, path in enumerate(paths):
footer_size = int.from_bytes(footer_samples[i][-8:-4], "little")
real_footer_start = file_sizes[i] - (footer_size + 8)
if real_footer_start < footer_starts[i]:
missing_footer_starts[i] = real_footer_start
large_footer = max(large_footer, (footer_size + 8))
if large_footer:
warnings.warn(
f"Not enough data was used to sample the parquet footer. "
f"Try setting footer_sample_size >= {large_footer}."
)
for i, block in enumerate(
fs.cat_ranges(
paths,
missing_footer_starts,
footer_starts,
)
):
footer_samples[i] = block + footer_samples[i]
footer_starts[i] = missing_footer_starts[i]
# Calculate required byte ranges for each path
for i, path in enumerate(paths):
# Deal with small-file case.
# Just include all remaining bytes of the file
# in a single range.
if file_sizes[i] < max_block:
if footer_starts[i] > 0:
# Only need to transfer the data if the
# footer sample isn't already the whole file
data_paths.append(path)
data_starts.append(0)
data_ends.append(footer_starts[i])
continue
# Use "engine" to collect data byte ranges
path_data_starts, path_data_ends = engine._parquet_byte_ranges(
columns,
row_groups=row_groups,
footer=footer_samples[i],
footer_start=footer_starts[i],
)
data_paths += [path] * len(path_data_starts)
data_starts += path_data_starts
data_ends += path_data_ends
# Merge adjacent offset ranges
data_paths, data_starts, data_ends = merge_offset_ranges(
data_paths,
data_starts,
data_ends,
max_gap=max_gap,
max_block=max_block,
sort=False, # Should already be sorted
)
# Start by populating `result` with footer samples
for i, path in enumerate(paths):
result[path] = {(footer_starts[i], footer_ends[i]): footer_samples[i]}
# Transfer the data byte-ranges into local memory
_transfer_ranges(fs, result, data_paths, data_starts, data_ends)
# Add b"PAR1" to header if necessary
if add_header_magic:
_add_header_magic(result)
return result
def _get_parquet_byte_ranges_from_metadata(
metadata,
fs,
engine,
columns=None,
row_groups=None,
max_gap=64_000,
max_block=256_000_000,
):
"""Simplified version of `_get_parquet_byte_ranges` for
the case that an engine-specific `metadata` object is
provided, and the remote footer metadata does not need to
be transferred before calculating the required byte ranges.
"""
# Use "engine" to collect data byte ranges
data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
columns,
row_groups=row_groups,
metadata=metadata,
)
# Merge adjacent offset ranges
data_paths, data_starts, data_ends = merge_offset_ranges(
data_paths,
data_starts,
data_ends,
max_gap=max_gap,
max_block=max_block,
sort=False, # Should be sorted
)
# Transfer the data byte-ranges into local memory
result = {fn: {} for fn in list(set(data_paths))}
_transfer_ranges(fs, result, data_paths, data_starts, data_ends)
# Add b"PAR1" to header
_add_header_magic(result)
return result
def _transfer_ranges(fs, blocks, paths, starts, ends):
# Use cat_ranges to gather the data byte_ranges
ranges = (paths, starts, ends)
for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)):
blocks[path][(start, stop)] = data
def _add_header_magic(data):
# Add b"PAR1" to file headers
for path in list(data.keys()):
add_magic = True
for k in data[path]:
if k[0] == 0 and k[1] >= 4:
add_magic = False
break
if add_magic:
data[path][(0, 4)] = b"PAR1"
def _set_engine(engine_str):
# Define a list of parquet engines to try
if engine_str == "auto":
try_engines = ("fastparquet", "pyarrow")
elif not isinstance(engine_str, str):
raise ValueError(
"Failed to set parquet engine! "
"Please pass 'fastparquet', 'pyarrow', or 'auto'"
)
elif engine_str not in ("fastparquet", "pyarrow"):
raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`")
else:
try_engines = [engine_str]
# Try importing the engines in `try_engines`,
# and choose the first one that succeeds
for engine in try_engines:
try:
if engine == "fastparquet":
return FastparquetEngine()
elif engine == "pyarrow":
return PyarrowEngine()
except ImportError:
pass
# Raise an error if a supported parquet engine
# was not found
raise ImportError(
f"The following parquet engines are not installed "
f"in your python environment: {try_engines}."
f"Please install 'fastparquert' or 'pyarrow' to "
f"utilize the `fsspec.parquet` module."
)
class FastparquetEngine:
# The purpose of the FastparquetEngine class is
# to check if fastparquet can be imported (on initialization)
# and to define a `_parquet_byte_ranges` method. In the
# future, this class may also be used to define other
# methods/logic that are specific to fastparquet.
def __init__(self):
import fastparquet as fp
self.fp = fp
def _row_group_filename(self, row_group, pf):
return pf.row_group_filename(row_group)
def _parquet_byte_ranges(
self,
columns,
row_groups=None,
metadata=None,
footer=None,
footer_start=None,
):
# Initialize offset ranges and define ParquetFile metadata
pf = metadata
data_paths, data_starts, data_ends = [], [], []
if pf is None:
pf = self.fp.ParquetFile(io.BytesIO(footer))
# Convert columns to a set and add any index columns
# specified in the pandas metadata (just in case)
column_set = None if columns is None else set(columns)
if column_set is not None and hasattr(pf, "pandas_metadata"):
md_index = [
ind
for ind in pf.pandas_metadata.get("index_columns", [])
# Ignore RangeIndex information
if not isinstance(ind, dict)
]
column_set |= set(md_index)
# Check if row_groups is a list of integers
# or a list of row-group metadata
if row_groups and not isinstance(row_groups[0], int):
# Input row_groups contains row-group metadata
row_group_indices = None
else:
# Input row_groups contains row-group indices
row_group_indices = row_groups
row_groups = pf.row_groups
# Loop through column chunks to add required byte ranges
for r, row_group in enumerate(row_groups):
# Skip this row-group if we are targeting
# specific row-groups
if row_group_indices is None or r in row_group_indices:
# Find the target parquet-file path for `row_group`
fn = self._row_group_filename(row_group, pf)
for column in row_group.columns:
name = column.meta_data.path_in_schema[0]
# Skip this column if we are targeting
# specific columns
if column_set is None or name in column_set:
file_offset0 = column.meta_data.dictionary_page_offset
if file_offset0 is None:
file_offset0 = column.meta_data.data_page_offset
num_bytes = column.meta_data.total_compressed_size
if footer_start is None or file_offset0 < footer_start:
data_paths.append(fn)
data_starts.append(file_offset0)
data_ends.append(
min(
file_offset0 + num_bytes,
footer_start or (file_offset0 + num_bytes),
)
)
if metadata:
# The metadata in this call may map to multiple
# file paths. Need to include `data_paths`
return data_paths, data_starts, data_ends
return data_starts, data_ends
class PyarrowEngine:
# The purpose of the PyarrowEngine class is
# to check if pyarrow can be imported (on initialization)
# and to define a `_parquet_byte_ranges` method. In the
# future, this class may also be used to define other
# methods/logic that are specific to pyarrow.
def __init__(self):
import pyarrow.parquet as pq
self.pq = pq
def _row_group_filename(self, row_group, metadata):
raise NotImplementedError
def _parquet_byte_ranges(
self,
columns,
row_groups=None,
metadata=None,
footer=None,
footer_start=None,
):
if metadata is not None:
raise ValueError("metadata input not supported for PyarrowEngine")
data_starts, data_ends = [], []
md = self.pq.ParquetFile(io.BytesIO(footer)).metadata
# Convert columns to a set and add any index columns
# specified in the pandas metadata (just in case)
column_set = None if columns is None else set(columns)
if column_set is not None:
schema = md.schema.to_arrow_schema()
has_pandas_metadata = (
schema.metadata is not None and b"pandas" in schema.metadata
)
if has_pandas_metadata:
md_index = [
ind
for ind in json.loads(
schema.metadata[b"pandas"].decode("utf8")
).get("index_columns", [])
# Ignore RangeIndex information
if not isinstance(ind, dict)
]
column_set |= set(md_index)
# Loop through column chunks to add required byte ranges
for r in range(md.num_row_groups):
# Skip this row-group if we are targeting
# specific row-groups
if row_groups is None or r in row_groups:
row_group = md.row_group(r)
for c in range(row_group.num_columns):
column = row_group.column(c)
name = column.path_in_schema
# Skip this column if we are targeting
# specific columns
split_name = name.split(".")[0]
if (
column_set is None
or name in column_set
or split_name in column_set
):
file_offset0 = column.dictionary_page_offset
if file_offset0 is None:
file_offset0 = column.data_page_offset
num_bytes = column.total_compressed_size
if file_offset0 < footer_start:
data_starts.append(file_offset0)
data_ends.append(
min(file_offset0 + num_bytes, footer_start)
)
return data_starts, data_ends

View File

@@ -0,0 +1,330 @@
from __future__ import annotations
import importlib
import types
import warnings
__all__ = ["registry", "get_filesystem_class", "default"]
# internal, mutable
_registry: dict[str, type] = {}
# external, immutable
registry = types.MappingProxyType(_registry)
default = "file"
def register_implementation(name, cls, clobber=False, errtxt=None):
"""Add implementation class to the registry
Parameters
----------
name: str
Protocol name to associate with the class
cls: class or str
if a class: fsspec-compliant implementation class (normally inherits from
``fsspec.AbstractFileSystem``, gets added straight to the registry. If a
str, the full path to an implementation class like package.module.class,
which gets added to known_implementations,
so the import is deferred until the filesystem is actually used.
clobber: bool (optional)
Whether to overwrite a protocol with the same name; if False, will raise
instead.
errtxt: str (optional)
If given, then a failure to import the given class will result in this
text being shown in the error message.
"""
if isinstance(cls, str):
if name in known_implementations and clobber is False:
if cls != known_implementations[name]["class"]:
raise ValueError(
f"Name ({name}) already in the known_implementations and clobber "
f"is False"
)
else:
known_implementations[name] = {
"class": cls,
"err": errtxt or f"{cls} import failed for protocol {name}",
}
else:
if name in registry and clobber is False:
if _registry[name] is not cls:
raise ValueError(
f"Name ({name}) already in the registry and clobber is False"
)
else:
_registry[name] = cls
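# A minimal sketch of registering a deferred implementation by dotted path;
# "myproject.storage.MyFileSystem" and the "myfs" protocol are placeholders.
# The class is only imported the first time the protocol is actually used.
def _example_register_implementation():
    register_implementation(
        "myfs",
        "myproject.storage.MyFileSystem",
        errtxt="Install myproject to use the myfs:// protocol",
        clobber=True,
    )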
# protocols mapped to the class which implements them. This dict can be
# updated with register_implementation
known_implementations = {
"abfs": {
"class": "adlfs.AzureBlobFileSystem",
"err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
},
"adl": {
"class": "adlfs.AzureDatalakeFileSystem",
"err": "Install adlfs to access Azure Datalake Gen1",
},
"arrow_hdfs": {
"class": "fsspec.implementations.arrow.HadoopFileSystem",
"err": "pyarrow and local java libraries required for HDFS",
},
"asynclocal": {
"class": "morefs.asyn_local.AsyncLocalFileSystem",
"err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
},
"asyncwrapper": {
"class": "fsspec.implementations.asyn_wrapper.AsyncFileSystemWrapper",
},
"az": {
"class": "adlfs.AzureBlobFileSystem",
"err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
},
"blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"},
"box": {
"class": "boxfs.BoxFileSystem",
"err": "Please install boxfs to access BoxFileSystem",
},
"cached": {"class": "fsspec.implementations.cached.CachingFileSystem"},
"dask": {
"class": "fsspec.implementations.dask.DaskWorkerFileSystem",
"err": "Install dask distributed to access worker file system",
},
"data": {"class": "fsspec.implementations.data.DataFileSystem"},
"dbfs": {
"class": "fsspec.implementations.dbfs.DatabricksFileSystem",
"err": "Install the requests package to use the DatabricksFileSystem",
},
"dir": {"class": "fsspec.implementations.dirfs.DirFileSystem"},
"dropbox": {
"class": "dropboxdrivefs.DropboxDriveFileSystem",
"err": (
'DropboxFileSystem requires "dropboxdrivefs", "requests" and '
'"dropbox" to be installed'
),
},
"dvc": {
"class": "dvc.api.DVCFileSystem",
"err": "Install dvc to access DVCFileSystem",
},
"file": {"class": "fsspec.implementations.local.LocalFileSystem"},
"filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"},
"ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"},
"gcs": {
"class": "gcsfs.GCSFileSystem",
"err": "Please install gcsfs to access Google Storage",
},
"gdrive": {
"class": "gdrive_fsspec.GoogleDriveFileSystem",
"err": "Please install gdrive_fs for access to Google Drive",
},
"generic": {"class": "fsspec.generic.GenericFileSystem"},
"gist": {
"class": "fsspec.implementations.gist.GistFileSystem",
"err": "Install the requests package to use the gist FS",
},
"git": {
"class": "fsspec.implementations.git.GitFileSystem",
"err": "Install pygit2 to browse local git repos",
},
"github": {
"class": "fsspec.implementations.github.GithubFileSystem",
"err": "Install the requests package to use the github FS",
},
"gs": {
"class": "gcsfs.GCSFileSystem",
"err": "Please install gcsfs to access Google Storage",
},
"hdfs": {
"class": "fsspec.implementations.arrow.HadoopFileSystem",
"err": "pyarrow and local java libraries required for HDFS",
},
"hf": {
"class": "huggingface_hub.HfFileSystem",
"err": "Install huggingface_hub to access HfFileSystem",
},
"http": {
"class": "fsspec.implementations.http.HTTPFileSystem",
"err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
},
"https": {
"class": "fsspec.implementations.http.HTTPFileSystem",
"err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
},
"jlab": {
"class": "fsspec.implementations.jupyter.JupyterFileSystem",
"err": "Jupyter FS requires requests to be installed",
},
"jupyter": {
"class": "fsspec.implementations.jupyter.JupyterFileSystem",
"err": "Jupyter FS requires requests to be installed",
},
"lakefs": {
"class": "lakefs_spec.LakeFSFileSystem",
"err": "Please install lakefs-spec to access LakeFSFileSystem",
},
"libarchive": {
"class": "fsspec.implementations.libarchive.LibArchiveFileSystem",
"err": "LibArchive requires to be installed",
},
"local": {"class": "fsspec.implementations.local.LocalFileSystem"},
"memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
"oci": {
"class": "ocifs.OCIFileSystem",
"err": "Install ocifs to access OCI Object Storage",
},
"ocilake": {
"class": "ocifs.OCIFileSystem",
"err": "Install ocifs to access OCI Data Lake",
},
"oss": {
"class": "ossfs.OSSFileSystem",
"err": "Install ossfs to access Alibaba Object Storage System",
},
"pyscript": {
"class": "pyscript_fsspec_client.client.PyscriptFileSystem",
"err": "Install requests (cpython) or run in pyscript",
},
"reference": {"class": "fsspec.implementations.reference.ReferenceFileSystem"},
"root": {
"class": "fsspec_xrootd.XRootDFileSystem",
"err": (
"Install fsspec-xrootd to access xrootd storage system. "
"Note: 'root' is the protocol name for xrootd storage systems, "
"not referring to root directories"
),
},
"s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
"s3a": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
"sftp": {
"class": "fsspec.implementations.sftp.SFTPFileSystem",
"err": 'SFTPFileSystem requires "paramiko" to be installed',
},
"simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"},
"smb": {
"class": "fsspec.implementations.smb.SMBFileSystem",
"err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed',
},
"ssh": {
"class": "fsspec.implementations.sftp.SFTPFileSystem",
"err": 'SFTPFileSystem requires "paramiko" to be installed',
},
"tar": {"class": "fsspec.implementations.tar.TarFileSystem"},
"tos": {
"class": "tosfs.TosFileSystem",
"err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage",
},
"tosfs": {
"class": "tosfs.TosFileSystem",
"err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage",
},
"wandb": {"class": "wandbfs.WandbFS", "err": "Install wandbfs to access wandb"},
"webdav": {
"class": "webdav4.fsspec.WebdavFileSystem",
"err": "Install webdav4 to access WebDAV",
},
"webhdfs": {
"class": "fsspec.implementations.webhdfs.WebHDFS",
"err": 'webHDFS access requires "requests" to be installed',
},
"zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
}
assert list(known_implementations) == sorted(known_implementations), (
"Not in alphabetical order"
)
def get_filesystem_class(protocol):
"""Fetch named protocol implementation from the registry
The dict ``known_implementations`` maps protocol names to the locations
of classes implementing the corresponding file-system. When used for the
first time, appropriate imports will happen and the class will be placed in
the registry. All subsequent calls will fetch directly from the registry.
Some protocol implementations require additional dependencies, and so the
import may fail. In this case, the string in the "err" field of the
``known_implementations`` will be given as the error message.
"""
if not protocol:
protocol = default
if protocol not in registry:
if protocol not in known_implementations:
raise ValueError(f"Protocol not known: {protocol}")
bit = known_implementations[protocol]
try:
register_implementation(protocol, _import_class(bit["class"]))
except ImportError as e:
raise ImportError(bit.get("err")) from e
cls = registry[protocol]
if getattr(cls, "protocol", None) in ("abstract", None):
cls.protocol = protocol
return cls
s3_msg = """Your installed version of s3fs is very old and known to cause
severe performance issues, see also https://github.com/dask/dask/issues/10276
To fix, you should specify a lower version bound on s3fs, or
update the current installation.
"""
def _import_class(fqp: str):
"""Take a fully-qualified path and return the imported class or identifier.
``fqp`` is of the form "package.module.klass" or
"package.module:subobject.klass".
Warnings
--------
This can import arbitrary modules. Make sure you haven't installed any modules
that may execute malicious code at import time.
"""
if ":" in fqp:
mod, name = fqp.rsplit(":", 1)
else:
mod, name = fqp.rsplit(".", 1)
is_s3 = mod == "s3fs"
mod = importlib.import_module(mod)
if is_s3 and mod.__version__.split(".") < ["0", "5"]:
warnings.warn(s3_msg)
for part in name.split("."):
mod = getattr(mod, part)
if not isinstance(mod, type):
raise TypeError(f"{fqp} is not a class")
return mod
def filesystem(protocol, **storage_options):
"""Instantiate filesystems for given protocol and arguments
``storage_options`` are specific to the protocol being chosen, and are
passed directly to the class.
"""
if protocol == "arrow_hdfs":
warnings.warn(
"The 'arrow_hdfs' protocol has been deprecated and will be "
"removed in the future. Specify it as 'hdfs'.",
DeprecationWarning,
)
cls = get_filesystem_class(protocol)
return cls(**storage_options)
def available_protocols():
"""Return a list of the implemented protocols.
Note that any given protocol may require extra packages to be importable.
"""
return list(known_implementations)
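To make the registration API above concrete, here is a minimal usage sketch (not part of the vendored file; the protocol name ``myproto`` and the dotted path ``mypkg.myfs.MyFileSystem`` are illustrative assumptions). Registering by string defers the import until the protocol is first used, at which point ``fsspec.filesystem("myproto", ...)`` resolves and instantiates the class.

from fsspec.registry import available_protocols, register_implementation

# Lazy registration: "mypkg.myfs" is only imported when "myproto" is first used.
register_implementation(
    "myproto",
    "mypkg.myfs.MyFileSystem",
    errtxt="Install mypkg to use the myproto:// protocol",
)
assert "myproto" in available_protocols()

# Resolution and instantiation would happen via fsspec.filesystem("myproto", ...);
# any storage_options are passed straight to the implementation class.
# (Left commented out here because "mypkg" is a hypothetical package.)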

File diff suppressed because it is too large

View File

@@ -0,0 +1,289 @@
import os
from hashlib import md5
import pytest
from fsspec.implementations.local import LocalFileSystem
from fsspec.tests.abstract.copy import AbstractCopyTests # noqa: F401
from fsspec.tests.abstract.get import AbstractGetTests # noqa: F401
from fsspec.tests.abstract.open import AbstractOpenTests # noqa: F401
from fsspec.tests.abstract.pipe import AbstractPipeTests # noqa: F401
from fsspec.tests.abstract.put import AbstractPutTests # noqa: F401
class BaseAbstractFixtures:
"""
Abstract base class containing fixtures that are used by, but never need to
be overridden in, derived filesystem-specific classes to run the abstract
tests on such filesystems.
"""
@pytest.fixture
def fs_bulk_operations_scenario_0(self, fs, fs_join, fs_path):
"""
Scenario on remote filesystem that is used for many cp/get/put tests.
Cleans up at the end of each test in which it is used.
"""
source = self._bulk_operations_scenario_0(fs, fs_join, fs_path)
yield source
fs.rm(source, recursive=True)
@pytest.fixture
def fs_glob_edge_cases_files(self, fs, fs_join, fs_path):
"""
Scenario on remote filesystem that is used for glob edge cases cp/get/put tests.
Cleans up at the end of each test in which it is used.
"""
source = self._glob_edge_cases_files(fs, fs_join, fs_path)
yield source
fs.rm(source, recursive=True)
@pytest.fixture
def fs_dir_and_file_with_same_name_prefix(self, fs, fs_join, fs_path):
"""
Scenario on remote filesystem that is used to check cp/get/put on directory
and file with the same name prefixes.
Cleans up at the end of each test in which it is used.
"""
source = self._dir_and_file_with_same_name_prefix(fs, fs_join, fs_path)
yield source
fs.rm(source, recursive=True)
@pytest.fixture
def fs_10_files_with_hashed_names(self, fs, fs_join, fs_path):
"""
Scenario on remote filesystem that is used to check cp/get/put files order
when source and destination are lists.
Cleans up at the end of each test in which it is used.
"""
source = self._10_files_with_hashed_names(fs, fs_join, fs_path)
yield source
fs.rm(source, recursive=True)
@pytest.fixture
def fs_target(self, fs, fs_join, fs_path):
"""
Return name of remote directory that does not yet exist to copy into.
Cleans up at the end of each test in which it is used.
"""
target = fs_join(fs_path, "target")
yield target
if fs.exists(target):
fs.rm(target, recursive=True)
@pytest.fixture
def local_bulk_operations_scenario_0(self, local_fs, local_join, local_path):
"""
Scenario on local filesystem that is used for many cp/get/put tests.
Cleans up at the end of each test in which it is used.
"""
source = self._bulk_operations_scenario_0(local_fs, local_join, local_path)
yield source
local_fs.rm(source, recursive=True)
@pytest.fixture
def local_glob_edge_cases_files(self, local_fs, local_join, local_path):
"""
Scenario on local filesystem that is used for glob edge cases cp/get/put tests.
Cleans up at the end of each test in which it is used.
"""
source = self._glob_edge_cases_files(local_fs, local_join, local_path)
yield source
local_fs.rm(source, recursive=True)
@pytest.fixture
def local_dir_and_file_with_same_name_prefix(
self, local_fs, local_join, local_path
):
"""
Scenario on local filesystem that is used to check cp/get/put on directory
and file with the same name prefixes.
Cleans up at the end of each test in which it is used.
"""
source = self._dir_and_file_with_same_name_prefix(
local_fs, local_join, local_path
)
yield source
local_fs.rm(source, recursive=True)
@pytest.fixture
def local_10_files_with_hashed_names(self, local_fs, local_join, local_path):
"""
Scenario on local filesystem that is used to check cp/get/put files order
when source and destination are lists.
Cleans up at the end of each test in which it is used.
"""
source = self._10_files_with_hashed_names(local_fs, local_join, local_path)
yield source
local_fs.rm(source, recursive=True)
@pytest.fixture
def local_target(self, local_fs, local_join, local_path):
"""
Return name of local directory that does not yet exist to copy into.
Cleans up at the end of each test in which it is used.
"""
target = local_join(local_path, "target")
yield target
if local_fs.exists(target):
local_fs.rm(target, recursive=True)
def _glob_edge_cases_files(self, some_fs, some_join, some_path):
"""
Scenario that is used for glob edge cases cp/get/put tests.
Creates the following directory and file structure:
📁 source
├── 📄 file1
├── 📄 file2
├── 📁 subdir0
│ ├── 📄 subfile1
│ ├── 📄 subfile2
│ └── 📁 nesteddir
│ └── 📄 nestedfile
└── 📁 subdir1
├── 📄 subfile1
├── 📄 subfile2
└── 📁 nesteddir
└── 📄 nestedfile
"""
source = some_join(some_path, "source")
some_fs.touch(some_join(source, "file1"))
some_fs.touch(some_join(source, "file2"))
for subdir_idx in range(2):
subdir = some_join(source, f"subdir{subdir_idx}")
nesteddir = some_join(subdir, "nesteddir")
some_fs.makedirs(nesteddir)
some_fs.touch(some_join(subdir, "subfile1"))
some_fs.touch(some_join(subdir, "subfile2"))
some_fs.touch(some_join(nesteddir, "nestedfile"))
return source
def _bulk_operations_scenario_0(self, some_fs, some_join, some_path):
"""
Scenario that is used for many cp/get/put tests. Creates the following
directory and file structure:
📁 source
├── 📄 file1
├── 📄 file2
└── 📁 subdir
├── 📄 subfile1
├── 📄 subfile2
└── 📁 nesteddir
└── 📄 nestedfile
"""
source = some_join(some_path, "source")
subdir = some_join(source, "subdir")
nesteddir = some_join(subdir, "nesteddir")
some_fs.makedirs(nesteddir)
some_fs.touch(some_join(source, "file1"))
some_fs.touch(some_join(source, "file2"))
some_fs.touch(some_join(subdir, "subfile1"))
some_fs.touch(some_join(subdir, "subfile2"))
some_fs.touch(some_join(nesteddir, "nestedfile"))
return source
def _dir_and_file_with_same_name_prefix(self, some_fs, some_join, some_path):
"""
Scenario that is used to check cp/get/put on directory and file with
the same name prefixes. Creates the following directory and file structure:
📁 source
├── 📄 subdir.txt
└── 📁 subdir
└── 📄 subfile.txt
"""
source = some_join(some_path, "source")
subdir = some_join(source, "subdir")
file = some_join(source, "subdir.txt")
subfile = some_join(subdir, "subfile.txt")
some_fs.makedirs(subdir)
some_fs.touch(file)
some_fs.touch(subfile)
return source
def _10_files_with_hashed_names(self, some_fs, some_join, some_path):
"""
Scenario that is used to check cp/get/put files order when source and
destination are lists. Creates the following directory and file structure:
📁 source
└── 📄 {hashed([0-9])}.txt
"""
source = some_join(some_path, "source")
for i in range(10):
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
path = some_join(source, f"{hashed_i}.txt")
some_fs.pipe(path=path, value=f"{i}".encode())
return source
class AbstractFixtures(BaseAbstractFixtures):
"""
Abstract base class containing fixtures that may be overridden in derived
filesystem-specific classes to run the abstract tests on such filesystems.
For any particular filesystem some of these fixtures must be overridden,
such as ``fs`` and ``fs_path``, and others may be overridden if the
default functions here are not appropriate, such as ``fs_join``.
"""
@pytest.fixture
def fs(self):
raise NotImplementedError("This function must be overridden in derived classes")
@pytest.fixture
def fs_join(self):
"""
Return a function that joins its arguments together into a path.
Most fsspec implementations join paths in a platform-dependent way,
but some will override this to always use a forward slash.
"""
return os.path.join
@pytest.fixture
def fs_path(self):
raise NotImplementedError("This function must be overridden in derived classes")
@pytest.fixture(scope="class")
def local_fs(self):
# Maybe need an option for auto_mkdir=False? This is only relevant
# for certain implementations.
return LocalFileSystem(auto_mkdir=True)
@pytest.fixture
def local_join(self):
"""
Return a function that joins its arguments together into a path, on
the local filesystem.
"""
return os.path.join
@pytest.fixture
def local_path(self, tmpdir):
return tmpdir
@pytest.fixture
def supports_empty_directories(self):
"""
Return whether this implementation supports empty directories.
"""
return True
@pytest.fixture
def fs_sanitize_path(self):
return lambda x: x
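As a sketch of how a filesystem-specific suite is expected to wire these fixtures up (the class names, the use of the in-memory filesystem and the random path prefix are illustrative assumptions, not part of the file above): override ``fs`` and ``fs_path``, optionally ``fs_join``, then mix the fixture class into the abstract test classes.

import secrets

import pytest

import fsspec
from fsspec.tests.abstract import AbstractFixtures
from fsspec.tests.abstract.copy import AbstractCopyTests


class MemoryFixtures(AbstractFixtures):
    @pytest.fixture
    def fs(self):
        return fsspec.filesystem("memory")

    @pytest.fixture
    def fs_join(self):
        # MemoryFileSystem paths always use forward slashes.
        return lambda *parts: "/".join(parts)

    @pytest.fixture
    def fs_path(self):
        # Fresh prefix so scenarios from different runs do not collide.
        return f"/abstract-{secrets.token_hex(4)}"


class TestMemoryCopy(AbstractCopyTests, MemoryFixtures):
    pass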

View File

@@ -0,0 +1,175 @@
GLOB_EDGE_CASES_TESTS = {
"argnames": ("path", "recursive", "maxdepth", "expected"),
"argvalues": [
("fil?1", False, None, ["file1"]),
("fil?1", True, None, ["file1"]),
("file[1-2]", False, None, ["file1", "file2"]),
("file[1-2]", True, None, ["file1", "file2"]),
("*", False, None, ["file1", "file2"]),
(
"*",
True,
None,
[
"file1",
"file2",
"subdir0/subfile1",
"subdir0/subfile2",
"subdir0/nesteddir/nestedfile",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
("*", True, 1, ["file1", "file2"]),
(
"*",
True,
2,
[
"file1",
"file2",
"subdir0/subfile1",
"subdir0/subfile2",
"subdir1/subfile1",
"subdir1/subfile2",
],
),
("*1", False, None, ["file1"]),
(
"*1",
True,
None,
[
"file1",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
("*1", True, 2, ["file1", "subdir1/subfile1", "subdir1/subfile2"]),
(
"**",
False,
None,
[
"file1",
"file2",
"subdir0/subfile1",
"subdir0/subfile2",
"subdir0/nesteddir/nestedfile",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
(
"**",
True,
None,
[
"file1",
"file2",
"subdir0/subfile1",
"subdir0/subfile2",
"subdir0/nesteddir/nestedfile",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
("**", True, 1, ["file1", "file2"]),
(
"**",
True,
2,
[
"file1",
"file2",
"subdir0/subfile1",
"subdir0/subfile2",
"subdir0/nesteddir/nestedfile",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
(
"**",
False,
2,
[
"file1",
"file2",
"subdir0/subfile1",
"subdir0/subfile2",
"subdir1/subfile1",
"subdir1/subfile2",
],
),
("**/*1", False, None, ["file1", "subdir0/subfile1", "subdir1/subfile1"]),
(
"**/*1",
True,
None,
[
"file1",
"subdir0/subfile1",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
("**/*1", True, 1, ["file1"]),
(
"**/*1",
True,
2,
["file1", "subdir0/subfile1", "subdir1/subfile1", "subdir1/subfile2"],
),
("**/*1", False, 2, ["file1", "subdir0/subfile1", "subdir1/subfile1"]),
("**/subdir0", False, None, []),
("**/subdir0", True, None, ["subfile1", "subfile2", "nesteddir/nestedfile"]),
("**/subdir0/nested*", False, 2, []),
("**/subdir0/nested*", True, 2, ["nestedfile"]),
("subdir[1-2]", False, None, []),
("subdir[1-2]", True, None, ["subfile1", "subfile2", "nesteddir/nestedfile"]),
("subdir[1-2]", True, 2, ["subfile1", "subfile2"]),
("subdir[0-1]", False, None, []),
(
"subdir[0-1]",
True,
None,
[
"subdir0/subfile1",
"subdir0/subfile2",
"subdir0/nesteddir/nestedfile",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
(
"subdir[0-1]/*fil[e]*",
False,
None,
[
"subdir0/subfile1",
"subdir0/subfile2",
"subdir1/subfile1",
"subdir1/subfile2",
],
),
(
"subdir[0-1]/*fil[e]*",
True,
None,
[
"subdir0/subfile1",
"subdir0/subfile2",
"subdir1/subfile1",
"subdir1/subfile2",
],
),
],
}
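Each ``argvalues`` row above corresponds to one cp/get/put call against the tree built by ``_glob_edge_cases_files``. Roughly, for the copy case (a sketch; ``fs``, ``fs_join``, ``source`` and ``target`` are assumed to come from the fixtures shown earlier):

# One row, e.g. ("*", True, 1, ["file1", "file2"]), is exercised as:
path, recursive, maxdepth, expected = ("*", True, 1, ["file1", "file2"])
fs.copy(fs_join(source, path), target, recursive=recursive, maxdepth=maxdepth)
assert sorted(fs.find(target)) == sorted(fs_join(target, p) for p in expected)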

View File

@@ -0,0 +1,557 @@
from hashlib import md5
from itertools import product
import pytest
from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
class AbstractCopyTests:
def test_copy_file_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
fs_target,
supports_empty_directories,
):
# Copy scenario 1a
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
fs.touch(fs_join(target, "dummy"))
assert fs.isdir(target)
target_file2 = fs_join(target, "file2")
target_subfile1 = fs_join(target, "subfile1")
# Copy from source directory
fs.cp(fs_join(source, "file2"), target)
assert fs.isfile(target_file2)
# Copy from sub directory
fs.cp(fs_join(source, "subdir", "subfile1"), target)
assert fs.isfile(target_subfile1)
# Remove copied files
fs.rm([target_file2, target_subfile1])
assert not fs.exists(target_file2)
assert not fs.exists(target_subfile1)
# Repeat with trailing slash on target
fs.cp(fs_join(source, "file2"), target + "/")
assert fs.isdir(target)
assert fs.isfile(target_file2)
fs.cp(fs_join(source, "subdir", "subfile1"), target + "/")
assert fs.isfile(target_subfile1)
def test_copy_file_to_new_directory(
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
):
# Copy scenario 1b
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
fs.cp(
fs_join(source, "subdir", "subfile1"), fs_join(target, "newdir/")
) # Note trailing slash
assert fs.isdir(target)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
def test_copy_file_to_file_in_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
fs_target,
supports_empty_directories,
):
# Copy scenario 1c
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
fs.touch(fs_join(target, "dummy"))
assert fs.isdir(target)
fs.cp(fs_join(source, "subdir", "subfile1"), fs_join(target, "newfile"))
assert fs.isfile(fs_join(target, "newfile"))
def test_copy_file_to_file_in_new_directory(
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
):
# Copy scenario 1d
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
fs.cp(
fs_join(source, "subdir", "subfile1"), fs_join(target, "newdir", "newfile")
)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "newfile"))
def test_copy_directory_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
fs_target,
supports_empty_directories,
):
# Copy scenario 1e
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
dummy = fs_join(target, "dummy")
fs.touch(dummy)
assert fs.isdir(target)
for source_slash, target_slash in zip([False, True], [False, True]):
s = fs_join(source, "subdir")
if source_slash:
s += "/"
t = target + "/" if target_slash else target
# Without recursive does nothing
fs.cp(s, t)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# With recursive
fs.cp(s, t, recursive=True)
if source_slash:
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert fs.isdir(fs_join(target, "nesteddir"))
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
fs_join(target, "nesteddir"),
],
recursive=True,
)
else:
assert fs.isdir(fs_join(target, "subdir"))
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
assert fs.isdir(fs_join(target, "subdir", "nesteddir"))
assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile"))
fs.rm(fs_join(target, "subdir"), recursive=True)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# Limit recursive by maxdepth
fs.cp(s, t, recursive=True, maxdepth=1)
if source_slash:
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert not fs.exists(fs_join(target, "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
],
recursive=True,
)
else:
assert fs.isdir(fs_join(target, "subdir"))
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
assert not fs.exists(fs_join(target, "subdir", "nesteddir"))
fs.rm(fs_join(target, "subdir"), recursive=True)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
def test_copy_directory_to_new_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
fs_target,
supports_empty_directories,
):
# Copy scenario 1f
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
for source_slash, target_slash in zip([False, True], [False, True]):
s = fs_join(source, "subdir")
if source_slash:
s += "/"
t = fs_join(target, "newdir")
if target_slash:
t += "/"
# Without recursive does nothing
fs.cp(s, t)
if supports_empty_directories:
assert fs.ls(target) == []
else:
with pytest.raises(FileNotFoundError):
fs.ls(target)
# With recursive
fs.cp(s, t, recursive=True)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
# Limit recursive by maxdepth
fs.cp(s, t, recursive=True, maxdepth=1)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
def test_copy_glob_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
fs_target,
supports_empty_directories,
):
# Copy scenario 1g
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
dummy = fs_join(target, "dummy")
fs.touch(dummy)
assert fs.isdir(target)
for target_slash in [False, True]:
t = target + "/" if target_slash else target
# Without recursive
fs.cp(fs_join(source, "subdir", "*"), t)
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert not fs.isdir(fs_join(target, "nesteddir"))
assert not fs.exists(fs_join(target, "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# With recursive
for glob, recursive in zip(["*", "**"], [True, False]):
fs.cp(fs_join(source, "subdir", glob), t, recursive=recursive)
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert fs.isdir(fs_join(target, "nesteddir"))
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
fs_join(target, "nesteddir"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# Limit recursive by maxdepth
fs.cp(
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
)
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert not fs.exists(fs_join(target, "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
def test_copy_glob_to_new_directory(
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
):
# Copy scenario 1h
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
for target_slash in [False, True]:
t = fs_join(target, "newdir")
if target_slash:
t += "/"
# Without recursive
fs.cp(fs_join(source, "subdir", "*"), t)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
assert not fs.exists(fs_join(target, "newdir", "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
# With recursive
for glob, recursive in zip(["*", "**"], [True, False]):
fs.cp(fs_join(source, "subdir", glob), t, recursive=recursive)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
assert not fs.exists(fs_join(target, "newdir", "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
# Limit recursive by maxdepth
fs.cp(
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
assert not fs.exists(fs_join(target, "newdir", "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
@pytest.mark.parametrize(
GLOB_EDGE_CASES_TESTS["argnames"],
GLOB_EDGE_CASES_TESTS["argvalues"],
)
def test_copy_glob_edge_cases(
self,
path,
recursive,
maxdepth,
expected,
fs,
fs_join,
fs_glob_edge_cases_files,
fs_target,
fs_sanitize_path,
):
# Copy scenario 1g
source = fs_glob_edge_cases_files
target = fs_target
for new_dir, target_slash in product([True, False], [True, False]):
fs.mkdir(target)
t = fs_join(target, "newdir") if new_dir else target
t = t + "/" if target_slash else t
fs.copy(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
output = fs.find(target)
if new_dir:
prefixed_expected = [
fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected
]
else:
prefixed_expected = [
fs_sanitize_path(fs_join(target, p)) for p in expected
]
assert sorted(output) == sorted(prefixed_expected)
try:
fs.rm(target, recursive=True)
except FileNotFoundError:
pass
def test_copy_list_of_files_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
fs_target,
supports_empty_directories,
):
# Copy scenario 2a
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
dummy = fs_join(target, "dummy")
fs.touch(dummy)
assert fs.isdir(target)
source_files = [
fs_join(source, "file1"),
fs_join(source, "file2"),
fs_join(source, "subdir", "subfile1"),
]
for target_slash in [False, True]:
t = target + "/" if target_slash else target
fs.cp(source_files, t)
assert fs.isfile(fs_join(target, "file1"))
assert fs.isfile(fs_join(target, "file2"))
assert fs.isfile(fs_join(target, "subfile1"))
fs.rm(
[
fs_join(target, "file1"),
fs_join(target, "file2"),
fs_join(target, "subfile1"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
def test_copy_list_of_files_to_new_directory(
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
):
# Copy scenario 2b
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
source_files = [
fs_join(source, "file1"),
fs_join(source, "file2"),
fs_join(source, "subdir", "subfile1"),
]
fs.cp(source_files, fs_join(target, "newdir") + "/") # Note trailing slash
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "file1"))
assert fs.isfile(fs_join(target, "newdir", "file2"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
def test_copy_two_files_new_directory(
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
):
# This is a duplicate of test_copy_list_of_files_to_new_directory and
# can eventually be removed.
source = fs_bulk_operations_scenario_0
target = fs_target
assert not fs.exists(target)
fs.cp([fs_join(source, "file1"), fs_join(source, "file2")], target)
assert fs.isdir(target)
assert fs.isfile(fs_join(target, "file1"))
assert fs.isfile(fs_join(target, "file2"))
def test_copy_directory_without_files_with_same_name_prefix(
self,
fs,
fs_join,
fs_target,
fs_dir_and_file_with_same_name_prefix,
supports_empty_directories,
):
# Create the test dirs
source = fs_dir_and_file_with_same_name_prefix
target = fs_target
# Test without glob
fs.cp(fs_join(source, "subdir"), target, recursive=True)
assert fs.isfile(fs_join(target, "subfile.txt"))
assert not fs.isfile(fs_join(target, "subdir.txt"))
fs.rm([fs_join(target, "subfile.txt")])
if supports_empty_directories:
assert fs.ls(target) == []
else:
assert not fs.exists(target)
# Test with glob
fs.cp(fs_join(source, "subdir*"), target, recursive=True)
assert fs.isdir(fs_join(target, "subdir"))
assert fs.isfile(fs_join(target, "subdir", "subfile.txt"))
assert fs.isfile(fs_join(target, "subdir.txt"))
def test_copy_with_source_and_destination_as_list(
self, fs, fs_target, fs_join, fs_10_files_with_hashed_names
):
# Create the test dir
source = fs_10_files_with_hashed_names
target = fs_target
# Create list of files for source and destination
source_files = []
destination_files = []
for i in range(10):
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
source_files.append(fs_join(source, f"{hashed_i}.txt"))
destination_files.append(fs_join(target, f"{hashed_i}.txt"))
# Copy and assert order was kept
fs.copy(path1=source_files, path2=destination_files)
for i in range(10):
file_content = fs.cat(destination_files[i]).decode("utf-8")
assert file_content == str(i)

View File

@@ -0,0 +1,587 @@
from hashlib import md5
from itertools import product
import pytest
from fsspec.implementations.local import make_path_posix
from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
class AbstractGetTests:
def test_get_file_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1a
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
assert local_fs.isdir(target)
target_file2 = local_join(target, "file2")
target_subfile1 = local_join(target, "subfile1")
# Copy from source directory
fs.get(fs_join(source, "file2"), target)
assert local_fs.isfile(target_file2)
# Copy from sub directory
fs.get(fs_join(source, "subdir", "subfile1"), target)
assert local_fs.isfile(target_subfile1)
# Remove copied files
local_fs.rm([target_file2, target_subfile1])
assert not local_fs.exists(target_file2)
assert not local_fs.exists(target_subfile1)
# Repeat with trailing slash on target
fs.get(fs_join(source, "file2"), target + "/")
assert local_fs.isdir(target)
assert local_fs.isfile(target_file2)
fs.get(fs_join(source, "subdir", "subfile1"), target + "/")
assert local_fs.isfile(target_subfile1)
def test_get_file_to_new_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1b
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
fs.get(
fs_join(source, "subdir", "subfile1"), local_join(target, "newdir/")
) # Note trailing slash
assert local_fs.isdir(target)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
def test_get_file_to_file_in_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1c
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
fs.get(fs_join(source, "subdir", "subfile1"), local_join(target, "newfile"))
assert local_fs.isfile(local_join(target, "newfile"))
def test_get_file_to_file_in_new_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1d
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
fs.get(
fs_join(source, "subdir", "subfile1"),
local_join(target, "newdir", "newfile"),
)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "newfile"))
def test_get_directory_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1e
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
assert local_fs.isdir(target)
for source_slash, target_slash in zip([False, True], [False, True]):
s = fs_join(source, "subdir")
if source_slash:
s += "/"
t = target + "/" if target_slash else target
# Without recursive does nothing
fs.get(s, t)
assert local_fs.ls(target) == []
# With recursive
fs.get(s, t, recursive=True)
if source_slash:
assert local_fs.isfile(local_join(target, "subfile1"))
assert local_fs.isfile(local_join(target, "subfile2"))
assert local_fs.isdir(local_join(target, "nesteddir"))
assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(
[
local_join(target, "subfile1"),
local_join(target, "subfile2"),
local_join(target, "nesteddir"),
],
recursive=True,
)
else:
assert local_fs.isdir(local_join(target, "subdir"))
assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
assert local_fs.isdir(local_join(target, "subdir", "nesteddir"))
assert local_fs.isfile(
local_join(target, "subdir", "nesteddir", "nestedfile")
)
local_fs.rm(local_join(target, "subdir"), recursive=True)
assert local_fs.ls(target) == []
# Limit recursive by maxdepth
fs.get(s, t, recursive=True, maxdepth=1)
if source_slash:
assert local_fs.isfile(local_join(target, "subfile1"))
assert local_fs.isfile(local_join(target, "subfile2"))
assert not local_fs.exists(local_join(target, "nesteddir"))
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(
[
local_join(target, "subfile1"),
local_join(target, "subfile2"),
],
recursive=True,
)
else:
assert local_fs.isdir(local_join(target, "subdir"))
assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
assert not local_fs.exists(local_join(target, "subdir", "nesteddir"))
local_fs.rm(local_join(target, "subdir"), recursive=True)
assert local_fs.ls(target) == []
def test_get_directory_to_new_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1f
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
for source_slash, target_slash in zip([False, True], [False, True]):
s = fs_join(source, "subdir")
if source_slash:
s += "/"
t = local_join(target, "newdir")
if target_slash:
t += "/"
# Without recursive does nothing
fs.get(s, t)
assert local_fs.ls(target) == []
# With recursive
fs.get(s, t, recursive=True)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
assert local_fs.isfile(
local_join(target, "newdir", "nesteddir", "nestedfile")
)
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(local_join(target, "newdir"), recursive=True)
assert local_fs.ls(target) == []
# Limit recursive by maxdepth
fs.get(s, t, recursive=True, maxdepth=1)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(local_join(target, "newdir"), recursive=True)
assert not local_fs.exists(local_join(target, "newdir"))
def test_get_glob_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1g
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
for target_slash in [False, True]:
t = target + "/" if target_slash else target
# Without recursive
fs.get(fs_join(source, "subdir", "*"), t)
assert local_fs.isfile(local_join(target, "subfile1"))
assert local_fs.isfile(local_join(target, "subfile2"))
assert not local_fs.isdir(local_join(target, "nesteddir"))
assert not local_fs.exists(local_join(target, "nesteddir", "nestedfile"))
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(
[
local_join(target, "subfile1"),
local_join(target, "subfile2"),
],
recursive=True,
)
assert local_fs.ls(target) == []
# With recursive
for glob, recursive in zip(["*", "**"], [True, False]):
fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
assert local_fs.isfile(local_join(target, "subfile1"))
assert local_fs.isfile(local_join(target, "subfile2"))
assert local_fs.isdir(local_join(target, "nesteddir"))
assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(
[
local_join(target, "subfile1"),
local_join(target, "subfile2"),
local_join(target, "nesteddir"),
],
recursive=True,
)
assert local_fs.ls(target) == []
# Limit recursive by maxdepth
fs.get(
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
)
assert local_fs.isfile(local_join(target, "subfile1"))
assert local_fs.isfile(local_join(target, "subfile2"))
assert not local_fs.exists(local_join(target, "nesteddir"))
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(
[
local_join(target, "subfile1"),
local_join(target, "subfile2"),
],
recursive=True,
)
assert local_fs.ls(target) == []
def test_get_glob_to_new_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1h
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
for target_slash in [False, True]:
t = fs_join(target, "newdir")
if target_slash:
t += "/"
# Without recursive
fs.get(fs_join(source, "subdir", "*"), t)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
assert not local_fs.exists(
local_join(target, "newdir", "nesteddir", "nestedfile")
)
assert not local_fs.exists(local_join(target, "subdir"))
assert not local_fs.exists(local_join(target, "newdir", "subdir"))
local_fs.rm(local_join(target, "newdir"), recursive=True)
assert local_fs.ls(target) == []
# With recursive
for glob, recursive in zip(["*", "**"], [True, False]):
fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
assert local_fs.isfile(
local_join(target, "newdir", "nesteddir", "nestedfile")
)
assert not local_fs.exists(local_join(target, "subdir"))
assert not local_fs.exists(local_join(target, "newdir", "subdir"))
local_fs.rm(local_join(target, "newdir"), recursive=True)
assert not local_fs.exists(local_join(target, "newdir"))
# Limit recursive by maxdepth
fs.get(
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
assert not local_fs.exists(local_join(target, "subdir"))
assert not local_fs.exists(local_join(target, "newdir", "subdir"))
local_fs.rm(local_fs.ls(target, detail=False), recursive=True)
assert not local_fs.exists(local_join(target, "newdir"))
@pytest.mark.parametrize(
GLOB_EDGE_CASES_TESTS["argnames"],
GLOB_EDGE_CASES_TESTS["argvalues"],
)
def test_get_glob_edge_cases(
self,
path,
recursive,
maxdepth,
expected,
fs,
fs_join,
fs_glob_edge_cases_files,
local_fs,
local_join,
local_target,
):
# Copy scenario 1g
source = fs_glob_edge_cases_files
target = local_target
for new_dir, target_slash in product([True, False], [True, False]):
local_fs.mkdir(target)
t = local_join(target, "newdir") if new_dir else target
t = t + "/" if target_slash else t
fs.get(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
output = local_fs.find(target)
if new_dir:
prefixed_expected = [
make_path_posix(local_join(target, "newdir", p)) for p in expected
]
else:
prefixed_expected = [
make_path_posix(local_join(target, p)) for p in expected
]
assert sorted(output) == sorted(prefixed_expected)
try:
local_fs.rm(target, recursive=True)
except FileNotFoundError:
pass
def test_get_list_of_files_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 2a
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
source_files = [
fs_join(source, "file1"),
fs_join(source, "file2"),
fs_join(source, "subdir", "subfile1"),
]
for target_slash in [False, True]:
t = target + "/" if target_slash else target
fs.get(source_files, t)
assert local_fs.isfile(local_join(target, "file1"))
assert local_fs.isfile(local_join(target, "file2"))
assert local_fs.isfile(local_join(target, "subfile1"))
local_fs.rm(
[
local_join(target, "file1"),
local_join(target, "file2"),
local_join(target, "subfile1"),
],
recursive=True,
)
assert local_fs.ls(target) == []
def test_get_list_of_files_to_new_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 2b
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
source_files = [
fs_join(source, "file1"),
fs_join(source, "file2"),
fs_join(source, "subdir", "subfile1"),
]
fs.get(source_files, local_join(target, "newdir") + "/") # Note trailing slash
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "file1"))
assert local_fs.isfile(local_join(target, "newdir", "file2"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
def test_get_directory_recursive(
self, fs, fs_join, fs_path, local_fs, local_join, local_target
):
# https://github.com/fsspec/filesystem_spec/issues/1062
# Recursive cp/get/put of source directory into non-existent target directory.
src = fs_join(fs_path, "src")
src_file = fs_join(src, "file")
fs.mkdir(src)
fs.touch(src_file)
target = local_target
# get without slash
assert not local_fs.exists(target)
for loop in range(2):
fs.get(src, target, recursive=True)
assert local_fs.isdir(target)
if loop == 0:
assert local_fs.isfile(local_join(target, "file"))
assert not local_fs.exists(local_join(target, "src"))
else:
assert local_fs.isfile(local_join(target, "file"))
assert local_fs.isdir(local_join(target, "src"))
assert local_fs.isfile(local_join(target, "src", "file"))
local_fs.rm(target, recursive=True)
# get with slash
assert not local_fs.exists(target)
for loop in range(2):
fs.get(src + "/", target, recursive=True)
assert local_fs.isdir(target)
assert local_fs.isfile(local_join(target, "file"))
assert not local_fs.exists(local_join(target, "src"))
def test_get_directory_without_files_with_same_name_prefix(
self,
fs,
fs_join,
local_fs,
local_join,
local_target,
fs_dir_and_file_with_same_name_prefix,
):
# Create the test dirs
source = fs_dir_and_file_with_same_name_prefix
target = local_target
# Test without glob
fs.get(fs_join(source, "subdir"), target, recursive=True)
assert local_fs.isfile(local_join(target, "subfile.txt"))
assert not local_fs.isfile(local_join(target, "subdir.txt"))
local_fs.rm([local_join(target, "subfile.txt")])
assert local_fs.ls(target) == []
# Test with glob
fs.get(fs_join(source, "subdir*"), target, recursive=True)
assert local_fs.isdir(local_join(target, "subdir"))
assert local_fs.isfile(local_join(target, "subdir", "subfile.txt"))
assert local_fs.isfile(local_join(target, "subdir.txt"))
def test_get_with_source_and_destination_as_list(
self,
fs,
fs_join,
local_fs,
local_join,
local_target,
fs_10_files_with_hashed_names,
):
# Create the test dir
source = fs_10_files_with_hashed_names
target = local_target
# Create list of files for source and destination
source_files = []
destination_files = []
for i in range(10):
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
source_files.append(fs_join(source, f"{hashed_i}.txt"))
destination_files.append(
make_path_posix(local_join(target, f"{hashed_i}.txt"))
)
# Copy and assert order was kept
fs.get(rpath=source_files, lpath=destination_files)
for i in range(10):
file_content = local_fs.cat(destination_files[i]).decode("utf-8")
assert file_content == str(i)

View File

@@ -0,0 +1,57 @@
import os
import pytest
import fsspec
def test_move_raises_error_with_tmpdir(tmpdir):
# Create a file in the temporary directory
source = tmpdir.join("source_file.txt")
source.write("content")
# Define a destination that simulates a protected or invalid path
destination = tmpdir.join("non_existent_directory/destination_file.txt")
# Instantiate the filesystem (assuming the local file system interface)
fs = fsspec.filesystem("file")
# Use the actual file paths as strings
with pytest.raises(FileNotFoundError):
fs.mv(str(source), str(destination))
@pytest.mark.parametrize("recursive", (True, False))
def test_move_raises_error_with_tmpdir_permission(recursive, tmpdir):
# Create a file in the temporary directory
source = tmpdir.join("source_file.txt")
source.write("content")
# Create a protected directory (non-writable)
protected_dir = tmpdir.mkdir("protected_directory")
protected_path = str(protected_dir)
# Set the directory to read-only
if os.name == "nt":
os.system(f'icacls "{protected_path}" /deny Everyone:(W)')
else:
os.chmod(protected_path, 0o555) # Sets the directory to read-only
# Define a destination inside the protected directory
destination = protected_dir.join("destination_file.txt")
# Instantiate the filesystem (assuming the local file system interface)
fs = fsspec.filesystem("file")
# Try to move the file to the read-only directory, expecting a permission error
with pytest.raises(PermissionError):
fs.mv(str(source), str(destination), recursive=recursive)
# Assert the file was not created in the destination
assert not os.path.exists(destination)
# Cleanup: Restore permissions so the directory can be cleaned up
if os.name == "nt":
os.system(f'icacls "{protected_path}" /remove:d Everyone')
else:
os.chmod(protected_path, 0o755) # Restore write permission for cleanup

View File

@@ -0,0 +1,11 @@
import pytest
class AbstractOpenTests:
def test_open_exclusive(self, fs, fs_target):
with fs.open(fs_target, "wb") as f:
f.write(b"data")
with fs.open(fs_target, "rb") as f:
assert f.read() == b"data"
with pytest.raises(FileExistsError):
fs.open(fs_target, "xb")

View File

@@ -0,0 +1,11 @@
import pytest
class AbstractPipeTests:
def test_pipe_exclusive(self, fs, fs_target):
fs.pipe_file(fs_target, b"data")
assert fs.cat_file(fs_target) == b"data"
with pytest.raises(FileExistsError):
fs.pipe_file(fs_target, b"data", mode="create")
fs.pipe_file(fs_target, b"new data", mode="overwrite")
assert fs.cat_file(fs_target) == b"new data"

View File

@@ -0,0 +1,591 @@
from hashlib import md5
from itertools import product
import pytest
from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
class AbstractPutTests:
def test_put_file_to_existing_directory(
self,
fs,
fs_join,
fs_target,
local_join,
local_bulk_operations_scenario_0,
supports_empty_directories,
):
# Copy scenario 1a
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
fs.touch(fs_join(target, "dummy"))
assert fs.isdir(target)
target_file2 = fs_join(target, "file2")
target_subfile1 = fs_join(target, "subfile1")
# Copy from source directory
fs.put(local_join(source, "file2"), target)
assert fs.isfile(target_file2)
# Copy from sub directory
fs.put(local_join(source, "subdir", "subfile1"), target)
assert fs.isfile(target_subfile1)
# Remove copied files
fs.rm([target_file2, target_subfile1])
assert not fs.exists(target_file2)
assert not fs.exists(target_subfile1)
# Repeat with trailing slash on target
fs.put(local_join(source, "file2"), target + "/")
assert fs.isdir(target)
assert fs.isfile(target_file2)
fs.put(local_join(source, "subdir", "subfile1"), target + "/")
assert fs.isfile(target_subfile1)
def test_put_file_to_new_directory(
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
):
# Copy scenario 1b
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
fs.put(
local_join(source, "subdir", "subfile1"), fs_join(target, "newdir/")
) # Note trailing slash
assert fs.isdir(target)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
def test_put_file_to_file_in_existing_directory(
self,
fs,
fs_join,
fs_target,
local_join,
supports_empty_directories,
local_bulk_operations_scenario_0,
):
# Copy scenario 1c
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
fs.touch(fs_join(target, "dummy"))
assert fs.isdir(target)
fs.put(local_join(source, "subdir", "subfile1"), fs_join(target, "newfile"))
assert fs.isfile(fs_join(target, "newfile"))
def test_put_file_to_file_in_new_directory(
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
):
# Copy scenario 1d
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
fs.put(
local_join(source, "subdir", "subfile1"),
fs_join(target, "newdir", "newfile"),
)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "newfile"))
def test_put_directory_to_existing_directory(
self,
fs,
fs_join,
fs_target,
local_bulk_operations_scenario_0,
supports_empty_directories,
):
# Copy scenario 1e
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
dummy = fs_join(target, "dummy")
fs.touch(dummy)
assert fs.isdir(target)
for source_slash, target_slash in zip([False, True], [False, True]):
s = fs_join(source, "subdir")
if source_slash:
s += "/"
t = target + "/" if target_slash else target
# Without recursive does nothing
fs.put(s, t)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# With recursive
fs.put(s, t, recursive=True)
if source_slash:
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert fs.isdir(fs_join(target, "nesteddir"))
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
fs_join(target, "nesteddir"),
],
recursive=True,
)
else:
assert fs.isdir(fs_join(target, "subdir"))
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
assert fs.isdir(fs_join(target, "subdir", "nesteddir"))
assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile"))
fs.rm(fs_join(target, "subdir"), recursive=True)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# Limit recursive by maxdepth
fs.put(s, t, recursive=True, maxdepth=1)
if source_slash:
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert not fs.exists(fs_join(target, "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
],
recursive=True,
)
else:
assert fs.isdir(fs_join(target, "subdir"))
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
assert not fs.exists(fs_join(target, "subdir", "nesteddir"))
fs.rm(fs_join(target, "subdir"), recursive=True)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
def test_put_directory_to_new_directory(
self,
fs,
fs_join,
fs_target,
local_bulk_operations_scenario_0,
supports_empty_directories,
):
# Copy scenario 1f
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
for source_slash, target_slash in zip([False, True], [False, True]):
s = fs_join(source, "subdir")
if source_slash:
s += "/"
t = fs_join(target, "newdir")
if target_slash:
t += "/"
# Without recursive does nothing
fs.put(s, t)
if supports_empty_directories:
assert fs.ls(target) == []
else:
with pytest.raises(FileNotFoundError):
fs.ls(target)
# With recursive
fs.put(s, t, recursive=True)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
# Limit recursive by maxdepth
fs.put(s, t, recursive=True, maxdepth=1)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
def test_put_glob_to_existing_directory(
self,
fs,
fs_join,
fs_target,
local_join,
supports_empty_directories,
local_bulk_operations_scenario_0,
):
# Copy scenario 1g
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
dummy = fs_join(target, "dummy")
fs.touch(dummy)
assert fs.isdir(target)
for target_slash in [False, True]:
t = target + "/" if target_slash else target
# Without recursive
fs.put(local_join(source, "subdir", "*"), t)
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert not fs.isdir(fs_join(target, "nesteddir"))
assert not fs.exists(fs_join(target, "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# With recursive
for glob, recursive in zip(["*", "**"], [True, False]):
fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert fs.isdir(fs_join(target, "nesteddir"))
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
fs_join(target, "nesteddir"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# Limit recursive by maxdepth
fs.put(
local_join(source, "subdir", glob),
t,
recursive=recursive,
maxdepth=1,
)
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert not fs.exists(fs_join(target, "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
def test_put_glob_to_new_directory(
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
):
# Copy scenario 1h
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
for target_slash in [False, True]:
t = fs_join(target, "newdir")
if target_slash:
t += "/"
# Without recursive
fs.put(local_join(source, "subdir", "*"), t)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
assert not fs.exists(fs_join(target, "newdir", "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
# With recursive
for glob, recursive in zip(["*", "**"], [True, False]):
fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
assert not fs.exists(fs_join(target, "newdir", "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
# Limit recursive by maxdepth
fs.put(
local_join(source, "subdir", glob),
t,
recursive=recursive,
maxdepth=1,
)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
assert not fs.exists(fs_join(target, "newdir", "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
@pytest.mark.parametrize(
GLOB_EDGE_CASES_TESTS["argnames"],
GLOB_EDGE_CASES_TESTS["argvalues"],
)
def test_put_glob_edge_cases(
self,
path,
recursive,
maxdepth,
expected,
fs,
fs_join,
fs_target,
local_glob_edge_cases_files,
local_join,
fs_sanitize_path,
):
# Copy scenario 1g
source = local_glob_edge_cases_files
target = fs_target
for new_dir, target_slash in product([True, False], [True, False]):
fs.mkdir(target)
t = fs_join(target, "newdir") if new_dir else target
t = t + "/" if target_slash else t
fs.put(local_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
output = fs.find(target)
if new_dir:
prefixed_expected = [
fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected
]
else:
prefixed_expected = [
fs_sanitize_path(fs_join(target, p)) for p in expected
]
assert sorted(output) == sorted(prefixed_expected)
try:
fs.rm(target, recursive=True)
except FileNotFoundError:
pass
def test_put_list_of_files_to_existing_directory(
self,
fs,
fs_join,
fs_target,
local_join,
local_bulk_operations_scenario_0,
supports_empty_directories,
):
# Copy scenario 2a
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
dummy = fs_join(target, "dummy")
fs.touch(dummy)
assert fs.isdir(target)
source_files = [
local_join(source, "file1"),
local_join(source, "file2"),
local_join(source, "subdir", "subfile1"),
]
for target_slash in [False, True]:
t = target + "/" if target_slash else target
fs.put(source_files, t)
assert fs.isfile(fs_join(target, "file1"))
assert fs.isfile(fs_join(target, "file2"))
assert fs.isfile(fs_join(target, "subfile1"))
fs.rm(
[
fs_join(target, "file1"),
fs_join(target, "file2"),
fs_join(target, "subfile1"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
def test_put_list_of_files_to_new_directory(
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
):
# Copy scenario 2b
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
source_files = [
local_join(source, "file1"),
local_join(source, "file2"),
local_join(source, "subdir", "subfile1"),
]
fs.put(source_files, fs_join(target, "newdir") + "/") # Note trailing slash
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "file1"))
assert fs.isfile(fs_join(target, "newdir", "file2"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
def test_put_directory_recursive(
self, fs, fs_join, fs_target, local_fs, local_join, local_path
):
# https://github.com/fsspec/filesystem_spec/issues/1062
# Recursive cp/get/put of source directory into non-existent target directory.
src = local_join(local_path, "src")
src_file = local_join(src, "file")
local_fs.mkdir(src)
local_fs.touch(src_file)
target = fs_target
# put without slash
assert not fs.exists(target)
for loop in range(2):
fs.put(src, target, recursive=True)
assert fs.isdir(target)
if loop == 0:
assert fs.isfile(fs_join(target, "file"))
assert not fs.exists(fs_join(target, "src"))
else:
assert fs.isfile(fs_join(target, "file"))
assert fs.isdir(fs_join(target, "src"))
assert fs.isfile(fs_join(target, "src", "file"))
fs.rm(target, recursive=True)
# put with slash
assert not fs.exists(target)
for loop in range(2):
fs.put(src + "/", target, recursive=True)
assert fs.isdir(target)
assert fs.isfile(fs_join(target, "file"))
assert not fs.exists(fs_join(target, "src"))
def test_put_directory_without_files_with_same_name_prefix(
self,
fs,
fs_join,
fs_target,
local_join,
local_dir_and_file_with_same_name_prefix,
supports_empty_directories,
):
# Create the test dirs
source = local_dir_and_file_with_same_name_prefix
target = fs_target
# Test without glob
fs.put(local_join(source, "subdir"), fs_target, recursive=True)
assert fs.isfile(fs_join(fs_target, "subfile.txt"))
assert not fs.isfile(fs_join(fs_target, "subdir.txt"))
fs.rm([fs_join(target, "subfile.txt")])
if supports_empty_directories:
assert fs.ls(target) == []
else:
assert not fs.exists(target)
# Test with glob
fs.put(local_join(source, "subdir*"), fs_target, recursive=True)
assert fs.isdir(fs_join(fs_target, "subdir"))
assert fs.isfile(fs_join(fs_target, "subdir", "subfile.txt"))
assert fs.isfile(fs_join(fs_target, "subdir.txt"))
def test_copy_with_source_and_destination_as_list(
self, fs, fs_target, fs_join, local_join, local_10_files_with_hashed_names
):
# Create the test dir
source = local_10_files_with_hashed_names
target = fs_target
# Create list of files for source and destination
source_files = []
destination_files = []
for i in range(10):
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
source_files.append(local_join(source, f"{hashed_i}.txt"))
destination_files.append(fs_join(target, f"{hashed_i}.txt"))
# Copy and assert order was kept
fs.put(lpath=source_files, rpath=destination_files)
for i in range(10):
file_content = fs.cat(destination_files[i]).decode("utf-8")
assert file_content == str(i)

View File

@@ -0,0 +1,90 @@
from collections import deque
class Transaction:
"""Filesystem transaction write context
Gathers files for deferred commit or discard, so that several write
operations can be finalized semi-atomically. This works by having this
instance as the ``.transaction`` attribute of the given filesystem
"""
def __init__(self, fs, **kwargs):
"""
Parameters
----------
fs: FileSystem instance
"""
self.fs = fs
self.files = deque()
def __enter__(self):
self.start()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""End transaction and commit, if exit is not due to exception"""
# only commit if there was no exception
self.complete(commit=exc_type is None)
if self.fs:
self.fs._intrans = False
self.fs._transaction = None
self.fs = None
def start(self):
"""Start a transaction on this FileSystem"""
self.files = deque() # clean up after previous failed completions
self.fs._intrans = True
def complete(self, commit=True):
"""Finish transaction: commit or discard all deferred files"""
while self.files:
f = self.files.popleft()
if commit:
f.commit()
else:
f.discard()
self.fs._intrans = False
self.fs._transaction = None
self.fs = None
class FileActor:
def __init__(self):
self.files = []
def commit(self):
for f in self.files:
f.commit()
self.files.clear()
def discard(self):
for f in self.files:
f.discard()
self.files.clear()
def append(self, f):
self.files.append(f)
class DaskTransaction(Transaction):
def __init__(self, fs):
"""
Parameters
----------
fs: FileSystem instance
"""
import distributed
super().__init__(fs)
client = distributed.default_client()
self.files = client.submit(FileActor, actor=True).result()
def complete(self, commit=True):
"""Finish transaction: commit or discard all deferred files"""
if commit:
self.files.commit().result()
else:
self.files.discard().result()
self.fs._intrans = False
self.fs = None
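# [Editor's note] Illustrative sketch only, not part of the upstream module.
# A filesystem exposes a Transaction through its ``transaction`` attribute, so
# deferred commit/discard is normally driven as a context manager, e.g.:
#
#     >>> import fsspec                                    # doctest: +SKIP
#     >>> fs = fsspec.filesystem("memory")                 # doctest: +SKIP
#     >>> with fs.transaction:                             # doctest: +SKIP
#     ...     with fs.open("/staged.txt", "wb") as f:
#     ...         _ = f.write(b"data")   # write is deferred inside the block
#     >>> # committed on clean exit; discarded if the block raised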

View File

@@ -0,0 +1,745 @@
from __future__ import annotations
import contextlib
import logging
import math
import os
import re
import sys
import tempfile
from collections.abc import Iterable, Iterator, Sequence
from functools import partial
from hashlib import md5
from importlib.metadata import version
from typing import (
IO,
TYPE_CHECKING,
Any,
Callable,
TypeVar,
)
from urllib.parse import urlsplit
if TYPE_CHECKING:
import pathlib
from typing_extensions import TypeGuard
from fsspec.spec import AbstractFileSystem
DEFAULT_BLOCK_SIZE = 5 * 2**20
T = TypeVar("T")
def infer_storage_options(
urlpath: str, inherit_storage_options: dict[str, Any] | None = None
) -> dict[str, Any]:
"""Infer storage options from URL path and merge it with existing storage
options.
Parameters
----------
urlpath: str or unicode
Either local absolute file path or URL (hdfs://namenode:8020/file.csv)
inherit_storage_options: dict (optional)
Its contents will get merged with the inferred information from the
given path
Returns
-------
Storage options dict.
Examples
--------
>>> infer_storage_options('/mnt/datasets/test.csv') # doctest: +SKIP
{"protocol": "file", "path", "/mnt/datasets/test.csv"}
>>> infer_storage_options(
... 'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1',
... inherit_storage_options={'extra': 'value'},
... ) # doctest: +SKIP
{"protocol": "hdfs", "username": "username", "password": "pwd",
"host": "node", "port": 123, "path": "/mnt/datasets/test.csv",
"url_query": "q=1", "extra": "value"}
"""
# Handle Windows paths including disk name in this special case
if (
re.match(r"^[a-zA-Z]:[\\/]", urlpath)
or re.match(r"^[a-zA-Z0-9]+://", urlpath) is None
):
return {"protocol": "file", "path": urlpath}
parsed_path = urlsplit(urlpath)
protocol = parsed_path.scheme or "file"
if parsed_path.fragment:
path = "#".join([parsed_path.path, parsed_path.fragment])
else:
path = parsed_path.path
if protocol == "file":
# Special case parsing file protocol URL on Windows according to:
# https://msdn.microsoft.com/en-us/library/jj710207.aspx
windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path)
if windows_path:
drive, path = windows_path.groups()
path = f"{drive}:{path}"
if protocol in ["http", "https"]:
# for HTTP, we don't want to parse, as requests will anyway
return {"protocol": protocol, "path": urlpath}
options: dict[str, Any] = {"protocol": protocol, "path": path}
if parsed_path.netloc:
# Parse `hostname` from netloc manually because `parsed_path.hostname`
# lowercases the hostname which is not always desirable (e.g. in S3):
# https://github.com/dask/dask/issues/1417
options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0]
if protocol in ("s3", "s3a", "gcs", "gs"):
options["path"] = options["host"] + options["path"]
if parsed_path.port:
options["port"] = parsed_path.port
if parsed_path.username:
options["username"] = parsed_path.username
if parsed_path.password:
options["password"] = parsed_path.password
if parsed_path.query:
options["url_query"] = parsed_path.query
if parsed_path.fragment:
options["url_fragment"] = parsed_path.fragment
if inherit_storage_options:
update_storage_options(options, inherit_storage_options)
return options
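# [Editor's note] Quick illustration of the parsing above, including the branch
# that prepends the bucket/host for object stores (editor sketch):
#
#     >>> infer_storage_options("s3://bucket/key.csv")     # doctest: +SKIP
#     {'protocol': 's3', 'path': 'bucket/key.csv', 'host': 'bucket'}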
def update_storage_options(
options: dict[str, Any], inherited: dict[str, Any] | None = None
) -> None:
if not inherited:
inherited = {}
collisions = set(options) & set(inherited)
if collisions:
for collision in collisions:
if options.get(collision) != inherited.get(collision):
raise KeyError(
f"Collision between inferred and specified storage "
f"option:\n{collision}"
)
options.update(inherited)
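# [Editor's note] Sketch of the merge semantics above: non-conflicting keys are
# merged in place, while a key present in both with different values raises KeyError.
#
#     >>> opts = {"protocol": "s3"}
#     >>> update_storage_options(opts, {"anon": True})
#     >>> opts
#     {'protocol': 's3', 'anon': True}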
# Compression extensions registered via fsspec.compression.register_compression
compressions: dict[str, str] = {}
def infer_compression(filename: str) -> str | None:
"""Infer compression, if available, from filename.
Infer a named compression type, if registered and available, from filename
extension. This includes builtin (gz, bz2, zip) compressions, as well as
optional compressions. See fsspec.compression.register_compression.
"""
extension = os.path.splitext(filename)[-1].strip(".").lower()
if extension in compressions:
return compressions[extension]
return None
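# [Editor's note] Once fsspec.compression has registered the builtin codecs,
# the extension lookup above resolves common suffixes (illustrative sketch):
#
#     >>> infer_compression("table.csv.gz")                # doctest: +SKIP
#     'gzip'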
def build_name_function(max_int: float) -> Callable[[int], str]:
"""Returns a function that receives a single integer
and returns it as a string padded by enough zero characters
to align with maximum possible integer
>>> name_f = build_name_function(57)
>>> name_f(7)
'07'
>>> name_f(31)
'31'
>>> build_name_function(1000)(42)
'0042'
>>> build_name_function(999)(42)
'042'
>>> build_name_function(0)(0)
'0'
"""
# handle corner cases max_int is 0 or exact power of 10
max_int += 1e-8
pad_length = int(math.ceil(math.log10(max_int)))
def name_function(i: int) -> str:
return str(i).zfill(pad_length)
return name_function
def seek_delimiter(file: IO[bytes], delimiter: bytes, blocksize: int) -> bool:
r"""Seek current file to file start, file end, or byte after delimiter seq.
Seeks file to next chunk delimiter, where chunks are defined on file start,
a delimiting sequence, and file end. Use file.tell() to see location afterwards.
Note that file start is a valid split, so the file must be at an offset > 0 to
seek for a delimiter.
Parameters
----------
file: a file
delimiter: bytes
a delimiter like ``b'\n'`` or message sentinel, matching file .read() type
blocksize: int
Number of bytes to read from the file at once.
Returns
-------
Returns True if a delimiter was found, False if at file start or end.
"""
if file.tell() == 0:
# beginning-of-file, return without seek
return False
# Interface is for binary IO, with delimiter as bytes, but initialize last
# with result of file.read to preserve compatibility with text IO.
last: bytes | None = None
while True:
current = file.read(blocksize)
if not current:
# end-of-file without delimiter
return False
full = last + current if last else current
try:
if delimiter in full:
i = full.index(delimiter)
file.seek(file.tell() - (len(full) - i) + len(delimiter))
return True
elif len(current) < blocksize:
# end-of-file without delimiter
return False
except (OSError, ValueError):
pass
last = full[-len(delimiter) :]
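# [Editor's note] Small worked example of the seek above (editor sketch):
#
#     >>> from io import BytesIO
#     >>> f = BytesIO(b"Alice\nBob\nCharlie")
#     >>> _ = f.seek(3)                      # somewhere inside the first record
#     >>> seek_delimiter(f, b"\n", blocksize=4)
#     True
#     >>> f.tell()                           # positioned just past the first b"\n"
#     6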
def read_block(
f: IO[bytes],
offset: int,
length: int | None,
delimiter: bytes | None = None,
split_before: bool = False,
) -> bytes:
"""Read a block of bytes from a file
Parameters
----------
f: File
Open file
offset: int
Byte offset to start read
length: int
Number of bytes to read, read through end of file if None
delimiter: bytes (optional)
Ensure reading starts and stops at delimiter bytestring
split_before: bool (optional)
Start/stop read *before* delimiter bytestring.
If using the ``delimiter=`` keyword argument we ensure that the read
starts and stops at delimiter boundaries that follow the locations
``offset`` and ``offset + length``. If ``offset`` is zero then we
start at zero, regardless of delimiter. The bytestring returned WILL
include the terminating delimiter string.
Examples
--------
>>> from io import BytesIO # doctest: +SKIP
>>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300') # doctest: +SKIP
>>> read_block(f, 0, 13) # doctest: +SKIP
b'Alice, 100\\nBo'
>>> read_block(f, 0, 13, delimiter=b'\\n') # doctest: +SKIP
b'Alice, 100\\nBob, 200\\n'
>>> read_block(f, 10, 10, delimiter=b'\\n') # doctest: +SKIP
b'Bob, 200\\nCharlie, 300'
"""
if delimiter:
f.seek(offset)
found_start_delim = seek_delimiter(f, delimiter, 2**16)
if length is None:
return f.read()
start = f.tell()
length -= start - offset
f.seek(start + length)
found_end_delim = seek_delimiter(f, delimiter, 2**16)
end = f.tell()
# Adjust split location to before delimiter if seek found the
# delimiter sequence, not start or end of file.
if found_start_delim and split_before:
start -= len(delimiter)
if found_end_delim and split_before:
end -= len(delimiter)
offset = start
length = end - start
f.seek(offset)
# TODO: allow length to be None and read to the end of the file?
assert length is not None
b = f.read(length)
return b
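# [Editor's note] The docstring's delimiter case, spelled out (editor sketch):
# the read is widened to start just after the delimiter preceding ``offset`` and
# runs to end-of-file because no further delimiter follows.
#
#     >>> from io import BytesIO
#     >>> f = BytesIO(b"Alice, 100\nBob, 200\nCharlie, 300")
#     >>> read_block(f, 10, 10, delimiter=b"\n")
#     b'Bob, 200\nCharlie, 300'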
def tokenize(*args: Any, **kwargs: Any) -> str:
"""Deterministic token
(modified from dask.base)
>>> tokenize([1, 2, '3'])
'9d71491b50023b06fc76928e6eddb952'
>>> tokenize('Hello') == tokenize('Hello')
True
"""
if kwargs:
args += (kwargs,)
try:
h = md5(str(args).encode())
except ValueError:
# FIPS systems: https://github.com/fsspec/filesystem_spec/issues/380
h = md5(str(args).encode(), usedforsecurity=False)
return h.hexdigest()
def stringify_path(filepath: str | os.PathLike[str] | pathlib.Path) -> str:
"""Attempt to convert a path-like object to a string.
Parameters
----------
filepath: object to be converted
Returns
-------
filepath_str: maybe a string version of the object
Notes
-----
Objects supporting the fspath protocol are coerced according to its
__fspath__ method.
For backwards compatibility with older Python versions, pathlib.Path
objects are specially coerced.
Any other object is passed through unchanged, which includes bytes,
strings, buffers, or anything else that's not even path-like.
"""
if isinstance(filepath, str):
return filepath
elif hasattr(filepath, "__fspath__"):
return filepath.__fspath__()
elif hasattr(filepath, "path"):
return filepath.path
else:
return filepath # type: ignore[return-value]
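# [Editor's note] Typical coercions handled above (editor sketch):
#
#     >>> import pathlib
#     >>> stringify_path(pathlib.Path("data.csv"))
#     'data.csv'
#     >>> stringify_path("s3://bucket/key")    # strings pass through untouched
#     's3://bucket/key'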
def make_instance(
cls: Callable[..., T], args: Sequence[Any], kwargs: dict[str, Any]
) -> T:
inst = cls(*args, **kwargs)
inst._determine_worker() # type: ignore[attr-defined]
return inst
def common_prefix(paths: Iterable[str]) -> str:
"""For a list of paths, find the shortest prefix common to all"""
parts = [p.split("/") for p in paths]
lmax = min(len(p) for p in parts)
end = 0
for i in range(lmax):
end = all(p[i] == parts[0][i] for p in parts)
if not end:
break
i += end
return "/".join(parts[0][:i])
def other_paths(
paths: list[str],
path2: str | list[str],
exists: bool = False,
flatten: bool = False,
) -> list[str]:
"""In bulk file operations, construct a new file tree from a list of files
Parameters
----------
paths: list of str
The input file tree
path2: str or list of str
Root to construct the new list in. If this is already a list of str, we just
assert it has the right number of elements.
exists: bool (optional)
For a str destination, whether it already exists (and is a dir); files
should end up inside.
flatten: bool (optional)
Whether to flatten the input directory tree structure so that the output files
are in the same directory.
Returns
-------
list of str
"""
if isinstance(path2, str):
path2 = path2.rstrip("/")
if flatten:
path2 = ["/".join((path2, p.split("/")[-1])) for p in paths]
else:
cp = common_prefix(paths)
if exists:
cp = cp.rsplit("/", 1)[0]
if not cp and all(not s.startswith("/") for s in paths):
path2 = ["/".join([path2, p]) for p in paths]
else:
path2 = [p.replace(cp, path2, 1) for p in paths]
else:
assert len(paths) == len(path2)
return path2
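# [Editor's note] Sketch of the common case above: a str destination is turned
# into one target path per source, preserving the tree below the common prefix
# unless ``flatten`` collapses it.
#
#     >>> other_paths(["data/a.csv", "data/sub/b.csv"], "dest")
#     ['dest/a.csv', 'dest/sub/b.csv']
#     >>> other_paths(["data/a.csv", "data/sub/b.csv"], "dest", flatten=True)
#     ['dest/a.csv', 'dest/b.csv']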
def is_exception(obj: Any) -> bool:
return isinstance(obj, BaseException)
def isfilelike(f: Any) -> TypeGuard[IO[bytes]]:
return all(hasattr(f, attr) for attr in ["read", "close", "tell"])
def get_protocol(url: str) -> str:
url = stringify_path(url)
parts = re.split(r"(\:\:|\://)", url, maxsplit=1)
if len(parts) > 1:
return parts[0]
return "file"
def get_file_extension(url: str) -> str:
url = stringify_path(url)
ext_parts = url.rsplit(".", 1)
if len(ext_parts) > 1:
return ext_parts[-1]
return ""
def can_be_local(path: str) -> bool:
"""Can the given URL be used with open_local?"""
from fsspec import get_filesystem_class
try:
return getattr(get_filesystem_class(get_protocol(path)), "local_file", False)
except (ValueError, ImportError):
# not in registry or import failed
return False
def get_package_version_without_import(name: str) -> str | None:
"""For given package name, try to find the version without importing it
Import and package.__version__ is still the backup here, so an import
*might* happen.
Returns either the version string, or None if the package
or the version was not readily found.
"""
if name in sys.modules:
mod = sys.modules[name]
if hasattr(mod, "__version__"):
return mod.__version__
try:
return version(name)
except: # noqa: E722
pass
try:
import importlib
mod = importlib.import_module(name)
return mod.__version__
except (ImportError, AttributeError):
return None
def setup_logging(
logger: logging.Logger | None = None,
logger_name: str | None = None,
level: str = "DEBUG",
clear: bool = True,
) -> logging.Logger:
if logger is None and logger_name is None:
raise ValueError("Provide either logger object or logger name")
logger = logger or logging.getLogger(logger_name)
handle = logging.StreamHandler()
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(funcName)s -- %(message)s"
)
handle.setFormatter(formatter)
if clear:
logger.handlers.clear()
logger.addHandler(handle)
logger.setLevel(level)
return logger
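# [Editor's note] Typical use of the helper above (editor sketch; the logger
# name is arbitrary here):
#
#     >>> logger = setup_logging(logger_name="fsspec.http", level="INFO")
#     >>> logger.name, logger.level == logging.INFO
#     ('fsspec.http', True)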
def _unstrip_protocol(name: str, fs: AbstractFileSystem) -> str:
return fs.unstrip_protocol(name)
def mirror_from(
origin_name: str, methods: Iterable[str]
) -> Callable[[type[T]], type[T]]:
"""Mirror attributes and methods from the given
origin_name attribute of the instance to the
decorated class"""
def origin_getter(method: str, self: Any) -> Any:
origin = getattr(self, origin_name)
return getattr(origin, method)
def wrapper(cls: type[T]) -> type[T]:
for method in methods:
wrapped_method = partial(origin_getter, method)
setattr(cls, method, property(wrapped_method))
return cls
return wrapper
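# [Editor's note] Minimal sketch of the decorator above with a hypothetical
# wrapper class; each listed name becomes a read-only property that forwards
# to the wrapped object.
#
#     >>> from io import BytesIO
#     >>> @mirror_from("buffer", ["read", "tell"])
#     ... class Wrapper:
#     ...     def __init__(self, buffer):
#     ...         self.buffer = buffer
#     >>> Wrapper(BytesIO(b"abc")).read()
#     b'abc'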
@contextlib.contextmanager
def nullcontext(obj: T) -> Iterator[T]:
yield obj
def merge_offset_ranges(
paths: list[str],
starts: list[int] | int,
ends: list[int] | int,
max_gap: int = 0,
max_block: int | None = None,
sort: bool = True,
) -> tuple[list[str], list[int], list[int]]:
"""Merge adjacent byte-offset ranges when the inter-range
gap is <= `max_gap`, and when the merged byte range does not
exceed `max_block` (if specified). By default, this function
will re-order the input paths and byte ranges to ensure sorted
order. If the user can guarantee that the inputs are already
sorted, passing `sort=False` will skip the re-ordering.
"""
# Check input
if not isinstance(paths, list):
raise TypeError
if not isinstance(starts, list):
starts = [starts] * len(paths)
if not isinstance(ends, list):
ends = [ends] * len(paths)
if len(starts) != len(paths) or len(ends) != len(paths):
raise ValueError
# Early Return
if len(starts) <= 1:
return paths, starts, ends
starts = [s or 0 for s in starts]
# Sort by paths and then ranges if `sort=True`
if sort:
paths, starts, ends = (
list(v)
for v in zip(
*sorted(
zip(paths, starts, ends),
)
)
)
if paths:
# Loop through the coupled `paths`, `starts`, and
# `ends`, and merge adjacent blocks when appropriate
new_paths = paths[:1]
new_starts = starts[:1]
new_ends = ends[:1]
for i in range(1, len(paths)):
if paths[i] == paths[i - 1] and new_ends[-1] is None:
continue
elif (
paths[i] != paths[i - 1]
or ((starts[i] - new_ends[-1]) > max_gap)
or (max_block is not None and (ends[i] - new_starts[-1]) > max_block)
):
# Cannot merge with previous block.
# Add new `paths`, `starts`, and `ends` elements
new_paths.append(paths[i])
new_starts.append(starts[i])
new_ends.append(ends[i])
else:
# Merge with previous block by updating the
# last element of `ends`
new_ends[-1] = ends[i]
return new_paths, new_starts, new_ends
# `paths` is empty. Just return input lists
return paths, starts, ends
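# [Editor's note] Worked example of the merge rule above (editor sketch):
# ranges on the same path merge while the gap is <= ``max_gap`` (0 by default),
# otherwise a new block is started.
#
#     >>> merge_offset_ranges(["f", "f", "f"], [0, 10, 40], [10, 20, 50])
#     (['f', 'f'], [0, 40], [20, 50])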
def file_size(filelike: IO[bytes]) -> int:
"""Find length of any open read-mode file-like"""
pos = filelike.tell()
try:
return filelike.seek(0, 2)
finally:
filelike.seek(pos)
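# [Editor's note] e.g. (editor sketch):
#
#     >>> from io import BytesIO
#     >>> file_size(BytesIO(b"12345"))
#     5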
@contextlib.contextmanager
def atomic_write(path: str, mode: str = "wb"):
"""
A context manager that opens a temporary file next to `path` and, on exit,
replaces `path` with the temporary file, thereby updating `path`
atomically.
"""
fd, fn = tempfile.mkstemp(
dir=os.path.dirname(path), prefix=os.path.basename(path) + "-"
)
try:
with open(fd, mode) as fp:
yield fp
except BaseException:
with contextlib.suppress(FileNotFoundError):
os.unlink(fn)
raise
else:
os.replace(fn, path)
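# [Editor's note] Sketch of the context manager above: data lands in a sibling
# temporary file and only replaces ``path`` once the block succeeds.
#
#     >>> import os, tempfile
#     >>> target = os.path.join(tempfile.mkdtemp(), "out.bin")
#     >>> with atomic_write(target) as f:
#     ...     _ = f.write(b"payload")
#     >>> with open(target, "rb") as f:
#     ...     f.read()
#     b'payload'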
def _translate(pat, STAR, QUESTION_MARK):
# Copied from: https://github.com/python/cpython/pull/106703.
res: list[str] = []
add = res.append
i, n = 0, len(pat)
while i < n:
c = pat[i]
i = i + 1
if c == "*":
# compress consecutive `*` into one
if (not res) or res[-1] is not STAR:
add(STAR)
elif c == "?":
add(QUESTION_MARK)
elif c == "[":
j = i
if j < n and pat[j] == "!":
j = j + 1
if j < n and pat[j] == "]":
j = j + 1
while j < n and pat[j] != "]":
j = j + 1
if j >= n:
add("\\[")
else:
stuff = pat[i:j]
if "-" not in stuff:
stuff = stuff.replace("\\", r"\\")
else:
chunks = []
k = i + 2 if pat[i] == "!" else i + 1
while True:
k = pat.find("-", k, j)
if k < 0:
break
chunks.append(pat[i:k])
i = k + 1
k = k + 3
chunk = pat[i:j]
if chunk:
chunks.append(chunk)
else:
chunks[-1] += "-"
# Remove empty ranges -- invalid in RE.
for k in range(len(chunks) - 1, 0, -1):
if chunks[k - 1][-1] > chunks[k][0]:
chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]
del chunks[k]
# Escape backslashes and hyphens for set difference (--).
# Hyphens that create ranges shouldn't be escaped.
stuff = "-".join(
s.replace("\\", r"\\").replace("-", r"\-") for s in chunks
)
# Escape set operations (&&, ~~ and ||).
stuff = re.sub(r"([&~|])", r"\\\1", stuff)
i = j + 1
if not stuff:
# Empty range: never match.
add("(?!)")
elif stuff == "!":
# Negated empty range: match any character.
add(".")
else:
if stuff[0] == "!":
stuff = "^" + stuff[1:]
elif stuff[0] in ("^", "["):
stuff = "\\" + stuff
add(f"[{stuff}]")
else:
add(re.escape(c))
assert i == n
return res
def glob_translate(pat):
# Copied from: https://github.com/python/cpython/pull/106703.
# The keyword parameters' values are fixed to:
# recursive=True, include_hidden=True, seps=None
"""Translate a pathname with shell wildcards to a regular expression."""
if os.path.altsep:
seps = os.path.sep + os.path.altsep
else:
seps = os.path.sep
escaped_seps = "".join(map(re.escape, seps))
any_sep = f"[{escaped_seps}]" if len(seps) > 1 else escaped_seps
not_sep = f"[^{escaped_seps}]"
one_last_segment = f"{not_sep}+"
one_segment = f"{one_last_segment}{any_sep}"
any_segments = f"(?:.+{any_sep})?"
any_last_segments = ".*"
results = []
parts = re.split(any_sep, pat)
last_part_idx = len(parts) - 1
for idx, part in enumerate(parts):
if part == "*":
results.append(one_segment if idx < last_part_idx else one_last_segment)
continue
if part == "**":
results.append(any_segments if idx < last_part_idx else any_last_segments)
continue
elif "**" in part:
raise ValueError(
"Invalid pattern: '**' can only be an entire path component"
)
if part:
results.extend(_translate(part, f"{not_sep}*", not_sep))
if idx < last_part_idx:
results.append(any_sep)
res = "".join(results)
return rf"(?s:{res})\Z"