chore: add virtual environment to repository
- Add the backend_service/venv virtual environment
- Includes all Python dependency packages
- Note: the virtual environment is about 393 MB and contains 12655 files
@@ -0,0 +1,307 @@
|
||||
import errno
|
||||
import io
|
||||
import os
|
||||
import secrets
|
||||
import shutil
|
||||
from contextlib import suppress
|
||||
from functools import cached_property, wraps
|
||||
from urllib.parse import parse_qs
|
||||
|
||||
from fsspec.spec import AbstractFileSystem
|
||||
from fsspec.utils import (
|
||||
get_package_version_without_import,
|
||||
infer_storage_options,
|
||||
mirror_from,
|
||||
tokenize,
|
||||
)
|
||||
|
||||
|
||||
def wrap_exceptions(func):
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
except OSError as exception:
|
||||
if not exception.args:
|
||||
raise
|
||||
|
||||
message, *args = exception.args
|
||||
if isinstance(message, str) and "does not exist" in message:
|
||||
raise FileNotFoundError(errno.ENOENT, message) from exception
|
||||
else:
|
||||
raise
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
PYARROW_VERSION = None
|
||||
|
||||
|
||||
class ArrowFSWrapper(AbstractFileSystem):
|
||||
"""FSSpec-compatible wrapper of pyarrow.fs.FileSystem.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fs : pyarrow.fs.FileSystem
|
||||
|
||||
"""
|
||||
|
||||
root_marker = "/"
|
||||
|
||||
def __init__(self, fs, **kwargs):
|
||||
global PYARROW_VERSION
|
||||
PYARROW_VERSION = get_package_version_without_import("pyarrow")
|
||||
self.fs = fs
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@property
|
||||
def protocol(self):
|
||||
return self.fs.type_name
|
||||
|
||||
@cached_property
|
||||
def fsid(self):
|
||||
return "hdfs_" + tokenize(self.fs.host, self.fs.port)
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
ops = infer_storage_options(path)
|
||||
path = ops["path"]
|
||||
if path.startswith("//"):
|
||||
# special case for "hdfs://path" (without the triple slash)
|
||||
path = path[1:]
|
||||
return path
|
||||
|
||||
def ls(self, path, detail=False, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
from pyarrow.fs import FileSelector
|
||||
|
||||
try:
|
||||
entries = [
|
||||
self._make_entry(entry)
|
||||
for entry in self.fs.get_file_info(FileSelector(path))
|
||||
]
|
||||
except (FileNotFoundError, NotADirectoryError):
|
||||
entries = [self.info(path, **kwargs)]
|
||||
if detail:
|
||||
return entries
|
||||
else:
|
||||
return [entry["name"] for entry in entries]
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
[info] = self.fs.get_file_info([path])
|
||||
return self._make_entry(info)
|
||||
|
||||
def exists(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
self.info(path)
|
||||
except FileNotFoundError:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def _make_entry(self, info):
|
||||
from pyarrow.fs import FileType
|
||||
|
||||
if info.type is FileType.Directory:
|
||||
kind = "directory"
|
||||
elif info.type is FileType.File:
|
||||
kind = "file"
|
||||
elif info.type is FileType.NotFound:
|
||||
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
|
||||
else:
|
||||
kind = "other"
|
||||
|
||||
return {
|
||||
"name": info.path,
|
||||
"size": info.size,
|
||||
"type": kind,
|
||||
"mtime": info.mtime,
|
||||
}
|
||||
|
||||
@wrap_exceptions
|
||||
def cp_file(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1).rstrip("/")
|
||||
path2 = self._strip_protocol(path2).rstrip("/")
|
||||
|
||||
with self._open(path1, "rb") as lstream:
|
||||
tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
|
||||
try:
|
||||
with self.open(tmp_fname, "wb") as rstream:
|
||||
shutil.copyfileobj(lstream, rstream)
|
||||
self.fs.move(tmp_fname, path2)
|
||||
except BaseException:
|
||||
with suppress(FileNotFoundError):
|
||||
self.fs.delete_file(tmp_fname)
|
||||
raise
|
||||
|
||||
@wrap_exceptions
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1).rstrip("/")
|
||||
path2 = self._strip_protocol(path2).rstrip("/")
|
||||
self.fs.move(path1, path2)
|
||||
|
||||
@wrap_exceptions
|
||||
def rm_file(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.fs.delete_file(path)
|
||||
|
||||
@wrap_exceptions
|
||||
def rm(self, path, recursive=False, maxdepth=None):
|
||||
path = self._strip_protocol(path).rstrip("/")
|
||||
if self.isdir(path):
|
||||
if recursive:
|
||||
self.fs.delete_dir(path)
|
||||
else:
|
||||
raise ValueError("Can't delete directories without recursive=False")
|
||||
else:
|
||||
self.fs.delete_file(path)
|
||||
|
||||
@wrap_exceptions
|
||||
def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
|
||||
if mode == "rb":
|
||||
if seekable:
|
||||
method = self.fs.open_input_file
|
||||
else:
|
||||
method = self.fs.open_input_stream
|
||||
elif mode == "wb":
|
||||
method = self.fs.open_output_stream
|
||||
elif mode == "ab":
|
||||
method = self.fs.open_append_stream
|
||||
else:
|
||||
raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")
|
||||
|
||||
_kwargs = {}
|
||||
if mode != "rb" or not seekable:
|
||||
if int(PYARROW_VERSION.split(".")[0]) >= 4:
|
||||
# disable compression auto-detection
|
||||
_kwargs["compression"] = None
|
||||
stream = method(path, **_kwargs)
|
||||
|
||||
return ArrowFile(self, stream, path, mode, block_size, **kwargs)
|
||||
|
||||
@wrap_exceptions
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if create_parents:
|
||||
self.makedirs(path, exist_ok=True)
|
||||
else:
|
||||
self.fs.create_dir(path, recursive=False)
|
||||
|
||||
@wrap_exceptions
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
path = self._strip_protocol(path)
|
||||
self.fs.create_dir(path, recursive=True)
|
||||
|
||||
@wrap_exceptions
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.fs.delete_dir(path)
|
||||
|
||||
@wrap_exceptions
|
||||
def modified(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
return self.fs.get_file_info(path).mtime
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
kwargs["seekable"] = start not in [None, 0]
|
||||
return super().cat_file(path, start=None, end=None, **kwargs)
|
||||
|
||||
def get_file(self, rpath, lpath, **kwargs):
|
||||
kwargs["seekable"] = False
|
||||
super().get_file(rpath, lpath, **kwargs)
|
||||
|
||||
|
||||
@mirror_from(
|
||||
"stream",
|
||||
[
|
||||
"read",
|
||||
"seek",
|
||||
"tell",
|
||||
"write",
|
||||
"readable",
|
||||
"writable",
|
||||
"close",
|
||||
"size",
|
||||
"seekable",
|
||||
],
|
||||
)
|
||||
class ArrowFile(io.IOBase):
|
||||
def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
|
||||
self.path = path
|
||||
self.mode = mode
|
||||
|
||||
self.fs = fs
|
||||
self.stream = stream
|
||||
|
||||
self.blocksize = self.block_size = block_size
|
||||
self.kwargs = kwargs
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, *args):
|
||||
return self.close()
|
||||
|
||||
|
||||
class HadoopFileSystem(ArrowFSWrapper):
|
||||
"""A wrapper on top of the pyarrow.fs.HadoopFileSystem
|
||||
to connect its interface with fsspec"""
|
||||
|
||||
protocol = "hdfs"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host="default",
|
||||
port=0,
|
||||
user=None,
|
||||
kerb_ticket=None,
|
||||
replication=3,
|
||||
extra_conf=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
Hostname, IP or "default" to try to read from Hadoop config
|
||||
port: int
|
||||
Port to connect on, or default from Hadoop config if 0
|
||||
user: str or None
|
||||
If given, connect as this username
|
||||
kerb_ticket: str or None
|
||||
If given, use this ticket for authentication
|
||||
replication: int
|
||||
set replication factor of file for write operations. default value is 3.
|
||||
extra_conf: None or dict
|
||||
Passed on to HadoopFileSystem
|
||||
"""
|
||||
from pyarrow.fs import HadoopFileSystem
|
||||
|
||||
fs = HadoopFileSystem(
|
||||
host=host,
|
||||
port=port,
|
||||
user=user,
|
||||
kerb_ticket=kerb_ticket,
|
||||
replication=replication,
|
||||
extra_conf=extra_conf,
|
||||
)
|
||||
super().__init__(fs=fs, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
ops = infer_storage_options(path)
|
||||
out = {}
|
||||
if ops.get("host", None):
|
||||
out["host"] = ops["host"]
|
||||
if ops.get("username", None):
|
||||
out["user"] = ops["username"]
|
||||
if ops.get("port", None):
|
||||
out["port"] = ops["port"]
|
||||
if ops.get("url_query", None):
|
||||
queries = parse_qs(ops["url_query"])
|
||||
if queries.get("replication", None):
|
||||
out["replication"] = int(queries["replication"][0])
|
||||
return out
|
||||
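A minimal usage sketch for the wrapper defined above, assuming pyarrow is installed and an HDFS namenode is reachable; the host, port and paths below are illustrative:

import fsspec

# fsspec resolves the "hdfs" protocol to the HadoopFileSystem wrapper above
fs = fsspec.filesystem("hdfs", host="namenode", port=8020)

fs.mkdir("/tmp/demo")
with fs.open("/tmp/demo/hello.txt", "wb") as f:
    f.write(b"hello from fsspec over pyarrow")
print(fs.ls("/tmp/demo", detail=False))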
@@ -0,0 +1,122 @@
|
||||
import asyncio
|
||||
import functools
|
||||
import inspect
|
||||
|
||||
import fsspec
|
||||
from fsspec.asyn import AsyncFileSystem, running_async
|
||||
|
||||
|
||||
def async_wrapper(func, obj=None, semaphore=None):
|
||||
"""
|
||||
Wraps a synchronous function to make it awaitable.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
func : callable
|
||||
The synchronous function to wrap.
|
||||
obj : object, optional
|
||||
The instance to bind the function to, if applicable.
|
||||
semaphore : asyncio.Semaphore, optional
|
||||
A semaphore to limit concurrent calls.
|
||||
|
||||
Returns
|
||||
-------
|
||||
coroutine
|
||||
An awaitable version of the function.
|
||||
"""
|
||||
|
||||
@functools.wraps(func)
|
||||
async def wrapper(*args, **kwargs):
|
||||
if semaphore:
|
||||
async with semaphore:
|
||||
return await asyncio.to_thread(func, *args, **kwargs)
|
||||
return await asyncio.to_thread(func, *args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
class AsyncFileSystemWrapper(AsyncFileSystem):
|
||||
"""
|
||||
A wrapper class to convert a synchronous filesystem into an asynchronous one.
|
||||
|
||||
This class takes an existing synchronous filesystem implementation and wraps all
|
||||
its methods to provide an asynchronous interface.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sync_fs : AbstractFileSystem
|
||||
The synchronous filesystem instance to wrap.
|
||||
"""
|
||||
|
||||
protocol = "asyncwrapper", "async_wrapper"
|
||||
cachable = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs=None,
|
||||
asynchronous=None,
|
||||
target_protocol=None,
|
||||
target_options=None,
|
||||
semaphore=None,
|
||||
max_concurrent_tasks=None,
|
||||
**kwargs,
|
||||
):
|
||||
if asynchronous is None:
|
||||
asynchronous = running_async()
|
||||
super().__init__(asynchronous=asynchronous, **kwargs)
|
||||
if fs is not None:
|
||||
self.sync_fs = fs
|
||||
else:
|
||||
self.sync_fs = fsspec.filesystem(target_protocol, **target_options)
|
||||
self.protocol = self.sync_fs.protocol
|
||||
self.semaphore = semaphore
|
||||
self._wrap_all_sync_methods()
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return f"async_{self.sync_fs.fsid}"
|
||||
|
||||
def _wrap_all_sync_methods(self):
|
||||
"""
|
||||
Wrap all synchronous methods of the underlying filesystem with asynchronous versions.
|
||||
"""
|
||||
excluded_methods = {"open"}
|
||||
for method_name in dir(self.sync_fs):
|
||||
if method_name.startswith("_") or method_name in excluded_methods:
|
||||
continue
|
||||
|
||||
attr = inspect.getattr_static(self.sync_fs, method_name)
|
||||
if isinstance(attr, property):
|
||||
continue
|
||||
|
||||
method = getattr(self.sync_fs, method_name)
|
||||
if callable(method) and not inspect.iscoroutinefunction(method):
|
||||
async_method = async_wrapper(method, obj=self, semaphore=self.semaphore)
|
||||
setattr(self, f"_{method_name}", async_method)
|
||||
|
||||
@classmethod
|
||||
def wrap_class(cls, sync_fs_class):
|
||||
"""
|
||||
Create a new class that can be used to instantiate an AsyncFileSystemWrapper
|
||||
with lazy instantiation of the underlying synchronous filesystem.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sync_fs_class : type
|
||||
The class of the synchronous filesystem to wrap.
|
||||
|
||||
Returns
|
||||
-------
|
||||
type
|
||||
A new class that wraps the provided synchronous filesystem class.
|
||||
"""
|
||||
|
||||
class GeneratedAsyncFileSystemWrapper(cls):
|
||||
def __init__(self, *args, **kwargs):
|
||||
sync_fs = sync_fs_class(*args, **kwargs)
|
||||
super().__init__(sync_fs)
|
||||
|
||||
GeneratedAsyncFileSystemWrapper.__name__ = (
|
||||
f"Async{sync_fs_class.__name__}Wrapper"
|
||||
)
|
||||
return GeneratedAsyncFileSystemWrapper
|
||||
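A short sketch of using the wrapper from asyncio code, assuming the module above is importable as fsspec.implementations.asyn_wrapper; the in-memory filesystem and path are only for illustration:

import asyncio

import fsspec
from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper

async def main():
    sync_fs = fsspec.filesystem("memory")
    sync_fs.pipe_file("/demo.txt", b"hello")
    afs = AsyncFileSystemWrapper(sync_fs)
    # _cat_file is the awaitable counterpart generated by _wrap_all_sync_methods
    print(await afs._cat_file("/demo.txt"))

asyncio.run(main())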
@@ -0,0 +1,75 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import abc
|
||||
import hashlib
|
||||
|
||||
from fsspec.implementations.local import make_path_posix
|
||||
|
||||
|
||||
class AbstractCacheMapper(abc.ABC):
|
||||
"""Abstract super-class for mappers from remote URLs to local cached
|
||||
basenames.
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def __call__(self, path: str) -> str: ...
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
# Identity only depends on class. When derived classes have attributes
|
||||
# they will need to be included.
|
||||
return isinstance(other, type(self))
|
||||
|
||||
def __hash__(self) -> int:
|
||||
# Identity only depends on class. When derived classes have attributes
|
||||
# they will need to be included.
|
||||
return hash(type(self))
|
||||
|
||||
|
||||
class BasenameCacheMapper(AbstractCacheMapper):
|
||||
"""Cache mapper that uses the basename of the remote URL and a fixed number
|
||||
of directory levels above this.
|
||||
|
||||
The default is zero directory levels, meaning different paths with the same
|
||||
basename will have the same cached basename.
|
||||
"""
|
||||
|
||||
def __init__(self, directory_levels: int = 0):
|
||||
if directory_levels < 0:
|
||||
raise ValueError(
|
||||
"BasenameCacheMapper requires zero or positive directory_levels"
|
||||
)
|
||||
self.directory_levels = directory_levels
|
||||
|
||||
# Separator for directories when encoded as strings.
|
||||
self._separator = "_@_"
|
||||
|
||||
def __call__(self, path: str) -> str:
|
||||
path = make_path_posix(path)
|
||||
prefix, *bits = path.rsplit("/", self.directory_levels + 1)
|
||||
if bits:
|
||||
return self._separator.join(bits)
|
||||
else:
|
||||
return prefix # No separator found, simple filename
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
return super().__eq__(other) and self.directory_levels == other.directory_levels
|
||||
|
||||
def __hash__(self) -> int:
|
||||
return super().__hash__() ^ hash(self.directory_levels)
|
||||
|
||||
|
||||
class HashCacheMapper(AbstractCacheMapper):
|
||||
"""Cache mapper that uses a hash of the remote URL."""
|
||||
|
||||
def __call__(self, path: str) -> str:
|
||||
return hashlib.sha256(path.encode()).hexdigest()
|
||||
|
||||
|
||||
def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
|
||||
"""Factory method to create cache mapper for backward compatibility with
|
||||
``CachingFileSystem`` constructor using ``same_names`` kwarg.
|
||||
"""
|
||||
if same_names:
|
||||
return BasenameCacheMapper()
|
||||
else:
|
||||
return HashCacheMapper()
|
||||
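An illustrative sketch of how the two mappers above turn a remote path into a local cache basename, assuming the module is importable as fsspec.implementations.cache_mapper; the path is made up:

from fsspec.implementations.cache_mapper import BasenameCacheMapper, HashCacheMapper

hash_mapper = HashCacheMapper()
print(hash_mapper("/data/2024/file.csv"))    # sha256 hex digest of the path

name_mapper = BasenameCacheMapper(directory_levels=1)
print(name_mapper("/data/2024/file.csv"))    # "2024_@_file.csv"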
@@ -0,0 +1,233 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import time
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from fsspec.utils import atomic_write
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
except ImportError:
|
||||
if not TYPE_CHECKING:
|
||||
import json
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterator
|
||||
from typing import Any, Literal
|
||||
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from .cached import CachingFileSystem
|
||||
|
||||
Detail: TypeAlias = dict[str, Any]
|
||||
|
||||
|
||||
class CacheMetadata:
|
||||
"""Cache metadata.
|
||||
|
||||
All reading and writing of cache metadata is performed by this class,
|
||||
accessing the cached files and blocks is not.
|
||||
|
||||
Metadata is stored in a single file per storage directory in JSON format.
|
||||
For backward compatibility, also reads metadata stored in pickle format
|
||||
which is converted to JSON when next saved.
|
||||
"""
|
||||
|
||||
def __init__(self, storage: list[str]):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
storage: list[str]
|
||||
Directories containing cached files; there must be at least one. Metadata
|
||||
is stored in the last of these directories by convention.
|
||||
"""
|
||||
if not storage:
|
||||
raise ValueError("CacheMetadata expects at least one storage location")
|
||||
|
||||
self._storage = storage
|
||||
self.cached_files: list[Detail] = [{}]
|
||||
|
||||
# Private attribute to force saving of metadata in pickle format rather than
|
||||
# JSON for use in tests to confirm can read both pickle and JSON formats.
|
||||
self._force_save_pickle = False
|
||||
|
||||
def _load(self, fn: str) -> Detail:
|
||||
"""Low-level function to load metadata from specific file"""
|
||||
try:
|
||||
with open(fn, "r") as f:
|
||||
loaded = json.load(f)
|
||||
except ValueError:
|
||||
with open(fn, "rb") as f:
|
||||
loaded = pickle.load(f)
|
||||
for c in loaded.values():
|
||||
if isinstance(c.get("blocks"), list):
|
||||
c["blocks"] = set(c["blocks"])
|
||||
return loaded
|
||||
|
||||
def _save(self, metadata_to_save: Detail, fn: str) -> None:
|
||||
"""Low-level function to save metadata to specific file"""
|
||||
if self._force_save_pickle:
|
||||
with atomic_write(fn) as f:
|
||||
pickle.dump(metadata_to_save, f)
|
||||
else:
|
||||
with atomic_write(fn, mode="w") as f:
|
||||
json.dump(metadata_to_save, f)
|
||||
|
||||
def _scan_locations(
|
||||
self, writable_only: bool = False
|
||||
) -> Iterator[tuple[str, str, bool]]:
|
||||
"""Yield locations (filenames) where metadata is stored, and whether
|
||||
writable or not.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
writable_only: bool
|
||||
Set to True to only yield writable locations.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Yields (str, str, bool)
|
||||
"""
|
||||
n = len(self._storage)
|
||||
for i, storage in enumerate(self._storage):
|
||||
writable = i == n - 1
|
||||
if writable_only and not writable:
|
||||
continue
|
||||
yield os.path.join(storage, "cache"), storage, writable
|
||||
|
||||
def check_file(
|
||||
self, path: str, cfs: CachingFileSystem | None
|
||||
) -> Literal[False] | tuple[Detail, str]:
|
||||
"""If path is in cache return its details, otherwise return ``False``.
|
||||
|
||||
If the optional CachingFileSystem is specified then it is used to
|
||||
perform extra checks to reject possible matches, such as if they are
|
||||
too old.
|
||||
"""
|
||||
for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
|
||||
if path not in cache:
|
||||
continue
|
||||
detail = cache[path].copy()
|
||||
|
||||
if cfs is not None:
|
||||
if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
|
||||
# Wrong file as determined by hash of file properties
|
||||
continue
|
||||
if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
|
||||
# Cached file has expired
|
||||
continue
|
||||
|
||||
fn = os.path.join(base, detail["fn"])
|
||||
if os.path.exists(fn):
|
||||
return detail, fn
|
||||
return False
|
||||
|
||||
def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
|
||||
"""Remove expired metadata from the cache.
|
||||
|
||||
Returns names of files corresponding to expired metadata and a boolean
|
||||
flag indicating whether the writable cache is empty. Caller is
|
||||
responsible for deleting the expired files.
|
||||
"""
|
||||
expired_files = []
|
||||
for path, detail in self.cached_files[-1].copy().items():
|
||||
if time.time() - detail["time"] > expiry_time:
|
||||
fn = detail.get("fn", "")
|
||||
if not fn:
|
||||
raise RuntimeError(
|
||||
f"Cache metadata does not contain 'fn' for {path}"
|
||||
)
|
||||
fn = os.path.join(self._storage[-1], fn)
|
||||
expired_files.append(fn)
|
||||
self.cached_files[-1].pop(path)
|
||||
|
||||
if self.cached_files[-1]:
|
||||
cache_path = os.path.join(self._storage[-1], "cache")
|
||||
self._save(self.cached_files[-1], cache_path)
|
||||
|
||||
writable_cache_empty = not self.cached_files[-1]
|
||||
return expired_files, writable_cache_empty
|
||||
|
||||
def load(self) -> None:
|
||||
"""Load all metadata from disk and store in ``self.cached_files``"""
|
||||
cached_files = []
|
||||
for fn, _, _ in self._scan_locations():
|
||||
if os.path.exists(fn):
|
||||
# TODO: consolidate blocks here
|
||||
cached_files.append(self._load(fn))
|
||||
else:
|
||||
cached_files.append({})
|
||||
self.cached_files = cached_files or [{}]
|
||||
|
||||
def on_close_cached_file(self, f: Any, path: str) -> None:
|
||||
"""Perform side-effect actions on closing a cached file.
|
||||
|
||||
The actual closing of the file is the responsibility of the caller.
|
||||
"""
|
||||
# File must be writable, so in self.cached_files[-1]
|
||||
c = self.cached_files[-1][path]
|
||||
if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
|
||||
c["blocks"] = True
|
||||
|
||||
def pop_file(self, path: str) -> str | None:
|
||||
"""Remove metadata of cached file.
|
||||
|
||||
If path is in the cache, return the filename of the cached file,
|
||||
otherwise return ``None``. Caller is responsible for deleting the
|
||||
cached file.
|
||||
"""
|
||||
details = self.check_file(path, None)
|
||||
if not details:
|
||||
return None
|
||||
_, fn = details
|
||||
if fn.startswith(self._storage[-1]):
|
||||
self.cached_files[-1].pop(path)
|
||||
self.save()
|
||||
else:
|
||||
raise PermissionError(
|
||||
"Can only delete cached file in last, writable cache location"
|
||||
)
|
||||
return fn
|
||||
|
||||
def save(self) -> None:
|
||||
"""Save metadata to disk"""
|
||||
for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
|
||||
if not writable:
|
||||
continue
|
||||
|
||||
if os.path.exists(fn):
|
||||
cached_files = self._load(fn)
|
||||
for k, c in cached_files.items():
|
||||
if k in cache:
|
||||
if c["blocks"] is True or cache[k]["blocks"] is True:
|
||||
c["blocks"] = True
|
||||
else:
|
||||
# self.cached_files[*][*]["blocks"] must continue to
|
||||
# point to the same set object so that updates
|
||||
# performed by MMapCache are propagated back to
|
||||
# self.cached_files.
|
||||
blocks = cache[k]["blocks"]
|
||||
blocks.update(c["blocks"])
|
||||
c["blocks"] = blocks
|
||||
c["time"] = max(c["time"], cache[k]["time"])
|
||||
c["uid"] = cache[k]["uid"]
|
||||
|
||||
# Files can be added to cache after it was written once
|
||||
for k, c in cache.items():
|
||||
if k not in cached_files:
|
||||
cached_files[k] = c
|
||||
else:
|
||||
cached_files = cache
|
||||
cache = {k: v.copy() for k, v in cached_files.items()}
|
||||
for c in cache.values():
|
||||
if isinstance(c["blocks"], set):
|
||||
c["blocks"] = list(c["blocks"])
|
||||
self._save(cache, fn)
|
||||
self.cached_files[-1] = cached_files
|
||||
|
||||
def update_file(self, path: str, detail: Detail) -> None:
|
||||
"""Update metadata for specific file in memory, do not save"""
|
||||
self.cached_files[-1][path] = detail
|
||||
File diff suppressed because it is too large
@@ -0,0 +1,23 @@
|
||||
from typing import ClassVar
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
|
||||
__all__ = ("ChainedFileSystem",)
|
||||
|
||||
|
||||
class ChainedFileSystem(AbstractFileSystem):
|
||||
"""Chained filesystem base class.
|
||||
|
||||
A chained filesystem is designed to be layered over another FS.
|
||||
This is useful to implement things like caching.
|
||||
|
||||
This base class does very little on its own, but is used as a marker
|
||||
that the class is designed for chaining.
|
||||
|
||||
Right now this is only used in `url_to_fs` to provide the path argument
|
||||
(`fo`) to the chained filesystem from the underlying filesystem.
|
||||
|
||||
Additional functionality may be added in the future.
|
||||
"""
|
||||
|
||||
protocol: ClassVar[str] = "chained"
|
||||
@@ -0,0 +1,152 @@
|
||||
import dask
|
||||
from distributed.client import Client, _get_global_client
|
||||
from distributed.worker import Worker
|
||||
|
||||
from fsspec import filesystem
|
||||
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
|
||||
from fsspec.utils import infer_storage_options
|
||||
|
||||
|
||||
def _get_client(client):
|
||||
if client is None:
|
||||
return _get_global_client()
|
||||
elif isinstance(client, Client):
|
||||
return client
|
||||
else:
|
||||
# e.g., connection string
|
||||
return Client(client)
|
||||
|
||||
|
||||
def _in_worker():
|
||||
return bool(Worker._instances)
|
||||
|
||||
|
||||
class DaskWorkerFileSystem(AbstractFileSystem):
|
||||
"""View files accessible to a worker as any other remote file-system
|
||||
|
||||
When instances are run on the worker, uses the real filesystem. When
|
||||
run on the client, they call the worker to provide information or data.
|
||||
|
||||
**Warning** this implementation is experimental, and read-only for now.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
if not (fs is None) ^ (target_protocol is None):
|
||||
raise ValueError(
|
||||
"Please provide one of filesystem instance (fs) or"
|
||||
" target_protocol, not both"
|
||||
)
|
||||
self.target_protocol = target_protocol
|
||||
self.target_options = target_options
|
||||
self.worker = None
|
||||
self.client = client
|
||||
self.fs = fs
|
||||
self._determine_worker()
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
so = infer_storage_options(path)
|
||||
if "host" in so and "port" in so:
|
||||
return {"client": f"{so['host']}:{so['port']}"}
|
||||
else:
|
||||
return {}
|
||||
|
||||
def _determine_worker(self):
|
||||
if _in_worker():
|
||||
self.worker = True
|
||||
if self.fs is None:
|
||||
self.fs = filesystem(
|
||||
self.target_protocol, **(self.target_options or {})
|
||||
)
|
||||
else:
|
||||
self.worker = False
|
||||
self.client = _get_client(self.client)
|
||||
self.rfs = dask.delayed(self)
|
||||
|
||||
def mkdir(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
self.fs.mkdir(*args, **kwargs)
|
||||
else:
|
||||
self.rfs.mkdir(*args, **kwargs).compute()
|
||||
|
||||
def rm(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
self.fs.rm(*args, **kwargs)
|
||||
else:
|
||||
self.rfs.rm(*args, **kwargs).compute()
|
||||
|
||||
def copy(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
self.fs.copy(*args, **kwargs)
|
||||
else:
|
||||
self.rfs.copy(*args, **kwargs).compute()
|
||||
|
||||
def mv(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
self.fs.mv(*args, **kwargs)
|
||||
else:
|
||||
self.rfs.mv(*args, **kwargs).compute()
|
||||
|
||||
def ls(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
return self.fs.ls(*args, **kwargs)
|
||||
else:
|
||||
return self.rfs.ls(*args, **kwargs).compute()
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
if self.worker:
|
||||
return self.fs._open(
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
return DaskFile(
|
||||
fs=self,
|
||||
path=path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def fetch_range(self, path, mode, start, end):
|
||||
if self.worker:
|
||||
with self._open(path, mode) as f:
|
||||
f.seek(start)
|
||||
return f.read(end - start)
|
||||
else:
|
||||
return self.rfs.fetch_range(path, mode, start, end).compute()
|
||||
|
||||
|
||||
class DaskFile(AbstractBufferedFile):
|
||||
def __init__(self, mode="rb", **kwargs):
|
||||
if mode != "rb":
|
||||
raise ValueError('Remote dask files can only be opened in "rb" mode')
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
pass
|
||||
|
||||
def _initiate_upload(self):
|
||||
"""Create remote file/upload"""
|
||||
pass
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Get the specified set of bytes from remote"""
|
||||
return self.fs.fetch_range(self.path, self.mode, start, end)
|
||||
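A hedged sketch of using this filesystem from the client side, assuming the dask.distributed package and a cluster are available (here a throwaway local one); the listed path is illustrative:

import fsspec
from distributed import Client

client = Client()  # local cluster; in practice, connect to an existing scheduler

# "dask" resolves to DaskWorkerFileSystem; the listing runs on a worker
fs = fsspec.filesystem("dask", target_protocol="file")
print(fs.ls("/tmp", detail=False))

client.close()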
@@ -0,0 +1,58 @@
|
||||
import base64
|
||||
import io
|
||||
from typing import Optional
|
||||
from urllib.parse import unquote
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
|
||||
|
||||
class DataFileSystem(AbstractFileSystem):
|
||||
"""A handy decoder for data-URLs
|
||||
|
||||
Example
|
||||
-------
|
||||
>>> with fsspec.open("data:,Hello%2C%20World%21") as f:
|
||||
... print(f.read())
|
||||
b"Hello, World!"
|
||||
|
||||
See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
|
||||
"""
|
||||
|
||||
protocol = "data"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""No parameters for this filesystem"""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
pref, data = path.split(",", 1)
|
||||
if pref.endswith("base64"):
|
||||
return base64.b64decode(data)[start:end]
|
||||
return unquote(data).encode()[start:end]
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
pref, name = path.split(",", 1)
|
||||
data = self.cat_file(path)
|
||||
mime = pref.split(":", 1)[1].split(";", 1)[0]
|
||||
return {"name": name, "size": len(data), "type": "file", "mimetype": mime}
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
if "r" not in mode:
|
||||
raise ValueError("Read only filesystem")
|
||||
return io.BytesIO(self.cat_file(path))
|
||||
|
||||
@staticmethod
|
||||
def encode(data: bytes, mime: Optional[str] = None):
|
||||
"""Format the given data into data-URL syntax
|
||||
|
||||
This version always base64 encodes, even when the data is ascii/url-safe.
|
||||
"""
|
||||
return f"data:{mime or ''};base64,{base64.b64encode(data).decode()}"
|
||||
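A round-trip sketch for the data-URL filesystem above: encode some bytes, then read them back through the registered "data" protocol (no external services or paths are involved):

import fsspec

fs = fsspec.filesystem("data")  # the registered protocol for this class
url = fs.encode(b"hello world", mime="text/plain")
with fsspec.open(url) as f:
    print(f.read())  # b'hello world'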
@@ -0,0 +1,496 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import urllib
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter, Retry
|
||||
from typing_extensions import override
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
from fsspec.spec import AbstractBufferedFile
|
||||
|
||||
|
||||
class DatabricksException(Exception):
|
||||
"""
|
||||
Helper class for exceptions raised in this module.
|
||||
"""
|
||||
|
||||
def __init__(self, error_code, message, details=None):
|
||||
"""Create a new DatabricksException"""
|
||||
super().__init__(message)
|
||||
|
||||
self.error_code = error_code
|
||||
self.message = message
|
||||
self.details = details
|
||||
|
||||
|
||||
class DatabricksFileSystem(AbstractFileSystem):
|
||||
"""
|
||||
Get access to the Databricks filesystem implementation over HTTP.
|
||||
Can be used inside and outside of a databricks cluster.
|
||||
"""
|
||||
|
||||
def __init__(self, instance, token, **kwargs):
|
||||
"""
|
||||
Create a new DatabricksFileSystem.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
instance: str
|
||||
The instance URL of the databricks cluster.
|
||||
For example for an Azure databricks cluster, this
|
||||
has the form adb-<some-number>.<two digits>.azuredatabricks.net.
|
||||
token: str
|
||||
Your personal token. Find out more
|
||||
here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
|
||||
"""
|
||||
self.instance = instance
|
||||
self.token = token
|
||||
self.session = requests.Session()
|
||||
self.retries = Retry(
|
||||
total=10,
|
||||
backoff_factor=0.05,
|
||||
status_forcelist=[408, 429, 500, 502, 503, 504],
|
||||
)
|
||||
|
||||
self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
|
||||
self.session.headers.update({"Authorization": f"Bearer {self.token}"})
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@override
|
||||
def _ls_from_cache(self, path) -> list[dict[str, str | int]] | None:
|
||||
"""Check cache for listing
|
||||
|
||||
Returns listing, if found (may be empty list for a directory that
|
||||
exists but contains nothing), None if not in cache.
|
||||
"""
|
||||
self.dircache.pop(path.rstrip("/"), None)
|
||||
|
||||
parent = self._parent(path)
|
||||
if parent in self.dircache:
|
||||
for entry in self.dircache[parent]:
|
||||
if entry["name"] == path.rstrip("/"):
|
||||
if entry["type"] != "directory":
|
||||
return [entry]
|
||||
return []
|
||||
raise FileNotFoundError(path)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
"""
|
||||
List the contents of the given path.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path
|
||||
detail: bool
|
||||
Return not only the list of filenames,
|
||||
but also additional information on file sizes
|
||||
and types.
|
||||
"""
|
||||
try:
|
||||
out = self._ls_from_cache(path)
|
||||
except FileNotFoundError:
|
||||
# This happens if the `path`'s parent was cached, but `path` is not
|
||||
# there. This suggests that `path` is new since the parent was
|
||||
# cached. Attempt to invalidate parent's cache before continuing.
|
||||
self.dircache.pop(self._parent(path), None)
|
||||
out = None
|
||||
|
||||
if not out:
|
||||
try:
|
||||
r = self._send_to_api(
|
||||
method="get", endpoint="list", json={"path": path}
|
||||
)
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
|
||||
raise
|
||||
files = r.get("files", [])
|
||||
out = [
|
||||
{
|
||||
"name": o["path"],
|
||||
"type": "directory" if o["is_dir"] else "file",
|
||||
"size": o["file_size"],
|
||||
}
|
||||
for o in files
|
||||
]
|
||||
self.dircache[path] = out
|
||||
|
||||
if detail:
|
||||
return out
|
||||
return [o["name"] for o in out]
|
||||
|
||||
def makedirs(self, path, exist_ok=True):
|
||||
"""
|
||||
Create a given absolute path and all of its parents.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path to create
|
||||
exist_ok: bool
|
||||
If false, checks if the folder
|
||||
exists before creating it (and raises an
|
||||
Exception if this is the case)
|
||||
"""
|
||||
if not exist_ok:
|
||||
try:
|
||||
# If the following succeeds, the path is already present
|
||||
self._send_to_api(
|
||||
method="get", endpoint="get-status", json={"path": path}
|
||||
)
|
||||
raise FileExistsError(f"Path {path} already exists")
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
pass
|
||||
|
||||
try:
|
||||
self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_ALREADY_EXISTS":
|
||||
raise FileExistsError(e.message) from e
|
||||
|
||||
raise
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
"""
|
||||
Create a given absolute path and all of its parents.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path to create
|
||||
create_parents: bool
|
||||
Whether to create all parents or not.
|
||||
"False" is not implemented so far.
|
||||
"""
|
||||
if not create_parents:
|
||||
raise NotImplementedError
|
||||
|
||||
self.mkdirs(path, **kwargs)
|
||||
|
||||
def rm(self, path, recursive=False, **kwargs):
|
||||
"""
|
||||
Remove the file or folder at the given absolute path.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path what to remove
|
||||
recursive: bool
|
||||
Recursively delete all files in a folder.
|
||||
"""
|
||||
try:
|
||||
self._send_to_api(
|
||||
method="post",
|
||||
endpoint="delete",
|
||||
json={"path": path, "recursive": recursive},
|
||||
)
|
||||
except DatabricksException as e:
|
||||
# This is not really an exception, it just means
|
||||
# not everything was deleted so far
|
||||
if e.error_code == "PARTIAL_DELETE":
|
||||
self.rm(path=path, recursive=recursive)
|
||||
elif e.error_code == "IO_ERROR":
|
||||
# Using the same exception as the os module would use here
|
||||
raise OSError(e.message) from e
|
||||
|
||||
raise
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def mv(
|
||||
self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
|
||||
):
|
||||
"""
|
||||
Move a source to a destination path.
|
||||
|
||||
A note from the original [databricks API manual]
|
||||
(https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).
|
||||
|
||||
When moving a large number of files the API call will time out after
|
||||
approximately 60s, potentially resulting in partially moved data.
|
||||
Therefore, for operations that move more than 10k files, we strongly
|
||||
discourage using the DBFS REST API.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source_path: str
|
||||
From where to move (absolute path)
|
||||
destination_path: str
|
||||
To where to move (absolute path)
|
||||
recursive: bool
|
||||
Not implemented so far.
|
||||
maxdepth:
|
||||
Not implemented so far.
|
||||
"""
|
||||
if recursive:
|
||||
raise NotImplementedError
|
||||
if maxdepth:
|
||||
raise NotImplementedError
|
||||
|
||||
try:
|
||||
self._send_to_api(
|
||||
method="post",
|
||||
endpoint="move",
|
||||
json={"source_path": source_path, "destination_path": destination_path},
|
||||
)
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
elif e.error_code == "RESOURCE_ALREADY_EXISTS":
|
||||
raise FileExistsError(e.message) from e
|
||||
|
||||
raise
|
||||
self.invalidate_cache(self._parent(source_path))
|
||||
self.invalidate_cache(self._parent(destination_path))
|
||||
|
||||
def _open(self, path, mode="rb", block_size="default", **kwargs):
|
||||
"""
|
||||
Overwrite the base class method to make sure to create a DBFile.
|
||||
All arguments are copied from the base method.
|
||||
|
||||
Only the default blocksize is allowed.
|
||||
"""
|
||||
return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)
|
||||
|
||||
def _send_to_api(self, method, endpoint, json):
|
||||
"""
|
||||
Send the given json to the DBFS API
|
||||
using a get or post request (specified by the argument `method`).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
method: str
|
||||
Which http method to use for communication; "get" or "post".
|
||||
endpoint: str
|
||||
Where to send the request to (last part of the API URL)
|
||||
json: dict
|
||||
Dictionary of information to send
|
||||
"""
|
||||
if method == "post":
|
||||
session_call = self.session.post
|
||||
elif method == "get":
|
||||
session_call = self.session.get
|
||||
else:
|
||||
raise ValueError(f"Do not understand method {method}")
|
||||
|
||||
url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)
|
||||
|
||||
r = session_call(url, json=json)
|
||||
|
||||
# The DBFS API will return a json, also in case of an exception.
|
||||
# We want to preserve this information as well as possible.
|
||||
try:
|
||||
r.raise_for_status()
|
||||
except requests.HTTPError as e:
|
||||
# try to extract json error message
|
||||
# if that fails, fall back to the original exception
|
||||
try:
|
||||
exception_json = e.response.json()
|
||||
except Exception:
|
||||
raise e from None
|
||||
|
||||
raise DatabricksException(**exception_json) from e
|
||||
|
||||
return r.json()
|
||||
|
||||
def _create_handle(self, path, overwrite=True):
|
||||
"""
|
||||
Internal function to create a handle, which can be used to
|
||||
write blocks of a file to DBFS.
|
||||
A handle has a unique identifier which needs to be passed
|
||||
whenever written during this transaction.
|
||||
The handle is active for 10 minutes - after that a new
|
||||
write transaction needs to be created.
|
||||
Make sure to close the handle after you are finished.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path for this file.
|
||||
overwrite: bool
|
||||
If a file already exist at this location, either overwrite
|
||||
it or raise an exception.
|
||||
"""
|
||||
try:
|
||||
r = self._send_to_api(
|
||||
method="post",
|
||||
endpoint="create",
|
||||
json={"path": path, "overwrite": overwrite},
|
||||
)
|
||||
return r["handle"]
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_ALREADY_EXISTS":
|
||||
raise FileExistsError(e.message) from e
|
||||
|
||||
raise
|
||||
|
||||
def _close_handle(self, handle):
|
||||
"""
|
||||
Close a handle, which was opened by :func:`_create_handle`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
handle: str
|
||||
Which handle to close.
|
||||
"""
|
||||
try:
|
||||
self._send_to_api(method="post", endpoint="close", json={"handle": handle})
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
|
||||
raise
|
||||
|
||||
def _add_data(self, handle, data):
|
||||
"""
|
||||
Upload data to an already opened file handle
|
||||
(opened by :func:`_create_handle`).
|
||||
The maximal allowed data size is 1MB after
|
||||
conversion to base64.
|
||||
Remember to close the handle when you are finished.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
handle: str
|
||||
Which handle to upload data to.
|
||||
data: bytes
|
||||
Block of data to add to the handle.
|
||||
"""
|
||||
data = base64.b64encode(data).decode()
|
||||
try:
|
||||
self._send_to_api(
|
||||
method="post",
|
||||
endpoint="add-block",
|
||||
json={"handle": handle, "data": data},
|
||||
)
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
|
||||
raise ValueError(e.message) from e
|
||||
|
||||
raise
|
||||
|
||||
def _get_data(self, path, start, end):
|
||||
"""
|
||||
Download data in bytes from a given absolute path in a block
|
||||
from [start, end).
|
||||
The maximum number of allowed bytes to read is 1MB.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path to download data from
|
||||
start: int
|
||||
Start position of the block
|
||||
end: int
|
||||
End position of the block
|
||||
"""
|
||||
try:
|
||||
r = self._send_to_api(
|
||||
method="get",
|
||||
endpoint="read",
|
||||
json={"path": path, "offset": start, "length": end - start},
|
||||
)
|
||||
return base64.b64decode(r["data"])
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
|
||||
raise ValueError(e.message) from e
|
||||
|
||||
raise
|
||||
|
||||
def invalidate_cache(self, path=None):
|
||||
if path is None:
|
||||
self.dircache.clear()
|
||||
else:
|
||||
self.dircache.pop(path, None)
|
||||
super().invalidate_cache(path)
|
||||
|
||||
|
||||
class DatabricksFile(AbstractBufferedFile):
|
||||
"""
|
||||
Helper class for files referenced in the DatabricksFileSystem.
|
||||
"""
|
||||
|
||||
DEFAULT_BLOCK_SIZE = 1 * 2**20 # only allowed block size
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size="default",
|
||||
autocommit=True,
|
||||
cache_type="readahead",
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Create a new instance of the DatabricksFile.
|
||||
|
||||
The blocksize needs to be the default one.
|
||||
"""
|
||||
if block_size is None or block_size == "default":
|
||||
block_size = self.DEFAULT_BLOCK_SIZE
|
||||
|
||||
assert block_size == self.DEFAULT_BLOCK_SIZE, (
|
||||
f"Only the default block size is allowed, not {block_size}"
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
fs,
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_type=cache_type,
|
||||
cache_options=cache_options or {},
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _initiate_upload(self):
|
||||
"""Internal function to start a file upload"""
|
||||
self.handle = self.fs._create_handle(self.path)
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
"""Internal function to add a chunk of data to a started upload"""
|
||||
self.buffer.seek(0)
|
||||
data = self.buffer.getvalue()
|
||||
|
||||
data_chunks = [
|
||||
data[start:end] for start, end in self._to_sized_blocks(len(data))
|
||||
]
|
||||
|
||||
for data_chunk in data_chunks:
|
||||
self.fs._add_data(handle=self.handle, data=data_chunk)
|
||||
|
||||
if final:
|
||||
self.fs._close_handle(handle=self.handle)
|
||||
return True
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Internal function to download a block of data"""
|
||||
return_buffer = b""
|
||||
length = end - start
|
||||
for chunk_start, chunk_end in self._to_sized_blocks(length, start):
|
||||
return_buffer += self.fs._get_data(
|
||||
path=self.path, start=chunk_start, end=chunk_end
|
||||
)
|
||||
|
||||
return return_buffer
|
||||
|
||||
def _to_sized_blocks(self, length, start=0):
|
||||
"""Helper function to split a range from 0 to total_length into blocksizes"""
|
||||
end = start + length
|
||||
for data_chunk in range(start, end, self.blocksize):
|
||||
data_start = data_chunk
|
||||
data_end = min(end, data_chunk + self.blocksize)
|
||||
yield data_start, data_end
|
||||
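A usage sketch for the DBFS filesystem above; the workspace instance and token are placeholders that would be replaced with real credentials, and the /FileStore paths are illustrative:

import fsspec

fs = fsspec.filesystem(
    "dbfs",  # registered protocol for DatabricksFileSystem
    instance="adb-1234567890123456.7.azuredatabricks.net",  # placeholder
    token="<personal-access-token>",                        # placeholder
)

with fs.open("/FileStore/example.txt", "wb") as f:
    f.write(b"written through the DBFS REST API")
print(fs.ls("/FileStore", detail=False))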
@@ -0,0 +1,388 @@
|
||||
from .. import filesystem
|
||||
from ..asyn import AsyncFileSystem
|
||||
|
||||
|
||||
class DirFileSystem(AsyncFileSystem):
|
||||
"""Directory prefix filesystem
|
||||
|
||||
The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
|
||||
is relative to the `path`. After performing the necessary paths operation it
|
||||
delegates everything to the wrapped filesystem.
|
||||
"""
|
||||
|
||||
protocol = "dir"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path=None,
|
||||
fs=None,
|
||||
fo=None,
|
||||
target_protocol=None,
|
||||
target_options=None,
|
||||
**storage_options,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Path to the directory.
|
||||
fs: AbstractFileSystem
|
||||
An instantiated filesystem to wrap.
|
||||
target_protocol, target_options:
|
||||
if fs is none, construct it from these
|
||||
fo: str
|
||||
Alternate for path; do not provide both
|
||||
"""
|
||||
super().__init__(**storage_options)
|
||||
if fs is None:
|
||||
fs = filesystem(protocol=target_protocol, **(target_options or {}))
|
||||
path = path or fo
|
||||
|
||||
if self.asynchronous and not fs.async_impl:
|
||||
raise ValueError("can't use asynchronous with non-async fs")
|
||||
|
||||
if fs.async_impl and self.asynchronous != fs.asynchronous:
|
||||
raise ValueError("both dirfs and fs should be in the same sync/async mode")
|
||||
|
||||
self.path = fs._strip_protocol(path)
|
||||
self.fs = fs
|
||||
|
||||
def _join(self, path):
|
||||
if isinstance(path, str):
|
||||
if not self.path:
|
||||
return path
|
||||
if not path:
|
||||
return self.path
|
||||
return self.fs.sep.join((self.path, self._strip_protocol(path)))
|
||||
if isinstance(path, dict):
|
||||
return {self._join(_path): value for _path, value in path.items()}
|
||||
return [self._join(_path) for _path in path]
|
||||
|
||||
def _relpath(self, path):
|
||||
if isinstance(path, str):
|
||||
if not self.path:
|
||||
return path
|
||||
# We need to account for S3FileSystem returning paths that do not
|
||||
# start with a '/'
|
||||
if path == self.path or (
|
||||
self.path.startswith(self.fs.sep) and path == self.path[1:]
|
||||
):
|
||||
return ""
|
||||
prefix = self.path + self.fs.sep
|
||||
if self.path.startswith(self.fs.sep) and not path.startswith(self.fs.sep):
|
||||
prefix = prefix[1:]
|
||||
assert path.startswith(prefix)
|
||||
return path[len(prefix) :]
|
||||
return [self._relpath(_path) for _path in path]
|
||||
|
||||
# Wrappers below
|
||||
|
||||
@property
|
||||
def sep(self):
|
||||
return self.fs.sep
|
||||
|
||||
async def set_session(self, *args, **kwargs):
|
||||
return await self.fs.set_session(*args, **kwargs)
|
||||
|
||||
async def _rm_file(self, path, **kwargs):
|
||||
return await self.fs._rm_file(self._join(path), **kwargs)
|
||||
|
||||
def rm_file(self, path, **kwargs):
|
||||
return self.fs.rm_file(self._join(path), **kwargs)
|
||||
|
||||
async def _rm(self, path, *args, **kwargs):
|
||||
return await self.fs._rm(self._join(path), *args, **kwargs)
|
||||
|
||||
def rm(self, path, *args, **kwargs):
|
||||
return self.fs.rm(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _cp_file(self, path1, path2, **kwargs):
|
||||
return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)
|
||||
|
||||
def cp_file(self, path1, path2, **kwargs):
|
||||
return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)
|
||||
|
||||
async def _copy(
|
||||
self,
|
||||
path1,
|
||||
path2,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return await self.fs._copy(
|
||||
self._join(path1),
|
||||
self._join(path2),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def copy(self, path1, path2, *args, **kwargs):
|
||||
return self.fs.copy(
|
||||
self._join(path1),
|
||||
self._join(path2),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
async def _pipe(self, path, *args, **kwargs):
|
||||
return await self.fs._pipe(self._join(path), *args, **kwargs)
|
||||
|
||||
def pipe(self, path, *args, **kwargs):
|
||||
return self.fs.pipe(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _pipe_file(self, path, *args, **kwargs):
|
||||
return await self.fs._pipe_file(self._join(path), *args, **kwargs)
|
||||
|
||||
def pipe_file(self, path, *args, **kwargs):
|
||||
return self.fs.pipe_file(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _cat_file(self, path, *args, **kwargs):
|
||||
return await self.fs._cat_file(self._join(path), *args, **kwargs)
|
||||
|
||||
def cat_file(self, path, *args, **kwargs):
|
||||
return self.fs.cat_file(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _cat(self, path, *args, **kwargs):
|
||||
ret = await self.fs._cat(
|
||||
self._join(path),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if isinstance(ret, dict):
|
||||
return {self._relpath(key): value for key, value in ret.items()}
|
||||
|
||||
return ret
|
||||
|
||||
def cat(self, path, *args, **kwargs):
|
||||
ret = self.fs.cat(
|
||||
self._join(path),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if isinstance(ret, dict):
|
||||
return {self._relpath(key): value for key, value in ret.items()}
|
||||
|
||||
return ret
|
||||
|
||||
async def _put_file(self, lpath, rpath, **kwargs):
|
||||
return await self.fs._put_file(lpath, self._join(rpath), **kwargs)
|
||||
|
||||
def put_file(self, lpath, rpath, **kwargs):
|
||||
return self.fs.put_file(lpath, self._join(rpath), **kwargs)
|
||||
|
||||
async def _put(
|
||||
self,
|
||||
lpath,
|
||||
rpath,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return await self.fs._put(
|
||||
lpath,
|
||||
self._join(rpath),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def put(self, lpath, rpath, *args, **kwargs):
|
||||
return self.fs.put(
|
||||
lpath,
|
||||
self._join(rpath),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
async def _get_file(self, rpath, lpath, **kwargs):
|
||||
return await self.fs._get_file(self._join(rpath), lpath, **kwargs)
|
||||
|
||||
def get_file(self, rpath, lpath, **kwargs):
|
||||
return self.fs.get_file(self._join(rpath), lpath, **kwargs)
|
||||
|
||||
async def _get(self, rpath, *args, **kwargs):
|
||||
return await self.fs._get(self._join(rpath), *args, **kwargs)
|
||||
|
||||
def get(self, rpath, *args, **kwargs):
|
||||
return self.fs.get(self._join(rpath), *args, **kwargs)
|
||||
|
||||
async def _isfile(self, path):
|
||||
return await self.fs._isfile(self._join(path))
|
||||
|
||||
def isfile(self, path):
|
||||
return self.fs.isfile(self._join(path))
|
||||
|
||||
async def _isdir(self, path):
|
||||
return await self.fs._isdir(self._join(path))
|
||||
|
||||
def isdir(self, path):
|
||||
return self.fs.isdir(self._join(path))
|
||||
|
||||
async def _size(self, path):
|
||||
return await self.fs._size(self._join(path))
|
||||
|
||||
def size(self, path):
|
||||
return self.fs.size(self._join(path))
|
||||
|
||||
async def _exists(self, path):
|
||||
return await self.fs._exists(self._join(path))
|
||||
|
||||
def exists(self, path):
|
||||
return self.fs.exists(self._join(path))
|
||||
|
||||
async def _info(self, path, **kwargs):
|
||||
info = await self.fs._info(self._join(path), **kwargs)
|
||||
info = info.copy()
|
||||
info["name"] = self._relpath(info["name"])
|
||||
return info
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
info = self.fs.info(self._join(path), **kwargs)
|
||||
info = info.copy()
|
||||
info["name"] = self._relpath(info["name"])
|
||||
return info
|
||||
|
||||
async def _ls(self, path, detail=True, **kwargs):
|
||||
ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
|
||||
if detail:
|
||||
out = []
|
||||
for entry in ret:
|
||||
entry = entry.copy()
|
||||
entry["name"] = self._relpath(entry["name"])
|
||||
out.append(entry)
|
||||
return out
|
||||
|
||||
return self._relpath(ret)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
|
||||
if detail:
|
||||
out = []
|
||||
for entry in ret:
|
||||
entry = entry.copy()
|
||||
entry["name"] = self._relpath(entry["name"])
|
||||
out.append(entry)
|
||||
return out
|
||||
|
||||
return self._relpath(ret)
|
||||
|
||||
async def _walk(self, path, *args, **kwargs):
|
||||
async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
|
||||
yield self._relpath(root), dirs, files
|
||||
|
||||
def walk(self, path, *args, **kwargs):
|
||||
for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
|
||||
yield self._relpath(root), dirs, files
|
||||
|
||||
async def _glob(self, path, **kwargs):
|
||||
detail = kwargs.get("detail", False)
|
||||
ret = await self.fs._glob(self._join(path), **kwargs)
|
||||
if detail:
|
||||
return {self._relpath(path): info for path, info in ret.items()}
|
||||
return self._relpath(ret)
|
||||
|
||||
def glob(self, path, **kwargs):
|
||||
detail = kwargs.get("detail", False)
|
||||
ret = self.fs.glob(self._join(path), **kwargs)
|
||||
if detail:
|
||||
return {self._relpath(path): info for path, info in ret.items()}
|
||||
return self._relpath(ret)
|
||||
|
||||
async def _du(self, path, *args, **kwargs):
|
||||
total = kwargs.get("total", True)
|
||||
ret = await self.fs._du(self._join(path), *args, **kwargs)
|
||||
if total:
|
||||
return ret
|
||||
|
||||
return {self._relpath(path): size for path, size in ret.items()}
|
||||
|
||||
def du(self, path, *args, **kwargs):
|
||||
total = kwargs.get("total", True)
|
||||
ret = self.fs.du(self._join(path), *args, **kwargs)
|
||||
if total:
|
||||
return ret
|
||||
|
||||
return {self._relpath(path): size for path, size in ret.items()}
|
||||
|
||||
async def _find(self, path, *args, **kwargs):
|
||||
detail = kwargs.get("detail", False)
|
||||
ret = await self.fs._find(self._join(path), *args, **kwargs)
|
||||
if detail:
|
||||
return {self._relpath(path): info for path, info in ret.items()}
|
||||
return self._relpath(ret)
|
||||
|
||||
def find(self, path, *args, **kwargs):
|
||||
detail = kwargs.get("detail", False)
|
||||
ret = self.fs.find(self._join(path), *args, **kwargs)
|
||||
if detail:
|
||||
return {self._relpath(path): info for path, info in ret.items()}
|
||||
return self._relpath(ret)
|
||||
|
||||
async def _expand_path(self, path, *args, **kwargs):
|
||||
return self._relpath(
|
||||
await self.fs._expand_path(self._join(path), *args, **kwargs)
|
||||
)
|
||||
|
||||
def expand_path(self, path, *args, **kwargs):
|
||||
return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))
|
||||
|
||||
async def _mkdir(self, path, *args, **kwargs):
|
||||
return await self.fs._mkdir(self._join(path), *args, **kwargs)
|
||||
|
||||
def mkdir(self, path, *args, **kwargs):
|
||||
return self.fs.mkdir(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _makedirs(self, path, *args, **kwargs):
|
||||
return await self.fs._makedirs(self._join(path), *args, **kwargs)
|
||||
|
||||
def makedirs(self, path, *args, **kwargs):
|
||||
return self.fs.makedirs(self._join(path), *args, **kwargs)
|
||||
|
||||
def rmdir(self, path):
|
||||
return self.fs.rmdir(self._join(path))
|
||||
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
return self.fs.mv(
|
||||
self._join(path1),
|
||||
self._join(path2),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def touch(self, path, **kwargs):
|
||||
return self.fs.touch(self._join(path), **kwargs)
|
||||
|
||||
def created(self, path):
|
||||
return self.fs.created(self._join(path))
|
||||
|
||||
def modified(self, path):
|
||||
return self.fs.modified(self._join(path))
|
||||
|
||||
def sign(self, path, *args, **kwargs):
|
||||
return self.fs.sign(self._join(path), *args, **kwargs)
|
||||
|
||||
def __repr__(self):
|
||||
return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"
|
||||
|
||||
def open(
|
||||
self,
|
||||
path,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return self.fs.open(
|
||||
self._join(path),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
async def open_async(
|
||||
self,
|
||||
path,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return await self.fs.open_async(
|
||||
self._join(path),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
@@ -0,0 +1,387 @@
|
||||
import os
|
||||
import uuid
|
||||
from ftplib import FTP, FTP_TLS, Error, error_perm
|
||||
from typing import Any
|
||||
|
||||
from ..spec import AbstractBufferedFile, AbstractFileSystem
|
||||
from ..utils import infer_storage_options, isfilelike
|
||||
|
||||
|
||||
class FTPFileSystem(AbstractFileSystem):
|
||||
"""A filesystem over classic FTP"""
|
||||
|
||||
root_marker = "/"
|
||||
cachable = False
|
||||
protocol = "ftp"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host,
|
||||
port=21,
|
||||
username=None,
|
||||
password=None,
|
||||
acct=None,
|
||||
block_size=None,
|
||||
tempdir=None,
|
||||
timeout=30,
|
||||
encoding="utf-8",
|
||||
tls=False,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
You can use _get_kwargs_from_urls to get some kwargs from
|
||||
a reasonable FTP url.
|
||||
|
||||
Authentication will be anonymous if username/password are not
|
||||
given.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
The remote server name/ip to connect to
|
||||
port: int
|
||||
Port to connect with
|
||||
username: str or None
|
||||
If authenticating, the user's identifier
|
||||
password: str or None
|
||||
User's password on the server, if using
|
||||
acct: str or None
|
||||
Some servers also need an "account" string for auth
|
||||
block_size: int or None
|
||||
If given, the read-ahead or write buffer size.
|
||||
tempdir: str
|
||||
Directory on remote to put temporary files when in a transaction
|
||||
timeout: int
|
||||
Timeout of the ftp connection in seconds
|
||||
encoding: str
|
||||
Encoding to use for directories and filenames in FTP connection
|
||||
tls: bool
|
||||
Use FTP-TLS, by default False
|
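
Examples
--------
A minimal usage sketch; the host, credentials and paths below are
placeholders, not values taken from this module:

>>> fs = FTPFileSystem(host="ftp.example.com", username="demo",
...                    password="demo")  # doctest: +SKIP
>>> fs.ls("/pub", detail=False)  # doctest: +SKIP
>>> with fs.open("/pub/readme.txt", "rb") as f:  # doctest: +SKIP
...     data = f.read()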
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.tempdir = tempdir or "/tmp"
|
||||
self.cred = username or "", password or "", acct or ""
|
||||
self.timeout = timeout
|
||||
self.encoding = encoding
|
||||
if block_size is not None:
|
||||
self.blocksize = block_size
|
||||
else:
|
||||
self.blocksize = 2**16
|
||||
self.tls = tls
|
||||
self._connect()
|
||||
if self.tls:
|
||||
self.ftp.prot_p()
|
||||
|
||||
def _connect(self):
|
||||
if self.tls:
|
||||
ftp_cls = FTP_TLS
|
||||
else:
|
||||
ftp_cls = FTP
|
||||
self.ftp = ftp_cls(timeout=self.timeout, encoding=self.encoding)
|
||||
self.ftp.connect(self.host, self.port)
|
||||
self.ftp.login(*self.cred)
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(urlpath):
|
||||
out = infer_storage_options(urlpath)
|
||||
out.pop("path", None)
|
||||
out.pop("protocol", None)
|
||||
return out
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
out = []
|
||||
if path not in self.dircache:
|
||||
try:
|
||||
try:
|
||||
out = [
|
||||
(fn, details)
|
||||
for (fn, details) in self.ftp.mlsd(path)
|
||||
if fn not in [".", ".."]
|
||||
and details["type"] not in ["pdir", "cdir"]
|
||||
]
|
||||
except error_perm:
|
||||
out = _mlsd2(self.ftp, path) # Not platform independent
|
||||
for fn, details in out:
|
||||
details["name"] = "/".join(
|
||||
["" if path == "/" else path, fn.lstrip("/")]
|
||||
)
|
||||
if details["type"] == "file":
|
||||
details["size"] = int(details["size"])
|
||||
else:
|
||||
details["size"] = 0
|
||||
if details["type"] == "dir":
|
||||
details["type"] = "directory"
|
||||
self.dircache[path] = out
|
||||
except Error:
|
||||
try:
|
||||
info = self.info(path)
|
||||
if info["type"] == "file":
|
||||
out = [(path, info)]
|
||||
except (Error, IndexError) as exc:
|
||||
raise FileNotFoundError(path) from exc
|
||||
files = self.dircache.get(path, out)
|
||||
if not detail:
|
||||
return sorted([fn for fn, details in files])
|
||||
return [details for fn, details in files]
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
# implement with direct method
|
||||
path = self._strip_protocol(path)
|
||||
if path == "/":
|
||||
# special case, since this dir has no real entry
|
||||
return {"name": "/", "size": 0, "type": "directory"}
|
||||
files = self.ls(self._parent(path).lstrip("/"), True)
|
||||
try:
|
||||
out = next(f for f in files if f["name"] == path)
|
||||
except StopIteration as exc:
|
||||
raise FileNotFoundError(path) from exc
|
||||
return out
|
||||
|
||||
def get_file(self, rpath, lpath, **kwargs):
|
||||
if self.isdir(rpath):
|
||||
if not os.path.exists(lpath):
|
||||
os.mkdir(lpath)
|
||||
return
|
||||
if isfilelike(lpath):
|
||||
outfile = lpath
|
||||
else:
|
||||
outfile = open(lpath, "wb")
|
||||
|
||||
def cb(x):
|
||||
outfile.write(x)
|
||||
|
||||
self.ftp.retrbinary(
|
||||
f"RETR {rpath}",
|
||||
blocksize=self.blocksize,
|
||||
callback=cb,
|
||||
)
|
||||
if not isfilelike(lpath):
|
||||
outfile.close()
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
if end is not None:
|
||||
return super().cat_file(path, start, end, **kwargs)
|
||||
out = []
|
||||
|
||||
def cb(x):
|
||||
out.append(x)
|
||||
|
||||
try:
|
||||
self.ftp.retrbinary(
|
||||
f"RETR {path}",
|
||||
blocksize=self.blocksize,
|
||||
rest=start,
|
||||
callback=cb,
|
||||
)
|
||||
except (Error, error_perm) as orig_exc:
|
||||
raise FileNotFoundError(path) from orig_exc
|
||||
return b"".join(out)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
cache_options=None,
|
||||
autocommit=True,
|
||||
**kwargs,
|
||||
):
|
||||
path = self._strip_protocol(path)
|
||||
block_size = block_size or self.blocksize
|
||||
return FTPFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
tempdir=self.tempdir,
|
||||
autocommit=autocommit,
|
||||
cache_options=cache_options,
|
||||
)
|
||||
|
||||
def _rm(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.ftp.delete(path)
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def rm(self, path, recursive=False, maxdepth=None):
|
||||
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
|
||||
for p in reversed(paths):
|
||||
if self.isfile(p):
|
||||
self.rm_file(p)
|
||||
else:
|
||||
self.rmdir(p)
|
||||
|
||||
def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
|
||||
path = self._strip_protocol(path)
|
||||
parent = self._parent(path)
|
||||
if parent != self.root_marker and not self.exists(parent) and create_parents:
|
||||
self.mkdir(parent, create_parents=create_parents)
|
||||
|
||||
self.ftp.mkd(path)
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def makedirs(self, path: str, exist_ok: bool = False) -> None:
|
||||
path = self._strip_protocol(path)
|
||||
if self.exists(path):
|
||||
# NB: "/" does not "exist" as it has no directory entry
|
||||
if not exist_ok:
|
||||
raise FileExistsError(f"{path} exists without `exist_ok`")
|
||||
# exist_ok=True -> no-op
|
||||
else:
|
||||
self.mkdir(path, create_parents=True)
|
||||
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.ftp.rmd(path)
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1)
|
||||
path2 = self._strip_protocol(path2)
|
||||
self.ftp.rename(path1, path2)
|
||||
self.invalidate_cache(self._parent(path1))
|
||||
self.invalidate_cache(self._parent(path2))
|
||||
|
||||
def __del__(self):
|
||||
self.ftp.close()
|
||||
|
||||
def invalidate_cache(self, path=None):
|
||||
if path is None:
|
||||
self.dircache.clear()
|
||||
else:
|
||||
self.dircache.pop(path, None)
|
||||
super().invalidate_cache(path)
|
||||
|
||||
|
||||
class TransferDone(Exception):
|
||||
"""Internal exception to break out of transfer"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class FTPFile(AbstractBufferedFile):
|
||||
"""Interact with a remote FTP file with read/write buffering"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size="default",
|
||||
autocommit=True,
|
||||
cache_type="readahead",
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
fs,
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_type=cache_type,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
if not autocommit:
|
||||
self.target = self.path
|
||||
self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])
|
||||
|
||||
def commit(self):
|
||||
self.fs.mv(self.path, self.target)
|
||||
|
||||
def discard(self):
|
||||
self.fs.rm(self.path)
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Get bytes between given byte limits
|
||||
|
||||
Implemented by raising an exception in the fetch callback when the
|
||||
number of bytes received reaches the requested amount.
|
||||
|
||||
Will fail if the server does not respect the REST command on
|
||||
retrieve requests.
|
||||
"""
|
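# Worked example with illustrative numbers: for start=0, end=100 and a much
# larger blocksize, the first chunk passed to the callback may hold e.g.
# 65536 bytes; total[0] then exceeds end - start, so only the first 100
# bytes are kept and TransferDone aborts the transfer early (unless
# end == self.size, in which case the remaining bytes are simply the tail).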
||||
out = []
|
||||
total = [0]
|
||||
|
||||
def callback(x):
|
||||
total[0] += len(x)
|
||||
if total[0] > end - start:
|
||||
out.append(x[: (end - start) - total[0]])
|
||||
if end < self.size:
|
||||
raise TransferDone
|
||||
else:
|
||||
out.append(x)
|
||||
|
||||
if total[0] == end - start and end < self.size:
|
||||
raise TransferDone
|
||||
|
||||
try:
|
||||
self.fs.ftp.retrbinary(
|
||||
f"RETR {self.path}",
|
||||
blocksize=self.blocksize,
|
||||
rest=start,
|
||||
callback=callback,
|
||||
)
|
||||
except TransferDone:
|
||||
try:
|
||||
# stop transfer, we got enough bytes for this block
|
||||
self.fs.ftp.abort()
|
||||
self.fs.ftp.getmultiline()
|
||||
except Error:
|
||||
self.fs._connect()
|
||||
|
||||
return b"".join(out)
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
self.buffer.seek(0)
|
||||
self.fs.ftp.storbinary(
|
||||
f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
def _mlsd2(ftp, path="."):
|
||||
"""
|
||||
Fall back to using `dir` instead of `mlsd` if not supported.
|
||||
|
||||
This parses a Linux style `ls -l` response to `dir`, but the response may
|
||||
be platform dependent.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ftp: ftplib.FTP
|
||||
path: str
|
||||
Remote path to list; defaults to the current directory ".".
|
||||
"""
|
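# Illustrative sketch of the parsing below (hypothetical listing line):
#   "-rw-r--r--   1 owner group  1024 Jan 01 12:00 data.csv"
# would be turned into roughly
#   ("data.csv", {"modify": "Jan 01 12:00", "unix.owner": "owner",
#                 "unix.group": "group", "unix.mode": "-rw-r--r--",
#                 "size": "1024", "type": "file"})
# The exact columns depend on the server's LIST output, hence the caveat
# in the docstring above.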
||||
lines = []
|
||||
minfo = []
|
||||
ftp.dir(path, lines.append)
|
||||
for line in lines:
|
||||
split_line = line.split()
|
||||
if len(split_line) < 9:
|
||||
continue
|
||||
this = (
|
||||
split_line[-1],
|
||||
{
|
||||
"modify": " ".join(split_line[5:8]),
|
||||
"unix.owner": split_line[2],
|
||||
"unix.group": split_line[3],
|
||||
"unix.mode": split_line[0],
|
||||
"size": split_line[4],
|
||||
},
|
||||
)
|
||||
if this[1]["unix.mode"][0] == "d":
|
||||
this[1]["type"] = "dir"
|
||||
else:
|
||||
this[1]["type"] = "file"
|
||||
minfo.append(this)
|
||||
return minfo
|
||||
@@ -0,0 +1,241 @@
|
||||
import requests
|
||||
|
||||
from ..spec import AbstractFileSystem
|
||||
from ..utils import infer_storage_options
|
||||
from .memory import MemoryFile
|
||||
|
||||
|
||||
class GistFileSystem(AbstractFileSystem):
|
||||
"""
|
||||
Interface to files in a single GitHub Gist.
|
||||
|
||||
Provides read-only access to a gist's files. Gists do not contain
|
||||
subdirectories, so file listing is straightforward.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
gist_id: str
|
||||
The ID of the gist you want to access (the long hex value from the URL).
|
||||
filenames: list[str] (optional)
|
||||
If provided, only make a file system representing these files, and do not fetch
|
||||
the list of all files for this gist.
|
||||
sha: str (optional)
|
||||
If provided, fetch a particular revision of the gist. If omitted,
|
||||
the latest revision is used.
|
||||
username: str (optional)
|
||||
GitHub username for authentication.
|
||||
token: str (optional)
|
||||
GitHub personal access token (required if username is given); a token may also be supplied on its own.
|
||||
timeout: (float, float) or float, optional
|
||||
Connect and read timeouts for requests (default 60s each).
|
||||
kwargs: dict
|
||||
Stored on `self.request_kw` and passed to `requests.get` when fetching Gist
|
||||
metadata or reading ("opening") a file.
|
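
Examples
--------
A minimal sketch; the gist ID and filename below are placeholders:

>>> fs = GistFileSystem("0123456789abcdef0123456789abcdef")  # doctest: +SKIP
>>> fs.ls("")  # doctest: +SKIP
['notes.txt']
>>> fs.cat("notes.txt")  # doctest: +SKIP
b'...'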
||||
"""
|
||||
|
||||
protocol = "gist"
|
||||
gist_url = "https://api.github.com/gists/{gist_id}"
|
||||
gist_rev_url = "https://api.github.com/gists/{gist_id}/{sha}"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
gist_id,
|
||||
filenames=None,
|
||||
sha=None,
|
||||
username=None,
|
||||
token=None,
|
||||
timeout=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__()
|
||||
self.gist_id = gist_id
|
||||
self.filenames = filenames
|
||||
self.sha = sha # revision of the gist (optional)
|
||||
if username is not None and token is None:
|
||||
raise ValueError("User auth requires a token")
|
||||
self.username = username
|
||||
self.token = token
|
||||
self.request_kw = kwargs
|
||||
# Default timeouts to 60s connect/read if none provided
|
||||
self.timeout = timeout if timeout is not None else (60, 60)
|
||||
|
||||
# We use a single-level "directory" cache, because a gist is essentially flat
|
||||
self.dircache[""] = self._fetch_file_list()
|
||||
|
||||
@property
|
||||
def kw(self):
|
||||
"""Auth parameters passed to 'requests' if we have username/token."""
|
||||
kw = {
|
||||
"headers": {
|
||||
"Accept": "application/vnd.github+json",
|
||||
"X-GitHub-Api-Version": "2022-11-28",
|
||||
}
|
||||
}
|
||||
kw.update(self.request_kw)
|
||||
if self.username and self.token:
|
||||
kw["auth"] = (self.username, self.token)
|
||||
elif self.token:
|
||||
kw["headers"]["Authorization"] = f"Bearer {self.token}"
|
||||
return kw
|
||||
|
||||
def _fetch_gist_metadata(self):
|
||||
"""
|
||||
Fetch the JSON metadata for this gist (possibly for a specific revision).
|
||||
"""
|
||||
if self.sha:
|
||||
url = self.gist_rev_url.format(gist_id=self.gist_id, sha=self.sha)
|
||||
else:
|
||||
url = self.gist_url.format(gist_id=self.gist_id)
|
||||
|
||||
r = requests.get(url, timeout=self.timeout, **self.kw)
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(
|
||||
f"Gist not found: {self.gist_id}@{self.sha or 'latest'}"
|
||||
)
|
||||
r.raise_for_status()
|
||||
return r.json()
|
||||
|
||||
def _fetch_file_list(self):
|
||||
"""
|
||||
Returns a list of dicts describing each file in the gist. These get stored
|
||||
in self.dircache[""].
|
||||
"""
|
||||
meta = self._fetch_gist_metadata()
|
||||
if self.filenames:
|
||||
available_files = meta.get("files", {})
|
||||
files = {}
|
||||
for fn in self.filenames:
|
||||
if fn not in available_files:
|
||||
raise FileNotFoundError(fn)
|
||||
files[fn] = available_files[fn]
|
||||
else:
|
||||
files = meta.get("files", {})
|
||||
|
||||
out = []
|
||||
for fname, finfo in files.items():
|
||||
if finfo is None:
|
||||
# Occasionally GitHub returns a file entry with null if it was deleted
|
||||
continue
|
||||
# Build a directory entry
|
||||
out.append(
|
||||
{
|
||||
"name": fname, # file's name
|
||||
"type": "file", # gists have no subdirectories
|
||||
"size": finfo.get("size", 0), # file size in bytes
|
||||
"raw_url": finfo.get("raw_url"),
|
||||
}
|
||||
)
|
||||
return out
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
"""
|
||||
Remove 'gist://' from the path, if present.
|
||||
"""
|
||||
# The default infer_storage_options can handle gist://username:token@id/file
|
||||
# or gist://id/file, but let's ensure we handle a normal usage too.
|
||||
# We'll just strip the protocol prefix if it exists.
|
||||
path = infer_storage_options(path).get("path", path)
|
||||
return path.lstrip("/")
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
"""
|
||||
Parse 'gist://' style URLs into GistFileSystem constructor kwargs.
|
||||
For example:
|
||||
gist://:TOKEN@<gist_id>/file.txt
|
||||
gist://username:TOKEN@<gist_id>/file.txt
|
||||
"""
|
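# For illustration (hypothetical values): a URL such as
#   "gist://user:tok@abc123/deadbeef/notes.txt"
# is parsed into kwargs along the lines of
#   {"username": "user", "token": "tok", "gist_id": "abc123",
#    "sha": "deadbeef", "filenames": ["notes.txt"]}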
||||
so = infer_storage_options(path)
|
||||
out = {}
|
||||
if "username" in so and so["username"]:
|
||||
out["username"] = so["username"]
|
||||
if "password" in so and so["password"]:
|
||||
out["token"] = so["password"]
|
||||
if "host" in so and so["host"]:
|
||||
# We interpret 'host' as the gist ID
|
||||
out["gist_id"] = so["host"]
|
||||
|
||||
# Extract SHA and filename from path
|
||||
if "path" in so and so["path"]:
|
||||
path_parts = so["path"].rsplit("/", 2)[-2:]
|
||||
if len(path_parts) == 2:
|
||||
if path_parts[0]: # SHA present
|
||||
out["sha"] = path_parts[0]
|
||||
if path_parts[1]: # filename also present
|
||||
out["filenames"] = [path_parts[1]]
|
||||
|
||||
return out
|
||||
|
||||
def ls(self, path="", detail=False, **kwargs):
|
||||
"""
|
||||
List files in the gist. Gists are single-level, so any 'path' is basically
|
||||
the filename, or empty for all files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str, optional
|
||||
The filename to list. If empty, returns all files in the gist.
|
||||
detail : bool, default False
|
||||
If True, return a list of dicts; if False, return a list of filenames.
|
||||
"""
|
||||
path = self._strip_protocol(path or "")
|
||||
# If path is empty, return all
|
||||
if path == "":
|
||||
results = self.dircache[""]
|
||||
else:
|
||||
# We want just the single file with this name
|
||||
all_files = self.dircache[""]
|
||||
results = [f for f in all_files if f["name"] == path]
|
||||
if not results:
|
||||
raise FileNotFoundError(path)
|
||||
if detail:
|
||||
return results
|
||||
else:
|
||||
return sorted(f["name"] for f in results)
|
||||
|
||||
def _open(self, path, mode="rb", block_size=None, **kwargs):
|
||||
"""
|
||||
Read a single file from the gist.
|
||||
"""
|
||||
if mode != "rb":
|
||||
raise NotImplementedError("GitHub Gist FS is read-only (no write).")
|
||||
|
||||
path = self._strip_protocol(path)
|
||||
# Find the file entry in our dircache
|
||||
matches = [f for f in self.dircache[""] if f["name"] == path]
|
||||
if not matches:
|
||||
raise FileNotFoundError(path)
|
||||
finfo = matches[0]
|
||||
|
||||
raw_url = finfo.get("raw_url")
|
||||
if not raw_url:
|
||||
raise FileNotFoundError(f"No raw_url for file: {path}")
|
||||
|
||||
r = requests.get(raw_url, timeout=self.timeout, **self.kw)
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
return MemoryFile(path, None, r.content)
|
||||
|
||||
def cat(self, path, recursive=False, on_error="raise", **kwargs):
|
||||
"""
|
||||
Return {path: contents} for the given file or files. If 'recursive' is True,
|
||||
and path is empty, returns all files in the gist.
|
||||
"""
|
||||
paths = self.expand_path(path, recursive=recursive)
|
||||
out = {}
|
||||
for p in paths:
|
||||
try:
|
||||
with self.open(p, "rb") as f:
|
||||
out[p] = f.read()
|
||||
except FileNotFoundError as e:
|
||||
if on_error == "raise":
|
||||
raise e
|
||||
elif on_error == "omit":
|
||||
pass # skip
|
||||
else:
|
||||
out[p] = e
|
||||
if len(paths) == 1 and paths[0] == path:
|
||||
return out[path]
|
||||
return out
|
||||
@@ -0,0 +1,114 @@
|
||||
import os
|
||||
|
||||
import pygit2
|
||||
|
||||
from fsspec.spec import AbstractFileSystem
|
||||
|
||||
from .memory import MemoryFile
|
||||
|
||||
|
||||
class GitFileSystem(AbstractFileSystem):
|
||||
"""Browse the files of a local git repo at any hash/tag/branch
|
||||
|
||||
(experimental backend)
|
||||
"""
|
||||
|
||||
root_marker = ""
|
||||
cachable = True
|
||||
|
||||
def __init__(self, path=None, fo=None, ref=None, **kwargs):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str (optional)
|
||||
Local location of the repo (uses current directory if not given).
|
||||
May be deprecated in favour of ``fo``. When used with a higher
|
||||
level function such as fsspec.open(), may be of the form
|
||||
"git://[path-to-repo[:]][ref@]path/to/file" (but the actual
|
||||
file path should not contain "@" or ":").
|
||||
fo: str (optional)
|
||||
Same as ``path``, but passed as part of a chained URL. This one
|
||||
takes precedence if both are given.
|
||||
ref: str (optional)
|
||||
Reference to work with, could be a hash, tag or branch name. Defaults
|
||||
to current working tree. Note that ``ls`` and ``open`` also take hash,
|
||||
so this becomes the default for those operations.
|
||||
kwargs
|
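
Examples
--------
A minimal sketch; the repository location and refs below are placeholders:

>>> fs = GitFileSystem("/tmp/myrepo", ref="main")  # doctest: +SKIP
>>> fs.ls("", detail=False)  # doctest: +SKIP
>>> with fs.open("README.md", ref="v1.0") as f:  # doctest: +SKIP
...     text = f.read()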
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.repo = pygit2.Repository(fo or path or os.getcwd())
|
||||
self.ref = ref or "master"
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
path = super()._strip_protocol(path).lstrip("/")
|
||||
if ":" in path:
|
||||
path = path.split(":", 1)[1]
|
||||
if "@" in path:
|
||||
path = path.split("@", 1)[1]
|
||||
return path.lstrip("/")
|
||||
|
||||
def _path_to_object(self, path, ref):
|
||||
comm, ref = self.repo.resolve_refish(ref or self.ref)
|
||||
parts = path.split("/")
|
||||
tree = comm.tree
|
||||
for part in parts:
|
||||
if part and isinstance(tree, pygit2.Tree):
|
||||
if part not in tree:
|
||||
raise FileNotFoundError(path)
|
||||
tree = tree[part]
|
||||
return tree
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
path = path.removeprefix("git://")
|
||||
out = {}
|
||||
if ":" in path:
|
||||
out["path"], path = path.split(":", 1)
|
||||
if "@" in path:
|
||||
out["ref"], path = path.split("@", 1)
|
||||
return out
|
||||
|
||||
@staticmethod
|
||||
def _object_to_info(obj, path=None):
|
||||
# obj.name and obj.filemode are None for the root tree!
|
||||
is_dir = isinstance(obj, pygit2.Tree)
|
||||
return {
|
||||
"type": "directory" if is_dir else "file",
|
||||
"name": (
|
||||
"/".join([path, obj.name or ""]).lstrip("/") if path else obj.name
|
||||
),
|
||||
"hex": str(obj.id),
|
||||
"mode": "100644" if obj.filemode is None else f"{obj.filemode:o}",
|
||||
"size": 0 if is_dir else obj.size,
|
||||
}
|
||||
|
||||
def ls(self, path, detail=True, ref=None, **kwargs):
|
||||
tree = self._path_to_object(self._strip_protocol(path), ref)
|
||||
return [
|
||||
GitFileSystem._object_to_info(obj, path)
|
||||
if detail
|
||||
else GitFileSystem._object_to_info(obj, path)["name"]
|
||||
for obj in (tree if isinstance(tree, pygit2.Tree) else [tree])
|
||||
]
|
||||
|
||||
def info(self, path, ref=None, **kwargs):
|
||||
tree = self._path_to_object(self._strip_protocol(path), ref)
|
||||
return GitFileSystem._object_to_info(tree, path)
|
||||
|
||||
def ukey(self, path, ref=None):
|
||||
return self.info(path, ref=ref)["hex"]
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
ref=None,
|
||||
**kwargs,
|
||||
):
|
||||
obj = self._path_to_object(path, ref or self.ref)
|
||||
return MemoryFile(data=obj.data)
|
||||
@@ -0,0 +1,333 @@
|
||||
import base64
|
||||
import re
|
||||
|
||||
import requests
|
||||
|
||||
from ..spec import AbstractFileSystem
|
||||
from ..utils import infer_storage_options
|
||||
from .memory import MemoryFile
|
||||
|
||||
|
||||
class GithubFileSystem(AbstractFileSystem):
|
||||
"""Interface to files in github
|
||||
|
||||
An instance of this class provides the files residing within a remote github
|
||||
repository. You may specify a point in the repo's history, by SHA, branch
|
||||
or tag (default is current master).
|
||||
|
||||
For files less than 1 MB in size, file content is returned directly in a
|
||||
MemoryFile. For larger files, or for files tracked by git-lfs, file content
|
||||
is returned as an HTTPFile wrapping the ``download_url`` provided by the
|
||||
GitHub API.
|
||||
|
||||
When using fsspec.open, allows URIs of the form:
|
||||
|
||||
- "github://path/file", in which case you must specify org, repo and
|
||||
may specify sha in the extra args
|
||||
- 'github://org:repo@/precip/catalog.yml', where the org and repo are
|
||||
part of the URI
|
||||
- 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
|
||||
|
||||
``sha`` can be the full or abbreviated hex of the commit you want to fetch
|
||||
from, or a branch or tag name (so long as it doesn't contain special characters
|
||||
like "/", "?", which would have to be HTTP-encoded).
|
||||
|
||||
For authorised access, you must provide username and token, which can be made
|
||||
at https://github.com/settings/tokens
|
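
Examples
--------
A minimal sketch; "myorg" and "myrepo" are placeholders:

>>> import fsspec
>>> fs = GithubFileSystem("myorg", "myrepo")  # doctest: +SKIP
>>> fs.ls("")  # doctest: +SKIP
>>> with fsspec.open("github://myorg:myrepo@main/README.md") as f:  # doctest: +SKIP
...     text = f.read()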
||||
"""
|
||||
|
||||
url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
|
||||
content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
|
||||
protocol = "github"
|
||||
timeout = (60, 60) # connect, read timeouts
|
||||
|
||||
def __init__(
|
||||
self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.org = org
|
||||
self.repo = repo
|
||||
if (username is None) ^ (token is None):
|
||||
raise ValueError("Auth required both username and token")
|
||||
self.username = username
|
||||
self.token = token
|
||||
if timeout is not None:
|
||||
self.timeout = timeout
|
||||
if sha is None:
|
||||
# look up default branch (not necessarily "master")
|
||||
u = "https://api.github.com/repos/{org}/{repo}"
|
||||
r = requests.get(
|
||||
u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
|
||||
)
|
||||
r.raise_for_status()
|
||||
sha = r.json()["default_branch"]
|
||||
|
||||
self.root = sha
|
||||
self.ls("")
|
||||
try:
|
||||
from .http import HTTPFileSystem
|
||||
|
||||
self.http_fs = HTTPFileSystem(**kwargs)
|
||||
except ImportError:
|
||||
self.http_fs = None
|
||||
|
||||
@property
|
||||
def kw(self):
|
||||
if self.username:
|
||||
return {"auth": (self.username, self.token)}
|
||||
return {}
|
||||
|
||||
@classmethod
|
||||
def repos(cls, org_or_user, is_org=True):
|
||||
"""List repo names for given org or user
|
||||
|
||||
This may become the top level of the FS
|
||||
|
||||
Parameters
|
||||
----------
|
||||
org_or_user: str
|
||||
Name of the github org or user to query
|
||||
is_org: bool (default True)
|
||||
Whether the name is an organisation (True) or user (False)
|
||||
|
||||
Returns
|
||||
-------
|
||||
List of string
|
||||
"""
|
||||
r = requests.get(
|
||||
f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
|
||||
timeout=cls.timeout,
|
||||
)
|
||||
r.raise_for_status()
|
||||
return [repo["name"] for repo in r.json()]
|
||||
|
||||
@property
|
||||
def tags(self):
|
||||
"""Names of tags in the repo"""
|
||||
r = requests.get(
|
||||
f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
|
||||
timeout=self.timeout,
|
||||
**self.kw,
|
||||
)
|
||||
r.raise_for_status()
|
||||
return [t["name"] for t in r.json()]
|
||||
|
||||
@property
|
||||
def branches(self):
|
||||
"""Names of branches in the repo"""
|
||||
r = requests.get(
|
||||
f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
|
||||
timeout=self.timeout,
|
||||
**self.kw,
|
||||
)
|
||||
r.raise_for_status()
|
||||
return [t["name"] for t in r.json()]
|
||||
|
||||
@property
|
||||
def refs(self):
|
||||
"""Named references, tags and branches"""
|
||||
return {"tags": self.tags, "branches": self.branches}
|
||||
|
||||
def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
|
||||
"""List files at given path
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Location to list, relative to repo root
|
||||
detail: bool
|
||||
If True, returns list of dicts, one per file; if False, returns
|
||||
list of full filenames only
|
||||
sha: str (optional)
|
||||
List at the given point in the repo history, branch or tag name or commit
|
||||
SHA
|
||||
_sha: str (optional)
|
||||
List this specific tree object (used internally to descend into trees)
|
||||
"""
|
||||
path = self._strip_protocol(path)
|
||||
if path == "":
|
||||
_sha = sha or self.root
|
||||
if _sha is None:
|
||||
parts = path.rstrip("/").split("/")
|
||||
so_far = ""
|
||||
_sha = sha or self.root
|
||||
for part in parts:
|
||||
out = self.ls(so_far, True, sha=sha, _sha=_sha)
|
||||
so_far += "/" + part if so_far else part
|
||||
out = [o for o in out if o["name"] == so_far]
|
||||
if not out:
|
||||
raise FileNotFoundError(path)
|
||||
out = out[0]
|
||||
if out["type"] == "file":
|
||||
if detail:
|
||||
return [out]
|
||||
else:
|
||||
return path
|
||||
_sha = out["sha"]
|
||||
if path not in self.dircache or sha not in [self.root, None]:
|
||||
r = requests.get(
|
||||
self.url.format(org=self.org, repo=self.repo, sha=_sha),
|
||||
timeout=self.timeout,
|
||||
**self.kw,
|
||||
)
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
types = {"blob": "file", "tree": "directory"}
|
||||
out = [
|
||||
{
|
||||
"name": path + "/" + f["path"] if path else f["path"],
|
||||
"mode": f["mode"],
|
||||
"type": types[f["type"]],
|
||||
"size": f.get("size", 0),
|
||||
"sha": f["sha"],
|
||||
}
|
||||
for f in r.json()["tree"]
|
||||
if f["type"] in types
|
||||
]
|
||||
if sha in [self.root, None]:
|
||||
self.dircache[path] = out
|
||||
else:
|
||||
out = self.dircache[path]
|
||||
if detail:
|
||||
return out
|
||||
else:
|
||||
return sorted([f["name"] for f in out])
|
||||
|
||||
def invalidate_cache(self, path=None):
|
||||
self.dircache.clear()
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
opts = infer_storage_options(path)
|
||||
if "username" not in opts:
|
||||
return super()._strip_protocol(path)
|
||||
return opts["path"].lstrip("/")
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
opts = infer_storage_options(path)
|
||||
if "username" not in opts:
|
||||
return {}
|
||||
out = {"org": opts["username"], "repo": opts["password"]}
|
||||
if opts["host"]:
|
||||
out["sha"] = opts["host"]
|
||||
return out
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
cache_options=None,
|
||||
sha=None,
|
||||
**kwargs,
|
||||
):
|
||||
if mode != "rb":
|
||||
raise NotImplementedError
|
||||
|
||||
# construct a url to hit the GitHub API's repo contents API
|
||||
url = self.content_url.format(
|
||||
org=self.org, repo=self.repo, path=path, sha=sha or self.root
|
||||
)
|
||||
|
||||
# make a request to this API, and parse the response as JSON
|
||||
r = requests.get(url, timeout=self.timeout, **self.kw)
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
content_json = r.json()
|
||||
|
||||
# if the response's content key is not empty, try to parse it as base64
|
||||
if content_json["content"]:
|
||||
content = base64.b64decode(content_json["content"])
|
||||
|
||||
# as long as the content does not start with the string
|
||||
# "version https://git-lfs.github.com/"
|
||||
# then it is probably not a git-lfs pointer and we can just return
|
||||
# the content directly
|
||||
if not content.startswith(b"version https://git-lfs.github.com/"):
|
||||
return MemoryFile(None, None, content)
|
||||
|
||||
# we land here if the content was not present in the first response
|
||||
# (regular file over 1MB or git-lfs tracked file)
|
||||
# in this case, we let the HTTPFileSystem handle the download
|
||||
if self.http_fs is None:
|
||||
raise ImportError(
|
||||
"Please install fsspec[http] to access github files >1 MB "
|
||||
"or git-lfs tracked files."
|
||||
)
|
||||
return self.http_fs.open(
|
||||
content_json["download_url"],
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def rm(self, path, recursive=False, maxdepth=None, message=None):
|
||||
path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
|
||||
for p in reversed(path):
|
||||
self.rm_file(p, message=message)
|
||||
|
||||
def rm_file(self, path, message=None, **kwargs):
|
||||
"""
|
||||
Remove a file from a specified branch using a given commit message.
|
||||
|
||||
Since the GitHub DELETE operation requires a branch name, and we can't reliably
|
||||
determine whether the provided SHA refers to a branch, tag, or commit, we
|
||||
assume it's a branch. If it's not, the user will encounter an error when
|
||||
attempting to retrieve the file SHA or delete the file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
The file's location relative to the repository root.
|
||||
message: str, optional
|
||||
The commit message for the deletion.
|
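
Examples
--------
A hedged sketch; requires username/token auth and an instance rooted at a
branch (the org, repo, token and path are placeholders):

>>> fs = GithubFileSystem("myorg", "myrepo", username="me", token="<token>")  # doctest: +SKIP
>>> fs.rm_file("docs/old.md", message="Remove outdated doc")  # doctest: +SKIP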
||||
"""
|
||||
|
||||
if not self.username:
|
||||
raise ValueError("Authentication required")
|
||||
|
||||
path = self._strip_protocol(path)
|
||||
|
||||
# Attempt to get SHA from cache or Github API
|
||||
sha = self._get_sha_from_cache(path)
|
||||
if not sha:
|
||||
url = self.content_url.format(
|
||||
org=self.org, repo=self.repo, path=path.lstrip("/"), sha=self.root
|
||||
)
|
||||
r = requests.get(url, timeout=self.timeout, **self.kw)
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
sha = r.json()["sha"]
|
||||
|
||||
# Delete the file
|
||||
delete_url = self.content_url.format(
|
||||
org=self.org, repo=self.repo, path=path, sha=self.root
|
||||
)
|
||||
branch = self.root
|
||||
data = {
|
||||
"message": message or f"Delete {path}",
|
||||
"sha": sha,
|
||||
**({"branch": branch} if branch else {}),
|
||||
}
|
||||
|
||||
r = requests.delete(delete_url, json=data, timeout=self.timeout, **self.kw)
|
||||
error_message = r.json().get("message", "")
|
||||
if re.search(r"Branch .+ not found", error_message):
|
||||
error = "Remove only works when the filesystem is initialised from a branch or default (None)"
|
||||
raise ValueError(error)
|
||||
r.raise_for_status()
|
||||
|
||||
self.invalidate_cache(path)
|
||||
|
||||
def _get_sha_from_cache(self, path):
|
||||
for entries in self.dircache.values():
|
||||
for entry in entries:
|
||||
entry_path = entry.get("name")
|
||||
if entry_path and entry_path == path and "sha" in entry:
|
||||
return entry["sha"]
|
||||
return None
|
||||
@@ -0,0 +1,891 @@
|
||||
import asyncio
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import weakref
|
||||
from copy import copy
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
import yarl
|
||||
|
||||
from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
|
||||
from fsspec.callbacks import DEFAULT_CALLBACK
|
||||
from fsspec.exceptions import FSTimeoutError
|
||||
from fsspec.spec import AbstractBufferedFile
|
||||
from fsspec.utils import (
|
||||
DEFAULT_BLOCK_SIZE,
|
||||
glob_translate,
|
||||
isfilelike,
|
||||
nullcontext,
|
||||
tokenize,
|
||||
)
|
||||
|
||||
from ..caching import AllBytes
|
||||
|
||||
# https://stackoverflow.com/a/15926317/3821154
|
||||
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
|
||||
ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
|
||||
logger = logging.getLogger("fsspec.http")
|
||||
|
||||
|
||||
async def get_client(**kwargs):
|
||||
return aiohttp.ClientSession(**kwargs)
|
||||
|
||||
|
||||
class HTTPFileSystem(AsyncFileSystem):
|
||||
"""
|
||||
Simple File-System for fetching data via HTTP(S)
|
||||
|
||||
``ls()`` is implemented by loading the parent page and doing a regex
|
||||
match on the result. If simple_links=True, anything of the form
|
||||
"http(s)://server.com/stuff?thing=other"; otherwise only links within
|
||||
HTML href tags will be used.
|
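
Examples
--------
A minimal sketch; the URL is a placeholder:

>>> fs = HTTPFileSystem()  # doctest: +SKIP
>>> fs.cat("https://example.com/data.csv")  # doctest: +SKIP
b'...'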
||||
"""
|
||||
|
||||
protocol = ("http", "https")
|
||||
sep = "/"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
simple_links=True,
|
||||
block_size=None,
|
||||
same_scheme=True,
|
||||
size_policy=None,
|
||||
cache_type="bytes",
|
||||
cache_options=None,
|
||||
asynchronous=False,
|
||||
loop=None,
|
||||
client_kwargs=None,
|
||||
get_client=get_client,
|
||||
encoded=False,
|
||||
**storage_options,
|
||||
):
|
||||
"""
|
||||
NB: if this is called async, you must await set_session
|
||||
|
||||
Parameters
|
||||
----------
|
||||
block_size: int
|
||||
Blocks to read bytes; if 0, will default to raw requests file-like
|
||||
objects instead of HTTPFile instances
|
||||
simple_links: bool
|
||||
If True, will consider both HTML <a> tags and anything that looks
|
||||
like a URL; if False, will consider only the former.
|
||||
same_scheme: bool (default True)
|
||||
When doing ls/glob, if this is True, only consider paths that have
|
||||
http/https matching the input URLs.
|
||||
size_policy: this argument is deprecated
|
||||
client_kwargs: dict
|
||||
Passed to aiohttp.ClientSession, see
|
||||
https://docs.aiohttp.org/en/stable/client_reference.html
|
||||
For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
|
||||
get_client: Callable[..., aiohttp.ClientSession]
|
||||
A callable, which takes keyword arguments and constructs
|
||||
an aiohttp.ClientSession. Its state will be managed by
|
||||
the HTTPFileSystem class.
|
||||
storage_options: key-value
|
||||
Any other parameters passed on to requests
|
||||
cache_type, cache_options: defaults used in open()
|
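
Examples
--------
A sketch of passing authentication through to aiohttp; the credentials
are placeholders:

>>> import aiohttp
>>> fs = HTTPFileSystem(
...     client_kwargs={"auth": aiohttp.BasicAuth("user", "pass")}
... )  # doctest: +SKIP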
||||
"""
|
||||
super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
|
||||
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
|
||||
self.simple_links = simple_links
|
||||
self.same_schema = same_scheme
|
||||
self.cache_type = cache_type
|
||||
self.cache_options = cache_options
|
||||
self.client_kwargs = client_kwargs or {}
|
||||
self.get_client = get_client
|
||||
self.encoded = encoded
|
||||
self.kwargs = storage_options
|
||||
self._session = None
|
||||
|
||||
# Clean caching-related parameters from `storage_options`
|
||||
# before propagating them as `request_options` through `self.kwargs`.
|
||||
# TODO: Maybe rename `self.kwargs` to `self.request_options` to make
|
||||
# it clearer.
|
||||
request_options = copy(storage_options)
|
||||
self.use_listings_cache = request_options.pop("use_listings_cache", False)
|
||||
request_options.pop("listings_expiry_time", None)
|
||||
request_options.pop("max_paths", None)
|
||||
request_options.pop("skip_instance_cache", None)
|
||||
self.kwargs = request_options
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return "http"
|
||||
|
||||
def encode_url(self, url):
|
||||
return yarl.URL(url, encoded=self.encoded)
|
||||
|
||||
@staticmethod
|
||||
def close_session(loop, session):
|
||||
if loop is not None and loop.is_running():
|
||||
try:
|
||||
sync(loop, session.close, timeout=0.1)
|
||||
return
|
||||
except (TimeoutError, FSTimeoutError, NotImplementedError):
|
||||
pass
|
||||
connector = getattr(session, "_connector", None)
|
||||
if connector is not None:
|
||||
# close after loop is dead
|
||||
connector._close()
|
||||
|
||||
async def set_session(self):
|
||||
if self._session is None:
|
||||
self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
|
||||
if not self.asynchronous:
|
||||
weakref.finalize(self, self.close_session, self.loop, self._session)
|
||||
return self._session
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
"""For HTTP, we always want to keep the full URL"""
|
||||
return path
|
||||
|
||||
@classmethod
|
||||
def _parent(cls, path):
|
||||
# override, since _strip_protocol is different for URLs
|
||||
par = super()._parent(path)
|
||||
if len(par) > 7: # "http://..."
|
||||
return par
|
||||
return ""
|
||||
|
||||
async def _ls_real(self, url, detail=True, **kwargs):
|
||||
# ignoring URL-encoded arguments
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(url)
|
||||
session = await self.set_session()
|
||||
async with session.get(self.encode_url(url), **self.kwargs) as r:
|
||||
self._raise_not_found_for_status(r, url)
|
||||
|
||||
if "Content-Type" in r.headers:
|
||||
mimetype = r.headers["Content-Type"].partition(";")[0]
|
||||
else:
|
||||
mimetype = None
|
||||
|
||||
if mimetype in ("text/html", None):
|
||||
try:
|
||||
text = await r.text(errors="ignore")
|
||||
if self.simple_links:
|
||||
links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
|
||||
else:
|
||||
links = [u[2] for u in ex.findall(text)]
|
||||
except UnicodeDecodeError:
|
||||
links = [] # binary, not HTML
|
||||
else:
|
||||
links = []
|
||||
|
||||
out = set()
|
||||
parts = urlparse(url)
|
||||
for l in links:
|
||||
if isinstance(l, tuple):
|
||||
l = l[1]
|
||||
if l.startswith("/") and len(l) > 1:
|
||||
# absolute URL on this server
|
||||
l = f"{parts.scheme}://{parts.netloc}{l}"
|
||||
if l.startswith("http"):
|
||||
if self.same_schema and l.startswith(url.rstrip("/") + "/"):
|
||||
out.add(l)
|
||||
elif l.replace("https", "http").startswith(
|
||||
url.replace("https", "http").rstrip("/") + "/"
|
||||
):
|
||||
# allowed to cross http <-> https
|
||||
out.add(l)
|
||||
else:
|
||||
if l not in ["..", "../"]:
|
||||
# Ignore FTP-like "parent"
|
||||
out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
|
||||
if not out and url.endswith("/"):
|
||||
out = await self._ls_real(url.rstrip("/"), detail=False)
|
||||
if detail:
|
||||
return [
|
||||
{
|
||||
"name": u,
|
||||
"size": None,
|
||||
"type": "directory" if u.endswith("/") else "file",
|
||||
}
|
||||
for u in out
|
||||
]
|
||||
else:
|
||||
return sorted(out)
|
||||
|
||||
async def _ls(self, url, detail=True, **kwargs):
|
||||
if self.use_listings_cache and url in self.dircache:
|
||||
out = self.dircache[url]
|
||||
else:
|
||||
out = await self._ls_real(url, detail=detail, **kwargs)
|
||||
self.dircache[url] = out
|
||||
return out
|
||||
|
||||
ls = sync_wrapper(_ls)
|
||||
|
||||
def _raise_not_found_for_status(self, response, url):
|
||||
"""
|
||||
Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
|
||||
"""
|
||||
if response.status == 404:
|
||||
raise FileNotFoundError(url)
|
||||
response.raise_for_status()
|
||||
|
||||
async def _cat_file(self, url, start=None, end=None, **kwargs):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(url)
|
||||
|
||||
if start is not None or end is not None:
|
||||
if start == end:
|
||||
return b""
|
||||
headers = kw.pop("headers", {}).copy()
|
||||
|
||||
headers["Range"] = await self._process_limits(url, start, end)
|
||||
kw["headers"] = headers
|
||||
session = await self.set_session()
|
||||
async with session.get(self.encode_url(url), **kw) as r:
|
||||
out = await r.read()
|
||||
self._raise_not_found_for_status(r, url)
|
||||
return out
|
||||
|
||||
async def _get_file(
|
||||
self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs
|
||||
):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(rpath)
|
||||
session = await self.set_session()
|
||||
async with session.get(self.encode_url(rpath), **kw) as r:
|
||||
try:
|
||||
size = int(r.headers["content-length"])
|
||||
except (ValueError, KeyError):
|
||||
size = None
|
||||
|
||||
callback.set_size(size)
|
||||
self._raise_not_found_for_status(r, rpath)
|
||||
if isfilelike(lpath):
|
||||
outfile = lpath
|
||||
else:
|
||||
outfile = open(lpath, "wb") # noqa: ASYNC230
|
||||
|
||||
try:
|
||||
chunk = True
|
||||
while chunk:
|
||||
chunk = await r.content.read(chunk_size)
|
||||
outfile.write(chunk)
|
||||
callback.relative_update(len(chunk))
|
||||
finally:
|
||||
if not isfilelike(lpath):
|
||||
outfile.close()
|
||||
|
||||
async def _put_file(
|
||||
self,
|
||||
lpath,
|
||||
rpath,
|
||||
chunk_size=5 * 2**20,
|
||||
callback=DEFAULT_CALLBACK,
|
||||
method="post",
|
||||
mode="overwrite",
|
||||
**kwargs,
|
||||
):
|
||||
if mode != "overwrite":
|
||||
raise NotImplementedError("Exclusive write")
|
||||
|
||||
async def gen_chunks():
|
||||
# Support passing arbitrary file-like objects
|
||||
# and use them instead of streams.
|
||||
if isinstance(lpath, io.IOBase):
|
||||
context = nullcontext(lpath)
|
||||
use_seek = False # might not support seeking
|
||||
else:
|
||||
context = open(lpath, "rb") # noqa: ASYNC230
|
||||
use_seek = True
|
||||
|
||||
with context as f:
|
||||
if use_seek:
|
||||
callback.set_size(f.seek(0, 2))
|
||||
f.seek(0)
|
||||
else:
|
||||
callback.set_size(getattr(f, "size", None))
|
||||
|
||||
chunk = f.read(chunk_size)
|
||||
while chunk:
|
||||
yield chunk
|
||||
callback.relative_update(len(chunk))
|
||||
chunk = f.read(chunk_size)
|
||||
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
session = await self.set_session()
|
||||
|
||||
method = method.lower()
|
||||
if method not in ("post", "put"):
|
||||
raise ValueError(
|
||||
f"method has to be either 'post' or 'put', not: {method!r}"
|
||||
)
|
||||
|
||||
meth = getattr(session, method)
|
||||
async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
|
||||
self._raise_not_found_for_status(resp, rpath)
|
||||
|
||||
async def _exists(self, path, **kwargs):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
try:
|
||||
logger.debug(path)
|
||||
session = await self.set_session()
|
||||
r = await session.get(self.encode_url(path), **kw)
|
||||
async with r:
|
||||
return r.status < 400
|
||||
except aiohttp.ClientError:
|
||||
return False
|
||||
|
||||
async def _isfile(self, path, **kwargs):
|
||||
return await self._exists(path, **kwargs)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=None, # XXX: This differs from the base class.
|
||||
cache_type=None,
|
||||
cache_options=None,
|
||||
size=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Make a file-like object
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Full URL with protocol
|
||||
mode: string
|
||||
must be "rb"
|
||||
block_size: int or None
|
||||
Bytes to download in one request; use instance value if None. If
|
||||
zero, will return a streaming Requests file-like instance.
|
||||
kwargs: key-value
|
||||
Any other parameters, passed to requests calls
|
||||
"""
|
||||
if mode != "rb":
|
||||
raise NotImplementedError
|
||||
block_size = block_size if block_size is not None else self.block_size
|
||||
kw = self.kwargs.copy()
|
||||
kw["asynchronous"] = self.asynchronous
|
||||
kw.update(kwargs)
|
||||
info = {}
|
||||
size = size or info.update(self.info(path, **kwargs)) or info["size"]
|
||||
session = sync(self.loop, self.set_session)
|
||||
if block_size and size and info.get("partial", True):
|
||||
return HTTPFile(
|
||||
self,
|
||||
path,
|
||||
session=session,
|
||||
block_size=block_size,
|
||||
mode=mode,
|
||||
size=size,
|
||||
cache_type=cache_type or self.cache_type,
|
||||
cache_options=cache_options or self.cache_options,
|
||||
loop=self.loop,
|
||||
**kw,
|
||||
)
|
||||
else:
|
||||
return HTTPStreamFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
loop=self.loop,
|
||||
session=session,
|
||||
**kw,
|
||||
)
|
||||
|
||||
async def open_async(self, path, mode="rb", size=None, **kwargs):
|
||||
session = await self.set_session()
|
||||
if size is None:
|
||||
try:
|
||||
size = (await self._info(path, **kwargs))["size"]
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
return AsyncStreamFile(
|
||||
self,
|
||||
path,
|
||||
loop=self.loop,
|
||||
session=session,
|
||||
size=size,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def ukey(self, url):
|
||||
"""Unique identifier; assume HTTP files are static, unchanging"""
|
||||
return tokenize(url, self.kwargs, self.protocol)
|
||||
|
||||
async def _info(self, url, **kwargs):
|
||||
"""Get info of URL
|
||||
|
||||
Tries to access location via HEAD, and then GET methods, but does
|
||||
not fetch the data.
|
||||
|
||||
It is possible that the server does not supply any size information, in
|
||||
which case size will be given as None (and certain operations on the
|
||||
corresponding file will not work).
|
||||
"""
|
||||
info = {}
|
||||
session = await self.set_session()
|
||||
|
||||
for policy in ["head", "get"]:
|
||||
try:
|
||||
info.update(
|
||||
await _file_info(
|
||||
self.encode_url(url),
|
||||
size_policy=policy,
|
||||
session=session,
|
||||
**self.kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
)
|
||||
if info.get("size") is not None:
|
||||
break
|
||||
except Exception as exc:
|
||||
if policy == "get":
|
||||
# If get failed, then raise a FileNotFoundError
|
||||
raise FileNotFoundError(url) from exc
|
||||
logger.debug("", exc_info=exc)
|
||||
|
||||
return {"name": url, "size": None, **info, "type": "file"}
|
||||
|
||||
async def _glob(self, path, maxdepth=None, **kwargs):
|
||||
"""
|
||||
Find files by glob-matching.
|
||||
|
||||
This implementation is identical to the one in AbstractFileSystem,
|
||||
but "?" is not considered as a character for globbing, because it is
|
||||
so common in URLs, often identifying the "query" part.
|
||||
"""
|
||||
if maxdepth is not None and maxdepth < 1:
|
||||
raise ValueError("maxdepth must be at least 1")
|
||||
import re
|
||||
|
||||
ends_with_slash = path.endswith("/") # _strip_protocol strips trailing slash
|
||||
path = self._strip_protocol(path)
|
||||
append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*"))
|
||||
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
|
||||
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
|
||||
|
||||
min_idx = min(idx_star, idx_brace)
|
||||
|
||||
detail = kwargs.pop("detail", False)
|
||||
|
||||
if not has_magic(path):
|
||||
if await self._exists(path, **kwargs):
|
||||
if not detail:
|
||||
return [path]
|
||||
else:
|
||||
return {path: await self._info(path, **kwargs)}
|
||||
else:
|
||||
if not detail:
|
||||
return [] # glob of non-existent returns empty
|
||||
else:
|
||||
return {}
|
||||
elif "/" in path[:min_idx]:
|
||||
min_idx = path[:min_idx].rindex("/")
|
||||
root = path[: min_idx + 1]
|
||||
depth = path[min_idx + 1 :].count("/") + 1
|
||||
else:
|
||||
root = ""
|
||||
depth = path[min_idx + 1 :].count("/") + 1
|
||||
|
||||
if "**" in path:
|
||||
if maxdepth is not None:
|
||||
idx_double_stars = path.find("**")
|
||||
depth_double_stars = path[idx_double_stars:].count("/") + 1
|
||||
depth = depth - depth_double_stars + maxdepth
|
||||
else:
|
||||
depth = None
|
||||
|
||||
allpaths = await self._find(
|
||||
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
|
||||
)
|
||||
|
||||
pattern = glob_translate(path + ("/" if ends_with_slash else ""))
|
||||
pattern = re.compile(pattern)
|
||||
|
||||
out = {
|
||||
(
|
||||
p.rstrip("/")
|
||||
if not append_slash_to_dirname
|
||||
and info["type"] == "directory"
|
||||
and p.endswith("/")
|
||||
else p
|
||||
): info
|
||||
for p, info in sorted(allpaths.items())
|
||||
if pattern.match(p.rstrip("/"))
|
||||
}
|
||||
|
||||
if detail:
|
||||
return out
|
||||
else:
|
||||
return list(out)
|
||||
|
||||
async def _isdir(self, path):
|
||||
# override, since all URLs are (also) files
|
||||
try:
|
||||
return bool(await self._ls(path))
|
||||
except (FileNotFoundError, ValueError):
|
||||
return False
|
||||
|
||||
async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
|
||||
"""
|
||||
Write bytes to a remote file over HTTP.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Target URL where the data should be written
|
||||
value : bytes
|
||||
Data to be written
|
||||
mode : str
|
||||
How to write to the file - 'overwrite' or 'append'
|
||||
**kwargs : dict
|
||||
Additional parameters to pass to the HTTP request
|
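
Examples
--------
A minimal sketch against a hypothetical endpoint that accepts PUT:

>>> fs = HTTPFileSystem()  # doctest: +SKIP
>>> fs.pipe_file("https://example.com/upload/data.bin", b"hello")  # doctest: +SKIP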
||||
"""
|
||||
url = self._strip_protocol(path)
|
||||
headers = kwargs.pop("headers", {})
|
||||
headers["Content-Length"] = str(len(value))
|
||||
|
||||
session = await self.set_session()
|
||||
|
||||
async with session.put(url, data=value, headers=headers, **kwargs) as r:
|
||||
r.raise_for_status()
|
||||
|
||||
|
||||
class HTTPFile(AbstractBufferedFile):
|
||||
"""
|
||||
A file-like object pointing to a remote HTTP(S) resource
|
||||
|
||||
Supports only reading, with read-ahead of a predetermined block-size.
|
||||
|
||||
In the case that the server does not supply the filesize, only reading of
|
||||
the complete file in one go is supported.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
Full URL of the remote resource, including the protocol
|
||||
session: aiohttp.ClientSession or None
|
||||
All calls will be made within this session, to avoid restarting
|
||||
connections where the server allows this
|
||||
block_size: int or None
|
||||
The amount of read-ahead to do, in bytes. Default is 5MB, or the value
|
||||
configured for the FileSystem creating this file
|
||||
size: None or int
|
||||
If given, this is the size of the file in bytes, and we don't attempt
|
||||
to call the server to find the value.
|
||||
kwargs: all other key-values are passed to requests calls.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
url,
|
||||
session=None,
|
||||
block_size=None,
|
||||
mode="rb",
|
||||
cache_type="bytes",
|
||||
cache_options=None,
|
||||
size=None,
|
||||
loop=None,
|
||||
asynchronous=False,
|
||||
**kwargs,
|
||||
):
|
||||
if mode != "rb":
|
||||
raise NotImplementedError("File mode not supported")
|
||||
self.asynchronous = asynchronous
|
||||
self.loop = loop
|
||||
self.url = url
|
||||
self.session = session
|
||||
self.details = {"name": url, "size": size, "type": "file"}
|
||||
super().__init__(
|
||||
fs=fs,
|
||||
path=url,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
cache_type=cache_type,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def read(self, length=-1):
|
||||
"""Read bytes from file
|
||||
|
||||
Parameters
|
||||
----------
|
||||
length: int
|
||||
Read up to this many bytes. If negative, read all content to end of
|
||||
file. If the server has not supplied the filesize, attempting to
|
||||
read only part of the data will raise a ValueError.
|
||||
"""
|
||||
if (
|
||||
(length < 0 and self.loc == 0) # explicit read all
|
||||
# but not when the size is known and fits into a block anyways
|
||||
and not (self.size is not None and self.size <= self.blocksize)
|
||||
):
|
||||
self._fetch_all()
|
||||
if self.size is None:
|
||||
if length < 0:
|
||||
self._fetch_all()
|
||||
else:
|
||||
length = min(self.size - self.loc, length)
|
||||
return super().read(length)
|
||||
|
||||
async def async_fetch_all(self):
|
||||
"""Read whole file in one shot, without caching
|
||||
|
||||
This is only called when position is still at zero,
|
||||
and read() is called without a byte-count.
|
||||
"""
|
||||
logger.debug(f"Fetch all for {self}")
|
||||
if not isinstance(self.cache, AllBytes):
|
||||
r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs)
|
||||
async with r:
|
||||
r.raise_for_status()
|
||||
out = await r.read()
|
||||
self.cache = AllBytes(
|
||||
size=len(out), fetcher=None, blocksize=None, data=out
|
||||
)
|
||||
self.size = len(out)
|
||||
|
||||
_fetch_all = sync_wrapper(async_fetch_all)
|
||||
|
||||
def _parse_content_range(self, headers):
|
||||
"""Parse the Content-Range header"""
|
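# For illustration: "bytes 0-499/1234" parses to (0, 499, 1234),
# "bytes */1234" to (None, None, 1234), and a missing or unparsable
# header to (None, None, None).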
||||
s = headers.get("Content-Range", "")
|
||||
m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
|
||||
if not m:
|
||||
return None, None, None
|
||||
|
||||
if m[1] == "*":
|
||||
start = end = None
|
||||
else:
|
||||
start, end = [int(x) for x in m[1].split("-")]
|
||||
total = None if m[2] == "*" else int(m[2])
|
||||
return start, end, total
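    # Hedged illustration (added, not in the original file): a header such as
    # "Content-Range: bytes 0-1023/4096" parses to (0, 1023, 4096);
    # "bytes */4096" gives (None, None, 4096); a missing or malformed
    # header gives (None, None, None).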
|
||||
|
||||
async def async_fetch_range(self, start, end):
|
||||
"""Download a block of data
|
||||
|
||||
The expectation is that the server returns only the requested bytes,
|
||||
with HTTP code 206. If this is not the case, we first check the headers,
|
||||
and then stream the output - if the data size is bigger than we
|
||||
requested, an exception is raised.
|
||||
"""
|
||||
logger.debug(f"Fetch range for {self}: {start}-{end}")
|
||||
kwargs = self.kwargs.copy()
|
||||
headers = kwargs.pop("headers", {}).copy()
|
||||
headers["Range"] = f"bytes={start}-{end - 1}"
|
||||
logger.debug(f"{self.url} : {headers['Range']}")
|
||||
r = await self.session.get(
|
||||
self.fs.encode_url(self.url), headers=headers, **kwargs
|
||||
)
|
||||
async with r:
|
||||
if r.status == 416:
|
||||
# range request outside file
|
||||
return b""
|
||||
r.raise_for_status()
|
||||
|
||||
# If the server has handled the range request, it should reply
|
||||
# with status 206 (partial content). But we'll guess that a suitable
|
||||
# Content-Range header or a Content-Length no more than the
|
||||
# requested range also mean we have got the desired range.
|
||||
response_is_range = (
|
||||
r.status == 206
|
||||
or self._parse_content_range(r.headers)[0] == start
|
||||
or int(r.headers.get("Content-Length", end + 1)) <= end - start
|
||||
)
|
||||
|
||||
if response_is_range:
|
||||
# partial content, as expected
|
||||
out = await r.read()
|
||||
elif start > 0:
|
||||
raise ValueError(
|
||||
"The HTTP server doesn't appear to support range requests. "
|
||||
"Only reading this file from the beginning is supported. "
|
||||
"Open with block_size=0 for a streaming file interface."
|
||||
)
|
||||
else:
|
||||
# Response is not a range, but we want the start of the file,
|
||||
# so we can read the required amount anyway.
|
||||
cl = 0
|
||||
out = []
|
||||
while True:
|
||||
chunk = await r.content.read(2**20)
|
||||
# data size unknown, let's read until we have enough
|
||||
if chunk:
|
||||
out.append(chunk)
|
||||
cl += len(chunk)
|
||||
if cl > end - start:
|
||||
break
|
||||
else:
|
||||
break
|
||||
out = b"".join(out)[: end - start]
|
||||
return out
|
||||
|
||||
_fetch_range = sync_wrapper(async_fetch_range)
|
||||
|
||||
|
||||
magic_check = re.compile("([*[])")
|
||||
|
||||
|
||||
def has_magic(s):
|
||||
match = magic_check.search(s)
|
||||
return match is not None
|
||||
|
||||
|
||||
class HTTPStreamFile(AbstractBufferedFile):
|
||||
def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
|
||||
self.asynchronous = kwargs.pop("asynchronous", False)
|
||||
self.url = url
|
||||
self.loop = loop
|
||||
self.session = session
|
||||
if mode != "rb":
|
||||
raise ValueError
|
||||
self.details = {"name": url, "size": None}
|
||||
super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)
|
||||
|
||||
async def cor():
|
||||
r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__()
|
||||
self.fs._raise_not_found_for_status(r, url)
|
||||
return r
|
||||
|
||||
self.r = sync(self.loop, cor)
|
||||
self.loop = fs.loop
|
||||
|
||||
def seek(self, loc, whence=0):
|
||||
if loc == 0 and whence == 1:
|
||||
return
|
||||
if loc == self.loc and whence == 0:
|
||||
return
|
||||
raise ValueError("Cannot seek streaming HTTP file")
|
||||
|
||||
async def _read(self, num=-1):
|
||||
out = await self.r.content.read(num)
|
||||
self.loc += len(out)
|
||||
return out
|
||||
|
||||
read = sync_wrapper(_read)
|
||||
|
||||
async def _close(self):
|
||||
self.r.close()
|
||||
|
||||
def close(self):
|
||||
asyncio.run_coroutine_threadsafe(self._close(), self.loop)
|
||||
super().close()
|
||||
|
||||
|
||||
class AsyncStreamFile(AbstractAsyncStreamedFile):
|
||||
def __init__(
|
||||
self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
|
||||
):
|
||||
self.url = url
|
||||
self.session = session
|
||||
self.r = None
|
||||
if mode != "rb":
|
||||
raise ValueError
|
||||
self.details = {"name": url, "size": None}
|
||||
self.kwargs = kwargs
|
||||
super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
|
||||
self.size = size
|
||||
|
||||
async def read(self, num=-1):
|
||||
if self.r is None:
|
||||
r = await self.session.get(
|
||||
self.fs.encode_url(self.url), **self.kwargs
|
||||
).__aenter__()
|
||||
self.fs._raise_not_found_for_status(r, self.url)
|
||||
self.r = r
|
||||
out = await self.r.content.read(num)
|
||||
self.loc += len(out)
|
||||
return out
|
||||
|
||||
async def close(self):
|
||||
if self.r is not None:
|
||||
self.r.close()
|
||||
self.r = None
|
||||
await super().close()
|
||||
|
||||
|
||||
async def get_range(session, url, start, end, file=None, **kwargs):
|
||||
# explicit get a range when we know it must be safe
|
||||
kwargs = kwargs.copy()
|
||||
headers = kwargs.pop("headers", {}).copy()
|
||||
headers["Range"] = f"bytes={start}-{end - 1}"
|
||||
r = await session.get(url, headers=headers, **kwargs)
|
||||
r.raise_for_status()
|
||||
async with r:
|
||||
out = await r.read()
|
||||
if file:
|
||||
with open(file, "r+b") as f: # noqa: ASYNC230
|
||||
f.seek(start)
|
||||
f.write(out)
|
||||
else:
|
||||
return out
|
||||
|
||||
|
||||
async def _file_info(url, session, size_policy="head", **kwargs):
|
||||
"""Call HEAD on the server to get details about the file (size/checksum etc.)
|
||||
|
||||
Default operation is to explicitly allow redirects and use encoding
|
||||
'identity' (no compression) to get the true size of the target.
|
||||
"""
|
||||
logger.debug("Retrieve file size for %s", url)
|
||||
kwargs = kwargs.copy()
|
||||
ar = kwargs.pop("allow_redirects", True)
|
||||
head = kwargs.get("headers", {}).copy()
|
||||
head["Accept-Encoding"] = "identity"
|
||||
kwargs["headers"] = head
|
||||
|
||||
info = {}
|
||||
if size_policy == "head":
|
||||
r = await session.head(url, allow_redirects=ar, **kwargs)
|
||||
elif size_policy == "get":
|
||||
r = await session.get(url, allow_redirects=ar, **kwargs)
|
||||
else:
|
||||
raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
|
||||
async with r:
|
||||
r.raise_for_status()
|
||||
|
||||
if "Content-Length" in r.headers:
|
||||
# Some servers may choose to ignore Accept-Encoding and return
|
||||
# compressed content, in which case the returned size is unreliable.
|
||||
if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
|
||||
"identity",
|
||||
"",
|
||||
]:
|
||||
info["size"] = int(r.headers["Content-Length"])
|
||||
elif "Content-Range" in r.headers:
|
||||
info["size"] = int(r.headers["Content-Range"].split("/")[1])
|
||||
|
||||
if "Content-Type" in r.headers:
|
||||
info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
|
||||
|
||||
if r.headers.get("Accept-Ranges") == "none":
|
||||
# Some servers may explicitly discourage partial content requests, but
|
||||
# the lack of "Accept-Ranges" does not always indicate they would fail
|
||||
info["partial"] = False
|
||||
|
||||
info["url"] = str(r.url)
|
||||
|
||||
for checksum_field in ["ETag", "Content-MD5", "Digest", "Last-Modified"]:
|
||||
if r.headers.get(checksum_field):
|
||||
info[checksum_field] = r.headers[checksum_field]
|
||||
|
||||
return info
|
||||
|
||||
|
||||
async def _file_size(url, session=None, *args, **kwargs):
|
||||
if session is None:
|
||||
session = await get_client()
|
||||
info = await _file_info(url, session=session, *args, **kwargs)
|
||||
return info.get("size")
|
||||
|
||||
|
||||
file_size = sync_wrapper(_file_size)
|
||||
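# --- Hedged usage sketch (added for illustration; not part of the original file) ---
# Assuming this module is fsspec's aiohttp-based HTTP implementation (registered
# under the "http"/"https" protocols), the blocking entry points can be exercised
# as below. The URL is a placeholder.
if __name__ == "__main__":
    import fsspec

    fs = fsspec.filesystem("https")                       # HTTPFileSystem instance
    url = "https://example.com/data.bin"                  # placeholder URL
    print(fs.info(url))                                   # HEAD (then GET) for size/mimetype
    with fs.open(url, mode="rb", block_size=2**20) as f:  # HTTPFile with 1 MiB read-ahead
        head = f.read(1024)                               # served via a range request
    print(len(head))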
@@ -0,0 +1,931 @@
|
||||
"""This file is largely copied from http.py"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
from copy import copy
|
||||
from json import dumps, loads
|
||||
from urllib.parse import urlparse
|
||||
|
||||
try:
|
||||
import yarl
|
||||
except (ImportError, ModuleNotFoundError, OSError):
|
||||
yarl = False
|
||||
|
||||
from fsspec.callbacks import _DEFAULT_CALLBACK
|
||||
from fsspec.registry import register_implementation
|
||||
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
|
||||
from fsspec.utils import DEFAULT_BLOCK_SIZE, isfilelike, nullcontext, tokenize
|
||||
|
||||
from ..caching import AllBytes
|
||||
|
||||
# https://stackoverflow.com/a/15926317/3821154
|
||||
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
|
||||
ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
|
||||
logger = logging.getLogger("fsspec.http")
|
||||
|
||||
|
||||
class JsHttpException(urllib.error.HTTPError): ...
|
||||
|
||||
|
||||
class StreamIO(io.BytesIO):
|
||||
# fake class, so you can set attributes on it
|
||||
# will eventually actually stream
|
||||
...
|
||||
|
||||
|
||||
class ResponseProxy:
|
||||
"""Looks like a requests response"""
|
||||
|
||||
def __init__(self, req, stream=False):
|
||||
self.request = req
|
||||
self.stream = stream
|
||||
self._data = None
|
||||
self._headers = None
|
||||
|
||||
@property
|
||||
def raw(self):
|
||||
if self._data is None:
|
||||
b = self.request.response.to_bytes()
|
||||
if self.stream:
|
||||
self._data = StreamIO(b)
|
||||
else:
|
||||
self._data = b
|
||||
return self._data
|
||||
|
||||
def close(self):
|
||||
if hasattr(self, "_data"):
|
||||
del self._data
|
||||
|
||||
@property
|
||||
def headers(self):
|
||||
if self._headers is None:
|
||||
self._headers = dict(
|
||||
[
|
||||
_.split(": ")
|
||||
for _ in self.request.getAllResponseHeaders().strip().split("\r\n")
|
||||
]
|
||||
)
|
||||
return self._headers
|
||||
|
||||
@property
|
||||
def status_code(self):
|
||||
return int(self.request.status)
|
||||
|
||||
def raise_for_status(self):
|
||||
if not self.ok:
|
||||
raise JsHttpException(
|
||||
self.url, self.status_code, self.reason, self.headers, None
|
||||
)
|
||||
|
||||
def iter_content(self, chunksize, *_, **__):
|
||||
while True:
|
||||
out = self.raw.read(chunksize)
|
||||
if out:
|
||||
yield out
|
||||
else:
|
||||
break
|
||||
|
||||
@property
|
||||
def reason(self):
|
||||
return self.request.statusText
|
||||
|
||||
@property
|
||||
def ok(self):
|
||||
return self.status_code < 400
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
return self.request.response.responseURL
|
||||
|
||||
@property
|
||||
def text(self):
|
||||
# TODO: encoding from headers
|
||||
return self.content.decode()
|
||||
|
||||
@property
|
||||
def content(self):
|
||||
self.stream = False
|
||||
return self.raw
|
||||
|
||||
def json(self):
|
||||
return loads(self.text)
|
||||
|
||||
|
||||
class RequestsSessionShim:
|
||||
def __init__(self):
|
||||
self.headers = {}
|
||||
|
||||
def request(
|
||||
self,
|
||||
method,
|
||||
url,
|
||||
params=None,
|
||||
data=None,
|
||||
headers=None,
|
||||
cookies=None,
|
||||
files=None,
|
||||
auth=None,
|
||||
timeout=None,
|
||||
allow_redirects=None,
|
||||
proxies=None,
|
||||
hooks=None,
|
||||
stream=None,
|
||||
verify=None,
|
||||
cert=None,
|
||||
json=None,
|
||||
):
|
||||
from js import Blob, XMLHttpRequest
|
||||
|
||||
logger.debug("JS request: %s %s", method, url)
|
||||
|
||||
if cert or verify or proxies or files or cookies or hooks:
|
||||
raise NotImplementedError
|
||||
if data and json:
|
||||
raise ValueError("Use json= or data=, not both")
|
||||
req = XMLHttpRequest.new()
|
||||
extra = auth if auth else ()
|
||||
if params:
|
||||
url = f"{url}?{urllib.parse.urlencode(params)}"
|
||||
req.open(method, url, False, *extra)
|
||||
if timeout:
|
||||
req.timeout = timeout
|
||||
if headers:
|
||||
for k, v in headers.items():
|
||||
req.setRequestHeader(k, v)
|
||||
|
||||
req.setRequestHeader("Accept", "application/octet-stream")
|
||||
req.responseType = "arraybuffer"
|
||||
if json:
|
||||
            blob = Blob.new([dumps(json)], {type: "application/json"})
|
||||
req.send(blob)
|
||||
elif data:
|
||||
if isinstance(data, io.IOBase):
|
||||
data = data.read()
|
||||
blob = Blob.new([data], {type: "application/octet-stream"})
|
||||
req.send(blob)
|
||||
else:
|
||||
req.send(None)
|
||||
return ResponseProxy(req, stream=stream)
|
||||
|
||||
def get(self, url, **kwargs):
|
||||
return self.request("GET", url, **kwargs)
|
||||
|
||||
def head(self, url, **kwargs):
|
||||
return self.request("HEAD", url, **kwargs)
|
||||
|
||||
    def post(self, url, **kwargs):
        return self.request("POST", url, **kwargs)
|
||||
|
||||
def put(self, url, **kwargs):
|
||||
return self.request("PUT", url, **kwargs)
|
||||
|
||||
def patch(self, url, **kwargs):
|
||||
return self.request("PATCH", url, **kwargs)
|
||||
|
||||
def delete(self, url, **kwargs):
|
||||
return self.request("DELETE", url, **kwargs)
|
||||
|
||||
|
||||
class HTTPFileSystem(AbstractFileSystem):
|
||||
"""
|
||||
Simple File-System for fetching data via HTTP(S)
|
||||
|
||||
This is the BLOCKING version of the normal HTTPFileSystem. It uses
|
||||
requests in normal python and the JS runtime in pyodide.
|
||||
|
||||
***This implementation is extremely experimental, do not use unless
|
||||
you are testing pyodide/pyscript integration***
|
||||
"""
|
||||
|
||||
protocol = ("http", "https", "sync-http", "sync-https")
|
||||
sep = "/"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
simple_links=True,
|
||||
block_size=None,
|
||||
same_scheme=True,
|
||||
cache_type="readahead",
|
||||
cache_options=None,
|
||||
client_kwargs=None,
|
||||
encoded=False,
|
||||
**storage_options,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
block_size: int
|
||||
Blocks to read bytes; if 0, will default to raw requests file-like
|
||||
objects instead of HTTPFile instances
|
||||
simple_links: bool
|
||||
If True, will consider both HTML <a> tags and anything that looks
|
||||
like a URL; if False, will consider only the former.
|
||||
same_scheme: True
|
||||
When doing ls/glob, if this is True, only consider paths that have
|
||||
http/https matching the input URLs.
|
||||
size_policy: this argument is deprecated
|
||||
client_kwargs: dict
|
||||
Passed to aiohttp.ClientSession, see
|
||||
https://docs.aiohttp.org/en/stable/client_reference.html
|
||||
For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
|
||||
storage_options: key-value
|
||||
Any other parameters passed on to requests
|
||||
cache_type, cache_options: defaults used in open
|
||||
"""
|
||||
super().__init__(self, **storage_options)
|
||||
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
|
||||
self.simple_links = simple_links
|
||||
self.same_schema = same_scheme
|
||||
self.cache_type = cache_type
|
||||
self.cache_options = cache_options
|
||||
self.client_kwargs = client_kwargs or {}
|
||||
self.encoded = encoded
|
||||
self.kwargs = storage_options
|
||||
|
||||
try:
|
||||
import js # noqa: F401
|
||||
|
||||
logger.debug("Starting JS session")
|
||||
self.session = RequestsSessionShim()
|
||||
self.js = True
|
||||
except Exception as e:
|
||||
import requests
|
||||
|
||||
logger.debug("Starting cpython session because of: %s", e)
|
||||
self.session = requests.Session(**(client_kwargs or {}))
|
||||
self.js = False
|
||||
|
||||
request_options = copy(storage_options)
|
||||
self.use_listings_cache = request_options.pop("use_listings_cache", False)
|
||||
request_options.pop("listings_expiry_time", None)
|
||||
request_options.pop("max_paths", None)
|
||||
request_options.pop("skip_instance_cache", None)
|
||||
self.kwargs = request_options
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return "sync-http"
|
||||
|
||||
def encode_url(self, url):
|
||||
if yarl:
|
||||
return yarl.URL(url, encoded=self.encoded)
|
||||
return url
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path: str) -> str:
|
||||
"""For HTTP, we always want to keep the full URL"""
|
||||
path = path.replace("sync-http://", "http://").replace(
|
||||
"sync-https://", "https://"
|
||||
)
|
||||
return path
|
||||
|
||||
@classmethod
|
||||
def _parent(cls, path):
|
||||
# override, since _strip_protocol is different for URLs
|
||||
par = super()._parent(path)
|
||||
if len(par) > 7: # "http://..."
|
||||
return par
|
||||
return ""
|
||||
|
||||
def _ls_real(self, url, detail=True, **kwargs):
|
||||
# ignoring URL-encoded arguments
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(url)
|
||||
r = self.session.get(self.encode_url(url), **self.kwargs)
|
||||
self._raise_not_found_for_status(r, url)
|
||||
text = r.text
|
||||
if self.simple_links:
|
||||
links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
|
||||
else:
|
||||
links = [u[2] for u in ex.findall(text)]
|
||||
out = set()
|
||||
parts = urlparse(url)
|
||||
for l in links:
|
||||
if isinstance(l, tuple):
|
||||
l = l[1]
|
||||
if l.startswith("/") and len(l) > 1:
|
||||
# absolute URL on this server
|
||||
l = parts.scheme + "://" + parts.netloc + l
|
||||
if l.startswith("http"):
|
||||
if self.same_schema and l.startswith(url.rstrip("/") + "/"):
|
||||
out.add(l)
|
||||
elif l.replace("https", "http").startswith(
|
||||
url.replace("https", "http").rstrip("/") + "/"
|
||||
):
|
||||
# allowed to cross http <-> https
|
||||
out.add(l)
|
||||
else:
|
||||
if l not in ["..", "../"]:
|
||||
# Ignore FTP-like "parent"
|
||||
out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
|
||||
if not out and url.endswith("/"):
|
||||
out = self._ls_real(url.rstrip("/"), detail=False)
|
||||
if detail:
|
||||
return [
|
||||
{
|
||||
"name": u,
|
||||
"size": None,
|
||||
"type": "directory" if u.endswith("/") else "file",
|
||||
}
|
||||
for u in out
|
||||
]
|
||||
else:
|
||||
return sorted(out)
|
||||
|
||||
def ls(self, url, detail=True, **kwargs):
|
||||
if self.use_listings_cache and url in self.dircache:
|
||||
out = self.dircache[url]
|
||||
else:
|
||||
out = self._ls_real(url, detail=detail, **kwargs)
|
||||
self.dircache[url] = out
|
||||
return out
|
||||
|
||||
def _raise_not_found_for_status(self, response, url):
|
||||
"""
|
||||
Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
|
||||
"""
|
||||
if response.status_code == 404:
|
||||
raise FileNotFoundError(url)
|
||||
response.raise_for_status()
|
||||
|
||||
def cat_file(self, url, start=None, end=None, **kwargs):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(url)
|
||||
|
||||
if start is not None or end is not None:
|
||||
if start == end:
|
||||
return b""
|
||||
headers = kw.pop("headers", {}).copy()
|
||||
|
||||
headers["Range"] = self._process_limits(url, start, end)
|
||||
kw["headers"] = headers
|
||||
r = self.session.get(self.encode_url(url), **kw)
|
||||
self._raise_not_found_for_status(r, url)
|
||||
return r.content
|
||||
|
||||
def get_file(
|
||||
self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
|
||||
):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(rpath)
|
||||
r = self.session.get(self.encode_url(rpath), **kw)
|
||||
try:
|
||||
size = int(
|
||||
r.headers.get("content-length", None)
|
||||
or r.headers.get("Content-Length", None)
|
||||
)
|
||||
except (ValueError, KeyError, TypeError):
|
||||
size = None
|
||||
|
||||
callback.set_size(size)
|
||||
self._raise_not_found_for_status(r, rpath)
|
||||
if not isfilelike(lpath):
|
||||
lpath = open(lpath, "wb")
|
||||
for chunk in r.iter_content(chunk_size, decode_unicode=False):
|
||||
lpath.write(chunk)
|
||||
callback.relative_update(len(chunk))
|
||||
|
||||
def put_file(
|
||||
self,
|
||||
lpath,
|
||||
rpath,
|
||||
chunk_size=5 * 2**20,
|
||||
callback=_DEFAULT_CALLBACK,
|
||||
method="post",
|
||||
**kwargs,
|
||||
):
|
||||
def gen_chunks():
|
||||
# Support passing arbitrary file-like objects
|
||||
# and use them instead of streams.
|
||||
if isinstance(lpath, io.IOBase):
|
||||
context = nullcontext(lpath)
|
||||
use_seek = False # might not support seeking
|
||||
else:
|
||||
context = open(lpath, "rb")
|
||||
use_seek = True
|
||||
|
||||
with context as f:
|
||||
if use_seek:
|
||||
callback.set_size(f.seek(0, 2))
|
||||
f.seek(0)
|
||||
else:
|
||||
callback.set_size(getattr(f, "size", None))
|
||||
|
||||
chunk = f.read(chunk_size)
|
||||
while chunk:
|
||||
yield chunk
|
||||
callback.relative_update(len(chunk))
|
||||
chunk = f.read(chunk_size)
|
||||
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
|
||||
method = method.lower()
|
||||
if method not in ("post", "put"):
|
||||
raise ValueError(
|
||||
f"method has to be either 'post' or 'put', not: {method!r}"
|
||||
)
|
||||
|
||||
meth = getattr(self.session, method)
|
||||
resp = meth(rpath, data=gen_chunks(), **kw)
|
||||
self._raise_not_found_for_status(resp, rpath)
|
||||
|
||||
def _process_limits(self, url, start, end):
|
||||
"""Helper for "Range"-based _cat_file"""
|
||||
size = None
|
||||
suff = False
|
||||
if start is not None and start < 0:
|
||||
# if start is negative and end None, end is the "suffix length"
|
||||
if end is None:
|
||||
end = -start
|
||||
start = ""
|
||||
suff = True
|
||||
else:
|
||||
size = size or self.info(url)["size"]
|
||||
start = size + start
|
||||
elif start is None:
|
||||
start = 0
|
||||
if not suff:
|
||||
if end is not None and end < 0:
|
||||
if start is not None:
|
||||
size = size or self.info(url)["size"]
|
||||
end = size + end
|
||||
elif end is None:
|
||||
end = ""
|
||||
if isinstance(end, int):
|
||||
end -= 1 # bytes range is inclusive
|
||||
return f"bytes={start}-{end}"
|
||||
|
||||
def exists(self, path, **kwargs):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
try:
|
||||
logger.debug(path)
|
||||
r = self.session.get(self.encode_url(path), **kw)
|
||||
return r.status_code < 400
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def isfile(self, path, **kwargs):
|
||||
return self.exists(path, **kwargs)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=None, # XXX: This differs from the base class.
|
||||
cache_type=None,
|
||||
cache_options=None,
|
||||
size=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Make a file-like object
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Full URL with protocol
|
||||
mode: string
|
||||
must be "rb"
|
||||
block_size: int or None
|
||||
Bytes to download in one request; use instance value if None. If
|
||||
zero, will return a streaming Requests file-like instance.
|
||||
kwargs: key-value
|
||||
Any other parameters, passed to requests calls
|
||||
"""
|
||||
if mode != "rb":
|
||||
raise NotImplementedError
|
||||
block_size = block_size if block_size is not None else self.block_size
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
size = size or self.info(path, **kwargs)["size"]
|
||||
if block_size and size:
|
||||
return HTTPFile(
|
||||
self,
|
||||
path,
|
||||
session=self.session,
|
||||
block_size=block_size,
|
||||
mode=mode,
|
||||
size=size,
|
||||
cache_type=cache_type or self.cache_type,
|
||||
cache_options=cache_options or self.cache_options,
|
||||
**kw,
|
||||
)
|
||||
else:
|
||||
return HTTPStreamFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
session=self.session,
|
||||
**kw,
|
||||
)
|
||||
|
||||
def ukey(self, url):
|
||||
"""Unique identifier; assume HTTP files are static, unchanging"""
|
||||
return tokenize(url, self.kwargs, self.protocol)
|
||||
|
||||
def info(self, url, **kwargs):
|
||||
"""Get info of URL
|
||||
|
||||
Tries to access location via HEAD, and then GET methods, but does
|
||||
not fetch the data.
|
||||
|
||||
It is possible that the server does not supply any size information, in
|
||||
which case size will be given as None (and certain operations on the
|
||||
corresponding file will not work).
|
||||
"""
|
||||
info = {}
|
||||
for policy in ["head", "get"]:
|
||||
try:
|
||||
info.update(
|
||||
_file_info(
|
||||
self.encode_url(url),
|
||||
size_policy=policy,
|
||||
session=self.session,
|
||||
**self.kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
)
|
||||
if info.get("size") is not None:
|
||||
break
|
||||
except Exception as exc:
|
||||
if policy == "get":
|
||||
# If get failed, then raise a FileNotFoundError
|
||||
raise FileNotFoundError(url) from exc
|
||||
logger.debug(str(exc))
|
||||
|
||||
return {"name": url, "size": None, **info, "type": "file"}
|
||||
|
||||
def glob(self, path, maxdepth=None, **kwargs):
|
||||
"""
|
||||
Find files by glob-matching.
|
||||
|
||||
        This implementation is identical to the one in AbstractFileSystem,
|
||||
but "?" is not considered as a character for globbing, because it is
|
||||
so common in URLs, often identifying the "query" part.
|
||||
"""
|
||||
import re
|
||||
|
||||
ends = path.endswith("/")
|
||||
path = self._strip_protocol(path)
|
||||
indstar = path.find("*") if path.find("*") >= 0 else len(path)
|
||||
indbrace = path.find("[") if path.find("[") >= 0 else len(path)
|
||||
|
||||
ind = min(indstar, indbrace)
|
||||
|
||||
detail = kwargs.pop("detail", False)
|
||||
|
||||
if not has_magic(path):
|
||||
root = path
|
||||
depth = 1
|
||||
if ends:
|
||||
path += "/*"
|
||||
elif self.exists(path):
|
||||
if not detail:
|
||||
return [path]
|
||||
else:
|
||||
return {path: self.info(path)}
|
||||
else:
|
||||
if not detail:
|
||||
return [] # glob of non-existent returns empty
|
||||
else:
|
||||
return {}
|
||||
elif "/" in path[:ind]:
|
||||
ind2 = path[:ind].rindex("/")
|
||||
root = path[: ind2 + 1]
|
||||
depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
|
||||
else:
|
||||
root = ""
|
||||
depth = None if "**" in path else path[ind + 1 :].count("/") + 1
|
||||
|
||||
allpaths = self.find(
|
||||
root, maxdepth=maxdepth or depth, withdirs=True, detail=True, **kwargs
|
||||
)
|
||||
# Escape characters special to python regex, leaving our supported
|
||||
# special characters in place.
|
||||
# See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
|
||||
# for shell globbing details.
|
||||
pattern = (
|
||||
"^"
|
||||
+ (
|
||||
path.replace("\\", r"\\")
|
||||
.replace(".", r"\.")
|
||||
.replace("+", r"\+")
|
||||
.replace("//", "/")
|
||||
.replace("(", r"\(")
|
||||
.replace(")", r"\)")
|
||||
.replace("|", r"\|")
|
||||
.replace("^", r"\^")
|
||||
.replace("$", r"\$")
|
||||
.replace("{", r"\{")
|
||||
.replace("}", r"\}")
|
||||
.rstrip("/")
|
||||
)
|
||||
+ "$"
|
||||
)
|
||||
pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
|
||||
pattern = re.sub("[*]", "[^/]*", pattern)
|
||||
pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
|
||||
out = {
|
||||
p: allpaths[p]
|
||||
for p in sorted(allpaths)
|
||||
if pattern.match(p.replace("//", "/").rstrip("/"))
|
||||
}
|
||||
if detail:
|
||||
return out
|
||||
else:
|
||||
return list(out)
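    # Hedged note (added for illustration): "?" is not treated as a wildcard here,
    # so a URL such as "https://host/files/*.csv?raw=true" globs only on "*" while
    # the query string is left alone.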
|
||||
|
||||
def isdir(self, path):
|
||||
# override, since all URLs are (also) files
|
||||
try:
|
||||
return bool(self.ls(path))
|
||||
except (FileNotFoundError, ValueError):
|
||||
return False
|
||||
|
||||
|
||||
class HTTPFile(AbstractBufferedFile):
|
||||
"""
|
||||
    A file-like object pointing to a remote HTTP(S) resource
|
||||
|
||||
    Supports only reading, with read-ahead of a predetermined block-size.
|
||||
|
||||
In the case that the server does not supply the filesize, only reading of
|
||||
the complete file in one go is supported.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
Full URL of the remote resource, including the protocol
|
||||
session: requests.Session or None
|
||||
All calls will be made within this session, to avoid restarting
|
||||
connections where the server allows this
|
||||
block_size: int or None
|
||||
The amount of read-ahead to do, in bytes. Default is 5MB, or the value
|
||||
configured for the FileSystem creating this file
|
||||
size: None or int
|
||||
If given, this is the size of the file in bytes, and we don't attempt
|
||||
to call the server to find the value.
|
||||
kwargs: all other key-values are passed to requests calls.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
url,
|
||||
session=None,
|
||||
block_size=None,
|
||||
mode="rb",
|
||||
cache_type="bytes",
|
||||
cache_options=None,
|
||||
size=None,
|
||||
**kwargs,
|
||||
):
|
||||
if mode != "rb":
|
||||
raise NotImplementedError("File mode not supported")
|
||||
self.url = url
|
||||
self.session = session
|
||||
self.details = {"name": url, "size": size, "type": "file"}
|
||||
super().__init__(
|
||||
fs=fs,
|
||||
path=url,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
cache_type=cache_type,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def read(self, length=-1):
|
||||
"""Read bytes from file
|
||||
|
||||
Parameters
|
||||
----------
|
||||
length: int
|
||||
Read up to this many bytes. If negative, read all content to end of
|
||||
file. If the server has not supplied the filesize, attempting to
|
||||
read only part of the data will raise a ValueError.
|
||||
"""
|
||||
if (
|
||||
(length < 0 and self.loc == 0) # explicit read all
|
||||
# but not when the size is known and fits into a block anyways
|
||||
and not (self.size is not None and self.size <= self.blocksize)
|
||||
):
|
||||
self._fetch_all()
|
||||
if self.size is None:
|
||||
if length < 0:
|
||||
self._fetch_all()
|
||||
else:
|
||||
length = min(self.size - self.loc, length)
|
||||
return super().read(length)
|
||||
|
||||
def _fetch_all(self):
|
||||
"""Read whole file in one shot, without caching
|
||||
|
||||
This is only called when position is still at zero,
|
||||
and read() is called without a byte-count.
|
||||
"""
|
||||
logger.debug(f"Fetch all for {self}")
|
||||
if not isinstance(self.cache, AllBytes):
|
||||
r = self.session.get(self.fs.encode_url(self.url), **self.kwargs)
|
||||
r.raise_for_status()
|
||||
out = r.content
|
||||
self.cache = AllBytes(size=len(out), fetcher=None, blocksize=None, data=out)
|
||||
self.size = len(out)
|
||||
|
||||
def _parse_content_range(self, headers):
|
||||
"""Parse the Content-Range header"""
|
||||
s = headers.get("Content-Range", "")
|
||||
m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
|
||||
if not m:
|
||||
return None, None, None
|
||||
|
||||
if m[1] == "*":
|
||||
start = end = None
|
||||
else:
|
||||
start, end = [int(x) for x in m[1].split("-")]
|
||||
total = None if m[2] == "*" else int(m[2])
|
||||
return start, end, total
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Download a block of data
|
||||
|
||||
The expectation is that the server returns only the requested bytes,
|
||||
with HTTP code 206. If this is not the case, we first check the headers,
|
||||
and then stream the output - if the data size is bigger than we
|
||||
requested, an exception is raised.
|
||||
"""
|
||||
logger.debug(f"Fetch range for {self}: {start}-{end}")
|
||||
kwargs = self.kwargs.copy()
|
||||
headers = kwargs.pop("headers", {}).copy()
|
||||
headers["Range"] = f"bytes={start}-{end - 1}"
|
||||
logger.debug("%s : %s", self.url, headers["Range"])
|
||||
r = self.session.get(self.fs.encode_url(self.url), headers=headers, **kwargs)
|
||||
if r.status_code == 416:
|
||||
# range request outside file
|
||||
return b""
|
||||
r.raise_for_status()
|
||||
|
||||
# If the server has handled the range request, it should reply
|
||||
# with status 206 (partial content). But we'll guess that a suitable
|
||||
# Content-Range header or a Content-Length no more than the
|
||||
# requested range also mean we have got the desired range.
|
||||
cl = r.headers.get("Content-Length", r.headers.get("content-length", end + 1))
|
||||
response_is_range = (
|
||||
r.status_code == 206
|
||||
or self._parse_content_range(r.headers)[0] == start
|
||||
or int(cl) <= end - start
|
||||
)
|
||||
|
||||
if response_is_range:
|
||||
# partial content, as expected
|
||||
out = r.content
|
||||
elif start > 0:
|
||||
raise ValueError(
|
||||
"The HTTP server doesn't appear to support range requests. "
|
||||
"Only reading this file from the beginning is supported. "
|
||||
"Open with block_size=0 for a streaming file interface."
|
||||
)
|
||||
else:
|
||||
# Response is not a range, but we want the start of the file,
|
||||
# so we can read the required amount anyway.
|
||||
cl = 0
|
||||
out = []
|
||||
for chunk in r.iter_content(2**20, False):
|
||||
out.append(chunk)
|
||||
cl += len(chunk)
|
||||
out = b"".join(out)[: end - start]
|
||||
return out
|
||||
|
||||
|
||||
magic_check = re.compile("([*[])")
|
||||
|
||||
|
||||
def has_magic(s):
|
||||
match = magic_check.search(s)
|
||||
return match is not None
|
||||
|
||||
|
||||
class HTTPStreamFile(AbstractBufferedFile):
|
||||
def __init__(self, fs, url, mode="rb", session=None, **kwargs):
|
||||
self.url = url
|
||||
self.session = session
|
||||
if mode != "rb":
|
||||
raise ValueError
|
||||
self.details = {"name": url, "size": None}
|
||||
super().__init__(fs=fs, path=url, mode=mode, cache_type="readahead", **kwargs)
|
||||
|
||||
r = self.session.get(self.fs.encode_url(url), stream=True, **kwargs)
|
||||
self.fs._raise_not_found_for_status(r, url)
|
||||
self.it = r.iter_content(1024, False)
|
||||
self.leftover = b""
|
||||
|
||||
self.r = r
|
||||
|
||||
def seek(self, *args, **kwargs):
|
||||
raise ValueError("Cannot seek streaming HTTP file")
|
||||
|
||||
def read(self, num=-1):
|
||||
bufs = [self.leftover]
|
||||
leng = len(self.leftover)
|
||||
while leng < num or num < 0:
|
||||
try:
|
||||
out = self.it.__next__()
|
||||
except StopIteration:
|
||||
break
|
||||
if out:
|
||||
bufs.append(out)
|
||||
else:
|
||||
break
|
||||
leng += len(out)
|
||||
out = b"".join(bufs)
|
||||
if num >= 0:
|
||||
self.leftover = out[num:]
|
||||
out = out[:num]
|
||||
else:
|
||||
self.leftover = b""
|
||||
self.loc += len(out)
|
||||
return out
|
||||
|
||||
def close(self):
|
||||
self.r.close()
|
||||
self.closed = True
|
||||
|
||||
|
||||
def get_range(session, url, start, end, **kwargs):
|
||||
# explicit get a range when we know it must be safe
|
||||
kwargs = kwargs.copy()
|
||||
headers = kwargs.pop("headers", {}).copy()
|
||||
headers["Range"] = f"bytes={start}-{end - 1}"
|
||||
r = session.get(url, headers=headers, **kwargs)
|
||||
r.raise_for_status()
|
||||
return r.content
|
||||
|
||||
|
||||
def _file_info(url, session, size_policy="head", **kwargs):
|
||||
"""Call HEAD on the server to get details about the file (size/checksum etc.)
|
||||
|
||||
Default operation is to explicitly allow redirects and use encoding
|
||||
'identity' (no compression) to get the true size of the target.
|
||||
"""
|
||||
logger.debug("Retrieve file size for %s", url)
|
||||
kwargs = kwargs.copy()
|
||||
ar = kwargs.pop("allow_redirects", True)
|
||||
head = kwargs.get("headers", {}).copy()
|
||||
# TODO: not allowed in JS
|
||||
# head["Accept-Encoding"] = "identity"
|
||||
kwargs["headers"] = head
|
||||
|
||||
info = {}
|
||||
if size_policy == "head":
|
||||
r = session.head(url, allow_redirects=ar, **kwargs)
|
||||
elif size_policy == "get":
|
||||
r = session.get(url, allow_redirects=ar, **kwargs)
|
||||
else:
|
||||
raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
|
||||
r.raise_for_status()
|
||||
|
||||
# TODO:
|
||||
# recognise lack of 'Accept-Ranges',
|
||||
# or 'Accept-Ranges': 'none' (not 'bytes')
|
||||
# to mean streaming only, no random access => return None
|
||||
if "Content-Length" in r.headers:
|
||||
info["size"] = int(r.headers["Content-Length"])
|
||||
elif "Content-Range" in r.headers:
|
||||
info["size"] = int(r.headers["Content-Range"].split("/")[1])
|
||||
elif "content-length" in r.headers:
|
||||
info["size"] = int(r.headers["content-length"])
|
||||
elif "content-range" in r.headers:
|
||||
info["size"] = int(r.headers["content-range"].split("/")[1])
|
||||
|
||||
for checksum_field in ["ETag", "Content-MD5", "Digest"]:
|
||||
if r.headers.get(checksum_field):
|
||||
info[checksum_field] = r.headers[checksum_field]
|
||||
|
||||
return info
|
||||
|
||||
|
||||
# importing this is enough to register it
|
||||
def register():
|
||||
register_implementation("http", HTTPFileSystem, clobber=True)
|
||||
register_implementation("https", HTTPFileSystem, clobber=True)
|
||||
register_implementation("sync-http", HTTPFileSystem, clobber=True)
|
||||
register_implementation("sync-https", HTTPFileSystem, clobber=True)
|
||||
|
||||
|
||||
register()
|
||||
|
||||
|
||||
def unregister():
|
||||
from fsspec.implementations.http import HTTPFileSystem
|
||||
|
||||
register_implementation("http", HTTPFileSystem, clobber=True)
|
||||
register_implementation("https", HTTPFileSystem, clobber=True)
|
||||
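# --- Hedged usage sketch (added for illustration; not part of the original file) ---
# The module-level register() call above maps "sync-http"/"sync-https" to the
# blocking HTTPFileSystem, so once this module has been imported it can be used
# without an event loop. The URL is a placeholder.
if __name__ == "__main__":
    import fsspec

    fs = fsspec.filesystem("sync-https")            # blocking requests/XMLHttpRequest backend
    url = "https://example.com/index.html"          # placeholder URL
    data = fs.cat_file(url)                         # single GET, returns bytes
    print(len(data), fs.info(url).get("mimetype"))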
@@ -0,0 +1,129 @@
|
||||
import base64
|
||||
import io
|
||||
import re
|
||||
|
||||
import requests
|
||||
|
||||
import fsspec
|
||||
|
||||
|
||||
class JupyterFileSystem(fsspec.AbstractFileSystem):
|
||||
"""View of the files as seen by a Jupyter server (notebook or lab)"""
|
||||
|
||||
protocol = ("jupyter", "jlab")
|
||||
|
||||
def __init__(self, url, tok=None, **kwargs):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url : str
|
||||
Base URL of the server, like "http://127.0.0.1:8888". May include
|
||||
token in the string, which is given by the process when starting up
|
||||
tok : str
|
||||
If the token is obtained separately, can be given here
|
||||
kwargs
|
||||
"""
|
||||
if "?" in url:
|
||||
if tok is None:
|
||||
try:
|
||||
tok = re.findall("token=([a-z0-9]+)", url)[0]
|
||||
except IndexError as e:
|
||||
raise ValueError("Could not determine token") from e
|
||||
url = url.split("?", 1)[0]
|
||||
self.url = url.rstrip("/") + "/api/contents"
|
||||
self.session = requests.Session()
|
||||
if tok:
|
||||
self.session.headers["Authorization"] = f"token {tok}"
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
r = self.session.get(f"{self.url}/{path}")
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
out = r.json()
|
||||
|
||||
if out["type"] == "directory":
|
||||
out = out["content"]
|
||||
else:
|
||||
out = [out]
|
||||
for o in out:
|
||||
o["name"] = o.pop("path")
|
||||
o.pop("content")
|
||||
if o["type"] == "notebook":
|
||||
o["type"] = "file"
|
||||
if detail:
|
||||
return out
|
||||
return [o["name"] for o in out]
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
r = self.session.get(f"{self.url}/{path}")
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
out = r.json()
|
||||
if out["format"] == "text":
|
||||
# data should be binary
|
||||
b = out["content"].encode()
|
||||
else:
|
||||
b = base64.b64decode(out["content"])
|
||||
return b[start:end]
|
||||
|
||||
def pipe_file(self, path, value, **_):
|
||||
path = self._strip_protocol(path)
|
||||
json = {
|
||||
"name": path.rsplit("/", 1)[-1],
|
||||
"path": path,
|
||||
"size": len(value),
|
||||
"content": base64.b64encode(value).decode(),
|
||||
"format": "base64",
|
||||
"type": "file",
|
||||
}
|
||||
self.session.put(f"{self.url}/{path}", json=json)
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if create_parents and "/" in path:
|
||||
self.mkdir(path.rsplit("/", 1)[0], True)
|
||||
json = {
|
||||
"name": path.rsplit("/", 1)[-1],
|
||||
"path": path,
|
||||
"size": None,
|
||||
"content": None,
|
||||
"type": "directory",
|
||||
}
|
||||
self.session.put(f"{self.url}/{path}", json=json)
|
||||
|
||||
def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
|
||||
if path1 == path2:
|
||||
return
|
||||
self.session.patch(f"{self.url}/{path1}", json={"path": path2})
|
||||
|
||||
def _rm(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.session.delete(f"{self.url}/{path}")
|
||||
|
||||
def _open(self, path, mode="rb", **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if mode == "rb":
|
||||
data = self.cat_file(path)
|
||||
return io.BytesIO(data)
|
||||
else:
|
||||
return SimpleFileWriter(self, path, mode="wb")
|
||||
|
||||
|
||||
class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
|
||||
def _upload_chunk(self, final=False):
|
||||
"""Never uploads a chunk until file is done
|
||||
|
||||
Not suitable for large files
|
||||
"""
|
||||
if final is False:
|
||||
return False
|
||||
self.buffer.seek(0)
|
||||
data = self.buffer.read()
|
||||
self.fs.pipe_file(self.path, data)
|
||||
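# --- Hedged usage sketch (added for illustration; not part of the original file) ---
# The server URL and token are placeholders for a locally running Jupyter server;
# "jupyter" is the protocol declared on the class above.
if __name__ == "__main__":
    import fsspec

    fs = fsspec.filesystem(
        "jupyter", url="http://127.0.0.1:8888", tok="<token>"  # placeholder credentials
    )
    print(fs.ls("", detail=False))        # top-level contents as seen by the server
    fs.pipe_file("demo.txt", b"hello")    # write through the contents API
    print(fs.cat_file("demo.txt"))        # -> b"hello"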
@@ -0,0 +1,213 @@
|
||||
from contextlib import contextmanager
|
||||
from ctypes import (
|
||||
CFUNCTYPE,
|
||||
POINTER,
|
||||
c_int,
|
||||
c_longlong,
|
||||
c_void_p,
|
||||
cast,
|
||||
create_string_buffer,
|
||||
)
|
||||
|
||||
import libarchive
|
||||
import libarchive.ffi as ffi
|
||||
|
||||
from fsspec import open_files
|
||||
from fsspec.archive import AbstractArchiveFileSystem
|
||||
from fsspec.implementations.memory import MemoryFile
|
||||
from fsspec.utils import DEFAULT_BLOCK_SIZE
|
||||
|
||||
# Libarchive requires seekable files or memory only for certain archive
|
||||
# types. However, since we read the directory first to cache the contents
|
||||
# and also allow random access to any file, the file-like object needs
|
||||
# to be seekable no matter what.
|
||||
|
||||
# Seek call-backs (not provided in the libarchive python wrapper)
|
||||
SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
|
||||
read_set_seek_callback = ffi.ffi(
|
||||
"read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
|
||||
)
|
||||
new_api = hasattr(ffi, "NO_OPEN_CB")
|
||||
|
||||
|
||||
@contextmanager
|
||||
def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
|
||||
"""Read an archive from a seekable file-like object.
|
||||
|
||||
The `file` object must support the standard `readinto` and 'seek' methods.
|
||||
"""
|
||||
buf = create_string_buffer(block_size)
|
||||
buf_p = cast(buf, c_void_p)
|
||||
|
||||
def read_func(archive_p, context, ptrptr):
|
||||
# readinto the buffer, returns number of bytes read
|
||||
length = file.readinto(buf)
|
||||
# write the address of the buffer into the pointer
|
||||
ptrptr = cast(ptrptr, POINTER(c_void_p))
|
||||
ptrptr[0] = buf_p
|
||||
# tell libarchive how much data was written into the buffer
|
||||
return length
|
||||
|
||||
def seek_func(archive_p, context, offset, whence):
|
||||
file.seek(offset, whence)
|
||||
        # tell libarchive the current position
|
||||
return file.tell()
|
||||
|
||||
read_cb = ffi.READ_CALLBACK(read_func)
|
||||
seek_cb = SEEK_CALLBACK(seek_func)
|
||||
|
||||
if new_api:
|
||||
open_cb = ffi.NO_OPEN_CB
|
||||
close_cb = ffi.NO_CLOSE_CB
|
||||
else:
|
||||
open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
|
||||
close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)
|
||||
|
||||
with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
|
||||
read_set_seek_callback(archive_p, seek_cb)
|
||||
ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
|
||||
yield libarchive.read.ArchiveRead(archive_p)
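# Hedged illustration (added): custom_reader can wrap any seekable object that
# supports readinto, e.g.
#     with open("example.tar", "rb") as f, custom_reader(f) as arc:
#         names = [entry.pathname for entry in arc]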
|
||||
|
||||
|
||||
class LibArchiveFileSystem(AbstractArchiveFileSystem):
|
||||
"""Compressed archives as a file-system (read-only)
|
||||
|
||||
Supports the following formats:
|
||||
    tar, pax, cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar
|
||||
Microsoft CAB, 7-Zip, WARC
|
||||
|
||||
See the libarchive documentation for further restrictions.
|
||||
https://www.libarchive.org/
|
||||
|
||||
    Keeps the file object open while the instance lives. It only works with
    seekable file-like objects. If the filesystem does not support this kind
    of file object, it is recommended to cache the file locally first.
|
||||
|
||||
This class is pickleable, but not necessarily thread-safe (depends on the
|
||||
platform). See libarchive documentation for details.
|
||||
"""
|
||||
|
||||
root_marker = ""
|
||||
protocol = "libarchive"
|
||||
cachable = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fo="",
|
||||
mode="r",
|
||||
target_protocol=None,
|
||||
target_options=None,
|
||||
block_size=DEFAULT_BLOCK_SIZE,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
fo: str or file-like
|
||||
            Contains the archive, and must exist. If a str, will fetch file using
|
||||
:meth:`~fsspec.open_files`, which must return one file exactly.
|
||||
mode: str
|
||||
Currently, only 'r' accepted
|
||||
target_protocol: str (optional)
|
||||
If ``fo`` is a string, this value can be used to override the
|
||||
FS protocol inferred from a URL
|
||||
target_options: dict (optional)
|
||||
Kwargs passed when instantiating the target FS, if ``fo`` is
|
||||
a string.
|
||||
"""
|
||||
super().__init__(self, **kwargs)
|
||||
if mode != "r":
|
||||
raise ValueError("Only read from archive files accepted")
|
||||
if isinstance(fo, str):
|
||||
files = open_files(fo, protocol=target_protocol, **(target_options or {}))
|
||||
if len(files) != 1:
|
||||
raise ValueError(
|
||||
f'Path "{fo}" did not resolve to exactly one file: "{files}"'
|
||||
)
|
||||
fo = files[0]
|
||||
self.of = fo
|
||||
self.fo = fo.__enter__() # the whole instance is a context
|
||||
self.block_size = block_size
|
||||
self.dir_cache = None
|
||||
|
||||
@contextmanager
|
||||
def _open_archive(self):
|
||||
self.fo.seek(0)
|
||||
with custom_reader(self.fo, block_size=self.block_size) as arc:
|
||||
yield arc
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
# file paths are always relative to the archive root
|
||||
return super()._strip_protocol(path).lstrip("/")
|
||||
|
||||
def _get_dirs(self):
|
||||
fields = {
|
||||
"name": "pathname",
|
||||
"size": "size",
|
||||
"created": "ctime",
|
||||
"mode": "mode",
|
||||
"uid": "uid",
|
||||
"gid": "gid",
|
||||
"mtime": "mtime",
|
||||
}
|
||||
|
||||
if self.dir_cache is not None:
|
||||
return
|
||||
|
||||
self.dir_cache = {}
|
||||
list_names = []
|
||||
with self._open_archive() as arc:
|
||||
for entry in arc:
|
||||
if not entry.isdir and not entry.isfile:
|
||||
# Skip symbolic links, fifo entries, etc.
|
||||
continue
|
||||
self.dir_cache.update(
|
||||
{
|
||||
dirname: {"name": dirname, "size": 0, "type": "directory"}
|
||||
for dirname in self._all_dirnames(set(entry.name))
|
||||
}
|
||||
)
|
||||
f = {key: getattr(entry, fields[key]) for key in fields}
|
||||
f["type"] = "directory" if entry.isdir else "file"
|
||||
list_names.append(entry.name)
|
||||
|
||||
self.dir_cache[f["name"]] = f
|
||||
# libarchive does not seem to return an entry for the directories (at least
|
||||
        # not in all formats), so get the directory names from the file names
|
||||
self.dir_cache.update(
|
||||
{
|
||||
dirname: {"name": dirname, "size": 0, "type": "directory"}
|
||||
for dirname in self._all_dirnames(list_names)
|
||||
}
|
||||
)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
path = self._strip_protocol(path)
|
||||
if mode != "rb":
|
||||
raise NotImplementedError
|
||||
|
||||
data = bytes()
|
||||
with self._open_archive() as arc:
|
||||
for entry in arc:
|
||||
if entry.pathname != path:
|
||||
continue
|
||||
|
||||
if entry.size == 0:
|
||||
# empty file, so there are no blocks
|
||||
break
|
||||
|
||||
for block in entry.get_blocks(entry.size):
|
||||
data = block
|
||||
break
|
||||
else:
|
||||
raise ValueError
|
||||
return MemoryFile(fs=self, path=path, data=data)
|
||||
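# --- Hedged usage sketch (added for illustration; not part of the original file) ---
# The archive path and member name are placeholders; the target must be a real,
# seekable archive in one of the formats listed in the class docstring.
if __name__ == "__main__":
    import fsspec

    fs = fsspec.filesystem("libarchive", fo="example.tar")  # placeholder archive
    print(fs.ls("", detail=False))                          # names relative to the archive root
    with fs.open("member.txt", "rb") as f:                  # placeholder member name
        print(f.read())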
@@ -0,0 +1,514 @@
|
||||
import datetime
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import os.path as osp
|
||||
import shutil
|
||||
import stat
|
||||
import tempfile
|
||||
from functools import lru_cache
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
from fsspec.compression import compr
|
||||
from fsspec.core import get_compression
|
||||
from fsspec.utils import isfilelike, stringify_path
|
||||
|
||||
logger = logging.getLogger("fsspec.local")
|
||||
|
||||
|
||||
class LocalFileSystem(AbstractFileSystem):
|
||||
"""Interface to files on local storage
|
||||
|
||||
Parameters
|
||||
----------
|
||||
auto_mkdir: bool
|
||||
Whether, when opening a file, the directory containing it should
|
||||
be created (if it doesn't already exist). This is assumed by pyarrow
|
||||
code.
|
||||
"""
|
||||
|
||||
root_marker = "/"
|
||||
protocol = "file", "local"
|
||||
local_file = True
|
||||
|
||||
def __init__(self, auto_mkdir=False, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.auto_mkdir = auto_mkdir
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return "local"
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if self.exists(path):
|
||||
raise FileExistsError(path)
|
||||
if create_parents:
|
||||
self.makedirs(path, exist_ok=True)
|
||||
else:
|
||||
os.mkdir(path, **kwargs)
|
||||
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
path = self._strip_protocol(path)
|
||||
os.makedirs(path, exist_ok=exist_ok)
|
||||
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
os.rmdir(path)
|
||||
|
||||
def ls(self, path, detail=False, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
path_info = self.info(path)
|
||||
infos = []
|
||||
if path_info["type"] == "directory":
|
||||
with os.scandir(path) as it:
|
||||
for f in it:
|
||||
try:
|
||||
# Only get the info if requested since it is a bit expensive (the stat call inside)
|
||||
# The strip_protocol is also used in info() and calls make_path_posix to always return posix paths
|
||||
info = self.info(f) if detail else self._strip_protocol(f.path)
|
||||
infos.append(info)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
else:
|
||||
infos = [path_info] if detail else [path_info["name"]]
|
||||
|
||||
return infos
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
if isinstance(path, os.DirEntry):
|
||||
# scandir DirEntry
|
||||
out = path.stat(follow_symlinks=False)
|
||||
link = path.is_symlink()
|
||||
if path.is_dir(follow_symlinks=False):
|
||||
t = "directory"
|
||||
elif path.is_file(follow_symlinks=False):
|
||||
t = "file"
|
||||
else:
|
||||
t = "other"
|
||||
|
||||
size = out.st_size
|
||||
if link:
|
||||
try:
|
||||
out2 = path.stat(follow_symlinks=True)
|
||||
size = out2.st_size
|
||||
except OSError:
|
||||
size = 0
|
||||
path = self._strip_protocol(path.path)
|
||||
else:
|
||||
# str or path-like
|
||||
path = self._strip_protocol(path)
|
||||
out = os.stat(path, follow_symlinks=False)
|
||||
link = stat.S_ISLNK(out.st_mode)
|
||||
if link:
|
||||
out = os.stat(path, follow_symlinks=True)
|
||||
size = out.st_size
|
||||
if stat.S_ISDIR(out.st_mode):
|
||||
t = "directory"
|
||||
elif stat.S_ISREG(out.st_mode):
|
||||
t = "file"
|
||||
else:
|
||||
t = "other"
|
||||
|
||||
# Check for the 'st_birthtime' attribute, which is not always present; fallback to st_ctime
|
||||
created_time = getattr(out, "st_birthtime", out.st_ctime)
|
||||
|
||||
result = {
|
||||
"name": path,
|
||||
"size": size,
|
||||
"type": t,
|
||||
"created": created_time,
|
||||
"islink": link,
|
||||
}
|
||||
for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
|
||||
result[field] = getattr(out, f"st_{field}")
|
||||
if link:
|
||||
result["destination"] = os.readlink(path)
|
||||
return result
|
||||
|
||||
def lexists(self, path, **kwargs):
|
||||
return osp.lexists(path)
|
||||
|
||||
def cp_file(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1)
|
||||
path2 = self._strip_protocol(path2)
|
||||
if self.auto_mkdir:
|
||||
self.makedirs(self._parent(path2), exist_ok=True)
|
||||
if self.isfile(path1):
|
||||
shutil.copyfile(path1, path2)
|
||||
elif self.isdir(path1):
|
||||
self.mkdirs(path2, exist_ok=True)
|
||||
else:
|
||||
raise FileNotFoundError(path1)
|
||||
|
||||
def isfile(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
return os.path.isfile(path)
|
||||
|
||||
def isdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
return os.path.isdir(path)
|
||||
|
||||
def get_file(self, path1, path2, callback=None, **kwargs):
|
||||
if isfilelike(path2):
|
||||
with open(path1, "rb") as f:
|
||||
shutil.copyfileobj(f, path2)
|
||||
else:
|
||||
return self.cp_file(path1, path2, **kwargs)
|
||||
|
||||
def put_file(self, path1, path2, callback=None, **kwargs):
|
||||
return self.cp_file(path1, path2, **kwargs)
|
||||
|
||||
def mv(self, path1, path2, recursive: bool = True, **kwargs):
|
||||
"""Move files/directories
|
||||
For the specific case of local, all ops on directories are recursive and
|
||||
the recursive= kwarg is ignored.
|
||||
"""
|
||||
path1 = self._strip_protocol(path1)
|
||||
path2 = self._strip_protocol(path2)
|
||||
shutil.move(path1, path2)
|
||||
|
||||
def link(self, src, dst, **kwargs):
|
||||
src = self._strip_protocol(src)
|
||||
dst = self._strip_protocol(dst)
|
||||
os.link(src, dst, **kwargs)
|
||||
|
||||
def symlink(self, src, dst, **kwargs):
|
||||
src = self._strip_protocol(src)
|
||||
dst = self._strip_protocol(dst)
|
||||
os.symlink(src, dst, **kwargs)
|
||||
|
||||
def islink(self, path) -> bool:
|
||||
return os.path.islink(self._strip_protocol(path))
|
||||
|
||||
def rm_file(self, path):
|
||||
os.remove(self._strip_protocol(path))
|
||||
|
||||
def rm(self, path, recursive=False, maxdepth=None):
|
||||
if not isinstance(path, list):
|
||||
path = [path]
|
||||
|
||||
for p in path:
|
||||
p = self._strip_protocol(p)
|
||||
if self.isdir(p):
|
||||
if not recursive:
|
||||
raise ValueError("Cannot delete directory, set recursive=True")
|
||||
if osp.abspath(p) == os.getcwd():
|
||||
raise ValueError("Cannot delete current working directory")
|
||||
shutil.rmtree(p)
|
||||
else:
|
||||
os.remove(p)
|
||||
|
||||
def unstrip_protocol(self, name):
|
||||
name = self._strip_protocol(name) # normalise for local/win/...
|
||||
return f"file://{name}"
|
||||
|
||||
def _open(self, path, mode="rb", block_size=None, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if self.auto_mkdir and "w" in mode:
|
||||
self.makedirs(self._parent(path), exist_ok=True)
|
||||
return LocalFileOpener(path, mode, fs=self, **kwargs)
|
||||
|
||||
def touch(self, path, truncate=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if self.auto_mkdir:
|
||||
self.makedirs(self._parent(path), exist_ok=True)
|
||||
if self.exists(path):
|
||||
os.utime(path, None)
|
||||
else:
|
||||
open(path, "a").close()
|
||||
if truncate:
|
||||
os.truncate(path, 0)
|
||||
|
||||
def created(self, path):
|
||||
info = self.info(path=path)
|
||||
return datetime.datetime.fromtimestamp(
|
||||
info["created"], tz=datetime.timezone.utc
|
||||
)
|
||||
|
||||
def modified(self, path):
|
||||
info = self.info(path=path)
|
||||
return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
|
||||
|
||||
@classmethod
|
||||
def _parent(cls, path):
|
||||
path = cls._strip_protocol(path)
|
||||
if os.sep == "/":
|
||||
# posix native
|
||||
return path.rsplit("/", 1)[0] or "/"
|
||||
else:
|
||||
# NT
|
||||
path_ = path.rsplit("/", 1)[0]
|
||||
if len(path_) <= 3:
|
||||
if path_[1:2] == ":":
|
||||
# nt root (something like c:/)
|
||||
return path_[0] + ":/"
|
||||
# More cases may be required here
|
||||
return path_
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
path = stringify_path(path)
|
||||
if path.startswith("file://"):
|
||||
path = path[7:]
|
||||
elif path.startswith("file:"):
|
||||
path = path[5:]
|
||||
elif path.startswith("local://"):
|
||||
path = path[8:]
|
||||
elif path.startswith("local:"):
|
||||
path = path[6:]
|
||||
|
||||
path = make_path_posix(path)
|
||||
if os.sep != "/":
|
||||
# This code-path is a stripped down version of
|
||||
# > drive, path = ntpath.splitdrive(path)
|
||||
if path[1:2] == ":":
|
||||
# Absolute drive-letter path, e.g. X:\Windows
|
||||
# Relative path with drive, e.g. X:Windows
|
||||
drive, path = path[:2], path[2:]
|
||||
elif path[:2] == "//":
|
||||
# UNC drives, e.g. \\server\share or \\?\UNC\server\share
|
||||
# Device drives, e.g. \\.\device or \\?\device
|
||||
if (index1 := path.find("/", 2)) == -1 or (
|
||||
index2 := path.find("/", index1 + 1)
|
||||
) == -1:
|
||||
drive, path = path, ""
|
||||
else:
|
||||
drive, path = path[:index2], path[index2:]
|
||||
else:
|
||||
# Relative path, e.g. Windows
|
||||
drive = ""
|
||||
|
||||
path = path.rstrip("/") or cls.root_marker
|
||||
return drive + path
|
||||
|
||||
else:
|
||||
return path.rstrip("/") or cls.root_marker
|
||||
|
||||
def _isfilestore(self):
|
||||
# Inheriting from DaskFileSystem makes this False (S3, etc. were)
|
||||
# the original motivation. But we are a posix-like file system.
|
||||
# See https://github.com/dask/dask/issues/5526
|
||||
return True
|
||||
|
||||
def chmod(self, path, mode):
|
||||
path = stringify_path(path)
|
||||
return os.chmod(path, mode)
|
||||
|
||||
|
||||
def make_path_posix(path):
|
||||
"""Make path generic and absolute for current OS"""
|
||||
if not isinstance(path, str):
|
||||
if isinstance(path, (list, set, tuple)):
|
||||
return type(path)(make_path_posix(p) for p in path)
|
||||
else:
|
||||
path = stringify_path(path)
|
||||
if not isinstance(path, str):
|
||||
raise TypeError(f"could not convert {path!r} to string")
|
||||
if os.sep == "/":
|
||||
# Native posix
|
||||
if path.startswith("/"):
|
||||
# most common fast case for posix
|
||||
return path
|
||||
elif path.startswith("~"):
|
||||
return osp.expanduser(path)
|
||||
elif path.startswith("./"):
|
||||
path = path[2:]
|
||||
elif path == ".":
|
||||
path = ""
|
||||
return f"{os.getcwd()}/{path}"
|
||||
else:
|
||||
# NT handling
|
||||
if path[0:1] == "/" and path[2:3] == ":":
|
||||
# path is like "/c:/local/path"
|
||||
path = path[1:]
|
||||
if path[1:2] == ":":
|
||||
# windows full path like "C:\\local\\path"
|
||||
if len(path) <= 3:
|
||||
# nt root (something like c:/)
|
||||
return path[0] + ":/"
|
||||
path = path.replace("\\", "/")
|
||||
return path
|
||||
elif path[0:1] == "~":
|
||||
return make_path_posix(osp.expanduser(path))
|
||||
elif path.startswith(("\\\\", "//")):
|
||||
# windows UNC/DFS-style paths
|
||||
return "//" + path[2:].replace("\\", "/")
|
||||
elif path.startswith(("\\", "/")):
|
||||
# windows relative path with root
|
||||
path = path.replace("\\", "/")
|
||||
return f"{osp.splitdrive(os.getcwd())[0]}{path}"
|
||||
else:
|
||||
path = path.replace("\\", "/")
|
||||
if path.startswith("./"):
|
||||
path = path[2:]
|
||||
elif path == ".":
|
||||
path = ""
|
||||
return f"{make_path_posix(os.getcwd())}/{path}"
|
||||
|
||||
|
||||
def trailing_sep(path):
    """Return True if the path ends with a path separator.

    A forward slash is always considered a path separator, even on Operating
    Systems that normally use a backslash.
    """
    # TODO: if all incoming paths were posix-compliant then separator would
    # always be a forward slash, simplifying this function.
    # See https://github.com/fsspec/filesystem_spec/pull/1250
    return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_umask(mask: int = 0o666) -> int:
    """Get the current umask.

    Follows https://stackoverflow.com/a/44130549 to get the umask.
    Temporarily sets the umask to the given value, and then resets it to the
    original value.
    """
    value = os.umask(mask)
    os.umask(value)
    return value
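# Illustrative sketch (not part of the original module): this is how the umask
# is combined with a permission mask, mirroring the os.chmod call performed in
# LocalFileOpener.commit below.
def _demo_default_file_mode(mask: int = 0o666) -> int:
    """Return the permission bits a newly committed file would receive."""
    return mask & ~get_umask(mask)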
|
||||
|
||||
|
||||
class LocalFileOpener(io.IOBase):
|
||||
def __init__(
|
||||
self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
|
||||
):
|
||||
logger.debug("open file: %s", path)
|
||||
self.path = path
|
||||
self.mode = mode
|
||||
self.fs = fs
|
||||
self.f = None
|
||||
self.autocommit = autocommit
|
||||
self.compression = get_compression(path, compression)
|
||||
self.blocksize = io.DEFAULT_BUFFER_SIZE
|
||||
self._open()
|
||||
|
||||
def _open(self):
|
||||
if self.f is None or self.f.closed:
|
||||
if self.autocommit or "w" not in self.mode:
|
||||
self.f = open(self.path, mode=self.mode)
|
||||
if self.compression:
|
||||
compress = compr[self.compression]
|
||||
self.f = compress(self.f, mode=self.mode)
|
||||
else:
|
||||
# TODO: check if path is writable?
|
||||
i, name = tempfile.mkstemp()
|
||||
os.close(i) # we want normal open and normal buffered file
|
||||
self.temp = name
|
||||
self.f = open(name, mode=self.mode)
|
||||
if "w" not in self.mode:
|
||||
self.size = self.f.seek(0, 2)
|
||||
self.f.seek(0)
|
||||
self.f.size = self.size
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
# probably only used by cached FS
|
||||
if "r" not in self.mode:
|
||||
raise ValueError
|
||||
self._open()
|
||||
self.f.seek(start)
|
||||
return self.f.read(end - start)
|
||||
|
||||
def __setstate__(self, state):
|
||||
self.f = None
|
||||
loc = state.pop("loc", None)
|
||||
self.__dict__.update(state)
|
||||
if "r" in state["mode"]:
|
||||
self.f = None
|
||||
self._open()
|
||||
self.f.seek(loc)
|
||||
|
||||
def __getstate__(self):
|
||||
d = self.__dict__.copy()
|
||||
d.pop("f")
|
||||
if "r" in self.mode:
|
||||
d["loc"] = self.f.tell()
|
||||
else:
|
||||
if not self.f.closed:
|
||||
raise ValueError("Cannot serialise open write-mode local file")
|
||||
return d
|
||||
|
||||
def commit(self):
|
||||
if self.autocommit:
|
||||
raise RuntimeError("Can only commit if not already set to autocommit")
|
||||
try:
|
||||
shutil.move(self.temp, self.path)
|
||||
except PermissionError as e:
|
||||
# shutil.move raises PermissionError if os.rename
|
||||
# and the default copy2 fallback with shutil.copystats fail.
|
||||
# The file should be there nonetheless, but without copied permissions.
|
||||
# If it doesn't exist, there was no permission to create the file.
|
||||
if not os.path.exists(self.path):
|
||||
raise e
|
||||
else:
|
||||
# If PermissionError is not raised, permissions can be set.
|
||||
try:
|
||||
mask = 0o666
|
||||
os.chmod(self.path, mask & ~get_umask(mask))
|
||||
except RuntimeError:
|
||||
pass
|
||||
|
||||
def discard(self):
|
||||
if self.autocommit:
|
||||
raise RuntimeError("Cannot discard if set to autocommit")
|
||||
os.remove(self.temp)
|
||||
|
||||
def readable(self) -> bool:
|
||||
return True
|
||||
|
||||
def writable(self) -> bool:
|
||||
return "r" not in self.mode
|
||||
|
||||
def read(self, *args, **kwargs):
|
||||
return self.f.read(*args, **kwargs)
|
||||
|
||||
def write(self, *args, **kwargs):
|
||||
return self.f.write(*args, **kwargs)
|
||||
|
||||
def tell(self, *args, **kwargs):
|
||||
return self.f.tell(*args, **kwargs)
|
||||
|
||||
def seek(self, *args, **kwargs):
|
||||
return self.f.seek(*args, **kwargs)
|
||||
|
||||
def seekable(self, *args, **kwargs):
|
||||
return self.f.seekable(*args, **kwargs)
|
||||
|
||||
def readline(self, *args, **kwargs):
|
||||
return self.f.readline(*args, **kwargs)
|
||||
|
||||
def readlines(self, *args, **kwargs):
|
||||
return self.f.readlines(*args, **kwargs)
|
||||
|
||||
def close(self):
|
||||
return self.f.close()
|
||||
|
||||
def truncate(self, size=None) -> int:
|
||||
return self.f.truncate(size)
|
||||
|
||||
@property
|
||||
def closed(self):
|
||||
return self.f.closed
|
||||
|
||||
def fileno(self):
|
||||
return self.raw.fileno()
|
||||
|
||||
def flush(self) -> None:
|
||||
self.f.flush()
|
||||
|
||||
def __iter__(self):
|
||||
return self.f.__iter__()
|
||||
|
||||
def __getattr__(self, item):
|
||||
return getattr(self.f, item)
|
||||
|
||||
def __enter__(self):
|
||||
self._incontext = True
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self._incontext = False
|
||||
self.f.__exit__(exc_type, exc_value, traceback)
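# Illustrative usage sketch (not part of the original module; the target path
# is a placeholder): with autocommit=False -- for example inside a filesystem
# transaction -- writes go to a temporary file and only land at the target
# path when commit() runs.
def _demo_local_transaction(target="/tmp/fsspec_demo.txt"):
    import fsspec

    fs = fsspec.filesystem("file", auto_mkdir=True)
    with fs.transaction:  # files opened for writing get autocommit=False
        with fs.open(target, "wb") as f:
            f.write(b"moved into place on commit")
    return fs.cat_file(target)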
|
||||
@@ -0,0 +1,311 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from errno import ENOTEMPTY
|
||||
from io import BytesIO
|
||||
from pathlib import PurePath, PureWindowsPath
|
||||
from typing import Any, ClassVar
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
from fsspec.implementations.local import LocalFileSystem
|
||||
from fsspec.utils import stringify_path
|
||||
|
||||
logger = logging.getLogger("fsspec.memoryfs")
|
||||
|
||||
|
||||
class MemoryFileSystem(AbstractFileSystem):
|
||||
"""A filesystem based on a dict of BytesIO objects
|
||||
|
||||
This is a global filesystem so instances of this class all point to the same
|
||||
in memory filesystem.
|
||||
"""
|
||||
|
||||
store: ClassVar[dict[str, Any]] = {} # global, do not overwrite!
|
||||
pseudo_dirs = [""] # global, do not overwrite!
|
||||
protocol = "memory"
|
||||
root_marker = "/"
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
if isinstance(path, PurePath):
|
||||
if isinstance(path, PureWindowsPath):
|
||||
return LocalFileSystem._strip_protocol(path)
|
||||
else:
|
||||
path = stringify_path(path)
|
||||
|
||||
path = path.removeprefix("memory://")
|
||||
if "::" in path or "://" in path:
|
||||
return path.rstrip("/")
|
||||
path = path.lstrip("/").rstrip("/")
|
||||
return "/" + path if path else ""
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if path in self.store:
|
||||
# there is a key with this exact name
|
||||
if not detail:
|
||||
return [path]
|
||||
return [
|
||||
{
|
||||
"name": path,
|
||||
"size": self.store[path].size,
|
||||
"type": "file",
|
||||
"created": self.store[path].created.timestamp(),
|
||||
}
|
||||
]
|
||||
paths = set()
|
||||
starter = path + "/"
|
||||
out = []
|
||||
for p2 in tuple(self.store):
|
||||
if p2.startswith(starter):
|
||||
if "/" not in p2[len(starter) :]:
|
||||
# exact child
|
||||
out.append(
|
||||
{
|
||||
"name": p2,
|
||||
"size": self.store[p2].size,
|
||||
"type": "file",
|
||||
"created": self.store[p2].created.timestamp(),
|
||||
}
|
||||
)
|
||||
elif len(p2) > len(starter):
|
||||
# implied child directory
|
||||
ppath = starter + p2[len(starter) :].split("/", 1)[0]
|
||||
if ppath not in paths:
|
||||
out = out or []
|
||||
out.append(
|
||||
{
|
||||
"name": ppath,
|
||||
"size": 0,
|
||||
"type": "directory",
|
||||
}
|
||||
)
|
||||
paths.add(ppath)
|
||||
for p2 in self.pseudo_dirs:
|
||||
if p2.startswith(starter):
|
||||
if "/" not in p2[len(starter) :]:
|
||||
# exact child pdir
|
||||
if p2 not in paths:
|
||||
out.append({"name": p2, "size": 0, "type": "directory"})
|
||||
paths.add(p2)
|
||||
else:
|
||||
# directory implied by deeper pdir
|
||||
ppath = starter + p2[len(starter) :].split("/", 1)[0]
|
||||
if ppath not in paths:
|
||||
out.append({"name": ppath, "size": 0, "type": "directory"})
|
||||
paths.add(ppath)
|
||||
if not out:
|
||||
if path in self.pseudo_dirs:
|
||||
# empty dir
|
||||
return []
|
||||
raise FileNotFoundError(path)
|
||||
if detail:
|
||||
return out
|
||||
return sorted([f["name"] for f in out])
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if path in self.store or path in self.pseudo_dirs:
|
||||
raise FileExistsError(path)
|
||||
if self._parent(path).strip("/") and self.isfile(self._parent(path)):
|
||||
raise NotADirectoryError(self._parent(path))
|
||||
if create_parents and self._parent(path).strip("/"):
|
||||
try:
|
||||
self.mkdir(self._parent(path), create_parents, **kwargs)
|
||||
except FileExistsError:
|
||||
pass
|
||||
if path and path not in self.pseudo_dirs:
|
||||
self.pseudo_dirs.append(path)
|
||||
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
try:
|
||||
self.mkdir(path, create_parents=True)
|
||||
except FileExistsError:
|
||||
if not exist_ok:
|
||||
raise
|
||||
|
||||
def pipe_file(self, path, value, mode="overwrite", **kwargs):
|
||||
"""Set the bytes of given file
|
||||
|
||||
Avoids copies of the data if possible
|
||||
"""
|
||||
mode = "xb" if mode == "create" else "wb"
|
||||
self.open(path, mode=mode, data=value)
|
||||
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
if path == "":
|
||||
# silently avoid deleting FS root
|
||||
return
|
||||
if path in self.pseudo_dirs:
|
||||
if not self.ls(path):
|
||||
self.pseudo_dirs.remove(path)
|
||||
else:
|
||||
raise OSError(ENOTEMPTY, "Directory not empty", path)
|
||||
else:
|
||||
raise FileNotFoundError(path)
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
logger.debug("info: %s", path)
|
||||
path = self._strip_protocol(path)
|
||||
if path in self.pseudo_dirs or any(
|
||||
p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs
|
||||
):
|
||||
return {
|
||||
"name": path,
|
||||
"size": 0,
|
||||
"type": "directory",
|
||||
}
|
||||
elif path in self.store:
|
||||
filelike = self.store[path]
|
||||
return {
|
||||
"name": path,
|
||||
"size": filelike.size,
|
||||
"type": "file",
|
||||
"created": getattr(filelike, "created", None),
|
||||
}
|
||||
else:
|
||||
raise FileNotFoundError(path)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
path = self._strip_protocol(path)
|
||||
if "x" in mode and self.exists(path):
|
||||
raise FileExistsError
|
||||
if path in self.pseudo_dirs:
|
||||
raise IsADirectoryError(path)
|
||||
parent = path
|
||||
while len(parent) > 1:
|
||||
parent = self._parent(parent)
|
||||
if self.isfile(parent):
|
||||
raise FileExistsError(parent)
|
||||
if mode in ["rb", "ab", "r+b", "a+b"]:
|
||||
if path in self.store:
|
||||
f = self.store[path]
|
||||
if "a" in mode:
|
||||
# position at the end of file
|
||||
f.seek(0, 2)
|
||||
else:
|
||||
# position at the beginning of file
|
||||
f.seek(0)
|
||||
return f
|
||||
else:
|
||||
raise FileNotFoundError(path)
|
||||
elif mode in {"wb", "w+b", "xb", "x+b"}:
|
||||
if "x" in mode and self.exists(path):
|
||||
raise FileExistsError
|
||||
m = MemoryFile(self, path, kwargs.get("data"))
|
||||
if not self._intrans:
|
||||
m.commit()
|
||||
return m
|
||||
else:
|
||||
name = self.__class__.__name__
|
||||
raise ValueError(f"unsupported file mode for {name}: {mode!r}")
|
||||
|
||||
def cp_file(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1)
|
||||
path2 = self._strip_protocol(path2)
|
||||
if self.isfile(path1):
|
||||
self.store[path2] = MemoryFile(
|
||||
self, path2, self.store[path1].getvalue()
|
||||
) # implicit copy
|
||||
elif self.isdir(path1):
|
||||
if path2 not in self.pseudo_dirs:
|
||||
self.pseudo_dirs.append(path2)
|
||||
else:
|
||||
raise FileNotFoundError(path1)
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
logger.debug("cat: %s", path)
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
return bytes(self.store[path].getbuffer()[start:end])
|
||||
except KeyError as e:
|
||||
raise FileNotFoundError(path) from e
|
||||
|
||||
def _rm(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
del self.store[path]
|
||||
except KeyError as e:
|
||||
raise FileNotFoundError(path) from e
|
||||
|
||||
def modified(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
return self.store[path].modified
|
||||
except KeyError as e:
|
||||
raise FileNotFoundError(path) from e
|
||||
|
||||
def created(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
return self.store[path].created
|
||||
except KeyError as e:
|
||||
raise FileNotFoundError(path) from e
|
||||
|
||||
def isfile(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
return path in self.store
|
||||
|
||||
def rm(self, path, recursive=False, maxdepth=None):
|
||||
if isinstance(path, str):
|
||||
path = self._strip_protocol(path)
|
||||
else:
|
||||
path = [self._strip_protocol(p) for p in path]
|
||||
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
|
||||
for p in reversed(paths):
|
||||
if self.isfile(p):
|
||||
self.rm_file(p)
|
||||
# If the expanded path doesn't exist, it is only because the expanded
|
||||
# path was a directory that does not exist in self.pseudo_dirs. This
|
||||
# is possible if you directly create files without making the
|
||||
# directories first.
|
||||
elif not self.exists(p):
|
||||
continue
|
||||
else:
|
||||
self.rmdir(p)
|
||||
|
||||
|
||||
class MemoryFile(BytesIO):
    """A BytesIO which can't close and works as a context manager

    Can initialise with data. Each path should only be active once at any moment.

    No need to provide fs, path if auto-committing (default)
    """

    def __init__(self, fs=None, path=None, data=None):
        logger.debug("open file %s", path)
        self.fs = fs
        self.path = path
        self.created = datetime.now(tz=timezone.utc)
        self.modified = datetime.now(tz=timezone.utc)
        if data:
            super().__init__(data)
            self.seek(0)

    @property
    def size(self):
        return self.getbuffer().nbytes

    def __enter__(self):
        return self

    def close(self):
        pass

    def discard(self):
        pass

    def commit(self):
        self.fs.store[self.path] = self
        self.modified = datetime.now(tz=timezone.utc)
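# Illustrative usage sketch (not part of the original module): the store is a
# class attribute, so every MemoryFileSystem instance sees the same files.
def _demo_memory_filesystem():
    fs = MemoryFileSystem()
    fs.pipe_file("/demo/hello.txt", b"hello world")
    assert MemoryFileSystem().cat_file("/demo/hello.txt") == b"hello world"
    return fs.ls("/demo", detail=False)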
|
||||
File diff suppressed because it is too large
@@ -0,0 +1,187 @@
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import types
|
||||
import uuid
|
||||
from stat import S_ISDIR, S_ISLNK
|
||||
|
||||
import paramiko
|
||||
|
||||
from .. import AbstractFileSystem
|
||||
from ..utils import infer_storage_options
|
||||
|
||||
logger = logging.getLogger("fsspec.sftp")
|
||||
|
||||
|
||||
class SFTPFileSystem(AbstractFileSystem):
|
||||
"""Files over SFTP/SSH
|
||||
|
||||
Peer-to-peer filesystem over SSH using paramiko.
|
||||
|
||||
Note: if using this with ``open`` or ``open_files`` with full URLs,
|
||||
there is no way to tell if a path is relative, so all paths are assumed
|
||||
to be absolute.
|
||||
"""
|
||||
|
||||
protocol = "sftp", "ssh"
|
||||
|
||||
def __init__(self, host, **ssh_kwargs):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
Hostname or IP as a string
|
||||
temppath: str
|
||||
Location on the server to put files, when within a transaction
|
||||
ssh_kwargs: dict
|
||||
Parameters passed on to connection. See details in
|
||||
https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
|
||||
May include port, username, password...
|
||||
"""
|
||||
if self._cached:
|
||||
return
|
||||
super().__init__(**ssh_kwargs)
|
||||
self.temppath = ssh_kwargs.pop("temppath", "/tmp") # remote temp directory
|
||||
self.host = host
|
||||
self.ssh_kwargs = ssh_kwargs
|
||||
self._connect()
|
||||
|
||||
def _connect(self):
|
||||
logger.debug("Connecting to SFTP server %s", self.host)
|
||||
self.client = paramiko.SSHClient()
|
||||
self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
self.client.connect(self.host, **self.ssh_kwargs)
|
||||
self.ftp = self.client.open_sftp()
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return infer_storage_options(path)["path"]
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(urlpath):
|
||||
out = infer_storage_options(urlpath)
|
||||
out.pop("path", None)
|
||||
out.pop("protocol", None)
|
||||
return out
|
||||
|
||||
def mkdir(self, path, create_parents=True, mode=511):
|
||||
path = self._strip_protocol(path)
|
||||
logger.debug("Creating folder %s", path)
|
||||
if self.exists(path):
|
||||
raise FileExistsError(f"File exists: {path}")
|
||||
|
||||
if create_parents:
|
||||
self.makedirs(path)
|
||||
else:
|
||||
self.ftp.mkdir(path, mode)
|
||||
|
||||
def makedirs(self, path, exist_ok=False, mode=511):
|
||||
if self.exists(path) and not exist_ok:
|
||||
raise FileExistsError(f"File exists: {path}")
|
||||
|
||||
parts = path.split("/")
|
||||
new_path = "/" if path[:1] == "/" else ""
|
||||
|
||||
for part in parts:
|
||||
if part:
|
||||
new_path = f"{new_path}/{part}" if new_path else part
|
||||
if not self.exists(new_path):
|
||||
self.ftp.mkdir(new_path, mode)
|
||||
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
logger.debug("Removing folder %s", path)
|
||||
self.ftp.rmdir(path)
|
||||
|
||||
def info(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
stat = self._decode_stat(self.ftp.stat(path))
|
||||
stat["name"] = path
|
||||
return stat
|
||||
|
||||
@staticmethod
|
||||
def _decode_stat(stat, parent_path=None):
|
||||
if S_ISDIR(stat.st_mode):
|
||||
t = "directory"
|
||||
elif S_ISLNK(stat.st_mode):
|
||||
t = "link"
|
||||
else:
|
||||
t = "file"
|
||||
out = {
|
||||
"name": "",
|
||||
"size": stat.st_size,
|
||||
"type": t,
|
||||
"uid": stat.st_uid,
|
||||
"gid": stat.st_gid,
|
||||
"time": datetime.datetime.fromtimestamp(
|
||||
stat.st_atime, tz=datetime.timezone.utc
|
||||
),
|
||||
"mtime": datetime.datetime.fromtimestamp(
|
||||
stat.st_mtime, tz=datetime.timezone.utc
|
||||
),
|
||||
}
|
||||
if parent_path:
|
||||
out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
|
||||
return out
|
||||
|
||||
def ls(self, path, detail=False):
|
||||
path = self._strip_protocol(path)
|
||||
logger.debug("Listing folder %s", path)
|
||||
stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
|
||||
if detail:
|
||||
return stats
|
||||
else:
|
||||
paths = [stat["name"] for stat in stats]
|
||||
return sorted(paths)
|
||||
|
||||
def put(self, lpath, rpath, callback=None, **kwargs):
|
||||
rpath = self._strip_protocol(rpath)
|
||||
logger.debug("Put file %s into %s", lpath, rpath)
|
||||
self.ftp.put(lpath, rpath)
|
||||
|
||||
def get_file(self, rpath, lpath, **kwargs):
|
||||
if self.isdir(rpath):
|
||||
os.makedirs(lpath, exist_ok=True)
|
||||
else:
|
||||
self.ftp.get(self._strip_protocol(rpath), lpath)
|
||||
|
||||
def _open(self, path, mode="rb", block_size=None, **kwargs):
|
||||
"""
|
||||
block_size: int or None
|
||||
If 0, no buffering, if 1, line buffering, if >1, buffer that many
|
||||
bytes, if None use default from paramiko.
|
||||
"""
|
||||
logger.debug("Opening file %s", path)
|
||||
if kwargs.get("autocommit", True) is False:
|
||||
# writes to temporary file, move on commit
|
||||
path2 = "/".join([self.temppath, str(uuid.uuid4())])
|
||||
f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
|
||||
f.temppath = path2
|
||||
f.targetpath = path
|
||||
f.fs = self
|
||||
f.commit = types.MethodType(commit_a_file, f)
|
||||
f.discard = types.MethodType(discard_a_file, f)
|
||||
else:
|
||||
f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
|
||||
return f
|
||||
|
||||
def _rm(self, path):
|
||||
if self.isdir(path):
|
||||
self.ftp.rmdir(path)
|
||||
else:
|
||||
self.ftp.remove(path)
|
||||
|
||||
def mv(self, old, new):
|
||||
new = self._strip_protocol(new)
|
||||
old = self._strip_protocol(old)
|
||||
logger.debug("Renaming %s into %s", old, new)
|
||||
self.ftp.posix_rename(old, new)
|
||||
|
||||
|
||||
def commit_a_file(self):
    self.fs.mv(self.temppath, self.targetpath)


def discard_a_file(self):
    self.fs._rm(self.temppath)
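# Illustrative sketch (not part of the original module; host, port and
# credentials are placeholders): a full URL decomposes into constructor
# kwargs plus a remote path.
def _demo_sftp_kwargs(url="sftp://user:secret@example.com:2222/data/report.csv"):
    kwargs = SFTPFileSystem._get_kwargs_from_urls(url)
    path = SFTPFileSystem._strip_protocol(url)
    # kwargs -> {"host": "example.com", "username": "user",
    #            "password": "secret", "port": 2222}
    # path   -> "/data/report.csv"
    return kwargs, path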
|
||||
@@ -0,0 +1,416 @@
|
||||
"""
|
||||
This module contains SMBFileSystem class responsible for handling access to
|
||||
Windows Samba network shares by using package smbprotocol
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import re
|
||||
import uuid
|
||||
from stat import S_ISDIR, S_ISLNK
|
||||
|
||||
import smbclient
|
||||
import smbprotocol.exceptions
|
||||
|
||||
from .. import AbstractFileSystem
|
||||
from ..utils import infer_storage_options
|
||||
|
||||
# ! pylint: disable=bad-continuation
|
||||
|
||||
|
||||
class SMBFileSystem(AbstractFileSystem):
|
||||
"""Allow reading and writing to Windows and Samba network shares.
|
||||
|
||||
When using `fsspec.open()` for getting a file-like object the URI
|
||||
should be specified as this format:
|
||||
``smb://workgroup;user:password@server:port/share/folder/file.csv``.
|
||||
|
||||
Example::
|
||||
|
||||
>>> import fsspec
|
||||
>>> with fsspec.open(
|
||||
... 'smb://myuser:mypassword@myserver.com/' 'share/folder/file.csv'
|
||||
... ) as smbfile:
|
||||
... df = pd.read_csv(smbfile, sep='|', header=None)
|
||||
|
||||
Note that you need to pass in a valid hostname or IP address for the host
|
||||
component of the URL. Do not use the Windows/NetBIOS machine name for the
|
||||
host component.
|
||||
|
||||
The first component of the path in the URL points to the name of the shared
|
||||
folder. Subsequent path components will point to the directory/folder/file.
|
||||
|
||||
The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be
|
||||
optional.
|
||||
|
||||
.. note::
|
||||
|
||||
For this backend to work, `smbprotocol`_ must be installed, e.g.::
|
||||
|
||||
$ pip install smbprotocol
|
||||
# or
|
||||
# pip install smbprotocol[kerberos]
|
||||
|
||||
.. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements
|
||||
|
||||
Note: if using this with ``open`` or ``open_files`` with full URLs,
|
||||
there is no way to tell if a path is relative, so all paths are assumed
|
||||
to be absolute.
|
||||
"""
|
||||
|
||||
protocol = "smb"
|
||||
|
||||
# pylint: disable=too-many-arguments
|
||||
def __init__(
|
||||
self,
|
||||
host,
|
||||
port=None,
|
||||
username=None,
|
||||
password=None,
|
||||
timeout=60,
|
||||
encrypt=None,
|
||||
share_access=None,
|
||||
register_session_retries=4,
|
||||
register_session_retry_wait=1,
|
||||
register_session_retry_factor=10,
|
||||
auto_mkdir=False,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
You can use _get_kwargs_from_urls to get some kwargs from
|
||||
a reasonable SMB url.
|
||||
|
||||
Authentication will be anonymous or integrated if username/password are not
|
||||
given.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
The remote server name/ip to connect to
|
||||
port: int or None
|
||||
Port to connect with. Usually 445, sometimes 139.
|
||||
username: str or None
|
||||
Username to connect with. Required if Kerberos auth is not being used.
|
||||
password: str or None
|
||||
User's password on the server, if using username
|
||||
timeout: int
|
||||
Connection timeout in seconds
|
||||
encrypt: bool
|
||||
Whether to force encryption or not, once this has been set to True
|
||||
the session cannot be changed back to False.
|
||||
share_access: str or None
|
||||
Specifies the default access applied to file open operations
|
||||
performed with this file system object.
|
||||
This affects whether other processes can concurrently open a handle
|
||||
to the same file.
|
||||
|
||||
- None (the default): exclusively locks the file until closed.
|
||||
- 'r': Allow other handles to be opened with read access.
|
||||
- 'w': Allow other handles to be opened with write access.
|
||||
- 'd': Allow other handles to be opened with delete access.
|
||||
register_session_retries: int
|
||||
Number of retries to register a session with the server. Retries are not performed
|
||||
for authentication errors, as they are considered as invalid credentials and not network
|
||||
issues. If set to negative value, no register attempts will be performed.
|
||||
register_session_retry_wait: int
|
||||
Time in seconds to wait between each retry. Number must be non-negative.
|
||||
register_session_retry_factor: int
|
||||
Base factor for the wait time between each retry. The wait time
|
||||
is calculated using exponential function. For factor=1 all wait times
|
||||
will be equal to `register_session_retry_wait`. For any number of retries,
|
||||
the last wait time will be equal to `register_session_retry_wait` and for retries>1
|
||||
the first wait time will be equal to `register_session_retry_wait / factor`.
|
||||
Number must be equal to or greater than 1. Optimal factor is 10.
|
||||
auto_mkdir: bool
|
||||
Whether, when opening a file, the directory containing it should
|
||||
be created (if it doesn't already exist). This is assumed by pyarrow
|
||||
and zarr-python code.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.timeout = timeout
|
||||
self.encrypt = encrypt
|
||||
self.temppath = kwargs.pop("temppath", "")
|
||||
self.share_access = share_access
|
||||
self.register_session_retries = register_session_retries
|
||||
if register_session_retry_wait < 0:
|
||||
raise ValueError(
|
||||
"register_session_retry_wait must be a non-negative integer"
|
||||
)
|
||||
self.register_session_retry_wait = register_session_retry_wait
|
||||
if register_session_retry_factor < 1:
|
||||
raise ValueError(
|
||||
"register_session_retry_factor must be a positive "
|
||||
"integer equal to or greater than 1"
|
||||
)
|
||||
self.register_session_retry_factor = register_session_retry_factor
|
||||
self.auto_mkdir = auto_mkdir
|
||||
self._connect()
|
||||
|
||||
@property
|
||||
def _port(self):
|
||||
return 445 if self.port is None else self.port
|
||||
|
||||
def _connect(self):
|
||||
import time
|
||||
|
||||
if self.register_session_retries <= -1:
|
||||
return
|
||||
|
||||
retried_errors = []
|
||||
|
||||
wait_time = self.register_session_retry_wait
|
||||
n_waits = (
|
||||
self.register_session_retries - 1
|
||||
) # -1 = No wait time after the last retry
|
||||
factor = self.register_session_retry_factor
|
||||
|
||||
# Generate wait times for each retry attempt.
|
||||
# Wait times are calculated using exponential function. For factor=1 all wait times
|
||||
# will be equal to `wait`. For any number of retries the last wait time will be
|
||||
# equal to `wait` and for retries>2 the first wait time will be equal to `wait / factor`.
|
||||
wait_times = iter(
|
||||
factor ** (n / n_waits - 1) * wait_time for n in range(0, n_waits + 1)
|
||||
)
|
||||
|
||||
for attempt in range(self.register_session_retries + 1):
|
||||
try:
|
||||
smbclient.register_session(
|
||||
self.host,
|
||||
username=self.username,
|
||||
password=self.password,
|
||||
port=self._port,
|
||||
encrypt=self.encrypt,
|
||||
connection_timeout=self.timeout,
|
||||
)
|
||||
return
|
||||
except (
|
||||
smbprotocol.exceptions.SMBAuthenticationError,
|
||||
smbprotocol.exceptions.LogonFailure,
|
||||
):
|
||||
# These exceptions should not be repeated, as they clearly indicate
|
||||
# that the credentials are invalid and not a network issue.
|
||||
raise
|
||||
except ValueError as exc:
|
||||
if re.findall(r"\[Errno -\d+]", str(exc)):
|
||||
# This exception is raised by the smbprotocol.transport:Tcp.connect
|
||||
# and originates from socket.gaierror (OSError). These exceptions might
|
||||
# be raised due to network instability. We will retry to connect.
|
||||
retried_errors.append(exc)
|
||||
else:
|
||||
# All another ValueError exceptions should be raised, as they are not
|
||||
# related to network issues.
|
||||
raise
|
||||
except Exception as exc:
|
||||
# Save the exception and retry to connect. This except might be dropped
|
||||
# in the future, once all exceptions suited for retry are identified.
|
||||
retried_errors.append(exc)
|
||||
|
||||
if attempt < self.register_session_retries:
|
||||
time.sleep(next(wait_times))
|
||||
|
||||
# Raise last exception to inform user about the connection issues.
|
||||
# Note: Should we use ExceptionGroup to raise all exceptions?
|
||||
raise retried_errors[-1]
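# Worked example of the retry schedule above (not part of the original
# module). With register_session_retries=4, register_session_retry_wait=1
# and register_session_retry_factor=10, n_waits is 3 and the sleeps between
# the five attempts are approximately:
#
#   n=0: 10 ** (0/3 - 1) * 1  ~= 0.10 s
#   n=1: 10 ** (1/3 - 1) * 1  ~= 0.22 s
#   n=2: 10 ** (2/3 - 1) * 1  ~= 0.46 s
#   n=3: 10 ** (3/3 - 1) * 1  =  1.00 s
#
# i.e. the first wait is register_session_retry_wait / factor and the last
# wait equals register_session_retry_wait itself.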
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return infer_storage_options(path)["path"]
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
# smb://workgroup;user:password@host:port/share/folder/file.csv
|
||||
out = infer_storage_options(path)
|
||||
out.pop("path", None)
|
||||
out.pop("protocol", None)
|
||||
return out
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
if create_parents:
|
||||
smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
|
||||
else:
|
||||
smbclient.mkdir(wpath, port=self._port, **kwargs)
|
||||
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
if _share_has_path(path):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port)
|
||||
|
||||
def rmdir(self, path):
|
||||
if _share_has_path(path):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
smbclient.rmdir(wpath, port=self._port)
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
stats = smbclient.stat(wpath, port=self._port, **kwargs)
|
||||
if S_ISDIR(stats.st_mode):
|
||||
stype = "directory"
|
||||
elif S_ISLNK(stats.st_mode):
|
||||
stype = "link"
|
||||
else:
|
||||
stype = "file"
|
||||
res = {
|
||||
"name": path + "/" if stype == "directory" else path,
|
||||
"size": stats.st_size,
|
||||
"type": stype,
|
||||
"uid": stats.st_uid,
|
||||
"gid": stats.st_gid,
|
||||
"time": stats.st_atime,
|
||||
"mtime": stats.st_mtime,
|
||||
}
|
||||
return res
|
||||
|
||||
def created(self, path):
|
||||
"""Return the created timestamp of a file as a datetime.datetime"""
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
stats = smbclient.stat(wpath, port=self._port)
|
||||
return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc)
|
||||
|
||||
def modified(self, path):
|
||||
"""Return the modified timestamp of a file as a datetime.datetime"""
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
stats = smbclient.stat(wpath, port=self._port)
|
||||
return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
unc = _as_unc_path(self.host, path)
|
||||
listed = smbclient.listdir(unc, port=self._port, **kwargs)
|
||||
dirs = ["/".join([path.rstrip("/"), p]) for p in listed]
|
||||
if detail:
|
||||
dirs = [self.info(d) for d in dirs]
|
||||
return dirs
|
||||
|
||||
# pylint: disable=too-many-arguments
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=-1,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
block_size: int or None
|
||||
If 0, no buffering, 1, line buffering, >1, buffer that many bytes
|
||||
|
||||
Notes
|
||||
-----
|
||||
By specifying 'share_access' in 'kwargs' it is possible to override the
|
||||
default shared access setting applied in the constructor of this object.
|
||||
"""
|
||||
if self.auto_mkdir and "w" in mode:
|
||||
self.makedirs(self._parent(path), exist_ok=True)
|
||||
bls = block_size if block_size is not None and block_size >= 0 else -1
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
share_access = kwargs.pop("share_access", self.share_access)
|
||||
if "w" in mode and autocommit is False:
|
||||
temp = _as_temp_path(self.host, path, self.temppath)
|
||||
return SMBFileOpener(
|
||||
wpath, temp, mode, port=self._port, block_size=bls, **kwargs
|
||||
)
|
||||
return smbclient.open_file(
|
||||
wpath,
|
||||
mode,
|
||||
buffering=bls,
|
||||
share_access=share_access,
|
||||
port=self._port,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def copy(self, path1, path2, **kwargs):
|
||||
"""Copy within two locations in the same filesystem"""
|
||||
wpath1 = _as_unc_path(self.host, path1)
|
||||
wpath2 = _as_unc_path(self.host, path2)
|
||||
if self.auto_mkdir:
|
||||
self.makedirs(self._parent(path2), exist_ok=True)
|
||||
smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs)
|
||||
|
||||
def _rm(self, path):
|
||||
if _share_has_path(path):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
stats = smbclient.stat(wpath, port=self._port)
|
||||
if S_ISDIR(stats.st_mode):
|
||||
smbclient.rmdir(wpath, port=self._port)
|
||||
else:
|
||||
smbclient.remove(wpath, port=self._port)
|
||||
|
||||
def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
|
||||
wpath1 = _as_unc_path(self.host, path1)
|
||||
wpath2 = _as_unc_path(self.host, path2)
|
||||
smbclient.rename(wpath1, wpath2, port=self._port, **kwargs)
|
||||
|
||||
|
||||
def _as_unc_path(host, path):
    rpath = path.replace("/", "\\")
    unc = f"\\\\{host}{rpath}"
    return unc


def _as_temp_path(host, path, temppath):
    share = path.split("/")[1]
    temp_file = f"/{share}{temppath}/{uuid.uuid4()}"
    unc = _as_unc_path(host, temp_file)
    return unc


def _share_has_path(path):
    parts = path.count("/")
    if path.endswith("/"):
        return parts > 2
    return parts > 1
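# Illustrative sketch (not part of the original module; the host and share
# names are placeholders): how posix-style paths map onto SMB UNC paths.
def _demo_unc_paths(host="fileserver", path="/share/folder/file.csv"):
    unc = _as_unc_path(host, path)  # r"\\fileserver\share\folder\file.csv"
    in_share = _share_has_path(path)  # True: the path goes below the share root
    return unc, in_share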
|
||||
|
||||
|
||||
class SMBFileOpener:
|
||||
"""writes to remote temporary file, move on commit"""
|
||||
|
||||
def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs):
|
||||
self.path = path
|
||||
self.temp = temp
|
||||
self.mode = mode
|
||||
self.block_size = block_size
|
||||
self.kwargs = kwargs
|
||||
self.smbfile = None
|
||||
self._incontext = False
|
||||
self.port = port
|
||||
self._open()
|
||||
|
||||
def _open(self):
|
||||
if self.smbfile is None or self.smbfile.closed:
|
||||
self.smbfile = smbclient.open_file(
|
||||
self.temp,
|
||||
self.mode,
|
||||
port=self.port,
|
||||
buffering=self.block_size,
|
||||
**self.kwargs,
|
||||
)
|
||||
|
||||
def commit(self):
|
||||
"""Move temp file to definitive on success."""
|
||||
# TODO: use transaction support in SMB protocol
|
||||
smbclient.replace(self.temp, self.path, port=self.port)
|
||||
|
||||
def discard(self):
|
||||
"""Remove the temp file on failure."""
|
||||
smbclient.remove(self.temp, port=self.port)
|
||||
|
||||
def __fspath__(self):
|
||||
return self.path
|
||||
|
||||
def __iter__(self):
|
||||
return self.smbfile.__iter__()
|
||||
|
||||
def __getattr__(self, item):
|
||||
return getattr(self.smbfile, item)
|
||||
|
||||
def __enter__(self):
|
||||
self._incontext = True
|
||||
return self.smbfile.__enter__()
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self._incontext = False
|
||||
self.smbfile.__exit__(exc_type, exc_value, traceback)
|
||||
@@ -0,0 +1,124 @@
|
||||
import logging
|
||||
import tarfile
|
||||
|
||||
import fsspec
|
||||
from fsspec.archive import AbstractArchiveFileSystem
|
||||
from fsspec.compression import compr
|
||||
from fsspec.utils import infer_compression
|
||||
|
||||
typemap = {b"0": "file", b"5": "directory"}
|
||||
|
||||
logger = logging.getLogger("tar")
|
||||
|
||||
|
||||
class TarFileSystem(AbstractArchiveFileSystem):
|
||||
"""Compressed Tar archives as a file-system (read-only)
|
||||
|
||||
Supports the following formats:
|
||||
tar.gz, tar.bz2, tar.xz
|
||||
"""
|
||||
|
||||
root_marker = ""
|
||||
protocol = "tar"
|
||||
cachable = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fo="",
|
||||
index_store=None,
|
||||
target_options=None,
|
||||
target_protocol=None,
|
||||
compression=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
target_options = target_options or {}
|
||||
|
||||
if isinstance(fo, str):
|
||||
self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
|
||||
fo = self.of.open() # keep the reference
|
||||
|
||||
# Try to infer compression.
|
||||
if compression is None:
|
||||
name = None
|
||||
|
||||
# Try different ways to get hold of the filename. `fo` might either
|
||||
# be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
|
||||
# `fsspec.AbstractFileSystem` instance.
|
||||
try:
|
||||
# Amended io.BufferedReader or similar.
|
||||
# This uses a "protocol extension" where original filenames are
|
||||
# propagated to archive-like filesystems in order to let them
|
||||
# infer the right compression appropriately.
|
||||
if hasattr(fo, "original"):
|
||||
name = fo.original
|
||||
|
||||
# fsspec.LocalFileOpener
|
||||
elif hasattr(fo, "path"):
|
||||
name = fo.path
|
||||
|
||||
# io.BufferedReader
|
||||
elif hasattr(fo, "name"):
|
||||
name = fo.name
|
||||
|
||||
# fsspec.AbstractFileSystem
|
||||
elif hasattr(fo, "info"):
|
||||
name = fo.info()["name"]
|
||||
|
||||
except Exception as ex:
|
||||
logger.warning(
|
||||
f"Unable to determine file name, not inferring compression: {ex}"
|
||||
)
|
||||
|
||||
if name is not None:
|
||||
compression = infer_compression(name)
|
||||
logger.info(f"Inferred compression {compression} from file name {name}")
|
||||
|
||||
if compression is not None:
|
||||
# TODO: tarfile already implements compression with modes like "r:gz",
# but would seeking to an offset in the file still work then?
|
||||
fo = compr[compression](fo)
|
||||
|
||||
self._fo_ref = fo
|
||||
self.fo = fo # the whole instance is a context
|
||||
self.tar = tarfile.TarFile(fileobj=self.fo)
|
||||
self.dir_cache = None
|
||||
|
||||
self.index_store = index_store
|
||||
self.index = None
|
||||
self._index()
|
||||
|
||||
def _index(self):
|
||||
# TODO: load and set saved index, if exists
|
||||
out = {}
|
||||
for ti in self.tar:
|
||||
info = ti.get_info()
|
||||
info["type"] = typemap.get(info["type"], "file")
|
||||
name = ti.get_info()["name"].rstrip("/")
|
||||
out[name] = (info, ti.offset_data)
|
||||
|
||||
self.index = out
|
||||
# TODO: save index to self.index_store here, if set
|
||||
|
||||
def _get_dirs(self):
|
||||
if self.dir_cache is not None:
|
||||
return
|
||||
|
||||
# This enables ls to get directories as children as well as files
|
||||
self.dir_cache = {
|
||||
dirname: {"name": dirname, "size": 0, "type": "directory"}
|
||||
for dirname in self._all_dirnames(self.tar.getnames())
|
||||
}
|
||||
for member in self.tar.getmembers():
|
||||
info = member.get_info()
|
||||
info["name"] = info["name"].rstrip("/")
|
||||
info["type"] = typemap.get(info["type"], "file")
|
||||
self.dir_cache[info["name"]] = info
|
||||
|
||||
def _open(self, path, mode="rb", **kwargs):
|
||||
if mode != "rb":
|
||||
raise ValueError("Read-only filesystem implementation")
|
||||
details, offset = self.index[path]
|
||||
if details["type"] != "file":
|
||||
raise ValueError("Can only handle regular files")
|
||||
return self.tar.extractfile(path)
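# Illustrative usage sketch (not part of the original module; the archive path
# is a placeholder): listing and reading members of a gzipped tar archive.
def _demo_tar_listing(archive="/tmp/example.tar.gz"):
    fs = TarFileSystem(archive)  # compression is inferred from the file name
    names = fs.ls("/", detail=False)
    first_bytes = {name: fs.cat(name)[:16] for name in names if fs.isfile(name)}
    return names, first_bytes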
|
||||
@@ -0,0 +1,485 @@
|
||||
# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
|
||||
|
||||
import logging
|
||||
import os
|
||||
import secrets
|
||||
import shutil
|
||||
import tempfile
|
||||
import uuid
|
||||
from contextlib import suppress
|
||||
from urllib.parse import quote
|
||||
|
||||
import requests
|
||||
|
||||
from ..spec import AbstractBufferedFile, AbstractFileSystem
|
||||
from ..utils import infer_storage_options, tokenize
|
||||
|
||||
logger = logging.getLogger("webhdfs")
|
||||
|
||||
|
||||
class WebHDFS(AbstractFileSystem):
|
||||
"""
|
||||
Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways.
|
||||
|
||||
Four auth mechanisms are supported:
|
||||
|
||||
insecure: no auth is done, and the user is assumed to be whoever they
|
||||
say they are (parameter ``user``), or a predefined value such as
|
||||
"dr.who" if not given
|
||||
spnego: when kerberos authentication is enabled, auth is negotiated by
|
||||
requests_kerberos https://github.com/requests/requests-kerberos .
|
||||
This establishes a session based on existing kinit login and/or
|
||||
specified principal/password; parameters are passed with ``kerb_kwargs``
|
||||
token: uses an existing Hadoop delegation token from another secured
|
||||
service. Indeed, this client can also generate such tokens when
|
||||
not insecure. Note that tokens expire, but can be renewed (by a
|
||||
previously specified user) and may allow for proxying.
|
||||
basic-auth: used when both parameter ``user`` and parameter ``password``
|
||||
are provided.
|
||||
|
||||
"""
|
||||
|
||||
tempdir = str(tempfile.gettempdir())
|
||||
protocol = "webhdfs", "webHDFS"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host,
|
||||
port=50070,
|
||||
kerberos=False,
|
||||
token=None,
|
||||
user=None,
|
||||
password=None,
|
||||
proxy_to=None,
|
||||
kerb_kwargs=None,
|
||||
data_proxy=None,
|
||||
use_https=False,
|
||||
session_cert=None,
|
||||
session_verify=True,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
Name-node address
|
||||
port: int
|
||||
Port for webHDFS
|
||||
kerberos: bool
|
||||
Whether to authenticate with kerberos for this connection
|
||||
token: str or None
|
||||
If given, use this token on every call to authenticate. A user
|
||||
and user-proxy may be encoded in the token and should not be also
|
||||
given
|
||||
user: str or None
|
||||
If given, assert the user name to connect with
|
||||
password: str or None
|
||||
If given, assert the password to use for basic auth. If password
|
||||
is provided, user must be provided also
|
||||
proxy_to: str or None
|
||||
If given, the user has the authority to proxy, and this value is
|
||||
the user in whose name actions are taken
|
||||
kerb_kwargs: dict
|
||||
Any extra arguments for HTTPKerberosAuth, see
|
||||
`<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
|
||||
data_proxy: dict, callable or None
|
||||
If given, map data-node addresses. This can be necessary if the
|
||||
HDFS cluster is behind a proxy, running on Docker or otherwise has
|
||||
a mismatch between the host-names given by the name-node and the
|
||||
address by which to refer to them from the client. If a dict,
|
||||
maps host names ``host->data_proxy[host]``; if a callable, full
|
||||
URLs are passed, and function must conform to
|
||||
``url->data_proxy(url)``.
|
||||
use_https: bool
|
||||
Whether to connect to the Name-node using HTTPS instead of HTTP
|
||||
session_cert: str or Tuple[str, str] or None
|
||||
Path to a certificate file, or tuple of (cert, key) files to use
|
||||
for the requests.Session
|
||||
session_verify: str, bool or None
|
||||
Path to a certificate file to use for verifying the requests.Session.
|
||||
kwargs
|
||||
"""
|
||||
if self._cached:
|
||||
return
|
||||
super().__init__(**kwargs)
|
||||
self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
|
||||
self.kerb = kerberos
|
||||
self.kerb_kwargs = kerb_kwargs or {}
|
||||
self.pars = {}
|
||||
self.proxy = data_proxy or {}
|
||||
if token is not None:
|
||||
if user is not None or proxy_to is not None:
|
||||
raise ValueError(
|
||||
"If passing a delegation token, must not set "
|
||||
"user or proxy_to, as these are encoded in the"
|
||||
" token"
|
||||
)
|
||||
self.pars["delegation"] = token
|
||||
self.user = user
|
||||
self.password = password
|
||||
|
||||
if password is not None:
|
||||
if user is None:
|
||||
raise ValueError(
|
||||
"If passing a password, the user must also be"
|
||||
"set in order to set up the basic-auth"
|
||||
)
|
||||
else:
|
||||
if user is not None:
|
||||
self.pars["user.name"] = user
|
||||
|
||||
if proxy_to is not None:
|
||||
self.pars["doas"] = proxy_to
|
||||
if kerberos and user is not None:
|
||||
raise ValueError(
|
||||
"If using Kerberos auth, do not specify the "
|
||||
"user, this is handled by kinit."
|
||||
)
|
||||
|
||||
self.session_cert = session_cert
|
||||
self.session_verify = session_verify
|
||||
|
||||
self._connect()
|
||||
|
||||
self._fsid = f"webhdfs_{tokenize(host, port)}"
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return self._fsid
|
||||
|
||||
def _connect(self):
|
||||
self.session = requests.Session()
|
||||
|
||||
if self.session_cert:
|
||||
self.session.cert = self.session_cert
|
||||
|
||||
self.session.verify = self.session_verify
|
||||
|
||||
if self.kerb:
|
||||
from requests_kerberos import HTTPKerberosAuth
|
||||
|
||||
self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
|
||||
|
||||
if self.user is not None and self.password is not None:
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
self.session.auth = HTTPBasicAuth(self.user, self.password)
|
||||
|
||||
def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
|
||||
path = self._strip_protocol(path) if path is not None else ""
|
||||
url = self._apply_proxy(self.url + quote(path, safe="/="))
|
||||
args = kwargs.copy()
|
||||
args.update(self.pars)
|
||||
args["op"] = op.upper()
|
||||
logger.debug("sending %s with %s", url, method)
|
||||
out = self.session.request(
|
||||
method=method.upper(),
|
||||
url=url,
|
||||
params=args,
|
||||
data=data,
|
||||
allow_redirects=redirect,
|
||||
)
|
||||
if out.status_code in [400, 401, 403, 404, 500]:
|
||||
try:
|
||||
err = out.json()
|
||||
msg = err["RemoteException"]["message"]
|
||||
exp = err["RemoteException"]["exception"]
|
||||
except (ValueError, KeyError):
|
||||
pass
|
||||
else:
|
||||
if exp in ["IllegalArgumentException", "UnsupportedOperationException"]:
|
||||
raise ValueError(msg)
|
||||
elif exp in ["SecurityException", "AccessControlException"]:
|
||||
raise PermissionError(msg)
|
||||
elif exp in ["FileNotFoundException"]:
|
||||
raise FileNotFoundError(msg)
|
||||
else:
|
||||
raise RuntimeError(msg)
|
||||
out.raise_for_status()
|
||||
return out
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
replication=None,
|
||||
permissions=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
File location
|
||||
mode: str
|
||||
'rb', 'wb', etc.
|
||||
block_size: int
|
||||
Client buffer size for read-ahead or write buffer
|
||||
autocommit: bool
|
||||
If False, writes to temporary file that only gets put in final
|
||||
location upon commit
|
||||
replication: int
|
||||
Number of copies of file on the cluster, write mode only
|
||||
permissions: str or int
|
||||
posix permissions, write mode only
|
||||
kwargs
|
||||
|
||||
Returns
|
||||
-------
|
||||
WebHDFile instance
|
||||
"""
|
||||
block_size = block_size or self.blocksize
|
||||
return WebHDFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
tempdir=self.tempdir,
|
||||
autocommit=autocommit,
|
||||
replication=replication,
|
||||
permissions=permissions,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _process_info(info):
|
||||
info["type"] = info["type"].lower()
|
||||
info["size"] = info["length"]
|
||||
return info
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return infer_storage_options(path)["path"]
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(urlpath):
|
||||
out = infer_storage_options(urlpath)
|
||||
out.pop("path", None)
|
||||
out.pop("protocol", None)
|
||||
if "username" in out:
|
||||
out["user"] = out.pop("username")
|
||||
return out
|
||||
|
||||
def info(self, path):
|
||||
out = self._call("GETFILESTATUS", path=path)
|
||||
info = out.json()["FileStatus"]
|
||||
info["name"] = path
|
||||
return self._process_info(info)
|
||||
|
||||
def ls(self, path, detail=False, **kwargs):
|
||||
out = self._call("LISTSTATUS", path=path)
|
||||
infos = out.json()["FileStatuses"]["FileStatus"]
|
||||
for info in infos:
|
||||
self._process_info(info)
|
||||
info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
|
||||
if detail:
|
||||
return sorted(infos, key=lambda i: i["name"])
|
||||
else:
|
||||
return sorted(info["name"] for info in infos)
|
||||
|
||||
def content_summary(self, path):
|
||||
"""Total numbers of files, directories and bytes under path"""
|
||||
out = self._call("GETCONTENTSUMMARY", path=path)
|
||||
return out.json()["ContentSummary"]
|
||||
|
||||
def ukey(self, path):
|
||||
"""Checksum info of file, giving method and result"""
|
||||
out = self._call("GETFILECHECKSUM", path=path, redirect=False)
|
||||
if "Location" in out.headers:
|
||||
location = self._apply_proxy(out.headers["Location"])
|
||||
out2 = self.session.get(location)
|
||||
out2.raise_for_status()
|
||||
return out2.json()["FileChecksum"]
|
||||
else:
|
||||
out.raise_for_status()
|
||||
return out.json()["FileChecksum"]
|
||||
|
||||
def home_directory(self):
|
||||
"""Get user's home directory"""
|
||||
out = self._call("GETHOMEDIRECTORY")
|
||||
return out.json()["Path"]
|
||||
|
||||
def get_delegation_token(self, renewer=None):
|
||||
"""Retrieve token which can give the same authority to other uses
|
||||
|
||||
Parameters
|
||||
----------
|
||||
renewer: str or None
|
||||
User who may use this token; if None, will be current user
|
||||
"""
|
||||
if renewer:
|
||||
out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
|
||||
else:
|
||||
out = self._call("GETDELEGATIONTOKEN")
|
||||
t = out.json()["Token"]
|
||||
if t is None:
|
||||
raise ValueError("No token available for this user/security context")
|
||||
return t["urlString"]
|
||||
|
||||
def renew_delegation_token(self, token):
|
||||
"""Make token live longer. Returns new expiry time"""
|
||||
out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
|
||||
return out.json()["long"]
|
||||
|
||||
def cancel_delegation_token(self, token):
|
||||
"""Stop the token from being useful"""
|
||||
self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
|
||||
|
||||
def chmod(self, path, mod):
|
||||
"""Set the permission at path
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
location to set (file or directory)
|
||||
mod: str or int
|
||||
posix representation of permission, given as an octal string, e.g. '777'
|
||||
or 0o777
|
||||
"""
|
||||
self._call("SETPERMISSION", method="put", path=path, permission=mod)
|
||||
|
||||
def chown(self, path, owner=None, group=None):
|
||||
"""Change owning user and/or group"""
|
||||
kwargs = {}
|
||||
if owner is not None:
|
||||
kwargs["owner"] = owner
|
||||
if group is not None:
|
||||
kwargs["group"] = group
|
||||
self._call("SETOWNER", method="put", path=path, **kwargs)
|
||||
|
||||
def set_replication(self, path, replication):
|
||||
"""
|
||||
Set file replication factor
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
File location (not for directories)
|
||||
replication: int
|
||||
Number of copies of file on the cluster. Should be smaller than
|
||||
number of data nodes; normally 3 on most systems.
|
||||
"""
|
||||
self._call("SETREPLICATION", path=path, method="put", replication=replication)
|
||||
|
||||
def mkdir(self, path, **kwargs):
|
||||
self._call("MKDIRS", method="put", path=path)
|
||||
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
if exist_ok is False and self.exists(path):
|
||||
raise FileExistsError(path)
|
||||
self.mkdir(path)
|
||||
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
self._call("RENAME", method="put", path=path1, destination=path2)
|
||||
|
||||
def rm(self, path, recursive=False, **kwargs):
|
||||
self._call(
|
||||
"DELETE",
|
||||
method="delete",
|
||||
path=path,
|
||||
recursive="true" if recursive else "false",
|
||||
)
|
||||
|
||||
def rm_file(self, path, **kwargs):
|
||||
self.rm(path)
|
||||
|
||||
def cp_file(self, lpath, rpath, **kwargs):
|
||||
with self.open(lpath) as lstream:
|
||||
tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
|
||||
# Perform an atomic copy (stream to a temporary file and
|
||||
# move it to the actual destination).
|
||||
try:
|
||||
with self.open(tmp_fname, "wb") as rstream:
|
||||
shutil.copyfileobj(lstream, rstream)
|
||||
self.mv(tmp_fname, rpath)
|
||||
except BaseException:
|
||||
with suppress(FileNotFoundError):
|
||||
self.rm(tmp_fname)
|
||||
raise
|
||||
|
||||
def _apply_proxy(self, location):
|
||||
if self.proxy and callable(self.proxy):
|
||||
location = self.proxy(location)
|
||||
elif self.proxy:
|
||||
# as a dict
|
||||
for k, v in self.proxy.items():
|
||||
location = location.replace(k, v, 1)
|
||||
return location
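# Illustrative sketch (not part of the original module; host names are
# placeholders) of how data_proxy rewrites the data-node URLs returned by
# the name-node:
#
#   as a dict:     {"worker1.internal": "localhost"} turns
#                  "http://worker1.internal:9864/..." into "http://localhost:9864/..."
#   as a callable: data_proxy=lambda url: url.replace(":9864", ":8443")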
|
||||
|
||||
|
||||
class WebHDFile(AbstractBufferedFile):
|
||||
"""A file living in HDFS over webHDFS"""
|
||||
|
||||
def __init__(self, fs, path, **kwargs):
|
||||
super().__init__(fs, path, **kwargs)
|
||||
kwargs = kwargs.copy()
|
||||
if kwargs.get("permissions", None) is None:
|
||||
kwargs.pop("permissions", None)
|
||||
if kwargs.get("replication", None) is None:
|
||||
kwargs.pop("replication", None)
|
||||
self.permissions = kwargs.pop("permissions", 511)
|
||||
tempdir = kwargs.pop("tempdir")
|
||||
if kwargs.pop("autocommit", False) is False:
|
||||
self.target = self.path
|
||||
self.path = os.path.join(tempdir, str(uuid.uuid4()))
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
"""Write one part of a multi-block file upload
|
||||
|
||||
Parameters
|
||||
==========
|
||||
final: bool
|
||||
This is the last block, so should complete file, if
|
||||
self.autocommit is True.
|
||||
"""
|
||||
out = self.fs.session.post(
|
||||
self.location,
|
||||
data=self.buffer.getvalue(),
|
||||
headers={"content-type": "application/octet-stream"},
|
||||
)
|
||||
out.raise_for_status()
|
||||
return True

    def _initiate_upload(self):
        """Create remote file/upload"""
        kwargs = self.kwargs.copy()
        if "a" in self.mode:
            op, method = "APPEND", "POST"
        else:
            op, method = "CREATE", "PUT"
            kwargs["overwrite"] = "true"
        out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
        location = self.fs._apply_proxy(out.headers["Location"])
        if "w" in self.mode:
            # create empty file to append to
            out2 = self.fs.session.put(
                location, headers={"content-type": "application/octet-stream"}
            )
            out2.raise_for_status()
            # after creating empty file, change location to append to
            out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
            self.location = self.fs._apply_proxy(out2.headers["Location"])

    def _fetch_range(self, start, end):
        start = max(start, 0)
        end = min(self.size, end)
        if start >= end or start >= self.size:
            return b""
        out = self.fs._call(
            "OPEN", path=self.path, offset=start, length=end - start, redirect=False
        )
        out.raise_for_status()
        if "Location" in out.headers:
            location = out.headers["Location"]
            out2 = self.fs.session.get(self.fs._apply_proxy(location))
            return out2.content
        else:
            return out.content

    def commit(self):
        self.fs.mv(self.path, self.target)

    def discard(self):
        self.fs.rm(self.path)
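
    # Illustrative sketch (not part of the vendored source): with autocommit
    # disabled, writes land at a uuid-named path under tempdir; commit() then
    # RENAMEs that object to the original target and discard() deletes it.
    # Wrapping work in fs.transaction (fsspec's generic transaction support)
    # is one way this pair gets invoked; the path below is a placeholder.
    #
    #     with fs.transaction:
    #         with fs.open("/data/out.bin", "wb") as f:
    #             f.write(b"payload")  # committed when the transaction exits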
@@ -0,0 +1,177 @@
import os
import zipfile

import fsspec
from fsspec.archive import AbstractArchiveFileSystem


class ZipFileSystem(AbstractArchiveFileSystem):
    """Read/Write contents of ZIP archive as a file-system

    Keeps file object open while instance lives.

    This class is pickleable, but not necessarily thread-safe
    """

    root_marker = ""
    protocol = "zip"
    cachable = False

    def __init__(
        self,
        fo="",
        mode="r",
        target_protocol=None,
        target_options=None,
        compression=zipfile.ZIP_STORED,
        allowZip64=True,
        compresslevel=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fo: str or file-like
            Contains ZIP, and must exist. If a str, will fetch file using
            :meth:`~fsspec.open_files`, which must return one file exactly.
        mode: str
            Accept: "r", "w", "a"
        target_protocol: str (optional)
            If ``fo`` is a string, this value can be used to override the
            FS protocol inferred from a URL
        target_options: dict (optional)
            Kwargs passed when instantiating the target FS, if ``fo`` is
            a string.
        compression, allowZip64, compresslevel: passed to ZipFile
            Only relevant when creating a ZIP
        """
        super().__init__(self, **kwargs)
        if mode not in set("rwa"):
            raise ValueError(f"mode '{mode}' not understood")
        self.mode = mode
        if isinstance(fo, (str, os.PathLike)):
            if mode == "a":
                m = "r+b"
            else:
                m = mode + "b"
            fo = fsspec.open(
                fo, mode=m, protocol=target_protocol, **(target_options or {})
            )
        self.force_zip_64 = allowZip64
        self.of = fo
        self.fo = fo.__enter__()  # the whole instance is a context
        self.zip = zipfile.ZipFile(
            self.fo,
            mode=mode,
            compression=compression,
            allowZip64=allowZip64,
            compresslevel=compresslevel,
        )
        self.dir_cache = None
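
    # Illustrative usage sketch (not part of the vendored source); archive and
    # member names are placeholders.
    #
    #     zfs = ZipFileSystem("archive.zip")        # read mode is the default
    #     zfs.ls("/")                               # list archive members
    #
    #     wfs = ZipFileSystem("new.zip", mode="w")  # create a new archive
    #     with wfs.open("docs/readme.txt", "wb") as f:
    #         f.write(b"hello zip")
    #     wfs.close()                               # writes the central directory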

    @classmethod
    def _strip_protocol(cls, path):
        # zip file paths are always relative to the archive root
        return super()._strip_protocol(path).lstrip("/")

    def __del__(self):
        if hasattr(self, "zip"):
            self.close()
            del self.zip

    def close(self):
        """Commits any write changes to the file. Done on ``del`` too."""
        self.zip.close()

    def _get_dirs(self):
        if self.dir_cache is None or self.mode in set("wa"):
            # when writing, dir_cache is always in the ZipFile's attributes,
            # not read from the file.
            files = self.zip.infolist()
            self.dir_cache = {
                dirname.rstrip("/"): {
                    "name": dirname.rstrip("/"),
                    "size": 0,
                    "type": "directory",
                }
                for dirname in self._all_dirnames(self.zip.namelist())
            }
            for z in files:
                f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
                f.update(
                    {
                        "name": z.filename.rstrip("/"),
                        "size": z.file_size,
                        "type": ("directory" if z.is_dir() else "file"),
                    }
                )
                self.dir_cache[f["name"]] = f

    def pipe_file(self, path, value, **kwargs):
        # override upstream, because we know the exact file size in this case
        self.zip.writestr(path, value, **kwargs)
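
    # Illustrative sketch (not part of the vendored source): because the whole
    # payload is already in memory, pipe_file() hands it straight to
    # ZipFile.writestr(), which records the exact entry size, instead of going
    # through the generic buffered-write path. Names below are placeholders.
    #
    #     wfs = ZipFileSystem("new.zip", mode="w")
    #     wfs.pipe_file("data/blob.bin", b"\x00" * 1024)
    #     wfs.close()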

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        path = self._strip_protocol(path)
        if "r" in mode and self.mode in set("wa"):
            if self.exists(path):
                raise OSError("ZipFS can only be open for reading or writing, not both")
            raise FileNotFoundError(path)
        if "r" in self.mode and "w" in mode:
            raise OSError("ZipFS can only be open for reading or writing, not both")
        out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64)
        if "r" in mode:
            info = self.info(path)
            out.size = info["size"]
            out.name = info["name"]
        return out

    def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        # Remove the leading slash, as the zip file paths are always
        # given without a leading slash
        path = path.lstrip("/")
        path_parts = list(filter(lambda s: bool(s), path.split("/")))

        def _matching_starts(file_path):
            file_parts = filter(lambda s: bool(s), file_path.split("/"))
            return all(a == b for a, b in zip(path_parts, file_parts))

        self._get_dirs()

        result = {}
        # To match posix find, if an exact file name is given, we should
        # return only that file
        if path in self.dir_cache and self.dir_cache[path]["type"] == "file":
            result[path] = self.dir_cache[path]
            return result if detail else [path]

        for file_path, file_info in self.dir_cache.items():
            if not (path == "" or _matching_starts(file_path)):
                continue

            if file_info["type"] == "directory":
                if withdirs:
                    if file_path not in result:
                        result[file_path.strip("/")] = file_info
                continue

            if file_path not in result:
                result[file_path] = file_info if detail else None

        if maxdepth:
            path_depth = path.count("/")
            result = {
                k: v for k, v in result.items() if k.count("/") - path_depth < maxdepth
            }
        return result if detail else sorted(result)
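
    # Illustrative sketch (not part of the vendored source); member names are
    # placeholders. find() walks the cached listing built by _get_dirs(), so
    # repeated calls in read mode do not re-scan the archive.
    #
    #     zfs = ZipFileSystem("archive.zip")
    #     zfs.find("data")                     # e.g. ["data/a.csv", "data/sub/b.csv"]
    #     zfs.find("data", withdirs=True)      # also includes "data/sub"
    #     zfs.find("data", maxdepth=1)         # only entries directly under "data"
    #     zfs.find("data/a.csv", detail=True)  # {"data/a.csv": {...entry info...}}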