chore: add virtual environment to the repository

- add the backend_service/venv virtual environment
- includes all Python dependency packages
- note: the virtual environment is ~393 MB and contains 12,655 files
2025-12-03 10:19:25 +08:00
parent a6c2027caa
commit c4f851d387
12655 changed files with 3009376 additions and 0 deletions

View File

@@ -0,0 +1,307 @@
import errno
import io
import os
import secrets
import shutil
from contextlib import suppress
from functools import cached_property, wraps
from urllib.parse import parse_qs
from fsspec.spec import AbstractFileSystem
from fsspec.utils import (
get_package_version_without_import,
infer_storage_options,
mirror_from,
tokenize,
)
def wrap_exceptions(func):
@wraps(func)
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except OSError as exception:
if not exception.args:
raise
message, *args = exception.args
if isinstance(message, str) and "does not exist" in message:
raise FileNotFoundError(errno.ENOENT, message) from exception
else:
raise
return wrapper
PYARROW_VERSION = None
class ArrowFSWrapper(AbstractFileSystem):
"""FSSpec-compatible wrapper of pyarrow.fs.FileSystem.
Parameters
----------
fs : pyarrow.fs.FileSystem
"""
root_marker = "/"
def __init__(self, fs, **kwargs):
global PYARROW_VERSION
PYARROW_VERSION = get_package_version_without_import("pyarrow")
self.fs = fs
super().__init__(**kwargs)
@property
def protocol(self):
return self.fs.type_name
@cached_property
def fsid(self):
return "hdfs_" + tokenize(self.fs.host, self.fs.port)
@classmethod
def _strip_protocol(cls, path):
ops = infer_storage_options(path)
path = ops["path"]
if path.startswith("//"):
# special case for "hdfs://path" (without the triple slash)
path = path[1:]
return path
def ls(self, path, detail=False, **kwargs):
path = self._strip_protocol(path)
from pyarrow.fs import FileSelector
try:
entries = [
self._make_entry(entry)
for entry in self.fs.get_file_info(FileSelector(path))
]
except (FileNotFoundError, NotADirectoryError):
entries = [self.info(path, **kwargs)]
if detail:
return entries
else:
return [entry["name"] for entry in entries]
def info(self, path, **kwargs):
path = self._strip_protocol(path)
[info] = self.fs.get_file_info([path])
return self._make_entry(info)
def exists(self, path):
path = self._strip_protocol(path)
try:
self.info(path)
except FileNotFoundError:
return False
else:
return True
def _make_entry(self, info):
from pyarrow.fs import FileType
if info.type is FileType.Directory:
kind = "directory"
elif info.type is FileType.File:
kind = "file"
elif info.type is FileType.NotFound:
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
else:
kind = "other"
return {
"name": info.path,
"size": info.size,
"type": kind,
"mtime": info.mtime,
}
@wrap_exceptions
def cp_file(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1).rstrip("/")
path2 = self._strip_protocol(path2).rstrip("/")
with self._open(path1, "rb") as lstream:
tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
try:
with self.open(tmp_fname, "wb") as rstream:
shutil.copyfileobj(lstream, rstream)
self.fs.move(tmp_fname, path2)
except BaseException:
with suppress(FileNotFoundError):
self.fs.delete_file(tmp_fname)
raise
@wrap_exceptions
def mv(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1).rstrip("/")
path2 = self._strip_protocol(path2).rstrip("/")
self.fs.move(path1, path2)
@wrap_exceptions
def rm_file(self, path):
path = self._strip_protocol(path)
self.fs.delete_file(path)
@wrap_exceptions
def rm(self, path, recursive=False, maxdepth=None):
path = self._strip_protocol(path).rstrip("/")
if self.isdir(path):
if recursive:
self.fs.delete_dir(path)
else:
raise ValueError("Can't delete directories without recursive=False")
else:
self.fs.delete_file(path)
@wrap_exceptions
def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
if mode == "rb":
if seekable:
method = self.fs.open_input_file
else:
method = self.fs.open_input_stream
elif mode == "wb":
method = self.fs.open_output_stream
elif mode == "ab":
method = self.fs.open_append_stream
else:
raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")
_kwargs = {}
if mode != "rb" or not seekable:
if int(PYARROW_VERSION.split(".")[0]) >= 4:
# disable compression auto-detection
_kwargs["compression"] = None
stream = method(path, **_kwargs)
return ArrowFile(self, stream, path, mode, block_size, **kwargs)
@wrap_exceptions
def mkdir(self, path, create_parents=True, **kwargs):
path = self._strip_protocol(path)
if create_parents:
self.makedirs(path, exist_ok=True)
else:
self.fs.create_dir(path, recursive=False)
@wrap_exceptions
def makedirs(self, path, exist_ok=False):
path = self._strip_protocol(path)
self.fs.create_dir(path, recursive=True)
@wrap_exceptions
def rmdir(self, path):
path = self._strip_protocol(path)
self.fs.delete_dir(path)
@wrap_exceptions
def modified(self, path):
path = self._strip_protocol(path)
return self.fs.get_file_info(path).mtime
def cat_file(self, path, start=None, end=None, **kwargs):
kwargs["seekable"] = start not in [None, 0]
        return super().cat_file(path, start=start, end=end, **kwargs)
def get_file(self, rpath, lpath, **kwargs):
kwargs["seekable"] = False
super().get_file(rpath, lpath, **kwargs)
@mirror_from(
"stream",
[
"read",
"seek",
"tell",
"write",
"readable",
"writable",
"close",
"size",
"seekable",
],
)
class ArrowFile(io.IOBase):
def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
self.path = path
self.mode = mode
self.fs = fs
self.stream = stream
self.blocksize = self.block_size = block_size
self.kwargs = kwargs
def __enter__(self):
return self
def __exit__(self, *args):
return self.close()
class HadoopFileSystem(ArrowFSWrapper):
"""A wrapper on top of the pyarrow.fs.HadoopFileSystem
    to connect its interface with fsspec"""
protocol = "hdfs"
def __init__(
self,
host="default",
port=0,
user=None,
kerb_ticket=None,
replication=3,
extra_conf=None,
**kwargs,
):
"""
Parameters
----------
host: str
Hostname, IP or "default" to try to read from Hadoop config
port: int
Port to connect on, or default from Hadoop config if 0
user: str or None
If given, connect as this username
kerb_ticket: str or None
If given, use this ticket for authentication
replication: int
            Set the replication factor of the file for write operations. Default value is 3.
extra_conf: None or dict
Passed on to HadoopFileSystem
"""
from pyarrow.fs import HadoopFileSystem
fs = HadoopFileSystem(
host=host,
port=port,
user=user,
kerb_ticket=kerb_ticket,
replication=replication,
extra_conf=extra_conf,
)
super().__init__(fs=fs, **kwargs)
@staticmethod
def _get_kwargs_from_urls(path):
ops = infer_storage_options(path)
out = {}
if ops.get("host", None):
out["host"] = ops["host"]
if ops.get("username", None):
out["user"] = ops["username"]
if ops.get("port", None):
out["port"] = ops["port"]
if ops.get("url_query", None):
queries = parse_qs(ops["url_query"])
if queries.get("replication", None):
out["replication"] = int(queries["replication"][0])
return out
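A minimal usage sketch for the wrapper above (not part of the committed file). It assumes pyarrow is installed and that the vendored module is importable as fsspec.implementations.arrow, as in upstream fsspec:

# Hedged example: expose a pyarrow filesystem through the fsspec interface.
from pyarrow.fs import LocalFileSystem
from fsspec.implementations.arrow import ArrowFSWrapper  # assumed module path

fs = ArrowFSWrapper(LocalFileSystem())
fs.mkdir("/tmp/arrow_demo", create_parents=True)
with fs.open("/tmp/arrow_demo/hello.txt", "wb") as f:
    f.write(b"hello")
print(fs.ls("/tmp/arrow_demo"))                    # ['/tmp/arrow_demo/hello.txt']
print(fs.cat_file("/tmp/arrow_demo/hello.txt"))    # b'hello'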

View File

@@ -0,0 +1,122 @@
import asyncio
import functools
import inspect
import fsspec
from fsspec.asyn import AsyncFileSystem, running_async
def async_wrapper(func, obj=None, semaphore=None):
"""
Wraps a synchronous function to make it awaitable.
Parameters
----------
func : callable
The synchronous function to wrap.
obj : object, optional
The instance to bind the function to, if applicable.
semaphore : asyncio.Semaphore, optional
A semaphore to limit concurrent calls.
Returns
-------
coroutine
An awaitable version of the function.
"""
@functools.wraps(func)
async def wrapper(*args, **kwargs):
if semaphore:
async with semaphore:
return await asyncio.to_thread(func, *args, **kwargs)
return await asyncio.to_thread(func, *args, **kwargs)
return wrapper
class AsyncFileSystemWrapper(AsyncFileSystem):
"""
A wrapper class to convert a synchronous filesystem into an asynchronous one.
This class takes an existing synchronous filesystem implementation and wraps all
its methods to provide an asynchronous interface.
Parameters
----------
sync_fs : AbstractFileSystem
The synchronous filesystem instance to wrap.
"""
protocol = "asyncwrapper", "async_wrapper"
cachable = False
def __init__(
self,
fs=None,
asynchronous=None,
target_protocol=None,
target_options=None,
semaphore=None,
max_concurrent_tasks=None,
**kwargs,
):
if asynchronous is None:
asynchronous = running_async()
super().__init__(asynchronous=asynchronous, **kwargs)
if fs is not None:
self.sync_fs = fs
else:
self.sync_fs = fsspec.filesystem(target_protocol, **target_options)
self.protocol = self.sync_fs.protocol
self.semaphore = semaphore
self._wrap_all_sync_methods()
@property
def fsid(self):
return f"async_{self.sync_fs.fsid}"
def _wrap_all_sync_methods(self):
"""
Wrap all synchronous methods of the underlying filesystem with asynchronous versions.
"""
excluded_methods = {"open"}
for method_name in dir(self.sync_fs):
if method_name.startswith("_") or method_name in excluded_methods:
continue
attr = inspect.getattr_static(self.sync_fs, method_name)
if isinstance(attr, property):
continue
method = getattr(self.sync_fs, method_name)
if callable(method) and not inspect.iscoroutinefunction(method):
async_method = async_wrapper(method, obj=self, semaphore=self.semaphore)
setattr(self, f"_{method_name}", async_method)
@classmethod
def wrap_class(cls, sync_fs_class):
"""
Create a new class that can be used to instantiate an AsyncFileSystemWrapper
with lazy instantiation of the underlying synchronous filesystem.
Parameters
----------
sync_fs_class : type
The class of the synchronous filesystem to wrap.
Returns
-------
type
A new class that wraps the provided synchronous filesystem class.
"""
class GeneratedAsyncFileSystemWrapper(cls):
def __init__(self, *args, **kwargs):
sync_fs = sync_fs_class(*args, **kwargs)
super().__init__(sync_fs)
GeneratedAsyncFileSystemWrapper.__name__ = (
f"Async{sync_fs_class.__name__}Wrapper"
)
return GeneratedAsyncFileSystemWrapper
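A short sketch of driving the wrapper above from asyncio (not part of the committed file; the module path fsspec.implementations.asyn_wrapper is an assumption matching upstream fsspec):

import asyncio
from fsspec.implementations.local import LocalFileSystem
from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper  # assumed module path

async def main():
    # Wrap a synchronous local filesystem; wrapped methods are exposed as
    # awaitable underscore-prefixed variants (e.g. _ls, _cat_file).
    fs = AsyncFileSystemWrapper(LocalFileSystem(), asynchronous=True)
    print(await fs._ls("/tmp", detail=False))

asyncio.run(main())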

View File

@@ -0,0 +1,75 @@
from __future__ import annotations
import abc
import hashlib
from fsspec.implementations.local import make_path_posix
class AbstractCacheMapper(abc.ABC):
"""Abstract super-class for mappers from remote URLs to local cached
basenames.
"""
@abc.abstractmethod
def __call__(self, path: str) -> str: ...
def __eq__(self, other: object) -> bool:
# Identity only depends on class. When derived classes have attributes
# they will need to be included.
return isinstance(other, type(self))
def __hash__(self) -> int:
# Identity only depends on class. When derived classes have attributes
# they will need to be included.
return hash(type(self))
class BasenameCacheMapper(AbstractCacheMapper):
"""Cache mapper that uses the basename of the remote URL and a fixed number
of directory levels above this.
The default is zero directory levels, meaning different paths with the same
basename will have the same cached basename.
"""
def __init__(self, directory_levels: int = 0):
if directory_levels < 0:
raise ValueError(
"BasenameCacheMapper requires zero or positive directory_levels"
)
self.directory_levels = directory_levels
# Separator for directories when encoded as strings.
self._separator = "_@_"
def __call__(self, path: str) -> str:
path = make_path_posix(path)
prefix, *bits = path.rsplit("/", self.directory_levels + 1)
if bits:
return self._separator.join(bits)
else:
return prefix # No separator found, simple filename
def __eq__(self, other: object) -> bool:
return super().__eq__(other) and self.directory_levels == other.directory_levels
def __hash__(self) -> int:
return super().__hash__() ^ hash(self.directory_levels)
class HashCacheMapper(AbstractCacheMapper):
"""Cache mapper that uses a hash of the remote URL."""
def __call__(self, path: str) -> str:
return hashlib.sha256(path.encode()).hexdigest()
def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
"""Factory method to create cache mapper for backward compatibility with
``CachingFileSystem`` constructor using ``same_names`` kwarg.
"""
if same_names:
return BasenameCacheMapper()
else:
return HashCacheMapper()
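A hedged illustration of the two mapping strategies above (not part of the committed file; the import path fsspec.implementations.cache_mapper is assumed, matching upstream fsspec):

from fsspec.implementations.cache_mapper import BasenameCacheMapper, HashCacheMapper  # assumed path

m0 = BasenameCacheMapper()                     # basename only
m1 = BasenameCacheMapper(directory_levels=1)   # keep one parent directory
h = HashCacheMapper()

print(m0("/data/2024/file.csv"))       # file.csv
print(m1("/data/2024/file.csv"))       # 2024_@_file.csv
print(len(h("/data/2024/file.csv")))   # 64 (sha256 hex digest)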

View File

@@ -0,0 +1,233 @@
from __future__ import annotations
import os
import pickle
import time
from typing import TYPE_CHECKING
from fsspec.utils import atomic_write
try:
import ujson as json
except ImportError:
if not TYPE_CHECKING:
import json
if TYPE_CHECKING:
from collections.abc import Iterator
from typing import Any, Literal
from typing_extensions import TypeAlias
from .cached import CachingFileSystem
Detail: TypeAlias = dict[str, Any]
class CacheMetadata:
"""Cache metadata.
    All reading and writing of cache metadata is performed by this class;
    accessing the cached files and blocks is not.
Metadata is stored in a single file per storage directory in JSON format.
For backward compatibility, also reads metadata stored in pickle format
which is converted to JSON when next saved.
"""
def __init__(self, storage: list[str]):
"""
Parameters
----------
storage: list[str]
            Directories containing cached files; there must be at least one. Metadata
            is stored in the last of these directories by convention.
"""
if not storage:
raise ValueError("CacheMetadata expects at least one storage location")
self._storage = storage
self.cached_files: list[Detail] = [{}]
        # Private attribute to force saving of metadata in pickle format rather than
        # JSON; used in tests to confirm that both formats can be read.
self._force_save_pickle = False
def _load(self, fn: str) -> Detail:
"""Low-level function to load metadata from specific file"""
try:
with open(fn, "r") as f:
loaded = json.load(f)
except ValueError:
with open(fn, "rb") as f:
loaded = pickle.load(f)
for c in loaded.values():
if isinstance(c.get("blocks"), list):
c["blocks"] = set(c["blocks"])
return loaded
def _save(self, metadata_to_save: Detail, fn: str) -> None:
"""Low-level function to save metadata to specific file"""
if self._force_save_pickle:
with atomic_write(fn) as f:
pickle.dump(metadata_to_save, f)
else:
with atomic_write(fn, mode="w") as f:
json.dump(metadata_to_save, f)
def _scan_locations(
self, writable_only: bool = False
) -> Iterator[tuple[str, str, bool]]:
"""Yield locations (filenames) where metadata is stored, and whether
writable or not.
Parameters
----------
        writable_only: bool
Set to True to only yield writable locations.
Returns
-------
Yields (str, str, bool)
"""
n = len(self._storage)
for i, storage in enumerate(self._storage):
writable = i == n - 1
if writable_only and not writable:
continue
yield os.path.join(storage, "cache"), storage, writable
def check_file(
self, path: str, cfs: CachingFileSystem | None
) -> Literal[False] | tuple[Detail, str]:
"""If path is in cache return its details, otherwise return ``False``.
If the optional CachingFileSystem is specified then it is used to
perform extra checks to reject possible matches, such as if they are
too old.
"""
for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
if path not in cache:
continue
detail = cache[path].copy()
if cfs is not None:
if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
# Wrong file as determined by hash of file properties
continue
if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
# Cached file has expired
continue
fn = os.path.join(base, detail["fn"])
if os.path.exists(fn):
return detail, fn
return False
def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
"""Remove expired metadata from the cache.
Returns names of files corresponding to expired metadata and a boolean
flag indicating whether the writable cache is empty. Caller is
responsible for deleting the expired files.
"""
expired_files = []
for path, detail in self.cached_files[-1].copy().items():
if time.time() - detail["time"] > expiry_time:
fn = detail.get("fn", "")
if not fn:
raise RuntimeError(
f"Cache metadata does not contain 'fn' for {path}"
)
fn = os.path.join(self._storage[-1], fn)
expired_files.append(fn)
self.cached_files[-1].pop(path)
if self.cached_files[-1]:
cache_path = os.path.join(self._storage[-1], "cache")
self._save(self.cached_files[-1], cache_path)
writable_cache_empty = not self.cached_files[-1]
return expired_files, writable_cache_empty
def load(self) -> None:
"""Load all metadata from disk and store in ``self.cached_files``"""
cached_files = []
for fn, _, _ in self._scan_locations():
if os.path.exists(fn):
# TODO: consolidate blocks here
cached_files.append(self._load(fn))
else:
cached_files.append({})
self.cached_files = cached_files or [{}]
def on_close_cached_file(self, f: Any, path: str) -> None:
"""Perform side-effect actions on closing a cached file.
The actual closing of the file is the responsibility of the caller.
"""
        # File must be writable, so it is in self.cached_files[-1]
c = self.cached_files[-1][path]
if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
c["blocks"] = True
def pop_file(self, path: str) -> str | None:
"""Remove metadata of cached file.
If path is in the cache, return the filename of the cached file,
otherwise return ``None``. Caller is responsible for deleting the
cached file.
"""
details = self.check_file(path, None)
if not details:
return None
_, fn = details
if fn.startswith(self._storage[-1]):
self.cached_files[-1].pop(path)
self.save()
else:
raise PermissionError(
"Can only delete cached file in last, writable cache location"
)
return fn
def save(self) -> None:
"""Save metadata to disk"""
for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
if not writable:
continue
if os.path.exists(fn):
cached_files = self._load(fn)
for k, c in cached_files.items():
if k in cache:
if c["blocks"] is True or cache[k]["blocks"] is True:
c["blocks"] = True
else:
# self.cached_files[*][*]["blocks"] must continue to
# point to the same set object so that updates
# performed by MMapCache are propagated back to
# self.cached_files.
blocks = cache[k]["blocks"]
blocks.update(c["blocks"])
c["blocks"] = blocks
c["time"] = max(c["time"], cache[k]["time"])
c["uid"] = cache[k]["uid"]
# Files can be added to cache after it was written once
for k, c in cache.items():
if k not in cached_files:
cached_files[k] = c
else:
cached_files = cache
cache = {k: v.copy() for k, v in cached_files.items()}
for c in cache.values():
if isinstance(c["blocks"], set):
c["blocks"] = list(c["blocks"])
self._save(cache, fn)
self.cached_files[-1] = cached_files
def update_file(self, path: str, detail: Detail) -> None:
"""Update metadata for specific file in memory, do not save"""
self.cached_files[-1][path] = detail

View File

@@ -0,0 +1,23 @@
from typing import ClassVar
from fsspec import AbstractFileSystem
__all__ = ("ChainedFileSystem",)
class ChainedFileSystem(AbstractFileSystem):
"""Chained filesystem base class.
A chained filesystem is designed to be layered over another FS.
This is useful to implement things like caching.
This base class does very little on its own, but is used as a marker
that the class is designed for chaining.
Right now this is only used in `url_to_fs` to provide the path argument
(`fo`) to the chained filesystem from the underlying filesystem.
Additional functionality may be added in the future.
"""
protocol: ClassVar[str] = "chained"

View File

@@ -0,0 +1,152 @@
import dask
from distributed.client import Client, _get_global_client
from distributed.worker import Worker
from fsspec import filesystem
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
from fsspec.utils import infer_storage_options
def _get_client(client):
if client is None:
return _get_global_client()
elif isinstance(client, Client):
return client
else:
# e.g., connection string
return Client(client)
def _in_worker():
return bool(Worker._instances)
class DaskWorkerFileSystem(AbstractFileSystem):
"""View files accessible to a worker as any other remote file-system
When instances are run on the worker, uses the real filesystem. When
run on the client, they call the worker to provide information or data.
**Warning** this implementation is experimental, and read-only for now.
"""
def __init__(
self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
):
super().__init__(**kwargs)
if not (fs is None) ^ (target_protocol is None):
raise ValueError(
"Please provide one of filesystem instance (fs) or"
" target_protocol, not both"
)
self.target_protocol = target_protocol
self.target_options = target_options
self.worker = None
self.client = client
self.fs = fs
self._determine_worker()
@staticmethod
def _get_kwargs_from_urls(path):
so = infer_storage_options(path)
if "host" in so and "port" in so:
return {"client": f"{so['host']}:{so['port']}"}
else:
return {}
def _determine_worker(self):
if _in_worker():
self.worker = True
if self.fs is None:
self.fs = filesystem(
self.target_protocol, **(self.target_options or {})
)
else:
self.worker = False
self.client = _get_client(self.client)
self.rfs = dask.delayed(self)
def mkdir(self, *args, **kwargs):
if self.worker:
self.fs.mkdir(*args, **kwargs)
else:
self.rfs.mkdir(*args, **kwargs).compute()
def rm(self, *args, **kwargs):
if self.worker:
self.fs.rm(*args, **kwargs)
else:
self.rfs.rm(*args, **kwargs).compute()
def copy(self, *args, **kwargs):
if self.worker:
self.fs.copy(*args, **kwargs)
else:
self.rfs.copy(*args, **kwargs).compute()
def mv(self, *args, **kwargs):
if self.worker:
self.fs.mv(*args, **kwargs)
else:
self.rfs.mv(*args, **kwargs).compute()
def ls(self, *args, **kwargs):
if self.worker:
return self.fs.ls(*args, **kwargs)
else:
return self.rfs.ls(*args, **kwargs).compute()
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
if self.worker:
return self.fs._open(
path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_options=cache_options,
**kwargs,
)
else:
return DaskFile(
fs=self,
path=path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_options=cache_options,
**kwargs,
)
def fetch_range(self, path, mode, start, end):
if self.worker:
with self._open(path, mode) as f:
f.seek(start)
return f.read(end - start)
else:
return self.rfs.fetch_range(path, mode, start, end).compute()
class DaskFile(AbstractBufferedFile):
def __init__(self, mode="rb", **kwargs):
if mode != "rb":
raise ValueError('Remote dask files can only be opened in "rb" mode')
super().__init__(**kwargs)
def _upload_chunk(self, final=False):
pass
def _initiate_upload(self):
"""Create remote file/upload"""
pass
def _fetch_range(self, start, end):
"""Get the specified set of bytes from remote"""
return self.fs.fetch_range(self.path, self.mode, start, end)

View File

@@ -0,0 +1,58 @@
import base64
import io
from typing import Optional
from urllib.parse import unquote
from fsspec import AbstractFileSystem
class DataFileSystem(AbstractFileSystem):
"""A handy decoder for data-URLs
Example
-------
>>> with fsspec.open("data:,Hello%2C%20World%21") as f:
... print(f.read())
b"Hello, World!"
See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
"""
protocol = "data"
def __init__(self, **kwargs):
"""No parameters for this filesystem"""
super().__init__(**kwargs)
def cat_file(self, path, start=None, end=None, **kwargs):
pref, data = path.split(",", 1)
if pref.endswith("base64"):
return base64.b64decode(data)[start:end]
return unquote(data).encode()[start:end]
def info(self, path, **kwargs):
pref, name = path.split(",", 1)
data = self.cat_file(path)
mime = pref.split(":", 1)[1].split(";", 1)[0]
return {"name": name, "size": len(data), "type": "file", "mimetype": mime}
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
if "r" not in mode:
raise ValueError("Read only filesystem")
return io.BytesIO(self.cat_file(path))
@staticmethod
def encode(data: bytes, mime: Optional[str] = None):
"""Format the given data into data-URL syntax
This version always base64 encodes, even when the data is ascii/url-safe.
"""
return f"data:{mime or ''};base64,{base64.b64encode(data).decode()}"

View File

@@ -0,0 +1,496 @@
from __future__ import annotations
import base64
import urllib
import requests
from requests.adapters import HTTPAdapter, Retry
from typing_extensions import override
from fsspec import AbstractFileSystem
from fsspec.spec import AbstractBufferedFile
class DatabricksException(Exception):
"""
Helper class for exceptions raised in this module.
"""
def __init__(self, error_code, message, details=None):
"""Create a new DatabricksException"""
super().__init__(message)
self.error_code = error_code
self.message = message
self.details = details
class DatabricksFileSystem(AbstractFileSystem):
"""
Get access to the Databricks filesystem implementation over HTTP.
Can be used inside and outside of a databricks cluster.
"""
def __init__(self, instance, token, **kwargs):
"""
Create a new DatabricksFileSystem.
Parameters
----------
instance: str
The instance URL of the databricks cluster.
For example for an Azure databricks cluster, this
has the form adb-<some-number>.<two digits>.azuredatabricks.net.
token: str
Your personal token. Find out more
here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
"""
self.instance = instance
self.token = token
self.session = requests.Session()
self.retries = Retry(
total=10,
backoff_factor=0.05,
status_forcelist=[408, 429, 500, 502, 503, 504],
)
self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
self.session.headers.update({"Authorization": f"Bearer {self.token}"})
super().__init__(**kwargs)
@override
def _ls_from_cache(self, path) -> list[dict[str, str | int]] | None:
"""Check cache for listing
Returns listing, if found (may be empty list for a directory that
exists but contains nothing), None if not in cache.
"""
self.dircache.pop(path.rstrip("/"), None)
parent = self._parent(path)
if parent in self.dircache:
for entry in self.dircache[parent]:
if entry["name"] == path.rstrip("/"):
if entry["type"] != "directory":
return [entry]
return []
raise FileNotFoundError(path)
def ls(self, path, detail=True, **kwargs):
"""
List the contents of the given path.
Parameters
----------
path: str
Absolute path
detail: bool
Return not only the list of filenames,
but also additional information on file sizes
and types.
"""
try:
out = self._ls_from_cache(path)
except FileNotFoundError:
# This happens if the `path`'s parent was cached, but `path` is not
# there. This suggests that `path` is new since the parent was
# cached. Attempt to invalidate parent's cache before continuing.
self.dircache.pop(self._parent(path), None)
out = None
if not out:
try:
r = self._send_to_api(
method="get", endpoint="list", json={"path": path}
)
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
raise
files = r.get("files", [])
out = [
{
"name": o["path"],
"type": "directory" if o["is_dir"] else "file",
"size": o["file_size"],
}
for o in files
]
self.dircache[path] = out
if detail:
return out
return [o["name"] for o in out]
def makedirs(self, path, exist_ok=True):
"""
Create a given absolute path and all of its parents.
Parameters
----------
path: str
Absolute path to create
exist_ok: bool
If false, checks if the folder
exists before creating it (and raises an
Exception if this is the case)
"""
if not exist_ok:
try:
# If the following succeeds, the path is already present
self._send_to_api(
method="get", endpoint="get-status", json={"path": path}
)
raise FileExistsError(f"Path {path} already exists")
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
pass
try:
self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
except DatabricksException as e:
if e.error_code == "RESOURCE_ALREADY_EXISTS":
raise FileExistsError(e.message) from e
raise
self.invalidate_cache(self._parent(path))
def mkdir(self, path, create_parents=True, **kwargs):
"""
Create a given absolute path and all of its parents.
Parameters
----------
path: str
Absolute path to create
create_parents: bool
Whether to create all parents or not.
"False" is not implemented so far.
"""
if not create_parents:
raise NotImplementedError
self.mkdirs(path, **kwargs)
def rm(self, path, recursive=False, **kwargs):
"""
Remove the file or folder at the given absolute path.
Parameters
----------
path: str
Absolute path what to remove
recursive: bool
Recursively delete all files in a folder.
"""
try:
self._send_to_api(
method="post",
endpoint="delete",
json={"path": path, "recursive": recursive},
)
except DatabricksException as e:
# This is not really an exception, it just means
# not everything was deleted so far
if e.error_code == "PARTIAL_DELETE":
self.rm(path=path, recursive=recursive)
elif e.error_code == "IO_ERROR":
# Using the same exception as the os module would use here
raise OSError(e.message) from e
raise
self.invalidate_cache(self._parent(path))
def mv(
self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
):
"""
Move a source to a destination path.
A note from the original [databricks API manual]
(https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).
When moving a large number of files the API call will time out after
approximately 60s, potentially resulting in partially moved data.
Therefore, for operations that move more than 10k files, we strongly
discourage using the DBFS REST API.
Parameters
----------
source_path: str
From where to move (absolute path)
destination_path: str
To where to move (absolute path)
recursive: bool
            Not implemented so far.
        maxdepth:
            Not implemented so far.
"""
if recursive:
raise NotImplementedError
if maxdepth:
raise NotImplementedError
try:
self._send_to_api(
method="post",
endpoint="move",
json={"source_path": source_path, "destination_path": destination_path},
)
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
elif e.error_code == "RESOURCE_ALREADY_EXISTS":
raise FileExistsError(e.message) from e
raise
self.invalidate_cache(self._parent(source_path))
self.invalidate_cache(self._parent(destination_path))
def _open(self, path, mode="rb", block_size="default", **kwargs):
"""
        Override the base class method to make sure to create a DBFile.
All arguments are copied from the base method.
Only the default blocksize is allowed.
"""
return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)
def _send_to_api(self, method, endpoint, json):
"""
Send the given json to the DBFS API
using a get or post request (specified by the argument `method`).
Parameters
----------
method: str
Which http method to use for communication; "get" or "post".
endpoint: str
Where to send the request to (last part of the API URL)
json: dict
Dictionary of information to send
"""
if method == "post":
session_call = self.session.post
elif method == "get":
session_call = self.session.get
else:
raise ValueError(f"Do not understand method {method}")
url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)
r = session_call(url, json=json)
# The DBFS API will return a json, also in case of an exception.
        # We want to preserve this information as well as possible.
try:
r.raise_for_status()
except requests.HTTPError as e:
# try to extract json error message
# if that fails, fall back to the original exception
try:
exception_json = e.response.json()
except Exception:
raise e from None
raise DatabricksException(**exception_json) from e
return r.json()
def _create_handle(self, path, overwrite=True):
"""
Internal function to create a handle, which can be used to
write blocks of a file to DBFS.
A handle has a unique identifier which needs to be passed
        with every write during this transaction.
The handle is active for 10 minutes - after that a new
write transaction needs to be created.
Make sure to close the handle after you are finished.
Parameters
----------
path: str
Absolute path for this file.
overwrite: bool
            If a file already exists at this location, either overwrite
it or raise an exception.
"""
try:
r = self._send_to_api(
method="post",
endpoint="create",
json={"path": path, "overwrite": overwrite},
)
return r["handle"]
except DatabricksException as e:
if e.error_code == "RESOURCE_ALREADY_EXISTS":
raise FileExistsError(e.message) from e
raise
def _close_handle(self, handle):
"""
Close a handle, which was opened by :func:`_create_handle`.
Parameters
----------
handle: str
Which handle to close.
"""
try:
self._send_to_api(method="post", endpoint="close", json={"handle": handle})
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
raise
def _add_data(self, handle, data):
"""
Upload data to an already opened file handle
(opened by :func:`_create_handle`).
The maximal allowed data size is 1MB after
conversion to base64.
Remember to close the handle when you are finished.
Parameters
----------
handle: str
Which handle to upload data to.
data: bytes
Block of data to add to the handle.
"""
data = base64.b64encode(data).decode()
try:
self._send_to_api(
method="post",
endpoint="add-block",
json={"handle": handle, "data": data},
)
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
raise ValueError(e.message) from e
raise
def _get_data(self, path, start, end):
"""
        Download data in bytes from a given absolute path, in a block
        covering the byte range [start, end).
The maximum number of allowed bytes to read is 1MB.
Parameters
----------
path: str
Absolute path to download data from
start: int
Start position of the block
end: int
End position of the block
"""
try:
r = self._send_to_api(
method="get",
endpoint="read",
json={"path": path, "offset": start, "length": end - start},
)
return base64.b64decode(r["data"])
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
raise ValueError(e.message) from e
raise
def invalidate_cache(self, path=None):
if path is None:
self.dircache.clear()
else:
self.dircache.pop(path, None)
super().invalidate_cache(path)
class DatabricksFile(AbstractBufferedFile):
"""
Helper class for files referenced in the DatabricksFileSystem.
"""
DEFAULT_BLOCK_SIZE = 1 * 2**20 # only allowed block size
def __init__(
self,
fs,
path,
mode="rb",
block_size="default",
autocommit=True,
cache_type="readahead",
cache_options=None,
**kwargs,
):
"""
Create a new instance of the DatabricksFile.
The blocksize needs to be the default one.
"""
if block_size is None or block_size == "default":
block_size = self.DEFAULT_BLOCK_SIZE
assert block_size == self.DEFAULT_BLOCK_SIZE, (
f"Only the default block size is allowed, not {block_size}"
)
super().__init__(
fs,
path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_type=cache_type,
cache_options=cache_options or {},
**kwargs,
)
def _initiate_upload(self):
"""Internal function to start a file upload"""
self.handle = self.fs._create_handle(self.path)
def _upload_chunk(self, final=False):
"""Internal function to add a chunk of data to a started upload"""
self.buffer.seek(0)
data = self.buffer.getvalue()
data_chunks = [
data[start:end] for start, end in self._to_sized_blocks(len(data))
]
for data_chunk in data_chunks:
self.fs._add_data(handle=self.handle, data=data_chunk)
if final:
self.fs._close_handle(handle=self.handle)
return True
def _fetch_range(self, start, end):
"""Internal function to download a block of data"""
return_buffer = b""
length = end - start
for chunk_start, chunk_end in self._to_sized_blocks(length, start):
return_buffer += self.fs._get_data(
path=self.path, start=chunk_start, end=chunk_end
)
return return_buffer
def _to_sized_blocks(self, length, start=0):
"""Helper function to split a range from 0 to total_length into blocksizes"""
end = start + length
for data_chunk in range(start, end, self.blocksize):
data_start = data_chunk
data_end = min(end, data_chunk + self.blocksize)
yield data_start, data_end
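A hedged connection sketch (not part of the committed file; the workspace URL and token below are placeholders, and the module path fsspec.implementations.dbfs is an assumption matching upstream fsspec):

from fsspec.implementations.dbfs import DatabricksFileSystem  # assumed module path

fs = DatabricksFileSystem(
    instance="adb-1234567890123456.7.azuredatabricks.net",  # placeholder workspace
    token="dapiXXXXXXXXXXXXXXXX",                            # placeholder token
)
print(fs.ls("/FileStore", detail=False))
with fs.open("/FileStore/example.txt", "wb") as f:
    f.write(b"written via the DBFS REST API")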

View File

@@ -0,0 +1,388 @@
from .. import filesystem
from ..asyn import AsyncFileSystem
class DirFileSystem(AsyncFileSystem):
"""Directory prefix filesystem
The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
    is relative to the `path`. After performing the necessary path operations, it
delegates everything to the wrapped filesystem.
"""
protocol = "dir"
def __init__(
self,
path=None,
fs=None,
fo=None,
target_protocol=None,
target_options=None,
**storage_options,
):
"""
Parameters
----------
path: str
Path to the directory.
fs: AbstractFileSystem
An instantiated filesystem to wrap.
target_protocol, target_options:
            if fs is None, construct it from these
fo: str
Alternate for path; do not provide both
"""
super().__init__(**storage_options)
if fs is None:
fs = filesystem(protocol=target_protocol, **(target_options or {}))
path = path or fo
if self.asynchronous and not fs.async_impl:
raise ValueError("can't use asynchronous with non-async fs")
if fs.async_impl and self.asynchronous != fs.asynchronous:
raise ValueError("both dirfs and fs should be in the same sync/async mode")
self.path = fs._strip_protocol(path)
self.fs = fs
def _join(self, path):
if isinstance(path, str):
if not self.path:
return path
if not path:
return self.path
return self.fs.sep.join((self.path, self._strip_protocol(path)))
if isinstance(path, dict):
return {self._join(_path): value for _path, value in path.items()}
return [self._join(_path) for _path in path]
def _relpath(self, path):
if isinstance(path, str):
if not self.path:
return path
# We need to account for S3FileSystem returning paths that do not
# start with a '/'
if path == self.path or (
self.path.startswith(self.fs.sep) and path == self.path[1:]
):
return ""
prefix = self.path + self.fs.sep
if self.path.startswith(self.fs.sep) and not path.startswith(self.fs.sep):
prefix = prefix[1:]
assert path.startswith(prefix)
return path[len(prefix) :]
return [self._relpath(_path) for _path in path]
# Wrappers below
@property
def sep(self):
return self.fs.sep
async def set_session(self, *args, **kwargs):
return await self.fs.set_session(*args, **kwargs)
async def _rm_file(self, path, **kwargs):
return await self.fs._rm_file(self._join(path), **kwargs)
def rm_file(self, path, **kwargs):
return self.fs.rm_file(self._join(path), **kwargs)
async def _rm(self, path, *args, **kwargs):
return await self.fs._rm(self._join(path), *args, **kwargs)
def rm(self, path, *args, **kwargs):
return self.fs.rm(self._join(path), *args, **kwargs)
async def _cp_file(self, path1, path2, **kwargs):
return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)
def cp_file(self, path1, path2, **kwargs):
return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)
async def _copy(
self,
path1,
path2,
*args,
**kwargs,
):
return await self.fs._copy(
self._join(path1),
self._join(path2),
*args,
**kwargs,
)
def copy(self, path1, path2, *args, **kwargs):
return self.fs.copy(
self._join(path1),
self._join(path2),
*args,
**kwargs,
)
async def _pipe(self, path, *args, **kwargs):
return await self.fs._pipe(self._join(path), *args, **kwargs)
def pipe(self, path, *args, **kwargs):
return self.fs.pipe(self._join(path), *args, **kwargs)
async def _pipe_file(self, path, *args, **kwargs):
return await self.fs._pipe_file(self._join(path), *args, **kwargs)
def pipe_file(self, path, *args, **kwargs):
return self.fs.pipe_file(self._join(path), *args, **kwargs)
async def _cat_file(self, path, *args, **kwargs):
return await self.fs._cat_file(self._join(path), *args, **kwargs)
def cat_file(self, path, *args, **kwargs):
return self.fs.cat_file(self._join(path), *args, **kwargs)
async def _cat(self, path, *args, **kwargs):
ret = await self.fs._cat(
self._join(path),
*args,
**kwargs,
)
if isinstance(ret, dict):
return {self._relpath(key): value for key, value in ret.items()}
return ret
def cat(self, path, *args, **kwargs):
ret = self.fs.cat(
self._join(path),
*args,
**kwargs,
)
if isinstance(ret, dict):
return {self._relpath(key): value for key, value in ret.items()}
return ret
async def _put_file(self, lpath, rpath, **kwargs):
return await self.fs._put_file(lpath, self._join(rpath), **kwargs)
def put_file(self, lpath, rpath, **kwargs):
return self.fs.put_file(lpath, self._join(rpath), **kwargs)
async def _put(
self,
lpath,
rpath,
*args,
**kwargs,
):
return await self.fs._put(
lpath,
self._join(rpath),
*args,
**kwargs,
)
def put(self, lpath, rpath, *args, **kwargs):
return self.fs.put(
lpath,
self._join(rpath),
*args,
**kwargs,
)
async def _get_file(self, rpath, lpath, **kwargs):
return await self.fs._get_file(self._join(rpath), lpath, **kwargs)
def get_file(self, rpath, lpath, **kwargs):
return self.fs.get_file(self._join(rpath), lpath, **kwargs)
async def _get(self, rpath, *args, **kwargs):
return await self.fs._get(self._join(rpath), *args, **kwargs)
def get(self, rpath, *args, **kwargs):
return self.fs.get(self._join(rpath), *args, **kwargs)
async def _isfile(self, path):
return await self.fs._isfile(self._join(path))
def isfile(self, path):
return self.fs.isfile(self._join(path))
async def _isdir(self, path):
return await self.fs._isdir(self._join(path))
def isdir(self, path):
return self.fs.isdir(self._join(path))
async def _size(self, path):
return await self.fs._size(self._join(path))
def size(self, path):
return self.fs.size(self._join(path))
async def _exists(self, path):
return await self.fs._exists(self._join(path))
def exists(self, path):
return self.fs.exists(self._join(path))
async def _info(self, path, **kwargs):
info = await self.fs._info(self._join(path), **kwargs)
info = info.copy()
info["name"] = self._relpath(info["name"])
return info
def info(self, path, **kwargs):
info = self.fs.info(self._join(path), **kwargs)
info = info.copy()
info["name"] = self._relpath(info["name"])
return info
async def _ls(self, path, detail=True, **kwargs):
ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
if detail:
out = []
for entry in ret:
entry = entry.copy()
entry["name"] = self._relpath(entry["name"])
out.append(entry)
return out
return self._relpath(ret)
def ls(self, path, detail=True, **kwargs):
ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
if detail:
out = []
for entry in ret:
entry = entry.copy()
entry["name"] = self._relpath(entry["name"])
out.append(entry)
return out
return self._relpath(ret)
async def _walk(self, path, *args, **kwargs):
async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
yield self._relpath(root), dirs, files
def walk(self, path, *args, **kwargs):
for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
yield self._relpath(root), dirs, files
async def _glob(self, path, **kwargs):
detail = kwargs.get("detail", False)
ret = await self.fs._glob(self._join(path), **kwargs)
if detail:
return {self._relpath(path): info for path, info in ret.items()}
return self._relpath(ret)
def glob(self, path, **kwargs):
detail = kwargs.get("detail", False)
ret = self.fs.glob(self._join(path), **kwargs)
if detail:
return {self._relpath(path): info for path, info in ret.items()}
return self._relpath(ret)
async def _du(self, path, *args, **kwargs):
total = kwargs.get("total", True)
ret = await self.fs._du(self._join(path), *args, **kwargs)
if total:
return ret
return {self._relpath(path): size for path, size in ret.items()}
def du(self, path, *args, **kwargs):
total = kwargs.get("total", True)
ret = self.fs.du(self._join(path), *args, **kwargs)
if total:
return ret
return {self._relpath(path): size for path, size in ret.items()}
async def _find(self, path, *args, **kwargs):
detail = kwargs.get("detail", False)
ret = await self.fs._find(self._join(path), *args, **kwargs)
if detail:
return {self._relpath(path): info for path, info in ret.items()}
return self._relpath(ret)
def find(self, path, *args, **kwargs):
detail = kwargs.get("detail", False)
ret = self.fs.find(self._join(path), *args, **kwargs)
if detail:
return {self._relpath(path): info for path, info in ret.items()}
return self._relpath(ret)
async def _expand_path(self, path, *args, **kwargs):
return self._relpath(
await self.fs._expand_path(self._join(path), *args, **kwargs)
)
def expand_path(self, path, *args, **kwargs):
return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))
async def _mkdir(self, path, *args, **kwargs):
return await self.fs._mkdir(self._join(path), *args, **kwargs)
def mkdir(self, path, *args, **kwargs):
return self.fs.mkdir(self._join(path), *args, **kwargs)
async def _makedirs(self, path, *args, **kwargs):
return await self.fs._makedirs(self._join(path), *args, **kwargs)
def makedirs(self, path, *args, **kwargs):
return self.fs.makedirs(self._join(path), *args, **kwargs)
def rmdir(self, path):
return self.fs.rmdir(self._join(path))
def mv(self, path1, path2, **kwargs):
return self.fs.mv(
self._join(path1),
self._join(path2),
**kwargs,
)
def touch(self, path, **kwargs):
return self.fs.touch(self._join(path), **kwargs)
def created(self, path):
return self.fs.created(self._join(path))
def modified(self, path):
return self.fs.modified(self._join(path))
def sign(self, path, *args, **kwargs):
return self.fs.sign(self._join(path), *args, **kwargs)
def __repr__(self):
return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"
def open(
self,
path,
*args,
**kwargs,
):
return self.fs.open(
self._join(path),
*args,
**kwargs,
)
async def open_async(
self,
path,
*args,
**kwargs,
):
return await self.fs.open_async(
self._join(path),
*args,
**kwargs,
)
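A minimal sketch of the prefixing behaviour (not part of the committed file; it uses fsspec's in-memory filesystem, and the module path fsspec.implementations.dirfs is an assumption matching upstream fsspec):

from fsspec.implementations.dirfs import DirFileSystem  # assumed module path
from fsspec.implementations.memory import MemoryFileSystem

mem = MemoryFileSystem()
mem.pipe_file("/project/data/a.txt", b"aaa")

# Every path handed to dirfs is interpreted relative to /project.
dirfs = DirFileSystem(path="/project", fs=mem)
print(dirfs.ls("data", detail=False))   # ['data/a.txt']
print(dirfs.cat("data/a.txt"))          # b'aaa'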

View File

@@ -0,0 +1,387 @@
import os
import uuid
from ftplib import FTP, FTP_TLS, Error, error_perm
from typing import Any
from ..spec import AbstractBufferedFile, AbstractFileSystem
from ..utils import infer_storage_options, isfilelike
class FTPFileSystem(AbstractFileSystem):
"""A filesystem over classic FTP"""
root_marker = "/"
cachable = False
protocol = "ftp"
def __init__(
self,
host,
port=21,
username=None,
password=None,
acct=None,
block_size=None,
tempdir=None,
timeout=30,
encoding="utf-8",
tls=False,
**kwargs,
):
"""
You can use _get_kwargs_from_urls to get some kwargs from
a reasonable FTP url.
Authentication will be anonymous if username/password are not
given.
Parameters
----------
host: str
The remote server name/ip to connect to
port: int
Port to connect with
username: str or None
If authenticating, the user's identifier
        password: str or None
User's password on the server, if using
acct: str or None
Some servers also need an "account" string for auth
block_size: int or None
If given, the read-ahead or write buffer size.
tempdir: str
Directory on remote to put temporary files when in a transaction
timeout: int
Timeout of the ftp connection in seconds
encoding: str
Encoding to use for directories and filenames in FTP connection
tls: bool
Use FTP-TLS, by default False
"""
super().__init__(**kwargs)
self.host = host
self.port = port
self.tempdir = tempdir or "/tmp"
self.cred = username or "", password or "", acct or ""
self.timeout = timeout
self.encoding = encoding
if block_size is not None:
self.blocksize = block_size
else:
self.blocksize = 2**16
self.tls = tls
self._connect()
if self.tls:
self.ftp.prot_p()
def _connect(self):
if self.tls:
ftp_cls = FTP_TLS
else:
ftp_cls = FTP
self.ftp = ftp_cls(timeout=self.timeout, encoding=self.encoding)
self.ftp.connect(self.host, self.port)
self.ftp.login(*self.cred)
@classmethod
def _strip_protocol(cls, path):
return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
@staticmethod
def _get_kwargs_from_urls(urlpath):
out = infer_storage_options(urlpath)
out.pop("path", None)
out.pop("protocol", None)
return out
def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
out = []
if path not in self.dircache:
try:
try:
out = [
(fn, details)
for (fn, details) in self.ftp.mlsd(path)
if fn not in [".", ".."]
and details["type"] not in ["pdir", "cdir"]
]
except error_perm:
out = _mlsd2(self.ftp, path) # Not platform independent
for fn, details in out:
details["name"] = "/".join(
["" if path == "/" else path, fn.lstrip("/")]
)
if details["type"] == "file":
details["size"] = int(details["size"])
else:
details["size"] = 0
if details["type"] == "dir":
details["type"] = "directory"
self.dircache[path] = out
except Error:
try:
info = self.info(path)
if info["type"] == "file":
out = [(path, info)]
except (Error, IndexError) as exc:
raise FileNotFoundError(path) from exc
files = self.dircache.get(path, out)
if not detail:
return sorted([fn for fn, details in files])
return [details for fn, details in files]
def info(self, path, **kwargs):
# implement with direct method
path = self._strip_protocol(path)
if path == "/":
# special case, since this dir has no real entry
return {"name": "/", "size": 0, "type": "directory"}
files = self.ls(self._parent(path).lstrip("/"), True)
try:
out = next(f for f in files if f["name"] == path)
except StopIteration as exc:
raise FileNotFoundError(path) from exc
return out
def get_file(self, rpath, lpath, **kwargs):
if self.isdir(rpath):
if not os.path.exists(lpath):
os.mkdir(lpath)
return
if isfilelike(lpath):
outfile = lpath
else:
outfile = open(lpath, "wb")
def cb(x):
outfile.write(x)
self.ftp.retrbinary(
f"RETR {rpath}",
blocksize=self.blocksize,
callback=cb,
)
if not isfilelike(lpath):
outfile.close()
def cat_file(self, path, start=None, end=None, **kwargs):
if end is not None:
return super().cat_file(path, start, end, **kwargs)
out = []
def cb(x):
out.append(x)
try:
self.ftp.retrbinary(
f"RETR {path}",
blocksize=self.blocksize,
rest=start,
callback=cb,
)
except (Error, error_perm) as orig_exc:
raise FileNotFoundError(path) from orig_exc
return b"".join(out)
def _open(
self,
path,
mode="rb",
block_size=None,
cache_options=None,
autocommit=True,
**kwargs,
):
path = self._strip_protocol(path)
block_size = block_size or self.blocksize
return FTPFile(
self,
path,
mode=mode,
block_size=block_size,
tempdir=self.tempdir,
autocommit=autocommit,
cache_options=cache_options,
)
def _rm(self, path):
path = self._strip_protocol(path)
self.ftp.delete(path)
self.invalidate_cache(self._parent(path))
def rm(self, path, recursive=False, maxdepth=None):
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
for p in reversed(paths):
if self.isfile(p):
self.rm_file(p)
else:
self.rmdir(p)
def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
path = self._strip_protocol(path)
parent = self._parent(path)
if parent != self.root_marker and not self.exists(parent) and create_parents:
self.mkdir(parent, create_parents=create_parents)
self.ftp.mkd(path)
self.invalidate_cache(self._parent(path))
def makedirs(self, path: str, exist_ok: bool = False) -> None:
path = self._strip_protocol(path)
if self.exists(path):
# NB: "/" does not "exist" as it has no directory entry
if not exist_ok:
raise FileExistsError(f"{path} exists without `exist_ok`")
            # exist_ok=True -> no-op
else:
self.mkdir(path, create_parents=True)
def rmdir(self, path):
path = self._strip_protocol(path)
self.ftp.rmd(path)
self.invalidate_cache(self._parent(path))
def mv(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
self.ftp.rename(path1, path2)
self.invalidate_cache(self._parent(path1))
self.invalidate_cache(self._parent(path2))
def __del__(self):
self.ftp.close()
def invalidate_cache(self, path=None):
if path is None:
self.dircache.clear()
else:
self.dircache.pop(path, None)
super().invalidate_cache(path)
class TransferDone(Exception):
"""Internal exception to break out of transfer"""
pass
class FTPFile(AbstractBufferedFile):
"""Interact with a remote FTP file with read/write buffering"""
def __init__(
self,
fs,
path,
mode="rb",
block_size="default",
autocommit=True,
cache_type="readahead",
cache_options=None,
**kwargs,
):
super().__init__(
fs,
path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_type=cache_type,
cache_options=cache_options,
**kwargs,
)
if not autocommit:
self.target = self.path
self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])
def commit(self):
self.fs.mv(self.path, self.target)
def discard(self):
self.fs.rm(self.path)
def _fetch_range(self, start, end):
"""Get bytes between given byte limits
Implemented by raising an exception in the fetch callback when the
number of bytes received reaches the requested amount.
Will fail if the server does not respect the REST command on
retrieve requests.
"""
out = []
total = [0]
def callback(x):
total[0] += len(x)
if total[0] > end - start:
out.append(x[: (end - start) - total[0]])
if end < self.size:
raise TransferDone
else:
out.append(x)
if total[0] == end - start and end < self.size:
raise TransferDone
try:
self.fs.ftp.retrbinary(
f"RETR {self.path}",
blocksize=self.blocksize,
rest=start,
callback=callback,
)
except TransferDone:
try:
# stop transfer, we got enough bytes for this block
self.fs.ftp.abort()
self.fs.ftp.getmultiline()
except Error:
self.fs._connect()
return b"".join(out)
def _upload_chunk(self, final=False):
self.buffer.seek(0)
self.fs.ftp.storbinary(
f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset
)
return True
def _mlsd2(ftp, path="."):
"""
Fall back to using `dir` instead of `mlsd` if not supported.
This parses a Linux style `ls -l` response to `dir`, but the response may
be platform dependent.
Parameters
----------
ftp: ftplib.FTP
path: str
        Expects to be given a path, but defaults to ".".
"""
lines = []
minfo = []
ftp.dir(path, lines.append)
for line in lines:
split_line = line.split()
if len(split_line) < 9:
continue
this = (
split_line[-1],
{
"modify": " ".join(split_line[5:8]),
"unix.owner": split_line[2],
"unix.group": split_line[3],
"unix.mode": split_line[0],
"size": split_line[4],
},
)
if this[1]["unix.mode"][0] == "d":
this[1]["type"] = "dir"
else:
this[1]["type"] = "file"
minfo.append(this)
return minfo
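A hedged connection sketch (not part of the committed file; host, credentials and paths below are placeholders):

import fsspec

fs = fsspec.filesystem(
    "ftp",
    host="ftp.example.com",   # placeholder host
    port=21,
    username="user",          # placeholder; omit for anonymous login
    password="secret",        # placeholder
    tls=False,
)
print(fs.ls("/", detail=False))
with fs.open("/incoming/report.csv", "wb", block_size=2**16) as f:  # placeholder path
    f.write(b"col1,col2\n1,2\n")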

View File

@@ -0,0 +1,241 @@
import requests
from ..spec import AbstractFileSystem
from ..utils import infer_storage_options
from .memory import MemoryFile
class GistFileSystem(AbstractFileSystem):
"""
Interface to files in a single GitHub Gist.
Provides read-only access to a gist's files. Gists do not contain
subdirectories, so file listing is straightforward.
Parameters
----------
gist_id: str
The ID of the gist you want to access (the long hex value from the URL).
filenames: list[str] (optional)
If provided, only make a file system representing these files, and do not fetch
the list of all files for this gist.
sha: str (optional)
If provided, fetch a particular revision of the gist. If omitted,
the latest revision is used.
username: str (optional)
GitHub username for authentication.
token: str (optional)
        GitHub personal access token (required if username is given).
timeout: (float, float) or float, optional
Connect and read timeouts for requests (default 60s each).
kwargs: dict
Stored on `self.request_kw` and passed to `requests.get` when fetching Gist
metadata or reading ("opening") a file.
"""
protocol = "gist"
gist_url = "https://api.github.com/gists/{gist_id}"
gist_rev_url = "https://api.github.com/gists/{gist_id}/{sha}"
def __init__(
self,
gist_id,
filenames=None,
sha=None,
username=None,
token=None,
timeout=None,
**kwargs,
):
super().__init__()
self.gist_id = gist_id
self.filenames = filenames
self.sha = sha # revision of the gist (optional)
if username is not None and token is None:
raise ValueError("User auth requires a token")
self.username = username
self.token = token
self.request_kw = kwargs
# Default timeouts to 60s connect/read if none provided
self.timeout = timeout if timeout is not None else (60, 60)
# We use a single-level "directory" cache, because a gist is essentially flat
self.dircache[""] = self._fetch_file_list()
@property
def kw(self):
"""Auth parameters passed to 'requests' if we have username/token."""
kw = {
"headers": {
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
}
}
kw.update(self.request_kw)
if self.username and self.token:
kw["auth"] = (self.username, self.token)
elif self.token:
kw["headers"]["Authorization"] = f"Bearer {self.token}"
return kw
def _fetch_gist_metadata(self):
"""
Fetch the JSON metadata for this gist (possibly for a specific revision).
"""
if self.sha:
url = self.gist_rev_url.format(gist_id=self.gist_id, sha=self.sha)
else:
url = self.gist_url.format(gist_id=self.gist_id)
r = requests.get(url, timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(
f"Gist not found: {self.gist_id}@{self.sha or 'latest'}"
)
r.raise_for_status()
return r.json()
def _fetch_file_list(self):
"""
Returns a list of dicts describing each file in the gist. These get stored
in self.dircache[""].
"""
meta = self._fetch_gist_metadata()
if self.filenames:
available_files = meta.get("files", {})
files = {}
for fn in self.filenames:
if fn not in available_files:
raise FileNotFoundError(fn)
files[fn] = available_files[fn]
else:
files = meta.get("files", {})
out = []
for fname, finfo in files.items():
if finfo is None:
# Occasionally GitHub returns a file entry with null if it was deleted
continue
# Build a directory entry
out.append(
{
"name": fname, # file's name
"type": "file", # gists have no subdirectories
"size": finfo.get("size", 0), # file size in bytes
"raw_url": finfo.get("raw_url"),
}
)
return out
@classmethod
def _strip_protocol(cls, path):
"""
Remove 'gist://' from the path, if present.
"""
# The default infer_storage_options can handle gist://username:token@id/file
# or gist://id/file, but let's ensure we handle a normal usage too.
# We'll just strip the protocol prefix if it exists.
path = infer_storage_options(path).get("path", path)
return path.lstrip("/")
@staticmethod
def _get_kwargs_from_urls(path):
"""
Parse 'gist://' style URLs into GistFileSystem constructor kwargs.
For example:
gist://:TOKEN@<gist_id>/file.txt
gist://username:TOKEN@<gist_id>/file.txt
"""
so = infer_storage_options(path)
out = {}
if "username" in so and so["username"]:
out["username"] = so["username"]
if "password" in so and so["password"]:
out["token"] = so["password"]
if "host" in so and so["host"]:
# We interpret 'host' as the gist ID
out["gist_id"] = so["host"]
# Extract SHA and filename from path
if "path" in so and so["path"]:
path_parts = so["path"].rsplit("/", 2)[-2:]
if len(path_parts) == 2:
if path_parts[0]: # SHA present
out["sha"] = path_parts[0]
if path_parts[1]: # filename also present
out["filenames"] = [path_parts[1]]
return out
def ls(self, path="", detail=False, **kwargs):
"""
List files in the gist. Gists are single-level, so any 'path' is basically
the filename, or empty for all files.
Parameters
----------
path : str, optional
The filename to list. If empty, returns all files in the gist.
detail : bool, default False
If True, return a list of dicts; if False, return a list of filenames.
"""
path = self._strip_protocol(path or "")
# If path is empty, return all
if path == "":
results = self.dircache[""]
else:
# We want just the single file with this name
all_files = self.dircache[""]
results = [f for f in all_files if f["name"] == path]
if not results:
raise FileNotFoundError(path)
if detail:
return results
else:
return sorted(f["name"] for f in results)
def _open(self, path, mode="rb", block_size=None, **kwargs):
"""
Read a single file from the gist.
"""
if mode != "rb":
raise NotImplementedError("GitHub Gist FS is read-only (no write).")
path = self._strip_protocol(path)
# Find the file entry in our dircache
matches = [f for f in self.dircache[""] if f["name"] == path]
if not matches:
raise FileNotFoundError(path)
finfo = matches[0]
raw_url = finfo.get("raw_url")
if not raw_url:
raise FileNotFoundError(f"No raw_url for file: {path}")
r = requests.get(raw_url, timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
return MemoryFile(path, None, r.content)
def cat(self, path, recursive=False, on_error="raise", **kwargs):
"""
Return {path: contents} for the given file or files. If 'recursive' is True,
and path is empty, returns all files in the gist.
"""
paths = self.expand_path(path, recursive=recursive)
out = {}
for p in paths:
try:
with self.open(p, "rb") as f:
out[p] = f.read()
except FileNotFoundError as e:
if on_error == "raise":
raise e
elif on_error == "omit":
pass # skip
else:
out[p] = e
if len(paths) == 1 and paths[0] == path:
return out[path]
return out
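# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). Constructing
# GistFileSystem performs GitHub API calls, so those lines are commented and
# the gist ID / filename / token are placeholders; only the purely local URL
# parsing is executed.
if __name__ == "__main__":
    # fs = GistFileSystem("0123456789abcdef0123456789abcdef")  # placeholder ID
    # print(fs.ls(""))               # list every file in the gist
    # print(fs.cat("README.md"))     # bytes of one file
    print(
        GistFileSystem._get_kwargs_from_urls(
            "gist://user:TOKEN@0123456789abcdef/file.txt"
        )
    )
    # roughly: {'username': 'user', 'token': 'TOKEN',
    #           'gist_id': '0123456789abcdef', 'filenames': ['file.txt']}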

View File

@@ -0,0 +1,114 @@
import os
import pygit2
from fsspec.spec import AbstractFileSystem
from .memory import MemoryFile
class GitFileSystem(AbstractFileSystem):
"""Browse the files of a local git repo at any hash/tag/branch
(experimental backend)
"""
root_marker = ""
cachable = True
def __init__(self, path=None, fo=None, ref=None, **kwargs):
"""
Parameters
----------
path: str (optional)
Local location of the repo (uses current directory if not given).
May be deprecated in favour of ``fo``. When used with a higher
level function such as fsspec.open(), may be of the form
"git://[path-to-repo[:]][ref@]path/to/file" (but the actual
file path should not contain "@" or ":").
fo: str (optional)
Same as ``path``, but passed as part of a chained URL. This one
takes precedence if both are given.
ref: str (optional)
Reference to work with, could be a hash, tag or branch name. Defaults
to current working tree. Note that ``ls`` and ``open`` also take hash,
so this becomes the default for those operations
kwargs
"""
super().__init__(**kwargs)
self.repo = pygit2.Repository(fo or path or os.getcwd())
self.ref = ref or "master"
@classmethod
def _strip_protocol(cls, path):
path = super()._strip_protocol(path).lstrip("/")
if ":" in path:
path = path.split(":", 1)[1]
if "@" in path:
path = path.split("@", 1)[1]
return path.lstrip("/")
def _path_to_object(self, path, ref):
comm, ref = self.repo.resolve_refish(ref or self.ref)
parts = path.split("/")
tree = comm.tree
for part in parts:
if part and isinstance(tree, pygit2.Tree):
if part not in tree:
raise FileNotFoundError(path)
tree = tree[part]
return tree
@staticmethod
def _get_kwargs_from_urls(path):
path = path.removeprefix("git://")
out = {}
if ":" in path:
out["path"], path = path.split(":", 1)
if "@" in path:
out["ref"], path = path.split("@", 1)
return out
@staticmethod
def _object_to_info(obj, path=None):
# obj.name and obj.filemode are None for the root tree!
is_dir = isinstance(obj, pygit2.Tree)
return {
"type": "directory" if is_dir else "file",
"name": (
"/".join([path, obj.name or ""]).lstrip("/") if path else obj.name
),
"hex": str(obj.id),
"mode": "100644" if obj.filemode is None else f"{obj.filemode:o}",
"size": 0 if is_dir else obj.size,
}
def ls(self, path, detail=True, ref=None, **kwargs):
tree = self._path_to_object(self._strip_protocol(path), ref)
return [
GitFileSystem._object_to_info(obj, path)
if detail
else GitFileSystem._object_to_info(obj, path)["name"]
for obj in (tree if isinstance(tree, pygit2.Tree) else [tree])
]
def info(self, path, ref=None, **kwargs):
tree = self._path_to_object(self._strip_protocol(path), ref)
return GitFileSystem._object_to_info(tree, path)
def ukey(self, path, ref=None):
return self.info(path, ref=ref)["hex"]
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
ref=None,
**kwargs,
):
obj = self._path_to_object(path, ref or self.ref)
return MemoryFile(data=obj.data)
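# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). Opening the
# filesystem needs a local git repository, so those lines are commented and
# the repo path / ref are placeholders; the URL parsing below is local and
# side-effect free.
if __name__ == "__main__":
    # fs = GitFileSystem(path="/path/to/repo", ref="HEAD")  # placeholder path
    # print(fs.ls(""))                  # entries of the root tree at that ref
    # print(fs.open("README.md").read())
    print(GitFileSystem._get_kwargs_from_urls("git:///path/to/repo:v1.0@dir/file"))
    # roughly: {'path': '/path/to/repo', 'ref': 'v1.0'}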

View File

@@ -0,0 +1,333 @@
import base64
import re
import requests
from ..spec import AbstractFileSystem
from ..utils import infer_storage_options
from .memory import MemoryFile
class GithubFileSystem(AbstractFileSystem):
"""Interface to files in github
An instance of this class provides the files residing within a remote github
repository. You may specify a point in the repo's history, by SHA, branch
or tag (default is current master).
For files less than 1 MB in size, file content is returned directly in a
MemoryFile. For larger files, or for files tracked by git-lfs, file content
is returned as an HTTPFile wrapping the ``download_url`` provided by the
GitHub API.
When using fsspec.open, allows URIs of the form:
- "github://path/file", in which case you must specify org, repo and
may specify sha in the extra args
- 'github://org:repo@/precip/catalog.yml', where the org and repo are
part of the URI
- 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
``sha`` can be the full or abbreviated hex of the commit you want to fetch
from, or a branch or tag name (so long as it doesn't contain special characters
like "/", "?", which would have to be HTTP-encoded).
For authorised access, you must provide username and token, which can be made
at https://github.com/settings/tokens
"""
url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
protocol = "github"
timeout = (60, 60) # connect, read timeouts
def __init__(
self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
):
super().__init__(**kwargs)
self.org = org
self.repo = repo
if (username is None) ^ (token is None):
raise ValueError("Auth required both username and token")
self.username = username
self.token = token
if timeout is not None:
self.timeout = timeout
if sha is None:
# look up default branch (not necessarily "master")
u = "https://api.github.com/repos/{org}/{repo}"
r = requests.get(
u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
)
r.raise_for_status()
sha = r.json()["default_branch"]
self.root = sha
self.ls("")
try:
from .http import HTTPFileSystem
self.http_fs = HTTPFileSystem(**kwargs)
except ImportError:
self.http_fs = None
@property
def kw(self):
if self.username:
return {"auth": (self.username, self.token)}
return {}
@classmethod
def repos(cls, org_or_user, is_org=True):
"""List repo names for given org or user
This may become the top level of the FS
Parameters
----------
org_or_user: str
Name of the github org or user to query
is_org: bool (default True)
Whether the name is an organisation (True) or user (False)
Returns
-------
List of string
"""
r = requests.get(
f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
timeout=cls.timeout,
)
r.raise_for_status()
return [repo["name"] for repo in r.json()]
@property
def tags(self):
"""Names of tags in the repo"""
r = requests.get(
f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
timeout=self.timeout,
**self.kw,
)
r.raise_for_status()
return [t["name"] for t in r.json()]
@property
def branches(self):
"""Names of branches in the repo"""
r = requests.get(
f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
timeout=self.timeout,
**self.kw,
)
r.raise_for_status()
return [t["name"] for t in r.json()]
@property
def refs(self):
"""Named references, tags and branches"""
return {"tags": self.tags, "branches": self.branches}
def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
"""List files at given path
Parameters
----------
path: str
Location to list, relative to repo root
detail: bool
If True, returns list of dicts, one per file; if False, returns
list of full filenames only
sha: str (optional)
List at the given point in the repo history, branch or tag name or commit
SHA
_sha: str (optional)
List this specific tree object (used internally to descend into trees)
"""
path = self._strip_protocol(path)
if path == "":
_sha = sha or self.root
if _sha is None:
parts = path.rstrip("/").split("/")
so_far = ""
_sha = sha or self.root
for part in parts:
out = self.ls(so_far, True, sha=sha, _sha=_sha)
so_far += "/" + part if so_far else part
out = [o for o in out if o["name"] == so_far]
if not out:
raise FileNotFoundError(path)
out = out[0]
if out["type"] == "file":
if detail:
return [out]
else:
return path
_sha = out["sha"]
if path not in self.dircache or sha not in [self.root, None]:
r = requests.get(
self.url.format(org=self.org, repo=self.repo, sha=_sha),
timeout=self.timeout,
**self.kw,
)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
types = {"blob": "file", "tree": "directory"}
out = [
{
"name": path + "/" + f["path"] if path else f["path"],
"mode": f["mode"],
"type": types[f["type"]],
"size": f.get("size", 0),
"sha": f["sha"],
}
for f in r.json()["tree"]
if f["type"] in types
]
if sha in [self.root, None]:
self.dircache[path] = out
else:
out = self.dircache[path]
if detail:
return out
else:
return sorted([f["name"] for f in out])
def invalidate_cache(self, path=None):
self.dircache.clear()
@classmethod
def _strip_protocol(cls, path):
opts = infer_storage_options(path)
if "username" not in opts:
return super()._strip_protocol(path)
return opts["path"].lstrip("/")
@staticmethod
def _get_kwargs_from_urls(path):
opts = infer_storage_options(path)
if "username" not in opts:
return {}
out = {"org": opts["username"], "repo": opts["password"]}
if opts["host"]:
out["sha"] = opts["host"]
return out
def _open(
self,
path,
mode="rb",
block_size=None,
cache_options=None,
sha=None,
**kwargs,
):
if mode != "rb":
raise NotImplementedError
# construct a url to hit the GitHub API's repo contents API
url = self.content_url.format(
org=self.org, repo=self.repo, path=path, sha=sha or self.root
)
# make a request to this API, and parse the response as JSON
r = requests.get(url, timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
content_json = r.json()
# if the response's content key is not empty, try to parse it as base64
if content_json["content"]:
content = base64.b64decode(content_json["content"])
# as long as the content does not start with the string
# "version https://git-lfs.github.com/"
# then it is probably not a git-lfs pointer and we can just return
# the content directly
if not content.startswith(b"version https://git-lfs.github.com/"):
return MemoryFile(None, None, content)
# we land here if the content was not present in the first response
# (regular file over 1MB or git-lfs tracked file)
# in this case, we let the HTTPFileSystem handle the download
if self.http_fs is None:
raise ImportError(
"Please install fsspec[http] to access github files >1 MB "
"or git-lfs tracked files."
)
return self.http_fs.open(
content_json["download_url"],
mode=mode,
block_size=block_size,
cache_options=cache_options,
**kwargs,
)
def rm(self, path, recursive=False, maxdepth=None, message=None):
path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
for p in reversed(path):
self.rm_file(p, message=message)
def rm_file(self, path, message=None, **kwargs):
"""
Remove a file from a specified branch using a given commit message.
Since the GitHub DELETE operation requires a branch name, and we can't reliably
determine whether the provided SHA refers to a branch, tag, or commit, we
assume it's a branch. If it's not, the user will encounter an error when
attempting to retrieve the file SHA or delete the file.
Parameters
----------
path: str
The file's location relative to the repository root.
message: str, optional
The commit message for the deletion.
"""
if not self.username:
raise ValueError("Authentication required")
path = self._strip_protocol(path)
# Attempt to get SHA from cache or Github API
sha = self._get_sha_from_cache(path)
if not sha:
url = self.content_url.format(
org=self.org, repo=self.repo, path=path.lstrip("/"), sha=self.root
)
r = requests.get(url, timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
sha = r.json()["sha"]
# Delete the file
delete_url = self.content_url.format(
org=self.org, repo=self.repo, path=path, sha=self.root
)
branch = self.root
data = {
"message": message or f"Delete {path}",
"sha": sha,
**({"branch": branch} if branch else {}),
}
r = requests.delete(delete_url, json=data, timeout=self.timeout, **self.kw)
error_message = r.json().get("message", "")
if re.search(r"Branch .+ not found", error_message):
error = "Remove only works when the filesystem is initialised from a branch or default (None)"
raise ValueError(error)
r.raise_for_status()
self.invalidate_cache(path)
def _get_sha_from_cache(self, path):
for entries in self.dircache.values():
for entry in entries:
entry_path = entry.get("name")
if entry_path and entry_path == path and "sha" in entry:
return entry["sha"]
return None
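# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). Constructing
# GithubFileSystem hits the GitHub API, so those lines are commented and the
# org / repo / ref below are placeholders; URL parsing alone is local.
if __name__ == "__main__":
    # fs = GithubFileSystem(org="some-org", repo="some-repo")  # placeholders
    # print(fs.ls(""))                # top-level tree of the default branch
    # print(fs.cat("README.md"))
    print(
        GithubFileSystem._get_kwargs_from_urls("github://org:repo@main/path/file.txt")
    )
    # roughly: {'org': 'org', 'repo': 'repo', 'sha': 'main'}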

View File

@@ -0,0 +1,891 @@
import asyncio
import io
import logging
import re
import weakref
from copy import copy
from urllib.parse import urlparse
import aiohttp
import yarl
from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
from fsspec.callbacks import DEFAULT_CALLBACK
from fsspec.exceptions import FSTimeoutError
from fsspec.spec import AbstractBufferedFile
from fsspec.utils import (
DEFAULT_BLOCK_SIZE,
glob_translate,
isfilelike,
nullcontext,
tokenize,
)
from ..caching import AllBytes
# https://stackoverflow.com/a/15926317/3821154
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
logger = logging.getLogger("fsspec.http")
async def get_client(**kwargs):
return aiohttp.ClientSession(**kwargs)
class HTTPFileSystem(AsyncFileSystem):
"""
Simple File-System for fetching data via HTTP(S)
``ls()`` is implemented by loading the parent page and doing a regex
match on the result. If simple_link=True, anything of the form
"http(s)://server.com/stuff?thing=other"; otherwise only links within
HTML href tags will be used.
"""
protocol = ("http", "https")
sep = "/"
def __init__(
self,
simple_links=True,
block_size=None,
same_scheme=True,
size_policy=None,
cache_type="bytes",
cache_options=None,
asynchronous=False,
loop=None,
client_kwargs=None,
get_client=get_client,
encoded=False,
**storage_options,
):
"""
NB: if this is called async, you must await set_session
Parameters
----------
block_size: int
Blocks to read bytes; if 0, will default to raw requests file-like
objects instead of HTTPFile instances
simple_links: bool
If True, will consider both HTML <a> tags and anything that looks
like a URL; if False, will consider only the former.
same_scheme: bool (default True)
When doing ls/glob, if this is True, only consider paths that have
http/https matching the input URLs.
size_policy: this argument is deprecated
client_kwargs: dict
Passed to aiohttp.ClientSession, see
https://docs.aiohttp.org/en/stable/client_reference.html
For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
get_client: Callable[..., aiohttp.ClientSession]
A callable, which takes keyword arguments and constructs
an aiohttp.ClientSession. Its state will be managed by
the HTTPFileSystem class.
storage_options: key-value
Any other parameters passed on to requests
cache_type, cache_options: defaults used in open()
"""
super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
self.simple_links = simple_links
self.same_schema = same_scheme
self.cache_type = cache_type
self.cache_options = cache_options
self.client_kwargs = client_kwargs or {}
self.get_client = get_client
self.encoded = encoded
self.kwargs = storage_options
self._session = None
# Clean caching-related parameters from `storage_options`
# before propagating them as `request_options` through `self.kwargs`.
# TODO: Maybe rename `self.kwargs` to `self.request_options` to make
# it clearer.
request_options = copy(storage_options)
self.use_listings_cache = request_options.pop("use_listings_cache", False)
request_options.pop("listings_expiry_time", None)
request_options.pop("max_paths", None)
request_options.pop("skip_instance_cache", None)
self.kwargs = request_options
@property
def fsid(self):
return "http"
def encode_url(self, url):
return yarl.URL(url, encoded=self.encoded)
@staticmethod
def close_session(loop, session):
if loop is not None and loop.is_running():
try:
sync(loop, session.close, timeout=0.1)
return
except (TimeoutError, FSTimeoutError, NotImplementedError):
pass
connector = getattr(session, "_connector", None)
if connector is not None:
# close after loop is dead
connector._close()
async def set_session(self):
if self._session is None:
self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
if not self.asynchronous:
weakref.finalize(self, self.close_session, self.loop, self._session)
return self._session
@classmethod
def _strip_protocol(cls, path):
"""For HTTP, we always want to keep the full URL"""
return path
@classmethod
def _parent(cls, path):
# override, since _strip_protocol is different for URLs
par = super()._parent(path)
if len(par) > 7: # "http://..."
return par
return ""
async def _ls_real(self, url, detail=True, **kwargs):
# ignoring URL-encoded arguments
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(url)
session = await self.set_session()
async with session.get(self.encode_url(url), **self.kwargs) as r:
self._raise_not_found_for_status(r, url)
if "Content-Type" in r.headers:
mimetype = r.headers["Content-Type"].partition(";")[0]
else:
mimetype = None
if mimetype in ("text/html", None):
try:
text = await r.text(errors="ignore")
if self.simple_links:
links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
else:
links = [u[2] for u in ex.findall(text)]
except UnicodeDecodeError:
links = [] # binary, not HTML
else:
links = []
out = set()
parts = urlparse(url)
for l in links:
if isinstance(l, tuple):
l = l[1]
if l.startswith("/") and len(l) > 1:
# absolute URL on this server
l = f"{parts.scheme}://{parts.netloc}{l}"
if l.startswith("http"):
if self.same_schema and l.startswith(url.rstrip("/") + "/"):
out.add(l)
elif l.replace("https", "http").startswith(
url.replace("https", "http").rstrip("/") + "/"
):
# allowed to cross http <-> https
out.add(l)
else:
if l not in ["..", "../"]:
# Ignore FTP-like "parent"
out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
if not out and url.endswith("/"):
out = await self._ls_real(url.rstrip("/"), detail=False)
if detail:
return [
{
"name": u,
"size": None,
"type": "directory" if u.endswith("/") else "file",
}
for u in out
]
else:
return sorted(out)
async def _ls(self, url, detail=True, **kwargs):
if self.use_listings_cache and url in self.dircache:
out = self.dircache[url]
else:
out = await self._ls_real(url, detail=detail, **kwargs)
self.dircache[url] = out
return out
ls = sync_wrapper(_ls)
def _raise_not_found_for_status(self, response, url):
"""
Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
"""
if response.status == 404:
raise FileNotFoundError(url)
response.raise_for_status()
async def _cat_file(self, url, start=None, end=None, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(url)
if start is not None or end is not None:
if start == end:
return b""
headers = kw.pop("headers", {}).copy()
headers["Range"] = await self._process_limits(url, start, end)
kw["headers"] = headers
session = await self.set_session()
async with session.get(self.encode_url(url), **kw) as r:
out = await r.read()
self._raise_not_found_for_status(r, url)
return out
async def _get_file(
self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs
):
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(rpath)
session = await self.set_session()
async with session.get(self.encode_url(rpath), **kw) as r:
try:
size = int(r.headers["content-length"])
except (ValueError, KeyError):
size = None
callback.set_size(size)
self._raise_not_found_for_status(r, rpath)
if isfilelike(lpath):
outfile = lpath
else:
outfile = open(lpath, "wb") # noqa: ASYNC230
try:
chunk = True
while chunk:
chunk = await r.content.read(chunk_size)
outfile.write(chunk)
callback.relative_update(len(chunk))
finally:
if not isfilelike(lpath):
outfile.close()
async def _put_file(
self,
lpath,
rpath,
chunk_size=5 * 2**20,
callback=DEFAULT_CALLBACK,
method="post",
mode="overwrite",
**kwargs,
):
if mode != "overwrite":
raise NotImplementedError("Exclusive write")
async def gen_chunks():
# Support passing arbitrary file-like objects
# and use them instead of streams.
if isinstance(lpath, io.IOBase):
context = nullcontext(lpath)
use_seek = False # might not support seeking
else:
context = open(lpath, "rb") # noqa: ASYNC230
use_seek = True
with context as f:
if use_seek:
callback.set_size(f.seek(0, 2))
f.seek(0)
else:
callback.set_size(getattr(f, "size", None))
chunk = f.read(chunk_size)
while chunk:
yield chunk
callback.relative_update(len(chunk))
chunk = f.read(chunk_size)
kw = self.kwargs.copy()
kw.update(kwargs)
session = await self.set_session()
method = method.lower()
if method not in ("post", "put"):
raise ValueError(
f"method has to be either 'post' or 'put', not: {method!r}"
)
meth = getattr(session, method)
async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
self._raise_not_found_for_status(resp, rpath)
async def _exists(self, path, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
try:
logger.debug(path)
session = await self.set_session()
r = await session.get(self.encode_url(path), **kw)
async with r:
return r.status < 400
except aiohttp.ClientError:
return False
async def _isfile(self, path, **kwargs):
return await self._exists(path, **kwargs)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=None, # XXX: This differs from the base class.
cache_type=None,
cache_options=None,
size=None,
**kwargs,
):
"""Make a file-like object
Parameters
----------
path: str
Full URL with protocol
mode: string
must be "rb"
block_size: int or None
Bytes to download in one request; use instance value if None. If
zero, will return a streaming Requests file-like instance.
kwargs: key-value
Any other parameters, passed to requests calls
"""
if mode != "rb":
raise NotImplementedError
block_size = block_size if block_size is not None else self.block_size
kw = self.kwargs.copy()
kw["asynchronous"] = self.asynchronous
kw.update(kwargs)
info = {}
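        # Note: dict.update() returns None, so when no explicit size is given
        # the "or" chain below populates `info` via self.info() and then falls
        # through to the freshly fetched info["size"].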
size = size or info.update(self.info(path, **kwargs)) or info["size"]
session = sync(self.loop, self.set_session)
if block_size and size and info.get("partial", True):
return HTTPFile(
self,
path,
session=session,
block_size=block_size,
mode=mode,
size=size,
cache_type=cache_type or self.cache_type,
cache_options=cache_options or self.cache_options,
loop=self.loop,
**kw,
)
else:
return HTTPStreamFile(
self,
path,
mode=mode,
loop=self.loop,
session=session,
**kw,
)
async def open_async(self, path, mode="rb", size=None, **kwargs):
session = await self.set_session()
if size is None:
try:
size = (await self._info(path, **kwargs))["size"]
except FileNotFoundError:
pass
return AsyncStreamFile(
self,
path,
loop=self.loop,
session=session,
size=size,
**kwargs,
)
def ukey(self, url):
"""Unique identifier; assume HTTP files are static, unchanging"""
return tokenize(url, self.kwargs, self.protocol)
async def _info(self, url, **kwargs):
"""Get info of URL
Tries to access location via HEAD, and then GET methods, but does
not fetch the data.
It is possible that the server does not supply any size information, in
which case size will be given as None (and certain operations on the
corresponding file will not work).
"""
info = {}
session = await self.set_session()
for policy in ["head", "get"]:
try:
info.update(
await _file_info(
self.encode_url(url),
size_policy=policy,
session=session,
**self.kwargs,
**kwargs,
)
)
if info.get("size") is not None:
break
except Exception as exc:
if policy == "get":
# If get failed, then raise a FileNotFoundError
raise FileNotFoundError(url) from exc
logger.debug("", exc_info=exc)
return {"name": url, "size": None, **info, "type": "file"}
async def _glob(self, path, maxdepth=None, **kwargs):
"""
Find files by glob-matching.
This implementation is identical to the one in AbstractFileSystem,
but "?" is not treated as a globbing character, because it is so
common in URLs, often identifying the "query" part.
"""
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")
import re
ends_with_slash = path.endswith("/") # _strip_protocol strips trailing slash
path = self._strip_protocol(path)
append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*"))
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
min_idx = min(idx_star, idx_brace)
detail = kwargs.pop("detail", False)
if not has_magic(path):
if await self._exists(path, **kwargs):
if not detail:
return [path]
else:
return {path: await self._info(path, **kwargs)}
else:
if not detail:
return [] # glob of non-existent returns empty
else:
return {}
elif "/" in path[:min_idx]:
min_idx = path[:min_idx].rindex("/")
root = path[: min_idx + 1]
depth = path[min_idx + 1 :].count("/") + 1
else:
root = ""
depth = path[min_idx + 1 :].count("/") + 1
if "**" in path:
if maxdepth is not None:
idx_double_stars = path.find("**")
depth_double_stars = path[idx_double_stars:].count("/") + 1
depth = depth - depth_double_stars + maxdepth
else:
depth = None
allpaths = await self._find(
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
)
pattern = glob_translate(path + ("/" if ends_with_slash else ""))
pattern = re.compile(pattern)
out = {
(
p.rstrip("/")
if not append_slash_to_dirname
and info["type"] == "directory"
and p.endswith("/")
else p
): info
for p, info in sorted(allpaths.items())
if pattern.match(p.rstrip("/"))
}
if detail:
return out
else:
return list(out)
async def _isdir(self, path):
# override, since all URLs are (also) files
try:
return bool(await self._ls(path))
except (FileNotFoundError, ValueError):
return False
async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
"""
Write bytes to a remote file over HTTP.
Parameters
----------
path : str
Target URL where the data should be written
value : bytes
Data to be written
mode : str
How to write to the file - 'overwrite' or 'append'
**kwargs : dict
Additional parameters to pass to the HTTP request
"""
url = self._strip_protocol(path)
headers = kwargs.pop("headers", {})
headers["Content-Length"] = str(len(value))
session = await self.set_session()
async with session.put(url, data=value, headers=headers, **kwargs) as r:
r.raise_for_status()
class HTTPFile(AbstractBufferedFile):
"""
A file-like object pointing to a remote HTTP(S) resource
Supports only reading, with read-ahead of a predetermined block-size.
In the case that the server does not supply the filesize, only reading of
the complete file in one go is supported.
Parameters
----------
url: str
Full URL of the remote resource, including the protocol
session: aiohttp.ClientSession or None
All calls will be made within this session, to avoid restarting
connections where the server allows this
block_size: int or None
The amount of read-ahead to do, in bytes. Default is 5MB, or the value
configured for the FileSystem creating this file
size: None or int
If given, this is the size of the file in bytes, and we don't attempt
to call the server to find the value.
kwargs: all other key-values are passed to requests calls.
"""
def __init__(
self,
fs,
url,
session=None,
block_size=None,
mode="rb",
cache_type="bytes",
cache_options=None,
size=None,
loop=None,
asynchronous=False,
**kwargs,
):
if mode != "rb":
raise NotImplementedError("File mode not supported")
self.asynchronous = asynchronous
self.loop = loop
self.url = url
self.session = session
self.details = {"name": url, "size": size, "type": "file"}
super().__init__(
fs=fs,
path=url,
mode=mode,
block_size=block_size,
cache_type=cache_type,
cache_options=cache_options,
**kwargs,
)
def read(self, length=-1):
"""Read bytes from file
Parameters
----------
length: int
Read up to this many bytes. If negative, read all content to end of
file. If the server has not supplied the filesize, attempting to
read only part of the data will raise a ValueError.
"""
if (
(length < 0 and self.loc == 0) # explicit read all
# but not when the size is known and fits into a block anyways
and not (self.size is not None and self.size <= self.blocksize)
):
self._fetch_all()
if self.size is None:
if length < 0:
self._fetch_all()
else:
length = min(self.size - self.loc, length)
return super().read(length)
async def async_fetch_all(self):
"""Read whole file in one shot, without caching
This is only called when position is still at zero,
and read() is called without a byte-count.
"""
logger.debug(f"Fetch all for {self}")
if not isinstance(self.cache, AllBytes):
r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs)
async with r:
r.raise_for_status()
out = await r.read()
self.cache = AllBytes(
size=len(out), fetcher=None, blocksize=None, data=out
)
self.size = len(out)
_fetch_all = sync_wrapper(async_fetch_all)
def _parse_content_range(self, headers):
"""Parse the Content-Range header"""
s = headers.get("Content-Range", "")
m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
if not m:
return None, None, None
if m[1] == "*":
start = end = None
else:
start, end = [int(x) for x in m[1].split("-")]
total = None if m[2] == "*" else int(m[2])
return start, end, total
async def async_fetch_range(self, start, end):
"""Download a block of data
The expectation is that the server returns only the requested bytes,
with HTTP code 206. If this is not the case, we first check the headers,
and then stream the output - if the data size is bigger than we
requested, an exception is raised.
"""
logger.debug(f"Fetch range for {self}: {start}-{end}")
kwargs = self.kwargs.copy()
headers = kwargs.pop("headers", {}).copy()
headers["Range"] = f"bytes={start}-{end - 1}"
logger.debug(f"{self.url} : {headers['Range']}")
r = await self.session.get(
self.fs.encode_url(self.url), headers=headers, **kwargs
)
async with r:
if r.status == 416:
# range request outside file
return b""
r.raise_for_status()
# If the server has handled the range request, it should reply
# with status 206 (partial content). But we'll guess that a suitable
# Content-Range header or a Content-Length no more than the
# requested range also mean we have got the desired range.
response_is_range = (
r.status == 206
or self._parse_content_range(r.headers)[0] == start
or int(r.headers.get("Content-Length", end + 1)) <= end - start
)
if response_is_range:
# partial content, as expected
out = await r.read()
elif start > 0:
raise ValueError(
"The HTTP server doesn't appear to support range requests. "
"Only reading this file from the beginning is supported. "
"Open with block_size=0 for a streaming file interface."
)
else:
# Response is not a range, but we want the start of the file,
# so we can read the required amount anyway.
cl = 0
out = []
while True:
chunk = await r.content.read(2**20)
# data size unknown, let's read until we have enough
if chunk:
out.append(chunk)
cl += len(chunk)
if cl > end - start:
break
else:
break
out = b"".join(out)[: end - start]
return out
_fetch_range = sync_wrapper(async_fetch_range)
magic_check = re.compile("([*[])")
def has_magic(s):
match = magic_check.search(s)
return match is not None
class HTTPStreamFile(AbstractBufferedFile):
def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
self.asynchronous = kwargs.pop("asynchronous", False)
self.url = url
self.loop = loop
self.session = session
if mode != "rb":
raise ValueError
self.details = {"name": url, "size": None}
super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)
async def cor():
r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__()
self.fs._raise_not_found_for_status(r, url)
return r
self.r = sync(self.loop, cor)
self.loop = fs.loop
def seek(self, loc, whence=0):
if loc == 0 and whence == 1:
return
if loc == self.loc and whence == 0:
return
raise ValueError("Cannot seek streaming HTTP file")
async def _read(self, num=-1):
out = await self.r.content.read(num)
self.loc += len(out)
return out
read = sync_wrapper(_read)
async def _close(self):
self.r.close()
def close(self):
asyncio.run_coroutine_threadsafe(self._close(), self.loop)
super().close()
class AsyncStreamFile(AbstractAsyncStreamedFile):
def __init__(
self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
):
self.url = url
self.session = session
self.r = None
if mode != "rb":
raise ValueError
self.details = {"name": url, "size": None}
self.kwargs = kwargs
super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
self.size = size
async def read(self, num=-1):
if self.r is None:
r = await self.session.get(
self.fs.encode_url(self.url), **self.kwargs
).__aenter__()
self.fs._raise_not_found_for_status(r, self.url)
self.r = r
out = await self.r.content.read(num)
self.loc += len(out)
return out
async def close(self):
if self.r is not None:
self.r.close()
self.r = None
await super().close()
async def get_range(session, url, start, end, file=None, **kwargs):
# explicit get a range when we know it must be safe
kwargs = kwargs.copy()
headers = kwargs.pop("headers", {}).copy()
headers["Range"] = f"bytes={start}-{end - 1}"
r = await session.get(url, headers=headers, **kwargs)
r.raise_for_status()
async with r:
out = await r.read()
if file:
with open(file, "r+b") as f: # noqa: ASYNC230
f.seek(start)
f.write(out)
else:
return out
async def _file_info(url, session, size_policy="head", **kwargs):
"""Call HEAD on the server to get details about the file (size/checksum etc.)
Default operation is to explicitly allow redirects and use encoding
'identity' (no compression) to get the true size of the target.
"""
logger.debug("Retrieve file size for %s", url)
kwargs = kwargs.copy()
ar = kwargs.pop("allow_redirects", True)
head = kwargs.get("headers", {}).copy()
head["Accept-Encoding"] = "identity"
kwargs["headers"] = head
info = {}
if size_policy == "head":
r = await session.head(url, allow_redirects=ar, **kwargs)
elif size_policy == "get":
r = await session.get(url, allow_redirects=ar, **kwargs)
else:
raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
async with r:
r.raise_for_status()
if "Content-Length" in r.headers:
# Some servers may choose to ignore Accept-Encoding and return
# compressed content, in which case the returned size is unreliable.
if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
"identity",
"",
]:
info["size"] = int(r.headers["Content-Length"])
elif "Content-Range" in r.headers:
info["size"] = int(r.headers["Content-Range"].split("/")[1])
if "Content-Type" in r.headers:
info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
if r.headers.get("Accept-Ranges") == "none":
# Some servers may explicitly discourage partial content requests, but
# the lack of "Accept-Ranges" does not always indicate they would fail
info["partial"] = False
info["url"] = str(r.url)
for checksum_field in ["ETag", "Content-MD5", "Digest", "Last-Modified"]:
if r.headers.get(checksum_field):
info[checksum_field] = r.headers[checksum_field]
return info
async def _file_size(url, session=None, *args, **kwargs):
if session is None:
session = await get_client()
info = await _file_info(url, session=session, *args, **kwargs)
return info.get("size")
file_size = sync_wrapper(_file_size)
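# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module). The link-extraction
# regexes used by ls() are purely local, so they can be exercised on a made-up
# HTML snippet; the filesystem calls themselves need network access and are
# left commented with placeholder URLs.
if __name__ == "__main__":
    # fs = HTTPFileSystem()
    # info = fs.info("https://example.com/data.bin")   # HEAD, then GET fallback
    # with fs.open("https://example.com/data.bin", block_size=2**20) as f:
    #     header = f.read(1024)                        # served via a Range request
    html = '<a href="file1.csv">one</a> see also http://example.com/file2.csv'
    print([m[2] for m in ex.findall(html)])  # ['file1.csv']
    print(ex2.findall(html))                 # ['http://example.com/file2.csv']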

View File

@@ -0,0 +1,931 @@
"""This file is largely copied from http.py"""
import io
import logging
import re
import urllib.error
import urllib.parse
from copy import copy
from json import dumps, loads
from urllib.parse import urlparse
try:
import yarl
except (ImportError, ModuleNotFoundError, OSError):
yarl = False
from fsspec.callbacks import _DEFAULT_CALLBACK
from fsspec.registry import register_implementation
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
from fsspec.utils import DEFAULT_BLOCK_SIZE, isfilelike, nullcontext, tokenize
from ..caching import AllBytes
# https://stackoverflow.com/a/15926317/3821154
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
logger = logging.getLogger("fsspec.http")
class JsHttpException(urllib.error.HTTPError): ...
class StreamIO(io.BytesIO):
# fake class, so you can set attributes on it
# will eventually actually stream
...
class ResponseProxy:
"""Looks like a requests response"""
def __init__(self, req, stream=False):
self.request = req
self.stream = stream
self._data = None
self._headers = None
@property
def raw(self):
if self._data is None:
b = self.request.response.to_bytes()
if self.stream:
self._data = StreamIO(b)
else:
self._data = b
return self._data
def close(self):
if hasattr(self, "_data"):
del self._data
@property
def headers(self):
if self._headers is None:
self._headers = dict(
[
_.split(": ")
for _ in self.request.getAllResponseHeaders().strip().split("\r\n")
]
)
return self._headers
@property
def status_code(self):
return int(self.request.status)
def raise_for_status(self):
if not self.ok:
raise JsHttpException(
self.url, self.status_code, self.reason, self.headers, None
)
def iter_content(self, chunksize, *_, **__):
while True:
out = self.raw.read(chunksize)
if out:
yield out
else:
break
@property
def reason(self):
return self.request.statusText
@property
def ok(self):
return self.status_code < 400
@property
def url(self):
return self.request.response.responseURL
@property
def text(self):
# TODO: encoding from headers
return self.content.decode()
@property
def content(self):
self.stream = False
return self.raw
def json(self):
return loads(self.text)
class RequestsSessionShim:
def __init__(self):
self.headers = {}
def request(
self,
method,
url,
params=None,
data=None,
headers=None,
cookies=None,
files=None,
auth=None,
timeout=None,
allow_redirects=None,
proxies=None,
hooks=None,
stream=None,
verify=None,
cert=None,
json=None,
):
from js import Blob, XMLHttpRequest
logger.debug("JS request: %s %s", method, url)
if cert or verify or proxies or files or cookies or hooks:
raise NotImplementedError
if data and json:
raise ValueError("Use json= or data=, not both")
req = XMLHttpRequest.new()
extra = auth if auth else ()
if params:
url = f"{url}?{urllib.parse.urlencode(params)}"
req.open(method, url, False, *extra)
if timeout:
req.timeout = timeout
if headers:
for k, v in headers.items():
req.setRequestHeader(k, v)
req.setRequestHeader("Accept", "application/octet-stream")
req.responseType = "arraybuffer"
if json:
blob = Blob.new([dumps(json)], {type: "application/json"})
req.send(blob)
elif data:
if isinstance(data, io.IOBase):
data = data.read()
blob = Blob.new([data], {type: "application/octet-stream"})
req.send(blob)
else:
req.send(None)
return ResponseProxy(req, stream=stream)
def get(self, url, **kwargs):
return self.request("GET", url, **kwargs)
def head(self, url, **kwargs):
return self.request("HEAD", url, **kwargs)
def post(self, url, **kwargs):
return self.request("POST}", url, **kwargs)
def put(self, url, **kwargs):
return self.request("PUT", url, **kwargs)
def patch(self, url, **kwargs):
return self.request("PATCH", url, **kwargs)
def delete(self, url, **kwargs):
return self.request("DELETE", url, **kwargs)
class HTTPFileSystem(AbstractFileSystem):
"""
Simple File-System for fetching data via HTTP(S)
This is the BLOCKING version of the normal HTTPFileSystem. It uses
requests in normal python and the JS runtime in pyodide.
***This implementation is extremely experimental, do not use unless
you are testing pyodide/pyscript integration***
"""
protocol = ("http", "https", "sync-http", "sync-https")
sep = "/"
def __init__(
self,
simple_links=True,
block_size=None,
same_scheme=True,
cache_type="readahead",
cache_options=None,
client_kwargs=None,
encoded=False,
**storage_options,
):
"""
Parameters
----------
block_size: int
Blocks to read bytes; if 0, will default to raw requests file-like
objects instead of HTTPFile instances
simple_links: bool
If True, will consider both HTML <a> tags and anything that looks
like a URL; if False, will consider only the former.
same_scheme: bool (default True)
When doing ls/glob, if this is True, only consider paths that have
http/https matching the input URLs.
size_policy: this argument is deprecated
client_kwargs: dict
Passed to aiohttp.ClientSession, see
https://docs.aiohttp.org/en/stable/client_reference.html
For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
storage_options: key-value
Any other parameters passed on to requests
cache_type, cache_options: defaults used in open
"""
super().__init__(self, **storage_options)
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
self.simple_links = simple_links
self.same_schema = same_scheme
self.cache_type = cache_type
self.cache_options = cache_options
self.client_kwargs = client_kwargs or {}
self.encoded = encoded
self.kwargs = storage_options
try:
import js # noqa: F401
logger.debug("Starting JS session")
self.session = RequestsSessionShim()
self.js = True
except Exception as e:
import requests
logger.debug("Starting cpython session because of: %s", e)
self.session = requests.Session(**(client_kwargs or {}))
self.js = False
request_options = copy(storage_options)
self.use_listings_cache = request_options.pop("use_listings_cache", False)
request_options.pop("listings_expiry_time", None)
request_options.pop("max_paths", None)
request_options.pop("skip_instance_cache", None)
self.kwargs = request_options
@property
def fsid(self):
return "sync-http"
def encode_url(self, url):
if yarl:
return yarl.URL(url, encoded=self.encoded)
return url
@classmethod
def _strip_protocol(cls, path: str) -> str:
"""For HTTP, we always want to keep the full URL"""
path = path.replace("sync-http://", "http://").replace(
"sync-https://", "https://"
)
return path
@classmethod
def _parent(cls, path):
# override, since _strip_protocol is different for URLs
par = super()._parent(path)
if len(par) > 7: # "http://..."
return par
return ""
def _ls_real(self, url, detail=True, **kwargs):
# ignoring URL-encoded arguments
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(url)
r = self.session.get(self.encode_url(url), **self.kwargs)
self._raise_not_found_for_status(r, url)
text = r.text
if self.simple_links:
links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
else:
links = [u[2] for u in ex.findall(text)]
out = set()
parts = urlparse(url)
for l in links:
if isinstance(l, tuple):
l = l[1]
if l.startswith("/") and len(l) > 1:
# absolute URL on this server
l = parts.scheme + "://" + parts.netloc + l
if l.startswith("http"):
if self.same_schema and l.startswith(url.rstrip("/") + "/"):
out.add(l)
elif l.replace("https", "http").startswith(
url.replace("https", "http").rstrip("/") + "/"
):
# allowed to cross http <-> https
out.add(l)
else:
if l not in ["..", "../"]:
# Ignore FTP-like "parent"
out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
if not out and url.endswith("/"):
out = self._ls_real(url.rstrip("/"), detail=False)
if detail:
return [
{
"name": u,
"size": None,
"type": "directory" if u.endswith("/") else "file",
}
for u in out
]
else:
return sorted(out)
def ls(self, url, detail=True, **kwargs):
if self.use_listings_cache and url in self.dircache:
out = self.dircache[url]
else:
out = self._ls_real(url, detail=detail, **kwargs)
self.dircache[url] = out
return out
def _raise_not_found_for_status(self, response, url):
"""
Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
"""
if response.status_code == 404:
raise FileNotFoundError(url)
response.raise_for_status()
def cat_file(self, url, start=None, end=None, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(url)
if start is not None or end is not None:
if start == end:
return b""
headers = kw.pop("headers", {}).copy()
headers["Range"] = self._process_limits(url, start, end)
kw["headers"] = headers
r = self.session.get(self.encode_url(url), **kw)
self._raise_not_found_for_status(r, url)
return r.content
def get_file(
self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
):
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(rpath)
r = self.session.get(self.encode_url(rpath), **kw)
try:
size = int(
r.headers.get("content-length", None)
or r.headers.get("Content-Length", None)
)
except (ValueError, KeyError, TypeError):
size = None
callback.set_size(size)
self._raise_not_found_for_status(r, rpath)
if not isfilelike(lpath):
lpath = open(lpath, "wb")
for chunk in r.iter_content(chunk_size, decode_unicode=False):
lpath.write(chunk)
callback.relative_update(len(chunk))
def put_file(
self,
lpath,
rpath,
chunk_size=5 * 2**20,
callback=_DEFAULT_CALLBACK,
method="post",
**kwargs,
):
def gen_chunks():
# Support passing arbitrary file-like objects
# and use them instead of streams.
if isinstance(lpath, io.IOBase):
context = nullcontext(lpath)
use_seek = False # might not support seeking
else:
context = open(lpath, "rb")
use_seek = True
with context as f:
if use_seek:
callback.set_size(f.seek(0, 2))
f.seek(0)
else:
callback.set_size(getattr(f, "size", None))
chunk = f.read(chunk_size)
while chunk:
yield chunk
callback.relative_update(len(chunk))
chunk = f.read(chunk_size)
kw = self.kwargs.copy()
kw.update(kwargs)
method = method.lower()
if method not in ("post", "put"):
raise ValueError(
f"method has to be either 'post' or 'put', not: {method!r}"
)
meth = getattr(self.session, method)
resp = meth(rpath, data=gen_chunks(), **kw)
self._raise_not_found_for_status(resp, rpath)
def _process_limits(self, url, start, end):
"""Helper for "Range"-based _cat_file"""
size = None
suff = False
if start is not None and start < 0:
# if start is negative and end None, end is the "suffix length"
if end is None:
end = -start
start = ""
suff = True
else:
size = size or self.info(url)["size"]
start = size + start
elif start is None:
start = 0
if not suff:
if end is not None and end < 0:
if start is not None:
size = size or self.info(url)["size"]
end = size + end
elif end is None:
end = ""
if isinstance(end, int):
end -= 1 # bytes range is inclusive
return f"bytes={start}-{end}"
def exists(self, path, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
try:
logger.debug(path)
r = self.session.get(self.encode_url(path), **kw)
return r.status_code < 400
except Exception:
return False
def isfile(self, path, **kwargs):
return self.exists(path, **kwargs)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=None, # XXX: This differs from the base class.
cache_type=None,
cache_options=None,
size=None,
**kwargs,
):
"""Make a file-like object
Parameters
----------
path: str
Full URL with protocol
mode: string
must be "rb"
block_size: int or None
Bytes to download in one request; use instance value if None. If
zero, will return a streaming Requests file-like instance.
kwargs: key-value
Any other parameters, passed to requests calls
"""
if mode != "rb":
raise NotImplementedError
block_size = block_size if block_size is not None else self.block_size
kw = self.kwargs.copy()
kw.update(kwargs)
size = size or self.info(path, **kwargs)["size"]
if block_size and size:
return HTTPFile(
self,
path,
session=self.session,
block_size=block_size,
mode=mode,
size=size,
cache_type=cache_type or self.cache_type,
cache_options=cache_options or self.cache_options,
**kw,
)
else:
return HTTPStreamFile(
self,
path,
mode=mode,
session=self.session,
**kw,
)
def ukey(self, url):
"""Unique identifier; assume HTTP files are static, unchanging"""
return tokenize(url, self.kwargs, self.protocol)
def info(self, url, **kwargs):
"""Get info of URL
Tries to access location via HEAD, and then GET methods, but does
not fetch the data.
It is possible that the server does not supply any size information, in
which case size will be given as None (and certain operations on the
corresponding file will not work).
"""
info = {}
for policy in ["head", "get"]:
try:
info.update(
_file_info(
self.encode_url(url),
size_policy=policy,
session=self.session,
**self.kwargs,
**kwargs,
)
)
if info.get("size") is not None:
break
except Exception as exc:
if policy == "get":
# If get failed, then raise a FileNotFoundError
raise FileNotFoundError(url) from exc
logger.debug(str(exc))
return {"name": url, "size": None, **info, "type": "file"}
def glob(self, path, maxdepth=None, **kwargs):
"""
Find files by glob-matching.
This implementation is identical to the one in AbstractFileSystem,
but "?" is not treated as a globbing character, because it is so
common in URLs, often identifying the "query" part.
"""
import re
ends = path.endswith("/")
path = self._strip_protocol(path)
indstar = path.find("*") if path.find("*") >= 0 else len(path)
indbrace = path.find("[") if path.find("[") >= 0 else len(path)
ind = min(indstar, indbrace)
detail = kwargs.pop("detail", False)
if not has_magic(path):
root = path
depth = 1
if ends:
path += "/*"
elif self.exists(path):
if not detail:
return [path]
else:
return {path: self.info(path)}
else:
if not detail:
return [] # glob of non-existent returns empty
else:
return {}
elif "/" in path[:ind]:
ind2 = path[:ind].rindex("/")
root = path[: ind2 + 1]
depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
else:
root = ""
depth = None if "**" in path else path[ind + 1 :].count("/") + 1
allpaths = self.find(
root, maxdepth=maxdepth or depth, withdirs=True, detail=True, **kwargs
)
# Escape characters special to python regex, leaving our supported
# special characters in place.
# See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
# for shell globbing details.
pattern = (
"^"
+ (
path.replace("\\", r"\\")
.replace(".", r"\.")
.replace("+", r"\+")
.replace("//", "/")
.replace("(", r"\(")
.replace(")", r"\)")
.replace("|", r"\|")
.replace("^", r"\^")
.replace("$", r"\$")
.replace("{", r"\{")
.replace("}", r"\}")
.rstrip("/")
)
+ "$"
)
pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
pattern = re.sub("[*]", "[^/]*", pattern)
pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
out = {
p: allpaths[p]
for p in sorted(allpaths)
if pattern.match(p.replace("//", "/").rstrip("/"))
}
if detail:
return out
else:
return list(out)
def isdir(self, path):
# override, since all URLs are (also) files
try:
return bool(self.ls(path))
except (FileNotFoundError, ValueError):
return False
class HTTPFile(AbstractBufferedFile):
"""
A file-like object pointing to a remote HTTP(S) resource
Supports only reading, with read-ahead of a predetermined block-size.
In the case that the server does not supply the filesize, only reading of
the complete file in one go is supported.
Parameters
----------
url: str
Full URL of the remote resource, including the protocol
session: requests.Session or None
All calls will be made within this session, to avoid restarting
connections where the server allows this
block_size: int or None
The amount of read-ahead to do, in bytes. Default is 5MB, or the value
configured for the FileSystem creating this file
size: None or int
If given, this is the size of the file in bytes, and we don't attempt
to call the server to find the value.
kwargs: all other key-values are passed to requests calls.
"""
def __init__(
self,
fs,
url,
session=None,
block_size=None,
mode="rb",
cache_type="bytes",
cache_options=None,
size=None,
**kwargs,
):
if mode != "rb":
raise NotImplementedError("File mode not supported")
self.url = url
self.session = session
self.details = {"name": url, "size": size, "type": "file"}
super().__init__(
fs=fs,
path=url,
mode=mode,
block_size=block_size,
cache_type=cache_type,
cache_options=cache_options,
**kwargs,
)
def read(self, length=-1):
"""Read bytes from file
Parameters
----------
length: int
Read up to this many bytes. If negative, read all content to end of
file. If the server has not supplied the filesize, attempting to
read only part of the data will raise a ValueError.
"""
if (
(length < 0 and self.loc == 0) # explicit read all
# but not when the size is known and fits into a block anyways
and not (self.size is not None and self.size <= self.blocksize)
):
self._fetch_all()
if self.size is None:
if length < 0:
self._fetch_all()
else:
length = min(self.size - self.loc, length)
return super().read(length)
def _fetch_all(self):
"""Read whole file in one shot, without caching
This is only called when position is still at zero,
and read() is called without a byte-count.
"""
logger.debug(f"Fetch all for {self}")
if not isinstance(self.cache, AllBytes):
r = self.session.get(self.fs.encode_url(self.url), **self.kwargs)
r.raise_for_status()
out = r.content
self.cache = AllBytes(size=len(out), fetcher=None, blocksize=None, data=out)
self.size = len(out)
def _parse_content_range(self, headers):
"""Parse the Content-Range header"""
s = headers.get("Content-Range", "")
m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
if not m:
return None, None, None
if m[1] == "*":
start = end = None
else:
start, end = [int(x) for x in m[1].split("-")]
total = None if m[2] == "*" else int(m[2])
return start, end, total
def _fetch_range(self, start, end):
"""Download a block of data
The expectation is that the server returns only the requested bytes,
with HTTP code 206. If this is not the case, we first check the headers,
and then stream the output - if the data size is bigger than we
requested, an exception is raised.
"""
logger.debug(f"Fetch range for {self}: {start}-{end}")
kwargs = self.kwargs.copy()
headers = kwargs.pop("headers", {}).copy()
headers["Range"] = f"bytes={start}-{end - 1}"
logger.debug("%s : %s", self.url, headers["Range"])
r = self.session.get(self.fs.encode_url(self.url), headers=headers, **kwargs)
if r.status_code == 416:
# range request outside file
return b""
r.raise_for_status()
# If the server has handled the range request, it should reply
# with status 206 (partial content). But we'll guess that a suitable
# Content-Range header or a Content-Length no more than the
# requested range also mean we have got the desired range.
cl = r.headers.get("Content-Length", r.headers.get("content-length", end + 1))
response_is_range = (
r.status_code == 206
or self._parse_content_range(r.headers)[0] == start
or int(cl) <= end - start
)
if response_is_range:
# partial content, as expected
out = r.content
elif start > 0:
raise ValueError(
"The HTTP server doesn't appear to support range requests. "
"Only reading this file from the beginning is supported. "
"Open with block_size=0 for a streaming file interface."
)
else:
# Response is not a range, but we want the start of the file,
# so we can read the required amount anyway.
cl = 0
out = []
for chunk in r.iter_content(2**20, False):
out.append(chunk)
cl += len(chunk)
out = b"".join(out)[: end - start]
return out
magic_check = re.compile("([*[])")
def has_magic(s):
match = magic_check.search(s)
return match is not None
class HTTPStreamFile(AbstractBufferedFile):
def __init__(self, fs, url, mode="rb", session=None, **kwargs):
self.url = url
self.session = session
if mode != "rb":
raise ValueError
self.details = {"name": url, "size": None}
super().__init__(fs=fs, path=url, mode=mode, cache_type="readahead", **kwargs)
r = self.session.get(self.fs.encode_url(url), stream=True, **kwargs)
self.fs._raise_not_found_for_status(r, url)
self.it = r.iter_content(1024, False)
self.leftover = b""
self.r = r
def seek(self, *args, **kwargs):
raise ValueError("Cannot seek streaming HTTP file")
def read(self, num=-1):
bufs = [self.leftover]
leng = len(self.leftover)
while leng < num or num < 0:
try:
out = self.it.__next__()
except StopIteration:
break
if out:
bufs.append(out)
else:
break
leng += len(out)
out = b"".join(bufs)
if num >= 0:
self.leftover = out[num:]
out = out[:num]
else:
self.leftover = b""
self.loc += len(out)
return out
def close(self):
self.r.close()
self.closed = True
def get_range(session, url, start, end, **kwargs):
    # explicitly get a range when we know it must be safe
kwargs = kwargs.copy()
headers = kwargs.pop("headers", {}).copy()
headers["Range"] = f"bytes={start}-{end - 1}"
r = session.get(url, headers=headers, **kwargs)
r.raise_for_status()
return r.content
def _file_info(url, session, size_policy="head", **kwargs):
"""Call HEAD on the server to get details about the file (size/checksum etc.)
Default operation is to explicitly allow redirects and use encoding
'identity' (no compression) to get the true size of the target.
"""
logger.debug("Retrieve file size for %s", url)
kwargs = kwargs.copy()
ar = kwargs.pop("allow_redirects", True)
head = kwargs.get("headers", {}).copy()
# TODO: not allowed in JS
# head["Accept-Encoding"] = "identity"
kwargs["headers"] = head
info = {}
if size_policy == "head":
r = session.head(url, allow_redirects=ar, **kwargs)
elif size_policy == "get":
r = session.get(url, allow_redirects=ar, **kwargs)
else:
raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
r.raise_for_status()
# TODO:
# recognise lack of 'Accept-Ranges',
# or 'Accept-Ranges': 'none' (not 'bytes')
# to mean streaming only, no random access => return None
if "Content-Length" in r.headers:
info["size"] = int(r.headers["Content-Length"])
elif "Content-Range" in r.headers:
info["size"] = int(r.headers["Content-Range"].split("/")[1])
elif "content-length" in r.headers:
info["size"] = int(r.headers["content-length"])
elif "content-range" in r.headers:
info["size"] = int(r.headers["content-range"].split("/")[1])
for checksum_field in ["ETag", "Content-MD5", "Digest"]:
if r.headers.get(checksum_field):
info[checksum_field] = r.headers[checksum_field]
return info
# importing this is enough to register it
def register():
register_implementation("http", HTTPFileSystem, clobber=True)
register_implementation("https", HTTPFileSystem, clobber=True)
register_implementation("sync-http", HTTPFileSystem, clobber=True)
register_implementation("sync-https", HTTPFileSystem, clobber=True)
register()
def unregister():
from fsspec.implementations.http import HTTPFileSystem
register_implementation("http", HTTPFileSystem, clobber=True)
register_implementation("https", HTTPFileSystem, clobber=True)

View File

@@ -0,0 +1,129 @@
import base64
import io
import re
import requests
import fsspec
class JupyterFileSystem(fsspec.AbstractFileSystem):
"""View of the files as seen by a Jupyter server (notebook or lab)"""
protocol = ("jupyter", "jlab")
def __init__(self, url, tok=None, **kwargs):
"""
Parameters
----------
url : str
Base URL of the server, like "http://127.0.0.1:8888". May include
token in the string, which is given by the process when starting up
tok : str
If the token is obtained separately, can be given here
kwargs
"""
if "?" in url:
if tok is None:
try:
tok = re.findall("token=([a-z0-9]+)", url)[0]
except IndexError as e:
raise ValueError("Could not determine token") from e
url = url.split("?", 1)[0]
self.url = url.rstrip("/") + "/api/contents"
self.session = requests.Session()
if tok:
self.session.headers["Authorization"] = f"token {tok}"
super().__init__(**kwargs)
def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
r = self.session.get(f"{self.url}/{path}")
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
out = r.json()
if out["type"] == "directory":
out = out["content"]
else:
out = [out]
for o in out:
o["name"] = o.pop("path")
o.pop("content")
if o["type"] == "notebook":
o["type"] = "file"
if detail:
return out
return [o["name"] for o in out]
def cat_file(self, path, start=None, end=None, **kwargs):
path = self._strip_protocol(path)
r = self.session.get(f"{self.url}/{path}")
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
out = r.json()
if out["format"] == "text":
# data should be binary
b = out["content"].encode()
else:
b = base64.b64decode(out["content"])
return b[start:end]
def pipe_file(self, path, value, **_):
path = self._strip_protocol(path)
json = {
"name": path.rsplit("/", 1)[-1],
"path": path,
"size": len(value),
"content": base64.b64encode(value).decode(),
"format": "base64",
"type": "file",
}
self.session.put(f"{self.url}/{path}", json=json)
def mkdir(self, path, create_parents=True, **kwargs):
path = self._strip_protocol(path)
if create_parents and "/" in path:
self.mkdir(path.rsplit("/", 1)[0], True)
json = {
"name": path.rsplit("/", 1)[-1],
"path": path,
"size": None,
"content": None,
"type": "directory",
}
self.session.put(f"{self.url}/{path}", json=json)
def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
if path1 == path2:
return
self.session.patch(f"{self.url}/{path1}", json={"path": path2})
def _rm(self, path):
path = self._strip_protocol(path)
self.session.delete(f"{self.url}/{path}")
def _open(self, path, mode="rb", **kwargs):
path = self._strip_protocol(path)
if mode == "rb":
data = self.cat_file(path)
return io.BytesIO(data)
else:
return SimpleFileWriter(self, path, mode="wb")
class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
def _upload_chunk(self, final=False):
"""Never uploads a chunk until file is done
Not suitable for large files
"""
if final is False:
return False
self.buffer.seek(0)
data = self.buffer.read()
self.fs.pipe_file(self.path, data)
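
# A minimal usage sketch, assuming a Jupyter server is reachable at the placeholder URL
# below with the (made-up) token; the directory and file names are also illustrative.
import fsspec

jfs = fsspec.filesystem("jupyter", url="http://127.0.0.1:8888", tok="<server-token>")
jfs.mkdir("demo")                                    # created through the contents API
jfs.pipe_file("demo/hello.txt", b"hello from fsspec")
print(jfs.cat_file("demo/hello.txt"))                # b'hello from fsspec'
print(jfs.ls("demo", detail=False))                  # ['demo/hello.txt']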

View File

@@ -0,0 +1,213 @@
from contextlib import contextmanager
from ctypes import (
CFUNCTYPE,
POINTER,
c_int,
c_longlong,
c_void_p,
cast,
create_string_buffer,
)
import libarchive
import libarchive.ffi as ffi
from fsspec import open_files
from fsspec.archive import AbstractArchiveFileSystem
from fsspec.implementations.memory import MemoryFile
from fsspec.utils import DEFAULT_BLOCK_SIZE
# Libarchive requires seekable files or memory only for certain archive
# types. However, since we read the directory first to cache the contents
# and also allow random access to any file, the file-like object needs
# to be seekable no matter what.
# Seek call-backs (not provided in the libarchive python wrapper)
SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
read_set_seek_callback = ffi.ffi(
"read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
)
new_api = hasattr(ffi, "NO_OPEN_CB")
@contextmanager
def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
"""Read an archive from a seekable file-like object.
    The `file` object must support the standard `readinto` and `seek` methods.
"""
buf = create_string_buffer(block_size)
buf_p = cast(buf, c_void_p)
def read_func(archive_p, context, ptrptr):
# readinto the buffer, returns number of bytes read
length = file.readinto(buf)
# write the address of the buffer into the pointer
ptrptr = cast(ptrptr, POINTER(c_void_p))
ptrptr[0] = buf_p
# tell libarchive how much data was written into the buffer
return length
def seek_func(archive_p, context, offset, whence):
file.seek(offset, whence)
        # tell libarchive the current position
return file.tell()
read_cb = ffi.READ_CALLBACK(read_func)
seek_cb = SEEK_CALLBACK(seek_func)
if new_api:
open_cb = ffi.NO_OPEN_CB
close_cb = ffi.NO_CLOSE_CB
else:
open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)
with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
read_set_seek_callback(archive_p, seek_cb)
ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
yield libarchive.read.ArchiveRead(archive_p)
class LibArchiveFileSystem(AbstractArchiveFileSystem):
"""Compressed archives as a file-system (read-only)
Supports the following formats:
    tar, pax, cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar
Microsoft CAB, 7-Zip, WARC
See the libarchive documentation for further restrictions.
https://www.libarchive.org/
    Keeps the file object open while the instance lives. It only works with
    seekable file-like objects. If the source filesystem does not provide such
    objects, it is recommended to cache the archive locally first.
This class is pickleable, but not necessarily thread-safe (depends on the
platform). See libarchive documentation for details.
"""
root_marker = ""
protocol = "libarchive"
cachable = False
def __init__(
self,
fo="",
mode="r",
target_protocol=None,
target_options=None,
block_size=DEFAULT_BLOCK_SIZE,
**kwargs,
):
"""
Parameters
----------
fo: str or file-like
            Contains the archive, and must exist. If a str, will fetch file using
:meth:`~fsspec.open_files`, which must return one file exactly.
mode: str
Currently, only 'r' accepted
target_protocol: str (optional)
If ``fo`` is a string, this value can be used to override the
FS protocol inferred from a URL
target_options: dict (optional)
Kwargs passed when instantiating the target FS, if ``fo`` is
a string.
"""
super().__init__(self, **kwargs)
if mode != "r":
raise ValueError("Only read from archive files accepted")
if isinstance(fo, str):
files = open_files(fo, protocol=target_protocol, **(target_options or {}))
if len(files) != 1:
raise ValueError(
f'Path "{fo}" did not resolve to exactly one file: "{files}"'
)
fo = files[0]
self.of = fo
self.fo = fo.__enter__() # the whole instance is a context
self.block_size = block_size
self.dir_cache = None
@contextmanager
def _open_archive(self):
self.fo.seek(0)
with custom_reader(self.fo, block_size=self.block_size) as arc:
yield arc
@classmethod
def _strip_protocol(cls, path):
# file paths are always relative to the archive root
return super()._strip_protocol(path).lstrip("/")
def _get_dirs(self):
fields = {
"name": "pathname",
"size": "size",
"created": "ctime",
"mode": "mode",
"uid": "uid",
"gid": "gid",
"mtime": "mtime",
}
if self.dir_cache is not None:
return
self.dir_cache = {}
list_names = []
with self._open_archive() as arc:
for entry in arc:
if not entry.isdir and not entry.isfile:
# Skip symbolic links, fifo entries, etc.
continue
self.dir_cache.update(
{
dirname: {"name": dirname, "size": 0, "type": "directory"}
for dirname in self._all_dirnames(set(entry.name))
}
)
f = {key: getattr(entry, fields[key]) for key in fields}
f["type"] = "directory" if entry.isdir else "file"
list_names.append(entry.name)
self.dir_cache[f["name"]] = f
# libarchive does not seem to return an entry for the directories (at least
# not in all formats), so get the directories names from the files names
self.dir_cache.update(
{
dirname: {"name": dirname, "size": 0, "type": "directory"}
for dirname in self._all_dirnames(list_names)
}
)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
path = self._strip_protocol(path)
if mode != "rb":
raise NotImplementedError
data = bytes()
with self._open_archive() as arc:
for entry in arc:
if entry.pathname != path:
continue
if entry.size == 0:
# empty file, so there are no blocks
break
for block in entry.get_blocks(entry.size):
data = block
break
else:
raise ValueError
return MemoryFile(fs=self, path=path, data=data)
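
# A minimal usage sketch, assuming the `libarchive-c` package is installed and that
# "backup.tar" is an existing archive on local disk containing a member named
# "docs/readme.txt"; both names are placeholders.
import fsspec

afs = fsspec.filesystem("libarchive", fo="backup.tar")
print(afs.find(""))                      # list every member of the archive
with afs.open("docs/readme.txt") as f:   # returns an in-memory MemoryFile
    print(f.read())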

View File

@@ -0,0 +1,514 @@
import datetime
import io
import logging
import os
import os.path as osp
import shutil
import stat
import tempfile
from functools import lru_cache
from fsspec import AbstractFileSystem
from fsspec.compression import compr
from fsspec.core import get_compression
from fsspec.utils import isfilelike, stringify_path
logger = logging.getLogger("fsspec.local")
class LocalFileSystem(AbstractFileSystem):
"""Interface to files on local storage
Parameters
----------
auto_mkdir: bool
Whether, when opening a file, the directory containing it should
be created (if it doesn't already exist). This is assumed by pyarrow
code.
"""
root_marker = "/"
protocol = "file", "local"
local_file = True
def __init__(self, auto_mkdir=False, **kwargs):
super().__init__(**kwargs)
self.auto_mkdir = auto_mkdir
@property
def fsid(self):
return "local"
def mkdir(self, path, create_parents=True, **kwargs):
path = self._strip_protocol(path)
if self.exists(path):
raise FileExistsError(path)
if create_parents:
self.makedirs(path, exist_ok=True)
else:
os.mkdir(path, **kwargs)
def makedirs(self, path, exist_ok=False):
path = self._strip_protocol(path)
os.makedirs(path, exist_ok=exist_ok)
def rmdir(self, path):
path = self._strip_protocol(path)
os.rmdir(path)
def ls(self, path, detail=False, **kwargs):
path = self._strip_protocol(path)
path_info = self.info(path)
infos = []
if path_info["type"] == "directory":
with os.scandir(path) as it:
for f in it:
try:
# Only get the info if requested since it is a bit expensive (the stat call inside)
# The strip_protocol is also used in info() and calls make_path_posix to always return posix paths
info = self.info(f) if detail else self._strip_protocol(f.path)
infos.append(info)
except FileNotFoundError:
pass
else:
infos = [path_info] if detail else [path_info["name"]]
return infos
def info(self, path, **kwargs):
if isinstance(path, os.DirEntry):
# scandir DirEntry
out = path.stat(follow_symlinks=False)
link = path.is_symlink()
if path.is_dir(follow_symlinks=False):
t = "directory"
elif path.is_file(follow_symlinks=False):
t = "file"
else:
t = "other"
size = out.st_size
if link:
try:
out2 = path.stat(follow_symlinks=True)
size = out2.st_size
except OSError:
size = 0
path = self._strip_protocol(path.path)
else:
# str or path-like
path = self._strip_protocol(path)
out = os.stat(path, follow_symlinks=False)
link = stat.S_ISLNK(out.st_mode)
if link:
out = os.stat(path, follow_symlinks=True)
size = out.st_size
if stat.S_ISDIR(out.st_mode):
t = "directory"
elif stat.S_ISREG(out.st_mode):
t = "file"
else:
t = "other"
# Check for the 'st_birthtime' attribute, which is not always present; fallback to st_ctime
created_time = getattr(out, "st_birthtime", out.st_ctime)
result = {
"name": path,
"size": size,
"type": t,
"created": created_time,
"islink": link,
}
for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
result[field] = getattr(out, f"st_{field}")
if link:
result["destination"] = os.readlink(path)
return result
def lexists(self, path, **kwargs):
return osp.lexists(path)
def cp_file(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
if self.auto_mkdir:
self.makedirs(self._parent(path2), exist_ok=True)
if self.isfile(path1):
shutil.copyfile(path1, path2)
elif self.isdir(path1):
self.mkdirs(path2, exist_ok=True)
else:
raise FileNotFoundError(path1)
def isfile(self, path):
path = self._strip_protocol(path)
return os.path.isfile(path)
def isdir(self, path):
path = self._strip_protocol(path)
return os.path.isdir(path)
def get_file(self, path1, path2, callback=None, **kwargs):
if isfilelike(path2):
with open(path1, "rb") as f:
shutil.copyfileobj(f, path2)
else:
return self.cp_file(path1, path2, **kwargs)
def put_file(self, path1, path2, callback=None, **kwargs):
return self.cp_file(path1, path2, **kwargs)
def mv(self, path1, path2, recursive: bool = True, **kwargs):
"""Move files/directories
For the specific case of local, all ops on directories are recursive and
the recursive= kwarg is ignored.
"""
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
shutil.move(path1, path2)
def link(self, src, dst, **kwargs):
src = self._strip_protocol(src)
dst = self._strip_protocol(dst)
os.link(src, dst, **kwargs)
def symlink(self, src, dst, **kwargs):
src = self._strip_protocol(src)
dst = self._strip_protocol(dst)
os.symlink(src, dst, **kwargs)
def islink(self, path) -> bool:
return os.path.islink(self._strip_protocol(path))
def rm_file(self, path):
os.remove(self._strip_protocol(path))
def rm(self, path, recursive=False, maxdepth=None):
if not isinstance(path, list):
path = [path]
for p in path:
p = self._strip_protocol(p)
if self.isdir(p):
if not recursive:
raise ValueError("Cannot delete directory, set recursive=True")
if osp.abspath(p) == os.getcwd():
raise ValueError("Cannot delete current working directory")
shutil.rmtree(p)
else:
os.remove(p)
def unstrip_protocol(self, name):
name = self._strip_protocol(name) # normalise for local/win/...
return f"file://{name}"
def _open(self, path, mode="rb", block_size=None, **kwargs):
path = self._strip_protocol(path)
if self.auto_mkdir and "w" in mode:
self.makedirs(self._parent(path), exist_ok=True)
return LocalFileOpener(path, mode, fs=self, **kwargs)
def touch(self, path, truncate=True, **kwargs):
path = self._strip_protocol(path)
if self.auto_mkdir:
self.makedirs(self._parent(path), exist_ok=True)
if self.exists(path):
os.utime(path, None)
else:
open(path, "a").close()
if truncate:
os.truncate(path, 0)
def created(self, path):
info = self.info(path=path)
return datetime.datetime.fromtimestamp(
info["created"], tz=datetime.timezone.utc
)
def modified(self, path):
info = self.info(path=path)
return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
@classmethod
def _parent(cls, path):
path = cls._strip_protocol(path)
if os.sep == "/":
# posix native
return path.rsplit("/", 1)[0] or "/"
else:
# NT
path_ = path.rsplit("/", 1)[0]
if len(path_) <= 3:
if path_[1:2] == ":":
# nt root (something like c:/)
return path_[0] + ":/"
# More cases may be required here
return path_
@classmethod
def _strip_protocol(cls, path):
path = stringify_path(path)
if path.startswith("file://"):
path = path[7:]
elif path.startswith("file:"):
path = path[5:]
elif path.startswith("local://"):
path = path[8:]
elif path.startswith("local:"):
path = path[6:]
path = make_path_posix(path)
if os.sep != "/":
# This code-path is a stripped down version of
# > drive, path = ntpath.splitdrive(path)
if path[1:2] == ":":
# Absolute drive-letter path, e.g. X:\Windows
# Relative path with drive, e.g. X:Windows
drive, path = path[:2], path[2:]
elif path[:2] == "//":
# UNC drives, e.g. \\server\share or \\?\UNC\server\share
# Device drives, e.g. \\.\device or \\?\device
if (index1 := path.find("/", 2)) == -1 or (
index2 := path.find("/", index1 + 1)
) == -1:
drive, path = path, ""
else:
drive, path = path[:index2], path[index2:]
else:
# Relative path, e.g. Windows
drive = ""
path = path.rstrip("/") or cls.root_marker
return drive + path
else:
return path.rstrip("/") or cls.root_marker
def _isfilestore(self):
        # Inheriting from DaskFileSystem makes this False (S3, etc. were the
        # original motivation), but we are a posix-like file system.
# See https://github.com/dask/dask/issues/5526
return True
def chmod(self, path, mode):
path = stringify_path(path)
return os.chmod(path, mode)
def make_path_posix(path):
"""Make path generic and absolute for current OS"""
if not isinstance(path, str):
if isinstance(path, (list, set, tuple)):
return type(path)(make_path_posix(p) for p in path)
else:
path = stringify_path(path)
if not isinstance(path, str):
raise TypeError(f"could not convert {path!r} to string")
if os.sep == "/":
# Native posix
if path.startswith("/"):
# most common fast case for posix
return path
elif path.startswith("~"):
return osp.expanduser(path)
elif path.startswith("./"):
path = path[2:]
elif path == ".":
path = ""
return f"{os.getcwd()}/{path}"
else:
# NT handling
if path[0:1] == "/" and path[2:3] == ":":
# path is like "/c:/local/path"
path = path[1:]
if path[1:2] == ":":
# windows full path like "C:\\local\\path"
if len(path) <= 3:
# nt root (something like c:/)
return path[0] + ":/"
path = path.replace("\\", "/")
return path
elif path[0:1] == "~":
return make_path_posix(osp.expanduser(path))
elif path.startswith(("\\\\", "//")):
# windows UNC/DFS-style paths
return "//" + path[2:].replace("\\", "/")
elif path.startswith(("\\", "/")):
# windows relative path with root
path = path.replace("\\", "/")
return f"{osp.splitdrive(os.getcwd())[0]}{path}"
else:
path = path.replace("\\", "/")
if path.startswith("./"):
path = path[2:]
elif path == ".":
path = ""
return f"{make_path_posix(os.getcwd())}/{path}"
def trailing_sep(path):
"""Return True if the path ends with a path separator.
A forward slash is always considered a path separator, even on Operating
Systems that normally use a backslash.
"""
# TODO: if all incoming paths were posix-compliant then separator would
# always be a forward slash, simplifying this function.
# See https://github.com/fsspec/filesystem_spec/pull/1250
return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
@lru_cache(maxsize=1)
def get_umask(mask: int = 0o666) -> int:
"""Get the current umask.
Follows https://stackoverflow.com/a/44130549 to get the umask.
Temporarily sets the umask to the given value, and then resets it to the
original value.
"""
value = os.umask(mask)
os.umask(value)
return value
class LocalFileOpener(io.IOBase):
def __init__(
self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
):
logger.debug("open file: %s", path)
self.path = path
self.mode = mode
self.fs = fs
self.f = None
self.autocommit = autocommit
self.compression = get_compression(path, compression)
self.blocksize = io.DEFAULT_BUFFER_SIZE
self._open()
def _open(self):
if self.f is None or self.f.closed:
if self.autocommit or "w" not in self.mode:
self.f = open(self.path, mode=self.mode)
if self.compression:
compress = compr[self.compression]
self.f = compress(self.f, mode=self.mode)
else:
# TODO: check if path is writable?
i, name = tempfile.mkstemp()
os.close(i) # we want normal open and normal buffered file
self.temp = name
self.f = open(name, mode=self.mode)
if "w" not in self.mode:
self.size = self.f.seek(0, 2)
self.f.seek(0)
self.f.size = self.size
def _fetch_range(self, start, end):
# probably only used by cached FS
if "r" not in self.mode:
raise ValueError
self._open()
self.f.seek(start)
return self.f.read(end - start)
def __setstate__(self, state):
self.f = None
loc = state.pop("loc", None)
self.__dict__.update(state)
if "r" in state["mode"]:
self.f = None
self._open()
self.f.seek(loc)
def __getstate__(self):
d = self.__dict__.copy()
d.pop("f")
if "r" in self.mode:
d["loc"] = self.f.tell()
else:
if not self.f.closed:
raise ValueError("Cannot serialise open write-mode local file")
return d
def commit(self):
if self.autocommit:
raise RuntimeError("Can only commit if not already set to autocommit")
try:
shutil.move(self.temp, self.path)
except PermissionError as e:
# shutil.move raises PermissionError if os.rename
            # and the default copy2 fallback with shutil.copystat fail.
# The file should be there nonetheless, but without copied permissions.
# If it doesn't exist, there was no permission to create the file.
if not os.path.exists(self.path):
raise e
else:
# If PermissionError is not raised, permissions can be set.
try:
mask = 0o666
os.chmod(self.path, mask & ~get_umask(mask))
except RuntimeError:
pass
def discard(self):
if self.autocommit:
raise RuntimeError("Cannot discard if set to autocommit")
os.remove(self.temp)
def readable(self) -> bool:
return True
def writable(self) -> bool:
return "r" not in self.mode
def read(self, *args, **kwargs):
return self.f.read(*args, **kwargs)
def write(self, *args, **kwargs):
return self.f.write(*args, **kwargs)
def tell(self, *args, **kwargs):
return self.f.tell(*args, **kwargs)
def seek(self, *args, **kwargs):
return self.f.seek(*args, **kwargs)
def seekable(self, *args, **kwargs):
return self.f.seekable(*args, **kwargs)
def readline(self, *args, **kwargs):
return self.f.readline(*args, **kwargs)
def readlines(self, *args, **kwargs):
return self.f.readlines(*args, **kwargs)
def close(self):
return self.f.close()
def truncate(self, size=None) -> int:
return self.f.truncate(size)
@property
def closed(self):
return self.f.closed
def fileno(self):
return self.raw.fileno()
def flush(self) -> None:
self.f.flush()
def __iter__(self):
return self.f.__iter__()
def __getattr__(self, item):
return getattr(self.f, item)
def __enter__(self):
self._incontext = True
return self
def __exit__(self, exc_type, exc_value, traceback):
self._incontext = False
self.f.__exit__(exc_type, exc_value, traceback)
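
# A minimal usage sketch of the local filesystem; the /tmp path below assumes a
# POSIX-style system and is only illustrative.
import fsspec

lfs = fsspec.filesystem("file", auto_mkdir=True)
target = "/tmp/fsspec_local_demo/hello.txt"
lfs.pipe_file(target, b"hello local fs")          # parent dir created by auto_mkdir
print(lfs.cat_file(target))                       # b'hello local fs'
print(lfs.info(target)["type"])                   # 'file'
lfs.rm("/tmp/fsspec_local_demo", recursive=True)  # clean up the demo directory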

View File

@@ -0,0 +1,311 @@
from __future__ import annotations
import logging
from datetime import datetime, timezone
from errno import ENOTEMPTY
from io import BytesIO
from pathlib import PurePath, PureWindowsPath
from typing import Any, ClassVar
from fsspec import AbstractFileSystem
from fsspec.implementations.local import LocalFileSystem
from fsspec.utils import stringify_path
logger = logging.getLogger("fsspec.memoryfs")
class MemoryFileSystem(AbstractFileSystem):
"""A filesystem based on a dict of BytesIO objects
This is a global filesystem so instances of this class all point to the same
in memory filesystem.
"""
store: ClassVar[dict[str, Any]] = {} # global, do not overwrite!
pseudo_dirs = [""] # global, do not overwrite!
protocol = "memory"
root_marker = "/"
@classmethod
def _strip_protocol(cls, path):
if isinstance(path, PurePath):
if isinstance(path, PureWindowsPath):
return LocalFileSystem._strip_protocol(path)
else:
path = stringify_path(path)
path = path.removeprefix("memory://")
if "::" in path or "://" in path:
return path.rstrip("/")
path = path.lstrip("/").rstrip("/")
return "/" + path if path else ""
def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
if path in self.store:
# there is a key with this exact name
if not detail:
return [path]
return [
{
"name": path,
"size": self.store[path].size,
"type": "file",
"created": self.store[path].created.timestamp(),
}
]
paths = set()
starter = path + "/"
out = []
for p2 in tuple(self.store):
if p2.startswith(starter):
if "/" not in p2[len(starter) :]:
# exact child
out.append(
{
"name": p2,
"size": self.store[p2].size,
"type": "file",
"created": self.store[p2].created.timestamp(),
}
)
elif len(p2) > len(starter):
# implied child directory
ppath = starter + p2[len(starter) :].split("/", 1)[0]
if ppath not in paths:
out = out or []
out.append(
{
"name": ppath,
"size": 0,
"type": "directory",
}
)
paths.add(ppath)
for p2 in self.pseudo_dirs:
if p2.startswith(starter):
if "/" not in p2[len(starter) :]:
# exact child pdir
if p2 not in paths:
out.append({"name": p2, "size": 0, "type": "directory"})
paths.add(p2)
else:
# directory implied by deeper pdir
ppath = starter + p2[len(starter) :].split("/", 1)[0]
if ppath not in paths:
out.append({"name": ppath, "size": 0, "type": "directory"})
paths.add(ppath)
if not out:
if path in self.pseudo_dirs:
# empty dir
return []
raise FileNotFoundError(path)
if detail:
return out
return sorted([f["name"] for f in out])
def mkdir(self, path, create_parents=True, **kwargs):
path = self._strip_protocol(path)
if path in self.store or path in self.pseudo_dirs:
raise FileExistsError(path)
if self._parent(path).strip("/") and self.isfile(self._parent(path)):
raise NotADirectoryError(self._parent(path))
if create_parents and self._parent(path).strip("/"):
try:
self.mkdir(self._parent(path), create_parents, **kwargs)
except FileExistsError:
pass
if path and path not in self.pseudo_dirs:
self.pseudo_dirs.append(path)
def makedirs(self, path, exist_ok=False):
try:
self.mkdir(path, create_parents=True)
except FileExistsError:
if not exist_ok:
raise
def pipe_file(self, path, value, mode="overwrite", **kwargs):
"""Set the bytes of given file
Avoids copies of the data if possible
"""
mode = "xb" if mode == "create" else "wb"
self.open(path, mode=mode, data=value)
def rmdir(self, path):
path = self._strip_protocol(path)
if path == "":
# silently avoid deleting FS root
return
if path in self.pseudo_dirs:
if not self.ls(path):
self.pseudo_dirs.remove(path)
else:
raise OSError(ENOTEMPTY, "Directory not empty", path)
else:
raise FileNotFoundError(path)
def info(self, path, **kwargs):
logger.debug("info: %s", path)
path = self._strip_protocol(path)
if path in self.pseudo_dirs or any(
p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs
):
return {
"name": path,
"size": 0,
"type": "directory",
}
elif path in self.store:
filelike = self.store[path]
return {
"name": path,
"size": filelike.size,
"type": "file",
"created": getattr(filelike, "created", None),
}
else:
raise FileNotFoundError(path)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
path = self._strip_protocol(path)
if "x" in mode and self.exists(path):
raise FileExistsError
if path in self.pseudo_dirs:
raise IsADirectoryError(path)
parent = path
while len(parent) > 1:
parent = self._parent(parent)
if self.isfile(parent):
raise FileExistsError(parent)
if mode in ["rb", "ab", "r+b", "a+b"]:
if path in self.store:
f = self.store[path]
if "a" in mode:
# position at the end of file
f.seek(0, 2)
else:
# position at the beginning of file
f.seek(0)
return f
else:
raise FileNotFoundError(path)
elif mode in {"wb", "w+b", "xb", "x+b"}:
if "x" in mode and self.exists(path):
raise FileExistsError
m = MemoryFile(self, path, kwargs.get("data"))
if not self._intrans:
m.commit()
return m
else:
name = self.__class__.__name__
raise ValueError(f"unsupported file mode for {name}: {mode!r}")
def cp_file(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
if self.isfile(path1):
self.store[path2] = MemoryFile(
self, path2, self.store[path1].getvalue()
) # implicit copy
elif self.isdir(path1):
if path2 not in self.pseudo_dirs:
self.pseudo_dirs.append(path2)
else:
raise FileNotFoundError(path1)
def cat_file(self, path, start=None, end=None, **kwargs):
logger.debug("cat: %s", path)
path = self._strip_protocol(path)
try:
return bytes(self.store[path].getbuffer()[start:end])
except KeyError as e:
raise FileNotFoundError(path) from e
def _rm(self, path):
path = self._strip_protocol(path)
try:
del self.store[path]
except KeyError as e:
raise FileNotFoundError(path) from e
def modified(self, path):
path = self._strip_protocol(path)
try:
return self.store[path].modified
except KeyError as e:
raise FileNotFoundError(path) from e
def created(self, path):
path = self._strip_protocol(path)
try:
return self.store[path].created
except KeyError as e:
raise FileNotFoundError(path) from e
def isfile(self, path):
path = self._strip_protocol(path)
return path in self.store
def rm(self, path, recursive=False, maxdepth=None):
if isinstance(path, str):
path = self._strip_protocol(path)
else:
path = [self._strip_protocol(p) for p in path]
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
for p in reversed(paths):
if self.isfile(p):
self.rm_file(p)
# If the expanded path doesn't exist, it is only because the expanded
# path was a directory that does not exist in self.pseudo_dirs. This
# is possible if you directly create files without making the
# directories first.
elif not self.exists(p):
continue
else:
self.rmdir(p)
class MemoryFile(BytesIO):
"""A BytesIO which can't close and works as a context manager
Can initialise with data. Each path should only be active once at any moment.
No need to provide fs, path if auto-committing (default)
"""
def __init__(self, fs=None, path=None, data=None):
logger.debug("open file %s", path)
self.fs = fs
self.path = path
self.created = datetime.now(tz=timezone.utc)
self.modified = datetime.now(tz=timezone.utc)
if data:
super().__init__(data)
self.seek(0)
@property
def size(self):
return self.getbuffer().nbytes
def __enter__(self):
return self
def close(self):
pass
def discard(self):
pass
def commit(self):
self.fs.store[self.path] = self
self.modified = datetime.now(tz=timezone.utc)
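
# A minimal usage sketch: the store is class-level, so every MemoryFileSystem instance
# sees the same files. All paths below are made up for the example.
import fsspec

mfs = fsspec.filesystem("memory")
mfs.mkdir("/project")
mfs.pipe_file("/project/data.bin", b"\x00\x01\x02\x03" * 4)
print(mfs.ls("/project", detail=False))                    # ['/project/data.bin']
print(mfs.cat_file("/project/data.bin", start=0, end=4))   # b'\x00\x01\x02\x03'
assert fsspec.filesystem("memory").exists("/project/data.bin")  # shared global store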

View File

@@ -0,0 +1,187 @@
import datetime
import logging
import os
import types
import uuid
from stat import S_ISDIR, S_ISLNK
import paramiko
from .. import AbstractFileSystem
from ..utils import infer_storage_options
logger = logging.getLogger("fsspec.sftp")
class SFTPFileSystem(AbstractFileSystem):
"""Files over SFTP/SSH
Peer-to-peer filesystem over SSH using paramiko.
Note: if using this with the ``open`` or ``open_files``, with full URLs,
there is no way to tell if a path is relative, so all paths are assumed
to be absolute.
"""
protocol = "sftp", "ssh"
def __init__(self, host, **ssh_kwargs):
"""
Parameters
----------
host: str
Hostname or IP as a string
temppath: str
Location on the server to put files, when within a transaction
ssh_kwargs: dict
Parameters passed on to connection. See details in
https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
May include port, username, password...
"""
if self._cached:
return
super().__init__(**ssh_kwargs)
self.temppath = ssh_kwargs.pop("temppath", "/tmp") # remote temp directory
self.host = host
self.ssh_kwargs = ssh_kwargs
self._connect()
def _connect(self):
logger.debug("Connecting to SFTP server %s", self.host)
self.client = paramiko.SSHClient()
self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
self.client.connect(self.host, **self.ssh_kwargs)
self.ftp = self.client.open_sftp()
@classmethod
def _strip_protocol(cls, path):
return infer_storage_options(path)["path"]
@staticmethod
def _get_kwargs_from_urls(urlpath):
out = infer_storage_options(urlpath)
out.pop("path", None)
out.pop("protocol", None)
return out
def mkdir(self, path, create_parents=True, mode=511):
path = self._strip_protocol(path)
logger.debug("Creating folder %s", path)
if self.exists(path):
raise FileExistsError(f"File exists: {path}")
if create_parents:
self.makedirs(path)
else:
self.ftp.mkdir(path, mode)
def makedirs(self, path, exist_ok=False, mode=511):
if self.exists(path) and not exist_ok:
raise FileExistsError(f"File exists: {path}")
parts = path.split("/")
new_path = "/" if path[:1] == "/" else ""
for part in parts:
if part:
new_path = f"{new_path}/{part}" if new_path else part
if not self.exists(new_path):
self.ftp.mkdir(new_path, mode)
def rmdir(self, path):
path = self._strip_protocol(path)
logger.debug("Removing folder %s", path)
self.ftp.rmdir(path)
def info(self, path):
path = self._strip_protocol(path)
stat = self._decode_stat(self.ftp.stat(path))
stat["name"] = path
return stat
@staticmethod
def _decode_stat(stat, parent_path=None):
if S_ISDIR(stat.st_mode):
t = "directory"
elif S_ISLNK(stat.st_mode):
t = "link"
else:
t = "file"
out = {
"name": "",
"size": stat.st_size,
"type": t,
"uid": stat.st_uid,
"gid": stat.st_gid,
"time": datetime.datetime.fromtimestamp(
stat.st_atime, tz=datetime.timezone.utc
),
"mtime": datetime.datetime.fromtimestamp(
stat.st_mtime, tz=datetime.timezone.utc
),
}
if parent_path:
out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
return out
def ls(self, path, detail=False):
path = self._strip_protocol(path)
logger.debug("Listing folder %s", path)
stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
if detail:
return stats
else:
paths = [stat["name"] for stat in stats]
return sorted(paths)
def put(self, lpath, rpath, callback=None, **kwargs):
rpath = self._strip_protocol(rpath)
logger.debug("Put file %s into %s", lpath, rpath)
self.ftp.put(lpath, rpath)
def get_file(self, rpath, lpath, **kwargs):
if self.isdir(rpath):
os.makedirs(lpath, exist_ok=True)
else:
self.ftp.get(self._strip_protocol(rpath), lpath)
def _open(self, path, mode="rb", block_size=None, **kwargs):
"""
block_size: int or None
If 0, no buffering, if 1, line buffering, if >1, buffer that many
bytes, if None use default from paramiko.
"""
logger.debug("Opening file %s", path)
if kwargs.get("autocommit", True) is False:
# writes to temporary file, move on commit
path2 = "/".join([self.temppath, str(uuid.uuid4())])
f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
f.temppath = path2
f.targetpath = path
f.fs = self
f.commit = types.MethodType(commit_a_file, f)
f.discard = types.MethodType(discard_a_file, f)
else:
f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
return f
def _rm(self, path):
if self.isdir(path):
self.ftp.rmdir(path)
else:
self.ftp.remove(path)
def mv(self, old, new):
new = self._strip_protocol(new)
old = self._strip_protocol(old)
logger.debug("Renaming %s into %s", old, new)
self.ftp.posix_rename(old, new)
def commit_a_file(self):
self.fs.mv(self.temppath, self.targetpath)
def discard_a_file(self):
self.fs._rm(self.temppath)
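
# A minimal usage sketch, assuming an SSH server is reachable at the placeholder host
# with the (made-up) credentials below; paramiko must be installed.
import fsspec

sfs = fsspec.filesystem("sftp", host="sftp.example.com", username="demo", password="secret")
sfs.makedirs("/upload/incoming", exist_ok=True)
with sfs.open("/upload/incoming/report.csv", "wb") as f:
    f.write(b"a,b\n1,2\n")
print(sfs.ls("/upload/incoming"))                  # ['/upload/incoming/report.csv']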

View File

@@ -0,0 +1,416 @@
"""
This module contains SMBFileSystem class responsible for handling access to
Windows Samba network shares by using package smbprotocol
"""
import datetime
import re
import uuid
from stat import S_ISDIR, S_ISLNK
import smbclient
import smbprotocol.exceptions
from .. import AbstractFileSystem
from ..utils import infer_storage_options
# ! pylint: disable=bad-continuation
class SMBFileSystem(AbstractFileSystem):
"""Allow reading and writing to Windows and Samba network shares.
    When using `fsspec.open()` to get a file-like object, the URI should be
    specified in this format:
    ``smb://workgroup;user:password@server:port/share/folder/file.csv``.
Example::
>>> import fsspec
>>> with fsspec.open(
... 'smb://myuser:mypassword@myserver.com/' 'share/folder/file.csv'
... ) as smbfile:
... df = pd.read_csv(smbfile, sep='|', header=None)
Note that you need to pass in a valid hostname or IP address for the host
component of the URL. Do not use the Windows/NetBIOS machine name for the
host component.
The first component of the path in the URL points to the name of the shared
folder. Subsequent path components will point to the directory/folder/file.
The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be
optional.
.. note::
        This backend requires `smbprotocol`_ to be installed, e.g.::
$ pip install smbprotocol
# or
# pip install smbprotocol[kerberos]
.. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements
Note: if using this with the ``open`` or ``open_files``, with full URLs,
there is no way to tell if a path is relative, so all paths are assumed
to be absolute.
"""
protocol = "smb"
# pylint: disable=too-many-arguments
def __init__(
self,
host,
port=None,
username=None,
password=None,
timeout=60,
encrypt=None,
share_access=None,
register_session_retries=4,
register_session_retry_wait=1,
register_session_retry_factor=10,
auto_mkdir=False,
**kwargs,
):
"""
You can use _get_kwargs_from_urls to get some kwargs from
a reasonable SMB url.
Authentication will be anonymous or integrated if username/password are not
given.
Parameters
----------
host: str
The remote server name/ip to connect to
port: int or None
Port to connect with. Usually 445, sometimes 139.
username: str or None
Username to connect with. Required if Kerberos auth is not being used.
password: str or None
User's password on the server, if using username
timeout: int
Connection timeout in seconds
encrypt: bool
Whether to force encryption or not, once this has been set to True
the session cannot be changed back to False.
share_access: str or None
Specifies the default access applied to file open operations
performed with this file system object.
This affects whether other processes can concurrently open a handle
to the same file.
- None (the default): exclusively locks the file until closed.
- 'r': Allow other handles to be opened with read access.
- 'w': Allow other handles to be opened with write access.
- 'd': Allow other handles to be opened with delete access.
register_session_retries: int
            Number of retries to register a session with the server. Retries are not performed
            for authentication errors, as those indicate invalid credentials rather than network
            issues. If set to a negative value, no register attempts will be performed.
register_session_retry_wait: int
Time in seconds to wait between each retry. Number must be non-negative.
register_session_retry_factor: int
Base factor for the wait time between each retry. The wait time
is calculated using exponential function. For factor=1 all wait times
will be equal to `register_session_retry_wait`. For any number of retries,
the last wait time will be equal to `register_session_retry_wait` and for retries>1
the first wait time will be equal to `register_session_retry_wait / factor`.
Number must be equal to or greater than 1. Optimal factor is 10.
auto_mkdir: bool
Whether, when opening a file, the directory containing it should
be created (if it doesn't already exist). This is assumed by pyarrow
and zarr-python code.
"""
super().__init__(**kwargs)
self.host = host
self.port = port
self.username = username
self.password = password
self.timeout = timeout
self.encrypt = encrypt
self.temppath = kwargs.pop("temppath", "")
self.share_access = share_access
self.register_session_retries = register_session_retries
if register_session_retry_wait < 0:
raise ValueError(
"register_session_retry_wait must be a non-negative integer"
)
self.register_session_retry_wait = register_session_retry_wait
if register_session_retry_factor < 1:
raise ValueError(
"register_session_retry_factor must be a positive "
"integer equal to or greater than 1"
)
self.register_session_retry_factor = register_session_retry_factor
self.auto_mkdir = auto_mkdir
self._connect()
@property
def _port(self):
return 445 if self.port is None else self.port
def _connect(self):
import time
if self.register_session_retries <= -1:
return
retried_errors = []
wait_time = self.register_session_retry_wait
n_waits = (
self.register_session_retries - 1
) # -1 = No wait time after the last retry
factor = self.register_session_retry_factor
# Generate wait times for each retry attempt.
# Wait times are calculated using exponential function. For factor=1 all wait times
# will be equal to `wait`. For any number of retries the last wait time will be
        # equal to `wait` and for retries>1 the first wait time will be equal to `wait / factor`.
wait_times = iter(
factor ** (n / n_waits - 1) * wait_time for n in range(0, n_waits + 1)
)
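        # Worked example with the defaults (retries=4, wait=1 s, factor=10): n_waits is 3
        # and the generated waits are 10**-1 = 0.1 s, 10**(-2/3) ~= 0.22 s,
        # 10**(-1/3) ~= 0.46 s and 10**0 = 1.0 s, an exponential ramp that always ends
        # at `register_session_retry_wait`.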
for attempt in range(self.register_session_retries + 1):
try:
smbclient.register_session(
self.host,
username=self.username,
password=self.password,
port=self._port,
encrypt=self.encrypt,
connection_timeout=self.timeout,
)
return
except (
smbprotocol.exceptions.SMBAuthenticationError,
smbprotocol.exceptions.LogonFailure,
):
# These exceptions should not be repeated, as they clearly indicate
# that the credentials are invalid and not a network issue.
raise
except ValueError as exc:
if re.findall(r"\[Errno -\d+]", str(exc)):
# This exception is raised by the smbprotocol.transport:Tcp.connect
# and originates from socket.gaierror (OSError). These exceptions might
# be raised due to network instability. We will retry to connect.
retried_errors.append(exc)
else:
                    # All other ValueError exceptions should be raised, as they are not
# related to network issues.
raise
except Exception as exc:
# Save the exception and retry to connect. This except might be dropped
# in the future, once all exceptions suited for retry are identified.
retried_errors.append(exc)
if attempt < self.register_session_retries:
time.sleep(next(wait_times))
# Raise last exception to inform user about the connection issues.
# Note: Should we use ExceptionGroup to raise all exceptions?
raise retried_errors[-1]
@classmethod
def _strip_protocol(cls, path):
return infer_storage_options(path)["path"]
@staticmethod
def _get_kwargs_from_urls(path):
# smb://workgroup;user:password@host:port/share/folder/file.csv
out = infer_storage_options(path)
out.pop("path", None)
out.pop("protocol", None)
return out
def mkdir(self, path, create_parents=True, **kwargs):
wpath = _as_unc_path(self.host, path)
if create_parents:
smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
else:
smbclient.mkdir(wpath, port=self._port, **kwargs)
def makedirs(self, path, exist_ok=False):
if _share_has_path(path):
wpath = _as_unc_path(self.host, path)
smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port)
def rmdir(self, path):
if _share_has_path(path):
wpath = _as_unc_path(self.host, path)
smbclient.rmdir(wpath, port=self._port)
def info(self, path, **kwargs):
wpath = _as_unc_path(self.host, path)
stats = smbclient.stat(wpath, port=self._port, **kwargs)
if S_ISDIR(stats.st_mode):
stype = "directory"
elif S_ISLNK(stats.st_mode):
stype = "link"
else:
stype = "file"
res = {
"name": path + "/" if stype == "directory" else path,
"size": stats.st_size,
"type": stype,
"uid": stats.st_uid,
"gid": stats.st_gid,
"time": stats.st_atime,
"mtime": stats.st_mtime,
}
return res
def created(self, path):
"""Return the created timestamp of a file as a datetime.datetime"""
wpath = _as_unc_path(self.host, path)
stats = smbclient.stat(wpath, port=self._port)
return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc)
def modified(self, path):
"""Return the modified timestamp of a file as a datetime.datetime"""
wpath = _as_unc_path(self.host, path)
stats = smbclient.stat(wpath, port=self._port)
return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc)
def ls(self, path, detail=True, **kwargs):
unc = _as_unc_path(self.host, path)
listed = smbclient.listdir(unc, port=self._port, **kwargs)
dirs = ["/".join([path.rstrip("/"), p]) for p in listed]
if detail:
dirs = [self.info(d) for d in dirs]
return dirs
# pylint: disable=too-many-arguments
def _open(
self,
path,
mode="rb",
block_size=-1,
autocommit=True,
cache_options=None,
**kwargs,
):
"""
block_size: int or None
If 0, no buffering, 1, line buffering, >1, buffer that many bytes
Notes
-----
By specifying 'share_access' in 'kwargs' it is possible to override the
default shared access setting applied in the constructor of this object.
"""
if self.auto_mkdir and "w" in mode:
self.makedirs(self._parent(path), exist_ok=True)
bls = block_size if block_size is not None and block_size >= 0 else -1
wpath = _as_unc_path(self.host, path)
share_access = kwargs.pop("share_access", self.share_access)
if "w" in mode and autocommit is False:
temp = _as_temp_path(self.host, path, self.temppath)
return SMBFileOpener(
wpath, temp, mode, port=self._port, block_size=bls, **kwargs
)
return smbclient.open_file(
wpath,
mode,
buffering=bls,
share_access=share_access,
port=self._port,
**kwargs,
)
def copy(self, path1, path2, **kwargs):
"""Copy within two locations in the same filesystem"""
wpath1 = _as_unc_path(self.host, path1)
wpath2 = _as_unc_path(self.host, path2)
if self.auto_mkdir:
self.makedirs(self._parent(path2), exist_ok=True)
smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs)
def _rm(self, path):
if _share_has_path(path):
wpath = _as_unc_path(self.host, path)
stats = smbclient.stat(wpath, port=self._port)
if S_ISDIR(stats.st_mode):
smbclient.rmdir(wpath, port=self._port)
else:
smbclient.remove(wpath, port=self._port)
def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
wpath1 = _as_unc_path(self.host, path1)
wpath2 = _as_unc_path(self.host, path2)
smbclient.rename(wpath1, wpath2, port=self._port, **kwargs)
def _as_unc_path(host, path):
rpath = path.replace("/", "\\")
unc = f"\\\\{host}{rpath}"
return unc
def _as_temp_path(host, path, temppath):
share = path.split("/")[1]
temp_file = f"/{share}{temppath}/{uuid.uuid4()}"
unc = _as_unc_path(host, temp_file)
return unc
def _share_has_path(path):
parts = path.count("/")
if path.endswith("/"):
return parts > 2
return parts > 1
class SMBFileOpener:
"""writes to remote temporary file, move on commit"""
def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs):
self.path = path
self.temp = temp
self.mode = mode
self.block_size = block_size
self.kwargs = kwargs
self.smbfile = None
self._incontext = False
self.port = port
self._open()
def _open(self):
if self.smbfile is None or self.smbfile.closed:
self.smbfile = smbclient.open_file(
self.temp,
self.mode,
port=self.port,
buffering=self.block_size,
**self.kwargs,
)
def commit(self):
"""Move temp file to definitive on success."""
# TODO: use transaction support in SMB protocol
smbclient.replace(self.temp, self.path, port=self.port)
def discard(self):
"""Remove the temp file on failure."""
smbclient.remove(self.temp, port=self.port)
def __fspath__(self):
return self.path
def __iter__(self):
return self.smbfile.__iter__()
def __getattr__(self, item):
return getattr(self.smbfile, item)
def __enter__(self):
self._incontext = True
return self.smbfile.__enter__()
def __exit__(self, exc_type, exc_value, traceback):
self._incontext = False
self.smbfile.__exit__(exc_type, exc_value, traceback)
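
# A minimal usage sketch, assuming a reachable SMB server and share; every URL
# component and credential below is a placeholder, and smbprotocol must be installed.
import fsspec

with fsspec.open("smb://demo:secret@fileserver.local/share/folder/data.csv", "rb") as f:
    print(f.read(64))

smbfs = fsspec.filesystem("smb", host="fileserver.local", username="demo", password="secret")
print(smbfs.ls("/share", detail=False))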

View File

@@ -0,0 +1,124 @@
import logging
import tarfile
import fsspec
from fsspec.archive import AbstractArchiveFileSystem
from fsspec.compression import compr
from fsspec.utils import infer_compression
typemap = {b"0": "file", b"5": "directory"}
logger = logging.getLogger("tar")
class TarFileSystem(AbstractArchiveFileSystem):
"""Compressed Tar archives as a file-system (read-only)
Supports the following formats:
tar.gz, tar.bz2, tar.xz
"""
root_marker = ""
protocol = "tar"
cachable = False
def __init__(
self,
fo="",
index_store=None,
target_options=None,
target_protocol=None,
compression=None,
**kwargs,
):
super().__init__(**kwargs)
target_options = target_options or {}
if isinstance(fo, str):
self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
fo = self.of.open() # keep the reference
# Try to infer compression.
if compression is None:
name = None
# Try different ways to get hold of the filename. `fo` might either
# be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
# `fsspec.AbstractFileSystem` instance.
try:
# Amended io.BufferedReader or similar.
# This uses a "protocol extension" where original filenames are
# propagated to archive-like filesystems in order to let them
# infer the right compression appropriately.
if hasattr(fo, "original"):
name = fo.original
# fsspec.LocalFileOpener
elif hasattr(fo, "path"):
name = fo.path
# io.BufferedReader
elif hasattr(fo, "name"):
name = fo.name
# fsspec.AbstractFileSystem
elif hasattr(fo, "info"):
name = fo.info()["name"]
except Exception as ex:
logger.warning(
f"Unable to determine file name, not inferring compression: {ex}"
)
if name is not None:
compression = infer_compression(name)
logger.info(f"Inferred compression {compression} from file name {name}")
if compression is not None:
# TODO: tarfile already implements compression with modes like "'r:gz'",
            # but would seeking to an offset in the file still work then?
fo = compr[compression](fo)
self._fo_ref = fo
self.fo = fo # the whole instance is a context
self.tar = tarfile.TarFile(fileobj=self.fo)
self.dir_cache = None
self.index_store = index_store
self.index = None
self._index()
def _index(self):
# TODO: load and set saved index, if exists
out = {}
for ti in self.tar:
info = ti.get_info()
info["type"] = typemap.get(info["type"], "file")
name = ti.get_info()["name"].rstrip("/")
out[name] = (info, ti.offset_data)
self.index = out
# TODO: save index to self.index_store here, if set
def _get_dirs(self):
if self.dir_cache is not None:
return
# This enables ls to get directories as children as well as files
self.dir_cache = {
dirname: {"name": dirname, "size": 0, "type": "directory"}
for dirname in self._all_dirnames(self.tar.getnames())
}
for member in self.tar.getmembers():
info = member.get_info()
info["name"] = info["name"].rstrip("/")
info["type"] = typemap.get(info["type"], "file")
self.dir_cache[info["name"]] = info
def _open(self, path, mode="rb", **kwargs):
if mode != "rb":
raise ValueError("Read-only filesystem implementation")
details, offset = self.index[path]
if details["type"] != "file":
raise ValueError("Can only handle regular files")
return self.tar.extractfile(path)
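
# A minimal, self-contained usage sketch: build a tiny uncompressed tar in memory and
# read it back through the filesystem; the member name is made up for the example.
import io
import tarfile

import fsspec

buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode="w") as t:
    payload = b"hello tar"
    member = tarfile.TarInfo(name="dir/hello.txt")
    member.size = len(payload)
    t.addfile(member, io.BytesIO(payload))
buf.seek(0)

tfs = fsspec.filesystem("tar", fo=buf)
print(tfs.find(""))                      # ['dir/hello.txt']
with tfs.open("dir/hello.txt") as f:
    print(f.read())                      # b'hello tar'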

View File

@@ -0,0 +1,485 @@
# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
import logging
import os
import secrets
import shutil
import tempfile
import uuid
from contextlib import suppress
from urllib.parse import quote
import requests
from ..spec import AbstractBufferedFile, AbstractFileSystem
from ..utils import infer_storage_options, tokenize
logger = logging.getLogger("webhdfs")
class WebHDFS(AbstractFileSystem):
"""
    Interface to HDFS over HTTP using the WebHDFS API. Also supports HttpFS gateways.
Four auth mechanisms are supported:
insecure: no auth is done, and the user is assumed to be whoever they
say they are (parameter ``user``), or a predefined value such as
"dr.who" if not given
spnego: when kerberos authentication is enabled, auth is negotiated by
requests_kerberos https://github.com/requests/requests-kerberos .
This establishes a session based on existing kinit login and/or
specified principal/password; parameters are passed with ``kerb_kwargs``
token: uses an existing Hadoop delegation token from another secured
service. Indeed, this client can also generate such tokens when
not insecure. Note that tokens expire, but can be renewed (by a
previously specified user) and may allow for proxying.
basic-auth: used when both parameter ``user`` and parameter ``password``
are provided.
"""
tempdir = str(tempfile.gettempdir())
protocol = "webhdfs", "webHDFS"
def __init__(
self,
host,
port=50070,
kerberos=False,
token=None,
user=None,
password=None,
proxy_to=None,
kerb_kwargs=None,
data_proxy=None,
use_https=False,
session_cert=None,
session_verify=True,
**kwargs,
):
"""
Parameters
----------
host: str
Name-node address
port: int
Port for webHDFS
kerberos: bool
Whether to authenticate with kerberos for this connection
token: str or None
If given, use this token on every call to authenticate. A user
and user-proxy may be encoded in the token and should not be also
given
user: str or None
If given, assert the user name to connect with
password: str or None
If given, assert the password to use for basic auth. If password
is provided, user must be provided also
proxy_to: str or None
If given, the user has the authority to proxy, and this value is
            the user in whose name actions are taken
kerb_kwargs: dict
Any extra arguments for HTTPKerberosAuth, see
`<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
data_proxy: dict, callable or None
If given, map data-node addresses. This can be necessary if the
HDFS cluster is behind a proxy, running on Docker or otherwise has
a mismatch between the host-names given by the name-node and the
address by which to refer to them from the client. If a dict,
maps host names ``host->data_proxy[host]``; if a callable, full
URLs are passed, and function must conform to
``url->data_proxy(url)``.
use_https: bool
Whether to connect to the Name-node using HTTPS instead of HTTP
session_cert: str or Tuple[str, str] or None
Path to a certificate file, or tuple of (cert, key) files to use
for the requests.Session
session_verify: str, bool or None
Path to a certificate file to use for verifying the requests.Session.
kwargs
"""
if self._cached:
return
super().__init__(**kwargs)
self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
self.kerb = kerberos
self.kerb_kwargs = kerb_kwargs or {}
self.pars = {}
self.proxy = data_proxy or {}
if token is not None:
if user is not None or proxy_to is not None:
raise ValueError(
"If passing a delegation token, must not set "
"user or proxy_to, as these are encoded in the"
" token"
)
self.pars["delegation"] = token
self.user = user
self.password = password
if password is not None:
if user is None:
raise ValueError(
"If passing a password, the user must also be"
"set in order to set up the basic-auth"
)
else:
if user is not None:
self.pars["user.name"] = user
if proxy_to is not None:
self.pars["doas"] = proxy_to
if kerberos and user is not None:
raise ValueError(
"If using Kerberos auth, do not specify the "
"user, this is handled by kinit."
)
self.session_cert = session_cert
self.session_verify = session_verify
self._connect()
self._fsid = f"webhdfs_{tokenize(host, port)}"
@property
def fsid(self):
return self._fsid
def _connect(self):
self.session = requests.Session()
if self.session_cert:
self.session.cert = self.session_cert
self.session.verify = self.session_verify
if self.kerb:
from requests_kerberos import HTTPKerberosAuth
self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
if self.user is not None and self.password is not None:
from requests.auth import HTTPBasicAuth
self.session.auth = HTTPBasicAuth(self.user, self.password)
def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
path = self._strip_protocol(path) if path is not None else ""
url = self._apply_proxy(self.url + quote(path, safe="/="))
args = kwargs.copy()
args.update(self.pars)
args["op"] = op.upper()
logger.debug("sending %s with %s", url, method)
out = self.session.request(
method=method.upper(),
url=url,
params=args,
data=data,
allow_redirects=redirect,
)
if out.status_code in [400, 401, 403, 404, 500]:
try:
err = out.json()
msg = err["RemoteException"]["message"]
exp = err["RemoteException"]["exception"]
except (ValueError, KeyError):
pass
else:
if exp in ["IllegalArgumentException", "UnsupportedOperationException"]:
raise ValueError(msg)
elif exp in ["SecurityException", "AccessControlException"]:
raise PermissionError(msg)
elif exp in ["FileNotFoundException"]:
raise FileNotFoundError(msg)
else:
raise RuntimeError(msg)
out.raise_for_status()
return out
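    # Illustrative note (example path and defaults are assumptions, not part
    # of the upstream module): ``_call`` turns an operation name into a single
    # WebHDFS REST request, e.g.
    #
    #     fs._call("GETFILESTATUS", path="/user/alice/data.csv")
    #
    # issues roughly ``GET http://<host>:<port>/webhdfs/v1/user/alice/data.csv
    # ?op=GETFILESTATUS`` plus whatever auth parameters are held in ``self.pars``.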
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
replication=None,
permissions=None,
**kwargs,
):
"""
Parameters
----------
path: str
File location
mode: str
'rb', 'wb', etc.
block_size: int
Client buffer size for read-ahead or write buffer
autocommit: bool
If False, writes to temporary file that only gets put in final
location upon commit
replication: int
Number of copies of file on the cluster, write mode only
permissions: str or int
posix permissions, write mode only
kwargs
Returns
-------
WebHDFile instance
"""
block_size = block_size or self.blocksize
return WebHDFile(
self,
path,
mode=mode,
block_size=block_size,
tempdir=self.tempdir,
autocommit=autocommit,
replication=replication,
permissions=permissions,
)
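    # Example sketch (path and data are assumptions): files are returned as
    # WebHDFile objects, so the usual fsspec file API applies.
    #
    #     with fs.open("/tmp/example.txt", "wb") as f:
    #         f.write(b"hello")
    #     with fs.open("/tmp/example.txt", "rb") as f:
    #         data = f.read()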
@staticmethod
def _process_info(info):
info["type"] = info["type"].lower()
info["size"] = info["length"]
return info
@classmethod
def _strip_protocol(cls, path):
return infer_storage_options(path)["path"]
@staticmethod
def _get_kwargs_from_urls(urlpath):
out = infer_storage_options(urlpath)
out.pop("path", None)
out.pop("protocol", None)
if "username" in out:
out["user"] = out.pop("username")
return out
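    # Illustrative sketch: because of ``_strip_protocol`` and
    # ``_get_kwargs_from_urls`` above, URL-style instantiation also works
    # (host name, port and user below are assumptions).
    #
    #     import fsspec
    #     of = fsspec.open("webhdfs://alice@namenode.example.com:9870/tmp/x.txt", "rb")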
def info(self, path):
out = self._call("GETFILESTATUS", path=path)
info = out.json()["FileStatus"]
info["name"] = path
return self._process_info(info)
def ls(self, path, detail=False, **kwargs):
out = self._call("LISTSTATUS", path=path)
infos = out.json()["FileStatuses"]["FileStatus"]
for info in infos:
self._process_info(info)
info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
if detail:
return sorted(infos, key=lambda i: i["name"])
else:
return sorted(info["name"] for info in infos)
def content_summary(self, path):
"""Total numbers of files, directories and bytes under path"""
out = self._call("GETCONTENTSUMMARY", path=path)
return out.json()["ContentSummary"]
def ukey(self, path):
"""Checksum info of file, giving method and result"""
out = self._call("GETFILECHECKSUM", path=path, redirect=False)
if "Location" in out.headers:
location = self._apply_proxy(out.headers["Location"])
out2 = self.session.get(location)
out2.raise_for_status()
return out2.json()["FileChecksum"]
else:
out.raise_for_status()
return out.json()["FileChecksum"]
def home_directory(self):
"""Get user's home directory"""
out = self._call("GETHOMEDIRECTORY")
return out.json()["Path"]
def get_delegation_token(self, renewer=None):
"""Retrieve token which can give the same authority to other uses
Parameters
----------
renewer: str or None
User who may use this token; if None, will be current user
"""
if renewer:
out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
else:
out = self._call("GETDELEGATIONTOKEN")
t = out.json()["Token"]
if t is None:
raise ValueError("No token available for this user/security context")
return t["urlString"]
def renew_delegation_token(self, token):
"""Make token live longer. Returns new expiry time"""
out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
return out.json()["long"]
def cancel_delegation_token(self, token):
"""Stop the token from being useful"""
self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
def chmod(self, path, mod):
"""Set the permission at path
Parameters
----------
path: str
location to set (file or directory)
mod: str or int
            posix representation of the permission, given as an octal string,
            e.g. '777', or as an int, e.g. 0o777
"""
self._call("SETPERMISSION", method="put", path=path, permission=mod)
def chown(self, path, owner=None, group=None):
"""Change owning user and/or group"""
kwargs = {}
if owner is not None:
kwargs["owner"] = owner
if group is not None:
kwargs["group"] = group
self._call("SETOWNER", method="put", path=path, **kwargs)
def set_replication(self, path, replication):
"""
Set file replication factor
Parameters
----------
path: str
File location (not for directories)
replication: int
            Number of copies of the file on the cluster. Should be smaller than
            the number of data nodes; normally 3 on most systems.
"""
self._call("SETREPLICATION", path=path, method="put", replication=replication)
def mkdir(self, path, **kwargs):
self._call("MKDIRS", method="put", path=path)
def makedirs(self, path, exist_ok=False):
if exist_ok is False and self.exists(path):
raise FileExistsError(path)
self.mkdir(path)
def mv(self, path1, path2, **kwargs):
self._call("RENAME", method="put", path=path1, destination=path2)
def rm(self, path, recursive=False, **kwargs):
self._call(
"DELETE",
method="delete",
path=path,
recursive="true" if recursive else "false",
)
def rm_file(self, path, **kwargs):
self.rm(path)
def cp_file(self, lpath, rpath, **kwargs):
with self.open(lpath) as lstream:
tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
# Perform an atomic copy (stream to a temporary file and
# move it to the actual destination).
try:
with self.open(tmp_fname, "wb") as rstream:
shutil.copyfileobj(lstream, rstream)
self.mv(tmp_fname, rpath)
except BaseException:
with suppress(FileNotFoundError):
self.rm(tmp_fname)
raise
def _apply_proxy(self, location):
if self.proxy and callable(self.proxy):
location = self.proxy(location)
elif self.proxy:
# as a dict
for k, v in self.proxy.items():
location = location.replace(k, v, 1)
return location
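    # data_proxy sketch (addresses are assumptions): a dict rewrites data-node
    # host names inside redirect URLs; a callable receives the full URL.
    #
    #     fs = WebHDFS("namenode.example.com",
    #                  data_proxy={"datanode1.internal": "localhost:50075"})
    #     # or: data_proxy=lambda url: url.replace(".internal", ".example.com")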
class WebHDFile(AbstractBufferedFile):
"""A file living in HDFS over webHDFS"""
def __init__(self, fs, path, **kwargs):
super().__init__(fs, path, **kwargs)
kwargs = kwargs.copy()
if kwargs.get("permissions", None) is None:
kwargs.pop("permissions", None)
if kwargs.get("replication", None) is None:
kwargs.pop("replication", None)
self.permissions = kwargs.pop("permissions", 511)
tempdir = kwargs.pop("tempdir")
if kwargs.pop("autocommit", False) is False:
self.target = self.path
self.path = os.path.join(tempdir, str(uuid.uuid4()))
def _upload_chunk(self, final=False):
"""Write one part of a multi-block file upload
Parameters
        ----------
final: bool
            This is the last block, so should complete the file, if
self.autocommit is True.
"""
out = self.fs.session.post(
self.location,
data=self.buffer.getvalue(),
headers={"content-type": "application/octet-stream"},
)
out.raise_for_status()
return True
def _initiate_upload(self):
"""Create remote file/upload"""
kwargs = self.kwargs.copy()
if "a" in self.mode:
op, method = "APPEND", "POST"
else:
op, method = "CREATE", "PUT"
kwargs["overwrite"] = "true"
out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
location = self.fs._apply_proxy(out.headers["Location"])
if "w" in self.mode:
# create empty file to append to
out2 = self.fs.session.put(
location, headers={"content-type": "application/octet-stream"}
)
out2.raise_for_status()
# after creating empty file, change location to append to
out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
self.location = self.fs._apply_proxy(out2.headers["Location"])
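    # Note on the write protocol (a summary of the code above, not upstream
    # commentary): WebHDFS CREATE and APPEND both answer with a redirect to a
    # data node; because ``redirect=False`` is passed, the redirect target is
    # read from the ``Location`` header (optionally rewritten by ``data_proxy``)
    # and each buffered chunk is then POSTed to that data-node URL by
    # ``_upload_chunk``.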
def _fetch_range(self, start, end):
start = max(start, 0)
end = min(self.size, end)
if start >= end or start >= self.size:
return b""
out = self.fs._call(
"OPEN", path=self.path, offset=start, length=end - start, redirect=False
)
out.raise_for_status()
if "Location" in out.headers:
location = out.headers["Location"]
out2 = self.fs.session.get(self.fs._apply_proxy(location))
return out2.content
else:
return out.content
def commit(self):
self.fs.mv(self.path, self.target)
def discard(self):
self.fs.rm(self.path)

View File

@@ -0,0 +1,177 @@
import os
import zipfile
import fsspec
from fsspec.archive import AbstractArchiveFileSystem
class ZipFileSystem(AbstractArchiveFileSystem):
"""Read/Write contents of ZIP archive as a file-system
Keeps file object open while instance lives.
This class is pickleable, but not necessarily thread-safe
"""
root_marker = ""
protocol = "zip"
cachable = False
def __init__(
self,
fo="",
mode="r",
target_protocol=None,
target_options=None,
compression=zipfile.ZIP_STORED,
allowZip64=True,
compresslevel=None,
**kwargs,
):
"""
Parameters
----------
fo: str or file-like
Contains ZIP, and must exist. If a str, will fetch file using
:meth:`~fsspec.open_files`, which must return one file exactly.
mode: str
Accept: "r", "w", "a"
target_protocol: str (optional)
If ``fo`` is a string, this value can be used to override the
FS protocol inferred from a URL
target_options: dict (optional)
Kwargs passed when instantiating the target FS, if ``fo`` is
a string.
compression, allowZip64, compresslevel: passed to ZipFile
Only relevant when creating a ZIP
"""
super().__init__(self, **kwargs)
if mode not in set("rwa"):
raise ValueError(f"mode '{mode}' no understood")
self.mode = mode
if isinstance(fo, (str, os.PathLike)):
if mode == "a":
m = "r+b"
else:
m = mode + "b"
fo = fsspec.open(
fo, mode=m, protocol=target_protocol, **(target_options or {})
)
self.force_zip_64 = allowZip64
self.of = fo
self.fo = fo.__enter__() # the whole instance is a context
self.zip = zipfile.ZipFile(
self.fo,
mode=mode,
compression=compression,
allowZip64=allowZip64,
compresslevel=compresslevel,
)
self.dir_cache = None
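    # Usage sketch (file names are assumptions): read an existing archive or
    # create a new one; in "w"/"a" mode the archive is finalised on close().
    #
    #     fs = ZipFileSystem("/tmp/data.zip")            # read-only
    #     wfs = ZipFileSystem("/tmp/out.zip", mode="w")
    #     wfs.pipe_file("folder/a.txt", b"hello")
    #     wfs.close()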
@classmethod
def _strip_protocol(cls, path):
# zip file paths are always relative to the archive root
return super()._strip_protocol(path).lstrip("/")
def __del__(self):
if hasattr(self, "zip"):
self.close()
del self.zip
def close(self):
"""Commits any write changes to the file. Done on ``del`` too."""
self.zip.close()
def _get_dirs(self):
if self.dir_cache is None or self.mode in set("wa"):
# when writing, dir_cache is always in the ZipFile's attributes,
# not read from the file.
files = self.zip.infolist()
self.dir_cache = {
dirname.rstrip("/"): {
"name": dirname.rstrip("/"),
"size": 0,
"type": "directory",
}
for dirname in self._all_dirnames(self.zip.namelist())
}
for z in files:
f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
f.update(
{
"name": z.filename.rstrip("/"),
"size": z.file_size,
"type": ("directory" if z.is_dir() else "file"),
}
)
self.dir_cache[f["name"]] = f
def pipe_file(self, path, value, **kwargs):
# override upstream, because we know the exact file size in this case
self.zip.writestr(path, value, **kwargs)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
path = self._strip_protocol(path)
if "r" in mode and self.mode in set("wa"):
if self.exists(path):
raise OSError("ZipFS can only be open for reading or writing, not both")
raise FileNotFoundError(path)
if "r" in self.mode and "w" in mode:
raise OSError("ZipFS can only be open for reading or writing, not both")
out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64)
if "r" in mode:
info = self.info(path)
out.size = info["size"]
out.name = info["name"]
return out
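    # Example sketch (the member name is an assumption): reading a member
    # returns the underlying zipfile object, annotated with fsspec-style
    # size and name.
    #
    #     with fs.open("folder/a.txt") as f:
    #         data = f.read()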
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")
# Remove the leading slash, as the zip file paths are always
# given without a leading slash
path = path.lstrip("/")
path_parts = list(filter(lambda s: bool(s), path.split("/")))
def _matching_starts(file_path):
file_parts = filter(lambda s: bool(s), file_path.split("/"))
return all(a == b for a, b in zip(path_parts, file_parts))
self._get_dirs()
result = {}
# To match posix find, if an exact file name is given, we should
# return only that file
if path in self.dir_cache and self.dir_cache[path]["type"] == "file":
result[path] = self.dir_cache[path]
return result if detail else [path]
for file_path, file_info in self.dir_cache.items():
if not (path == "" or _matching_starts(file_path)):
continue
if file_info["type"] == "directory":
if withdirs:
if file_path not in result:
result[file_path.strip("/")] = file_info
continue
if file_path not in result:
result[file_path] = file_info if detail else None
if maxdepth:
path_depth = path.count("/")
result = {
k: v for k, v in result.items() if k.count("/") - path_depth < maxdepth
}
return result if detail else sorted(result)
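    # find() sketch (paths are assumptions): list all members below a prefix,
    # optionally including directory entries and limiting depth.
    #
    #     fs.find("folder")                        # e.g. ["folder/a.txt", ...]
    #     fs.find("", withdirs=True, maxdepth=1)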