Add adaptation for the circling-reconnaissance scenario

2026-01-08 15:44:38 +08:00
parent 3eba1f962b
commit 10c5bb5a8a
5441 changed files with 40219 additions and 379695 deletions

View File

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID
__version__ = version = '2025.10.0'
__version_tuple__ = version_tuple = (2025, 10, 0)
__version__ = version = '2025.12.0'
__version_tuple__ = version_tuple = (2025, 12, 0)
__commit_id__ = commit_id = None

View File

@@ -328,6 +328,11 @@ class AsyncFileSystem(AbstractFileSystem):
return self._loop
async def _rm_file(self, path, **kwargs):
if (
inspect.iscoroutinefunction(self._rm)
and type(self)._rm is not AsyncFileSystem._rm
):
return await self._rm(path, recursive=False, batch_size=1, **kwargs)
raise NotImplementedError
async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
@@ -776,6 +781,7 @@ class AsyncFileSystem(AbstractFileSystem):
min_idx = min(idx_star, idx_qmark, idx_brace)
detail = kwargs.pop("detail", False)
withdirs = kwargs.pop("withdirs", True)
if not has_magic(path):
if await self._exists(path, **kwargs):
@@ -805,7 +811,7 @@ class AsyncFileSystem(AbstractFileSystem):
depth = None
allpaths = await self._find(
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
root, maxdepth=depth, withdirs=withdirs, detail=True, **kwargs
)
pattern = glob_translate(path + ("/" if ends_with_sep else ""))

View File

@@ -6,20 +6,12 @@ import logging
import math
import os
import threading
import warnings
from collections import OrderedDict
from collections.abc import Callable
from concurrent.futures import Future, ThreadPoolExecutor
from itertools import groupby
from operator import itemgetter
from typing import (
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Generic,
NamedTuple,
TypeVar,
)
from typing import TYPE_CHECKING, Any, ClassVar, Generic, NamedTuple, TypeVar
if TYPE_CHECKING:
import mmap
@@ -629,7 +621,7 @@ class KnownPartsOfAFile(BaseCache):
fetcher: Fetcher,
size: int,
data: dict[tuple[int, int], bytes] | None = None,
strict: bool = True,
strict: bool = False,
**_: Any,
):
super().__init__(blocksize, fetcher, size)
@@ -653,50 +645,65 @@ class KnownPartsOfAFile(BaseCache):
else:
self.data = {}
@property
def size(self):
return sum(_[1] - _[0] for _ in self.data)
@size.setter
def size(self, value):
pass
@property
def nblocks(self):
return len(self.data)
@nblocks.setter
def nblocks(self, value):
pass
def _fetch(self, start: int | None, stop: int | None) -> bytes:
if start is None:
start = 0
if stop is None:
stop = self.size
self.total_requested_bytes += stop - start
out = b""
for (loc0, loc1), data in self.data.items():
# If self.strict=False, use zero-padded data
# for reads beyond the end of a "known" buffer
if loc0 <= start < loc1:
started = False
loc_old = 0
for loc0, loc1 in sorted(self.data):
if (loc0 <= start < loc1) and (loc0 <= stop <= loc1):
# entirely within the block
off = start - loc0
out = data[off : off + stop - start]
if not self.strict or loc0 <= stop <= loc1:
# The request is within a known range, or
# it begins within a known range, and we
# are allowed to pad reads beyond the
# buffer with zero
out += b"\x00" * (stop - start - len(out))
self.hit_count += 1
return out
else:
# The request ends outside a known range,
# and we are being "strict" about reads
# beyond the buffer
start = loc1
break
# We only get here if there is a request outside the
# known parts of the file. In an ideal world, this
# should never happen
if self.fetcher is None:
# We cannot fetch the data, so raise an error
raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ")
# We can fetch the data, but should warn the user
# that this may be slow
warnings.warn(
f"Read is outside the known file parts: {(start, stop)}. "
f"IO/caching performance may be poor!"
)
logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
self.total_requested_bytes += stop - start
self.hit_count += 1
return self.data[(loc0, loc1)][off : off + stop - start]
if stop <= loc0:
break
if started and loc0 > loc_old:
# a gap where we need data
self.miss_count += 1
if self.strict:
raise ValueError
out += b"\x00" * (loc0 - loc_old)
if loc0 <= start < loc1:
# found the start
self.hit_count += 1
off = start - loc0
out = self.data[(loc0, loc1)][off : off + stop - start]
started = True
elif start < loc0 and stop > loc1:
# the whole block
self.hit_count += 1
out += self.data[(loc0, loc1)]
elif loc0 <= stop <= loc1:
# end block
self.hit_count += 1
return out + self.data[(loc0, loc1)][: stop - loc0]
loc_old = loc1
self.miss_count += 1
return out + super()._fetch(start, stop)
if started and not self.strict:
return out + b"\x00" * (stop - loc_old)
raise ValueError
class UpdatableLRU(Generic[P, T]):

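The rewritten KnownPartsOfAFile._fetch walks the known byte ranges in sorted order, zero-filling any gap when strict=False (now the default) and raising ValueError only in strict mode; the old warn-and-fall-back-to-the-fetcher path is gone. A minimal sketch exercising the cache directly, with made-up offsets and no fetcher (all values below are illustrative, not taken from this commit):

from fsspec.caching import KnownPartsOfAFile

# Two non-contiguous "known" parts of a notional 40-byte file.
known = {(0, 10): b"0123456789", (20, 30): b"abcdefghij"}
cache = KnownPartsOfAFile(
    blocksize=1024, fetcher=None, size=40, data=known, strict=False
)

print(cache._fetch(0, 5))   # b"01234": read lies entirely inside one known part
print(cache._fetch(5, 25))  # b"56789" + 10 zero bytes + b"abcde": the gap is zero-padded
# With strict=True the second read would raise ValueError instead of padding.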
View File

@@ -18,7 +18,7 @@ from fsspec.caching import ( # noqa: F401
)
from fsspec.compression import compr
from fsspec.config import conf
from fsspec.registry import filesystem, get_filesystem_class
from fsspec.registry import available_protocols, filesystem, get_filesystem_class
from fsspec.utils import (
_unstrip_protocol,
build_name_function,
@@ -334,34 +334,51 @@ def _un_chain(path, kwargs):
if "::" in path:
x = re.compile(".*[^a-z]+.*") # test for non protocol-like single word
known_protocols = set(available_protocols())
bits = []
# split on '::', then ensure each bit has a protocol
for p in path.split("::"):
if "://" in p or x.match(p):
if p in known_protocols:
bits.append(p + "://")
elif "://" in p or x.match(p):
bits.append(p)
else:
bits.append(p + "://")
else:
bits = [path]
# [[url, protocol, kwargs], ...]
out = []
previous_bit = None
kwargs = kwargs.copy()
for bit in reversed(bits):
protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
cls = get_filesystem_class(protocol)
extra_kwargs = cls._get_kwargs_from_urls(bit)
kws = kwargs.pop(protocol, {})
if bit is bits[0]:
kws.update(kwargs)
kw = dict(
**{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
**kws,
)
bit = cls._strip_protocol(bit)
if "target_protocol" not in kw and issubclass(cls, ChainedFileSystem):
if (
"target_protocol" not in kw
and issubclass(cls, ChainedFileSystem)
and not bit
):
# replace bit if we are chaining and no path given
bit = previous_bit
out.append((bit, protocol, kw))
previous_bit = bit
out.reverse()
return out

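The reworked _un_chain checks each "::"-separated segment against the registered protocols, so a bare protocol name such as simplecache is recognised even without "://", and a chained filesystem with no path of its own inherits the path of the layer to its right. A hedged sketch of the chained-URL syntax this function parses, with per-protocol options passed under keys named after each protocol (the local path and cache directory are made up):

import fsspec

# "simplecache" wraps "file": the cache layer has no path of its own and
# reuses the target path resolved by _un_chain.
of = fsspec.open(
    "simplecache::file:///tmp/example.csv",             # hypothetical local file
    mode="rb",
    simplecache={"cache_storage": "/tmp/fsspec-cache"},  # routed to the cache layer
)
with of as f:
    header = f.read(64)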
View File

@@ -205,11 +205,11 @@ class ArrowFSWrapper(AbstractFileSystem):
return self.fs.get_file_info(path).mtime
def cat_file(self, path, start=None, end=None, **kwargs):
kwargs["seekable"] = start not in [None, 0]
kwargs.setdefault("seekable", start not in [None, 0])
return super().cat_file(path, start=None, end=None, **kwargs)
def get_file(self, rpath, lpath, **kwargs):
kwargs["seekable"] = False
kwargs.setdefault("seekable", False)
super().get_file(rpath, lpath, **kwargs)
@@ -223,7 +223,6 @@ class ArrowFSWrapper(AbstractFileSystem):
"readable",
"writable",
"close",
"size",
"seekable",
],
)
@@ -241,6 +240,10 @@ class ArrowFile(io.IOBase):
def __enter__(self):
return self
@property
def size(self):
return self.stream.size()
def __exit__(self, *args):
return self.close()

View File

@@ -5,6 +5,8 @@ import inspect
import fsspec
from fsspec.asyn import AsyncFileSystem, running_async
from .chained import ChainedFileSystem
def async_wrapper(func, obj=None, semaphore=None):
"""
@@ -35,7 +37,7 @@ def async_wrapper(func, obj=None, semaphore=None):
return wrapper
class AsyncFileSystemWrapper(AsyncFileSystem):
class AsyncFileSystemWrapper(AsyncFileSystem, ChainedFileSystem):
"""
A wrapper class to convert a synchronous filesystem into an asynchronous one.

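Deriving AsyncFileSystemWrapper from ChainedFileSystem (and registering it as "async_wrapper" later in this commit) lets it take part in protocol chaining. A minimal sketch of the wrapper itself, assuming, as in current fsspec, that the wrapped synchronous filesystem is the first constructor argument and that thread-backed async counterparts such as _ls are generated automatically (the listed path is illustrative):

import asyncio
import fsspec
from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper

async def main():
    sync_fs = fsspec.filesystem("file")         # plain synchronous local filesystem
    afs = AsyncFileSystemWrapper(sync_fs)       # now usable from async code
    return await afs._ls("/tmp", detail=False)  # the sync ls runs in a worker thread

print(asyncio.run(main()))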
View File

@@ -15,9 +15,7 @@ except ImportError:
if TYPE_CHECKING:
from collections.abc import Iterator
from typing import Any, Literal
from typing_extensions import TypeAlias
from typing import Any, Literal, TypeAlias
from .cached import CachingFileSystem

View File

@@ -6,8 +6,9 @@ import os
import tempfile
import time
import weakref
from collections.abc import Callable
from shutil import rmtree
from typing import TYPE_CHECKING, Any, Callable, ClassVar
from typing import TYPE_CHECKING, Any, ClassVar
from fsspec import filesystem
from fsspec.callbacks import DEFAULT_CALLBACK

View File

@@ -1,6 +1,5 @@
import base64
import io
from typing import Optional
from urllib.parse import unquote
from fsspec import AbstractFileSystem
@@ -50,7 +49,7 @@ class DataFileSystem(AbstractFileSystem):
return io.BytesIO(self.cat_file(path))
@staticmethod
def encode(data: bytes, mime: Optional[str] = None):
def encode(data: bytes, mime: str | None = None):
"""Format the given data into data-URL syntax
This version always base64 encodes, even when the data is ascii/url-safe.

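DataFileSystem.encode builds a data: URL that the same filesystem can read back. A small sketch; the URL in the comment is what base64-encoding b"hello" should produce and is stated here as an assumption, not taken from this commit:

from fsspec.implementations.data import DataFileSystem

url = DataFileSystem.encode(b"hello", mime="text/plain")
# expected: "data:text/plain;base64,aGVsbG8=" (always base64, even for ASCII-safe data)

fs = DataFileSystem()
print(fs.cat_file(url))  # b"hello" round-tripped from the data URL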
View File

@@ -1,8 +1,9 @@
from .. import filesystem
from ..asyn import AsyncFileSystem
from .chained import ChainedFileSystem
class DirFileSystem(AsyncFileSystem):
class DirFileSystem(AsyncFileSystem, ChainedFileSystem):
"""Directory prefix filesystem
The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with

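DirFileSystem prefixes every path it receives with a fixed directory on the wrapped filesystem, so callers see that directory as the root. A hedged sketch (the prefix and file names are made up):

import fsspec

inner = fsspec.filesystem("file")
fs = fsspec.filesystem("dir", path="/srv/data", fs=inner)  # hypothetical prefix

print(fs.ls("/"))                     # lists /srv/data on the local filesystem
print(fs.cat("reports/summary.txt"))  # actually reads /srv/data/reports/summary.txt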
View File

@@ -327,7 +327,7 @@ class HTTPFileSystem(AsyncFileSystem):
async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
self._raise_not_found_for_status(resp, rpath)
async def _exists(self, path, **kwargs):
async def _exists(self, path, strict=False, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
try:
@@ -335,8 +335,14 @@ class HTTPFileSystem(AsyncFileSystem):
session = await self.set_session()
r = await session.get(self.encode_url(path), **kw)
async with r:
if strict:
self._raise_not_found_for_status(r, path)
return r.status < 400
except FileNotFoundError:
return False
except aiohttp.ClientError:
if strict:
raise
return False
async def _isfile(self, path, **kwargs):

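With the new strict flag, a plain 404 still reports False (the FileNotFoundError branch runs first), but connection failures and non-404 HTTP errors now propagate instead of being silently folded into "does not exist". A hedged sketch using a deliberately unresolvable host:

import fsspec

fs = fsspec.filesystem("http")

# Default: any failure (DNS error, timeout, 5xx, ...) is reported as False,
# indistinguishable from a genuine 404.
print(fs.exists("https://no-such-host.invalid/file"))

# strict=True: the underlying error is raised instead of being swallowed.
try:
    fs.exists("https://no-such-host.invalid/file", strict=True)
except Exception as exc:
    print(f"exists() surfaced the failure: {exc!r}")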
View File

@@ -463,14 +463,20 @@ class HTTPFileSystem(AbstractFileSystem):
end -= 1 # bytes range is inclusive
return f"bytes={start}-{end}"
def exists(self, path, **kwargs):
def exists(self, path, strict=False, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
try:
logger.debug(path)
r = self.session.get(self.encode_url(path), **kw)
if strict:
self._raise_not_found_for_status(r, path)
return r.status_code < 400
except FileNotFoundError:
return False
except Exception:
if strict:
raise
return False
def isfile(self, path, **kwargs):

View File

@@ -195,7 +195,7 @@ class LibArchiveFileSystem(AbstractArchiveFileSystem):
if mode != "rb":
raise NotImplementedError
data = bytes()
data = b""
with self._open_archive() as arc:
for entry in arc:
if entry.pathname != path:

View File

@@ -219,7 +219,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
return LazyReferenceMapper(root, fs, **kwargs)
@lru_cache()
@lru_cache
def listdir(self):
"""List top-level directories"""
dirs = (p.rsplit("/", 1)[0] for p in self.zmetadata if not p.startswith(".z"))

View File

@@ -1,13 +1,8 @@
import json
from collections.abc import Mapping, Sequence
from collections.abc import Callable, Mapping, Sequence
from contextlib import suppress
from pathlib import PurePath
from typing import (
Any,
Callable,
ClassVar,
Optional,
)
from typing import Any, ClassVar
from .registry import _import_class, get_filesystem_class
from .spec import AbstractFileSystem
@@ -45,12 +40,12 @@ class FilesystemJSONDecoder(json.JSONDecoder):
def __init__(
self,
*,
object_hook: Optional[Callable[[dict[str, Any]], Any]] = None,
parse_float: Optional[Callable[[str], Any]] = None,
parse_int: Optional[Callable[[str], Any]] = None,
parse_constant: Optional[Callable[[str], Any]] = None,
object_hook: Callable[[dict[str, Any]], Any] | None = None,
parse_float: Callable[[str], Any] | None = None,
parse_int: Callable[[str], Any] | None = None,
parse_constant: Callable[[str], Any] | None = None,
strict: bool = True,
object_pairs_hook: Optional[Callable[[list[tuple[str, Any]]], Any]] = None,
object_pairs_hook: Callable[[list[tuple[str, Any]]], Any] | None = None,
) -> None:
self.original_object_hook = object_hook

View File

@@ -1,8 +1,12 @@
import io
import json
import warnings
from typing import Literal
import fsspec
from .core import url_to_fs
from .spec import AbstractBufferedFile
from .utils import merge_offset_ranges
# Parquet-Specific Utilities for fsspec
@@ -14,19 +18,24 @@ from .utils import merge_offset_ranges
# on remote file systems.
def open_parquet_file(
path,
mode="rb",
fs=None,
class AlreadyBufferedFile(AbstractBufferedFile):
def _fetch_range(self, start, end):
raise NotImplementedError
def open_parquet_files(
path: list[str],
mode: Literal["rb"] = "rb",
fs: None | fsspec.AbstractFileSystem = None,
metadata=None,
columns=None,
row_groups=None,
storage_options=None,
strict=False,
engine="auto",
max_gap=64_000,
max_block=256_000_000,
footer_sample_size=1_000_000,
columns: None | list[str] = None,
row_groups: None | list[int] = None,
storage_options: None | dict = None,
engine: str = "auto",
max_gap: int = 64_000,
max_block: int = 256_000_000,
footer_sample_size: int = 1_000_000,
filters: None | list[list[list[str]]] = None,
**kwargs,
):
"""
@@ -72,12 +81,6 @@ def open_parquet_file(
storage_options : dict, optional
Used to generate an `AbstractFileSystem` object if `fs` was
not specified.
strict : bool, optional
Whether the resulting `KnownPartsOfAFile` cache should
fetch reads that go beyond a known byte-range boundary.
If `False` (the default), any read that ends outside a
known part will be zero padded. Note that using
`strict=True` may be useful for debugging.
max_gap : int, optional
Neighboring byte ranges will only be merged when their
inter-range gap is <= `max_gap`. Default is 64KB.
@@ -89,6 +92,10 @@ def open_parquet_file(
for the footer metadata. If the sampled bytes do not contain
the footer, a second read request will be required, and
performance will suffer. Default is 1MB.
filters : list[list], optional
List of filters used to skip row groups that cannot match, in the
same format as accepted by the loading engines (see the sketch
after this docstring). Ignored if ``row_groups`` is specified.
**kwargs :
Optional key-word arguments to pass to `fs.open`
"""
@@ -96,20 +103,36 @@ def open_parquet_file(
# Make sure we have an `AbstractFileSystem` object
# to work with
if fs is None:
fs = url_to_fs(path, **(storage_options or {}))[0]
path0 = path
if isinstance(path, (list, tuple)):
path = path[0]
fs, path = url_to_fs(path, **(storage_options or {}))
else:
path0 = path
# For now, `columns == []` not supported. Just use
# default `open` command with `path` input
# For now, `columns == []` is not supported; it is treated
# the same as selecting all columns
if columns is not None and len(columns) == 0:
return fs.open(path, mode=mode)
columns = None
# Set the engine
engine = _set_engine(engine)
# Fetch the known byte ranges needed to read
# `columns` and/or `row_groups`
if isinstance(path0, (list, tuple)):
paths = path0
elif "*" in path:
paths = fs.glob(path)
elif path0.endswith("/"): # or fs.isdir(path):
paths = [
_
for _ in fs.find(path, withdirs=False, detail=False)
if _.endswith((".parquet", ".parq"))
]
else:
paths = [path]
data = _get_parquet_byte_ranges(
[path],
paths,
fs,
metadata=metadata,
columns=columns,
@@ -118,24 +141,37 @@ def open_parquet_file(
max_gap=max_gap,
max_block=max_block,
footer_sample_size=footer_sample_size,
filters=filters,
)
# Extract file name from `data`
fn = next(iter(data)) if data else path
# Call self.open with "parts" caching
options = kwargs.pop("cache_options", {}).copy()
return fs.open(
fn,
mode=mode,
cache_type="parts",
cache_options={
**options,
"data": data.get(fn, {}),
"strict": strict,
},
**kwargs,
)
return [
AlreadyBufferedFile(
fs=None,
path=fn,
mode=mode,
cache_type="parts",
cache_options={
**options,
"data": data.get(fn, {}),
},
size=max(_[1] for _ in data.get(fn, {})),
**kwargs,
)
for fn in data
]
def open_parquet_file(*args, **kwargs):
"""Create files tailed to reading specific parts of parquet files
Please see ``open_parquet_files`` for details of the arguments. The
difference is, this function always returns a single ``AleadyBufferedFile``,
whereas `open_parquet_files`` always returns a list of files, even if
there are one or zero matching parquet files.
"""
return open_parquet_files(*args, **kwargs)[0]
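A hedged usage sketch contrasting the two entry points; the bucket and object names are made up, and anon=True assumes a public bucket with s3fs installed:

from fsspec.parquet import open_parquet_file, open_parquet_files

# One path -> a single AlreadyBufferedFile whose "parts" cache already holds
# the footer plus the byte ranges needed for the selected columns.
f = open_parquet_file(
    "s3://example-bucket/table/part-0.parquet",
    columns=["id", "value"],
    storage_options={"anon": True},
)

# A trailing-slash directory or a glob -> a list, possibly empty.
files = open_parquet_files(
    "s3://example-bucket/table/",
    columns=["id"],
    storage_options={"anon": True},
)

Either result is an ordinary file-like object, so it can be handed to a parquet reader in place of a path.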
def _get_parquet_byte_ranges(
@@ -148,6 +184,7 @@ def _get_parquet_byte_ranges(
max_block=256_000_000,
footer_sample_size=1_000_000,
engine="auto",
filters=None,
):
"""Get a dictionary of the known byte ranges needed
to read a specific column/row-group selection from a
@@ -172,6 +209,7 @@ def _get_parquet_byte_ranges(
row_groups=row_groups,
max_gap=max_gap,
max_block=max_block,
filters=filters,
)
# Get file sizes asynchronously
@@ -183,17 +221,16 @@ def _get_parquet_byte_ranges(
data_starts = []
data_ends = []
add_header_magic = True
if columns is None and row_groups is None:
if columns is None and row_groups is None and filters is None:
# We are NOT selecting specific columns or row-groups.
#
# We can avoid sampling the footers, and just transfer
# all file data with cat_ranges
for i, path in enumerate(paths):
result[path] = {}
for b in range(0, file_sizes[i], max_block):
data_paths.append(path)
data_starts.append(b)
data_ends.append(min(b + max_block, file_sizes[i]))
data_paths.append(path)
data_starts.append(0)
data_ends.append(file_sizes[i])
add_header_magic = False # "Magic" should already be included
else:
# We ARE selecting specific columns or row-groups.
@@ -235,29 +272,21 @@ def _get_parquet_byte_ranges(
# Calculate required byte ranges for each path
for i, path in enumerate(paths):
# Deal with small-file case.
# Just include all remaining bytes of the file
# in a single range.
if file_sizes[i] < max_block:
if footer_starts[i] > 0:
# Only need to transfer the data if the
# footer sample isn't already the whole file
data_paths.append(path)
data_starts.append(0)
data_ends.append(footer_starts[i])
continue
# Use "engine" to collect data byte ranges
path_data_starts, path_data_ends = engine._parquet_byte_ranges(
columns,
row_groups=row_groups,
footer=footer_samples[i],
footer_start=footer_starts[i],
filters=filters,
)
data_paths += [path] * len(path_data_starts)
data_starts += path_data_starts
data_ends += path_data_ends
result.setdefault(path, {})[(footer_starts[i], file_sizes[i])] = (
footer_samples[i]
)
# Merge adjacent offset ranges
data_paths, data_starts, data_ends = merge_offset_ranges(
@@ -291,6 +320,7 @@ def _get_parquet_byte_ranges_from_metadata(
row_groups=None,
max_gap=64_000,
max_block=256_000_000,
filters=None,
):
"""Simplified version of `_get_parquet_byte_ranges` for
the case that an engine-specific `metadata` object is
@@ -300,9 +330,7 @@ def _get_parquet_byte_ranges_from_metadata(
# Use "engine" to collect data byte ranges
data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
columns,
row_groups=row_groups,
metadata=metadata,
columns, row_groups=row_groups, metadata=metadata, filters=filters
)
# Merge adjacent offset ranges
@@ -401,16 +429,19 @@ class FastparquetEngine:
metadata=None,
footer=None,
footer_start=None,
filters=None,
):
# Initialize offset ranges and define ParquetFile metadata
pf = metadata
data_paths, data_starts, data_ends = [], [], []
if filters and row_groups:
raise ValueError("filters and row_groups cannot be used together")
if pf is None:
pf = self.fp.ParquetFile(io.BytesIO(footer))
# Convert columns to a set and add any index columns
# specified in the pandas metadata (just in case)
column_set = None if columns is None else set(columns)
column_set = None if columns is None else {c.split(".", 1)[0] for c in columns}
if column_set is not None and hasattr(pf, "pandas_metadata"):
md_index = [
ind
@@ -422,7 +453,12 @@ class FastparquetEngine:
# Check if row_groups is a list of integers
# or a list of row-group metadata
if row_groups and not isinstance(row_groups[0], int):
if filters:
from fastparquet.api import filter_row_groups
row_group_indices = None
row_groups = filter_row_groups(pf, filters)
elif row_groups and not isinstance(row_groups[0], int):
# Input row_groups contains row-group metadata
row_group_indices = None
else:
@@ -486,9 +522,12 @@ class PyarrowEngine:
metadata=None,
footer=None,
footer_start=None,
filters=None,
):
if metadata is not None:
raise ValueError("metadata input not supported for PyarrowEngine")
if filters:
raise NotImplementedError
data_starts, data_ends = [], []
md = self.pq.ParquetFile(io.BytesIO(footer)).metadata

View File

@@ -72,6 +72,9 @@ known_implementations = {
"class": "fsspec.implementations.arrow.HadoopFileSystem",
"err": "pyarrow and local java libraries required for HDFS",
},
"async_wrapper": {
"class": "fsspec.implementations.asyn_wrapper.AsyncFileSystemWrapper",
},
"asynclocal": {
"class": "morefs.asyn_local.AsyncLocalFileSystem",
"err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",

View File

@@ -7,23 +7,16 @@ import os
import re
import sys
import tempfile
from collections.abc import Iterable, Iterator, Sequence
from collections.abc import Callable, Iterable, Iterator, Sequence
from functools import partial
from hashlib import md5
from importlib.metadata import version
from typing import (
IO,
TYPE_CHECKING,
Any,
Callable,
TypeVar,
)
from typing import IO, TYPE_CHECKING, Any, TypeVar
from urllib.parse import urlsplit
if TYPE_CHECKING:
import pathlib
from typing_extensions import TypeGuard
from typing import TypeGuard
from fsspec.spec import AbstractFileSystem