Add adaptation for the surround-reconnaissance scenario
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

__version__ = version = '2025.10.0'
__version_tuple__ = version_tuple = (2025, 10, 0)
__version__ = version = '2025.12.0'
__version_tuple__ = version_tuple = (2025, 12, 0)

__commit_id__ = commit_id = None

@@ -328,6 +328,11 @@ class AsyncFileSystem(AbstractFileSystem):
return self._loop

async def _rm_file(self, path, **kwargs):
if (
inspect.iscoroutinefunction(self._rm)
and type(self)._rm is not AsyncFileSystem._rm
):
return await self._rm(path, recursive=False, batch_size=1, **kwargs)
raise NotImplementedError

async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
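
This hunk gives AsyncFileSystem a default `_rm_file` that simply delegates to an overridden `_rm` coroutine. A minimal sketch of what that enables, using a hypothetical subclass that only implements bulk delete (it relies on the behaviour added here, not older releases):

    import asyncio
    from fsspec.asyn import AsyncFileSystem

    class DemoAsyncFS(AsyncFileSystem):
        # hypothetical backend: only the bulk _rm coroutine is overridden
        async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
            print(f"would delete {path}")

    # the inherited _rm_file sees that _rm is an overridden coroutine and
    # awaits self._rm(path, recursive=False, batch_size=1)
    asyncio.run(DemoAsyncFS(asynchronous=True)._rm_file("bucket/key"))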

@@ -776,6 +781,7 @@ class AsyncFileSystem(AbstractFileSystem):
min_idx = min(idx_star, idx_qmark, idx_brace)

detail = kwargs.pop("detail", False)
withdirs = kwargs.pop("withdirs", True)

if not has_magic(path):
if await self._exists(path, **kwargs):

@@ -805,7 +811,7 @@ class AsyncFileSystem(AbstractFileSystem):
depth = None

allpaths = await self._find(
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
root, maxdepth=depth, withdirs=withdirs, detail=True, **kwargs
)

pattern = glob_translate(path + ("/" if ends_with_sep else ""))

@@ -6,20 +6,12 @@ import logging
import math
import os
import threading
import warnings
from collections import OrderedDict
from collections.abc import Callable
from concurrent.futures import Future, ThreadPoolExecutor
from itertools import groupby
from operator import itemgetter
from typing import (
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Generic,
NamedTuple,
TypeVar,
)
from typing import TYPE_CHECKING, Any, ClassVar, Generic, NamedTuple, TypeVar

if TYPE_CHECKING:
import mmap

@@ -629,7 +621,7 @@ class KnownPartsOfAFile(BaseCache):
fetcher: Fetcher,
size: int,
data: dict[tuple[int, int], bytes] | None = None,
strict: bool = True,
strict: bool = False,
**_: Any,
):
super().__init__(blocksize, fetcher, size)

@@ -653,50 +645,65 @@ class KnownPartsOfAFile(BaseCache):
else:
self.data = {}

@property
def size(self):
return sum(_[1] - _[0] for _ in self.data)

@size.setter
def size(self, value):
pass

@property
def nblocks(self):
return len(self.data)

@nblocks.setter
def nblocks(self, value):
pass

def _fetch(self, start: int | None, stop: int | None) -> bytes:
if start is None:
start = 0
if stop is None:
stop = self.size
self.total_requested_bytes += stop - start

out = b""
for (loc0, loc1), data in self.data.items():
# If self.strict=False, use zero-padded data
# for reads beyond the end of a "known" buffer
if loc0 <= start < loc1:
started = False
loc_old = 0
for loc0, loc1 in sorted(self.data):
if (loc0 <= start < loc1) and (loc0 <= stop <= loc1):
# entirely within the block
off = start - loc0
out = data[off : off + stop - start]
if not self.strict or loc0 <= stop <= loc1:
# The request is within a known range, or
# it begins within a known range, and we
# are allowed to pad reads beyond the
# buffer with zero
out += b"\x00" * (stop - start - len(out))
self.hit_count += 1
return out
else:
# The request ends outside a known range,
# and we are being "strict" about reads
# beyond the buffer
start = loc1
break

# We only get here if there is a request outside the
# known parts of the file. In an ideal world, this
# should never happen
if self.fetcher is None:
# We cannot fetch the data, so raise an error
raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ")
# We can fetch the data, but should warn the user
# that this may be slow
warnings.warn(
f"Read is outside the known file parts: {(start, stop)}. "
f"IO/caching performance may be poor!"
)
logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
self.total_requested_bytes += stop - start
self.hit_count += 1
return self.data[(loc0, loc1)][off : off + stop - start]
if stop <= loc0:
break
if started and loc0 > loc_old:
# a gap where we need data
self.miss_count += 1
if self.strict:
raise ValueError
out += b"\x00" * (loc0 - loc_old)
if loc0 <= start < loc1:
# found the start
self.hit_count += 1
off = start - loc0
out = self.data[(loc0, loc1)][off : off + stop - start]
started = True
elif start < loc0 and stop > loc1:
# the whole block
self.hit_count += 1
out += self.data[(loc0, loc1)]
elif loc0 <= stop <= loc1:
# end block
self.hit_count += 1
return out + self.data[(loc0, loc1)][: stop - loc0]
loc_old = loc1
self.miss_count += 1
return out + super()._fetch(start, stop)
if started and not self.strict:
return out + b"\x00" * (stop - loc_old)
raise ValueError


class UpdatableLRU(Generic[P, T]):
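
The rewritten `_fetch` walks the known parts in sorted order and, when `strict=False` (now the default), zero-pads any gap it crosses. A small sketch of that behaviour, constructing the cache directly with two known byte ranges and no fetcher:

    from fsspec.caching import KnownPartsOfAFile

    parts = {(0, 4): b"abcd", (10, 14): b"wxyz"}   # {(start, stop): bytes}
    cache = KnownPartsOfAFile(
        blocksize=None, fetcher=None, size=14, data=parts, strict=False
    )

    print(cache._fetch(0, 4))   # b'abcd'  -- served entirely from one known part
    print(cache._fetch(2, 12))  # b'cd' + six zero bytes + b'wx' -- the gap is padded;
                                # with strict=True the same read raises ValueError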

@@ -18,7 +18,7 @@ from fsspec.caching import ( # noqa: F401
)
from fsspec.compression import compr
from fsspec.config import conf
from fsspec.registry import filesystem, get_filesystem_class
from fsspec.registry import available_protocols, filesystem, get_filesystem_class
from fsspec.utils import (
_unstrip_protocol,
build_name_function,

@@ -334,34 +334,51 @@ def _un_chain(path, kwargs):

if "::" in path:
x = re.compile(".*[^a-z]+.*") # test for non protocol-like single word
known_protocols = set(available_protocols())
bits = []

# split on '::', then ensure each bit has a protocol
for p in path.split("::"):
if "://" in p or x.match(p):
if p in known_protocols:
bits.append(p + "://")
elif "://" in p or x.match(p):
bits.append(p)
else:
bits.append(p + "://")
else:
bits = [path]

# [[url, protocol, kwargs], ...]
out = []
previous_bit = None
kwargs = kwargs.copy()

for bit in reversed(bits):
protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
cls = get_filesystem_class(protocol)
extra_kwargs = cls._get_kwargs_from_urls(bit)
kws = kwargs.pop(protocol, {})

if bit is bits[0]:
kws.update(kwargs)

kw = dict(
**{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
**kws,
)
bit = cls._strip_protocol(bit)
if "target_protocol" not in kw and issubclass(cls, ChainedFileSystem):

if (
"target_protocol" not in kw
and issubclass(cls, ChainedFileSystem)
and not bit
):
# replace bit if we are chaining and no path given
bit = previous_bit

out.append((bit, protocol, kw))
previous_bit = bit

out.reverse()
return out
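
`_un_chain` is what turns a "::"-chained URL into an ordered list of (path, protocol, kwargs) entries; with this change a bare segment is first checked against the registered protocol names before falling back to the regex heuristic. An illustrative use of such a chained URL through the public API (memory filesystem plus a caching layer):

    import fsspec

    with fsspec.open("memory://example.txt", "wb") as f:
        f.write(b"hello")

    # "simplecache" is a bare protocol name; "memory://example.txt" carries its
    # own protocol. _un_chain splits on "::" and resolves each segment.
    with fsspec.open("simplecache::memory://example.txt", "rb") as f:
        print(f.read())   # b'hello'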

@@ -205,11 +205,11 @@ class ArrowFSWrapper(AbstractFileSystem):
return self.fs.get_file_info(path).mtime

def cat_file(self, path, start=None, end=None, **kwargs):
kwargs["seekable"] = start not in [None, 0]
kwargs.setdefault("seekable", start not in [None, 0])
return super().cat_file(path, start=None, end=None, **kwargs)

def get_file(self, rpath, lpath, **kwargs):
kwargs["seekable"] = False
kwargs.setdefault("seekable", False)
super().get_file(rpath, lpath, **kwargs)

@@ -223,7 +223,6 @@ class ArrowFSWrapper(AbstractFileSystem):
"readable",
"writable",
"close",
"size",
"seekable",
],
)

@@ -241,6 +240,10 @@ class ArrowFile(io.IOBase):
def __enter__(self):
return self

@property
def size(self):
return self.stream.size()

def __exit__(self, *args):
return self.close()

@@ -5,6 +5,8 @@ import inspect
import fsspec
from fsspec.asyn import AsyncFileSystem, running_async

from .chained import ChainedFileSystem


def async_wrapper(func, obj=None, semaphore=None):
"""

@@ -35,7 +37,7 @@ def async_wrapper(func, obj=None, semaphore=None):
return wrapper


class AsyncFileSystemWrapper(AsyncFileSystem):
class AsyncFileSystemWrapper(AsyncFileSystem, ChainedFileSystem):
"""
A wrapper class to convert a synchronous filesystem into an asynchronous one.
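
For context, a small sketch of driving a synchronous filesystem from async code through this wrapper; it assumes the constructor takes the sync filesystem as its first positional argument and that the wrapper exposes async `_`-prefixed versions of the wrapped methods:

    import asyncio
    import fsspec
    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper

    async def main():
        sync_fs = fsspec.filesystem("memory")
        sync_fs.pipe("/a.txt", b"data")
        afs = AsyncFileSystemWrapper(sync_fs)   # now also a ChainedFileSystem per this diff
        print(await afs._cat_file("/a.txt"))    # b'data' (assumes _cat_file is auto-wrapped)

    asyncio.run(main())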

@@ -15,9 +15,7 @@ except ImportError:

if TYPE_CHECKING:
from collections.abc import Iterator
from typing import Any, Literal

from typing_extensions import TypeAlias
from typing import Any, Literal, TypeAlias

from .cached import CachingFileSystem

@@ -6,8 +6,9 @@ import os
import tempfile
import time
import weakref
from collections.abc import Callable
from shutil import rmtree
from typing import TYPE_CHECKING, Any, Callable, ClassVar
from typing import TYPE_CHECKING, Any, ClassVar

from fsspec import filesystem
from fsspec.callbacks import DEFAULT_CALLBACK

@@ -1,6 +1,5 @@
import base64
import io
from typing import Optional
from urllib.parse import unquote

from fsspec import AbstractFileSystem

@@ -50,7 +49,7 @@ class DataFileSystem(AbstractFileSystem):
return io.BytesIO(self.cat_file(path))

@staticmethod
def encode(data: bytes, mime: Optional[str] = None):
def encode(data: bytes, mime: str | None = None):
"""Format the given data into data-URL syntax

This version always base64 encodes, even when the data is ascii/url-safe.

@@ -1,8 +1,9 @@
from .. import filesystem
from ..asyn import AsyncFileSystem
from .chained import ChainedFileSystem


class DirFileSystem(AsyncFileSystem):
class DirFileSystem(AsyncFileSystem, ChainedFileSystem):
"""Directory prefix filesystem

The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with

@@ -327,7 +327,7 @@ class HTTPFileSystem(AsyncFileSystem):
async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
self._raise_not_found_for_status(resp, rpath)

async def _exists(self, path, **kwargs):
async def _exists(self, path, strict=False, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
try:

@@ -335,8 +335,14 @@ class HTTPFileSystem(AsyncFileSystem):
session = await self.set_session()
r = await session.get(self.encode_url(path), **kw)
async with r:
if strict:
self._raise_not_found_for_status(r, path)
return r.status < 400
except FileNotFoundError:
return False
except aiohttp.ClientError:
if strict:
raise
return False

async def _isfile(self, path, **kwargs):

@@ -463,14 +463,20 @@ class HTTPFileSystem(AbstractFileSystem):
end -= 1 # bytes range is inclusive
return f"bytes={start}-{end}"

def exists(self, path, **kwargs):
def exists(self, path, strict=False, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
try:
logger.debug(path)
r = self.session.get(self.encode_url(path), **kw)
if strict:
self._raise_not_found_for_status(r, path)
return r.status_code < 400
except FileNotFoundError:
return False
except Exception:
if strict:
raise
return False

def isfile(self, path, **kwargs):
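
Both the async and the sync HTTP filesystems gain a `strict` flag on `exists`. A hedged sketch of the difference (requires network access; example.com is only a placeholder host):

    import fsspec

    fs = fsspec.filesystem("http")

    # default: any failure (404, 5xx, connection error) is reported as False
    print(fs.exists("https://example.com/"))

    # strict=True: permission/server/connection errors propagate instead of
    # being swallowed; a plain 404 still returns False
    print(fs.exists("https://example.com/", strict=True))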

@@ -195,7 +195,7 @@ class LibArchiveFileSystem(AbstractArchiveFileSystem):
if mode != "rb":
raise NotImplementedError

data = bytes()
data = b""
with self._open_archive() as arc:
for entry in arc:
if entry.pathname != path:

@@ -219,7 +219,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
return LazyReferenceMapper(root, fs, **kwargs)

@lru_cache()
@lru_cache
def listdir(self):
"""List top-level directories"""
dirs = (p.rsplit("/", 1)[0] for p in self.zmetadata if not p.startswith(".z"))

@@ -1,13 +1,8 @@
import json
from collections.abc import Mapping, Sequence
from collections.abc import Callable, Mapping, Sequence
from contextlib import suppress
from pathlib import PurePath
from typing import (
Any,
Callable,
ClassVar,
Optional,
)
from typing import Any, ClassVar

from .registry import _import_class, get_filesystem_class
from .spec import AbstractFileSystem

@@ -45,12 +40,12 @@ class FilesystemJSONDecoder(json.JSONDecoder):
def __init__(
self,
*,
object_hook: Optional[Callable[[dict[str, Any]], Any]] = None,
parse_float: Optional[Callable[[str], Any]] = None,
parse_int: Optional[Callable[[str], Any]] = None,
parse_constant: Optional[Callable[[str], Any]] = None,
object_hook: Callable[[dict[str, Any]], Any] | None = None,
parse_float: Callable[[str], Any] | None = None,
parse_int: Callable[[str], Any] | None = None,
parse_constant: Callable[[str], Any] | None = None,
strict: bool = True,
object_pairs_hook: Optional[Callable[[list[tuple[str, Any]]], Any]] = None,
object_pairs_hook: Callable[[list[tuple[str, Any]]], Any] | None = None,
) -> None:
self.original_object_hook = object_hook


@@ -1,8 +1,12 @@
import io
import json
import warnings
from typing import Literal

import fsspec

from .core import url_to_fs
from .spec import AbstractBufferedFile
from .utils import merge_offset_ranges

# Parquet-Specific Utilities for fsspec

@@ -14,19 +18,24 @@ from .utils import merge_offset_ranges
# on remote file systems.


def open_parquet_file(
path,
mode="rb",
fs=None,
class AlreadyBufferedFile(AbstractBufferedFile):
def _fetch_range(self, start, end):
raise NotImplementedError


def open_parquet_files(
path: list[str],
mode: Literal["rb"] = "rb",
fs: None | fsspec.AbstractFileSystem = None,
metadata=None,
columns=None,
row_groups=None,
storage_options=None,
strict=False,
engine="auto",
max_gap=64_000,
max_block=256_000_000,
footer_sample_size=1_000_000,
columns: None | list[str] = None,
row_groups: None | list[int] = None,
storage_options: None | dict = None,
engine: str = "auto",
max_gap: int = 64_000,
max_block: int = 256_000_000,
footer_sample_size: int = 1_000_000,
filters: None | list[list[list[str]]] = None,
**kwargs,
):
"""

@@ -72,12 +81,6 @@ def open_parquet_file(
storage_options : dict, optional
Used to generate an `AbstractFileSystem` object if `fs` was
not specified.
strict : bool, optional
Whether the resulting `KnownPartsOfAFile` cache should
fetch reads that go beyond a known byte-range boundary.
If `False` (the default), any read that ends outside a
known part will be zero padded. Note that using
`strict=True` may be useful for debugging.
max_gap : int, optional
Neighboring byte ranges will only be merged when their
inter-range gap is <= `max_gap`. Default is 64KB.

@@ -89,6 +92,10 @@ def open_parquet_file(
for the footer metadata. If the sampled bytes do not contain
the footer, a second read request will be required, and
performance will suffer. Default is 1MB.
filters : list[list], optional
List of filters to apply to prevent reading row groups, of the
same format as accepted by the loading engines. Ignored if
``row_groups`` is specified.
**kwargs :
Optional key-word arguments to pass to `fs.open`
"""

@@ -96,20 +103,36 @@ def open_parquet_file(
# Make sure we have an `AbstractFileSystem` object
# to work with
if fs is None:
fs = url_to_fs(path, **(storage_options or {}))[0]
path0 = path
if isinstance(path, (list, tuple)):
path = path[0]
fs, path = url_to_fs(path, **(storage_options or {}))
else:
path0 = path

# For now, `columns == []` not supported. Just use
# default `open` command with `path` input
# For now, `columns == []` not supported, is the same
# as all columns
if columns is not None and len(columns) == 0:
return fs.open(path, mode=mode)
columns = None

# Set the engine
engine = _set_engine(engine)

# Fetch the known byte ranges needed to read
# `columns` and/or `row_groups`
if isinstance(path0, (list, tuple)):
paths = path0
elif "*" in path:
paths = fs.glob(path)
elif path0.endswith("/"): # or fs.isdir(path):
paths = [
_
for _ in fs.find(path, withdirs=False, detail=False)
if _.endswith((".parquet", ".parq"))
]
else:
paths = [path]

data = _get_parquet_byte_ranges(
[path],
paths,
fs,
metadata=metadata,
columns=columns,

@@ -118,24 +141,37 @@ def open_parquet_file(
max_gap=max_gap,
max_block=max_block,
footer_sample_size=footer_sample_size,
filters=filters,
)

# Extract file name from `data`
fn = next(iter(data)) if data else path

# Call self.open with "parts" caching
options = kwargs.pop("cache_options", {}).copy()
return fs.open(
fn,
mode=mode,
cache_type="parts",
cache_options={
**options,
"data": data.get(fn, {}),
"strict": strict,
},
**kwargs,
)
return [
AlreadyBufferedFile(
fs=None,
path=fn,
mode=mode,
cache_type="parts",
cache_options={
**options,
"data": data.get(fn, {}),
},
size=max(_[1] for _ in data.get(fn, {})),
**kwargs,
)
for fn in data
]


def open_parquet_file(*args, **kwargs):
"""Create files tailored to reading specific parts of parquet files

Please see ``open_parquet_files`` for details of the arguments. The
difference is that this function always returns a single ``AlreadyBufferedFile``,
whereas ``open_parquet_files`` always returns a list of files, even if
only one (or zero) parquet files match.
"""
return open_parquet_files(*args, **kwargs)[0]
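
An illustrative call, assuming `part.0.parquet` and `part.1.parquet` are placeholder paths to real parquet files and that fastparquet or pyarrow is installed: `open_parquet_files` returns one pre-buffered file per matching path, while `open_parquet_file` keeps the single-file interface.

    from fsspec.parquet import open_parquet_file, open_parquet_files

    # several pre-buffered files, one per input path
    files = open_parquet_files(
        ["part.0.parquet", "part.1.parquet"], columns=["a", "b"], row_groups=[0]
    )

    # single-file convenience wrapper; only the byte ranges needed for the
    # selection were transferred up front
    with open_parquet_file("part.0.parquet", columns=["a", "b"]) as f:
        print(f.read(4))   # expected to be the parquet header magic from the cached parts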


def _get_parquet_byte_ranges(

@@ -148,6 +184,7 @@ def _get_parquet_byte_ranges(
max_block=256_000_000,
footer_sample_size=1_000_000,
engine="auto",
filters=None,
):
"""Get a dictionary of the known byte ranges needed
to read a specific column/row-group selection from a

@@ -172,6 +209,7 @@ def _get_parquet_byte_ranges(
row_groups=row_groups,
max_gap=max_gap,
max_block=max_block,
filters=filters,
)

# Get file sizes asynchronously

@@ -183,17 +221,16 @@ def _get_parquet_byte_ranges(
data_starts = []
data_ends = []
add_header_magic = True
if columns is None and row_groups is None:
if columns is None and row_groups is None and filters is None:
# We are NOT selecting specific columns or row-groups.
#
# We can avoid sampling the footers, and just transfer
# all file data with cat_ranges
for i, path in enumerate(paths):
result[path] = {}
for b in range(0, file_sizes[i], max_block):
data_paths.append(path)
data_starts.append(b)
data_ends.append(min(b + max_block, file_sizes[i]))
data_paths.append(path)
data_starts.append(0)
data_ends.append(file_sizes[i])
add_header_magic = False # "Magic" should already be included
else:
# We ARE selecting specific columns or row-groups.

@@ -235,29 +272,21 @@ def _get_parquet_byte_ranges(

# Calculate required byte ranges for each path
for i, path in enumerate(paths):
# Deal with small-file case.
# Just include all remaining bytes of the file
# in a single range.
if file_sizes[i] < max_block:
if footer_starts[i] > 0:
# Only need to transfer the data if the
# footer sample isn't already the whole file
data_paths.append(path)
data_starts.append(0)
data_ends.append(footer_starts[i])
continue

# Use "engine" to collect data byte ranges
path_data_starts, path_data_ends = engine._parquet_byte_ranges(
columns,
row_groups=row_groups,
footer=footer_samples[i],
footer_start=footer_starts[i],
filters=filters,
)

data_paths += [path] * len(path_data_starts)
data_starts += path_data_starts
data_ends += path_data_ends
result.setdefault(path, {})[(footer_starts[i], file_sizes[i])] = (
footer_samples[i]
)

# Merge adjacent offset ranges
data_paths, data_starts, data_ends = merge_offset_ranges(

@@ -291,6 +320,7 @@ def _get_parquet_byte_ranges_from_metadata(
row_groups=None,
max_gap=64_000,
max_block=256_000_000,
filters=None,
):
"""Simplified version of `_get_parquet_byte_ranges` for
the case that an engine-specific `metadata` object is

@@ -300,9 +330,7 @@ def _get_parquet_byte_ranges_from_metadata(

# Use "engine" to collect data byte ranges
data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
columns,
row_groups=row_groups,
metadata=metadata,
columns, row_groups=row_groups, metadata=metadata, filters=filters
)

# Merge adjacent offset ranges

@@ -401,16 +429,19 @@ class FastparquetEngine:
metadata=None,
footer=None,
footer_start=None,
filters=None,
):
# Initialize offset ranges and define ParquetFile metadata
pf = metadata
data_paths, data_starts, data_ends = [], [], []
if filters and row_groups:
raise ValueError("filters and row_groups cannot be used together")
if pf is None:
pf = self.fp.ParquetFile(io.BytesIO(footer))

# Convert columns to a set and add any index columns
# specified in the pandas metadata (just in case)
column_set = None if columns is None else set(columns)
column_set = None if columns is None else {c.split(".", 1)[0] for c in columns}
if column_set is not None and hasattr(pf, "pandas_metadata"):
md_index = [
ind

@@ -422,7 +453,12 @@ class FastparquetEngine:

# Check if row_groups is a list of integers
# or a list of row-group metadata
if row_groups and not isinstance(row_groups[0], int):
if filters:
from fastparquet.api import filter_row_groups

row_group_indices = None
row_groups = filter_row_groups(pf, filters)
elif row_groups and not isinstance(row_groups[0], int):
# Input row_groups contains row-group metadata
row_group_indices = None
else:

@@ -486,9 +522,12 @@ class PyarrowEngine:
metadata=None,
footer=None,
footer_start=None,
filters=None,
):
if metadata is not None:
raise ValueError("metadata input not supported for PyarrowEngine")
if filters:
raise NotImplementedError

data_starts, data_ends = [], []
md = self.pq.ParquetFile(io.BytesIO(footer)).metadata

@@ -72,6 +72,9 @@ known_implementations = {
"class": "fsspec.implementations.arrow.HadoopFileSystem",
"err": "pyarrow and local java libraries required for HDFS",
},
"async_wrapper": {
"class": "fsspec.implementations.asyn_wrapper.AsyncFileSystemWrapper",
},
"asynclocal": {
"class": "morefs.asyn_local.AsyncLocalFileSystem",
"err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",

@@ -7,23 +7,16 @@ import os
import re
import sys
import tempfile
from collections.abc import Iterable, Iterator, Sequence
from collections.abc import Callable, Iterable, Iterator, Sequence
from functools import partial
from hashlib import md5
from importlib.metadata import version
from typing import (
IO,
TYPE_CHECKING,
Any,
Callable,
TypeVar,
)
from typing import IO, TYPE_CHECKING, Any, TypeVar
from urllib.parse import urlsplit

if TYPE_CHECKING:
import pathlib

from typing_extensions import TypeGuard
from typing import TypeGuard

from fsspec.spec import AbstractFileSystem