Add adaptation for circling-reconnaissance scenarios
@@ -1,8 +1,12 @@
 import io
 import json
 import warnings
+from typing import Literal
 
+import fsspec
+
 from .core import url_to_fs
+from .spec import AbstractBufferedFile
 from .utils import merge_offset_ranges
 
 # Parquet-Specific Utilities for fsspec
@@ -14,19 +18,24 @@ from .utils import merge_offset_ranges
 # on remote file systems.
 
 
-def open_parquet_file(
-    path,
-    mode="rb",
-    fs=None,
+class AlreadyBufferedFile(AbstractBufferedFile):
+    def _fetch_range(self, start, end):
+        raise NotImplementedError
+
+
+def open_parquet_files(
+    path: list[str],
+    mode: Literal["rb"] = "rb",
+    fs: None | fsspec.AbstractFileSystem = None,
     metadata=None,
-    columns=None,
-    row_groups=None,
-    storage_options=None,
-    strict=False,
-    engine="auto",
-    max_gap=64_000,
-    max_block=256_000_000,
-    footer_sample_size=1_000_000,
+    columns: None | list[str] = None,
+    row_groups: None | list[int] = None,
+    storage_options: None | dict = None,
+    engine: str = "auto",
+    max_gap: int = 64_000,
+    max_block: int = 256_000_000,
+    footer_sample_size: int = 1_000_000,
+    filters: None | list[list[list[str]]] = None,
     **kwargs,
 ):
     """
@@ -72,12 +81,6 @@ def open_parquet_file(
     storage_options : dict, optional
         Used to generate an `AbstractFileSystem` object if `fs` was
         not specified.
-    strict : bool, optional
-        Whether the resulting `KnownPartsOfAFile` cache should
-        fetch reads that go beyond a known byte-range boundary.
-        If `False` (the default), any read that ends outside a
-        known part will be zero padded. Note that using
-        `strict=True` may be useful for debugging.
     max_gap : int, optional
         Neighboring byte ranges will only be merged when their
         inter-range gap is <= `max_gap`. Default is 64KB.
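
To illustrate the `max_gap` merging described above, here is a minimal sketch using fsspec's own `merge_offset_ranges` helper (the one imported at the top of this module); the file name and byte offsets are invented:

from fsspec.utils import merge_offset_ranges

# Two ranges on the same file, separated by a 1_000-byte gap:
# with max_gap=64_000 they collapse into a single transfer.
paths, starts, ends = merge_offset_ranges(
    ["data.parquet", "data.parquet"],
    [0, 5_000],
    [4_000, 9_000],
    max_gap=64_000,
)
# paths == ["data.parquet"], starts == [0], ends == [9_000]
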
@@ -89,6 +92,10 @@ def open_parquet_file(
         for the footer metadata. If the sampled bytes do not contain
         the footer, a second read request will be required, and
         performance will suffer. Default is 1MB.
+    filters : list[list], optional
+        List of filters to apply in order to skip reading row groups,
+        in the same format as accepted by the loading engines. Ignored
+        if ``row_groups`` is specified.
     **kwargs :
         Optional keyword arguments to pass to `fs.open`
     """
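
As a sketch of the `filters` format (the list-of-lists disjunctive normal form that fastparquet and pyarrow both accept; the column names and values here are hypothetical):

# Outer list: OR of AND-groups; inner items: (column, op, value) predicates.
# This keeps only row groups whose statistics may satisfy
# year == 2024 AND month > 6.
filters = [[("year", "==", 2024), ("month", ">", 6)]]
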
@@ -96,20 +103,36 @@ def open_parquet_file(
     # Make sure we have an `AbstractFileSystem` object
     # to work with
     if fs is None:
-        fs = url_to_fs(path, **(storage_options or {}))[0]
+        path0 = path
+        if isinstance(path, (list, tuple)):
+            path = path[0]
+        fs, path = url_to_fs(path, **(storage_options or {}))
+    else:
+        path0 = path
 
-    # For now, `columns == []` not supported. Just use
-    # default `open` command with `path` input
+    # For now, `columns == []` is not supported; it is the same
+    # as selecting all columns
     if columns is not None and len(columns) == 0:
-        return fs.open(path, mode=mode)
+        columns = None
 
     # Set the engine
     engine = _set_engine(engine)
 
     # Fetch the known byte ranges needed to read
     # `columns` and/or `row_groups`
+    if isinstance(path0, (list, tuple)):
+        paths = path0
+    elif "*" in path:
+        paths = fs.glob(path)
+    elif path0.endswith("/"):  # or fs.isdir(path):
+        paths = [
+            _
+            for _ in fs.find(path, withdirs=False, detail=False)
+            if _.endswith((".parquet", ".parq"))
+        ]
+    else:
+        paths = [path]
+
     data = _get_parquet_byte_ranges(
-        [path],
+        paths,
         fs,
         metadata=metadata,
         columns=columns,
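
For reference, the expansion logic above accepts several path forms; a sketch with hypothetical URLs:

# 1. An explicit list of files is used as-is.
open_parquet_files(["s3://bucket/a.parquet", "s3://bucket/b.parquet"])
# 2. A glob pattern is expanded with fs.glob.
open_parquet_files("s3://bucket/data/*.parquet")
# 3. A trailing-slash directory is walked with fs.find,
#    keeping only .parquet/.parq files.
open_parquet_files("s3://bucket/data/")
# 4. Anything else is treated as a single file.
open_parquet_files("s3://bucket/a.parquet")
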
@@ -118,24 +141,37 @@ def open_parquet_file(
         max_gap=max_gap,
         max_block=max_block,
         footer_sample_size=footer_sample_size,
+        filters=filters,
     )
 
-    # Extract file name from `data`
-    fn = next(iter(data)) if data else path
 
-    # Call self.open with "parts" caching
     options = kwargs.pop("cache_options", {}).copy()
-    return fs.open(
-        fn,
-        mode=mode,
-        cache_type="parts",
-        cache_options={
-            **options,
-            "data": data.get(fn, {}),
-            "strict": strict,
-        },
-        **kwargs,
-    )
+    return [
+        AlreadyBufferedFile(
+            fs=None,
+            path=fn,
+            mode=mode,
+            cache_type="parts",
+            cache_options={
+                **options,
+                "data": data.get(fn, {}),
+            },
+            size=max(_[1] for _ in data.get(fn, {})),
+            **kwargs,
+        )
+        for fn in data
+    ]
+
+
+def open_parquet_file(*args, **kwargs):
+    """Create files tailored to reading specific parts of parquet files
+
+    Please see ``open_parquet_files`` for details of the arguments. The
+    difference is that this function always returns a single
+    ``AlreadyBufferedFile``, whereas ``open_parquet_files`` always returns
+    a list of files, even if only one or zero parquet files match.
+    """
+    return open_parquet_files(*args, **kwargs)[0]
 
 
 def _get_parquet_byte_ranges(
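
A minimal usage sketch for the two entry points above (the URL, column names, and engine choice are illustrative):

import pandas as pd

# Single file: one AlreadyBufferedFile holding only the byte ranges
# needed to read columns "x" and "y".
with open_parquet_file(
    "s3://bucket/a.parquet", columns=["x", "y"], engine="fastparquet"
) as f:
    df = pd.read_parquet(f, columns=["x", "y"])

# Many files: a list, one AlreadyBufferedFile per matching file.
files = open_parquet_files("s3://bucket/data/*.parquet", columns=["x"])
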
@@ -148,6 +184,7 @@ def _get_parquet_byte_ranges(
     max_block=256_000_000,
     footer_sample_size=1_000_000,
     engine="auto",
+    filters=None,
 ):
     """Get a dictionary of the known byte ranges needed
     to read a specific column/row-group selection from a
@@ -172,6 +209,7 @@ def _get_parquet_byte_ranges(
             row_groups=row_groups,
             max_gap=max_gap,
             max_block=max_block,
+            filters=filters,
         )
 
     # Get file sizes asynchronously
@@ -183,17 +221,16 @@ def _get_parquet_byte_ranges(
     data_starts = []
     data_ends = []
     add_header_magic = True
-    if columns is None and row_groups is None:
+    if columns is None and row_groups is None and filters is None:
         # We are NOT selecting specific columns or row-groups.
         #
         # We can avoid sampling the footers, and just transfer
         # all file data with cat_ranges
         for i, path in enumerate(paths):
             result[path] = {}
-            for b in range(0, file_sizes[i], max_block):
-                data_paths.append(path)
-                data_starts.append(b)
-                data_ends.append(min(b + max_block, file_sizes[i]))
+            data_paths.append(path)
+            data_starts.append(0)
+            data_ends.append(file_sizes[i])
         add_header_magic = False  # "Magic" should already be included
     else:
         # We ARE selecting specific columns or row-groups.
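
For context, the no-selection branch above now fetches each file as one whole range; conceptually the transfer step reduces to a single concurrent call (a sketch, not the literal code in this module):

# One (start, end) pair per file, fetched concurrently where the
# filesystem implementation supports it.
blocks = fs.cat_ranges(data_paths, data_starts, data_ends)
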
@@ -235,29 +272,21 @@ def _get_parquet_byte_ranges(
 
         # Calculate required byte ranges for each path
         for i, path in enumerate(paths):
-            # Deal with small-file case.
-            # Just include all remaining bytes of the file
-            # in a single range.
-            if file_sizes[i] < max_block:
-                if footer_starts[i] > 0:
-                    # Only need to transfer the data if the
-                    # footer sample isn't already the whole file
-                    data_paths.append(path)
-                    data_starts.append(0)
-                    data_ends.append(footer_starts[i])
-                continue
-
             # Use "engine" to collect data byte ranges
             path_data_starts, path_data_ends = engine._parquet_byte_ranges(
                 columns,
                 row_groups=row_groups,
                 footer=footer_samples[i],
                 footer_start=footer_starts[i],
+                filters=filters,
             )
 
             data_paths += [path] * len(path_data_starts)
             data_starts += path_data_starts
             data_ends += path_data_ends
+            result.setdefault(path, {})[(footer_starts[i], file_sizes[i])] = (
+                footer_samples[i]
+            )
 
     # Merge adjacent offset ranges
     data_paths, data_starts, data_ends = merge_offset_ranges(
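
The `result` mapping assembled in this hunk is what ultimately lands in `cache_options["data"]`: per file, a dict keyed by (start, stop) byte ranges. A sketch with invented paths and offsets:

result = {
    "s3://bucket/a.parquet": {
        (4, 131_076): b"...",            # merged column-chunk bytes
        (9_100_000, 9_200_000): b"...",  # sampled footer bytes
    },
}
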
@@ -291,6 +320,7 @@ def _get_parquet_byte_ranges_from_metadata(
     row_groups=None,
     max_gap=64_000,
     max_block=256_000_000,
+    filters=None,
 ):
     """Simplified version of `_get_parquet_byte_ranges` for
     the case that an engine-specific `metadata` object is
@@ -300,9 +330,7 @@ def _get_parquet_byte_ranges_from_metadata(
 
     # Use "engine" to collect data byte ranges
     data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
-        columns,
-        row_groups=row_groups,
-        metadata=metadata,
+        columns, row_groups=row_groups, metadata=metadata, filters=filters
     )
 
     # Merge adjacent offset ranges
@@ -401,16 +429,19 @@ class FastparquetEngine:
         metadata=None,
         footer=None,
         footer_start=None,
+        filters=None,
     ):
         # Initialize offset ranges and define ParquetFile metadata
         pf = metadata
         data_paths, data_starts, data_ends = [], [], []
+        if filters and row_groups:
+            raise ValueError("filters and row_groups cannot be used together")
         if pf is None:
             pf = self.fp.ParquetFile(io.BytesIO(footer))
 
         # Convert columns to a set and add any index columns
         # specified in the pandas metadata (just in case)
-        column_set = None if columns is None else set(columns)
+        column_set = None if columns is None else {c.split(".", 1)[0] for c in columns}
         if column_set is not None and hasattr(pf, "pandas_metadata"):
             md_index = [
                 ind
@@ -422,7 +453,12 @@ class FastparquetEngine:
 
         # Check if row_groups is a list of integers
         # or a list of row-group metadata
-        if row_groups and not isinstance(row_groups[0], int):
+        if filters:
+            from fastparquet.api import filter_row_groups
+
+            row_group_indices = None
+            row_groups = filter_row_groups(pf, filters)
+        elif row_groups and not isinstance(row_groups[0], int):
             # Input row_groups contains row-group metadata
             row_group_indices = None
         else:
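
The fastparquet branch above leans on `fastparquet.api.filter_row_groups` to turn a filter expression into row-group metadata; a sketch, assuming a local file and a hypothetical column "x":

import fastparquet
from fastparquet.api import filter_row_groups

pf = fastparquet.ParquetFile("a.parquet")
# Returns the subset of pf.row_groups whose statistics may satisfy x > 0.
row_groups = filter_row_groups(pf, [[("x", ">", 0)]])
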
@@ -486,9 +522,12 @@ class PyarrowEngine:
         metadata=None,
         footer=None,
         footer_start=None,
+        filters=None,
     ):
         if metadata is not None:
             raise ValueError("metadata input not supported for PyarrowEngine")
+        if filters:
+            raise NotImplementedError
 
         data_starts, data_ends = [], []
         md = self.pq.ParquetFile(io.BytesIO(footer)).metadata
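
For reference, the pyarrow path parses the sampled footer entirely in memory; the same parsing step in isolation (here the bytes come from a local file standing in for the remote footer sample):

import io
import pyarrow.parquet as pq

with open("a.parquet", "rb") as f:
    footer = f.read()

md = pq.ParquetFile(io.BytesIO(footer)).metadata
print(md.num_row_groups, md.num_columns)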