Add adaptation for circling-reconnaissance scenarios
@@ -1,8 +1,12 @@
 import io
 import json
 import warnings
+from typing import Literal
 
+import fsspec
+
 from .core import url_to_fs
+from .spec import AbstractBufferedFile
 from .utils import merge_offset_ranges
 
 # Parquet-Specific Utilities for fsspec
@@ -14,19 +18,24 @@ from .utils import merge_offset_ranges
 # on remote file systems.
 
 
-def open_parquet_file(
-    path,
-    mode="rb",
-    fs=None,
+class AlreadyBufferedFile(AbstractBufferedFile):
+    def _fetch_range(self, start, end):
+        raise NotImplementedError
+
+
+def open_parquet_files(
+    path: list[str],
+    mode: Literal["rb"] = "rb",
+    fs: None | fsspec.AbstractFileSystem = None,
     metadata=None,
-    columns=None,
-    row_groups=None,
-    storage_options=None,
-    strict=False,
-    engine="auto",
-    max_gap=64_000,
-    max_block=256_000_000,
-    footer_sample_size=1_000_000,
+    columns: None | list[str] = None,
+    row_groups: None | list[int] = None,
+    storage_options: None | dict = None,
+    engine: str = "auto",
+    max_gap: int = 64_000,
+    max_block: int = 256_000_000,
+    footer_sample_size: int = 1_000_000,
+    filters: None | list[list[list[str]]] = None,
     **kwargs,
 ):
     """
@@ -72,12 +81,6 @@ def open_parquet_file(
     storage_options : dict, optional
         Used to generate an `AbstractFileSystem` object if `fs` was
         not specified.
-    strict : bool, optional
-        Whether the resulting `KnownPartsOfAFile` cache should
-        fetch reads that go beyond a known byte-range boundary.
-        If `False` (the default), any read that ends outside a
-        known part will be zero padded. Note that using
-        `strict=True` may be useful for debugging.
     max_gap : int, optional
         Neighboring byte ranges will only be merged when their
         inter-range gap is <= `max_gap`. Default is 64KB.
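
To illustrate the `max_gap` merging described above, here is a minimal sketch using fsspec's own `merge_offset_ranges` helper (the one imported at the top of this module); the file name and byte offsets are invented:

from fsspec.utils import merge_offset_ranges

# Two ranges on the same file, separated by a 1_000-byte gap:
# with max_gap=64_000 they collapse into a single transfer.
paths, starts, ends = merge_offset_ranges(
    ["data.parquet", "data.parquet"],
    [0, 5_000],
    [4_000, 9_000],
    max_gap=64_000,
)
# paths == ["data.parquet"], starts == [0], ends == [9_000]
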
@@ -89,6 +92,10 @@ def open_parquet_file(
         for the footer metadata. If the sampled bytes do not contain
         the footer, a second read request will be required, and
         performance will suffer. Default is 1MB.
+    filters : list[list], optional
+        List of filters to apply in order to skip reading row groups,
+        in the same format as accepted by the loading engines. Ignored
+        if ``row_groups`` is specified.
     **kwargs :
         Optional keyword arguments to pass to `fs.open`
     """
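
As a sketch of the `filters` format (the list-of-lists disjunctive normal form that fastparquet and pyarrow both accept; the column names and values here are hypothetical):

# Outer list: OR of AND-groups; inner items: (column, op, value) predicates.
# This keeps only row groups whose statistics may satisfy
# year == 2024 AND month > 6.
filters = [[("year", "==", 2024), ("month", ">", 6)]]
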
@@ -96,20 +103,36 @@ def open_parquet_file(
     # Make sure we have an `AbstractFileSystem` object
     # to work with
     if fs is None:
-        fs = url_to_fs(path, **(storage_options or {}))[0]
+        path0 = path
+        if isinstance(path, (list, tuple)):
+            path = path[0]
+        fs, path = url_to_fs(path, **(storage_options or {}))
+    else:
+        path0 = path
 
-    # For now, `columns == []` not supported. Just use
-    # default `open` command with `path` input
+    # For now, `columns == []` is not supported; it is the same
+    # as selecting all columns
     if columns is not None and len(columns) == 0:
-        return fs.open(path, mode=mode)
+        columns = None
 
     # Set the engine
     engine = _set_engine(engine)
 
     # Fetch the known byte ranges needed to read
     # `columns` and/or `row_groups`
+    if isinstance(path0, (list, tuple)):
+        paths = path0
+    elif "*" in path:
+        paths = fs.glob(path)
+    elif path0.endswith("/"):  # or fs.isdir(path):
+        paths = [
+            _
+            for _ in fs.find(path, withdirs=False, detail=False)
+            if _.endswith((".parquet", ".parq"))
+        ]
+    else:
+        paths = [path]
+
     data = _get_parquet_byte_ranges(
-        [path],
+        paths,
         fs,
         metadata=metadata,
         columns=columns,
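
For reference, the expansion logic above accepts several path forms; a sketch with hypothetical URLs:

# 1. An explicit list of files is used as-is.
open_parquet_files(["s3://bucket/a.parquet", "s3://bucket/b.parquet"])
# 2. A glob pattern is expanded with fs.glob.
open_parquet_files("s3://bucket/data/*.parquet")
# 3. A trailing-slash directory is walked with fs.find,
#    keeping only .parquet/.parq files.
open_parquet_files("s3://bucket/data/")
# 4. Anything else is treated as a single file.
open_parquet_files("s3://bucket/a.parquet")
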
@@ -118,24 +141,37 @@ def open_parquet_file(
         max_gap=max_gap,
         max_block=max_block,
         footer_sample_size=footer_sample_size,
+        filters=filters,
     )
 
-    # Extract file name from `data`
-    fn = next(iter(data)) if data else path
 
-    # Call self.open with "parts" caching
     options = kwargs.pop("cache_options", {}).copy()
-    return fs.open(
-        fn,
-        mode=mode,
-        cache_type="parts",
-        cache_options={
-            **options,
-            "data": data.get(fn, {}),
-            "strict": strict,
-        },
-        **kwargs,
-    )
+    return [
+        AlreadyBufferedFile(
+            fs=None,
+            path=fn,
+            mode=mode,
+            cache_type="parts",
+            cache_options={
+                **options,
+                "data": data.get(fn, {}),
+            },
+            size=max(_[1] for _ in data.get(fn, {})),
+            **kwargs,
+        )
+        for fn in data
+    ]
+
+
+def open_parquet_file(*args, **kwargs):
+    """Create files tailored to reading specific parts of parquet files
+
+    Please see ``open_parquet_files`` for details of the arguments. The
+    difference is that this function always returns a single
+    ``AlreadyBufferedFile``, whereas ``open_parquet_files`` always returns
+    a list of files, even if only one or zero parquet files match.
+    """
+    return open_parquet_files(*args, **kwargs)[0]
 
 
 def _get_parquet_byte_ranges(
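
A minimal usage sketch for the two entry points above (the URL, column names, and engine choice are illustrative):

import pandas as pd

# Single file: one AlreadyBufferedFile holding only the byte ranges
# needed to read columns "x" and "y".
with open_parquet_file(
    "s3://bucket/a.parquet", columns=["x", "y"], engine="fastparquet"
) as f:
    df = pd.read_parquet(f, columns=["x", "y"])

# Many files: a list, one AlreadyBufferedFile per matching file.
files = open_parquet_files("s3://bucket/data/*.parquet", columns=["x"])
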
@@ -148,6 +184,7 @@ def _get_parquet_byte_ranges(
     max_block=256_000_000,
     footer_sample_size=1_000_000,
     engine="auto",
+    filters=None,
 ):
     """Get a dictionary of the known byte ranges needed
     to read a specific column/row-group selection from a
@@ -172,6 +209,7 @@ def _get_parquet_byte_ranges(
             row_groups=row_groups,
             max_gap=max_gap,
             max_block=max_block,
+            filters=filters,
         )
 
     # Get file sizes asynchronously
@@ -183,17 +221,16 @@ def _get_parquet_byte_ranges(
     data_starts = []
     data_ends = []
     add_header_magic = True
-    if columns is None and row_groups is None:
+    if columns is None and row_groups is None and filters is None:
         # We are NOT selecting specific columns or row-groups.
         #
         # We can avoid sampling the footers, and just transfer
         # all file data with cat_ranges
         for i, path in enumerate(paths):
             result[path] = {}
-            for b in range(0, file_sizes[i], max_block):
-                data_paths.append(path)
-                data_starts.append(b)
-                data_ends.append(min(b + max_block, file_sizes[i]))
+            data_paths.append(path)
+            data_starts.append(0)
+            data_ends.append(file_sizes[i])
         add_header_magic = False  # "Magic" should already be included
     else:
         # We ARE selecting specific columns or row-groups.
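
For context, the no-selection branch above now fetches each file as one whole range; conceptually the transfer step reduces to a single concurrent call (a sketch, not the literal code in this module):

# One (start, end) pair per file, fetched concurrently where the
# filesystem implementation supports it.
blocks = fs.cat_ranges(data_paths, data_starts, data_ends)
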
@@ -235,29 +272,21 @@ def _get_parquet_byte_ranges(
 
         # Calculate required byte ranges for each path
         for i, path in enumerate(paths):
-            # Deal with small-file case.
-            # Just include all remaining bytes of the file
-            # in a single range.
-            if file_sizes[i] < max_block:
-                if footer_starts[i] > 0:
-                    # Only need to transfer the data if the
-                    # footer sample isn't already the whole file
-                    data_paths.append(path)
-                    data_starts.append(0)
-                    data_ends.append(footer_starts[i])
-                continue
-
             # Use "engine" to collect data byte ranges
             path_data_starts, path_data_ends = engine._parquet_byte_ranges(
                 columns,
                 row_groups=row_groups,
                 footer=footer_samples[i],
                 footer_start=footer_starts[i],
+                filters=filters,
             )
 
             data_paths += [path] * len(path_data_starts)
             data_starts += path_data_starts
             data_ends += path_data_ends
+            result.setdefault(path, {})[(footer_starts[i], file_sizes[i])] = (
+                footer_samples[i]
+            )
 
     # Merge adjacent offset ranges
     data_paths, data_starts, data_ends = merge_offset_ranges(
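
The `result` mapping assembled in this hunk is what ultimately lands in `cache_options["data"]`: per file, a dict keyed by (start, stop) byte ranges. A sketch with invented paths and offsets:

result = {
    "s3://bucket/a.parquet": {
        (4, 131_076): b"...",            # merged column-chunk bytes
        (9_100_000, 9_200_000): b"...",  # sampled footer bytes
    },
}
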
@@ -291,6 +320,7 @@ def _get_parquet_byte_ranges_from_metadata(
     row_groups=None,
     max_gap=64_000,
     max_block=256_000_000,
+    filters=None,
 ):
     """Simplified version of `_get_parquet_byte_ranges` for
     the case that an engine-specific `metadata` object is
@@ -300,9 +330,7 @@ def _get_parquet_byte_ranges_from_metadata(
 
     # Use "engine" to collect data byte ranges
     data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
-        columns,
-        row_groups=row_groups,
-        metadata=metadata,
+        columns, row_groups=row_groups, metadata=metadata, filters=filters
     )
 
     # Merge adjacent offset ranges
@@ -401,16 +429,19 @@ class FastparquetEngine:
         metadata=None,
         footer=None,
         footer_start=None,
+        filters=None,
     ):
         # Initialize offset ranges and define ParquetFile metadata
         pf = metadata
         data_paths, data_starts, data_ends = [], [], []
+        if filters and row_groups:
+            raise ValueError("filters and row_groups cannot be used together")
         if pf is None:
             pf = self.fp.ParquetFile(io.BytesIO(footer))
 
         # Convert columns to a set and add any index columns
         # specified in the pandas metadata (just in case)
-        column_set = None if columns is None else set(columns)
+        column_set = None if columns is None else {c.split(".", 1)[0] for c in columns}
         if column_set is not None and hasattr(pf, "pandas_metadata"):
             md_index = [
                 ind
@@ -422,7 +453,12 @@ class FastparquetEngine:
 
         # Check if row_groups is a list of integers
         # or a list of row-group metadata
-        if row_groups and not isinstance(row_groups[0], int):
+        if filters:
+            from fastparquet.api import filter_row_groups
+
+            row_group_indices = None
+            row_groups = filter_row_groups(pf, filters)
+        elif row_groups and not isinstance(row_groups[0], int):
             # Input row_groups contains row-group metadata
             row_group_indices = None
         else:
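
The fastparquet branch above leans on `fastparquet.api.filter_row_groups` to turn a filter expression into row-group metadata; a sketch, assuming a local file and a hypothetical column "x":

import fastparquet
from fastparquet.api import filter_row_groups

pf = fastparquet.ParquetFile("a.parquet")
# Returns the subset of pf.row_groups whose statistics may satisfy x > 0.
row_groups = filter_row_groups(pf, [[("x", ">", 0)]])
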
@@ -486,9 +522,12 @@ class PyarrowEngine:
         metadata=None,
         footer=None,
         footer_start=None,
+        filters=None,
     ):
         if metadata is not None:
             raise ValueError("metadata input not supported for PyarrowEngine")
+        if filters:
+            raise NotImplementedError
 
         data_starts, data_ends = [], []
         md = self.pq.ParquetFile(io.BytesIO(footer)).metadata
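
For reference, the pyarrow path parses the sampled footer entirely in memory; the same parsing step in isolation (here the bytes come from a local file standing in for the remote footer sample):

import io
import pyarrow.parquet as pq

with open("a.parquet", "rb") as f:
    footer = f.read()

md = pq.ParquetFile(io.BytesIO(footer)).metadata
print(md.num_row_groups, md.num_columns)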