chore: add virtual environment to the repository

- Add the backend_service/venv virtual environment
- Includes all Python dependency packages
- Note: the virtual environment is about 393MB and contains 12655 files
2025-12-03 10:19:25 +08:00
parent a6c2027caa
commit c4f851d387
12655 changed files with 3009376 additions and 0 deletions


@@ -0,0 +1,121 @@
from abc import abstractmethod
from typing import Callable, Optional, Sequence
from chromadb.types import (
OperationRecord,
LogRecord,
SeqId,
Vector,
ScalarEncoding,
)
from chromadb.config import Component
from uuid import UUID
import numpy as np
def encode_vector(vector: Vector, encoding: ScalarEncoding) -> bytes:
"""Encode a vector into a byte array."""
if encoding == ScalarEncoding.FLOAT32:
return np.array(vector, dtype=np.float32).tobytes()
elif encoding == ScalarEncoding.INT32:
return np.array(vector, dtype=np.int32).tobytes()
else:
raise ValueError(f"Unsupported encoding: {encoding.value}")
def decode_vector(vector: bytes, encoding: ScalarEncoding) -> Vector:
    """Decode a byte array into a vector."""
    if encoding == ScalarEncoding.FLOAT32:
        return np.frombuffer(vector, dtype=np.float32)
    elif encoding == ScalarEncoding.INT32:
        return np.frombuffer(vector, dtype=np.int32)
    else:
        raise ValueError(f"Unsupported encoding: {encoding.value}")
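
# Illustrative sketch (not part of the committed file): a round-trip through the
# codec above, assuming a plain Python list as the input vector. FLOAT32 encodes
# each component as 4 bytes.
# >>> raw = encode_vector([1.0, 2.0, 3.0], ScalarEncoding.FLOAT32)
# >>> len(raw)
# 12
# >>> decode_vector(raw, ScalarEncoding.FLOAT32)
# array([1., 2., 3.], dtype=float32)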
class Producer(Component):
"""Interface for writing embeddings to an ingest stream"""
@abstractmethod
    def delete_log(self, collection_id: UUID) -> None:
        """Delete the log for the given collection."""
        pass
@abstractmethod
def purge_log(self, collection_id: UUID) -> None:
"""Truncates the log for the given collection, removing all seen records."""
pass
@abstractmethod
def submit_embedding(
self, collection_id: UUID, embedding: OperationRecord
) -> SeqId:
"""Add an embedding record to the given collections log. Returns the SeqID of the record."""
pass
@abstractmethod
def submit_embeddings(
self, collection_id: UUID, embeddings: Sequence[OperationRecord]
) -> Sequence[SeqId]:
"""Add a batch of embedding records to the given collections log. Returns the SeqIDs of
the records. The returned SeqIDs will be in the same order as the given
SubmitEmbeddingRecords. However, it is not guaranteed that the SeqIDs will be
processed in the same order as the given SubmitEmbeddingRecords. If the number
of records exceeds the maximum batch size, an exception will be thrown."""
pass
@property
@abstractmethod
def max_batch_size(self) -> int:
"""Return the maximum number of records that can be submitted in a single call
to submit_embeddings."""
pass
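
# Illustrative sketch (not part of the committed file): one way a caller might
# respect max_batch_size when submitting many records. `producer` stands in for
# any concrete Producer implementation; the helper name is hypothetical.
def submit_in_batches(
    producer: Producer, collection_id: UUID, records: Sequence[OperationRecord]
) -> Sequence[SeqId]:
    seq_ids = []
    batch = producer.max_batch_size
    # submit_embeddings throws if a single call exceeds max_batch_size,
    # so chunk the input before submitting.
    for i in range(0, len(records), batch):
        seq_ids.extend(
            producer.submit_embeddings(collection_id, records[i : i + batch])
        )
    return seq_ids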
ConsumerCallbackFn = Callable[[Sequence[LogRecord]], None]
class Consumer(Component):
"""Interface for reading embeddings off an ingest stream"""
@abstractmethod
def subscribe(
self,
collection_id: UUID,
consume_fn: ConsumerCallbackFn,
start: Optional[SeqId] = None,
end: Optional[SeqId] = None,
id: Optional[UUID] = None,
) -> UUID:
"""Register a function that will be called to receive embeddings for a given
collections log stream. The given function may be called any number of times, with any number of
records, and may be called concurrently.
Only records between start (exclusive) and end (inclusive) SeqIDs will be
returned. If start is None, the first record returned will be the next record
generated, not including those generated before creating the subscription. If
end is None, the consumer will consume indefinitely, otherwise it will
automatically be unsubscribed when the end SeqID is reached.
If the function throws an exception, the function may be called again with the
same or different records.
Takes an optional UUID as a unique subscription ID. If no ID is provided, a new
ID will be generated and returned."""
pass
@abstractmethod
def unsubscribe(self, subscription_id: UUID) -> None:
"""Unregister a subscription. The consume function will no longer be invoked,
and resources associated with the subscription will be released."""
pass
@abstractmethod
def min_seqid(self) -> SeqId:
"""Return the minimum possible SeqID in this implementation."""
pass
@abstractmethod
def max_seqid(self) -> SeqId:
"""Return the maximum possible SeqID in this implementation."""
pass
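
To make the Consumer contract concrete, here is a minimal usage sketch. The `consumer` and `collection_id` names are assumptions standing in for a concrete Consumer implementation and an existing collection; neither appears in the file above.

received = []

def on_records(records: Sequence[LogRecord]) -> None:
    # Per the subscribe contract: may be called concurrently, any number of
    # times, with any number of records.
    received.extend(records)

# start=None: deliver only records generated after the subscription is created.
# end=None: consume indefinitely until unsubscribe is called explicitly.
subscription_id = consumer.subscribe(collection_id, on_records)
# ... later, once consumption is done:
consumer.unsubscribe(subscription_id)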


@@ -0,0 +1,49 @@
import re
from typing import Tuple
from uuid import UUID
from chromadb.db.base import SqlDB
from chromadb.segment import SegmentManager, VectorReader
topic_regex = r"persistent:\/\/(?P<tenant>.+)\/(?P<namespace>.+)\/(?P<topic>.+)"
def parse_topic_name(topic_name: str) -> Tuple[str, str, str]:
"""Parse the topic name into the tenant, namespace and topic name"""
match = re.match(topic_regex, topic_name)
if not match:
raise ValueError(f"Invalid topic name: {topic_name}")
return match.group("tenant"), match.group("namespace"), match.group("topic")
def create_topic_name(tenant: str, namespace: str, collection_id: UUID) -> str:
return f"persistent://{tenant}/{namespace}/{str(collection_id)}"
def trigger_vector_segments_max_seq_id_migration(
db: SqlDB, segment_manager: SegmentManager
) -> None:
"""
Trigger the migration of vector segments' max_seq_id from the pickled metadata file to SQLite.
    Vector segments migrate this field automatically on init, so this should be used when we know segments are likely unmigrated and unloaded.
This is a no-op if all vector segments have already migrated their max_seq_id.
"""
with db.tx() as cur:
cur.execute(
"""
SELECT collection
FROM "segments"
WHERE "id" NOT IN (SELECT "segment_id" FROM "max_seq_id") AND
"type" = 'urn:chroma:segment/vector/hnsw-local-persisted'
"""
)
collection_ids_with_unmigrated_segments = [row[0] for row in cur.fetchall()]
if len(collection_ids_with_unmigrated_segments) == 0:
return
for collection_id in collection_ids_with_unmigrated_segments:
# Loading the segment triggers the migration on init
segment_manager.get_segment(UUID(collection_id), VectorReader)
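
For reference, a minimal sketch of a call site for the migration helper above. The `system` wiring is an assumption about how the SqlDB and SegmentManager instances would be obtained; it is not shown in this diff.

# Hypothetical wiring: obtain the dependencies from the running system's
# component container, then force the migration eagerly (e.g. during an
# offline maintenance step).
db = system.instance(SqlDB)
segment_manager = system.instance(SegmentManager)
trigger_vector_segments_max_seq_id_migration(db, segment_manager)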