chore: add virtual environment to the repository

- Add the backend_service/venv virtual environment
- Includes all Python dependency packages
- Note: the virtual environment is about 393MB and contains 12655 files
2025-12-03 10:19:25 +08:00
parent a6c2027caa
commit c4f851d387
12655 changed files with 3009376 additions and 0 deletions


@@ -0,0 +1,121 @@
from abc import abstractmethod
from typing import Callable, Optional, Sequence
from chromadb.types import (
OperationRecord,
LogRecord,
SeqId,
Vector,
ScalarEncoding,
)
from chromadb.config import Component
from uuid import UUID
import numpy as np
def encode_vector(vector: Vector, encoding: ScalarEncoding) -> bytes:
"""Encode a vector into a byte array."""
if encoding == ScalarEncoding.FLOAT32:
return np.array(vector, dtype=np.float32).tobytes()
elif encoding == ScalarEncoding.INT32:
return np.array(vector, dtype=np.int32).tobytes()
else:
raise ValueError(f"Unsupported encoding: {encoding.value}")
def decode_vector(vector: bytes, encoding: ScalarEncoding) -> Vector:
    """Decode a byte array into a vector."""
    if encoding == ScalarEncoding.FLOAT32:
        return np.frombuffer(vector, dtype=np.float32)
    elif encoding == ScalarEncoding.INT32:
        return np.frombuffer(vector, dtype=np.int32)
    else:
        raise ValueError(f"Unsupported encoding: {encoding.value}")
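
# Illustrative sketch (not part of the committed file): a round-trip through the
# codec above, assuming a plain Python list as the input vector. FLOAT32 encodes
# each component as 4 bytes.
# >>> raw = encode_vector([1.0, 2.0, 3.0], ScalarEncoding.FLOAT32)
# >>> len(raw)
# 12
# >>> decode_vector(raw, ScalarEncoding.FLOAT32)
# array([1., 2., 3.], dtype=float32)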
class Producer(Component):
"""Interface for writing embeddings to an ingest stream"""
@abstractmethod
    def delete_log(self, collection_id: UUID) -> None:
        """Delete the log for the given collection."""
        pass
@abstractmethod
def purge_log(self, collection_id: UUID) -> None:
"""Truncates the log for the given collection, removing all seen records."""
pass
@abstractmethod
def submit_embedding(
self, collection_id: UUID, embedding: OperationRecord
) -> SeqId:
"""Add an embedding record to the given collections log. Returns the SeqID of the record."""
pass
@abstractmethod
def submit_embeddings(
self, collection_id: UUID, embeddings: Sequence[OperationRecord]
) -> Sequence[SeqId]:
"""Add a batch of embedding records to the given collections log. Returns the SeqIDs of
the records. The returned SeqIDs will be in the same order as the given
SubmitEmbeddingRecords. However, it is not guaranteed that the SeqIDs will be
processed in the same order as the given SubmitEmbeddingRecords. If the number
of records exceeds the maximum batch size, an exception will be thrown."""
pass
@property
@abstractmethod
def max_batch_size(self) -> int:
"""Return the maximum number of records that can be submitted in a single call
to submit_embeddings."""
pass
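
# Illustrative sketch (not part of the committed file): one way a caller might
# respect max_batch_size when submitting many records. `producer` stands in for
# any concrete Producer implementation; the helper name is hypothetical.
def submit_in_batches(
    producer: Producer, collection_id: UUID, records: Sequence[OperationRecord]
) -> Sequence[SeqId]:
    seq_ids = []
    batch = producer.max_batch_size
    # submit_embeddings throws if a single call exceeds max_batch_size,
    # so chunk the input before submitting.
    for i in range(0, len(records), batch):
        seq_ids.extend(
            producer.submit_embeddings(collection_id, records[i : i + batch])
        )
    return seq_ids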
ConsumerCallbackFn = Callable[[Sequence[LogRecord]], None]
class Consumer(Component):
"""Interface for reading embeddings off an ingest stream"""
@abstractmethod
def subscribe(
self,
collection_id: UUID,
consume_fn: ConsumerCallbackFn,
start: Optional[SeqId] = None,
end: Optional[SeqId] = None,
id: Optional[UUID] = None,
) -> UUID:
"""Register a function that will be called to receive embeddings for a given
collections log stream. The given function may be called any number of times, with any number of
records, and may be called concurrently.
Only records between start (exclusive) and end (inclusive) SeqIDs will be
returned. If start is None, the first record returned will be the next record
generated, not including those generated before creating the subscription. If
end is None, the consumer will consume indefinitely, otherwise it will
automatically be unsubscribed when the end SeqID is reached.
If the function throws an exception, the function may be called again with the
same or different records.
Takes an optional UUID as a unique subscription ID. If no ID is provided, a new
ID will be generated and returned."""
pass
@abstractmethod
def unsubscribe(self, subscription_id: UUID) -> None:
"""Unregister a subscription. The consume function will no longer be invoked,
and resources associated with the subscription will be released."""
pass
@abstractmethod
def min_seqid(self) -> SeqId:
"""Return the minimum possible SeqID in this implementation."""
pass
@abstractmethod
def max_seqid(self) -> SeqId:
"""Return the maximum possible SeqID in this implementation."""
pass
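
To make the Consumer contract concrete, here is a minimal usage sketch. The `consumer` and `collection_id` names are assumptions standing in for a concrete Consumer implementation and an existing collection; neither appears in the file above.

received = []

def on_records(records: Sequence[LogRecord]) -> None:
    # Per the subscribe contract: may be called concurrently, any number of
    # times, with any number of records.
    received.extend(records)

# start=None: deliver only records generated after the subscription is created.
# end=None: consume indefinitely until unsubscribe is called explicitly.
subscription_id = consumer.subscribe(collection_id, on_records)
# ... later, once consumption is done:
consumer.unsubscribe(subscription_id)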


@@ -0,0 +1,49 @@
import re
from typing import Tuple
from uuid import UUID
from chromadb.db.base import SqlDB
from chromadb.segment import SegmentManager, VectorReader
topic_regex = r"persistent:\/\/(?P<tenant>.+)\/(?P<namespace>.+)\/(?P<topic>.+)"
def parse_topic_name(topic_name: str) -> Tuple[str, str, str]:
"""Parse the topic name into the tenant, namespace and topic name"""
match = re.match(topic_regex, topic_name)
if not match:
raise ValueError(f"Invalid topic name: {topic_name}")
return match.group("tenant"), match.group("namespace"), match.group("topic")
def create_topic_name(tenant: str, namespace: str, collection_id: UUID) -> str:
return f"persistent://{tenant}/{namespace}/{str(collection_id)}"
def trigger_vector_segments_max_seq_id_migration(
db: SqlDB, segment_manager: SegmentManager
) -> None:
"""
Trigger the migration of vector segments' max_seq_id from the pickled metadata file to SQLite.
    Vector segments migrate this field automatically on init, so this should be used when we know segments are likely unmigrated and unloaded.
This is a no-op if all vector segments have already migrated their max_seq_id.
"""
with db.tx() as cur:
cur.execute(
"""
SELECT collection
FROM "segments"
WHERE "id" NOT IN (SELECT "segment_id" FROM "max_seq_id") AND
"type" = 'urn:chroma:segment/vector/hnsw-local-persisted'
"""
)
collection_ids_with_unmigrated_segments = [row[0] for row in cur.fetchall()]
if len(collection_ids_with_unmigrated_segments) == 0:
return
for collection_id in collection_ids_with_unmigrated_segments:
# Loading the segment triggers the migration on init
segment_manager.get_segment(UUID(collection_id), VectorReader)
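
For reference, a minimal sketch of a call site for the migration helper above. The `system` wiring is an assumption about how the SqlDB and SegmentManager instances would be obtained; it is not shown in this diff.

# Hypothetical wiring: obtain the dependencies from the running system's
# component container, then force the migration eagerly (e.g. during an
# offline maintenance step).
db = system.instance(SqlDB)
segment_manager = system.instance(SegmentManager)
trigger_vector_segments_max_seq_id_migration(db, segment_manager)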