chore: add virtual environment to the repository

- Add the backend_service/venv virtual environment
- Includes all Python dependency packages
- Note: the virtual environment is about 393 MB and contains 12,655 files
commit c4f851d387 (parent a6c2027caa)
2025-12-03 10:19:25 +08:00
12655 changed files with 3009376 additions and 0 deletions


@@ -0,0 +1,644 @@
from typing import TYPE_CHECKING
from chromadb import (
CollectionMetadata,
Embeddings,
GetResult,
IDs,
Where,
WhereDocument,
Include,
Documents,
Metadatas,
QueryResult,
URIs,
)
from chromadb.api import ServerAPI
if TYPE_CHECKING:
from chromadb.api.models.AttachedFunction import AttachedFunction
from chromadb.api.collection_configuration import (
CreateCollectionConfiguration,
UpdateCollectionConfiguration,
create_collection_configuration_to_json_str,
update_collection_configuration_to_json_str,
)
from chromadb.auth import UserIdentity
from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT, Settings, System
from chromadb.telemetry.product import ProductTelemetryClient
from chromadb.telemetry.product.events import (
CollectionAddEvent,
CollectionDeleteEvent,
CollectionGetEvent,
CollectionUpdateEvent,
CollectionQueryEvent,
ClientCreateCollectionEvent,
)
from chromadb.api.types import (
IncludeMetadataDocuments,
IncludeMetadataDocumentsDistances,
IncludeMetadataDocumentsEmbeddings,
Schema,
SearchResult,
)
# TODO(hammadb): Unify imports across types vs root __init__.py
from chromadb.types import Database, Tenant, Collection as CollectionModel
from chromadb.execution.expression.plan import Search
import chromadb_rust_bindings
from typing import Optional, Sequence, List, Dict, Any
from overrides import override
from uuid import UUID
import json
import platform
if platform.system() != "Windows":
import resource
elif platform.system() == "Windows":
import ctypes
# RustBindingsAPI is an implementation of ServerAPI which shims
# the Rust bindings to the Python API, providing a full implementation
# of the API. It could be that bindings was a direct implementation of
# ServerAPI, but in order to prevent propagating the bindings types
# into the Python API, we have to shim it here so we can convert into
# the legacy Python types.
# TODO(hammadb): Propagate the types from the bindings into the Python API
# and remove the python-level types entirely.
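# For example (illustrative, mirroring get_database below): the bindings return
# their own Database object, and the shim repackages it as the plain dict the
# legacy Python API expects:
#     database = self.bindings.get_database(name, tenant)
#     return {"id": database.id, "name": database.name, "tenant": database.tenant}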
class RustBindingsAPI(ServerAPI):
bindings: chromadb_rust_bindings.Bindings
hnsw_cache_size: int
product_telemetry_client: ProductTelemetryClient
def __init__(self, system: System):
super().__init__(system)
self.product_telemetry_client = self.require(ProductTelemetryClient)
if platform.system() != "Windows":
max_file_handles = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
else:
max_file_handles = ctypes.windll.msvcrt._getmaxstdio() # type: ignore
self.hnsw_cache_size = (
max_file_handles
# The `//` on the line below is integer division (Python 3), not a comment delimiter.
# Each HNSW index has 4 data files and 1 metadata file
// 5
)
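# Illustrative sizing: with a common soft RLIMIT_NOFILE of 1024 open file
# handles, hnsw_cache_size works out to 1024 // 5 = 204 cached HNSW indices;
# the real value depends on the host's configured file-handle limit.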
@override
def start(self) -> None:
# Construct the SqliteConfig
# TODO: We should add a "config converter"
if self._system.settings.require("is_persistent"):
persist_path = self._system.settings.require("persist_directory")
sqlite_persist_path = persist_path + "/chroma.sqlite3"
else:
persist_path = None
sqlite_persist_path = None
hash_type = self._system.settings.require("migrations_hash_algorithm")
hash_type_bindings = (
chromadb_rust_bindings.MigrationHash.MD5
if hash_type == "md5"
else chromadb_rust_bindings.MigrationHash.SHA256
)
migration_mode = self._system.settings.require("migrations")
migration_mode_bindings = (
chromadb_rust_bindings.MigrationMode.Apply
if migration_mode == "apply"
else chromadb_rust_bindings.MigrationMode.Validate
)
sqlite_config = chromadb_rust_bindings.SqliteDBConfig(
hash_type=hash_type_bindings,
migration_mode=migration_mode_bindings,
url=sqlite_persist_path,
)
self.bindings = chromadb_rust_bindings.Bindings(
allow_reset=self._system.settings.require("allow_reset"),
sqlite_db_config=sqlite_config,
persist_path=persist_path,
hnsw_cache_size=self.hnsw_cache_size,
)
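# Illustrative configuration: with is_persistent=True and
# persist_directory="./chroma", persist_path is "./chroma" and the SQLite url
# becomes "./chroma/chroma.sqlite3"; with is_persistent=False both remain None
# and nothing is persisted to disk.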
@override
def stop(self) -> None:
del self.bindings
# ////////////////////////////// Admin API //////////////////////////////
@override
def create_database(self, name: str, tenant: str = DEFAULT_TENANT) -> None:
return self.bindings.create_database(name, tenant)
@override
def get_database(self, name: str, tenant: str = DEFAULT_TENANT) -> Database:
database = self.bindings.get_database(name, tenant)
return {
"id": database.id,
"name": database.name,
"tenant": database.tenant,
}
@override
def delete_database(self, name: str, tenant: str = DEFAULT_TENANT) -> None:
return self.bindings.delete_database(name, tenant)
@override
def list_databases(
self,
limit: Optional[int] = None,
offset: Optional[int] = None,
tenant: str = DEFAULT_TENANT,
) -> Sequence[Database]:
databases = self.bindings.list_databases(limit, offset, tenant)
return [
{
"id": database.id,
"name": database.name,
"tenant": database.tenant,
}
for database in databases
]
@override
def create_tenant(self, name: str) -> None:
return self.bindings.create_tenant(name)
@override
def get_tenant(self, name: str) -> Tenant:
tenant = self.bindings.get_tenant(name)
return Tenant(name=tenant.name)
# ////////////////////////////// Base API //////////////////////////////
@override
def heartbeat(self) -> int:
return self.bindings.heartbeat()
@override
def count_collections(
self, tenant: str = DEFAULT_TENANT, database: str = DEFAULT_DATABASE
) -> int:
return self.bindings.count_collections(tenant, database)
@override
def list_collections(
self,
limit: Optional[int] = None,
offset: Optional[int] = None,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> Sequence[CollectionModel]:
collections = self.bindings.list_collections(limit, offset, tenant, database)
return [
CollectionModel(
id=collection.id,
name=collection.name,
serialized_schema=collection.schema,
configuration_json=collection.configuration,
metadata=collection.metadata,
dimension=collection.dimension,
tenant=collection.tenant,
database=collection.database,
)
for collection in collections
]
@override
def create_collection(
self,
name: str,
schema: Optional[Schema] = None,
configuration: Optional[CreateCollectionConfiguration] = None,
metadata: Optional[CollectionMetadata] = None,
get_or_create: bool = False,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> CollectionModel:
# TODO: This event doesn't capture the get_or_create case appropriately
# TODO: Re-enable embedding function tracking in create_collection
self.product_telemetry_client.capture(
ClientCreateCollectionEvent(
# NOTE: `id` here is the Python builtin, not the new collection's UUID,
# which is not known until the bindings call below returns.
collection_uuid=str(id),
# embedding_function=embedding_function.__class__.__name__,
)
)
if configuration:
configuration_json_str = create_collection_configuration_to_json_str(
configuration, metadata
)
else:
configuration_json_str = None
if schema:
schema_str = json.dumps(schema.serialize_to_json())
else:
schema_str = None
collection = self.bindings.create_collection(
name,
configuration_json_str,
schema_str,
metadata,
get_or_create,
tenant,
database,
)
collection_model = CollectionModel(
id=collection.id,
name=collection.name,
configuration_json=collection.configuration,
serialized_schema=collection.schema,
metadata=collection.metadata,
dimension=collection.dimension,
tenant=collection.tenant,
database=collection.database,
)
return collection_model
@override
def get_collection(
self,
name: str,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> CollectionModel:
collection = self.bindings.get_collection(name, tenant, database)
return CollectionModel(
id=collection.id,
name=collection.name,
configuration_json=collection.configuration,
serialized_schema=collection.schema,
metadata=collection.metadata,
dimension=collection.dimension,
tenant=collection.tenant,
database=collection.database,
)
@override
def get_or_create_collection(
self,
name: str,
schema: Optional[Schema] = None,
configuration: Optional[CreateCollectionConfiguration] = None,
metadata: Optional[CollectionMetadata] = None,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> CollectionModel:
return self.create_collection(
name, schema, configuration, metadata, True, tenant, database
)
@override
def delete_collection(
self,
name: str,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> None:
self.bindings.delete_collection(name, tenant, database)
@override
def _modify(
self,
id: UUID,
new_name: Optional[str] = None,
new_metadata: Optional[CollectionMetadata] = None,
new_configuration: Optional[UpdateCollectionConfiguration] = None,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> None:
if new_configuration:
new_configuration_json_str = update_collection_configuration_to_json_str(
new_configuration
)
else:
new_configuration_json_str = None
self.bindings.update_collection(
str(id), new_name, new_metadata, new_configuration_json_str
)
@override
def _fork(
self,
collection_id: UUID,
new_name: str,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> CollectionModel:
raise NotImplementedError(
"Collection forking is not implemented for Local Chroma"
)
@override
def _search(
self,
collection_id: UUID,
searches: List[Search],
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> SearchResult:
raise NotImplementedError("Search is not implemented for Local Chroma")
@override
def _count(
self,
collection_id: UUID,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> int:
return self.bindings.count(str(collection_id), tenant, database)
@override
def _peek(
self,
collection_id: UUID,
n: int = 10,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> GetResult:
return self._get(
collection_id,
limit=n,
tenant=tenant,
database=database,
include=IncludeMetadataDocumentsEmbeddings,
)
@override
def _get(
self,
collection_id: UUID,
ids: Optional[IDs] = None,
where: Optional[Where] = None,
limit: Optional[int] = None,
offset: Optional[int] = None,
where_document: Optional[WhereDocument] = None,
include: Include = IncludeMetadataDocuments,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> GetResult:
ids_amount = len(ids) if ids else 0
self.product_telemetry_client.capture(
CollectionGetEvent(
collection_uuid=str(collection_id),
ids_count=ids_amount,
limit=limit if limit else 0,
include_metadata=ids_amount if "metadatas" in include else 0,
include_documents=ids_amount if "documents" in include else 0,
include_uris=ids_amount if "uris" in include else 0,
)
)
rust_response = self.bindings.get(
str(collection_id),
ids,
json.dumps(where) if where else None,
limit,
offset or 0,
json.dumps(where_document) if where_document else None,
include,
tenant,
database,
)
return GetResult(
ids=rust_response.ids,
embeddings=rust_response.embeddings,
documents=rust_response.documents,
uris=rust_response.uris,
included=include,
data=None,
metadatas=rust_response.metadatas,
)
@override
def _add(
self,
ids: IDs,
collection_id: UUID,
embeddings: Embeddings,
metadatas: Optional[Metadatas] = None,
documents: Optional[Documents] = None,
uris: Optional[URIs] = None,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> bool:
self.product_telemetry_client.capture(
CollectionAddEvent(
collection_uuid=str(collection_id),
add_amount=len(ids),
with_metadata=len(ids) if metadatas is not None else 0,
with_documents=len(ids) if documents is not None else 0,
with_uris=len(ids) if uris is not None else 0,
)
)
return self.bindings.add(
ids,
str(collection_id),
embeddings,
metadatas,
documents,
uris,
tenant,
database,
)
@override
def _update(
self,
collection_id: UUID,
ids: IDs,
embeddings: Optional[Embeddings] = None,
metadatas: Optional[Metadatas] = None,
documents: Optional[Documents] = None,
uris: Optional[URIs] = None,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> bool:
self.product_telemetry_client.capture(
CollectionUpdateEvent(
collection_uuid=str(collection_id),
update_amount=len(ids),
with_embeddings=len(embeddings) if embeddings else 0,
with_metadata=len(metadatas) if metadatas else 0,
with_documents=len(documents) if documents else 0,
with_uris=len(uris) if uris else 0,
)
)
return self.bindings.update(
str(collection_id),
ids,
embeddings,
metadatas,
documents,
uris,
tenant,
database,
)
@override
def _upsert(
self,
collection_id: UUID,
ids: IDs,
embeddings: Embeddings,
metadatas: Optional[Metadatas] = None,
documents: Optional[Documents] = None,
uris: Optional[URIs] = None,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> bool:
return self.bindings.upsert(
str(collection_id),
ids,
embeddings,
metadatas,
documents,
uris,
tenant,
database,
)
@override
def _query(
self,
collection_id: UUID,
query_embeddings: Embeddings,
ids: Optional[IDs] = None,
n_results: int = 10,
where: Optional[Where] = None,
where_document: Optional[WhereDocument] = None,
include: Include = IncludeMetadataDocumentsDistances,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> QueryResult:
query_amount = len(query_embeddings)
filtered_ids_amount = len(ids) if ids else 0
self.product_telemetry_client.capture(
CollectionQueryEvent(
collection_uuid=str(collection_id),
query_amount=query_amount,
filtered_ids_amount=filtered_ids_amount,
n_results=n_results,
with_metadata_filter=query_amount if where is not None else 0,
with_document_filter=query_amount if where_document is not None else 0,
include_metadatas=query_amount if "metadatas" in include else 0,
include_documents=query_amount if "documents" in include else 0,
include_uris=query_amount if "uris" in include else 0,
include_distances=query_amount if "distances" in include else 0,
)
)
rust_response = self.bindings.query(
str(collection_id),
ids,
query_embeddings,
n_results,
json.dumps(where) if where else None,
json.dumps(where_document) if where_document else None,
include,
tenant,
database,
)
return QueryResult(
ids=rust_response.ids,
embeddings=rust_response.embeddings,
documents=rust_response.documents,
uris=rust_response.uris,
included=include,
data=None,
metadatas=rust_response.metadatas,
distances=rust_response.distances,
)
@override
def _delete(
self,
collection_id: UUID,
ids: Optional[IDs] = None,
where: Optional[Where] = None,
where_document: Optional[WhereDocument] = None,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> None:
self.product_telemetry_client.capture(
CollectionDeleteEvent(
# NOTE: the delete amount is not observable from python
# TODO: Fix this when posthog is pushed into Rust frontend
collection_uuid=str(collection_id),
delete_amount=0,
)
)
return self.bindings.delete(
str(collection_id),
ids,
json.dumps(where) if where else None,
json.dumps(where_document) if where_document else None,
tenant,
database,
)
@override
def reset(self) -> bool:
return self.bindings.reset()
@override
def get_version(self) -> str:
return self.bindings.get_version()
@override
def get_settings(self) -> Settings:
return self._system.settings
@override
def get_max_batch_size(self) -> int:
return self.bindings.get_max_batch_size()
@override
def attach_function(
self,
function_id: str,
name: str,
input_collection_id: UUID,
output_collection: str,
params: Optional[Dict[str, Any]] = None,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> "AttachedFunction":
"""Attached functions are not supported in the Rust bindings (local embedded mode)."""
raise NotImplementedError(
"Attached functions are only supported when connecting to a Chroma server via HttpClient. "
"The Rust bindings (embedded mode) do not support attached function operations."
)
@override
def detach_function(
self,
attached_function_id: UUID,
delete_output: bool = False,
tenant: str = DEFAULT_TENANT,
database: str = DEFAULT_DATABASE,
) -> bool:
"""Attached functions are not supported in the Rust bindings (local embedded mode)."""
raise NotImplementedError(
"Attached functions are only supported when connecting to a Chroma server via HttpClient. "
"The Rust bindings (embedded mode) do not support attached function operations."
)
# TODO: Remove this if it's not planned to be used
@override
def get_user_identity(self) -> UserIdentity:
return UserIdentity(
user_id="",
tenant=DEFAULT_TENANT,
databases=[DEFAULT_DATABASE],
)
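# Illustrative usage (an assumption, not part of this file): locally, Chroma's
# System is expected to resolve ServerAPI to this RustBindingsAPI shim when the
# embedded/persistent client is configured; the exact wiring may differ by version.
#
#     from chromadb.api import ServerAPI
#     from chromadb.config import Settings, System
#
#     system = System(Settings(is_persistent=True, persist_directory="./chroma"))
#     api = system.instance(ServerAPI)  # assumed to resolve to RustBindingsAPI
#     system.start()                    # constructs the Rust Bindings (see start())
#     print(api.heartbeat())            # round-trips through chromadb_rust_bindings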