chore: 添加虚拟环境到仓库

- 添加 backend_service/venv 虚拟环境
- 包含所有Python依赖包
- 注意:虚拟环境约393MB,包含12655个文件
This commit is contained in:
2025-12-03 10:19:25 +08:00
parent a6c2027caa
commit c4f851d387
12655 changed files with 3009376 additions and 0 deletions

View File

@@ -0,0 +1,71 @@
import sys
import subprocess
import os
import tempfile
from types import ModuleType
from typing import Dict, List
# Root directory for the per-version chromadb installs: lives under the system
# temp dir and is namespaced by the PYTEST_XDIST_WORKER environment variable
# ("unknown" when that variable is not set).
base_install_dir = "".join(
    (
        tempfile.gettempdir(),
        f"/worker-{os.environ.get('PYTEST_XDIST_WORKER', 'unknown')}",
        "/persistence_test_chromadb_versions",
    )
)
def get_path_to_version_install(version: str) -> str:
    """Return the directory that holds the install of the given version.

    The result is ``base_install_dir`` followed by ``/`` and ``version``.
    """
    return f"{base_install_dir}/{version}"
def switch_to_version(version: str, versioned_modules: List[str]) -> ModuleType:
    """Re-import ``chromadb`` from the install directory of ``version``.

    Args:
        version: Version string whose install directory is put first on
            ``sys.path``; the imported module's ``__version__`` must equal it.
        versioned_modules: Extra top-level module names whose cached entries
            (the module itself and its dotted submodules) are dropped from
            ``sys.modules`` alongside ``chromadb``.

    Returns:
        The freshly imported ``chromadb`` module.

    Raises:
        AssertionError: If the imported module reports a different version.
    """
    module_name = "chromadb"
    # chromadb itself is treated exactly like the caller-supplied modules:
    # a cached entry is stale when it is one of these roots or lives under one.
    roots = [module_name, *versioned_modules]

    def _is_stale(name: str) -> bool:
        return any(name == root or name.startswith(root + ".") for root in roots)

    # Collect the names first so sys.modules is not mutated while iterated.
    stale_names = [name for name in sys.modules if _is_stale(name)]
    for name in stale_names:
        del sys.modules[name]

    # Put the versioned install ahead of everything else on the import path so
    # the import below resolves to it.
    # https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
    install_dir = get_path_to_version_install(version)
    sys.path.insert(0, install_dir)

    import chromadb

    assert chromadb.__version__ == version
    return chromadb
def get_path_to_version_library(version: str) -> str:
    """Return the path of ``chromadb/__init__.py`` inside a version's install dir."""
    install_root = get_path_to_version_install(version)
    return f"{install_root}/chromadb/__init__.py"
def install_version(version: str, dep_overrides: Dict[str, str]) -> None:
    """Install ``chromadb==<version>`` into its install directory if absent.

    Args:
        version: The chromadb version to install.
        dep_overrides: Mapping of dependency name to version specifier,
            forwarded unchanged to ``install``.
    """
    # The package's __init__.py existing on disk is taken to mean the version
    # is already installed, in which case nothing is done.
    marker_file = get_path_to_version_library(version)
    if not os.path.exists(marker_file):
        target_dir = get_path_to_version_install(version)
        install(f"chromadb=={version}", target_dir, dep_overrides)
def install(pkg: str, path: str, dep_overrides: Dict[str, str]) -> int:
    """Run ``pip install`` for ``pkg`` with ``path`` as the ``--target`` directory.

    Args:
        pkg: Requirement string handed to pip (e.g. ``chromadb==<version>``).
        path: Directory to install into; created if it does not exist.
        dep_overrides: Mapping of dependency name to version specifier; each
            pair is concatenated into one extra requirement argument.

    Returns:
        The return value of ``subprocess.check_call`` for the pip command.
    """
    os.makedirs(path, exist_ok=True)

    extra_requirements = [
        f"{dep}{operator_version}" for dep, operator_version in dep_overrides.items()
    ]
    # -q -q to suppress pip output to ERROR level
    # https://pip.pypa.io/en/stable/cli/pip/#quiet
    command = [
        sys.executable,
        "-m",
        "pip",
        "-q",
        "-q",
        "install",
        pkg,
        *extra_requirements,
    ]

    # --no-binary=chroma-hnswlib is only added when chroma-hnswlib is named
    # either in the package spec or among the overridden dependencies.
    mentions_hnswlib = "chroma-hnswlib" in pkg or any(
        "chroma-hnswlib" in dep for dep in dep_overrides
    )
    if mentions_hnswlib:
        command.append("--no-binary=chroma-hnswlib")

    command.append(f"--target={path}")
    print(f"Installing chromadb version {pkg} to {path}")
    return subprocess.check_call(command)

View File

@@ -0,0 +1,7 @@
from chromadb.utils.distance_functions import cosine
import numpy as np
def test_cosine_zero() -> None:
    """``cosine`` of a float16 zero vector with itself must equal 1.0."""
    zero_vector = np.zeros(2, dtype=np.float16)
    result = cosine(zero_vector, zero_vector)
    assert result == 1.0

View File

@@ -0,0 +1,145 @@
import pytest
from typing import List, Any, Callable
from jsonschema import ValidationError
from unittest.mock import MagicMock, create_autospec
from chromadb.utils.embedding_functions.schemas import (
validate_config_schema,
load_schema,
get_available_schemas,
)
from chromadb.utils.embedding_functions import known_embedding_functions
from chromadb.api.types import Documents, Embeddings
from pytest import MonkeyPatch
# Names of embedding functions that are left out of the tests
SKIP_EMBEDDING_FUNCTIONS = ["chroma_langchain"]
def get_embedding_function_names() -> List[str]:
    """Return every known embedding function name that is not skip-listed."""
    selected: List[str] = []
    for candidate in known_embedding_functions.keys():
        if candidate in SKIP_EMBEDDING_FUNCTIONS:
            continue
        selected.append(candidate)
    return selected
class TestEmbeddingFunctionSchemas:
    """Tests for embedding function config round-trips and config schemas."""

    @pytest.mark.parametrize("ef_name", get_embedding_function_names())
    def test_embedding_function_config_roundtrip(
        self,
        ef_name: str,
        mock_embeddings: Callable[[Documents], Embeddings],
        mock_common_deps: MonkeyPatch,
    ) -> None:
        """Check that get_config -> build_from_config -> get_config is stable.

        Args:
            ef_name: Key into ``known_embedding_functions`` for the class under test.
            mock_embeddings: Fixture used to produce the mocked ``__call__`` result.
            mock_common_deps: Monkeypatch fixture used to patch the class constructor.
        """
        ef_class = known_embedding_functions[ef_name]

        # Create an autospec of the embedding function class
        mock_ef = create_autospec(ef_class, instance=True)

        # Mock the __call__ method
        mock_call = MagicMock(return_value=mock_embeddings(["test"]))
        mock_ef.__call__ = mock_call

        # For chroma-cloud-qwen, mock get_config to return valid data
        if ef_name == "chroma-cloud-qwen":
            from chromadb.utils.embedding_functions.chroma_cloud_qwen_embedding_function import (
                ChromaCloudQwenEmbeddingModel,
                CHROMA_CLOUD_QWEN_DEFAULT_INSTRUCTIONS,
            )

            mock_ef.get_config.return_value = {
                "api_key_env_var": "CHROMA_API_KEY",
                "model": ChromaCloudQwenEmbeddingModel.QWEN3_EMBEDDING_0p6B.value,
                "task": "nl_to_code",
                "instructions": CHROMA_CLOUD_QWEN_DEFAULT_INSTRUCTIONS,
            }

        # Mock the class constructor to return our mock instance
        mock_common_deps.setattr(
            ef_class, "__new__", lambda cls, *args, **kwargs: mock_ef
        )

        # Create instance with minimal args (constructor will be mocked)
        ef_instance = ef_class()

        # NOTE(review): with __new__ patched to hand back mock_ef, this call
        # presumably reaches the autospec'd get_config rather than the real
        # implementation - confirm that this is the intended coverage.
        config = ef_instance.get_config()

        # Test recreation from config
        new_instance = ef_class.build_from_config(config)
        new_config = new_instance.get_config()

        # Configs should match
        assert (
            config == new_config
        ), f"Configs don't match after recreation for {ef_name}"

    def _build_minimal_config(self, schema: Any) -> Any:
        """Build a config holding a dummy value for each required schema field.

        Args:
            schema: A loaded schema mapping. Its ``required`` list (if any) names
                the fields to fill; each field's ``type`` is looked up under
                ``properties``.

        Returns:
            A dict mapping every required field name to a dummy value; empty
            when the schema has no ``required`` entry.
        """
        config = {}
        for field in schema.get("required", []):
            declared_type = schema["properties"][field]["type"]
            # When "type" is a list of alternatives, the first one is used.
            field_type = (
                declared_type[0] if isinstance(declared_type, list) else declared_type
            )
            config[field] = self._get_dummy_value(field_type)
        return config

    def test_schema_required_fields(self) -> None:
        """Test that schemas enforce required fields"""
        for schema_name in get_available_schemas():
            schema = load_schema(schema_name)
            if "required" not in schema:
                continue

            # Create minimal valid config
            config = self._build_minimal_config(schema)

            # Dropping any single required field must fail validation
            for field in schema["required"]:
                test_config = config.copy()
                del test_config[field]
                with pytest.raises(ValidationError):
                    validate_config_schema(test_config, schema_name)

    @staticmethod
    def _get_dummy_value(field_type: str) -> Any:
        """Get a dummy value for a given field type.

        Unrecognized type names fall back to the string ``"dummy"``.
        """
        type_map = {
            "string": "dummy",
            "integer": 0,
            "number": 0.0,
            "boolean": False,
            "object": {},
            "array": [],
        }
        return type_map.get(field_type, "dummy")

    def test_schema_additional_properties(self) -> None:
        """Test that schemas reject additional properties"""
        for schema_name in get_available_schemas():
            schema = load_schema(schema_name)

            # Start from the required fields, then add one undeclared property
            test_config = self._build_minimal_config(schema)
            test_config["additional_property"] = "value"

            # Only schemas that explicitly set additionalProperties to false
            # are expected to reject the extra key.
            if schema.get("additionalProperties", True) is False:
                with pytest.raises(ValidationError):
                    validate_config_schema(test_config, schema_name)

View File

@@ -0,0 +1,184 @@
import numpy as np
from typing import List, Dict, Any, cast, Union
from chromadb.utils.results import (
_transform_embeddings,
_add_query_fields,
_add_get_fields,
query_result_to_dfs,
get_result_to_df,
)
from chromadb.api.types import (
QueryResult,
GetResult,
)
from numpy.typing import NDArray
def test_transform_embeddings() -> None:
    """Check ``_transform_embeddings`` on ``None`` and on numpy-array input."""
    expected = [[1.0, 2.0], [3.0, 4.0]]

    # None input yields None
    assert _transform_embeddings(None) is None

    # A list of numpy arrays compares equal to nested Python lists afterwards
    first_batch = cast(
        List[NDArray[Union[np.int32, np.float32]]],
        [np.array([1.0, 2.0]), np.array([3.0, 4.0])],
    )
    first_result = _transform_embeddings(first_batch)
    assert isinstance(first_result, list)
    assert first_result == expected

    # NOTE(review): this case was labelled "list of lists", but the input is
    # again a list of numpy arrays, so it repeats the previous scenario.
    second_batch = cast(
        List[NDArray[Union[np.int32, np.float32]]],
        [np.array([1.0, 2.0]), np.array([3.0, 4.0])],
    )
    second_result = _transform_embeddings(second_batch)
    assert second_result == expected
def test_add_query_fields() -> None:
    """Check the keys ``_add_query_fields`` fills in for query index 0."""
    query_result: QueryResult = {
        "ids": [["id1"], ["id2"]],
        "embeddings": [[np.array([1.0, 2.0])], [np.array([3.0, 4.0])]],
        "documents": [["doc1"], ["doc2"]],
        "metadatas": [[{"key": "value1"}], [{"key": "value2"}]],
        "distances": [[0.1], [0.2]],
        "uris": [["uri1", "uri2"]],
        "data": [
            [np.array([1, 2, 3]), np.array([4, 5, 6])]
        ],  # Using numpy arrays as Image type
        "included": ["embeddings", "documents", "metadatas", "distances"],
    }
    collected: Dict[str, Any] = {}

    _add_query_fields(collected, query_result, 0)

    # Each populated key must hold the entries of the first query only
    expected_embeddings = [np.array([1.0, 2.0])]
    assert np.array_equal(collected["embedding"], expected_embeddings)
    assert collected["document"] == ["doc1"]
    assert collected["metadata"] == [{"key": "value1"}]
    assert collected["distance"] == [0.1]
def test_add_get_fields() -> None:
    """Check the keys ``_add_get_fields`` fills in from a get result."""
    get_result: GetResult = {
        "ids": ["id1", "id2"],
        "embeddings": [np.array([1.0, 2.0]), np.array([3.0, 4.0])],
        "documents": ["doc1", "doc2"],
        "metadatas": [{"key": "value1"}, {"key": "value2"}],
        "uris": ["uri1", "uri2"],
        "data": [
            np.array([1, 2, 3]),
            np.array([4, 5, 6]),
        ],  # Using numpy arrays as Image type
        "included": ["embeddings", "documents", "metadatas"],
    }
    collected: Dict[str, Any] = {}

    _add_get_fields(collected, get_result)

    # Embeddings are compared pairwise because they are numpy arrays
    expected_embeddings = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
    for actual, expected in zip(collected["embedding"], expected_embeddings):
        assert np.array_equal(actual, expected)
    assert collected["document"] == ["doc1", "doc2"]
    assert collected["metadata"] == [{"key": "value1"}, {"key": "value2"}]
def test_query_result_to_dfs() -> None:
    """Check the frame ``query_result_to_dfs`` builds for a single query."""
    query_result: QueryResult = {
        "ids": [["id1", "id2"]],
        "embeddings": [[np.array([1.0, 2.0]), np.array([3.0, 4.0])]],
        "documents": [["doc1", "doc2"]],
        "metadatas": [[{"key": "value1"}, {"key": "value2"}]],
        "distances": [[0.1, 0.2]],
        "uris": [["uri1", "uri2"]],
        "data": [
            [np.array([1, 2, 3]), np.array([4, 5, 6])]
        ],  # Using numpy arrays as Image type
        "included": ["embeddings", "documents", "metadatas", "distances"],
    }

    frames = query_result_to_dfs(query_result)

    # Only one query was supplied, so exactly one frame is expected
    assert len(frames) == 1
    frame = frames[0]

    # First row: indexed by id and carrying the first hit's values
    assert frame.index[0] == "id1"
    assert frame["document"].iloc[0] == "doc1"
    assert frame["metadata"].iloc[0] == {"key": "value1"}
    assert np.array_equal(frame["embedding"].iloc[0], np.array([1.0, 2.0]))
    assert frame["distance"].iloc[0] == 0.1

    # Column order
    expected_columns = ["embedding", "document", "metadata", "distance"]
    assert list(frame.columns) == expected_columns
def test_get_result_to_df() -> None:
    """Check the frame ``get_result_to_df`` builds from a full get result."""
    get_result: GetResult = {
        "ids": ["id1", "id2"],
        "embeddings": [np.array([1.0, 2.0]), np.array([3.0, 4.0])],
        "documents": ["doc1", "doc2"],
        "metadatas": [{"key": "value1"}, {"key": "value2"}],
        "uris": ["uri1", "uri2"],
        "data": [
            np.array([1, 2, 3]),
            np.array([4, 5, 6]),
        ],  # Using numpy arrays as Image type
        "included": ["embeddings", "documents", "metadatas"],
    }

    frame = get_result_to_df(get_result)

    # One row per id, indexed by id
    assert len(frame) == 2
    assert list(frame.index) == ["id1", "id2"]

    assert frame["document"].tolist() == ["doc1", "doc2"]
    assert frame["metadata"].tolist() == [{"key": "value1"}, {"key": "value2"}]

    # Embeddings are compared pairwise because they are numpy arrays
    expected_embeddings = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
    for actual, expected in zip(frame["embedding"].tolist(), expected_embeddings):
        assert np.array_equal(actual, expected)

    # Column order
    assert list(frame.columns) == ["embedding", "document", "metadata"]
def test_query_result_to_dfs_with_missing_fields() -> None:
    """Check that only the included ``documents`` field becomes a column."""
    query_result: QueryResult = {
        "ids": [["id1"]],
        "documents": [["doc1"]],
        "embeddings": [[]],  # type:ignore
        "metadatas": [[]],
        "distances": [[]],
        "uris": [[]],
        "data": [[]],
        "included": ["documents"],
    }

    frames = query_result_to_dfs(query_result)
    assert len(frames) == 1

    frame = frames[0]
    assert frame.index[0] == "id1"
    assert frame["document"].iloc[0] == "doc1"

    # Fields that were not included must not appear as columns
    for absent_column in ("metadata", "embedding", "distance"):
        assert absent_column not in frame.columns
def test_get_result_to_df_with_missing_fields() -> None:
    """Check that only the included ``documents`` field becomes a column."""
    get_result: GetResult = {
        "ids": ["id1", "id2"],
        "documents": ["doc1", "doc2"],
        "embeddings": [],
        "metadatas": [],
        "uris": [],
        "data": [],
        "included": ["documents"],
    }

    frame = get_result_to_df(get_result)

    expected_ids = ["id1", "id2"]
    assert len(frame) == 2
    assert list(frame.index) == expected_ids
    assert frame["document"].tolist() == ["doc1", "doc2"]

    # Fields that were not included must not appear as columns
    for absent_column in ("metadata", "embedding"):
        assert absent_column not in frame.columns

View File

@@ -0,0 +1,30 @@
import time
from chromadb.api import ClientAPI
from chromadb.test.conftest import COMPACTION_SLEEP
# Number of seconds slept (via time.sleep) between successive version polls
TIMEOUT_INTERVAL = 1
def get_collection_version(client: ClientAPI, collection_name: str) -> int:
    """Return the ``version`` entry of the named collection's model.

    Args:
        client: Client used to look the collection up by name.
        collection_name: Name of the collection to inspect.
    """
    collection = client.get_collection(collection_name)
    model = collection.get_model()
    return model["version"]
def wait_for_version_increase(
    client: ClientAPI,
    collection_name: str,
    initial_version: int,
    additional_time: int = 0,
) -> int:
    """Poll a collection until its version differs from ``initial_version``.

    Args:
        client: Client used to look the collection up.
        collection_name: Name of the collection to poll.
        initial_version: Version to wait to move away from.
        additional_time: Extra seconds granted on top of ``COMPACTION_SLEEP``
            before giving up.

    Returns:
        The first observed version that is not equal to ``initial_version``.

    Raises:
        TimeoutError: If the version is still unchanged once the time budget
            (``COMPACTION_SLEEP + additional_time`` seconds) has elapsed.
    """
    # Elapsed time is measured on the monotonic clock so that wall-clock
    # adjustments cannot shorten or stretch the timeout.
    deadline = time.monotonic() + additional_time + COMPACTION_SLEEP

    curr_version = get_collection_version(client, collection_name)
    while curr_version == initial_version:
        time.sleep(TIMEOUT_INTERVAL)
        if time.monotonic() > deadline:
            # The collection is looked up again only to put its id in the error.
            collection_id = client.get_collection(collection_name).id
            raise TimeoutError(f"Model was not updated in time for {collection_id}")
        curr_version = get_collection_version(client, collection_name)
    return curr_version