chore: 添加虚拟环境到仓库
- 添加 backend_service/venv 虚拟环境 - 包含所有Python依赖包 - 注意:虚拟环境约393MB,包含12655个文件
This commit is contained in:
@@ -0,0 +1,71 @@
|
||||
import sys
|
||||
import subprocess
|
||||
import os
|
||||
import tempfile
|
||||
from types import ModuleType
|
||||
from typing import Dict, List
|
||||
|
||||
# Root directory for the per-version chromadb installs. The path embeds the
# PYTEST_XDIST_WORKER environment value ("unknown" when it is not set).
base_install_dir = "/".join(
    (
        tempfile.gettempdir(),
        f"worker-{os.environ.get('PYTEST_XDIST_WORKER', 'unknown')}",
        "persistence_test_chromadb_versions",
    )
)
|
||||
|
||||
|
||||
def get_path_to_version_install(version: str) -> str:
    """Return the directory below ``base_install_dir`` used for ``version``."""
    return f"{base_install_dir}/{version}"
|
||||
|
||||
|
||||
def switch_to_version(version: str, versioned_modules: List[str]) -> ModuleType:
    """Re-import ``chromadb`` from the install directory of ``version``.

    Args:
        version: Version string; also selects the install directory.
        versioned_modules: Extra module names that are evicted from
            ``sys.modules`` together with their submodules.

    Returns:
        The freshly imported ``chromadb`` module. An ``assert`` checks that
        its ``__version__`` equals ``version``.
    """
    module_name = "chromadb"
    evicted_roots = [module_name, *versioned_modules]
    exact_names = set(evicted_roots)
    submodule_prefixes = tuple(f"{root}." for root in evicted_roots)

    # Collect the names first: sys.modules is mutated in the loop below.
    stale_names = [
        loaded
        for loaded in list(sys.modules)
        if loaded in exact_names or loaded.startswith(submodule_prefixes)
    ]
    for loaded in stale_names:
        del sys.modules[loaded]

    # Put the requested install directory first on sys.path so the import
    # below is resolved from it.
    sys.path.insert(0, get_path_to_version_install(version))
    import chromadb

    assert chromadb.__version__ == version
    return chromadb
|
||||
|
||||
|
||||
def get_path_to_version_library(version: str) -> str:
    """Return the path of ``chromadb/__init__.py`` inside the install directory of ``version``."""
    install_dir = get_path_to_version_install(version)
    return f"{install_dir}/chromadb/__init__.py"
|
||||
|
||||
|
||||
def install_version(version: str, dep_overrides: Dict[str, str]) -> None:
    """Install ``chromadb==<version>`` into its install directory if missing.

    The install is skipped when ``chromadb/__init__.py`` already exists under
    the version's install directory.

    Args:
        version: chromadb version to install.
        dep_overrides: Passed through unchanged to ``install``.
    """
    if not os.path.exists(get_path_to_version_library(version)):
        install(
            f"chromadb=={version}",
            get_path_to_version_install(version),
            dep_overrides,
        )
|
||||
|
||||
|
||||
def install(pkg: str, path: str, dep_overrides: Dict[str, str]) -> int:
    """Run ``pip install`` for ``pkg`` with ``path`` as the target directory.

    Args:
        pkg: Requirement string handed to pip.
        path: Target directory; created when it does not exist.
        dep_overrides: Maps a dependency name to an operator-plus-version
            suffix. Each key/value pair is concatenated and passed to pip as
            an additional requirement.

    Returns:
        The value returned by ``subprocess.check_call``.
    """
    os.makedirs(path, exist_ok=True)

    override_requirements = [
        f"{dep}{operator_version}"
        for dep, operator_version in dep_overrides.items()
    ]

    # --no-binary=chroma-hnswlib is only added when that package is named
    # in the requested package or in one of the override keys.
    mentions_hnswlib = "chroma-hnswlib" in pkg or any(
        "chroma-hnswlib" in dep for dep in dep_overrides
    )
    no_binary_flag = ["--no-binary=chroma-hnswlib"] if mentions_hnswlib else []

    command = [
        sys.executable,
        "-m",
        "pip",
        # -q -q to suppress pip output to ERROR level
        # https://pip.pypa.io/en/stable/cli/pip/#quiet
        "-q",
        "-q",
        "install",
        pkg,
        *override_requirements,
        *no_binary_flag,
        f"--target={path}",
    ]

    print(f"Installing chromadb version {pkg} to {path}")
    return subprocess.check_call(command)
|
||||
@@ -0,0 +1,7 @@
|
||||
from chromadb.utils.distance_functions import cosine
|
||||
import numpy as np
|
||||
|
||||
|
||||
def test_cosine_zero() -> None:
    """Assert that ``cosine`` of an all-zero float16 vector with itself is 1.0."""
    zero_vector = np.zeros(2, dtype=np.float16)
    assert cosine(zero_vector, zero_vector) == 1.0
|
||||
@@ -0,0 +1,145 @@
|
||||
import pytest
|
||||
from typing import List, Any, Callable
|
||||
from jsonschema import ValidationError
|
||||
from unittest.mock import MagicMock, create_autospec
|
||||
from chromadb.utils.embedding_functions.schemas import (
|
||||
validate_config_schema,
|
||||
load_schema,
|
||||
get_available_schemas,
|
||||
)
|
||||
from chromadb.utils.embedding_functions import known_embedding_functions
|
||||
from chromadb.api.types import Documents, Embeddings
|
||||
from pytest import MonkeyPatch
|
||||
|
||||
# Embedding function names that get_embedding_function_names() filters out.
SKIP_EMBEDDING_FUNCTIONS = ["chroma_langchain"]
|
||||
|
||||
|
||||
def get_embedding_function_names() -> List[str]:
    """Return the known embedding function names not in SKIP_EMBEDDING_FUNCTIONS."""
    selected: List[str] = []
    for candidate in known_embedding_functions.keys():
        if candidate in SKIP_EMBEDDING_FUNCTIONS:
            continue
        selected.append(candidate)
    return selected
|
||||
|
||||
|
||||
class TestEmbeddingFunctionSchemas:
    """Test class for embedding function schemas"""

    @pytest.mark.parametrize("ef_name", get_embedding_function_names())
    def test_embedding_function_config_roundtrip(
        self,
        ef_name: str,
        mock_embeddings: Callable[[Documents], Embeddings],
        mock_common_deps: MonkeyPatch,
    ) -> None:
        """Test embedding function configuration roundtrip"""
        ef_class = known_embedding_functions[ef_name]

        # Create an autospec of the embedding function class
        mock_ef = create_autospec(ef_class, instance=True)

        # Mock the __call__ method
        mock_call = MagicMock(return_value=mock_embeddings(["test"]))
        mock_ef.__call__ = mock_call

        # For chroma-cloud-qwen, mock get_config to return valid data
        if ef_name == "chroma-cloud-qwen":
            from chromadb.utils.embedding_functions.chroma_cloud_qwen_embedding_function import (
                ChromaCloudQwenEmbeddingModel,
                CHROMA_CLOUD_QWEN_DEFAULT_INSTRUCTIONS,
            )

            mock_ef.get_config.return_value = {
                "api_key_env_var": "CHROMA_API_KEY",
                "model": ChromaCloudQwenEmbeddingModel.QWEN3_EMBEDDING_0p6B.value,
                "task": "nl_to_code",
                "instructions": CHROMA_CLOUD_QWEN_DEFAULT_INSTRUCTIONS,
            }

        # Mock the class constructor to return our mock instance
        mock_common_deps.setattr(
            ef_class, "__new__", lambda cls, *args, **kwargs: mock_ef
        )

        # Create instance with minimal args (constructor will be mocked)
        ef_instance = ef_class()

        # NOTE(review): because __new__ is patched to hand back mock_ef, this
        # calls get_config on the autospec mock, not the real implementation.
        config = ef_instance.get_config()

        # Test recreation from config
        new_instance = ef_class.build_from_config(config)
        new_config = new_instance.get_config()

        # Configs should match
        assert (
            config == new_config
        ), f"Configs don't match after recreation for {ef_name}"

    def test_schema_required_fields(self) -> None:
        """Test that schemas enforce required fields"""
        for schema_name in get_available_schemas():
            schema = load_schema(schema_name)
            if "required" not in schema:
                continue

            # Create minimal valid config
            config = self._build_minimal_config(schema)

            # Dropping any single required field must fail validation
            for field in schema["required"]:
                test_config = config.copy()
                del test_config[field]
                with pytest.raises(ValidationError):
                    validate_config_schema(test_config, schema_name)

    def _build_minimal_config(self, schema: Any) -> Any:
        """Return a dict with one dummy value per entry of ``schema["required"]``.

        The dummy is chosen from the field's ``"type"`` in
        ``schema["properties"]``; when ``"type"`` is a list its first entry is
        used. A schema without a ``"required"`` key yields an empty dict.
        """
        config = {}
        for field in schema.get("required", []):
            declared_type = schema["properties"][field]["type"]
            field_type = (
                declared_type[0] if isinstance(declared_type, list) else declared_type
            )
            config[field] = self._get_dummy_value(field_type)
        return config

    @staticmethod
    def _get_dummy_value(field_type: str) -> Any:
        """Get a dummy value for a given field type"""
        type_map = {
            "string": "dummy",
            "integer": 0,
            "number": 0.0,
            "boolean": False,
            "object": {},
            "array": [],
        }
        # Unknown type names fall back to the string dummy.
        return type_map.get(field_type, "dummy")

    def test_schema_additional_properties(self) -> None:
        """Test that schemas reject additional properties"""
        for schema_name in get_available_schemas():
            schema = load_schema(schema_name)

            # Required fields plus one property the schema does not declare
            test_config = self._build_minimal_config(schema)
            test_config["additional_property"] = "value"

            # Only schemas that set additionalProperties to False must reject it
            if schema.get("additionalProperties", True) is False:
                with pytest.raises(ValidationError):
                    validate_config_schema(test_config, schema_name)
|
||||
@@ -0,0 +1,184 @@
|
||||
import numpy as np
|
||||
from typing import List, Dict, Any, cast, Union
|
||||
from chromadb.utils.results import (
|
||||
_transform_embeddings,
|
||||
_add_query_fields,
|
||||
_add_get_fields,
|
||||
query_result_to_dfs,
|
||||
get_result_to_df,
|
||||
)
|
||||
from chromadb.api.types import (
|
||||
QueryResult,
|
||||
GetResult,
|
||||
)
|
||||
from numpy.typing import NDArray
|
||||
|
||||
|
||||
def test_transform_embeddings() -> None:
    """Check ``_transform_embeddings`` for ``None`` and for ndarray input."""
    expected_rows = [[1.0, 2.0], [3.0, 4.0]]

    # None is passed through unchanged.
    assert _transform_embeddings(None) is None

    # A list of numpy arrays is expected back as a list of plain lists.
    array_input = cast(
        List[NDArray[Union[np.int32, np.float32]]],
        [np.array(row) for row in expected_rows],
    )
    result = _transform_embeddings(array_input)
    assert isinstance(result, list)
    assert result == expected_rows

    # NOTE(review): this case was labelled "list of lists", but the input is
    # the same list of numpy arrays as in the case above.
    repeated_input = cast(
        List[NDArray[Union[np.int32, np.float32]]],
        [np.array(row) for row in expected_rows],
    )
    assert _transform_embeddings(repeated_input) == expected_rows
|
||||
|
||||
|
||||
def test_add_query_fields() -> None:
    """Check the keys ``_add_query_fields`` writes for query index 0."""
    query_result: QueryResult = {
        "ids": [["id1"], ["id2"]],
        "embeddings": [[np.array([1.0, 2.0])], [np.array([3.0, 4.0])]],
        "documents": [["doc1"], ["doc2"]],
        "metadatas": [[{"key": "value1"}], [{"key": "value2"}]],
        "distances": [[0.1], [0.2]],
        "uris": [["uri1", "uri2"]],
        # Using numpy arrays as Image type
        "data": [[np.array([1, 2, 3]), np.array([4, 5, 6])]],
        "included": ["embeddings", "documents", "metadatas", "distances"],
    }
    data_dict: Dict[str, Any] = {}

    _add_query_fields(data_dict, query_result, 0)

    assert np.array_equal(data_dict["embedding"], [np.array([1.0, 2.0])])
    expected_plain_fields = {
        "document": ["doc1"],
        "metadata": [{"key": "value1"}],
        "distance": [0.1],
    }
    for column, wanted in expected_plain_fields.items():
        assert data_dict[column] == wanted
|
||||
|
||||
|
||||
def test_add_get_fields() -> None:
    """Check the keys ``_add_get_fields`` writes for a two-record get result."""
    get_result: GetResult = {
        "ids": ["id1", "id2"],
        "embeddings": [np.array([1.0, 2.0]), np.array([3.0, 4.0])],
        "documents": ["doc1", "doc2"],
        "metadatas": [{"key": "value1"}, {"key": "value2"}],
        "uris": ["uri1", "uri2"],
        # Using numpy arrays as Image type
        "data": [np.array([1, 2, 3]), np.array([4, 5, 6])],
        "included": ["embeddings", "documents", "metadatas"],
    }
    data_dict: Dict[str, Any] = {}

    _add_get_fields(data_dict, get_result)

    expected_embeddings = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
    for actual, wanted in zip(data_dict["embedding"], expected_embeddings):
        assert np.array_equal(actual, wanted)
    assert data_dict["document"] == ["doc1", "doc2"]
    assert data_dict["metadata"] == [{"key": "value1"}, {"key": "value2"}]
|
||||
|
||||
|
||||
def test_query_result_to_dfs() -> None:
    """Check the single DataFrame produced for a one-query result."""
    query_result: QueryResult = {
        "ids": [["id1", "id2"]],
        "embeddings": [[np.array([1.0, 2.0]), np.array([3.0, 4.0])]],
        "documents": [["doc1", "doc2"]],
        "metadatas": [[{"key": "value1"}, {"key": "value2"}]],
        "distances": [[0.1, 0.2]],
        "uris": [["uri1", "uri2"]],
        # Using numpy arrays as Image type
        "data": [[np.array([1, 2, 3]), np.array([4, 5, 6])]],
        "included": ["embeddings", "documents", "metadatas", "distances"],
    }

    frames = query_result_to_dfs(query_result)
    assert len(frames) == 1  # Only one query

    # Inspect the first row of the only frame.
    frame = frames[0]
    first_row = {column: frame[column].iloc[0] for column in ("document", "metadata", "embedding", "distance")}
    assert frame.index[0] == "id1"
    assert first_row["document"] == "doc1"
    assert first_row["metadata"] == {"key": "value1"}
    assert np.array_equal(first_row["embedding"], np.array([1.0, 2.0]))
    assert first_row["distance"] == 0.1

    # Column order is part of the expectation.
    assert list(frame.columns) == ["embedding", "document", "metadata", "distance"]
|
||||
|
||||
|
||||
def test_get_result_to_df() -> None:
    """Check the DataFrame produced for a two-record get result."""
    get_result: GetResult = {
        "ids": ["id1", "id2"],
        "embeddings": [np.array([1.0, 2.0]), np.array([3.0, 4.0])],
        "documents": ["doc1", "doc2"],
        "metadatas": [{"key": "value1"}, {"key": "value2"}],
        "uris": ["uri1", "uri2"],
        # Using numpy arrays as Image type
        "data": [np.array([1, 2, 3]), np.array([4, 5, 6])],
        "included": ["embeddings", "documents", "metadatas"],
    }

    frame = get_result_to_df(get_result)

    assert len(frame) == 2
    assert list(frame.index) == ["id1", "id2"]
    assert frame["document"].tolist() == ["doc1", "doc2"]
    assert frame["metadata"].tolist() == [{"key": "value1"}, {"key": "value2"}]

    expected_embeddings = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
    for actual, wanted in zip(frame["embedding"].tolist(), expected_embeddings):
        assert np.array_equal(actual, wanted)

    # Column order is part of the expectation.
    assert list(frame.columns) == ["embedding", "document", "metadata"]
|
||||
|
||||
|
||||
def test_query_result_to_dfs_with_missing_fields() -> None:
    """Check that only the ``document`` column appears when only documents are included."""
    query_result: QueryResult = {
        "ids": [["id1"]],
        "documents": [["doc1"]],
        "embeddings": [[]],  # type:ignore
        "metadatas": [[]],
        "distances": [[]],
        "uris": [[]],
        "data": [[]],
        "included": ["documents"],
    }

    frames = query_result_to_dfs(query_result)
    assert len(frames) == 1

    frame = frames[0]
    assert frame.index[0] == "id1"
    assert frame["document"].iloc[0] == "doc1"
    for absent_column in ("metadata", "embedding", "distance"):
        assert absent_column not in frame.columns
|
||||
|
||||
|
||||
def test_get_result_to_df_with_missing_fields() -> None:
    """Check that only the ``document`` column appears when only documents are included."""
    get_result: GetResult = {
        "ids": ["id1", "id2"],
        "documents": ["doc1", "doc2"],
        "embeddings": [],
        "metadatas": [],
        "uris": [],
        "data": [],
        "included": ["documents"],
    }

    frame = get_result_to_df(get_result)

    assert len(frame) == 2
    assert list(frame.index) == ["id1", "id2"]
    assert frame["document"].tolist() == ["doc1", "doc2"]
    for absent_column in ("metadata", "embedding"):
        assert absent_column not in frame.columns
|
||||
@@ -0,0 +1,30 @@
|
||||
import time
|
||||
from chromadb.api import ClientAPI
|
||||
from chromadb.test.conftest import COMPACTION_SLEEP
|
||||
|
||||
# Value passed to time.sleep() between version polls in wait_for_version_increase.
TIMEOUT_INTERVAL = 1
|
||||
|
||||
|
||||
def get_collection_version(client: ClientAPI, collection_name: str) -> int:
    """Return the ``"version"`` entry of the named collection's model."""
    model = client.get_collection(collection_name).get_model()
    return model["version"]
|
||||
|
||||
|
||||
def wait_for_version_increase(
    client: ClientAPI,
    collection_name: str,
    initial_version: int,
    additional_time: int = 0,
) -> int:
    """Poll a collection until its version differs from ``initial_version``.

    The version is re-read every ``TIMEOUT_INTERVAL`` seconds.

    Args:
        client: Client used to look up the collection.
        collection_name: Name of the collection to poll.
        initial_version: Version to wait to move away from.
        additional_time: Extra seconds added on top of ``COMPACTION_SLEEP``
            before giving up.

    Returns:
        The first observed version that is not equal to ``initial_version``.

    Raises:
        TimeoutError: If the version is still unchanged once more than
            ``COMPACTION_SLEEP + additional_time`` seconds have elapsed.
    """
    # Use the monotonic clock so that wall-clock adjustments made while
    # waiting can neither shorten nor extend the timeout.
    deadline = time.monotonic() + additional_time + COMPACTION_SLEEP

    curr_version = get_collection_version(client, collection_name)
    while curr_version == initial_version:
        time.sleep(TIMEOUT_INTERVAL)
        if time.monotonic() > deadline:
            collection_id = client.get_collection(collection_name).id
            raise TimeoutError(f"Model was not updated in time for {collection_id}")
        curr_version = get_collection_version(client, collection_name)

    return curr_version
|
||||
Reference in New Issue
Block a user