chore: 添加虚拟环境到仓库

- 添加 backend_service/venv 虚拟环境 - 包含所有Python依赖包 - 注意：虚拟环境约393MB，包含12655个文件
2025-12-03 10:19:25 +08:00
parent a6c2027caa
commit c4f851d387
12655 changed files with 3009376 additions and 0 deletions
--- a/backend_service/venv/lib/python3.13/site-packages/chromadb/test/utils/cross_version.py
+++ b/backend_service/venv/lib/python3.13/site-packages/chromadb/test/utils/cross_version.py
@@ -0,0 +1,71 @@
+import sys
+import subprocess
+import os
+import tempfile
+from types import ModuleType
+from typing import Dict, List
+
+base_install_dir = (
+    tempfile.gettempdir()
+    + f"/worker-{os.environ.get('PYTEST_XDIST_WORKER', 'unknown')}"
+    + "/persistence_test_chromadb_versions"
+)
+
+
+def get_path_to_version_install(version: str) -> str:
+    return base_install_dir + "/" + version
+
+
+def switch_to_version(version: str, versioned_modules: List[str]) -> ModuleType:
+    module_name = "chromadb"
+    # Remove old version from sys.modules, except test modules
+    old_modules = {
+        n: m
+        for n, m in sys.modules.items()
+        if n == module_name
+        or (n.startswith(module_name + "."))
+        or n in versioned_modules
+        or (any(n.startswith(m + ".") for m in versioned_modules))
+    }
+    for n in old_modules:
+        del sys.modules[n]
+    # Load the target version and override the path to the installed version
+    # https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
+    sys.path.insert(0, get_path_to_version_install(version))
+    import chromadb
+
+    assert chromadb.__version__ == version
+    return chromadb
+
+
+def get_path_to_version_library(version: str) -> str:
+    return get_path_to_version_install(version) + "/chromadb/__init__.py"
+
+
+def install_version(version: str, dep_overrides: Dict[str, str]) -> None:
+    # Check if already installed
+    version_library = get_path_to_version_library(version)
+    if os.path.exists(version_library):
+        return
+    path = get_path_to_version_install(version)
+    install(f"chromadb=={version}", path, dep_overrides)
+
+
+def install(pkg: str, path: str, dep_overrides: Dict[str, str]) -> int:
+    os.makedirs(path, exist_ok=True)
+
+    # -q -q to suppress pip output to ERROR level
+    # https://pip.pypa.io/en/stable/cli/pip/#quiet
+    command = [sys.executable, "-m", "pip", "-q", "-q", "install", pkg]
+
+    for dep, operator_version in dep_overrides.items():
+        command.append(f"{dep}{operator_version}")
+
+    # Only add --no-binary=chroma-hnswlib if it's in the dependencies
+    if "chroma-hnswlib" in pkg or any("chroma-hnswlib" in dep for dep in dep_overrides):
+        command.append("--no-binary=chroma-hnswlib")
+
+    command.append(f"--target={path}")
+
+    print(f"Installing chromadb version {pkg} to {path}")
+    return subprocess.check_call(command)
--- a/backend_service/venv/lib/python3.13/site-packages/chromadb/test/utils/distance_functions.py
+++ b/backend_service/venv/lib/python3.13/site-packages/chromadb/test/utils/distance_functions.py
@@ -0,0 +1,7 @@
+from chromadb.utils.distance_functions import cosine
+import numpy as np
+
+
+def test_cosine_zero() -> None:
+    x = np.array([0.0, 0.0], dtype=np.float16)
+    assert cosine(x, x) == 1.0
--- a/backend_service/venv/lib/python3.13/site-packages/chromadb/test/utils/test_embedding_function_schemas.py
+++ b/backend_service/venv/lib/python3.13/site-packages/chromadb/test/utils/test_embedding_function_schemas.py
@@ -0,0 +1,145 @@
+import pytest
+from typing import List, Any, Callable
+from jsonschema import ValidationError
+from unittest.mock import MagicMock, create_autospec
+from chromadb.utils.embedding_functions.schemas import (
+    validate_config_schema,
+    load_schema,
+    get_available_schemas,
+)
+from chromadb.utils.embedding_functions import known_embedding_functions
+from chromadb.api.types import Documents, Embeddings
+from pytest import MonkeyPatch
+
+# Embedding function names that get_embedding_function_names() filters out,
+# so the parametrized roundtrip test never runs for them
+SKIP_EMBEDDING_FUNCTIONS = [
+    "chroma_langchain",
+]
+
+
+def get_embedding_function_names() -> List[str]:
+    """Get all embedding function names to test"""
+    return [
+        name
+        for name in known_embedding_functions.keys()
+        if name not in SKIP_EMBEDDING_FUNCTIONS
+    ]
+
+
+class TestEmbeddingFunctionSchemas:
+    """Test class for embedding function schemas"""
+
+    @pytest.mark.parametrize("ef_name", get_embedding_function_names())
+    def test_embedding_function_config_roundtrip(
+        self,
+        ef_name: str,
+        mock_embeddings: Callable[[Documents], Embeddings],
+        mock_common_deps: MonkeyPatch,
+    ) -> None:
+        """Test embedding function configuration roundtrip"""
+        ef_class = known_embedding_functions[ef_name]
+
+        # Create an autospec of the embedding function class
+        mock_ef = create_autospec(ef_class, instance=True)
+
+        # Mock the __call__ method
+        mock_call = MagicMock(return_value=mock_embeddings(["test"]))
+        mock_ef.__call__ = mock_call
+
+        # For chroma-cloud-qwen, mock get_config to return valid data
+        if ef_name == "chroma-cloud-qwen":
+            from chromadb.utils.embedding_functions.chroma_cloud_qwen_embedding_function import (
+                ChromaCloudQwenEmbeddingModel,
+                CHROMA_CLOUD_QWEN_DEFAULT_INSTRUCTIONS,
+            )
+
+            mock_ef.get_config.return_value = {
+                "api_key_env_var": "CHROMA_API_KEY",
+                "model": ChromaCloudQwenEmbeddingModel.QWEN3_EMBEDDING_0p6B.value,
+                "task": "nl_to_code",
+                "instructions": CHROMA_CLOUD_QWEN_DEFAULT_INSTRUCTIONS,
+            }
+
+        # Mock the class constructor to return our mock instance
+        mock_common_deps.setattr(
+            ef_class, "__new__", lambda cls, *args, **kwargs: mock_ef
+        )
+
+        # Create instance with minimal args (constructor will be mocked)
+        ef_instance = ef_class()
+
+        # Get the config (this will use the real method)
+        config = ef_instance.get_config()
+
+        # Test recreation from config
+        new_instance = ef_class.build_from_config(config)
+        new_config = new_instance.get_config()
+
+        # Configs should match
+        assert (
+            config == new_config
+        ), f"Configs don't match after recreation for {ef_name}"
+
+    def test_schema_required_fields(self) -> None:
+        """Test that schemas enforce required fields"""
+        for schema_name in get_available_schemas():
+            schema = load_schema(schema_name)
+            if "required" not in schema:
+                continue
+
+            # Create minimal valid config
+            config = {}
+            for field in schema["required"]:
+                field_schema = schema["properties"][field]
+                field_type = (
+                    field_schema["type"][0]
+                    if isinstance(field_schema["type"], list)
+                    else field_schema["type"]
+                )
+                config[field] = self._get_dummy_value(field_type)
+
+            # Test each required field
+            for field in schema["required"]:
+                test_config = config.copy()
+                del test_config[field]
+                with pytest.raises(ValidationError):
+                    validate_config_schema(test_config, schema_name)
+
+    @staticmethod
+    def _get_dummy_value(field_type: str) -> Any:
+        """Get a dummy value for a given field type"""
+        type_map = {
+            "string": "dummy",
+            "integer": 0,
+            "number": 0.0,
+            "boolean": False,
+            "object": {},
+            "array": [],
+        }
+        return type_map.get(field_type, "dummy")
+
+    def test_schema_additional_properties(self) -> None:
+        """Test that schemas reject additional properties"""
+        for schema_name in get_available_schemas():
+            schema = load_schema(schema_name)
+            config = {}
+
+            # Add required fields
+            if "required" in schema:
+                for field in schema["required"]:
+                    field_schema = schema["properties"][field]
+                    field_type = (
+                        field_schema["type"][0]
+                        if isinstance(field_schema["type"], list)
+                        else field_schema["type"]
+                    )
+                    config[field] = self._get_dummy_value(field_type)
+
+            # Add additional property
+            test_config = config.copy()
+            test_config["additional_property"] = "value"
+
+            # Test validation
+            if schema.get("additionalProperties", True) is False:
+                with pytest.raises(ValidationError):
+                    validate_config_schema(test_config, schema_name)
--- a/backend_service/venv/lib/python3.13/site-packages/chromadb/test/utils/test_result_df_transform.py
+++ b/backend_service/venv/lib/python3.13/site-packages/chromadb/test/utils/test_result_df_transform.py
@@ -0,0 +1,184 @@
+import numpy as np
+from typing import List, Dict, Any, cast, Union
+from chromadb.utils.results import (
+    _transform_embeddings,
+    _add_query_fields,
+    _add_get_fields,
+    query_result_to_dfs,
+    get_result_to_df,
+)
+from chromadb.api.types import (
+    QueryResult,
+    GetResult,
+)
+from numpy.typing import NDArray
+
+
+def test_transform_embeddings() -> None:
+    # Test with None input
+    assert _transform_embeddings(None) is None
+
+    # Test with numpy arrays
+    embeddings = cast(
+        List[NDArray[Union[np.int32, np.float32]]],
+        [np.array([1.0, 2.0]), np.array([3.0, 4.0])],
+    )
+    transformed = _transform_embeddings(embeddings)
+    assert isinstance(transformed, list)
+    assert transformed == [[1.0, 2.0], [3.0, 4.0]]
+
+    # Test with list of lists
+    embeddings = cast(
+        List[NDArray[Union[np.int32, np.float32]]],
+        [np.array([1.0, 2.0]), np.array([3.0, 4.0])],
+    )
+    transformed = _transform_embeddings(embeddings)
+    assert transformed == [[1.0, 2.0], [3.0, 4.0]]
+
+
+def test_add_query_fields() -> None:
+    data_dict: Dict[str, Any] = {}
+    query_result: QueryResult = {
+        "ids": [["id1"], ["id2"]],
+        "embeddings": [[np.array([1.0, 2.0])], [np.array([3.0, 4.0])]],
+        "documents": [["doc1"], ["doc2"]],
+        "metadatas": [[{"key": "value1"}], [{"key": "value2"}]],
+        "distances": [[0.1], [0.2]],
+        "uris": [["uri1", "uri2"]],
+        "data": [
+            [np.array([1, 2, 3]), np.array([4, 5, 6])]
+        ],  # Using numpy arrays as Image type
+        "included": ["embeddings", "documents", "metadatas", "distances"],
+    }
+
+    _add_query_fields(data_dict, query_result, 0)
+    assert np.array_equal(data_dict["embedding"], [np.array([1.0, 2.0])])
+    assert data_dict["document"] == ["doc1"]
+    assert data_dict["metadata"] == [{"key": "value1"}]
+    assert data_dict["distance"] == [0.1]
+
+
+def test_add_get_fields() -> None:
+    data_dict: Dict[str, Any] = {}
+    get_result: GetResult = {
+        "ids": ["id1", "id2"],
+        "embeddings": [np.array([1.0, 2.0]), np.array([3.0, 4.0])],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"key": "value1"}, {"key": "value2"}],
+        "uris": ["uri1", "uri2"],
+        "data": [
+            np.array([1, 2, 3]),
+            np.array([4, 5, 6]),
+        ],  # Using numpy arrays as Image type
+        "included": ["embeddings", "documents", "metadatas"],
+    }
+
+    _add_get_fields(data_dict, get_result)
+    assert all(
+        np.array_equal(a, b)
+        for a, b in zip(
+            data_dict["embedding"], [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
+        )
+    )
+    assert data_dict["document"] == ["doc1", "doc2"]
+    assert data_dict["metadata"] == [{"key": "value1"}, {"key": "value2"}]
+
+
+def test_query_result_to_dfs() -> None:
+    query_result: QueryResult = {
+        "ids": [["id1", "id2"]],
+        "embeddings": [[np.array([1.0, 2.0]), np.array([3.0, 4.0])]],
+        "documents": [["doc1", "doc2"]],
+        "metadatas": [[{"key": "value1"}, {"key": "value2"}]],
+        "distances": [[0.1, 0.2]],
+        "uris": [["uri1", "uri2"]],
+        "data": [
+            [np.array([1, 2, 3]), np.array([4, 5, 6])]
+        ],  # Using numpy arrays as Image type
+        "included": ["embeddings", "documents", "metadatas", "distances"],
+    }
+
+    dfs = query_result_to_dfs(query_result)
+    assert len(dfs) == 1  # Only one query
+
+    # Test DataFrame
+    df = dfs[0]
+    assert df.index[0] == "id1"
+    assert df["document"].iloc[0] == "doc1"
+    assert df["metadata"].iloc[0] == {"key": "value1"}
+    assert np.array_equal(df["embedding"].iloc[0], np.array([1.0, 2.0]))
+    assert df["distance"].iloc[0] == 0.1
+
+    # Test column order
+    assert list(df.columns) == ["embedding", "document", "metadata", "distance"]
+
+
+def test_get_result_to_df() -> None:
+    get_result: GetResult = {
+        "ids": ["id1", "id2"],
+        "embeddings": [np.array([1.0, 2.0]), np.array([3.0, 4.0])],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"key": "value1"}, {"key": "value2"}],
+        "uris": ["uri1", "uri2"],
+        "data": [
+            np.array([1, 2, 3]),
+            np.array([4, 5, 6]),
+        ],  # Using numpy arrays as Image type
+        "included": ["embeddings", "documents", "metadatas"],
+    }
+
+    df = get_result_to_df(get_result)
+    assert len(df) == 2
+    assert list(df.index) == ["id1", "id2"]
+    assert df["document"].tolist() == ["doc1", "doc2"]
+    assert df["metadata"].tolist() == [{"key": "value1"}, {"key": "value2"}]
+    assert all(
+        np.array_equal(a, b)
+        for a, b in zip(
+            df["embedding"].tolist(), [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
+        )
+    )
+
+    # Test column order
+    assert list(df.columns) == ["embedding", "document", "metadata"]
+
+
+def test_query_result_to_dfs_with_missing_fields() -> None:
+    query_result: QueryResult = {
+        "ids": [["id1"]],
+        "documents": [["doc1"]],
+        "embeddings": [[]],  # type:ignore
+        "metadatas": [[]],
+        "distances": [[]],
+        "uris": [[]],
+        "data": [[]],
+        "included": ["documents"],
+    }
+
+    dfs = query_result_to_dfs(query_result)
+    assert len(dfs) == 1
+    df = dfs[0]
+    assert df.index[0] == "id1"
+    assert df["document"].iloc[0] == "doc1"
+    assert "metadata" not in df.columns
+    assert "embedding" not in df.columns
+    assert "distance" not in df.columns
+
+
+def test_get_result_to_df_with_missing_fields() -> None:
+    get_result: GetResult = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "embeddings": [],
+        "metadatas": [],
+        "uris": [],
+        "data": [],
+        "included": ["documents"],
+    }
+
+    df = get_result_to_df(get_result)
+    assert len(df) == 2
+    assert list(df.index) == ["id1", "id2"]
+    assert df["document"].tolist() == ["doc1", "doc2"]
+    assert "metadata" not in df.columns
+    assert "embedding" not in df.columns
--- a/backend_service/venv/lib/python3.13/site-packages/chromadb/test/utils/wait_for_version_increase.py
+++ b/backend_service/venv/lib/python3.13/site-packages/chromadb/test/utils/wait_for_version_increase.py
@@ -0,0 +1,30 @@
+import time
+from chromadb.api import ClientAPI
+from chromadb.test.conftest import COMPACTION_SLEEP
+
+# Seconds passed to time.sleep() between successive version polls in
+# wait_for_version_increase
+TIMEOUT_INTERVAL = 1
+
+
+def get_collection_version(client: ClientAPI, collection_name: str) -> int:
+    coll = client.get_collection(collection_name)
+    return coll.get_model()["version"]
+
+
+def wait_for_version_increase(
+    client: ClientAPI,
+    collection_name: str,
+    initial_version: int,
+    additional_time: int = 0,
+) -> int:
+    timeout = COMPACTION_SLEEP
+    initial_time = time.time() + additional_time
+
+    curr_version = get_collection_version(client, collection_name)
+    while curr_version == initial_version:
+        time.sleep(TIMEOUT_INTERVAL)
+        if time.time() - initial_time > timeout:
+            collection_id = client.get_collection(collection_name).id
+            raise TimeoutError(f"Model was not updated in time for {collection_id}")
+        curr_version = get_collection_version(client, collection_name)
+
+    return curr_version