Files
DataMate/runtime/datamate-python/app/module/kg_graphrag/kg_client.py
Jerry Yan e9e4cf3b1c fix(kg): 修复知识图谱部署流程问题
修复从全新部署到运行的完整流程中的配置和路由问题。

## P0 修复(功能失效)

### P0-1: GraphRAG KG 服务 URL 错误
- config.py - GRAPHRAG_KG_SERVICE_URL 从 http://datamate-kg:8080 改为 http://datamate-backend:8080(容器名修正)
- kg_client.py - 修复 API 路径:/knowledge-graph/... → /api/knowledge-graph/...
- kb_access.py - 同类问题修复:/knowledge-base/... → /api/knowledge-base/...
- test_kb_access.py - 测试断言同步更新

根因:容器名 datamate-kg 不存在,且 httpx 绝对路径会丢弃 base_url 中的 /api 路径

### P0-2: Vite 开发代理剥离 /api 前缀
- vite.config.ts - 删除 /api/knowledge-graph 专用代理规则(剥离 /api 导致 404),统一走 ^/api 规则

## P1 修复(功能受损)

### P1-1: Gateway 缺少 KG Python 端点路由
- ApiGatewayApplication.java - 添加 /api/kg/** 路由(指向 kg-extraction Python 服务)
- ApiGatewayApplication.java - 添加 /api/graphrag/** 路由(指向 GraphRAG 服务)

### P1-2: DATA_MANAGEMENT_URL 默认值缺 /api
- KnowledgeGraphProperties.java - dataManagementUrl 默认值 http://localhost:8080 → http://localhost:8080/api
- KnowledgeGraphProperties.java - annotationServiceUrl 默认值 http://localhost:8081 → http://localhost:8080/api(同 JVM)
- application-knowledgegraph.yml - YAML 默认值同步更新

### P1-3: Neo4j k8s 安装链路失败
- Makefile - VALID_K8S_TARGETS 添加 neo4j
- Makefile - %-k8s-install 添加 neo4j case(显式 skip,提示使用 Docker 或外部实例)
- Makefile - %-k8s-uninstall 添加 neo4j case(显式 skip)

根因:install 目标无条件调用 neo4j-$(INSTALLER)-install,但 k8s 模式下 neo4j 不在 VALID_K8S_TARGETS 中,导致 "Unknown k8s target 'neo4j'" 错误

## P2 修复(次要)

### P2-1: Neo4j 加入 Docker install 流程
- Makefile - install target 增加 neo4j-$(INSTALLER)-install,在 datamate 之前启动
- Makefile - VALID_SERVICE_TARGETS 增加 neo4j
- Makefile - %-docker-install / %-docker-uninstall 增加 neo4j case

## 验证结果
- mvn test: 311 tests, 0 failures 
- eslint: 0 errors 
- tsc --noEmit: 通过 
- vite build: 成功 (17.71s) 
- Python tests: 46 passed 
- make -n install INSTALLER=k8s: 不再报 unknown target 
- make -n neo4j-k8s-install: 正确显示 skip 消息 
2026-02-23 01:15:31 +08:00

215 lines
7.4 KiB
Python

"""KG 服务 REST 客户端。
通过 httpx 调用 Java 侧 knowledge-graph-service 的查询 API,
包括全文检索和子图导出。
失败策略:fail-open —— KG 服务不可用时返回空结果 + 日志告警。
"""
from __future__ import annotations
import httpx
from app.core.logging import get_logger
from app.module.kg_graphrag.cache import get_cache, make_cache_key
from app.module.kg_graphrag.models import EntitySummary, RelationSummary
logger = get_logger(__name__)
class KGServiceClient:
    """REST client for the Java-side knowledge-graph-service query API.

    Provides fulltext entity search and N-hop subgraph export over httpx.

    Failure policy: fail-open — when the KG service is unavailable, the
    public query methods log the error and return empty results instead of
    raising, so GraphRAG degrades gracefully.
    """

    def __init__(
        self,
        *,
        base_url: str = "http://datamate-backend:8080",
        internal_token: str = "",
        timeout: float = 30.0,
    ) -> None:
        """Create a client.

        Args:
            base_url: Base URL of the Java backend. A trailing slash is
                stripped so relative request paths join predictably.
            internal_token: Optional service-to-service token, sent as the
                ``X-Internal-Token`` header when non-empty.
            timeout: Per-request timeout in seconds.
        """
        self._base_url = base_url.rstrip("/")
        self._internal_token = internal_token
        self._timeout = timeout
        # Created lazily so the AsyncClient binds to the running event loop.
        self._client: httpx.AsyncClient | None = None

    @classmethod
    def from_settings(cls) -> KGServiceClient:
        """Build a client from application settings.

        The settings import is deferred to avoid a module-level import cycle.
        """
        from app.core.config import settings

        return cls(
            base_url=settings.graphrag_kg_service_url,
            internal_token=settings.graphrag_kg_internal_token,
            timeout=30.0,
        )

    def _get_client(self) -> httpx.AsyncClient:
        """Return the shared AsyncClient, creating it on first use."""
        if self._client is None:
            self._client = httpx.AsyncClient(
                base_url=self._base_url,
                timeout=self._timeout,
            )
        return self._client

    def _headers(self, user_id: str = "") -> dict[str, str]:
        """Build request headers; empty token / user id are omitted entirely."""
        headers: dict[str, str] = {}
        if self._internal_token:
            headers["X-Internal-Token"] = self._internal_token
        if user_id:
            headers["X-User-Id"] = user_id
        return headers

    @staticmethod
    def _unwrap_data(body: object) -> object:
        """Unwrap the optional global response envelope.

        The Java side may return either the raw payload or a wrapper of the
        form ``{"code": 200, "data": <payload>}``; return the payload in
        both cases.
        """
        if isinstance(body, dict) and "data" in body:
            return body["data"]
        return body

    async def fulltext_search(
        self,
        graph_id: str,
        query: str,
        size: int = 10,
        user_id: str = "",
    ) -> list[EntitySummary]:
        """Fulltext entity search against the KG service.

        Fail-open: returns an empty list when the KG service is unavailable.
        Successful results are cached (TTL controlled by
        ``graphrag_cache_kg_ttl``); failures are never cached.

        Args:
            graph_id: Target graph identifier.
            query: Fulltext query string.
            size: Maximum number of hits to return.
            user_id: Optional caller identity, forwarded as ``X-User-Id``.
        """
        cache = get_cache()
        cache_key = make_cache_key("fulltext", graph_id, query, size, user_id)
        cached = cache.get_kg(cache_key)
        if cached is not None:
            return cached
        try:
            result = await self._fulltext_search_impl(graph_id, query, size, user_id)
        except Exception:
            logger.exception(
                "KG fulltext search failed for graph_id=%s (fail-open, returning empty)",
                graph_id,
            )
            return []
        cache.set_kg(cache_key, result)
        return result

    async def _fulltext_search_impl(
        self,
        graph_id: str,
        query: str,
        size: int,
        user_id: str,
    ) -> list[EntitySummary]:
        """Perform the actual search request and map hits to EntitySummary."""
        client = self._get_client()
        resp = await client.get(
            f"/api/knowledge-graph/{graph_id}/query/search",
            params={"q": query, "size": size},
            headers=self._headers(user_id),
        )
        resp.raise_for_status()
        # Java returns PagedResponse<SearchHitVO>, possibly wrapped in the
        # global envelope; PagedResponse keeps the hit list under "content",
        # but tolerate a bare list as well.
        data = self._unwrap_data(resp.json())
        if isinstance(data, dict):
            items: list[dict] = data.get("content", [])
        elif isinstance(data, list):
            items = data
        else:
            items = []
        return [
            EntitySummary(
                id=str(item.get("id", "")),
                name=item.get("name", ""),
                type=item.get("type", ""),
                description=item.get("description", ""),
            )
            for item in items
        ]

    async def get_subgraph(
        self,
        graph_id: str,
        entity_ids: list[str],
        depth: int = 1,
        user_id: str = "",
    ) -> tuple[list[EntitySummary], list[RelationSummary]]:
        """Fetch the N-hop subgraph around the given seed entities.

        Fail-open: returns an empty subgraph when the KG service is
        unavailable. Successful results are cached (TTL controlled by
        ``graphrag_cache_kg_ttl``); failures are never cached.

        Args:
            graph_id: Target graph identifier.
            entity_ids: Seed entity ids; an empty list short-circuits to an
                empty subgraph without touching cache or network.
            depth: Expansion depth (number of hops).
            user_id: Optional caller identity, forwarded as ``X-User-Id``.
        """
        if not entity_ids:
            # No seeds means no subgraph — skip the round trip entirely.
            return [], []
        cache = get_cache()
        cache_key = make_cache_key("subgraph", graph_id, sorted(entity_ids), depth, user_id)
        cached = cache.get_kg(cache_key)
        if cached is not None:
            return cached
        try:
            result = await self._get_subgraph_impl(graph_id, entity_ids, depth, user_id)
        except Exception:
            logger.exception(
                "KG subgraph export failed for graph_id=%s (fail-open, returning empty)",
                graph_id,
            )
            return [], []
        cache.set_kg(cache_key, result)
        return result

    async def _get_subgraph_impl(
        self,
        graph_id: str,
        entity_ids: list[str],
        depth: int,
        user_id: str,
    ) -> tuple[list[EntitySummary], list[RelationSummary]]:
        """Perform the export request and map nodes/edges to summaries."""
        client = self._get_client()
        resp = await client.post(
            f"/api/knowledge-graph/{graph_id}/query/subgraph/export",
            params={"depth": depth},
            json={"entityIds": entity_ids},
            headers=self._headers(user_id),
        )
        resp.raise_for_status()
        # Java returns SubgraphExportVO {"nodes": [...], "edges": [...]},
        # possibly wrapped in the global envelope.
        data = self._unwrap_data(resp.json())
        if not isinstance(data, dict):
            return [], []
        nodes_raw = data.get("nodes", [])
        edges_raw = data.get("edges", [])
        # ExportNodeVO: id, name, type, description, properties (Map)
        entities = [
            EntitySummary(
                id=str(node.get("id", "")),
                name=node.get("name", ""),
                type=node.get("type", ""),
                description=node.get("description", ""),
            )
            for node in nodes_raw
        ]
        # id -> entity map for resolving endpoint names/types of each edge.
        entity_map = {e.id: e for e in entities}
        # ExportEdgeVO: sourceEntityId, targetEntityId, relationType.
        # NOTE: sourceId (if present) is the data-provenance id, NOT the
        # source entity id — only sourceEntityId/targetEntityId are used.
        relations: list[RelationSummary] = []
        for edge in edges_raw:
            source_id = str(edge.get("sourceEntityId", ""))
            target_id = str(edge.get("targetEntityId", ""))
            source_entity = entity_map.get(source_id)
            target_entity = entity_map.get(target_id)
            relations.append(
                RelationSummary(
                    source_name=source_entity.name if source_entity else source_id,
                    source_type=source_entity.type if source_entity else "",
                    target_name=target_entity.name if target_entity else target_id,
                    target_type=target_entity.type if target_entity else "",
                    relation_type=edge.get("relationType", ""),
                )
            )
        return entities, relations

    async def close(self) -> None:
        """Close the underlying AsyncClient, if one was ever created."""
        if self._client is not None:
            await self._client.aclose()
            self._client = None