Initial travel knowledge graph release
This commit is contained in:
148
app/agents/field_mapping.py
Normal file
148
app/agents/field_mapping.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""Phase 1 — Field Mapping Agent.
|
||||
|
||||
Suggests which CSV columns map to which entity/relation fields.
|
||||
Uses LLM to analyze column names + sample values against ontology schema targets.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from app.config import settings
|
||||
from app.llm_client import LlmClient
|
||||
|
||||
SYSTEM_PROMPT = """你是一个知识图谱数据映射专家。给一个 CSV 的列名和样本值、以及目标 Schema 的实体类型和字段定义,
|
||||
输出每个列的推荐映射关系。输出 JSON。
|
||||
|
||||
规则:
|
||||
1. 精确匹配列名和字段名(如 "name" → "name")
|
||||
2. 语义匹配(如 "地址" → "address", "电话" → "phone")
|
||||
3. 值类型推断(如样本全是数字的可能是 "price" 或 "rating")
|
||||
4. 无法映射的列标记为 "unmapped"
|
||||
|
||||
只输出 JSON。"""
|
||||
|
||||
USER_TEMPLATE = """CSV 列名 + 样本值(每列最多 3 个样本):
|
||||
{columns_json}
|
||||
|
||||
目标 Schema — 实体类型和字段:
|
||||
{schema_json}
|
||||
|
||||
输出 schema:
|
||||
{{
|
||||
"mappings": [
|
||||
{{
|
||||
"source_column": "列名",
|
||||
"target_entity_type": "目标实体类型",
|
||||
"target_field": "目标字段名",
|
||||
"confidence": 0.0~1.0
|
||||
}}
|
||||
],
|
||||
"unmapped_columns": ["列名"],
|
||||
"suggested_new_fields": [
|
||||
{{
|
||||
"column_name": "列名",
|
||||
"suggested_entity_type": "建议关联的实体类型",
|
||||
"sample_values": ["值1", "值2"],
|
||||
"reason": "一句话理由"
|
||||
}}
|
||||
]
|
||||
}}"""
|
||||
|
||||
|
||||
@dataclass
class MappingSuggestion:
    """A single proposed link from one CSV column to one schema field."""

    # Header of the CSV column this suggestion applies to.
    source_column: str
    # Entity type that owns the target field.
    target_entity_type: str
    # Name of the field on that entity type.
    target_field: str
    # Score attached to the suggestion; 0.5 when the producer supplies none.
    # The value is stored as given and is not range-checked here.
    confidence: float = 0.5
|
||||
|
||||
|
||||
@dataclass
class FieldMappingResult:
    """Outcome of a field-mapping pass over one CSV header row.

    Every attribute defaults to its own fresh empty list, so instances never
    share list state.
    """

    # Accepted column -> field suggestions.
    mappings: list[MappingSuggestion] = field(default_factory=list)
    # Column names for which no target field was found.
    unmapped_columns: list[str] = field(default_factory=list)
    # Raw "suggested_new_fields" records as returned by the mapper.
    suggested_new_fields: list[dict] = field(default_factory=list)
    # Schema extension proposals
    proposals: list[dict] = field(default_factory=list)
|
||||
|
||||
|
||||
def _rule_based_mapping(
    columns: list[str],
    samples: dict[str, list],
    schema_targets: dict,
) -> FieldMappingResult:
    """Fallback rule-based mapping when LLM is unavailable.

    Each header is normalized (leading BOM removed, surrounding whitespace
    stripped, lower-cased) and looked up in a fixed alias table. Headers with
    an alias become a mapping with confidence 0.9; all others are reported as
    unmapped.

    Args:
        columns: CSV column names.
        samples: Not used by the rule-based path; accepted so the signature
            parallels ``suggest_field_mapping``.
        schema_targets: Not used by the rule-based path; the alias table is
            hard-coded and is not checked against it.

    Returns:
        A FieldMappingResult with ``mappings`` and ``unmapped_columns``
        filled in; the remaining attributes keep their empty defaults.
    """
    # Normalized header -> (entity type, field name).
    alias_targets: dict[str, tuple[str, str]] = {
        "name": ("POI", "name"),
        "名称": ("POI", "name"),
        "address": ("POI", "address"),
        "地址": ("POI", "address"),
        "phone": ("POI", "contact"),
        "电话": ("POI", "contact"),
        "open": ("POI", "opening_hours"),
        "开放时间": ("POI", "opening_hours"),
        "price": ("POI", "price_level"),
        "价格": ("POI", "price_level"),
        "lat": ("POI", "latitude"),
        "lng": ("POI", "longitude"),
        "lon": ("POI", "longitude"),
        "type": ("POI", "category"),
        "类型": ("POI", "category"),
    }

    mappings = []
    unmapped = []
    for col in columns:
        # A UTF-8 byte-order mark sticks to the first header when the file was
        # decoded without "utf-8-sig"; str.strip() does not treat U+FEFF as
        # whitespace, so it has to be removed explicitly.
        key = col.lstrip("\ufeff").strip().lower()
        target = alias_targets.get(key)
        if target is None:
            unmapped.append(col)
            continue
        # Named so as not to shadow the module-level `dataclasses.field`.
        entity_type, target_field = target
        mappings.append(MappingSuggestion(col, entity_type, target_field, 0.9))

    return FieldMappingResult(mappings=mappings, unmapped_columns=unmapped)
|
||||
|
||||
|
||||
def _parse_llm_mappings(raw_mappings: object) -> list[MappingSuggestion]:
    """Build MappingSuggestion objects from the LLM's "mappings" array, skipping malformed entries."""
    if not isinstance(raw_mappings, list):
        return []

    parsed: list[MappingSuggestion] = []
    for entry in raw_mappings:
        if not isinstance(entry, dict):
            continue
        try:
            confidence = float(entry.get("confidence", 0.5))
        except (TypeError, ValueError):
            # Missing, null or non-numeric score: use the dataclass default.
            confidence = 0.5
        try:
            parsed.append(
                MappingSuggestion(
                    source_column=entry["source_column"],
                    target_entity_type=entry["target_entity_type"],
                    target_field=entry["target_field"],
                    confidence=confidence,
                )
            )
        except KeyError:
            # An entry without all three required keys cannot be applied.
            continue
    return parsed


async def suggest_field_mapping(
    columns: list[str],
    samples: dict[str, list],
    schema_targets: dict,
    llm: LlmClient | None = None,
) -> FieldMappingResult:
    """Suggest field mappings for a CSV import.

    Tries the LLM first and falls back to ``_rule_based_mapping`` when no
    client is configured, the client reports itself unavailable, or the LLM
    call / response parsing fails. Failures are logged, never raised.

    Args:
        columns: CSV column names
        samples: {column_name: [sample_values]}; at most the first three
            values per column are sent to the LLM.
        schema_targets: Ontology schema targets (entity types + fields)
        llm: Optional LLM client. When omitted, one is created from settings
            if ``settings.llm_api_key`` is set.

    Returns:
        A FieldMappingResult. On the LLM path ``proposals`` is a copy of
        ``suggested_new_fields``.
    """
    import inspect
    import logging

    logger = logging.getLogger(__name__)

    if llm is None:
        llm = LlmClient.from_settings() if settings.llm_api_key else None

    if llm and llm.available():
        try:
            columns_json = json.dumps(
                {col: (samples.get(col) or [])[:3] for col in columns},
                ensure_ascii=False,
                indent=2,
                # Samples are only shown to the LLM, so stringifying values
                # json cannot encode is acceptable and avoids losing the
                # whole LLM path over one odd cell.
                default=str,
            )
            schema_json = json.dumps(schema_targets, ensure_ascii=False, indent=2)
            result = llm.chat_json(
                system=SYSTEM_PROMPT,
                user=USER_TEMPLATE.format(columns_json=columns_json, schema_json=schema_json),
            )
            # NOTE(review): whether LlmClient.chat_json is synchronous or a
            # coroutine is not visible from this module. Awaiting an awaitable
            # result covers both; if it is synchronous it blocks the event
            # loop for the duration of the request — TODO confirm.
            if inspect.isawaitable(result):
                result = await result
            if not isinstance(result, dict):
                raise TypeError(f"expected a JSON object from the LLM, got {type(result).__name__}")

            # `or []` also covers an explicit JSON null, which .get(key, [])
            # would pass through as None.
            raw_mappings = result.get("mappings") or []
            mappings = _parse_llm_mappings(raw_mappings)
            if raw_mappings and not mappings:
                raise ValueError("LLM returned no usable mapping entries")
            if isinstance(raw_mappings, list) and len(mappings) < len(raw_mappings):
                logger.warning(
                    "Dropped %d malformed LLM mapping entries",
                    len(raw_mappings) - len(mappings),
                )

            new_fields = result.get("suggested_new_fields") or []
            return FieldMappingResult(
                mappings=mappings,
                unmapped_columns=result.get("unmapped_columns") or [],
                suggested_new_fields=new_fields,
                # Copied so that mutating one list does not change the other.
                proposals=list(new_fields),  # Schema extension proposals
            )
        except Exception:
            # Best-effort by design: any LLM failure degrades to the rule-based
            # mapping, but the cause is logged instead of being discarded.
            logger.warning(
                "LLM field mapping failed; falling back to rule-based mapping",
                exc_info=True,
            )

    return _rule_based_mapping(columns, samples, schema_targets)
|
||||
Reference in New Issue
Block a user