# llm_ocr_claude.py
"""Claude-backed field extraction ("LLM OCR") for the KYC policy engine.

The public entry point is :func:`llm_ocr`. It sends the content blocks built
from a bundle to the Anthropic Messages API with a JSON-schema output
constraint, then normalizes the per-field result
(``candidate_idx``, ``text``, ``confidence``, ``normalized``).
"""
import json
import math
import os
import time
from typing import Any, Dict, List, Optional

import kyc_policy_engine_llm_v2 as kyc

try:
    from anthropic import Anthropic
    from anthropic import APIConnectionError, APITimeoutError, RateLimitError, APIStatusError
except Exception:  # pragma: no cover
    Anthropic = None  # type: ignore
    APIConnectionError = APITimeoutError = RateLimitError = APIStatusError = Exception  # type: ignore

# Runtime configuration, overridable through environment variables.
_DEFAULT_MODEL = os.getenv("KYC_OCR_MODEL", "claude-sonnet-4-5")
_MAX_OUTPUT_TOKENS = int(os.getenv("KYC_OCR_MAX_OUTPUT_TOKENS", "1400"))
_MAX_RETRIES = int(os.getenv("KYC_OCR_MAX_RETRIES", "2"))
_RETRY_BASE_SLEEP = float(os.getenv("KYC_OCR_RETRY_BASE_SLEEP", "0.5"))

# HTTP status codes from APIStatusError that are worth retrying.
_RETRYABLE_STATUS_CODES = (500, 529)

# Lazily created, process-wide client (see _get_client).
_ANTH_CLIENT: Optional["Anthropic"] = None


def _get_client() -> "Anthropic":
    """Return the cached Anthropic client, creating it on first use.

    Raises:
        RuntimeError: if the ``anthropic`` package could not be imported or
            ``ANTHROPIC_API_KEY`` is not set.
    """
    global _ANTH_CLIENT
    if _ANTH_CLIENT is not None:
        return _ANTH_CLIENT

    if Anthropic is None:
        raise RuntimeError("anthropic package not available (pip install anthropic)")

    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        raise RuntimeError("ANTHROPIC_API_KEY is not set")

    _ANTH_CLIENT = Anthropic(api_key=api_key)
    return _ANTH_CLIENT


def _extract_json_strict(s: str) -> Dict[str, Any]:
    """Parse model output into a JSON object.

    The whole string is tried first. If that is not valid JSON, the span from
    the first ``{`` to the last ``}`` is parsed instead, which tolerates
    leading/trailing chatter around the object.

    Raises:
        ValueError: if the output is empty, contains no parseable object, or
            parses to something other than a JSON object (e.g. a list).
            ``json.JSONDecodeError`` from the fallback parse is a
            ``ValueError`` subclass and propagates unchanged.
    """
    s = (s or "").strip()
    if not s:
        raise ValueError("Empty model output")

    try:
        parsed = json.loads(s)
    except ValueError:
        # Not pure JSON; fall through to the brace-delimited fallback.
        pass
    else:
        if not isinstance(parsed, dict):
            # A top-level array/scalar would otherwise fail later with an
            # opaque AttributeError when the caller does `.get("fields")`.
            raise ValueError("Model output is not a JSON object")
        return parsed

    start = s.find("{")
    end = s.rfind("}")
    if start >= 0 and end > start:
        # A successful parse of text starting with "{" is always an object.
        return json.loads(s[start : end + 1])

    raise ValueError("Model output is not valid JSON")


def _field_keys(bundle: Dict[str, Any]) -> List[str]:
    """Return the stripped, non-empty string ``key`` of each planned field.

    Reads ``bundle["fields"]`` (treated as empty when missing or falsy).
    Entries that are not dicts, or whose ``key`` is not a non-blank string,
    are skipped.
    """
    keys: List[str] = []
    for field in bundle.get("fields") or []:
        if not isinstance(field, dict):
            continue
        key = field.get("key")
        if isinstance(key, str) and key.strip():
            keys.append(key.strip())
    return keys


def _build_output_schema(bundle: Dict[str, Any]) -> Dict[str, Any]:
    """
    Enforce:
      {
        "fields": {
          "<field_key>": {candidate_idx:int, text:string, confidence:number, normalized:string }
        }
      }

    We do NOT require all keys here; we fill missing keys in post-processing.
    """
    field_value_schema: Dict[str, Any] = {
        "type": "object",
        "properties": {
            "candidate_idx": {"type": "integer"},
            "text": {"type": "string"},
            "confidence": {"type": "number"},
            "normalized": {"type": "string"},
        },
        "required": ["candidate_idx", "text", "confidence", "normalized"],
        "additionalProperties": False,
    }

    # Allow only known keys (avoid model inventing random keys)
    fields_props = {k: field_value_schema for k in _field_keys(bundle)}

    return {
        "type": "object",
        "properties": {
            "fields": {
                "type": "object",
                "properties": fields_props,
                "additionalProperties": False,
            }
        },
        "required": ["fields"],
        "additionalProperties": False,
    }


def _collect_text(resp: Any) -> str:
    """Concatenate the text of every ``text`` block in ``resp.content``.

    Blocks may be plain dicts or objects exposing ``type``/``text``
    attributes; non-text blocks are ignored.
    """
    parts: List[str] = []
    for block in getattr(resp, "content", None) or []:
        if isinstance(block, dict):
            if block.get("type") == "text":
                parts.append(block.get("text") or "")
        elif getattr(block, "type", None) == "text":
            parts.append(getattr(block, "text", "") or "")
    return "".join(parts)


def _normalize_field_value(v: Dict[str, Any]) -> None:
    """Coerce one field result in place to the expected value types.

    - ``candidate_idx``: int, ``0`` when missing or not convertible.
    - ``text``: str, ``""`` when missing or ``None``.
    - ``confidence``: float clamped to ``[0.0, 1.0]``; ``0.0`` when missing,
      not convertible, or NaN.
    - ``normalized``: str, or ``None`` when missing/blank.
    """
    try:
        v["candidate_idx"] = int(v.get("candidate_idx", 0))
    except (TypeError, ValueError, OverflowError):
        v["candidate_idx"] = 0

    text = v.get("text", "")
    v["text"] = "" if text is None else str(text)

    try:
        confidence = float(v.get("confidence", 0.0))
    except (TypeError, ValueError, OverflowError):
        confidence = 0.0
    # NaN must be handled explicitly: max(0.0, min(1.0, nan)) evaluates to
    # 1.0, which would report garbage as full confidence.
    v["confidence"] = 0.0 if math.isnan(confidence) else max(0.0, min(1.0, confidence))

    # normalized (empty => None)
    normalized = v.get("normalized", "")
    normalized = "" if normalized is None else str(normalized)
    v["normalized"] = normalized if normalized.strip() else None


def llm_ocr(bundle: Dict[str, Any]) -> Dict[str, Any]:
    """Run the Claude extraction call for ``bundle`` and return normalized fields.

    Args:
        bundle: Request bundle. ``bundle["fields"]`` supplies the expected
            field keys; the message content is produced by
            ``kyc.KYCEngine.anthropic_content_items_from_bundle(bundle)``.

    Returns:
        ``{"fields": {...}}``. Every expected key is present and normalized
        (see ``_normalize_field_value``). Any other keys the model returned
        are passed through untouched.

    Raises:
        RuntimeError: if the client cannot be created, if a retryable API
            error persists after ``_MAX_RETRIES`` retries, or on any other
            API/parsing failure. The original exception is chained as
            ``__cause__``.
    """
    client = _get_client()

    # text + base64 images (Claude-compatible blocks)
    content_blocks = kyc.KYCEngine.anthropic_content_items_from_bundle(bundle)
    output_schema = _build_output_schema(bundle)

    # expected keys (so we can fill defaults even if the model omits)
    keys = _field_keys(bundle)

    # A negative configured value would make the loop body never run and the
    # function fall through returning None; always make at least one attempt.
    max_retries = max(0, _MAX_RETRIES)

    for attempt in range(max_retries + 1):
        try:
            resp = client.messages.create(
                model=_DEFAULT_MODEL,
                max_tokens=_MAX_OUTPUT_TOKENS,
                messages=[{"role": "user", "content": content_blocks}],
                output_config={
                    "format": {
                        "type": "json_schema",
                        "schema": output_schema,
                    }
                },
                temperature=0,
            )

            # The JSON output arrives as text block(s) in response.content.
            data = _extract_json_strict(_collect_text(resp))

            fields = data.get("fields")
            if not isinstance(fields, dict):
                fields = {}

            # Fill missing keys + normalize (match your OpenAI wrapper expectations)
            for k in keys:
                v = fields.get(k)
                if not isinstance(v, dict):
                    v = {"candidate_idx": 0, "text": "", "confidence": 0.0, "normalized": ""}
                    fields[k] = v
                _normalize_field_value(v)

            return {"fields": fields}

        except (RateLimitError, APITimeoutError, APIConnectionError) as e:
            if attempt < max_retries:
                # Exponential backoff: base, 2*base, 4*base, ...
                time.sleep(_RETRY_BASE_SLEEP * (2 ** attempt))
                continue
            raise RuntimeError(f"llm_ocr retryable failure: {e}") from e

        except APIStatusError as e:
            code = getattr(e, "status_code", None)
            if code in _RETRYABLE_STATUS_CODES and attempt < max_retries:
                time.sleep(_RETRY_BASE_SLEEP * (2 ** attempt))
                continue
            raise RuntimeError(f"llm_ocr APIStatusError: {e}") from e

        except Exception as e:
            raise RuntimeError(f"llm_ocr non-retryable failure: {e}") from e

    # Defensive: every iteration returns, raises, or retries, and the last
    # iteration never retries, so this line is not expected to execute.
    raise RuntimeError("llm_ocr exhausted retries")  # pragma: no cover