#!/usr/bin/env bash
#
# Verifies the PaddleOCR layout integration: checks the external PaddleOCR
# service, sends it a sample file directly, then (optionally) runs the same
# sample through the platform backend.
#
# Every setting below keeps a non-empty value already present in the
# environment; otherwise the default shown here is assigned.
set -euo pipefail

# Parent directory of the directory that contains this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# File of KEY=VALUE pairs exported by load_env.
: "${ENV_FILE:=$ROOT_DIR/.env.paddleocr.verify}"
# Base URL of the platform backend API.
: "${BACKEND_BASE_URL:=http://127.0.0.1:18090}"
# Health endpoint of the external PaddleOCR service.
: "${PADDLE_HEALTH_URL:=http://192.168.1.13:18080/health}"
# File submitted to PaddleOCR and to the backend.
: "${SAMPLE_FILE:=$ROOT_DIR/samples/images/56CDF923B8CAE7000FE3274C10ECBC0C.jpg}"
# "true": launch the backend with Maven when it is not already reachable.
: "${START_BACKEND:=true}"
# "true": an unreachable backend fails the run instead of skipping the E2E part.
: "${BACKEND_REQUIRED:=false}"
# Receives stdout/stderr of a backend launched by this script.
: "${BACKEND_LOG:=/tmp/ocr-platform-paddle-verify-backend.log}"

#######################################
# Export the KEY=VALUE pairs found in the file named by ENV_FILE.
#
# Blank lines, lines whose first non-blank character is '#', and lines
# without '=' are skipped. Whitespace around the key is trimmed and an
# optional leading "export " is accepted. The value is everything after the
# first '=' and is exported verbatim (surrounding quotes are NOT removed),
# except that a trailing carriage return is dropped so CRLF files work.
#
# Globals:   ENV_FILE (read); every key in the file is exported
# Arguments: none
# Returns:   0 on success; 1 if the file is unreadable or a key is not a
#            valid shell variable name
#######################################
load_env() {
  local line key value

  if [[ ! -r "$ENV_FILE" ]]; then
    echo "Env file not found or not readable: $ENV_FILE" >&2
    return 1
  fi

  # "|| [[ -n $line ]]" keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Without this a CRLF file would leave "\r" at the end of every value.
    line="${line%$'\r'}"

    if [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]]; then
      continue
    fi
    if [[ "$line" != *=* ]]; then
      continue
    fi

    key="${line%%=*}"
    value="${line#*=}"

    # Trim leading, then trailing, whitespace from the key without forking.
    key="${key#"${key%%[![:space:]]*}"}"
    key="${key%"${key##*[![:space:]]}"}"

    # Accept shell-style "export KEY=value" lines.
    if [[ "$key" == export[[:space:]]* ]]; then
      key="${key#export}"
      key="${key#"${key%%[![:space:]]*}"}"
    fi

    if [[ -z "$key" ]]; then
      continue
    fi
    # Reject bad names here so the failure names the file and the key.
    if [[ ! "$key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
      echo "Invalid variable name in $ENV_FILE: $key" >&2
      return 1
    fi

    export "$key=$value"
  done < "$ENV_FILE"
}

#######################################
# Print the value found at a dotted path inside the JSON document on stdin.
#
# Each non-empty path segment is used as an object key; when the current
# value is an array the segment is used as an integer index instead
# (for example "data.items.0.id"). The value is written with Python print().
#
# Arguments: $1 - dotted path, e.g. data.status
# Inputs:    JSON document on stdin
# Outputs:   the selected value on stdout
# Returns:   0 on success; 1 with a one-line message on stderr when stdin is
#            not valid JSON or the path cannot be resolved
#######################################
json_get() {
  local path="$1"
  # The Python source sits inside a single-quoted shell string, so it must
  # not contain any single-quote character.
  python3 -c '
import json
import sys

path = sys.argv[1]
try:
    node = json.load(sys.stdin)
except ValueError as exc:
    sys.exit("json_get: invalid JSON on stdin: %s" % exc)

for segment in path.split("."):
    if not segment:
        continue
    try:
        if isinstance(node, list):
            node = node[int(segment)]
        else:
            node = node[segment]
    except (KeyError, IndexError, TypeError, ValueError):
        sys.exit("json_get: cannot resolve segment %r of path %r" % (segment, path))
print(node)
' "$path"
}

#######################################
# Poll the backend health endpoint until it answers successfully.
#
# Gives up early when BACKEND_PID is set (the script sets it after launching
# the backend itself) and that process no longer exists, instead of polling
# a backend that can never come up.
#
# Globals:   BACKEND_BASE_URL, BACKEND_LOG (read); BACKEND_PID (read, optional)
# Arguments: $1 - maximum number of probes (default 60)
#            $2 - seconds to sleep between probes (default 2)
# Returns:   0 once the health endpoint responds; 1 on timeout or when the
#            launched backend process has exited
#######################################
wait_for_backend() {
  local attempts="${1:-60}"
  local interval="${2:-2}"
  local attempt

  for ((attempt = 1; attempt <= attempts; attempt++)); do
    if curl -fsS --max-time 2 "$BACKEND_BASE_URL/api/v1/health" >/dev/null 2>&1; then
      return 0
    fi
    # kill -0 sends no signal; it only reports whether the process exists.
    if [[ -n "${BACKEND_PID:-}" ]] && ! kill -0 "$BACKEND_PID" 2>/dev/null; then
      echo "Backend process $BACKEND_PID exited before becoming healthy. See $BACKEND_LOG" >&2
      return 1
    fi
    sleep "$interval"
  done
  echo "Backend did not become healthy. See $BACKEND_LOG" >&2
  return 1
}

#######################################
# Poll an OCR task until it reaches a terminal status.
#
# Progress lines go to stderr so that stdout carries only the final task
# summary; callers capturing stdout with $(...) therefore still show live
# progress. A poll whose request fails or whose response cannot be parsed is
# reported and retried rather than aborting the wait.
#
# Globals:   BACKEND_BASE_URL (read)
# Arguments: $1 - task id
#            $2 - maximum number of polls (default 90)
#            $3 - seconds to sleep between polls (default 3)
# Outputs:   stdout - task summary JSON when the task is COMPLETED
#            stderr - progress lines; the summary when FAILED or CANCELED
# Returns:   0 when COMPLETED; 1 when FAILED, CANCELED, or on timeout
#######################################
wait_for_task() {
  local task_id="$1"
  local attempts="${2:-90}"
  local interval="${3:-3}"
  local summary status attempt

  for ((attempt = 1; attempt <= attempts; attempt++)); do
    status=""
    if ! summary="$(curl -fsS --max-time 10 "$BACKEND_BASE_URL/api/v1/ocr/tasks/$task_id")"; then
      echo "task $task_id status: request failed, retrying" >&2
    elif ! status="$(printf '%s' "$summary" | json_get data.status)"; then
      status=""
      echo "task $task_id status: unreadable response, retrying" >&2
    else
      echo "task $task_id status: $status" >&2
    fi

    case "$status" in
      COMPLETED)
        echo "$summary"
        return 0
        ;;
      FAILED | CANCELED)
        echo "$summary" >&2
        return 1
        ;;
    esac
    sleep "$interval"
  done
  echo "Task $task_id did not finish in time" >&2
  return 1
}

# Export the KEY=VALUE pairs from $ENV_FILE before anything below reads them.
load_env

printf '%s\n' "== PaddleOCR layout verification =="
printf 'env file: %s\n' "$ENV_FILE"
printf 'sample: %s\n' "$SAMPLE_FILE"

# Probe the external PaddleOCR service. curl -f returns non-zero on an HTTP
# error response, which stops the script because errexit is enabled.
printf '\n%s\n' "== External PaddleOCR health =="
curl -fsS --max-time 10 "$PADDLE_HEALTH_URL"
printf '\n'

echo
echo "== Direct PaddleOCR layout-parsing request =="
# Check the two inputs up front so a misconfiguration produces a clear
# message instead of "unbound variable" or a Python traceback.
: "${PADDLE_LAYOUT_ENDPOINT:?PADDLE_LAYOUT_ENDPOINT must be set (in the environment or in $ENV_FILE)}"
if [[ ! -r "$SAMPLE_FILE" ]]; then
  echo "Sample file not found or not readable: $SAMPLE_FILE" >&2
  exit 1
fi
python3 - "$PADDLE_LAYOUT_ENDPOINT" "$SAMPLE_FILE" <<'PY'
"""POST the sample file to the PaddleOCR layout-parsing endpoint.

argv[1] is the endpoint URL and argv[2] the path of the sample file. A
one-line JSON summary of the response is printed. The process exits non-zero
when the request fails, the service reports an error code, or no recognized
text comes back.
"""
import base64
import json
import sys
import urllib.error
import urllib.request

endpoint = sys.argv[1]
sample_file = sys.argv[2]
with open(sample_file, "rb") as fh:
    encoded = base64.b64encode(fh.read()).decode("ascii")

# NOTE(review): fileType 1 presumably marks the payload as an image - confirm
# against the PaddleOCR serving API.
payload = json.dumps({"file": encoded, "fileType": 1}).encode("utf-8")
request = urllib.request.Request(
    endpoint,
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)

try:
    with urllib.request.urlopen(request, timeout=120) as response:
        body = response.read().decode("utf-8")
except urllib.error.HTTPError as exc:
    body = exc.read().decode("utf-8", errors="replace")
    raise SystemExit(f"PaddleOCR HTTP {exc.code}: {body[:1000]}")
except OSError as exc:
    # URLError (refused connection, DNS failure) and socket timeouts are both
    # OSError subclasses; report them without a traceback.
    raise SystemExit(f"PaddleOCR request to {endpoint} failed: {exc}")

try:
    data = json.loads(body)
except ValueError as exc:
    raise SystemExit(f"PaddleOCR returned invalid JSON ({exc}): {body[:1000]}")
if not isinstance(data, dict):
    raise SystemExit(f"PaddleOCR returned unexpected JSON: {body[:1000]}")

if data.get("errorCode") != 0:
    raise SystemExit(f"PaddleOCR service error: {data.get('errorCode')} {data.get('errorMsg')}")

# "or {}" / "or []" also cover fields that are present but null.
layout_results = (data.get("result") or {}).get("layoutParsingResults") or []
text_count = 0
table_count = 0
first_text = ""
for layout in layout_results:
    pruned = layout.get("prunedResult") or {}
    texts = (pruned.get("overall_ocr_res") or {}).get("rec_texts") or []
    text_count += len(texts)
    if texts and not first_text:
        first_text = str(texts[0])
    table_count += len(pruned.get("table_res_list") or [])

print(json.dumps({
    "errorCode": data.get("errorCode"),
    "layoutPages": len(layout_results),
    "textCount": text_count,
    "tableCount": table_count,
    "firstText": first_text[:120],
}, ensure_ascii=False))

if text_count <= 0:
    raise SystemExit("PaddleOCR did not return any recognized text")
PY

#######################################
# End the run because the backend could not be reached.
# Globals:   BACKEND_REQUIRED (read)
# Returns:   never; exits 1 when BACKEND_REQUIRED is "true", otherwise
#            prints the skip notice and exits 0
#######################################
backend_unavailable() {
  if [[ "$BACKEND_REQUIRED" == "true" ]]; then
    exit 1
  fi
  echo "Skipping platform E2E because backend is unavailable. Direct PaddleOCR provider verification passed."
  exit 0
}

# Make sure a backend is reachable: reuse a running one, launch one with
# Maven when START_BACKEND is "true", otherwise just wait for it.
if [[ "$START_BACKEND" == "true" ]]; then
  if curl -fsS --max-time 2 "$BACKEND_BASE_URL/api/v1/health" >/dev/null 2>&1; then
    echo "Backend already reachable at $BACKEND_BASE_URL"
  else
    echo "Starting backend with PaddleOCR verification profile..."
    (
      cd "$ROOT_DIR/backend" || exit 1
      # exec replaces this subshell with Maven, so $! below is Maven's own
      # PID. Without it the EXIT trap would signal only the wrapper subshell
      # and could leave Maven running after the script ends.
      exec mvn -q spring-boot:run
    ) >"$BACKEND_LOG" 2>&1 &
    BACKEND_PID=$!
    # Stop the backend this script started on every exit path.
    trap 'kill "$BACKEND_PID" >/dev/null 2>&1 || true' EXIT
    wait_for_backend || backend_unavailable
  fi
else
  wait_for_backend || backend_unavailable
fi

echo
echo "== Engine registry =="
curl -fsS --max-time 10 "$BACKEND_BASE_URL/api/v1/engines"
echo

echo
echo "== Create OCR task =="
# Upload the sample as a multipart form; the response is expected to carry
# the new task id at data.taskId.
create_response="$(curl -fsS --max-time 30 \
  -F "sourceSystem=VERIFY_SCRIPT" \
  -F "businessType=PADDLE_LAYOUT_VERIFY" \
  -F "templateCode=VERIFY_PADDLE_LAYOUT" \
  -F "priority=10" \
  -F "file=@$SAMPLE_FILE" \
  "$BACKEND_BASE_URL/api/v1/ocr/tasks")"
echo "$create_response"
task_id="$(printf '%s' "$create_response" | json_get data.taskId)"
# json_get prints a JSON null as the text "None"; stop here rather than
# polling and fetching results for a task id that does not exist.
case "$task_id" in
  "" | None | null)
    echo "Create-task response did not contain a usable data.taskId" >&2
    exit 1
    ;;
esac

echo
echo "== Wait for completion =="
# wait_for_task's stdout is captured and then printed, so whatever it
# reports reaches the console on both the success and the failure path.
if ! detail="$(wait_for_task "$task_id")"; then
  [[ -z "$detail" ]] || printf '%s\n' "$detail"
  exit 1
fi
printf '%s\n' "$detail"

echo
echo "== Text block count =="
# Download the text blocks to a private temp file (unpredictable name, created
# by mktemp) and remove it afterwards whether or not the check succeeds.
text_blocks_file="$(mktemp "${TMPDIR:-/tmp}/ocr-platform-text-blocks-XXXXXX")"
text_blocks_status=0
# "|| text_blocks_status=$?" records a failure of either step without
# tripping errexit, so the cleanup below always runs.
curl -fsS --max-time 60 \
  "$BACKEND_BASE_URL/api/v1/ocr/tasks/$task_id/text-blocks" \
  -o "$text_blocks_file" \
  && python3 - "$text_blocks_file" <<'PY' || text_blocks_status=$?
"""Print the number of text blocks in the response file named by argv[1],
then the first 120 characters of the first block's textContent. Exits
non-zero when there are no text blocks."""
import json
import sys

# Binary mode lets json detect the encoding itself instead of relying on the
# locale's default text encoding.
with open(sys.argv[1], "rb") as fh:
    payload = json.load(fh)
# A missing or null "data" field is treated as "no text blocks".
items = payload.get("data") or []
print(len(items))
if not items:
    raise SystemExit("No text blocks were produced by paddle-layout")
print((items[0].get("textContent") or "")[:120])
PY
rm -f -- "$text_blocks_file"
if ((text_blocks_status != 0)); then
  exit "$text_blocks_status"
fi

echo
echo "Verification task completed: $task_id"
