# Very rough and minimal web interface for DeepSeek-OCR.
# There may be bugs, but it's a good starting point.

from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
import gradio as gr
import subprocess
from pathlib import Path
import re
import shutil

# Minimal, hardcoded paths
PROJECT_ROOT = Path("/home/ubuntu/DeepSeek-OCR/DeepSeek-OCR-master/DeepSeek-OCR-vllm")
RUN_SCRIPT_PDF = PROJECT_ROOT / "run_dpsk_ocr_pdf.py"
RUN_SCRIPT_IMAGE = PROJECT_ROOT / "run_dpsk_ocr_image.py"
CONFIG_PATH = PROJECT_ROOT / "config.py"
INPUT_DIR = PROJECT_ROOT / "inputs"
OUTPUT_DIR = PROJECT_ROOT / "outputs"


def write_simple_config(input_pdf: Path):
    """Point config.py at *input_pdf* and OUTPUT_DIR for the next OCR run.

    If config.py already exists and is non-empty, its content is first saved
    to a sibling ``.bak`` file, then rewritten with any missing required keys
    appended (using the defaults below) followed by INPUT_PATH/OUTPUT_PATH
    assignments. Because the overrides are appended last, they take effect
    over earlier assignments of the same names when the file is executed.

    If config.py is missing or empty, a minimal config is written from
    scratch (it may be insufficient if the model path is not valid).

    Args:
        input_pdf: Path of the file the OCR runner should read.

    Returns:
        The backup path when an existing config was backed up, else None.
    """
    # Single source of truth for the keys the runner needs; values are
    # already Python source text.
    required_defaults = {
        "MODEL_PATH": "'DeepSeekAI/deepseek-ocr'",
        "PROMPT": "'<image> Please OCR the page into Markdown.'",
        "SKIP_REPEAT": "False",
        "MAX_CONCURRENCY": "2",
        "NUM_WORKERS": "2",
        "CROP_MODE": "'none'",
    }
    # repr() always yields a valid Python string literal, so a path that
    # contains a quote or a backslash cannot corrupt the generated config.
    path_lines = [
        f"INPUT_PATH = {str(input_pdf)!r}",
        f"OUTPUT_PATH = {str(OUTPUT_DIR)!r}",
    ]

    backup_path = CONFIG_PATH.with_suffix(".bak")
    existing = CONFIG_PATH.read_text(encoding="utf-8") if CONFIG_PATH.exists() else ""

    if not existing:
        # Minimal fallback config; nothing to back up.
        fallback = path_lines + [f"{key} = {value}" for key, value in required_defaults.items()]
        CONFIG_PATH.write_text("\n".join(fallback) + "\n", encoding="utf-8")
        return None

    backup_path.write_text(existing, encoding="utf-8")

    # Append a default only for keys that have no top-of-line assignment yet.
    new_content = existing
    for key, value in required_defaults.items():
        if not re.search(rf"^\s*{key}\s*=", new_content, flags=re.MULTILINE):
            new_content += f"\n{key} = {value}"

    # Always override paths.
    new_content += (
        "\n# --- overridden by web/main.py ---\n"
        + "\n".join(path_lines)
        + "\n"
    )
    CONFIG_PATH.write_text(new_content + "\n", encoding="utf-8")
    return backup_path


def simple_name(name: str) -> str:
    """Return *name* with every character outside ``A-Za-z0-9._-`` replaced by ``_``."""
    keep = frozenset(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789"
        "._-"
    )
    cleaned = []
    for ch in name:
        cleaned.append(ch if ch in keep else "_")
    return "".join(cleaned)


def mmd_to_md(raw_text: str) -> str:
    """Return *raw_text* as-is.

    Conversion hook for .mmd content: the text itself is never altered here;
    any link rewriting needed for display is left to the caller.
    """
    converted = raw_text
    return converted


# Image extensions that are routed to the single-image runner.
_IMAGE_EXTS = frozenset({".jpg", ".jpeg", ".png"})


def _restore_config(backup_cfg) -> str:
    """Restore config.py from *backup_cfg* (a Path or None) and drop the backup.

    Returns "" when nothing had to be restored or the restore succeeded,
    otherwise a one-line warning meant to be appended to the run log.
    """
    if not backup_cfg or not backup_cfg.exists():
        return ""
    try:
        CONFIG_PATH.write_text(
            backup_cfg.read_text(encoding="utf-8"), encoding="utf-8"
        )
        backup_cfg.unlink(missing_ok=True)
    except (OSError, UnicodeError) as exc:
        # Best effort: a failed restore must not hide the OCR result, but it
        # should not go unnoticed either.
        return f"\n[warning] could not restore {CONFIG_PATH} from {backup_cfg}: {exc}"
    return ""


def _preview_html(ext: str, stem: str) -> str:
    """Build the bounding-box preview HTML: an iframe for PDFs, an <img> for images."""
    if ext == ".pdf":
        layouts_pdf = OUTPUT_DIR / f"{stem}_layouts.pdf"
        if not layouts_pdf.exists():
            return "<div>No layouts PDF found.</div>"
        preview_url = f"/outputs/{layouts_pdf.name}"
        return (
            f'<div style="display:flex;gap:8px;align-items:center;">'
            f'<a href="{preview_url}" target="_blank">Download layouts PDF</a>'
            f"</div>"
            f'<iframe src="{preview_url}" style="width:100%;height:800px;border:1px solid #ddd;border-radius:6px;"></iframe>'
        )

    bbox_img = OUTPUT_DIR / "result_with_boxes.jpg"
    if not bbox_img.exists():
        return "<div>No bounding-box image found.</div>"
    preview_url = f"/outputs/{bbox_img.name}"
    return (
        f'<div style="display:flex;gap:8px;align-items:center;">'
        f'<a href="{preview_url}" target="_blank">Open bounding-box image</a>'
        f"</div>"
        f'<img src="{preview_url}" style="max-width:100%;height:auto;border:1px solid #ddd;border-radius:6px;" />'
    )


def process_file(file_path: str):
    """Run OCR on an uploaded file and build the values for the UI widgets.

    Steps: validate the extension, copy the upload into INPUT_DIR under a
    sanitized name, point config.py at it, run the matching runner script
    with ``python3`` from PROJECT_ROOT, restore config.py, then read the
    produced markdown from OUTPUT_DIR.

    Args:
        file_path: Filesystem path of the uploaded file; falsy when the
            upload widget was cleared.

    Returns:
        A 4-tuple ``(preview_html, md_link_html, markdown, logs)``. On
        failure the first two items are empty strings and ``markdown``
        carries a human-readable message.

    Raises:
        Whatever ``shutil.copy2``, ``write_simple_config`` or
        ``subprocess.run`` raise (e.g. the interpreter is missing); config.py
        is still restored when the subprocess call itself raises.
    """
    if not file_path:
        # preview, md_link, markdown, logs
        return "", "", "Drop a .pdf or image file.", ""

    # Validate the type BEFORE touching the filesystem, so a rejected upload
    # leaves no copied file and no modified config.py / stray .bak behind.
    # Sanitizing the name only replaces characters outside [A-Za-z0-9._-],
    # so the suffix test gives the same verdict before and after sanitizing.
    src = Path(file_path)
    ext = src.suffix.lower()
    if ext == ".pdf":
        runner = RUN_SCRIPT_PDF
    elif ext in _IMAGE_EXTS:
        runner = RUN_SCRIPT_IMAGE
    else:
        return "", "", "Unsupported file type. Allowed: .pdf, .jpg, .jpeg, .png", ""

    INPUT_DIR.mkdir(parents=True, exist_ok=True)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    dst = INPUT_DIR / simple_name(src.name)
    if src.resolve() != dst.resolve():
        shutil.copy2(str(src), str(dst))

    backup_cfg = write_simple_config(dst)
    try:
        # Run OCR tool (stdout/stderr captured for quick inspection)
        proc = subprocess.run(
            ["python3", str(runner)],
            cwd=str(PROJECT_ROOT),
            text=True,
            capture_output=True,
        )
    finally:
        # Restore even if the subprocess call raised, so config.py is never
        # left pointing at a one-off upload.
        restore_warning = _restore_config(backup_cfg)

    # Determine expected output
    # NOTE(review): image runs always read OUTPUT_DIR/result.mmd, so a failed
    # run can surface a previous run's output; left unchanged here.
    stem = dst.stem
    if ext == ".pdf":
        candidates = (OUTPUT_DIR / f"{stem}.md", OUTPUT_DIR / f"{stem}.mmd")
    else:
        candidates = (OUTPUT_DIR / "result.mmd",)
    out_file = next((path for path in candidates if path.exists()), None)

    if out_file is None:
        log = (proc.stdout or "") + "\n" + (proc.stderr or "")
        # preview empty, md link empty, markdown shows failure message, logs tail provided
        return (
            "",
            "",
            f"OCR ran, but no output for '{stem}' in {OUTPUT_DIR}. Return code: {proc.returncode}.",
            log[-4000:] + restore_warning,
        )

    raw_text = out_file.read_text(encoding="utf-8")
    display_md = raw_text
    if out_file.suffix.lower() == ".mmd":
        display_md = mmd_to_md(raw_text)
        md_path = OUTPUT_DIR / (f"{stem}.md" if ext == ".pdf" else "result.md")
        try:
            # Save verbatim content as .md; do not modify except extension
            md_path.write_text(raw_text, encoding="utf-8")
        except OSError:
            # Convenience copy only; the download link below is built from
            # whether md_path exists.
            pass
    else:
        md_path = out_file
    # Images referenced relatively in the markdown are served from /outputs.
    display_md = display_md.replace("](images/", "](/outputs/images/")

    # Build a markdown download link (served from /outputs)
    md_link_html = ""
    if md_path.exists():
        md_download_url = f"/outputs/{md_path.name}"
        md_link_html = (
            f'<div style="margin: 8px 0;">'
            f'<a href="{md_download_url}" download target="_blank">Download Markdown</a>'
            f"</div>"
        )

    # Return preview, markdown link, markdown content, and logs
    return (
        _preview_html(ext, stem),
        md_link_html,
        display_md,
        (proc.stdout or "")[-2000:] + restore_warning,
    )


# UI: one upload widget whose change event calls process_file; the four values
# process_file returns are routed, in order, to the preview HTML, the download
# link HTML, the rendered markdown and the run-log textbox.
with gr.Blocks(title="DeepSeek-OCR -> Markdown") as demo:
    gr.Markdown("### Drop a PDF or image to convert to Markdown")
    # type="filepath" means process_file receives a path string, not file bytes.
    inp = gr.File(
        label="PDF or Image",
        file_types=[".pdf", ".jpg", ".jpeg", ".png"],
        type="filepath",
    )
    with gr.Tabs():
        with gr.TabItem("Preview", id=0):
            # Bounding-box preview (iframe or <img>) built by process_file.
            preview = gr.HTML()
        with gr.TabItem("Markdown", id=1):
            # Download link sits above the rendered OCR markdown.
            md_link = gr.HTML()
            out = gr.Markdown()
    with gr.Accordion("Details", open=True):
        # NOTE: the label is approximate; when no output file is found,
        # process_file returns up to the last 4000 characters of stdout+stderr.
        info = gr.Textbox(
            label="Run log (last 2000 chars)", lines=6, show_copy_button=True
        )
    gr.Markdown(
        "After processing completes, upload another file above to start a new run."
    )
    # Output order must match the tuple order returned by process_file.
    inp.change(process_file, inputs=inp, outputs=[preview, md_link, out, info])

# FastAPI app: serves OUTPUT_DIR under /outputs (the preview, download-link and
# rewritten image URLs built in process_file all point there) and hosts the
# Gradio UI at the root path.
app = FastAPI()
# Create the working directories at import time; StaticFiles is handed
# OUTPUT_DIR below, so it is created before the mount.
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# NOTE(review): presumably /outputs must be mounted before the "/" Gradio
# mount so it is not shadowed — confirm against the framework's route order.
app.mount(
    "/outputs", StaticFiles(directory=str(OUTPUT_DIR), html=False), name="outputs"
)
# mount_gradio_app's return value is rebound to `app`, the name exported by
# this module.
app = gr.mount_gradio_app(app, demo, path="/")
