"""
OCR Converter — extract text from images and scanned PDFs using Tesseract.

System requirements: tesseract-ocr must be installed. PDF input additionally
needs poppler, which pdf2image uses to render the pages.
  Ubuntu/Debian:  sudo apt install tesseract-ocr poppler-utils
  Fedora/RHEL:    sudo dnf install tesseract poppler-utils
  macOS:          brew install tesseract poppler

Python requirements: pip install pytesseract pdf2image python-docx PyMuPDF

Supported input:  jpg, jpeg, png, tiff, tif, bmp, gif, webp, pdf
Supported output: txt, pdf (searchable), docx
"""

import os
import uuid
import json


# Input file extensions accepted for OCR: lower-case, with the leading dot.
SUPPORTED_INPUT_EXTS = {
    ".bmp",
    ".gif",
    ".jpeg",
    ".jpg",
    ".pdf",
    ".png",
    ".tif",
    ".tiff",
    ".webp",
}

# Output formats that can be produced: lower-case, without a dot.
SUPPORTED_OUTPUT = {"docx", "pdf", "txt"}


def _parse_url_entry(url_entry):
    """Normalise one entry of the ``urls`` list into a file-description dict.

    Accepted forms:

    * a ``dict`` -- returned unchanged;
    * a string containing a JSON object -- decoded and returned;
    * any other string -- treated as a plain file path and expanded to
      ``{"path": <string>, "name": <base name without extension>,
      "ext": <extension including the dot>}``.

    Returns:
        The dict described above, or ``None`` when ``url_entry`` is neither
        a string nor a dict.
    """
    if isinstance(url_entry, dict):
        return url_entry
    if not isinstance(url_entry, str):
        return None

    try:
        parsed = json.loads(url_entry)
    except (ValueError, TypeError):  # json.JSONDecodeError is a ValueError
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Anything that is not a JSON object is a plain path. This includes
    # strings that happen to be valid JSON scalars or arrays (a file called
    # "2024", "true", "null", ...), which must not be dropped.
    name, ext = os.path.splitext(os.path.basename(url_entry))
    return {"path": url_entry, "name": name, "ext": ext}


def _require_pytesseract():
    """Import and return the ``pytesseract`` module.

    Raises:
        RuntimeError: If ``pytesseract`` cannot be imported; the message
            carries installation instructions.
    """
    try:
        import pytesseract as ocr_module
    except ImportError:
        raise RuntimeError(
            "pytesseract is not installed. Run: pip install pytesseract  "
            "Also install the system package: sudo apt install tesseract-ocr"
        )
    else:
        return ocr_module


def _get_images(full_path, ext_lower):
    """Load the input file as a list of PIL Images, one per page.

    Args:
        full_path: Path of the file to read.
        ext_lower: Lower-cased file extension including the leading dot.

    Returns:
        * ``.pdf``: the pages rendered by ``pdf2image.convert_from_path``
          at 200 DPI.
        * Multi-page ``.tif`` / ``.tiff``: one independent image per page.
        * Anything else: a single-element list with the fully loaded image.

    Raises:
        RuntimeError: If the input is a PDF and ``pdf2image`` is not
            installed.
    """
    if ext_lower == ".pdf":
        try:
            from pdf2image import convert_from_path
        except ImportError as exc:
            raise RuntimeError(
                "pdf2image is not installed. Run: pip install pdf2image  "
                "Also install: sudo apt install poppler-utils"
            ) from exc
        return convert_from_path(full_path, dpi=200)

    from PIL import Image, ImageSequence

    img = Image.open(full_path)

    # Scanned documents are often stored as multi-page TIFFs. Opening one
    # only exposes its first page, so copy every page out explicitly;
    # otherwise all pages after the first would be silently lost.
    if ext_lower in (".tif", ".tiff") and getattr(img, "n_frames", 1) > 1:
        try:
            # Each frame is copied because the iterator reuses one image
            # object and seeks it from page to page.
            return [frame.copy() for frame in ImageSequence.Iterator(img)]
        finally:
            img.close()

    # Force the pixel data to be read now rather than lazily.
    img.load()
    return [img]


def _run_ocr_text(images):
    """Run Tesseract on each PIL Image and return one text string per image.

    A failure on an individual image does not abort the batch: that image's
    entry becomes the placeholder ``"[OCR error: <exception>]"``.
    """
    engine = _require_pytesseract()

    def _recognise(page):
        # Best-effort per page: turn any OCR failure into placeholder text.
        try:
            return engine.image_to_string(page)
        except Exception as err:
            return f"[OCR error: {err}]"

    return [_recognise(page) for page in images]


def _write_txt(texts, output_path):
    """Write the page texts to a UTF-8 text file.

    Consecutive pages are separated by a ``--- Page Break ---`` marker line
    surrounded by blank lines. An empty ``texts`` produces an empty file.
    """
    page_separator = "\n\n--- Page Break ---\n\n"

    def _pieces():
        # Yield each page's text, preceded by the separator for all pages
        # but the first.
        for position, page_text in enumerate(texts):
            if position:
                yield page_separator
            yield page_text

    with open(output_path, "w", encoding="utf-8") as handle:
        handle.writelines(_pieces())


def _write_docx(texts, output_path):
    """Write the page texts to a .docx file using python-docx.

    Each line of a page's text becomes its own paragraph, and a page break
    is inserted between consecutive pages.

    Raises:
        RuntimeError: If python-docx is not installed.
    """
    try:
        from docx import Document as new_document
    except ImportError:
        raise RuntimeError(
            "python-docx is not installed. Run: pip install python-docx"
        )

    document = new_document()
    break_before_page = False
    for page_text in texts:
        if break_before_page:
            document.add_page_break()
        break_before_page = True
        for paragraph_text in page_text.splitlines():
            document.add_paragraph(paragraph_text)
    document.save(output_path)


def _write_searchable_pdf(images, output_path):
    """Create a searchable PDF (image + invisible text layer) via Tesseract.

    Each image is turned into a one-page PDF by
    ``pytesseract.image_to_pdf_or_hocr``. A single page is written directly;
    several pages are merged with PyMuPDF (preferred) or PyPDF2 (fallback).

    Args:
        images: PIL Images, one per page.
        output_path: Destination path of the PDF.

    Raises:
        ValueError: If ``images`` is empty.
        RuntimeError: If pytesseract is missing, or if several pages must be
            merged and neither PyMuPDF nor PyPDF2 is installed. Writing only
            the first page in that case would silently drop content while
            still reporting success.
    """
    images = list(images)
    if not images:
        raise ValueError("No pages to OCR: the input produced no images.")

    tsr = _require_pytesseract()
    page_pdfs = [
        tsr.image_to_pdf_or_hocr(img, extension="pdf") for img in images
    ]

    if len(page_pdfs) == 1:
        # Nothing to merge, so no PDF library is needed.
        with open(output_path, "wb") as f:
            f.write(page_pdfs[0])
        return

    # Preferred merger: PyMuPDF.
    try:
        import fitz
    except ImportError:
        fitz = None

    if fitz is not None:
        merged = fitz.open()
        try:
            for page_bytes in page_pdfs:
                sub = fitz.open("pdf", page_bytes)
                try:
                    merged.insert_pdf(sub)
                finally:
                    sub.close()
            merged.save(output_path)
        finally:
            # Release the document even when a page fails to merge.
            merged.close()
        return

    # Fallback merger: PyPDF2.
    try:
        from PyPDF2 import PdfReader, PdfWriter
    except ImportError as exc:
        raise RuntimeError(
            "Merging a multi-page searchable PDF requires PyMuPDF or PyPDF2. "
            "Run: pip install PyMuPDF"
        ) from exc

    import io

    writer = PdfWriter()
    for page_bytes in page_pdfs:
        reader = PdfReader(io.BytesIO(page_bytes))
        for page in reader.pages:
            writer.add_page(page)
    with open(output_path, "wb") as f:
        writer.write(f)


def _convert_single(file_object, target_format, upload_dir):
    """OCR one input file and write the result into a fresh output folder.

    Args:
        file_object: Dict describing the input. ``"path"`` is required.
            ``"name"`` (base name for the output file) and ``"ext"`` (input
            extension, with or without the leading dot) are optional and
            are derived from the path when missing or empty.
        target_format: Output format: ``txt``, ``docx`` or ``pdf``
            (case-insensitive).
        upload_dir: Directory in which the input is looked up first, and
            under which a uniquely named output folder is created.

    Returns:
        Path of the written output file.

    Raises:
        ValueError: If no path is given, or the input extension or output
            format is unsupported.
        FileNotFoundError: If the input file cannot be found.
        RuntimeError: If the output file is missing or empty after
            conversion (or a required library is not installed).
    """
    file_path = file_object.get("path", "")
    if not file_path:
        raise ValueError("No file path provided")

    # Look inside upload_dir first, then fall back to the path as given.
    # NOTE(review): the fallback accepts absolute paths and paths outside
    # upload_dir -- confirm that entries come from a trusted source.
    full_path = os.path.join(upload_dir, file_path)
    if not os.path.exists(full_path):
        full_path = file_path
    if not os.path.exists(full_path):
        raise FileNotFoundError(f"File not found: {os.path.basename(file_path)}")

    # "ext" may be absent, None or empty; all three fall back to the path.
    file_ext = file_object.get("ext") or os.path.splitext(file_path)[1]
    ext_lower = str(file_ext).lower()
    if not ext_lower.startswith("."):
        ext_lower = f".{ext_lower}"
    if ext_lower not in SUPPORTED_INPUT_EXTS:
        raise ValueError(f"Unsupported input format for OCR: {file_ext}")

    # Reject a bad output format before rendering the input or creating the
    # output folder.
    fmt = target_format.lower()
    if fmt not in SUPPORTED_OUTPUT:
        raise ValueError(f"Unsupported OCR output format: {target_format}")

    # The name is caller-supplied. Keep only its last path component so that
    # "../" sequences or an absolute path cannot place the output outside
    # the output folder.
    raw_name = file_object.get("name") or os.path.splitext(
        os.path.basename(file_path)
    )[0]
    file_name = os.path.basename(str(raw_name).replace("\\", "/")) or "output"

    images = _get_images(full_path, ext_lower)

    output_dir = os.path.join(upload_dir, uuid.uuid4().hex)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{file_name}.{fmt}")

    if fmt == "txt":
        _write_txt(_run_ocr_text(images), output_path)
    elif fmt == "docx":
        _write_docx(_run_ocr_text(images), output_path)
    elif fmt == "pdf":
        _write_searchable_pdf(images, output_path)
    else:
        # Only reachable if SUPPORTED_OUTPUT gains a format without a writer.
        raise ValueError(f"Unsupported OCR output format: {target_format}")

    if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
        raise RuntimeError("OCR output file was not created or is empty.")

    return output_path


def convert(urls, target_format, options, config):
    """OCR every file in ``urls`` and report the outcome as a result dict.

    Args:
        urls: Iterable of entries understood by ``_parse_url_entry`` (dicts,
            JSON-object strings or plain path strings). A single string or
            dict is treated as a one-element list; ``None`` as empty.
        target_format: Output format: ``txt``, ``pdf`` or ``docx``
            (case-insensitive).
        options: Not used by this converter.
        config: Mapping; ``"UPLOAD_DIR"`` selects the working directory
            (default ``"static/uploads"``).

    Returns:
        On success (at least one file converted)::

            {"error": False, "results": [paths...],
             "output_path": <first path>, "errors": [messages...] or None}

        On failure: ``{"error": True, "message": <text>}``. This function
        does not raise; unexpected exceptions are reported in ``message``.
    """
    try:
        upload_dir = (config or {}).get("UPLOAD_DIR", "static/uploads")
        fmt = target_format.lower()

        if fmt not in SUPPORTED_OUTPUT:
            return {
                "error": True,
                "message": (
                    f"Unsupported OCR output format: {target_format}. "
                    "Supported formats: txt, pdf, docx"
                ),
            }

        # Iterating a bare string or dict would yield characters or keys,
        # so wrap a single entry in a list.
        if urls is None:
            urls = []
        elif isinstance(urls, (str, dict)):
            urls = [urls]

        results = []
        errors = []

        for url_entry in urls:
            file_object = _parse_url_entry(url_entry)
            if not file_object:
                continue
            file_path = file_object.get("path", "")
            # "empty" is skipped in the same way as a missing path.
            if not file_path or file_path == "empty":
                continue

            try:
                results.append(_convert_single(file_object, fmt, upload_dir))
            except Exception as e:
                # str() keeps this handler from raising when the path is not
                # a string, which would otherwise abort the whole batch and
                # discard the files already converted.
                fname = os.path.basename(str(file_path))
                errors.append(f"OCR failed for {fname}: {e}")

        if not results:
            if errors:
                return {"error": True, "message": "; ".join(errors)}
            return {"error": True, "message": "No files were provided for OCR."}

        return {
            "error": False,
            "results": results,
            "output_path": results[0],
            "errors": errors or None,
        }

    except Exception as e:
        return {"error": True, "message": f"OCR conversion failed: {e}"}
