#!/usr/bin/env python3
import argparse
import re
import sys
from typing import List, Tuple


def read_input(args) -> str:
    """Return the HTML to clean, taken from the CLI arguments or from stdin.

    Sources are tried from most to least explicit:

    1. ``args.input_file`` -- a path, read as UTF-8 with undecodable bytes
       replaced rather than raising.
    2. ``args.input_html`` -- literal markup passed on the command line
       (an empty string is accepted as a valid value).
    3. Standard input, but only when it is not an interactive terminal.

    Args:
        args: Object exposing ``input_file`` and ``input_html`` attributes.

    Returns:
        The raw input text, or ``""`` when no source is available.

    Raises:
        OSError: If ``args.input_file`` cannot be opened or read.
    """
    if args.input_file:
        with open(args.input_file, "r", encoding="utf-8", errors="replace") as f:
            return f.read()

    # Explicit markup must outrank stdin: when the script is launched without
    # a terminal, stdin is a non-TTY that may be empty or may never reach EOF,
    # and checking it first would silently discard --input-html or block.
    if args.input_html is not None:
        return args.input_html

    # sys.stdin can be None when the interpreter runs without standard streams.
    stdin = sys.stdin
    if stdin is not None and not stdin.isatty():
        return stdin.read()

    return ""


def clean_html(html: str) -> str:
    """Remove style, SVG and script markup from *html*, keeping ld+json scripts.

    Processing steps, in order:

    1. Text without both ``<`` and ``>`` is returned untouched.
    2. ``<script type="application/ld+json">`` blocks are swapped for
       placeholder tokens so later steps cannot delete them.
    3. ``<style>`` blocks, ``<svg>`` elements and all remaining ``<script>``
       elements are deleted.
    4. The preserved ld+json blocks are put back in place of their tokens.
    5. Runs of tab / carriage-return / form-feed become one space, and runs
       of three or more newlines collapse to two.

    Args:
        html: Markup (or arbitrary text) to clean.

    Returns:
        The cleaned text.
    """
    # If it doesn't look like HTML, return as-is.
    if "<" not in html or ">" not in html:
        return html

    flags = re.IGNORECASE | re.DOTALL
    s = html

    # Preserve ld+json script blocks by tokenizing them first.
    preserved: List[str] = []

    def preserve_ldjson(match: re.Match) -> str:
        # The token index is the block's position in `preserved`.
        token = f"__OPENCLAW_LDJSON_SCRIPT_{len(preserved)}__"
        preserved.append(match.group(0))
        return token

    # Match <script ... type="application/ld+json" ...>...</script>
    ldjson_re = re.compile(
        r"<script\b[^>]*\btype=[\"']?application/ld\+json[\"']?[^>]*>.*?</script>",
        flags=flags,
    )
    s = ldjson_re.sub(preserve_ldjson, s)

    # Strip <style> blocks
    s = re.sub(r"<style\b[^>]*>.*?</style>", "", s, flags=flags)

    # Strip self-closing <svg .../> tags BEFORE paired ones. The paired
    # pattern's `[^>]*` also accepts the trailing "/", so running it first
    # would treat `<svg/>` as an opening tag and delete everything up to the
    # next `</svg>` in the document, including unrelated content.
    s = re.sub(r"<svg\b[^>]*/>", "", s, flags=flags)
    # Strip paired SVG blocks
    s = re.sub(r"<svg\b[^>]*>.*?</svg>", "", s, flags=flags)

    # Remove all remaining script tags (we already tokenized ld+json ones).
    # Paired-first order is kept here on purpose: HTML does not honor
    # self-closing syntax on <script>, so `<script .../>` opens a script whose
    # body runs to the next `</script>`.
    s = re.sub(r"<script\b[^>]*>.*?</script>", "", s, flags=flags)
    s = re.sub(r"<script\b[^>]*/>", "", s, flags=flags)

    # Restore preserved ld+json blocks. Tokens that sat inside a removed
    # style/svg/script block are already gone, so those blocks stay dropped.
    for i, block in enumerate(preserved):
        s = s.replace(f"__OPENCLAW_LDJSON_SCRIPT_{i}__", block)

    # Compress excessive whitespace a bit.
    s = re.sub(r"[\t\r\f]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)

    return s


def main():
    """Command-line entry point: parse options, clean the HTML, write to stdout."""
    parser = argparse.ArgumentParser()
    # Both options are plain optional string arguments.
    for flag, description in (
        ("--input-file", "Read HTML from file"),
        ("--input-html", "Read HTML from argument"),
    ):
        parser.add_argument(flag, help=description)

    options = parser.parse_args()

    # Written without a trailing newline so the output is exactly the cleaned text.
    sys.stdout.write(clean_html(read_input(options)))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
