# tech-bid-manage20260423A/modules/dark_bid_format_check.py
# Exported 2026-04-23 17:10:38 +08:00 (636 lines, 21 KiB, Python).
# NOTE: the file intentionally contains full-width CJK punctuation inside
# regex literals; do not "normalize" those characters.
"""
技术暗标 HTML 格式检查(由 清标工具.js 迁移,不依赖浏览器/jsdom)
仅解析内联 style 与文档内 <style> 中的 @page 简单规则;无内联样式时部分项可能判为不符合。
"""
from __future__ import annotations
import re
from typing import Any
from bs4 import BeautifulSoup, Tag
# 1pt ≈ 96/72 px (CSS 标准)
_PT_PX = 96.0 / 72.0
# 三号 16pt / 四号 14pt / 五号 10.5pt / 行距 26pt
_TARGET_H = 16 * _PT_PX # 21.333...
_TARGET_BODY = 14 * _PT_PX
_TARGET_LH = 26 * _PT_PX
_TARGET_FIG = 10.5 * _PT_PX
def _parse_style_attr(style: str | None) -> dict[str, str]:
if not style or not style.strip():
return {}
out: dict[str, str] = {}
for part in style.split(";"):
part = part.strip()
if ":" not in part:
continue
k, v = part.split(":", 1)
k, v = k.strip().lower(), v.strip()
if k:
out[k] = v
return out
def _num(s: str) -> float:
try:
return float(re.sub(r"[^\d.\-]", "", s) or "nan")
except ValueError:
return float("nan")
def _length_to_px(val: str, font_size_px: float | None = None) -> float:
    """Convert a font-size / line-height length into approximate CSS px.

    Mirrors the px values the original JS tool got from getComputedStyle.
    Empty strings and the keywords normal/inherit/initial yield NaN; a bare
    em/rem count is returned unscaled when no (finite) reference font size
    is supplied, so it can still feed text-indent style checks.
    """
    text = (val or "").strip().lower()
    if not text or text in ("normal", "inherit", "initial"):
        return float("nan")
    if text.isdigit():
        return float(text)
    matched = re.match(r"^([\d.]+)\s*(pt|px|em|rem)?\s*$", text)
    if matched is None:
        # Unknown unit (cm, %, ...): fall back to the leading number, if any.
        leading = re.match(r"^([\d.]+)", text)
        return float(leading.group(1)) if leading else float("nan")
    number = float(matched.group(1))
    unit = matched.group(2) or "px"
    if unit == "pt":
        return number * _PT_PX
    if unit in ("em", "rem"):
        # font_size_px == font_size_px rejects NaN reference sizes.
        if font_size_px and font_size_px == font_size_px:
            return number * font_size_px
        return number
    return number  # px
def _indent_value(style: dict[str, str], font_size_px: float) -> float:
    """Approximate text-indent in em units.

    Mirrors the JS ``parseFloat(textIndent)`` behaviour: '2em' -> 2.0; other
    units are converted to px and divided by the font size; NaN when absent.
    """
    raw = (style.get("text-indent") or "").strip()
    if not raw:
        return float("nan")
    if "em" in raw.lower():
        em_match = re.search(r"([\d.]+)\s*em", raw, re.I)
        if em_match:
            return float(em_match.group(1))
        return _num(raw)
    # Non-em value: approximate px -> em using the element's font size.
    px = _length_to_px(raw, font_size_px)
    if px == px and font_size_px > 0:  # px == px filters NaN
        return px / font_size_px
    return _num(raw)
def _color_normalized(style: dict[str, str]) -> str:
c = (style.get("color") or "").strip().lower()
if not c:
return ""
c = c.replace(" ", "")
if c in ("#000", "#000000", "black", "rgb(0,0,0)"):
return "rgb(0, 0, 0)"
m = re.match(r"rgb\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)", c)
if m:
r, g, b = int(m.group(1)), int(m.group(2)), int(m.group(3))
if r == 0 and g == 0 and b == 0:
return "rgb(0, 0, 0)"
return c
return c
def _el_style_dict(tag: Tag) -> dict[str, str]:
    """Inline style of *tag* as a {property: value} dict ({} when absent)."""
    raw = tag.get("style")
    if isinstance(raw, list):
        # bs4 can hand multi-valued attributes back as a list of strings.
        raw = ";".join(raw)
    if isinstance(raw, str):
        return _parse_style_attr(raw)
    return {}
def _get_inline_property(tag: Tag, prop: str) -> str:
    """Value of one inline-style property on *tag* ('' when not set)."""
    return _el_style_dict(tag).get(prop.lower(), "")
def _outer_html_sample(tag: Tag, limit: int = 200) -> str:
s = str(tag)
return s[:limit] if len(s) > limit else s
def _is_under(node: Tag | None, ancestor: Tag | None) -> bool:
if node is None or ancestor is None:
return False
p: Tag | None = node
while p is not None:
if p is ancestor:
return True
p = p.parent
return False
def _body_text(soup: BeautifulSoup) -> str:
    """Visible text of <body>, or of the whole document when no body parsed.

    Note: an empty <body> tag is falsy in bs4, so it also falls back to the
    whole soup — same truthiness the original check relied on.
    """
    root = soup.body or soup
    return root.get_text("\n", strip=True)
def _parse_page_margins_from_html(raw_html: str) -> dict[str, str] | None:
"""从 <style> 中粗提取 @page 块内 margin 与 size。"""
for m in re.finditer(
r"@page\s*\{([^}]+)\}",
raw_html,
re.I | re.DOTALL,
):
block = m.group(1)
msh = re.search(r"margin\s*:\s*([^;]+);", block, re.I)
if msh:
return {"shorthand": msh.group(1).strip()}
margins: dict[str, str] = {}
for name, key in (
(r"margin-top\s*:\s*([^;]+)", "top"),
(r"margin-bottom\s*:\s*([^;]+)", "bottom"),
(r"margin-left\s*:\s*([^;]+)", "left"),
(r"margin-right\s*:\s*([^;]+)", "right"),
(r"size\s*:\s*([^;]+)", "size"),
):
mm = re.search(name, block, re.I)
if mm:
margins[key] = mm.group(1).strip()
if margins:
return margins
return None
def check_technical_bid(html_content: str) -> dict[str, Any]:
    """
    Run the format checks for an anonymized ("dark") technical-bid HTML.

    Seven rules are evaluated in order (identity hiding, heading format,
    body format, table of contents, charts/appendix, color & decoration,
    page setup).  Returns a structure matching 清标数据.json:
    ``overall`` (bool), ``details`` (one entry per rule) and ``violations``
    (failed rules with truncated HTML snippets of offending elements).
    Only inline ``style`` attributes and ``@page`` rules inside <style>
    are considered, so documents without inline styles may fail some rules.
    """
    results: dict[str, Any] = {
        "overall": True,
        "details": [],
        "violations": [],
    }

    def add_result(
        rule_name: str,
        passed: bool,
        message: str,
        elements: list[Tag] | None = None,
    ) -> None:
        # Record one rule outcome; a failure also flips the overall flag and
        # stores up-to-200-char snippets of the offending elements.
        results["details"].append(
            {"rule": rule_name, "passed": passed, "message": message}
        )
        if not passed:
            results["overall"] = False
            el_snips: list[str] = []
            for el in elements or []:
                if isinstance(el, Tag):
                    el_snips.append(_outer_html_sample(el))
            results["violations"].append(
                {"rule": rule_name, "message": message, "elements": el_snips}
            )

    if not (html_content or "").strip():
        add_result("身份信息隐藏", False, "HTML 内容为空", [])
        return results
    raw_html = html_content
    soup = BeautifulSoup(html_content, "lxml")
    if not soup.body:
        # Fragment without <body>: re-parse wrapped so body-based checks work.
        soup = BeautifulSoup(f"<html><body>{html_content}</body></html>", "lxml")
    body = soup.body
    if not body:
        add_result("身份信息隐藏", False, "无法解析 body", [])
        return results

    # ---- 1. Identity hiding ----
    # Heuristic regexes for company names, addresses and personal names;
    # the character classes deliberately use full-width CJK punctuation.
    body_text = _body_text(soup)
    company_pattern = re.compile(
        r"(?:我公司|本公司|[(]?[A-Za-z\u4e00-\u9fa5]+(?:集团|股份|有限|责任|公司)[)]?)"
    )
    addr_pattern = re.compile(
        r"(?:省|市|区|县|镇|路|街|大道|号|大厦|楼|层)[\u4e00-\u9fa50-9]+"
    )
    name_pattern = re.compile(
        r"(?:总监理工程师|专业监理工程师|技术负责人|项目经理)[:]\s*"
        r"[^甲乙丙丁戊己庚辛壬癸\s]{2,4}(?=[,。;\s]|$)"
    )
    found_company = bool(company_pattern.search(body_text))
    found_addr = bool(addr_pattern.search(body_text))
    found_name = bool(name_pattern.search(body_text))
    has_logo = False
    for img in soup.find_all("img"):
        if not isinstance(img, Tag):
            continue
        # `+ ""` is a no-op kept from the JS port (string coercion there).
        alt = (img.get("alt") or "") + ""
        src = (img.get("src") or "") + ""
        if re.search(r"logo|商标|微标|公司|品牌", alt, re.I) or re.search(
            r"logo", src, re.I
        ):
            has_logo = True
            break
    passed_id = not (
        found_company or found_addr or found_name or has_logo
    )
    add_result(
        "身份信息隐藏",
        passed_id,
        "未发现投标人身份信息"
        if passed_id
        else "发现投标人身份信息(公司名/地址/真实姓名/商标)",
    )

    def heading_style_ok(tag: Tag) -> bool:
        # A heading must be 三号 (16pt ±3px) bold SimHei/YaHei, non-italic,
        # no underline, black or inherited color — judged from inline styles.
        st = _el_style_dict(tag)
        fs_raw = st.get("font-size", "")
        fs_px = _length_to_px(fs_raw)
        if "em" in (fs_raw or "").lower() and "rem" not in (fs_raw or "").lower():
            # em is resolved against the 16px browser default here.
            fs_px = _num(fs_raw) * 16.0
        size_ok = abs(fs_px - _TARGET_H) <= 3
        fam = (st.get("font-family") or "").lower()
        font_ok = "黑体" in fam or "simhei" in fam or "microsoft yahei" in fam
        font_style = (st.get("font-style") or "").lower()
        style_ok = font_style != "italic"
        text_dec = (st.get("text-decoration") or "").lower()
        decor_ok = "underline" not in text_dec
        cr = (st.get("color") or "").strip().lower()
        if not cr or cr in ("inherit", "initial"):
            color_ok = True
        else:
            cn = _color_normalized(st)
            color_ok = cn == "rgb(0, 0, 0)" or cr in (
                "#000",
                "#000000",
                "black",
                "rgb(0,0,0)",
            )
        fw = (st.get("font-weight") or "400").lower()
        # Headings must be bolder than normal.
        weight_ok = fw not in ("400", "normal")
        if not st.get("font-size"):
            # Without an inline font-size the size cannot be verified: fail.
            size_ok = False
        return (
            size_ok
            and font_ok
            and style_ok
            and decor_ok
            and color_ok
            and weight_ok
        )

    # ---- 2. Heading format ----
    heading_tags: list[Tag] = []
    for sel in ("h1", "h2", "h3", "h4", "h5", "h6"):
        heading_tags.extend(soup.find_all(sel))
    for t in soup.find_all(attrs={"role": "heading"}):
        if isinstance(t, Tag):
            heading_tags.append(t)
    for t in soup.select(".heading, .title"):
        if isinstance(t, Tag) and t not in heading_tags:
            heading_tags.append(t)
    invalid_h: list[Tag] = []
    for h in heading_tags:
        if not isinstance(h, Tag):
            continue
        if not heading_style_ok(h):
            invalid_h.append(h)
    h_ok = len(invalid_h) == 0
    add_result(
        "标题格式",
        h_ok,
        "所有标题符合三号黑体要求"
        if h_ok
        else "部分标题字号/字体/颜色/下划线不符合要求",
        invalid_h,
    )

    def body_el_ok(el: Tag) -> bool:
        # Body text must be 四号 (14pt ±2px) SimSun, black, ~2em first-line
        # indent, 26pt (±2px) line-height, no underline, normal weight,
        # non-italic.  Headings, header/footer/TOC containers and empty
        # elements are exempt.
        st = _el_style_dict(el)
        if el.name in ("h1", "h2", "h3", "h4", "h5", "h6"):
            return True
        cls = " ".join(el.get("class", [])) if el.get("class") else ""
        if any(
            x in cls
            for x in ("header", "footer", "toc", "目录", "table-of-contents")
        ):
            return True
        text = el.get_text(strip=True)
        if not text:
            return True
        fs_raw = st.get("font-size", "")
        font_px = _length_to_px(fs_raw)
        if not fs_raw:
            # No inline font-size: cannot verify, treat as failing.
            return False
        size_ok = abs(font_px - _TARGET_BODY) <= 2
        fam = (st.get("font-family") or "").lower()
        font_ok = "宋体" in fam or "simsun" in fam or "serif" in fam
        col = st.get("color", "")
        color_ok = (not col) or _color_normalized(st) == "rgb(0, 0, 0)" or col.lower() in (
            "#000",
            "#000000",
            "black",
            "rgb(0,0,0)",
        )
        ind = _indent_value(st, font_px)
        # ind == ind filters NaN (missing/unparseable text-indent).
        indent_ok = ind == ind and 1.8 <= ind <= 2.2
        lh_raw = (st.get("line-height") or "").strip()
        if not lh_raw:
            line_ok = False
        else:
            if "pt" in lh_raw or "px" in lh_raw:
                lh_px = _length_to_px(lh_raw, font_px)
            elif re.match(r"^[\d.]+$", lh_raw):
                # Unitless line-height multiplies the font size.
                lh_px = float(lh_raw) * font_px
            else:
                lh_px = _length_to_px(lh_raw, font_px)
            line_ok = abs(lh_px - _TARGET_LH) <= 2
        tdec = (st.get("text-decoration") or "").lower()
        decor_ok = "underline" not in tdec
        fw = (st.get("font-weight") or "400").lower()
        weight_ok = fw in ("400", "normal", "")
        fst = (st.get("font-style") or "").lower()
        style_ok = fst != "italic"
        return (
            size_ok
            and font_ok
            and color_ok
            and indent_ok
            and line_ok
            and decor_ok
            and weight_ok
            and style_ok
        )

    # ---- 3. Body format (no numbered banner in the original) ----
    exclude_set = {
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
    }
    invalid_body: list[Tag] = []
    for el in soup.find_all(["p", "div", "span", "li", "td", "th"]):
        if not isinstance(el, Tag):
            continue
        if el.name in exclude_set:
            continue
        if "header" in " ".join(el.get("class", [])):
            continue
        if "footer" in " ".join(el.get("class", [])):
            continue
        if "toc" in " ".join(el.get("class", [])) or "目录" in " ".join(
            el.get("class", [])
        ):
            continue
        if not el.get_text(strip=True):
            continue
        if not body_el_ok(el):
            invalid_body.append(el)
    b_ok = len(invalid_body) == 0
    add_result(
        "正文格式",
        b_ok,
        "所有正文符合四号宋体/缩进/行距/颜色要求"
        if b_ok
        else "部分正文段落格式不符合要求",
        invalid_body,
    )

    # ---- 4. Table of contents ----
    toc_els: list[Tag] = []
    for cls in ("toc", "table-of-contents", "目录"):
        for t in soup.find_all(class_=cls):
            if isinstance(t, Tag) and t not in toc_els:
                toc_els.append(t)
    for t in soup.find_all(attrs={"role": "directory"}):
        if isinstance(t, Tag) and t not in toc_els:
            toc_els.append(t)
    if not toc_els:
        add_result("目录要求", False, "未检测到目录,请确保包含目录且目录无页码无页眉页脚")
    else:
        no_pn = True
        no_hf = True
        for toc in toc_els:
            text = toc.get_text("\n", strip=True)
            lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
            for line in lines:
                # NOTE(review): the nested conditions below are redundant —
                # every path that clears no_pn requires the dot-leader +
                # page-number pattern, which the standalone check after the
                # nest also tests.  Kept verbatim from the JS port.
                if re.search(r"\d+\s*$", line) and re.search(r"\d$", line):
                    if re.search(r"\.{2,}\s*\d+", line) or re.match(
                        r"^.*\d$", line
                    ):
                        if re.search(r"\.{2,}\s*\d+", line):
                            no_pn = False
                if re.search(r"\.{2,}\s*\d+", line):
                    no_pn = False
            if toc.find(class_=re.compile("header|page-header", re.I)):
                no_hf = False
            if toc.find(class_=re.compile("footer|page-footer", re.I)):
                no_hf = False
        t_ok = no_pn and no_hf
        add_result(
            "目录要求",
            t_ok,
            "目录符合无页码、无页眉页脚要求"
            if t_ok
            else "目录中存在页码或页眉页脚",
        )

    # ---- 5. Charts / appendix (valid CSS selectors only) ----
    # Tables, images, figures and .chart elements are only allowed inside
    # the first appendix container found below.
    appendix: Tag | None = None
    for sel in (
        "#appendix",
        ".appendix",
        ".attachment",
        '[id*="附件"]',
        '[class*="附件"]',
        '[class*="附表"]',
    ):
        hit = soup.select_one(sel)
        if hit and isinstance(hit, Tag):
            appendix = hit
            break
    illegal: list[Tag] = []
    for tbl in soup.find_all("table"):
        if isinstance(tbl, Tag) and not _is_under(tbl, appendix):
            illegal.append(tbl)
    for im in soup.find_all("img"):
        if isinstance(im, Tag) and not _is_under(im, appendix):
            illegal.append(im)
    for el in soup.find_all("figure"):
        if isinstance(el, Tag) and not _is_under(el, appendix):
            illegal.append(el)
    for el in soup.find_all(class_="chart"):
        if isinstance(el, Tag) and not _is_under(el, appendix) and el not in illegal:
            illegal.append(el)
    # Text inside appendix charts must be 五号 (10.5pt ±1.5px) SimSun, black.
    chart_text_valid = True
    if appendix:
        for el in appendix.select("table, td, th, figcaption, .chart-text"):
            if not isinstance(el, Tag):
                continue
            st = _el_style_dict(el)
            if not st.get("font-size"):
                continue
            fs = _length_to_px(st.get("font-size", ""))
            size_ok = abs(fs - _TARGET_FIG) <= 1.5
            fam = (st.get("font-family") or "").lower()
            font_ok = "宋体" in fam or "simsun" in fam
            c_raw = (st.get("color") or "").strip()
            if c_raw and c_raw.lower() not in ("inherit", "initial"):
                c_ok = _color_normalized(st) == "rgb(0, 0, 0)" or c_raw.lower() in (
                    "#000",
                    "#000000",
                    "black",
                    "rgb(0,0,0)",
                )
            else:
                c_ok = True
            if not (size_ok and font_ok and c_ok):
                chart_text_valid = False
    c_ok2 = len(illegal) == 0 and chart_text_valid
    add_result(
        "图表规范",
        c_ok2,
        "图表仅出现在附件/附表内,且图表文字符合五号宋体"
        if c_ok2
        else f"正文中发现{len(illegal)}个图表或附件内图表文字格式错误",
        illegal,
    )

    # ---- 6. Color & decoration ----
    color_v: list[Tag] = []
    decor_v: list[Tag] = []
    for el in soup.find_all(True):
        if not isinstance(el, Tag):
            continue
        st = _el_style_dict(el)
        if not st.get("color") and not st.get("text-decoration") and not st.get(
            "border-bottom-style"
        ):
            continue
        col = (st.get("color") or "").strip().lower()
        if col and col not in (
            "inherit",
            "initial",
            "",
            "#000",
            "#000000",
            "black",
            "rgb(0,0,0)",
            "rgb(0, 0, 0)",
        ):
            if _color_normalized(st) and _color_normalized(st) != "rgb(0, 0, 0)":
                if el.get_text(strip=True):
                    color_v.append(el)
        tdec = (st.get("text-decoration") or "").lower()
        if "underline" in tdec and el.get_text(strip=True):
            decor_v.append(el)
        # A solid/dotted bottom border on text is treated as an emphasis mark.
        bbs = (st.get("border-bottom-style") or "").lower()
        if bbs in ("solid", "dotted") and el.get_text(strip=True):
            decor_v.append(el)
    col_ok = len(color_v) == 0 and len(decor_v) == 0
    add_result(
        "颜色与装饰",
        col_ok,
        "无彩色文字、无下划线、无着重号"
        if col_ok
        else f"发现{len(color_v)}处彩色文字,{len(decor_v)}处下划线/着重号",
        (color_v + decor_v)[:20],  # cap the snippet list at 20 elements
    )

    # ---- 7. Page setup ----
    # NOTE(review): page_valid is assigned here and possibly below, but is
    # never read afterwards — page_ok alone decides the rule outcome.
    page_valid = True
    margin_top = margin_bottom = margin_left = margin_right = None
    page_info = _parse_page_margins_from_html(raw_html)
    # Word usually writes @page inside <style>; parsed from raw_html above.
    if page_info and "shorthand" in page_info:
        # e.g. margin: 2.54cm 3.18cm
        parts = page_info["shorthand"].split()
        if len(parts) >= 4:
            # CSS order: top right bottom left.
            margin_top, margin_right, margin_bottom, margin_left = (
                parts[0],
                parts[1],
                parts[2],
                parts[3],
            )
        elif len(parts) == 2:
            margin_top = margin_bottom = parts[0]
            margin_left = margin_right = parts[1]
    elif page_info:
        margin_top = page_info.get("top")
        margin_bottom = page_info.get("bottom")
        margin_left = page_info.get("left")
        margin_right = page_info.get("right")
    # Fallback 1: a margin shorthand inline on <body>.
    bst = _el_style_dict(body) if body else {}
    mraw = bst.get("margin", "")
    if mraw and not margin_top:
        margins = mraw.split()
        if len(margins) >= 1:
            margin_top = margins[0]
        if len(margins) >= 2:
            margin_right = margins[1]
        if len(margins) >= 3:
            margin_bottom = margins[2]
        if len(margins) >= 4:
            margin_left = margins[3]
        else:
            margin_left = margin_right
    # Fallback 2: individual margin-* properties inline on <body>.
    if not margin_top and body:
        margin_top = _get_inline_property(body, "margin-top")
        margin_bottom = _get_inline_property(body, "margin-bottom")
        margin_left = _get_inline_property(body, "margin-left")
        margin_right = _get_inline_property(body, "margin-right")
    if not any([margin_top, margin_bottom, margin_left, margin_right]) and not page_info:
        page_valid = False

    def m_ok(
        m: str | None,
        target: float,
    ) -> bool:
        # Compare the numeric part of a margin against target (in cm).
        # NOTE(review): both branches are identical — the "cm" special case
        # currently has no effect, so non-cm units are compared as if cm.
        if not m:
            return False
        s = m.strip()
        if "cm" in s:
            return abs(_num(s) - target) < 0.01
        return abs(_num(s) - target) < 0.01

    # Expected A4 margins: top/bottom 2.54cm (1in), left/right 3.18cm.
    top_ok = m_ok(margin_top, 2.54) or (
        (margin_top or "") in ("2.54cm", "1in")
    )
    bottom_ok = m_ok(margin_bottom, 2.54) or (
        (margin_bottom or "") in ("2.54cm", "1in")
    )
    left_ok = m_ok(margin_left, 3.18) or (margin_left or "").startswith("3.18")
    right_ok = m_ok(margin_right, 3.18) or (margin_right or "").startswith("3.18")
    html_tag = soup.find("html")
    w = _get_inline_property(html_tag, "width") if isinstance(html_tag, Tag) else ""  # type: ignore[arg-type]
    # Any explicit non-auto, non-percentage width on <html> counts as
    # landscape, which the conjunction below always rejects.
    page_orientation = "横向" if w and w != "auto" and "%" not in w else "纵向"
    page_ok = bool(
        top_ok
        and bottom_ok
        and left_ok
        and right_ok
        and (page_orientation != "横向" or w in ("", "auto"))
    )
    if not margin_top:
        page_ok = False
    add_result(
        "页面设置",
        page_ok,
        "页面设置符合A4纵向/边距要求"
        if page_ok
        else "页面边距或纸张方向不符合要求",
    )
    return results