# tech-bid-manage20260423A/modules/dark_bid_format_check.py
# Exported 2026-04-23 17:10:38 +08:00 (636 lines, 21 KiB, Python).
# NOTE: the file intentionally contains full-width CJK punctuation inside
# regex literals; do not "normalize" those characters.
"""
技术暗标 HTML 格式检查(由 清标工具.js 迁移,不依赖浏览器/jsdom)
仅解析内联 style 与文档内 <style> 中的 @page 简单规则;无内联样式时部分项可能判为不符合。
"""
from __future__ import annotations
import re
from typing import Any
from bs4 import BeautifulSoup, Tag
# 1pt ≈ 96/72 px (CSS 标准)
_PT_PX = 96.0 / 72.0
# 三号 16pt / 四号 14pt / 五号 10.5pt / 行距 26pt
_TARGET_H = 16 * _PT_PX # 21.333...
_TARGET_BODY = 14 * _PT_PX
_TARGET_LH = 26 * _PT_PX
_TARGET_FIG = 10.5 * _PT_PX
def _parse_style_attr(style: str | None) -> dict[str, str]:
if not style or not style.strip():
return {}
out: dict[str, str] = {}
for part in style.split(";"):
part = part.strip()
if ":" not in part:
continue
k, v = part.split(":", 1)
k, v = k.strip().lower(), v.strip()
if k:
out[k] = v
return out
def _num(s: str) -> float:
try:
return float(re.sub(r"[^\d.\-]", "", s) or "nan")
except ValueError:
return float("nan")
def _length_to_px(val: str, font_size_px: float | None = None) -> float:
    """Convert a font-size / line-height length into approximate CSS px.

    Mirrors the px values the original JS tool got from getComputedStyle.
    Empty strings and the keywords normal/inherit/initial yield NaN; a bare
    em/rem count is returned unscaled when no (finite) reference font size
    is supplied, so it can still feed text-indent style checks.
    """
    text = (val or "").strip().lower()
    if not text or text in ("normal", "inherit", "initial"):
        return float("nan")
    if text.isdigit():
        return float(text)
    matched = re.match(r"^([\d.]+)\s*(pt|px|em|rem)?\s*$", text)
    if matched is None:
        # Unknown unit (cm, %, ...): fall back to the leading number, if any.
        leading = re.match(r"^([\d.]+)", text)
        return float(leading.group(1)) if leading else float("nan")
    number = float(matched.group(1))
    unit = matched.group(2) or "px"
    if unit == "pt":
        return number * _PT_PX
    if unit in ("em", "rem"):
        # font_size_px == font_size_px rejects NaN reference sizes.
        if font_size_px and font_size_px == font_size_px:
            return number * font_size_px
        return number
    return number  # px
def _indent_value(style: dict[str, str], font_size_px: float) -> float:
    """Approximate text-indent in em units.

    Mirrors the JS ``parseFloat(textIndent)`` behaviour: '2em' -> 2.0; other
    units are converted to px and divided by the font size; NaN when absent.
    """
    raw = (style.get("text-indent") or "").strip()
    if not raw:
        return float("nan")
    if "em" in raw.lower():
        em_match = re.search(r"([\d.]+)\s*em", raw, re.I)
        if em_match:
            return float(em_match.group(1))
        return _num(raw)
    # Non-em value: approximate px -> em using the element's font size.
    px = _length_to_px(raw, font_size_px)
    if px == px and font_size_px > 0:  # px == px filters NaN
        return px / font_size_px
    return _num(raw)
def _color_normalized(style: dict[str, str]) -> str:
c = (style.get("color") or "").strip().lower()
if not c:
return ""
c = c.replace(" ", "")
if c in ("#000", "#000000", "black", "rgb(0,0,0)"):
return "rgb(0, 0, 0)"
m = re.match(r"rgb\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)", c)
if m:
r, g, b = int(m.group(1)), int(m.group(2)), int(m.group(3))
if r == 0 and g == 0 and b == 0:
return "rgb(0, 0, 0)"
return c
return c
def _el_style_dict(tag: Tag) -> dict[str, str]:
    """Inline style of *tag* as a {property: value} dict ({} when absent)."""
    raw = tag.get("style")
    if isinstance(raw, list):
        # bs4 can hand multi-valued attributes back as a list of strings.
        raw = ";".join(raw)
    if isinstance(raw, str):
        return _parse_style_attr(raw)
    return {}
def _get_inline_property(tag: Tag, prop: str) -> str:
    """Value of one inline-style property on *tag* ('' when not set)."""
    return _el_style_dict(tag).get(prop.lower(), "")
def _outer_html_sample(tag: Tag, limit: int = 200) -> str:
s = str(tag)
return s[:limit] if len(s) > limit else s
def _is_under(node: Tag | None, ancestor: Tag | None) -> bool:
if node is None or ancestor is None:
return False
p: Tag | None = node
while p is not None:
if p is ancestor:
return True
p = p.parent
return False
def _body_text(soup: BeautifulSoup) -> str:
    """Visible text of <body>, or of the whole document when no body parsed.

    Note: an empty <body> tag is falsy in bs4, so it also falls back to the
    whole soup — same truthiness the original check relied on.
    """
    root = soup.body or soup
    return root.get_text("\n", strip=True)
def _parse_page_margins_from_html(raw_html: str) -> dict[str, str] | None:
"""从 <style> 中粗提取 @page 块内 margin 与 size。"""
for m in re.finditer(
r"@page\s*\{([^}]+)\}",
raw_html,
re.I | re.DOTALL,
):
block = m.group(1)
msh = re.search(r"margin\s*:\s*([^;]+);", block, re.I)
if msh:
return {"shorthand": msh.group(1).strip()}
margins: dict[str, str] = {}
for name, key in (
(r"margin-top\s*:\s*([^;]+)", "top"),
(r"margin-bottom\s*:\s*([^;]+)", "bottom"),
(r"margin-left\s*:\s*([^;]+)", "left"),
(r"margin-right\s*:\s*([^;]+)", "right"),
(r"size\s*:\s*([^;]+)", "size"),
):
mm = re.search(name, block, re.I)
if mm:
margins[key] = mm.group(1).strip()
if margins:
return margins
return None
def check_technical_bid(html_content: str) -> dict[str, Any]:
    """
    Run the format checks for an anonymized ("dark") technical-bid HTML.

    Seven rules are evaluated in order (identity hiding, heading format,
    body format, table of contents, charts/appendix, color & decoration,
    page setup).  Returns a structure matching 清标数据.json:
    ``overall`` (bool), ``details`` (one entry per rule) and ``violations``
    (failed rules with truncated HTML snippets of offending elements).
    Only inline ``style`` attributes and ``@page`` rules inside <style>
    are considered, so documents without inline styles may fail some rules.
    """
    results: dict[str, Any] = {
        "overall": True,
        "details": [],
        "violations": [],
    }

    def add_result(
        rule_name: str,
        passed: bool,
        message: str,
        elements: list[Tag] | None = None,
    ) -> None:
        # Record one rule outcome; a failure also flips the overall flag and
        # stores up-to-200-char snippets of the offending elements.
        results["details"].append(
            {"rule": rule_name, "passed": passed, "message": message}
        )
        if not passed:
            results["overall"] = False
            el_snips: list[str] = []
            for el in elements or []:
                if isinstance(el, Tag):
                    el_snips.append(_outer_html_sample(el))
            results["violations"].append(
                {"rule": rule_name, "message": message, "elements": el_snips}
            )

    if not (html_content or "").strip():
        add_result("身份信息隐藏", False, "HTML 内容为空", [])
        return results
    raw_html = html_content
    soup = BeautifulSoup(html_content, "lxml")
    if not soup.body:
        # Fragment without <body>: re-parse wrapped so body-based checks work.
        soup = BeautifulSoup(f"<html><body>{html_content}</body></html>", "lxml")
    body = soup.body
    if not body:
        add_result("身份信息隐藏", False, "无法解析 body", [])
        return results

    # ---- 1. Identity hiding ----
    # Heuristic regexes for company names, addresses and personal names;
    # the character classes deliberately use full-width CJK punctuation.
    body_text = _body_text(soup)
    company_pattern = re.compile(
        r"(?:我公司|本公司|[(]?[A-Za-z\u4e00-\u9fa5]+(?:集团|股份|有限|责任|公司)[)]?)"
    )
    addr_pattern = re.compile(
        r"(?:省|市|区|县|镇|路|街|大道|号|大厦|楼|层)[\u4e00-\u9fa50-9]+"
    )
    name_pattern = re.compile(
        r"(?:总监理工程师|专业监理工程师|技术负责人|项目经理)[:]\s*"
        r"[^甲乙丙丁戊己庚辛壬癸\s]{2,4}(?=[,。;\s]|$)"
    )
    found_company = bool(company_pattern.search(body_text))
    found_addr = bool(addr_pattern.search(body_text))
    found_name = bool(name_pattern.search(body_text))
    has_logo = False
    for img in soup.find_all("img"):
        if not isinstance(img, Tag):
            continue
        # `+ ""` is a no-op kept from the JS port (string coercion there).
        alt = (img.get("alt") or "") + ""
        src = (img.get("src") or "") + ""
        if re.search(r"logo|商标|微标|公司|品牌", alt, re.I) or re.search(
            r"logo", src, re.I
        ):
            has_logo = True
            break
    passed_id = not (
        found_company or found_addr or found_name or has_logo
    )
    add_result(
        "身份信息隐藏",
        passed_id,
        "未发现投标人身份信息"
        if passed_id
        else "发现投标人身份信息(公司名/地址/真实姓名/商标)",
    )

    def heading_style_ok(tag: Tag) -> bool:
        # A heading must be 三号 (16pt ±3px) bold SimHei/YaHei, non-italic,
        # no underline, black or inherited color — judged from inline styles.
        st = _el_style_dict(tag)
        fs_raw = st.get("font-size", "")
        fs_px = _length_to_px(fs_raw)
        if "em" in (fs_raw or "").lower() and "rem" not in (fs_raw or "").lower():
            # em is resolved against the 16px browser default here.
            fs_px = _num(fs_raw) * 16.0
        size_ok = abs(fs_px - _TARGET_H) <= 3
        fam = (st.get("font-family") or "").lower()
        font_ok = "黑体" in fam or "simhei" in fam or "microsoft yahei" in fam
        font_style = (st.get("font-style") or "").lower()
        style_ok = font_style != "italic"
        text_dec = (st.get("text-decoration") or "").lower()
        decor_ok = "underline" not in text_dec
        cr = (st.get("color") or "").strip().lower()
        if not cr or cr in ("inherit", "initial"):
            color_ok = True
        else:
            cn = _color_normalized(st)
            color_ok = cn == "rgb(0, 0, 0)" or cr in (
                "#000",
                "#000000",
                "black",
                "rgb(0,0,0)",
            )
        fw = (st.get("font-weight") or "400").lower()
        # Headings must be bolder than normal.
        weight_ok = fw not in ("400", "normal")
        if not st.get("font-size"):
            # Without an inline font-size the size cannot be verified: fail.
            size_ok = False
        return (
            size_ok
            and font_ok
            and style_ok
            and decor_ok
            and color_ok
            and weight_ok
        )

    # ---- 2. Heading format ----
    heading_tags: list[Tag] = []
    for sel in ("h1", "h2", "h3", "h4", "h5", "h6"):
        heading_tags.extend(soup.find_all(sel))
    for t in soup.find_all(attrs={"role": "heading"}):
        if isinstance(t, Tag):
            heading_tags.append(t)
    for t in soup.select(".heading, .title"):
        if isinstance(t, Tag) and t not in heading_tags:
            heading_tags.append(t)
    invalid_h: list[Tag] = []
    for h in heading_tags:
        if not isinstance(h, Tag):
            continue
        if not heading_style_ok(h):
            invalid_h.append(h)
    h_ok = len(invalid_h) == 0
    add_result(
        "标题格式",
        h_ok,
        "所有标题符合三号黑体要求"
        if h_ok
        else "部分标题字号/字体/颜色/下划线不符合要求",
        invalid_h,
    )

    def body_el_ok(el: Tag) -> bool:
        # Body text must be 四号 (14pt ±2px) SimSun, black, ~2em first-line
        # indent, 26pt (±2px) line-height, no underline, normal weight,
        # non-italic.  Headings, header/footer/TOC containers and empty
        # elements are exempt.
        st = _el_style_dict(el)
        if el.name in ("h1", "h2", "h3", "h4", "h5", "h6"):
            return True
        cls = " ".join(el.get("class", [])) if el.get("class") else ""
        if any(
            x in cls
            for x in ("header", "footer", "toc", "目录", "table-of-contents")
        ):
            return True
        text = el.get_text(strip=True)
        if not text:
            return True
        fs_raw = st.get("font-size", "")
        font_px = _length_to_px(fs_raw)
        if not fs_raw:
            # No inline font-size: cannot verify, treat as failing.
            return False
        size_ok = abs(font_px - _TARGET_BODY) <= 2
        fam = (st.get("font-family") or "").lower()
        font_ok = "宋体" in fam or "simsun" in fam or "serif" in fam
        col = st.get("color", "")
        color_ok = (not col) or _color_normalized(st) == "rgb(0, 0, 0)" or col.lower() in (
            "#000",
            "#000000",
            "black",
            "rgb(0,0,0)",
        )
        ind = _indent_value(st, font_px)
        # ind == ind filters NaN (missing/unparseable text-indent).
        indent_ok = ind == ind and 1.8 <= ind <= 2.2
        lh_raw = (st.get("line-height") or "").strip()
        if not lh_raw:
            line_ok = False
        else:
            if "pt" in lh_raw or "px" in lh_raw:
                lh_px = _length_to_px(lh_raw, font_px)
            elif re.match(r"^[\d.]+$", lh_raw):
                # Unitless line-height multiplies the font size.
                lh_px = float(lh_raw) * font_px
            else:
                lh_px = _length_to_px(lh_raw, font_px)
            line_ok = abs(lh_px - _TARGET_LH) <= 2
        tdec = (st.get("text-decoration") or "").lower()
        decor_ok = "underline" not in tdec
        fw = (st.get("font-weight") or "400").lower()
        weight_ok = fw in ("400", "normal", "")
        fst = (st.get("font-style") or "").lower()
        style_ok = fst != "italic"
        return (
            size_ok
            and font_ok
            and color_ok
            and indent_ok
            and line_ok
            and decor_ok
            and weight_ok
            and style_ok
        )

    # ---- 3. Body format (no numbered banner in the original) ----
    exclude_set = {
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
    }
    invalid_body: list[Tag] = []
    for el in soup.find_all(["p", "div", "span", "li", "td", "th"]):
        if not isinstance(el, Tag):
            continue
        if el.name in exclude_set:
            continue
        if "header" in " ".join(el.get("class", [])):
            continue
        if "footer" in " ".join(el.get("class", [])):
            continue
        if "toc" in " ".join(el.get("class", [])) or "目录" in " ".join(
            el.get("class", [])
        ):
            continue
        if not el.get_text(strip=True):
            continue
        if not body_el_ok(el):
            invalid_body.append(el)
    b_ok = len(invalid_body) == 0
    add_result(
        "正文格式",
        b_ok,
        "所有正文符合四号宋体/缩进/行距/颜色要求"
        if b_ok
        else "部分正文段落格式不符合要求",
        invalid_body,
    )

    # ---- 4. Table of contents ----
    toc_els: list[Tag] = []
    for cls in ("toc", "table-of-contents", "目录"):
        for t in soup.find_all(class_=cls):
            if isinstance(t, Tag) and t not in toc_els:
                toc_els.append(t)
    for t in soup.find_all(attrs={"role": "directory"}):
        if isinstance(t, Tag) and t not in toc_els:
            toc_els.append(t)
    if not toc_els:
        add_result("目录要求", False, "未检测到目录,请确保包含目录且目录无页码无页眉页脚")
    else:
        no_pn = True
        no_hf = True
        for toc in toc_els:
            text = toc.get_text("\n", strip=True)
            lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
            for line in lines:
                # NOTE(review): the nested conditions below are redundant —
                # every path that clears no_pn requires the dot-leader +
                # page-number pattern, which the standalone check after the
                # nest also tests.  Kept verbatim from the JS port.
                if re.search(r"\d+\s*$", line) and re.search(r"\d$", line):
                    if re.search(r"\.{2,}\s*\d+", line) or re.match(
                        r"^.*\d$", line
                    ):
                        if re.search(r"\.{2,}\s*\d+", line):
                            no_pn = False
                if re.search(r"\.{2,}\s*\d+", line):
                    no_pn = False
            if toc.find(class_=re.compile("header|page-header", re.I)):
                no_hf = False
            if toc.find(class_=re.compile("footer|page-footer", re.I)):
                no_hf = False
        t_ok = no_pn and no_hf
        add_result(
            "目录要求",
            t_ok,
            "目录符合无页码、无页眉页脚要求"
            if t_ok
            else "目录中存在页码或页眉页脚",
        )

    # ---- 5. Charts / appendix (valid CSS selectors only) ----
    # Tables, images, figures and .chart elements are only allowed inside
    # the first appendix container found below.
    appendix: Tag | None = None
    for sel in (
        "#appendix",
        ".appendix",
        ".attachment",
        '[id*="附件"]',
        '[class*="附件"]',
        '[class*="附表"]',
    ):
        hit = soup.select_one(sel)
        if hit and isinstance(hit, Tag):
            appendix = hit
            break
    illegal: list[Tag] = []
    for tbl in soup.find_all("table"):
        if isinstance(tbl, Tag) and not _is_under(tbl, appendix):
            illegal.append(tbl)
    for im in soup.find_all("img"):
        if isinstance(im, Tag) and not _is_under(im, appendix):
            illegal.append(im)
    for el in soup.find_all("figure"):
        if isinstance(el, Tag) and not _is_under(el, appendix):
            illegal.append(el)
    for el in soup.find_all(class_="chart"):
        if isinstance(el, Tag) and not _is_under(el, appendix) and el not in illegal:
            illegal.append(el)
    # Text inside appendix charts must be 五号 (10.5pt ±1.5px) SimSun, black.
    chart_text_valid = True
    if appendix:
        for el in appendix.select("table, td, th, figcaption, .chart-text"):
            if not isinstance(el, Tag):
                continue
            st = _el_style_dict(el)
            if not st.get("font-size"):
                continue
            fs = _length_to_px(st.get("font-size", ""))
            size_ok = abs(fs - _TARGET_FIG) <= 1.5
            fam = (st.get("font-family") or "").lower()
            font_ok = "宋体" in fam or "simsun" in fam
            c_raw = (st.get("color") or "").strip()
            if c_raw and c_raw.lower() not in ("inherit", "initial"):
                c_ok = _color_normalized(st) == "rgb(0, 0, 0)" or c_raw.lower() in (
                    "#000",
                    "#000000",
                    "black",
                    "rgb(0,0,0)",
                )
            else:
                c_ok = True
            if not (size_ok and font_ok and c_ok):
                chart_text_valid = False
    c_ok2 = len(illegal) == 0 and chart_text_valid
    add_result(
        "图表规范",
        c_ok2,
        "图表仅出现在附件/附表内,且图表文字符合五号宋体"
        if c_ok2
        else f"正文中发现{len(illegal)}个图表或附件内图表文字格式错误",
        illegal,
    )

    # ---- 6. Color & decoration ----
    color_v: list[Tag] = []
    decor_v: list[Tag] = []
    for el in soup.find_all(True):
        if not isinstance(el, Tag):
            continue
        st = _el_style_dict(el)
        if not st.get("color") and not st.get("text-decoration") and not st.get(
            "border-bottom-style"
        ):
            continue
        col = (st.get("color") or "").strip().lower()
        if col and col not in (
            "inherit",
            "initial",
            "",
            "#000",
            "#000000",
            "black",
            "rgb(0,0,0)",
            "rgb(0, 0, 0)",
        ):
            if _color_normalized(st) and _color_normalized(st) != "rgb(0, 0, 0)":
                if el.get_text(strip=True):
                    color_v.append(el)
        tdec = (st.get("text-decoration") or "").lower()
        if "underline" in tdec and el.get_text(strip=True):
            decor_v.append(el)
        # A solid/dotted bottom border on text is treated as an emphasis mark.
        bbs = (st.get("border-bottom-style") or "").lower()
        if bbs in ("solid", "dotted") and el.get_text(strip=True):
            decor_v.append(el)
    col_ok = len(color_v) == 0 and len(decor_v) == 0
    add_result(
        "颜色与装饰",
        col_ok,
        "无彩色文字、无下划线、无着重号"
        if col_ok
        else f"发现{len(color_v)}处彩色文字,{len(decor_v)}处下划线/着重号",
        (color_v + decor_v)[:20],  # cap the snippet list at 20 elements
    )

    # ---- 7. Page setup ----
    # NOTE(review): page_valid is assigned here and possibly below, but is
    # never read afterwards — page_ok alone decides the rule outcome.
    page_valid = True
    margin_top = margin_bottom = margin_left = margin_right = None
    page_info = _parse_page_margins_from_html(raw_html)
    # Word usually writes @page inside <style>; parsed from raw_html above.
    if page_info and "shorthand" in page_info:
        # e.g. margin: 2.54cm 3.18cm
        parts = page_info["shorthand"].split()
        if len(parts) >= 4:
            # CSS order: top right bottom left.
            margin_top, margin_right, margin_bottom, margin_left = (
                parts[0],
                parts[1],
                parts[2],
                parts[3],
            )
        elif len(parts) == 2:
            margin_top = margin_bottom = parts[0]
            margin_left = margin_right = parts[1]
    elif page_info:
        margin_top = page_info.get("top")
        margin_bottom = page_info.get("bottom")
        margin_left = page_info.get("left")
        margin_right = page_info.get("right")
    # Fallback 1: a margin shorthand inline on <body>.
    bst = _el_style_dict(body) if body else {}
    mraw = bst.get("margin", "")
    if mraw and not margin_top:
        margins = mraw.split()
        if len(margins) >= 1:
            margin_top = margins[0]
        if len(margins) >= 2:
            margin_right = margins[1]
        if len(margins) >= 3:
            margin_bottom = margins[2]
        if len(margins) >= 4:
            margin_left = margins[3]
        else:
            margin_left = margin_right
    # Fallback 2: individual margin-* properties inline on <body>.
    if not margin_top and body:
        margin_top = _get_inline_property(body, "margin-top")
        margin_bottom = _get_inline_property(body, "margin-bottom")
        margin_left = _get_inline_property(body, "margin-left")
        margin_right = _get_inline_property(body, "margin-right")
    if not any([margin_top, margin_bottom, margin_left, margin_right]) and not page_info:
        page_valid = False

    def m_ok(
        m: str | None,
        target: float,
    ) -> bool:
        # Compare the numeric part of a margin against target (in cm).
        # NOTE(review): both branches are identical — the "cm" special case
        # currently has no effect, so non-cm units are compared as if cm.
        if not m:
            return False
        s = m.strip()
        if "cm" in s:
            return abs(_num(s) - target) < 0.01
        return abs(_num(s) - target) < 0.01

    # Expected A4 margins: top/bottom 2.54cm (1in), left/right 3.18cm.
    top_ok = m_ok(margin_top, 2.54) or (
        (margin_top or "") in ("2.54cm", "1in")
    )
    bottom_ok = m_ok(margin_bottom, 2.54) or (
        (margin_bottom or "") in ("2.54cm", "1in")
    )
    left_ok = m_ok(margin_left, 3.18) or (margin_left or "").startswith("3.18")
    right_ok = m_ok(margin_right, 3.18) or (margin_right or "").startswith("3.18")
    html_tag = soup.find("html")
    w = _get_inline_property(html_tag, "width") if isinstance(html_tag, Tag) else ""  # type: ignore[arg-type]
    # Any explicit non-auto, non-percentage width on <html> counts as
    # landscape, which the conjunction below always rejects.
    page_orientation = "横向" if w and w != "auto" and "%" not in w else "纵向"
    page_ok = bool(
        top_ok
        and bottom_ok
        and left_ok
        and right_ok
        and (page_orientation != "横向" or w in ("", "auto"))
    )
    if not margin_top:
        page_ok = False
    add_result(
        "页面设置",
        page_ok,
        "页面设置符合A4纵向/边距要求"
        if page_ok
        else "页面边距或纸张方向不符合要求",
    )
    return results