tech-bid-manage20260422/utils/attachment_section.py
2026-04-23 14:36:26 +08:00

187 lines
6.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
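Attachment-type section handling: title matching, skipping such sections in
expand_outline, and body-mode selection (full body text vs. a single figure or
table only).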
"""
from __future__ import annotations
import json
import logging
import os
import re
from typing import Any, Dict, List, Optional
import config
# Logger named after this module; used for the warnings emitted when the rules
# file cannot be loaded or a configured title regex is invalid.
logger = logging.getLogger(__name__)
# Built-in rule set. Keys found in attachment_section_rules.json replace the
# matching keys here when the rules are loaded.
DEFAULT_ATTACHMENT_RULES: Dict[str, Any] = {
    'schema_version': 1,
    # Patterns are applied with re.search to a section title; one hit is enough
    # to classify the section as an attachment. Every pattern is anchored on
    # the character 附 so that ordinary titles that merely contain 图 or 表 are
    # not misclassified. Both ASCII and fullwidth punctuation are accepted.
    'title_regex': [
        r'附件\s*[一二三四五六七八九十0-9A-Za-z、::..]',
        r'附\s*图',
        r'附\s*表',
        r'附\s*件\s*[((]',
        r'^\s*[\d一二三四五六七八九十\..、]+\s*附件',
    ],
    # Substrings that make a title count as a table attachment.
    'table_hint_keywords': [
        '附表', '一览表', '清单表', '统计表', '明细表',
    ],
    # Substrings that make a title count as a figure attachment.
    'figure_hint_keywords': [
        '附图', '示意图', '平面图', '流程图', '布置图', '组织图', '横道',
    ],
    # Kind returned when neither keywords nor the bare 图/表 characters decide.
    'default_kind_when_ambiguous': 'table',
    # stack_charts_only (default): the intent stack emits only [FIGURE]/[TABLE]
    # blocks and no body text; full: long-form body text; single_chart_only:
    # only one block, taken from the top of the stack.
    'attachment_leaf_body_mode': 'stack_charts_only',
}
def attachment_rules_path() -> str:
    """Return the path of ``attachment_section_rules.json`` inside ``config.DATA_DIR``."""
    rules_filename = 'attachment_section_rules.json'
    return os.path.join(config.DATA_DIR, rules_filename)
def load_attachment_rules(path: Optional[str] = None) -> Dict[str, Any]:
    """Load the attachment rules: built-in defaults overlaid with the JSON rules file.

    Args:
        path: Rules file to read. When falsy, ``attachment_rules_path()`` is used.

    Returns:
        A new dict seeded from ``DEFAULT_ATTACHMENT_RULES``. Every top-level key
        of the JSON object replaces the default of the same name, except keys
        starting with ``_``, which are ignored. If the file is missing,
        unreadable, malformed, or not a JSON object, the defaults are returned.
    """
    rules_file = path or attachment_rules_path()
    # Copy list values too: a plain dict() copy would hand out the very list
    # objects stored in the module-level defaults, so a caller mutating the
    # result would silently change the defaults for everyone else.
    merged: Dict[str, Any] = {
        key: list(value) if isinstance(value, list) else value
        for key, value in DEFAULT_ATTACHMENT_RULES.items()
    }
    if not os.path.isfile(rules_file):
        return merged
    try:
        with open(rules_file, encoding='utf-8') as f:
            raw = json.load(f)
    except Exception as e:
        # Deliberately best effort: a broken rules file must not break callers.
        logger.warning('加载 attachment_section_rules.json 失败,使用内置默认: %s', e)
        return merged
    if not isinstance(raw, dict):
        # Valid JSON but not an object; report it instead of ignoring it silently.
        logger.warning(
            'attachment_section_rules.json 顶层不是 JSON 对象,使用内置默认: %s',
            type(raw).__name__,
        )
        return merged
    for key, value in raw.items():
        if not key.startswith('_'):
            merged[key] = value
    return merged
def attachment_leaf_body_mode(rules: Optional[Dict[str, Any]] = None) -> str:
    """Return the body strategy configured for attachment leaf nodes.

    Recognised values of the ``attachment_leaf_body_mode`` rule are
    ``stack_charts_only`` (default), ``full`` (complete technical body text)
    and ``single_chart_only`` (only one figure or table from the top of the
    stack). Matching ignores case and surrounding whitespace.

    Args:
        rules: Rule dict to read; when falsy, the cached rules are used.

    Returns:
        One of the three mode names. Missing, empty, non-string or unknown
        values yield ``'stack_charts_only'``.
    """
    active_rules = rules or get_attachment_rules_cached()
    configured = active_rules.get('attachment_leaf_body_mode')
    # A rules file may hold a non-string here (number, list, ...); treat it as
    # unrecognised instead of failing on ``.strip()``.
    if not isinstance(configured, str):
        return 'stack_charts_only'
    mode = configured.strip().lower()
    if mode in ('single_chart_only', 'stack_charts_only', 'full'):
        return mode
    return 'stack_charts_only'
def use_attachment_stack_charts_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """Tell whether attachments take the "figures/tables only, no long text" path.

    True for ``stack_charts_only`` and also for ``single_chart_only``, which is
    the single-stack-top variant of the same path.
    """
    chart_only_modes = ('stack_charts_only', 'single_chart_only')
    return attachment_leaf_body_mode(rules) in chart_only_modes
def use_attachment_single_chart_only_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """Return True when the attachment body mode resolves to ``single_chart_only``."""
    mode = attachment_leaf_body_mode(rules)
    return mode == 'single_chart_only'
def use_attachment_full_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """Return True when the attachment body mode resolves to ``full``."""
    mode = attachment_leaf_body_mode(rules)
    return mode == 'full'
def is_attachment_only_section(section_title: str, rules: Optional[Dict[str, Any]] = None) -> bool:
    """Tell whether a title denotes an attachment-type section.

    Used for skipping in expand_outline and for choosing the body branch
    (attached figure / attached table / "attachment N" style titles).

    Args:
        section_title: Title to classify; ``None`` or blank yields ``False``.
        rules: Rule dict to read ``title_regex`` from; when falsy, the rules
            are loaded with ``load_attachment_rules()``.

    Returns:
        ``True`` if any configured pattern is found in the stripped title.
        Invalid patterns are logged and skipped.
    """
    title = (section_title or '').strip()
    if not title:
        return False
    active_rules = rules or load_attachment_rules()
    configured = active_rules.get('title_regex') or []
    # A bare string is one pattern. Passing it to list() would split it into
    # single-character patterns that match almost every title.
    patterns: List[Any] = [configured] if isinstance(configured, str) else list(configured)
    for pat in patterns:
        try:
            if re.search(pat, title):
                return True
        except (re.error, TypeError):
            # TypeError covers non-string entries (e.g. a number in the JSON);
            # str() keeps the log call itself from failing on them.
            logger.warning('无效 attachment title_regex已跳过: %s', str(pat)[:80])
    return False
def pick_single_figure_or_table(
    section_title: str,
    enable_figure: bool,
    enable_table: bool,
    rules: Optional[Dict[str, Any]] = None,
) -> Optional[str]:
    """Choose whether an attachment section should carry a figure or a table.

    Args:
        section_title: Section title inspected for hints; ``None`` is treated
            as an empty title.
        enable_figure: Whether figures are allowed.
        enable_table: Whether tables are allowed.
        rules: Rule dict providing ``table_hint_keywords``,
            ``figure_hint_keywords`` and ``default_kind_when_ambiguous``; when
            falsy, the rules are loaded with ``load_attachment_rules()``.

    Returns:
        ``'figure'``, ``'table'`` or ``None``. ``None`` when both switches are
        off; the only enabled kind when exactly one switch is on. With both on,
        the order is: table keywords, figure keywords, a bare 表 without 图,
        a bare 图 without 表, then the configured default (``'table'`` unless
        it is ``'figure'``).
    """
    if not enable_figure and not enable_table:
        return None
    if not enable_table:
        return 'figure'
    if not enable_figure:
        return 'table'

    active_rules = rules or load_attachment_rules()
    title = section_title or ''

    # Specific keywords are checked first because the single character 表 on
    # its own is easy to misjudge. Table keywords take precedence over figure
    # keywords.
    for kind, rule_key in (('table', 'table_hint_keywords'), ('figure', 'figure_hint_keywords')):
        keywords = active_rules.get(rule_key) or []
        if isinstance(keywords, str):
            # A bare string is one keyword, not a sequence of characters.
            keywords = [keywords]
        for kw in keywords:
            if isinstance(kw, str) and kw and kw in title:
                return kind

    # Generic fallback: a lone 表 (or 图) is common in attachment titles and
    # decides the kind as long as the other character is absent.
    has_table_char = '表' in title
    has_figure_char = '图' in title
    if has_table_char and not has_figure_char:
        return 'table'
    if has_figure_char and not has_table_char:
        return 'figure'

    default = active_rules.get('default_kind_when_ambiguous')
    if isinstance(default, str) and default.strip().lower() == 'figure':
        return 'figure'
    return 'table'
# Module-level memo for the loaded rules; stays None until the first lookup.
_cached_rules: Optional[Dict[str, Any]] = None


def get_attachment_rules_cached() -> Dict[str, Any]:
    """Return the attachment rules, calling ``load_attachment_rules()`` only on first use."""
    global _cached_rules
    if _cached_rules is not None:
        return _cached_rules
    _cached_rules = load_attachment_rules()
    return _cached_rules
def should_skip_expand_subchapters(title: str) -> bool:
    """Tell whether sub-chapter generation should be skipped for ``title``.

    During AI auto-fill of sub-sections (expand_outline), attachment-type
    top-level chapters do not get sub-chapters generated. The decision is the
    one made by ``is_attachment_only_section``, evaluated against the cached
    rules.
    """
    cached_rules = get_attachment_rules_cached()
    return is_attachment_only_section(title, cached_rules)
# Extracts the label that follows "附件" (e.g. the "三" in "附件三") for logging.
# An optional ASCII or fullwidth colon may sit between "附件" and the label.
_ATTACHMENT_LABEL_RE = re.compile(
    r'附件\s*[::]?\s*([一二三四五六七八九十百0-9A-Za-z]+)',
)
# Matches "附图" / "附表" (whitespace allowed in between) and captures an
# optional trailing number in group 1.
_ATTACHMENT_FIG_TBL_RE = re.compile(r'附\s*[图表]\s*([一二三四五六七八九十百0-9]*)')


def parse_attachment_label(title: str) -> Optional[str]:
    """Extract a short attachment label from a section title, for logging.

    Args:
        title: Section title; ``None`` or blank yields ``None``.

    Returns:
        The number or letter following "附件" if present. Otherwise, for a
        "附图" / "附表" title, its trailing number, or the literal ``'附图'`` /
        ``'附表'`` when no number follows. ``None`` when the title carries no
        attachment marker.
    """
    text = (title or '').strip()
    if not text:
        return None

    label_match = _ATTACHMENT_LABEL_RE.search(text)
    if label_match:
        return label_match.group(1).strip() or None

    fig_tbl_match = _ATTACHMENT_FIG_TBL_RE.search(text)
    if fig_tbl_match:
        number = (fig_tbl_match.group(1) or '').strip()
        if number:
            return number
        # No number: report which kind of attachment the title named.
        return '附图' if '图' in fig_tbl_match.group(0) else '附表'

    # No further fallback is needed: a separate search for "附图" / "附表"
    # could only succeed where the pattern above has already matched.
    return None