"""
Attachment-section recognition: title matching, skipping in expand_outline,
and the body mode (full body text vs. figure/table blocks only).
"""
from __future__ import annotations

import json
import logging
import os
import re
from typing import Any, Dict, List, Optional

import config

logger = logging.getLogger(__name__)

DEFAULT_ATTACHMENT_RULES: Dict[str, Any] = {
    'schema_version': 1,
    'title_regex': [
        r'附件\s*[一二三四五六七八九十0-9A-Za-z、::.]',
        r'附\s*图',
        r'附\s*表',
        r'附\s*件\s*\(',
        r'^\s*[\d一二三四五六七八九十\..、]+\s*附件',
    ],
    'table_hint_keywords': [
        '附表',
        '一览表',
        '清单表',
        '统计表',
        '明细表',
    ],
    'figure_hint_keywords': [
        '附图',
        '示意图',
        '平面图',
        '流程图',
        '布置图',
        '组织图',
        '横道',
    ],
    'default_kind_when_ambiguous': 'table',
    # stack_charts_only (default): the intent stack emits only
    # [FIGURE]/[TABLE] blocks with no body text; full: long-form body;
    # single_chart_only: only a single block at the top of the stack.
    'attachment_leaf_body_mode': 'stack_charts_only',
}

_VALID_LEAF_BODY_MODES = ('single_chart_only', 'stack_charts_only', 'full')


def _as_str_list(value: Any) -> List[str]:
    """Coerce a rules value into a list of strings.

    A lone string becomes a one-element list (instead of being split into
    characters by ``list()``); non-string items are dropped; anything that is
    falsy or not iterable yields an empty list.
    """
    if not value:
        return []
    if isinstance(value, str):
        return [value]
    try:
        return [item for item in value if isinstance(item, str)]
    except TypeError:
        return []


def _normalized_choice(value: Any, default: str) -> str:
    """Return ``value`` stripped and lower-cased, or ``default`` when it is
    empty or not a string (e.g. a number in a hand-edited JSON file)."""
    if not isinstance(value, str) or not value:
        return default
    return value.strip().lower()


def attachment_rules_path() -> str:
    """Return the path of ``attachment_section_rules.json`` under ``config.DATA_DIR``."""
    return os.path.join(config.DATA_DIR, 'attachment_section_rules.json')


def load_attachment_rules(path: Optional[str] = None) -> Dict[str, Any]:
    """Load the attachment rules, layering the JSON file over the defaults.

    Args:
        path: JSON file to read; defaults to ``attachment_rules_path()``.

    Returns:
        A new dict starting from ``DEFAULT_ATTACHMENT_RULES`` with every
        top-level key of the JSON object applied on top, except keys that
        start with ``_``. The defaults alone are returned when the file is
        missing, is not a JSON object, or cannot be read/parsed (the failure
        is logged as a warning and never raised).
    """
    p = path or attachment_rules_path()
    # Copy the default lists as well, so that callers mutating the returned
    # rules cannot alter DEFAULT_ATTACHMENT_RULES.
    data: Dict[str, Any] = {
        key: list(value) if isinstance(value, list) else value
        for key, value in DEFAULT_ATTACHMENT_RULES.items()
    }
    if not os.path.isfile(p):
        return data
    try:
        with open(p, encoding='utf-8') as f:
            raw = json.load(f)
        if isinstance(raw, dict):
            for k, v in raw.items():
                if k.startswith('_'):
                    continue
                data[k] = v
    except Exception as e:
        # Deliberately best-effort: a broken rules file must not stop the caller.
        logger.warning('加载 attachment_section_rules.json 失败,使用内置默认: %s', e)
    return data


def attachment_leaf_body_mode(rules: Optional[Dict[str, Any]] = None) -> str:
    """Return the body strategy for attachment leaf sections.

    Args:
        rules: Rules dict; the cached rules are used when omitted or empty.

    Returns:
        One of ``'stack_charts_only'`` (default), ``'full'`` (full body text)
        or ``'single_chart_only'`` (a single figure or table at the top of the
        stack). Missing, non-string or unknown values fall back to
        ``'stack_charts_only'``.
    """
    r = rules or get_attachment_rules_cached()
    mode = _normalized_choice(r.get('attachment_leaf_body_mode'), 'stack_charts_only')
    if mode in _VALID_LEAF_BODY_MODES:
        return mode
    return 'stack_charts_only'


def use_attachment_stack_charts_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """Return True when attachments take the "figures/tables only, no long
    text" path, i.e. the mode is ``stack_charts_only`` or ``single_chart_only``."""
    m = attachment_leaf_body_mode(rules)
    return m in ('stack_charts_only', 'single_chart_only')


def use_attachment_single_chart_only_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """Return True when the leaf body mode is ``single_chart_only``."""
    return attachment_leaf_body_mode(rules) == 'single_chart_only'


def use_attachment_full_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """Return True when the leaf body mode is ``full``."""
    return attachment_leaf_body_mode(rules) == 'full'


def is_attachment_only_section(section_title: str, rules: Optional[Dict[str, Any]] = None) -> bool:
    """Return whether a title denotes an attachment-type section.

    The stripped title is searched against every pattern in
    ``rules['title_regex']``; the first hit wins.

    Args:
        section_title: Section title; empty/None is never an attachment.
        rules: Rules dict; when omitted or empty the rules are loaded with
            ``load_attachment_rules()``.

    Returns:
        True if any pattern matches. Invalid regular expressions are logged
        and skipped; non-string entries are ignored.
    """
    t = (section_title or '').strip()
    if not t:
        return False
    # NOTE(review): this reads the rules file on every call when `rules` is
    # omitted, unlike attachment_leaf_body_mode which uses the cached rules —
    # confirm whether the uncached read is intentional.
    r = rules or load_attachment_rules()
    for pat in _as_str_list(r.get('title_regex')):
        try:
            if re.search(pat, t):
                return True
        except re.error:
            logger.warning('无效 attachment title_regex,已跳过: %s', pat[:80])
    return False


def pick_single_figure_or_table(
    section_title: str,
    enable_figure: bool,
    enable_table: bool,
    rules: Optional[Dict[str, Any]] = None,
) -> Optional[str]:
    """Choose between a figure and a table for an attachment section.

    Args:
        section_title: Section title used for keyword hints.
        enable_figure: Whether figures are allowed.
        enable_table: Whether tables are allowed.
        rules: Rules dict; when omitted or empty the rules are loaded with
            ``load_attachment_rules()``.

    Returns:
        ``None`` when both switches are off; the only enabled kind when
        exactly one is on. With both on, in order: ``'table'`` if a table
        hint keyword occurs in the title, ``'figure'`` if a figure hint
        keyword occurs, then the kind whose bare character (表 / 图) appears
        without the other, and finally ``default_kind_when_ambiguous``
        (anything other than ``'figure'`` means ``'table'``).
    """
    if not enable_figure and not enable_table:
        return None
    if enable_figure and not enable_table:
        return 'figure'
    if enable_table and not enable_figure:
        return 'table'

    r = rules or load_attachment_rules()
    t = section_title or ''
    # A bare "表" is easy to misjudge, so the multi-character hint keywords
    # are consulted first (table keywords before figure keywords).
    if any(kw and kw in t for kw in _as_str_list(r.get('table_hint_keywords'))):
        return 'table'
    if any(kw and kw in t for kw in _as_str_list(r.get('figure_hint_keywords'))):
        return 'figure'
    # A generic "表" / "图" is still a usable hint when only one of them appears.
    if '表' in t and '图' not in t:
        return 'table'
    if '图' in t and '表' not in t:
        return 'figure'
    default = _normalized_choice(r.get('default_kind_when_ambiguous'), 'table')
    if default == 'figure':
        return 'figure'
    return 'table'


_cached_rules: Optional[Dict[str, Any]] = None


def get_attachment_rules_cached() -> Dict[str, Any]:
    """Return the rules, loading them on first use and reusing the same dict
    on every later call."""
    global _cached_rules
    if _cached_rules is None:
        _cached_rules = load_attachment_rules()
    return _cached_rules


def should_skip_expand_subchapters(title: str) -> bool:
    """Return whether expand_outline should skip sub-chapter generation.

    Attachment-type chapters get no generated sub-chapters; the decision is
    ``is_attachment_only_section`` evaluated with the cached rules.
    """
    return is_attachment_only_section(title, get_attachment_rules_cached())
# Extract a label such as the "N" of "附件N" from a title; used for logging.
# An optional colon and an optional opening bracket may precede the label, so
# "附件3", "附件:A" and "附件(一)" all yield a label.
_ATTACHMENT_LABEL_RE = re.compile(
    r'附\s*件\s*[::]?\s*[\((]?\s*([一二三四五六七八九十百0-9A-Za-z]+)',
)
_ATTACHMENT_FIG_TBL_RE = re.compile(r'附\s*[图表]\s*([一二三四五六七八九十百0-9]*)')


def parse_attachment_label(title: str) -> Optional[str]:
    """Return a short attachment label parsed from a section title.

    Args:
        title: Section title; None or blank yields None.

    Returns:
        The numeral/letter run following "附件" when present; otherwise the
        numeral following "附图"/"附表", or the literal '附图' / '附表' when
        no numeral follows; None when the title has no attachment marker.
    """
    t = (title or '').strip()
    if not t:
        return None

    m = _ATTACHMENT_LABEL_RE.search(t)
    if m:
        return m.group(1).strip() or None

    m2 = _ATTACHMENT_FIG_TBL_RE.search(t)
    if m2 is None:
        # This pattern already covers every "附图"/"附表" occurrence, so no
        # further fallback search is needed.
        return None
    rest = (m2.group(1) or '').strip()
    if rest:
        return rest
    # No numeral: report the normalized kind (inner whitespace removed).
    return '附图' if '图' in m2.group(0) else '附表'