187 lines
6.0 KiB
Python
187 lines
6.0 KiB
Python
"""
|
||
附件类章节识别:标题匹配、expand_outline 跳过、以及正文模式(完整正文 vs 仅单图单表)。
|
||
"""
|
||
from __future__ import annotations

import copy
import json
import logging
import os
import re
from typing import Any, Dict, List, Optional

import config
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||
|
||
DEFAULT_ATTACHMENT_RULES: Dict[str, Any] = {
    'schema_version': 1,
    # Regular expressions tried with re.search against a section title.
    'title_regex': [
        r'附件\s*[一二三四五六七八九十0-9A-Za-z、::.]',
        r'附\s*图',
        r'附\s*表',
        r'附\s*件\s*\(',
        r'^\s*[\d一二三四五六七八九十\..、]+\s*附件',
    ],
    # Substrings of a title that hint at a table.
    'table_hint_keywords': [
        '附表', '一览表', '清单表', '统计表', '明细表',
    ],
    # Substrings of a title that hint at a figure.
    'figure_hint_keywords': [
        '附图', '示意图', '平面图', '流程图', '布置图', '组织图', '横道',
    ],
    # Kind chosen when the title gives no usable hint.
    'default_kind_when_ambiguous': 'table',
    # stack_charts_only: default, the intent stack emits only [FIGURE]/[TABLE]
    # with no body text; full: long-form text; single_chart_only: only one
    # block at the top of the stack.
    'attachment_leaf_body_mode': 'stack_charts_only',
}
|
||
|
||
|
||
def attachment_rules_path() -> str:
    """Return the path of ``attachment_section_rules.json`` under ``config.DATA_DIR``."""
    return os.path.join(config.DATA_DIR, 'attachment_section_rules.json')
|
||
|
||
|
||
def load_attachment_rules(path: Optional[str] = None) -> Dict[str, Any]:
    """Load the attachment rules, overlaying the JSON file on the built-in defaults.

    Args:
        path: Rules file to read. When falsy, ``attachment_rules_path()`` is used.

    Returns:
        A new dict seeded from ``DEFAULT_ATTACHMENT_RULES``. If the file exists
        and its root is a JSON object, each top-level key replaces the default
        of the same name; keys starting with ``_`` are ignored. If the file is
        missing, the defaults are returned. If reading or parsing fails, a
        warning is logged and the defaults are returned.
    """
    rules_file = path or attachment_rules_path()
    # Deep copy: a shallow copy would share the default lists with the caller,
    # so mutating the returned rules would silently alter the module defaults.
    data = copy.deepcopy(DEFAULT_ATTACHMENT_RULES)
    if not os.path.isfile(rules_file):
        return data
    try:
        with open(rules_file, encoding='utf-8') as f:
            raw = json.load(f)
    except Exception as e:
        # Best effort by design: a broken rules file must not break callers.
        logger.warning('加载 attachment_section_rules.json 失败,使用内置默认: %s', e)
        return data
    if isinstance(raw, dict):
        data.update({k: v for k, v in raw.items() if not k.startswith('_')})
    return data
|
||
|
||
|
||
def attachment_leaf_body_mode(rules: Optional[Dict[str, Any]] = None) -> str:
    """Return the body strategy for attachment leaf nodes.

    The possible results are ``stack_charts_only`` (default), ``full``
    (complete technical body text) and ``single_chart_only`` (only one figure
    or table at the top of the stack).

    Args:
        rules: Rules dict to read. When falsy, the cached rules from
            ``get_attachment_rules_cached()`` are used.

    Returns:
        The configured mode, stripped and lower-cased, when it is one of the
        three known values; otherwise ``'stack_charts_only'``.
    """
    r = rules or get_attachment_rules_cached()
    configured = r.get('attachment_leaf_body_mode')
    # The value may come from a hand-edited JSON file; a non-string (number,
    # list, ...) is treated as unset instead of crashing on .strip().
    mode = configured.strip().lower() if isinstance(configured, str) else ''
    if mode in ('single_chart_only', 'stack_charts_only', 'full'):
        return mode
    return 'stack_charts_only'
|
||
|
||
|
||
def use_attachment_stack_charts_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """Tell whether attachments take the figures/tables-only path.

    This is the "intent stack with figures/tables only, no long text" path,
    including the single-stack-top variant used by ``single_chart_only``.

    Args:
        rules: Rules dict forwarded to ``attachment_leaf_body_mode``.

    Returns:
        True when the mode is ``stack_charts_only`` or ``single_chart_only``.
    """
    return attachment_leaf_body_mode(rules) in ('stack_charts_only', 'single_chart_only')
|
||
|
||
|
||
def use_attachment_single_chart_only_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """Return True when the attachment leaf body mode is ``single_chart_only``.

    Args:
        rules: Rules dict forwarded to ``attachment_leaf_body_mode``.
    """
    return attachment_leaf_body_mode(rules) == 'single_chart_only'
|
||
|
||
|
||
def use_attachment_full_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """Return True when the attachment leaf body mode is ``full``.

    Args:
        rules: Rules dict forwarded to ``attachment_leaf_body_mode``.
    """
    return attachment_leaf_body_mode(rules) == 'full'
|
||
|
||
|
||
def is_attachment_only_section(section_title: str, rules: Optional[Dict[str, Any]] = None) -> bool:
    """Tell whether a title denotes an attachment-type section.

    Covers titles such as 附图 / 附表 / 附件N; the result is used to skip
    expand_outline and to choose the body branch.

    Args:
        section_title: Title to test; ``None`` or blank yields False.
        rules: Rules dict to read ``title_regex`` from. When falsy, the rules
            are loaded with ``load_attachment_rules()``.

    Returns:
        True as soon as one ``title_regex`` pattern matches the stripped title
        via ``re.search``; False otherwise. Invalid patterns are logged and
        skipped.
    """
    title = (section_title or '').strip()
    if not title:
        return False
    r = rules or load_attachment_rules()
    patterns = r.get('title_regex') or []
    if isinstance(patterns, str):
        # A bare string means one pattern; iterating it would test each
        # character as a separate regex.
        patterns = [patterns]
    for pat in patterns:
        try:
            if re.search(pat, title):
                return True
        except (re.error, TypeError):
            # TypeError covers non-string entries (e.g. null or a number in JSON).
            logger.warning('无效 attachment title_regex,已跳过: %s', str(pat)[:80])
    return False
|
||
|
||
|
||
def pick_single_figure_or_table(
    section_title: str,
    enable_figure: bool,
    enable_table: bool,
    rules: Optional[Dict[str, Any]] = None,
) -> Optional[str]:
    """Choose whether a section should get a single figure or a single table.

    Args:
        section_title: Title inspected for hints; ``None`` is treated as ''.
        enable_figure: Whether figures are allowed.
        enable_table: Whether tables are allowed.
        rules: Rules dict; when falsy, ``load_attachment_rules()`` is used.
            It is consulted only when both switches are on.

    Returns:
        ``'figure'``, ``'table'`` or ``None``. Both switches off gives None;
        exactly one switch on gives that kind. Otherwise the order is: table
        hint keywords, figure hint keywords, a lone 表 or 图 character in the
        title, then ``default_kind_when_ambiguous`` (anything other than
        ``'figure'`` means ``'table'``).
    """
    if not enable_figure and not enable_table:
        return None
    if enable_figure and not enable_table:
        return 'figure'
    if enable_table and not enable_figure:
        return 'table'

    r = rules or load_attachment_rules()
    title = section_title or ''

    # Explicit keywords are checked first because a lone 表 is easy to misjudge.
    # Table keywords take precedence over figure keywords.
    if any(kw in title for kw in _hint_keywords(r, 'table_hint_keywords')):
        return 'table'
    if any(kw in title for kw in _hint_keywords(r, 'figure_hint_keywords')):
        return 'figure'

    # A generic 表 / 图 is common in attachment titles; use it only when
    # exactly one of the two characters is present.
    has_table = '表' in title
    has_figure = '图' in title
    if has_table != has_figure:
        return 'table' if has_table else 'figure'

    default = r.get('default_kind_when_ambiguous')
    # A non-string default (possible in hand-edited JSON) falls back to 'table'.
    if isinstance(default, str) and default.strip().lower() == 'figure':
        return 'figure'
    return 'table'


def _hint_keywords(rules: Dict[str, Any], key: str) -> List[str]:
    """Return the non-empty string keywords stored under ``key`` in ``rules``."""
    raw = rules.get(key) or []
    if isinstance(raw, str):
        # A bare string is one keyword, not a sequence of characters.
        raw = [raw]
    return [kw for kw in raw if isinstance(kw, str) and kw]
|
||
|
||
|
||
# Rules loaded by get_attachment_rules_cached(); None until the first call.
_cached_rules: Optional[Dict[str, Any]] = None


def get_attachment_rules_cached() -> Dict[str, Any]:
    """Return the attachment rules, loading them on first use.

    The first call stores the result of ``load_attachment_rules()`` in the
    module-level ``_cached_rules``; later calls return that same dict without
    reading the file again.
    """
    global _cached_rules
    if _cached_rules is None:
        _cached_rules = load_attachment_rules()
    return _cached_rules
|
||
|
||
|
||
def should_skip_expand_subchapters(title: str) -> bool:
    """Tell whether expand_outline should skip sub-chapter generation for a title.

    During AI auto-fill of sub-sections (expand_outline), attachment-type
    top-level chapters do not invoke sub-chapter generation. The decision is
    the same as ``is_attachment_only_section``, evaluated with the cached rules.

    Args:
        title: Chapter title to test.

    Returns:
        True when the title is an attachment-type section.
    """
    return is_attachment_only_section(title, get_attachment_rules_cached())
|
||
|
||
|
||
# Extract a label such as the N of "附件N" from a title; intended for logging.
_ATTACHMENT_LABEL_RE = re.compile(
    r'附件\s*[::]?\s*([一二三四五六七八九十百0-9A-Za-z]+)',
)
_ATTACHMENT_FIG_TBL_RE = re.compile(r'附\s*[图表]\s*([一二三四五六七八九十百0-9]*)')


def parse_attachment_label(title: str) -> Optional[str]:
    """Extract a short attachment label from a section title.

    Args:
        title: Section title; ``None`` or blank yields ``None``.

    Returns:
        The number or letter run following 附件 (optionally after a colon) when
        present. Otherwise, for a 附图 / 附表 title, its number, or the literal
        ``'附图'`` / ``'附表'`` when no number follows. ``None`` when the title
        matches neither form.
    """
    text = (title or '').strip()
    if not text:
        return None

    numbered = _ATTACHMENT_LABEL_RE.search(text)
    if numbered:
        return numbered.group(1).strip() or None

    fig_or_tbl = _ATTACHMENT_FIG_TBL_RE.search(text)
    if fig_or_tbl is None:
        # No further fallback is needed: this regex already matches every
        # title containing 附图 or 附表, with or without inner spaces.
        return None
    number = (fig_or_tbl.group(1) or '').strip()
    if number:
        return number
    return '附图' if '图' in fig_or_tbl.group(0) else '附表'
|