commit 909e228a9bb9eb434c0b02b01c9e940786d0275d Author: wanghui Date: Thu Apr 23 14:37:19 2026 +0800 提交 diff --git a/.deps_installed b/.deps_installed new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/.deps_installed @@ -0,0 +1 @@ + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1a7e8c6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.env +*.log +__pycache__/ +*.pyc +.venv/ +venv/ diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..f6906f2 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,10 @@ +# 默认忽略的文件 +/shelf/ +/workspace.xml +# 基于编辑器的 HTTP 客户端请求 +/httpRequests/ +# 已忽略包含查询文件的默认文件夹 +/queries/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..3a0e561 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/tech-bid-manage20260423.iml b/.idea/tech-bid-manage20260423.iml new file mode 100644 index 0000000..6c86ab2 --- /dev/null +++ b/.idea/tech-bid-manage20260423.iml @@ -0,0 +1,16 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d9945ab --- /dev/null +++ b/README.md @@ -0,0 +1,113 @@ +# 标伙伴 · AI 标书助手 + +基于大模型的智能标书生成工具(单机版),支持解析招标文件、自动生成技术标书、导出 Word 文档。 + +## 快速开始 + +### 方式一:双击启动(Windows) + +直接双击 `start.bat`,首次运行会自动安装依赖。 + +### 方式二:命令行启动 + +```bash +# 1. 安装依赖 +pip install -r requirements.txt + +# 2. 
启动应用 +python app.py +``` + +浏览器访问 **http://localhost:5000** + +--- + +## 配置 API Key + +首次使用前,点击右上角 ⚙️ 设置图标,选择模型提供商并填入 API Key: + +| 提供商 | 推荐模型 | 申请地址 | +|--------|---------|---------| +| 通义千问 | qwen-max | https://dashscope.aliyun.com/ | +| DeepSeek | deepseek-chat (V3) | https://platform.deepseek.com/ | +| OpenAI | gpt-4o | https://platform.openai.com/ | + +> **DeepSeek 说明**:deepseek-chat (V3) 性价比极高,推荐用于生产环境。 +> 由于 DeepSeek 暂不提供 Embedding API,使用知识库功能时会自动回退到本地 sentence-transformers 模型(首次使用需下载约 90MB)。 + +也可通过环境变量配置: + +```bash +# 通义千问 +set QWEN_API_KEY=sk-xxxxxxxx +set MODEL_PROVIDER=qwen + +# DeepSeek +set DEEPSEEK_API_KEY=sk-xxxxxxxx +set MODEL_PROVIDER=deepseek + +python app.py +``` + +--- + +## 使用流程 + +1. **新建项目** → 输入项目名称 +2. **上传招标文件** → 支持 PDF / DOC / DOCX +3. **AI 解析** → 自动提取评分要求、资质条件、商务条款 +4. **生成大纲** → 按评分权重生成四级章节目录 +5. **生成内容** → 逐章节或一键全部生成 +6. **合规检查** → 对照招标要求检验覆盖情况 +7. **导出 Word** → 专业排版,直接使用 + +--- + +## 目录结构 + +``` +autorfp/ +├── app.py # Flask 主程序 +├── config.py # 配置文件 +├── requirements.txt # Python 依赖 +├── start.bat # Windows 一键启动 +├── prompts/ # AI 提示词模板 +├── modules/ # 功能模块 +│ ├── parser.py # 招标文件解析 +│ ├── generator.py # 标书内容生成 +│ ├── checker.py # 合规检查 +│ ├── exporter.py # Word 导出 +│ └── knowledge.py # 企业知识库 +├── utils/ # 工具函数 +│ ├── ai_client.py # AI API 封装 +│ ├── file_utils.py # 文件处理 +│ └── prompts.py # 提示词加载 +├── templates/ # HTML 模板 +├── static/ # 静态资源 +└── data/ # 数据目录(自动创建) + ├── projects.db # SQLite 数据库 + ├── uploads/ # 上传的招标文件 + ├── exports/ # 导出的标书 + ├── knowledge/ # 知识库文件 + └── chroma/ # 向量数据库 +``` + +--- + +## 企业知识库 + +在项目页面切换到「知识库」标签,上传历史标书文件。 +系统会自动将文件分块存入向量数据库,生成内容时自动检索相关片段,让 AI 更好地体现企业优势。 + +--- + +## 常见问题 + +**Q: 解析速度很慢?** +A: 招标文件越长耗时越长,通常 30-120 秒。建议使用 qwen-max 或 gpt-4o。 + +**Q: 内容生成失败?** +A: 检查 API Key 是否正确,以及账户余额是否充足。 + +**Q: 导出的 Word 文件乱码?** +A: 请使用 Microsoft Word 2016 及以上版本打开。 diff --git a/app.py b/app.py new file mode 100644 index 0000000..66bbcc4 --- /dev/null +++ b/app.py @@ -0,0 +1,1163 @@ +""" +标伙伴 - AI 标书助手(单机版) 
def _bootstrap_env_file():
    """Load ``KEY=VALUE`` pairs from the ``.env`` file next to the program.

    Runs before ``import config`` so that API keys placed in ``.env`` are
    visible to it. Variables that already exist in ``os.environ`` are never
    overwritten. The function is best-effort: a missing, unreadable or
    undecodable file is silently ignored.

    The base directory is the executable's directory when running frozen
    (``sys.frozen`` set) and this source file's directory otherwise.
    """
    if getattr(sys, 'frozen', False):
        base = os.path.dirname(sys.executable)
    else:
        base = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(base, '.env')
    if not os.path.isfile(path):
        return
    try:
        # utf-8-sig transparently drops a leading BOM; with plain utf-8 the BOM
        # would become part of the first key and that variable would be lost.
        with open(path, encoding='utf-8-sig') as f:
            for raw in f:
                line = raw.strip()
                if not line or line.startswith('#') or '=' not in line:
                    continue
                key, _, val = line.partition('=')
                key, val = key.strip(), val.strip()
                # Remove exactly one pair of matching surrounding quotes, so a
                # quote character that belongs to the value itself is kept.
                if len(val) >= 2 and val[0] == val[-1] and val[0] in '"\'':
                    val = val[1:-1]
                if key and key not in os.environ:
                    os.environ[key] = val
    except (OSError, UnicodeDecodeError):
        # Deliberately best-effort. UnicodeDecodeError is a ValueError, not an
        # OSError, so a non-UTF-8 .env used to abort startup.
        pass
def init_db():
    """Create the data directories, load saved settings and set up the schema.

    Steps, in order:
      1. Create the data, upload, export, knowledge and chroma directories
         named in ``config`` if they do not exist.
      2. Point ``utils.settings`` at ``<DATA_DIR>/settings.json`` and load the
         persisted values (API key etc.) into ``config``.
      3. Switch SQLite to WAL journaling, create the ``projects``,
         ``tender_data``, ``bid_sections`` and ``knowledge_files`` tables if
         they are missing, then apply additive ``ALTER TABLE`` migrations for
         databases created by older versions.

    A migration whose column already exists is skipped quietly; any other
    SQLite error during a migration is logged and the remaining migrations
    still run.
    """
    for d in (config.DATA_DIR, config.UPLOAD_DIR, config.EXPORT_DIR,
              config.KNOWLEDGE_DIR, config.CHROMA_DIR):
        os.makedirs(d, exist_ok=True)

    # Restore the settings saved by the previous run before touching the DB.
    settings_path = os.path.join(config.DATA_DIR, 'settings.json')
    _settings.init(settings_path)
    _settings.load(config)
    logger.info(f'当前模型: {config.MODEL_PROVIDER}')

    conn = sqlite3.connect(config.DB_PATH)
    try:
        cur = conn.cursor()
        # WAL lets readers and a writer work concurrently without blocking.
        cur.execute('PRAGMA journal_mode=WAL')
        # Under WAL a lower sync level is an acceptable speed trade-off.
        cur.execute('PRAGMA synchronous=NORMAL')

        cur.executescript('''
            CREATE TABLE IF NOT EXISTS projects (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                outline_status TEXT DEFAULT 'none',
                outline_error TEXT DEFAULT '',
                anon_requirements TEXT DEFAULT '',
                enable_figure INTEGER DEFAULT 0,
                enable_table INTEGER DEFAULT 0,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );

            CREATE TABLE IF NOT EXISTS tender_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                project_id INTEGER NOT NULL UNIQUE,
                file_name TEXT,
                raw_text TEXT,
                summary TEXT,
                rating_requirements TEXT,
                rating_json TEXT,
                outline TEXT,
                boq_file_name TEXT DEFAULT '',
                boq_text TEXT DEFAULT '',
                boq_summary TEXT DEFAULT '',
                boq_analysis_json TEXT DEFAULT '',
                boq_status TEXT DEFAULT 'none',
                boq_error TEXT DEFAULT '',
                tender_kind TEXT DEFAULT 'engineering',
                status TEXT DEFAULT 'pending',
                error_message TEXT DEFAULT '',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE
            );

            CREATE TABLE IF NOT EXISTS bid_sections (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                project_id INTEGER NOT NULL,
                section_number TEXT,
                section_title TEXT NOT NULL,
                level INTEGER DEFAULT 1,
                is_leaf INTEGER DEFAULT 1,
                content TEXT DEFAULT '',
                intro_content TEXT DEFAULT '',
                order_index INTEGER DEFAULT 0,
                status TEXT DEFAULT 'pending',
                error_message TEXT DEFAULT '',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE
            );

            CREATE TABLE IF NOT EXISTS knowledge_files (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_name TEXT NOT NULL UNIQUE,
                file_path TEXT,
                chunk_count INTEGER DEFAULT 0,
                added_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
        ''')
        conn.commit()

        # Additive migrations for databases created by older versions.
        migrations = [
            ("ALTER TABLE projects ADD COLUMN anon_requirements TEXT DEFAULT ''",
             'projects.anon_requirements'),
            ("ALTER TABLE projects ADD COLUMN enable_figure INTEGER DEFAULT 0",
             'projects.enable_figure'),
            ("ALTER TABLE projects ADD COLUMN enable_table INTEGER DEFAULT 0",
             'projects.enable_table'),
            ("ALTER TABLE tender_data ADD COLUMN boq_file_name TEXT DEFAULT ''",
             'tender_data.boq_file_name'),
            ("ALTER TABLE tender_data ADD COLUMN boq_text TEXT DEFAULT ''",
             'tender_data.boq_text'),
            ("ALTER TABLE tender_data ADD COLUMN boq_summary TEXT DEFAULT ''",
             'tender_data.boq_summary'),
            ("ALTER TABLE tender_data ADD COLUMN boq_status TEXT DEFAULT 'none'",
             'tender_data.boq_status'),
            ("ALTER TABLE tender_data ADD COLUMN boq_error TEXT DEFAULT ''",
             'tender_data.boq_error'),
            ("ALTER TABLE tender_data ADD COLUMN boq_analysis_json TEXT DEFAULT ''",
             'tender_data.boq_analysis_json'),
            ("ALTER TABLE tender_data ADD COLUMN tender_kind TEXT DEFAULT 'engineering'",
             'tender_data.tender_kind'),
            ("ALTER TABLE tender_data ADD COLUMN target_pages INTEGER DEFAULT 0",
             'tender_data.target_pages'),
        ]
        for sql, col in migrations:
            try:
                conn.execute(sql)
            except sqlite3.Error as exc:
                # "duplicate column name" is the expected outcome on an
                # up-to-date database; anything else is a real failure that
                # must be visible in the log instead of being swallowed.
                if 'duplicate column name' not in str(exc).lower():
                    logger.warning(f'数据库迁移失败 {col}: {exc}')
                continue
            conn.commit()
            logger.info(f'数据库迁移:新增 {col} 列')
    finally:
        # Close the connection even when schema creation raises.
        conn.close()
    logger.info('数据库初始化完成')
# The rule must declare ``project_id``; without the URL variable Flask has no
# value to pass to the view.
# NOTE(review): the int converter is assumed — confirm against the original route.
@app.route('/api/projects/<int:project_id>', methods=['DELETE'])
def api_delete_project(project_id):
    """Delete a project together with its tender data and bid sections.

    The schema declares ``ON DELETE CASCADE`` on the child tables, but
    ``get_db()`` opens a plain connection without ``PRAGMA foreign_keys=ON``.
    SQLite leaves foreign-key enforcement off by default (unless the build
    enables it), so the cascade does not fire. The child rows are therefore
    removed explicitly, in the same transaction as the project row.

    Always answers ``{"success": true}``, including when the id does not exist.
    """
    conn = get_db()
    try:
        cur = conn.cursor()
        # Children first, so the order stays valid if FK enforcement is enabled.
        cur.execute("DELETE FROM bid_sections WHERE project_id=?", (project_id,))
        cur.execute("DELETE FROM tender_data WHERE project_id=?", (project_id,))
        cur.execute("DELETE FROM projects WHERE id=?", (project_id,))
        conn.commit()
    finally:
        conn.close()
    return jsonify({'success': True})
# The rule must declare ``project_id``; without the URL variable Flask has no
# value to pass to the view.
# NOTE(review): the int converter is assumed — confirm against the original route.
@app.route('/api/projects/<int:project_id>/upload', methods=['POST'])
def api_upload(project_id):
    """Store an uploaded tender document and mark the project as 'uploaded'.

    Expects a multipart form field named ``file``. The file is saved under
    ``config.UPLOAD_DIR`` as ``<project_id>_<safe filename>`` and the project's
    ``tender_data`` row is inserted, or updated when it already exists, with
    the file name, status ``'uploaded'`` and a cleared error message.

    Returns 400 when the field is missing, the file name is empty, or
    ``allowed_file`` rejects the name.
    """
    from utils.file_utils import allowed_file, safe_filename

    if 'file' not in request.files:
        return jsonify({'error': '未选择文件'}), 400
    f = request.files['file']
    if not f.filename:
        return jsonify({'error': '文件名为空'}), 400
    if not allowed_file(f.filename):
        return jsonify({'error': '仅支持 PDF / DOC / DOCX 格式'}), 400

    filename = safe_filename(f.filename)
    save_path = os.path.join(config.UPLOAD_DIR, f'{project_id}_{filename}')
    f.save(save_path)

    # Create the tender_data row, or reset it for a re-upload.
    conn = get_db()
    try:
        conn.execute('''
            INSERT INTO tender_data (project_id, file_name, status)
            VALUES (?, ?, 'uploaded')
            ON CONFLICT(project_id) DO UPDATE SET file_name=?, status='uploaded', error_message='', updated_at=?
        ''', (project_id, filename, filename, datetime.now()))
        conn.commit()
    finally:
        # Release the connection even if the statement fails.
        conn.close()

    return jsonify({'success': True, 'file_name': filename, 'path': save_path})
# The rule must declare ``project_id``; without the URL variable Flask has no
# value to pass to the view.
# NOTE(review): the int converter is assumed — confirm against the original route.
@app.route('/api/projects/<int:project_id>/upload-boq', methods=['POST'])
def api_upload_boq(project_id):
    """Store an uploaded bill-of-quantities file, independent of the tender file.

    Accepts xlsx/xls/csv/pdf/docx/doc, matched on the lower-cased extension.
    The file is saved under ``config.UPLOAD_DIR`` as
    ``<project_id>_boq_<safe filename>`` and the project's ``tender_data`` row
    is inserted, or updated when it already exists, with the file name,
    ``boq_status='uploaded'`` and a cleared ``boq_error``.

    Returns 400 when the ``file`` field is missing, the name is empty, or the
    extension is not allowed.
    """
    from utils.file_utils import safe_filename

    if 'file' not in request.files:
        return jsonify({'error': '未选择文件'}), 400
    f = request.files['file']
    if not f.filename:
        return jsonify({'error': '文件名为空'}), 400

    ext = os.path.splitext(f.filename)[1].lower()
    allowed_exts = {'.xlsx', '.xls', '.csv', '.pdf', '.docx', '.doc'}
    if ext not in allowed_exts:
        return jsonify({'error': f'不支持的格式 {ext},请使用 xlsx/xls/csv/pdf/docx/doc'}), 400

    filename = safe_filename(f.filename)
    save_path = os.path.join(config.UPLOAD_DIR, f'{project_id}_boq_{filename}')
    f.save(save_path)

    # Make sure a tender_data row exists and records the new file.
    conn = get_db()
    try:
        conn.execute('''
            INSERT INTO tender_data (project_id, boq_file_name, boq_status)
            VALUES (?, ?, 'uploaded')
            ON CONFLICT(project_id) DO UPDATE
            SET boq_file_name=?, boq_status='uploaded', boq_error='', updated_at=?
        ''', (project_id, filename, filename, datetime.now()))
        conn.commit()
    finally:
        # Release the connection even if the statement fails.
        conn.close()

    return jsonify({'success': True, 'file_name': filename, 'path': save_path})
# The rule must declare ``project_id``; without the URL variable Flask has no
# value to pass to the view.
# NOTE(review): the int converter is assumed — confirm against the original route.
@app.route('/api/projects/<int:project_id>/parse-status', methods=['GET'])
def api_parse_status(project_id):
    """Report the parse state of a project's tender document.

    Returns ``{"status": "none"}`` when the project has no ``tender_data`` row.
    Otherwise returns the stored status and error message, flags telling
    whether summary / rating requirements / rating JSON are non-empty, and the
    tender kind (``'engineering'`` when unset).
    """
    conn = get_db()
    try:
        row = conn.execute(
            "SELECT status, error_message, summary, rating_requirements, rating_json, tender_kind "
            "FROM tender_data WHERE project_id=?",
            (project_id,)
        ).fetchone()
    finally:
        conn.close()
    if not row:
        return jsonify({'status': 'none'})
    status, message, summary, rating, rating_json, tender_kind = row
    return jsonify({
        'status': status,
        'message': message,
        'has_summary': bool(summary),
        'has_rating': bool(rating),
        'has_rating_json': bool(rating_json),
        'tender_kind': tender_kind or 'engineering',
    })
# The rule must declare ``project_id``; without the URL variable Flask has no
# value to pass to the view.
# NOTE(review): the int converter is assumed — confirm against the original route.
@app.route('/api/projects/<int:project_id>/generate-outline', methods=['POST'])
def api_generate_outline(project_id):
    """Start outline generation for a project in a background thread.

    Optional JSON body: ``{"force": true}`` to overwrite an existing outline.

    Responses:
      * 404 — the project does not exist.
      * 200 — generation is already running; no new thread is started.
      * 409 — an outline already exists and ``force`` was not set.
      * 200 — the status was set to ``'outline_generating'`` and the worker
        thread was started.
    """
    from modules.generator import generate_outline
    data = request.get_json(silent=True) or {}
    force = bool(data.get('force', False))

    conn = get_db()
    try:
        cur = conn.cursor()
        cur.execute(
            '''
            SELECT p.outline_status, COALESCE(td.outline, '')
            FROM projects p
            LEFT JOIN tender_data td ON td.project_id = p.id
            WHERE p.id = ?
            ''',
            (project_id,),
        )
        row = cur.fetchone()
        if not row:
            return jsonify({'success': False, 'error': '项目不存在'}), 404

        ostatus, outline_text = (row[0] or 'none', row[1] or '')
        if ostatus == 'outline_generating':
            return jsonify({'success': True, 'message': '大纲正在生成中,请稍候'}), 200

        if not force and outline_text.strip():
            return jsonify({
                'success': False,
                'error': '当前项目已有大纲,重新生成会覆盖现有大纲。请确认后以 force=true 再次请求。'
            }), 409

        # Persist the "generating" state before the thread starts, so repeated
        # clicks, polling and page refreshes see a consistent status instead of
        # racing the worker's first SQL statement.
        cur.execute(
            "UPDATE projects SET outline_status=?, outline_error=?, updated_at=? WHERE id=?",
            ('outline_generating', '', datetime.now(), project_id),
        )
        conn.commit()
    finally:
        # Single close point; also covers an exception in the queries above.
        conn.close()

    t = threading.Thread(
        target=generate_outline,
        args=(config.DB_PATH, project_id),
        daemon=True,
    )
    t.start()
    return jsonify({'success': True, 'message': '大纲生成任务已启动'})
# The rule must declare ``project_id``; without the URL variable Flask has no
# value to pass to the view.
# NOTE(review): the int converter is assumed — confirm against the original route.
@app.route('/api/projects/<int:project_id>/outline-status', methods=['GET'])
def api_outline_status(project_id):
    """Return the outline generation status and error text of a project.

    Answers ``{"status": "none"}`` when the project row does not exist.
    """
    conn = get_db()
    try:
        row = conn.execute(
            "SELECT outline_status, outline_error FROM projects WHERE id=?",
            (project_id,),
        ).fetchone()
    finally:
        conn.close()
    if not row:
        return jsonify({'status': 'none'})
    return jsonify({'status': row[0], 'error': row[1]})
# The rule must declare both ``project_id`` and ``section_id``; without the URL
# variables Flask has no values to pass to the view.
# NOTE(review): the int converters are assumed — confirm against the original route.
@app.route('/api/projects/<int:project_id>/sections/<int:section_id>', methods=['GET'])
def api_get_section(project_id, section_id):
    """Return one bid section, including its content and intro content.

    The section is looked up by id AND project id, so a section that belongs
    to another project yields 404 like a missing one.
    """
    conn = get_db()
    try:
        row = conn.execute(
            "SELECT id, section_number, section_title, level, is_leaf, content, intro_content, status "
            "FROM bid_sections WHERE id=? AND project_id=?",
            (section_id, project_id)
        ).fetchone()
    finally:
        conn.close()
    if not row:
        return jsonify({'error': '章节不存在'}), 404
    return jsonify({
        'id': row[0], 'number': row[1], 'title': row[2], 'level': row[3],
        'is_leaf': bool(row[4]), 'content': row[5], 'intro_content': row[6], 'status': row[7],
    })
AND project_id=?", + (content, datetime.now(), section_id, project_id) + ) + conn.commit() + conn.close() + return jsonify({'success': True}) + + +@app.route('/api/projects//sections//chat', methods=['POST']) +def api_section_chat(project_id, section_id): + """ + 对话式章节生成:接受多轮对话历史,结合章节上下文调用 AI,返回新一轮回复。 + 请求体:{ "messages": [{"role": "user"|"assistant", "content": "..."}] } + """ + from utils import ai_client + + data = request.get_json() or {} + messages = data.get('messages', []) + if not messages: + return jsonify({'error': '消息列表不能为空'}), 400 + + conn = get_db() + cur = conn.cursor() + cur.execute( + "SELECT section_title FROM bid_sections WHERE id=? AND project_id=?", + (section_id, project_id) + ) + row = cur.fetchone() + if not row: + conn.close() + return jsonify({'error': '章节不存在'}), 404 + section_title = row[0] + + cur.execute( + "SELECT summary, outline, tender_kind FROM tender_data WHERE project_id=?", + (project_id,), + ) + td = cur.fetchone() + conn.close() + + summary = ((td[0] or '')[:3000]) if td else '' + outline = ((td[1] or '')[:2000]) if td else '' + tk = (td[2] or 'engineering').strip().lower() if td else 'engineering' + if tk not in ('engineering', 'service', 'goods'): + tk = 'engineering' + + from utils.tender_kind_sections import CHAT_KIND_INSTRUCTION + + kind_hint = CHAT_KIND_INSTRUCTION.get(tk, CHAT_KIND_INSTRUCTION['engineering']) + + system = f"""你是一位资深的投标文件撰写专家,正在协助用户以对话方式撰写技术标书中「{section_title}」章节的正文内容。 + +【项目背景摘要】 +{summary or '(未提供)'} + +【标书目录结构】 +{outline or '(未提供)'} +{kind_hint} + +【撰写规范(必须遵守)】 +- 投标方自称统一用"我方",禁用"我们""我公司" +- 禁止套话:综上所述、高度重视、全力以赴、不断优化、稳步推进等 +- 每项措施须有可检验的实质内容(做法、节点、标准编号,或招标文件/清单已给出的量化依据); + 未载明的型号、数量、吨位、时限等不得编造,用概括性定性表述写清含义;禁止使用方括号待填项(如[型号][数量]) +- 列举用(1)(2)(3)编号,禁止"首先其次最后"连接 +- 纯文本输出,段落间用空行分隔,不使用 Markdown 符号 +- 直接输出正文,不含章节标题、解释说明或"以下是..."引导语""" + + valid_messages = [m for m in messages if m.get('role') in ('user', 'assistant')] + + try: + content = ai_client.chat_with_history(system, valid_messages, + temperature=0.7, 
# The rule must declare ``project_id``; without the URL variable Flask has no
# value to pass to the view.
# NOTE(review): the int converter is assumed — confirm against the original route.
@app.route('/api/projects/<int:project_id>/generate-section', methods=['POST'])
def api_generate_section(project_id):
    """Start content generation for one section in a background thread.

    JSON body: ``{"section_id": <id>}``; 400 when it is missing or falsy.
    The project's anonymous-bid requirements and figure/table switches are
    read from ``projects`` and passed to the worker; when the project row is
    missing they default to an empty string and ``False``.
    """
    from modules.generator import generate_section

    data = request.get_json() or {}
    section_id = data.get('section_id')
    if not section_id:
        return jsonify({'error': '缺少 section_id'}), 400

    conn = get_db()
    try:
        row = conn.execute(
            "SELECT anon_requirements, enable_figure, enable_table FROM projects WHERE id=?",
            (project_id,)
        ).fetchone()
    finally:
        conn.close()
    anon_req = (row[0] or '') if row else ''
    enable_fig = bool(row[1]) if row else False
    enable_tbl = bool(row[2]) if row else False

    t = threading.Thread(
        target=generate_section,
        args=(config.DB_PATH, project_id, section_id, anon_req, enable_fig, enable_tbl),
        daemon=True,
    )
    t.start()
    return jsonify({'success': True})
# The rule must declare ``project_id``; without the URL variable Flask has no
# value to pass to the view.
# NOTE(review): the int converter is assumed — confirm against the original route.
@app.route('/api/projects/<int:project_id>/section-progress', methods=['GET'])
def api_section_progress(project_id):
    """Return section generation counters and a completion percentage.

    Counts all sections of the project and those in status ``'done'``,
    ``'generating'`` and ``'error'``. With no sections, every counter and the
    percentage are 0: the SUM columns come back as NULL and are mapped to 0.
    """
    conn = get_db()
    try:
        r = conn.execute('''
            SELECT
                COUNT(*) as total,
                SUM(CASE WHEN status='done' THEN 1 ELSE 0 END) as done,
                SUM(CASE WHEN status='generating' THEN 1 ELSE 0 END) as running,
                SUM(CASE WHEN status='error' THEN 1 ELSE 0 END) as error_count
            FROM bid_sections WHERE project_id=?
        ''', (project_id,)).fetchone()
    finally:
        conn.close()
    total, done, running, errors = r
    return jsonify({
        'total': total or 0, 'done': done or 0,
        'running': running or 0, 'errors': errors or 0,
        # max(..., 1) keeps the division defined for an empty project.
        'percent': round((done or 0) / max(total or 1, 1) * 100),
    })
@app.route('/api/download/<filename>')
def api_download(filename):
    """Send an exported file from ``config.EXPORT_DIR`` as an attachment.

    The URL rule must declare the ``filename`` variable: the view takes it as
    a required argument, and ``api_export`` hands out URLs of the form
    ``/api/download/{filename}``.
    """
    return send_from_directory(config.EXPORT_DIR, filename, as_attachment=True)


# ═══════════════════════════════════════════════════════════════════════════
# API: knowledge base management
# ═══════════════════════════════════════════════════════════════════════════

@app.route('/api/knowledge/status', methods=['GET'])
def api_knowledge_status():
    """Return the knowledge-base status plus the number of stored files.

    The payload is whatever ``modules.knowledge.is_available()`` reports,
    extended with a ``file_count`` entry.
    """
    from modules.knowledge import is_available, list_files

    status = is_available()
    status['file_count'] = len(list_files(config.DB_PATH))
    return jsonify(status)


@app.route('/api/knowledge/files', methods=['GET'])
def api_knowledge_list():
    """Return the knowledge-base file list as ``{'files': [...]}``."""
    from modules.knowledge import list_files

    return jsonify({'files': list_files(config.DB_PATH)})


@app.route('/api/knowledge/upload', methods=['POST'])
def api_knowledge_upload():
    """Store an uploaded document and queue it for knowledge-base ingestion.

    Expects a multipart form with a ``file`` part. The file is written to
    ``config.KNOWLEDGE_DIR`` and then ingested on a daemon thread, so the
    response only confirms that the job was queued (``queued: True``), not
    that ingestion succeeded.

    Returns:
        400 with an ``error`` message when no file was sent or the file name
        is rejected by ``allowed_file``; otherwise a success payload carrying
        the sanitized file name.
    """
    from modules.knowledge import add_file
    from utils.file_utils import allowed_file, safe_filename
    import threading

    if 'file' not in request.files:
        return jsonify({'error': '未选择文件'}), 400
    upload = request.files['file']
    if not upload.filename or not allowed_file(upload.filename):
        return jsonify({'error': '仅支持 PDF / DOC / DOCX'}), 400

    filename = safe_filename(upload.filename)
    # Make sure the target directory exists; harmless if it already does.
    os.makedirs(config.KNOWLEDGE_DIR, exist_ok=True)
    save_path = os.path.join(config.KNOWLEDGE_DIR, filename)
    upload.save(save_path)

    # Ingest on a background thread: text extraction and vectorization can be
    # slow, and doing it inline could make the request time out.
    def _ingest():
        try:
            result = add_file(save_path, config.DB_PATH)
        except Exception:
            # Without this, a crash in the worker thread would never reach
            # the application log and the upload would silently vanish.
            logger.exception(f'知识库入库失败 {filename}')
            return
        if not result.get('success'):
            logger.error(f'知识库入库失败 {filename}: {result.get("error")}')

    threading.Thread(target=_ingest, daemon=True).start()

    return jsonify({'success': True, 'queued': True, 'filename': filename})


@app.route('/api/knowledge/delete', methods=['POST'])
def api_knowledge_delete():
    """Delete one knowledge-base file named by the JSON field ``file_name``.

    Returns 400 when ``file_name`` is missing or empty; otherwise returns the
    result of ``modules.knowledge.delete_file`` as JSON.
    """
    from modules.knowledge import delete_file

    # silent=True plus the dict check turn a missing, malformed or non-object
    # body into the regular "missing file_name" error instead of a crash.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    file_name = data.get('file_name', '')
    if not file_name:
        return jsonify({'error': '缺少 file_name'}), 400
    result = delete_file(file_name, config.DB_PATH)
    return jsonify(result)


#
# ═══════════════════════════════════════════════════════════════════════════
# API: AI configuration
# ═══════════════════════════════════════════════════════════════════════════

@app.route('/api/config', methods=['GET'])
def api_get_config():
    """Return the current AI settings without exposing any API key.

    Model names, base URLs and generation settings are returned as stored in
    ``config``. API keys are reduced to ``has_<provider>_key`` booleans.
    """
    def _has_key(key):
        # Keys beginning with 'sk-your' are treated as "not configured".
        # Non-string values also count as "no key" instead of raising.
        return isinstance(key, str) and bool(key) and not key.startswith('sk-your')

    return jsonify({
        'model_provider': config.MODEL_PROVIDER,
        'qwen_model': config.QWEN_MODEL,
        'qwen_base_url': config.QWEN_BASE_URL,
        'openai_model': config.OPENAI_MODEL,
        'openai_base_url': config.OPENAI_BASE_URL,
        'deepseek_model': config.DEEPSEEK_MODEL,
        'deepseek_base_url': config.DEEPSEEK_BASE_URL,
        'ollama_base_url': config.OLLAMA_BASE_URL,
        'ollama_model': config.OLLAMA_MODEL,
        'doubao_model': config.DOUBAO_MODEL,
        'doubao_base_url': config.DOUBAO_BASE_URL,
        'kimi_model': config.KIMI_MODEL,
        'kimi_base_url': config.KIMI_BASE_URL,
        'has_qwen_key': _has_key(config.QWEN_API_KEY),
        'has_openai_key': _has_key(config.OPENAI_API_KEY),
        'has_deepseek_key': _has_key(config.DEEPSEEK_API_KEY),
        'has_doubao_key': _has_key(config.DOUBAO_API_KEY),
        'has_kimi_key': _has_key(config.KIMI_API_KEY),
        'max_concurrent': config.MAX_CONCURRENT_SECTIONS,
        'content_volume': config.CONTENT_VOLUME,
        'target_pages': getattr(config, 'TARGET_PAGES', 0),
        'page_char_estimate': getattr(config, 'PAGE_CHAR_ESTIMATE', 700),
    })


@app.route('/api/config', methods=['POST'])
def api_save_config():
    """Apply the settings in the JSON body to ``config`` and persist them.

    Only fields present in the body are touched. Text settings are applied
    when their value is truthy, so an empty string never wipes a stored key
    or URL. Integer settings are clamped to their allowed range, and a value
    that cannot be converted to ``int`` is ignored rather than failing the
    whole request. The result is persisted with ``_settings.save(config)``.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # A missing, malformed or non-object body changes nothing.
        data = {}

    def _clamped_int(key, low, high=None):
        """Return ``data[key]`` as an int clamped to [low, high], else None."""
        if key not in data:
            return None
        try:
            number = int(data[key])
        except (ValueError, TypeError, OverflowError):
            return None
        number = max(low, number)
        return number if high is None else min(number, high)

    # An empty or non-string provider would leave the app with no usable
    # backend, so it is ignored instead of being stored.
    provider = data.get('model_provider')
    if isinstance(provider, str) and provider.strip():
        config.MODEL_PROVIDER = provider.strip()

    # Each JSON field maps to the config attribute with the upper-cased name.
    text_settings = (
        'qwen_api_key', 'qwen_model', 'qwen_base_url',
        'openai_api_key', 'openai_model', 'openai_base_url',
        'deepseek_api_key', 'deepseek_model', 'deepseek_base_url',
        'ollama_base_url', 'ollama_model',
        'doubao_api_key', 'doubao_model', 'doubao_base_url',
        'kimi_api_key', 'kimi_model', 'kimi_base_url',
    )
    for field in text_settings:
        value = data.get(field)
        if value:
            setattr(config, field.upper(), value)

    max_concurrent = _clamped_int('max_concurrent', 1, 20)
    if max_concurrent is not None:
        config.MAX_CONCURRENT_SECTIONS = max_concurrent

    if data.get('content_volume') in ('concise', 'standard', 'detailed', 'full'):
        config.CONTENT_VOLUME = data['content_volume']

    target_pages = _clamped_int('target_pages', 0)
    if target_pages is not None:
        config.TARGET_PAGES = target_pages

    page_chars = _clamped_int('page_char_estimate', 300, 3000)
    if page_chars is not None:
        config.PAGE_CHAR_ESTIMATE = page_chars

    _settings.save(config)
    return jsonify({'success': True})


#
═══════════════════════════════════════════════════════════════════════════ +# 启动 +# ═══════════════════════════════════════════════════════════════════════════ + +if __name__ == '__main__': + init_db() + print('\n' + '=' * 60) + print(' BidPartner - AI Bid Writing Assistant') + print('=' * 60) + print(' URL: http://localhost:5000') + print(' Press Ctrl+C to quit\n') + app.run(host='0.0.0.0', port=5000, debug=False, threaded=True) diff --git a/bid_partner.spec b/bid_partner.spec new file mode 100644 index 0000000..6f4517e --- /dev/null +++ b/bid_partner.spec @@ -0,0 +1,118 @@ +# -*- mode: python ; coding: utf-8 -*- +""" +PyInstaller spec for 标伙伴 · AI标书助手 +Build: pyinstaller bid_partner.spec + +知识库改用 SQLite + 纯 Python 向量存储,已不依赖 ChromaDB,打包更小。 +""" +import os +from PyInstaller.utils.hooks import collect_all, collect_data_files + +block_cipher = None + +# ── Collect complex packages ───────────────────────────────────────────────── +openai_datas, openai_bins, openai_hidden = collect_all('openai') +pydantic_datas, pydantic_bins, pydantic_hidden = collect_all('pydantic') + +# tiktoken data (BPE vocab files) +tiktoken_datas = collect_data_files('tiktoken') + +a = Analysis( + ['launcher.py'], + pathex=['.'], + binaries=openai_bins + pydantic_bins, + datas=[ + # ── App assets (read-only, go into _MEIPASS) ── + ('templates', 'templates'), + ('static', 'static'), + # ── Package data ── + *openai_datas, + *pydantic_datas, + *tiktoken_datas, + ], + hiddenimports=[ + # Flask / Werkzeug + 'flask', 'flask_cors', 'werkzeug', 'werkzeug.serving', + 'werkzeug.routing', 'werkzeug.middleware.proxy_fix', + 'jinja2', 'jinja2.ext', + # SQLite (stdlib, always present) + 'sqlite3', + # OpenAI + *openai_hidden, + # Pydantic + *pydantic_hidden, + # Document processing + 'PyPDF2', 'pypdf', 'pypdf.errors', + 'pdfminer', 'pdfminer.high_level', 'pdfminer.layout', + 'pdfminer.pdfpage', 'pdfminer.pdfinterp', 'pdfminer.converter', + 'docx', 'docx.oxml', 'docx.oxml.ns', 'docx.shared', + 'docx.enum', 
'docx.enum.text', 'docx.enum.style', + 'python_docx', + # tiktoken + 'tiktoken', 'tiktoken.core', 'tiktoken.model', + 'tiktoken_ext', 'tiktoken_ext.openai_public', + # Network / encoding + 'requests', 'chardet', 'httpx', 'httpcore', + 'anyio', 'anyio.streams', 'anyio.streams.memory', + 'sniffio', 'certifi', + # Stdlib extras + 'importlib.metadata', 'importlib.resources', + 'pkg_resources', 'json', 'math', 'threading', + # Local project modules (explicitly include all) + 'config', 'app', + 'utils', 'utils.ai_client', 'utils.file_utils', + 'utils.prompts', 'utils.settings', 'utils.boq_parser', 'utils.bill_analysis', + 'modules', 'modules.parser', 'modules.generator', + 'modules.checker', 'modules.exporter', 'modules.knowledge', + ], + hookspath=[], + hooksconfig={}, + runtime_hooks=[], + excludes=[ + # Heavy packages not used in this app + 'matplotlib', 'pandas', 'scipy', 'numpy', + 'IPython', 'jupyter', 'notebook', + 'PIL', 'Pillow', + 'cv2', 'torch', 'tensorflow', + 'pytest', 'unittest', + # ChromaDB 及其依赖(已移除,改用 SQLite 内置存储) + 'chromadb', 'hnswlib', 'posthog', 'pypika', + 'mmh3', 'overrides', 'monotonic', + 'sentence_transformers', 'onnxruntime', + ], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher, + noarchive=False, +) + +pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) + +exe = EXE( + pyz, + a.scripts, + [], + exclude_binaries=True, + name='bid_partner', + debug=False, + bootloader_ignore_signals=False, + strip=False, + upx=False, + console=False, # no black console window — GUI launcher takes over + disable_windowed_traceback=False, + argv_emulation=False, + target_arch=None, + codesign_identity=None, + entitlements_file=None, +) + +coll = COLLECT( + exe, + a.binaries, + a.zipfiles, + a.datas, + strip=False, + upx=False, + upx_exclude=[], + name='BidPartner', +) diff --git a/bill-worker.js b/bill-worker.js new file mode 100644 index 0000000..ee84bb9 --- /dev/null +++ b/bill-worker.js @@ -0,0 +1,672 @@ +/** + * 
bill-worker.js — PDF 清单解析调度器(Worker Thread) + * + * 架构(v3 — SharedArrayBuffer 零拷贝): + * Phase 1 — 并行文本提取 + * 将 PDF 数据写入 SharedArrayBuffer(一次分配,所有子线程共享读) + * 启动 N 个 page-worker,每个负责固定 20 页 + * + * Phase 2 — 清单页筛选 + 文本解析(纯正则,毫秒级) + * 汇总全部页面文本 → 关键字筛选清单页 → 多行合并 → 逐行解析 + */ +'use strict'; +const { parentPort } = require('worker_threads'); +const { Worker } = require('worker_threads'); +const path = require('path'); + +const PAGES_PER_CHUNK = 20; + +parentPort.on('message', async (msg) => { + if (msg.type !== 'parse') return; + const t0 = Date.now(); + try { + // 立即做一次干净的拷贝,确保拥有独立的 ArrayBuffer + const raw = msg.buffer; + const buf = Buffer.alloc(raw.byteLength); + Buffer.from(raw).copy(buf); + + if (buf.length === 0) { + parentPort.postMessage({ type: 'done', ok: false, error: '收到空 PDF 数据' }); + return; + } + + // ── 获取总页数 ── + const pdfjsModule = await import('pdfjs-dist/build/pdf.mjs'); + const pdfjsLib = pdfjsModule.default || pdfjsModule; + // 给 pdfjs 一份独立拷贝(pdfjs 内部可能 detach buffer) + const pdfData = new Uint8Array(buf.length); + buf.copy(Buffer.from(pdfData.buffer)); + const pdf = await pdfjsLib.getDocument({ data: pdfData, isEvalSupported: false }).promise; + const totalPages = pdf.numPages; + + // ── 将 PDF 数据写入 SharedArrayBuffer(一次分配,所有子线程共享读)── + const sab = new SharedArrayBuffer(buf.length); + const sabView = new Uint8Array(sab); + buf.copy(Buffer.from(sabView.buffer)); // 从独立 buf 拷贝到共享内存 + + const workerCount = Math.ceil(totalPages / PAGES_PER_CHUNK); + console.log(`[BillWorker] PDF ${totalPages} 页, ${workerCount} 路并行 (SharedArrayBuffer ${(buf.length/1024/1024).toFixed(1)}MB)`); + + // Phase 1: 并行文本提取 + const pageTexts = await parallelExtract(sab, buf.length, totalPages, workerCount); + const t1 = Date.now(); + + const extractedCount = pageTexts.filter(t => t.length > 0).length; + console.log(`[BillWorker] Phase1 完成: ${t1 - t0}ms, ${extractedCount}/${totalPages} 页有文本`); + + // 扫描件判断 + const totalChars = pageTexts.reduce((s, t) => s + t.length, 0); + if 
(totalChars < 50) { + parentPort.postMessage({ type: 'done', ok: true, data: { scanned: true, reason: 'noText', totalPages } }); + return; + } + + // Phase 2: 筛选清单页(宽松策略 + 连续页补全) + const BILL_KW = ['项目编码', '项目名称', '工程量', '计量单位', '综合单价', '清单编码']; + const SEC_KW = ['分部分项', '分类分项', '措施项目', '其他项目', '工程量清单计价']; + // 第一轮:标记确定的清单页 + const billFlags = new Array(pageTexts.length).fill(false); + for (let i = 0; i < pageTexts.length; i++) { + const t = pageTexts[i]; + if (!t.trim()) continue; + const hHits = BILL_KW.filter(k => t.includes(k)).length; + const sHit = SEC_KW.some(k => t.includes(k)); + const hasCode = /\d{9}/.test(t); + // 放宽:有9位编码即可(不再要求同时命中表头关键字) + if (hHits >= 2 || sHit || hasCode) { + billFlags[i] = true; + } + } + // 第二轮:连续页补全 — 两个清单页之间的非空页也视为清单页(续页无表头) + // 但排除纯费用/税金页面(它们不含施工清单项) + const FEE_PAGE_KW = ['规费', '税金', '社会保险费', '住房公积金', '养老保险', + '工伤保险', '失业保险', '医疗保险', '教育费附加', '城市维护建设税']; + const firstBill = billFlags.indexOf(true); + const lastBill = billFlags.lastIndexOf(true); + if (firstBill >= 0 && lastBill > firstBill) { + for (let i = firstBill; i <= lastBill; i++) { + if (!billFlags[i] && pageTexts[i] && pageTexts[i].trim().length > 30) { + const t = pageTexts[i]; + const feeHits = FEE_PAGE_KW.filter(kw => t.includes(kw)).length; + // 命中 2+ 个费用关键字且没有9位工程编码 → 纯费用页,排除 + if (feeHits >= 2 && !/\d{9}/.test(t)) continue; + billFlags[i] = true; + } + } + } + const billTexts = []; + for (let i = 0; i < pageTexts.length; i++) { + if (billFlags[i]) billTexts.push(pageTexts[i]); + } + + if (!billTexts.length) { + parentPort.postMessage({ type: 'done', ok: true, data: { scanned: false, noBillPages: true, totalPages } }); + return; + } + + console.log(`[BillWorker] ${totalPages} 页 → ${billTexts.length} 页清单 (原始识别 ${billFlags.filter(f=>f).length - (lastBill - firstBill >= 0 ? 
0 : 0)} / 补全后 ${billTexts.length})`); + + // Phase 3: 文本解析 + const merged = billTexts.join('\n'); + const parsed = parseBillText(merged); + const t2 = Date.now(); + console.log(`[BillWorker] Phase2+3: ${t2 - t1}ms, 总耗时: ${t2 - t0}ms`); + + parentPort.postMessage({ + type: 'done', ok: true, + data: { + scanned: false, + ...parsed, + _meta: { + method: 'local-parallel', + workers: workerCount, + billPages: billTexts.length, + totalPages, + extractMs: t1 - t0, + parseMs: t2 - t1, + totalMs: t2 - t0, + } + } + }); + } catch (err) { + console.error('[BillWorker] 错误:', err.message); + parentPort.postMessage({ type: 'done', ok: false, error: err.message }); + } +}); + +// ================================================================ +// Phase 1: 多 Worker 并行提取(SharedArrayBuffer 零拷贝) +// ================================================================ + +function parallelExtract(sab, dataLength, totalPages, workerCount) { + return new Promise((resolve) => { + const workerPath = path.join(__dirname, 'page-worker.js'); + const allPageTexts = new Array(totalPages).fill(''); + const workerStatus = new Array(workerCount).fill('pending'); // pending, done, failed + let resolved = false; + + const checkComplete = () => { + if (resolved) return; + const doneCount = workerStatus.filter(s => s === 'done' || s === 'failed').length; + if (doneCount >= workerCount) { + resolved = true; + // 检查是否有失败的worker,打印警告 + const failedCount = workerStatus.filter(s => s === 'failed').length; + if (failedCount > 0) { + console.warn(`[BillWorker] ${failedCount}/${workerCount} 个worker失败,可能导致部分页面无内容`); + } + resolve(allPageTexts); + } + }; + + for (let i = 0; i < workerCount; i++) { + const startPage = i * PAGES_PER_CHUNK + 1; + const endPage = Math.min((i + 1) * PAGES_PER_CHUNK, totalPages); + + // workerData 传 SharedArrayBuffer(跨线程共享,不会被清空) + const w = new Worker(workerPath, { + workerData: { sab, dataLength, startPage, endPage } + }); + + let workerDone = false; + + const markDone = (status) => { 
+ if (workerDone) return; + workerDone = true; + workerStatus[i] = status; + checkComplete(); + }; + + w.on('message', (msg) => { + if (msg.ok && msg.results) { + for (const r of msg.results) { + allPageTexts[r.page - 1] = r.text; + } + markDone('done'); + } else if (!msg.ok) { + console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 失败: ${msg.error}`); + markDone('failed'); + } + }); + + w.on('error', (err) => { + console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 异常: ${err.message}`); + markDone('failed'); + }); + + w.on('exit', (code) => { + // exit 在 message 之后触发,但如果 worker 崩溃没发 message 则在这里兜底 + if (code !== 0 && !workerDone) { + console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 意外退出(code=${code})`); + markDone('failed'); + } else if (!workerDone) { + markDone('done'); + } + }); + } + + if (workerCount <= 0) { + resolved = true; + resolve(allPageTexts); + } + }); +} + +// ================================================================ +// Phase 3: 清单文本解析(纯正则 + 字符串处理,毫秒级) +// ================================================================ + +function parseBillText(text) { + const rawLines = text.split(/\n/).map(l => { + let line = l.replace(/\t/g, ' ').trim(); + // 规范化带横杠的编码:如 "010-101-001-001" → "010101001001" + line = line.replace(/(\d{2,4})[-‐–](\d{2,4})[-‐–](\d{2,4})(?:[-‐–](\d{2,4}))?/g, + (m, a, b, c, d) => { + const combined = a + b + c + (d || ''); + return (combined.length >= 9 && combined.length <= 12) ? combined : m; + }); + return line; + }); + + // ── Step 1: 多行合并成逻辑行 ── + // pdfjs 按 Y 坐标分行,表格一行通常 = 一条文本行 + // 但有时 项目特征/名称 会折行,需要合并 + // + // 新逻辑行的起始标志(任一命中即切断): + // a) 序号模式:1.1.1.1.5 开头 + // b) 清单编码:9-12位数字 或 B+5-6位数字 开头 + // c) 中文大标题:一 二 三 ... 或 (一)(二)... + // d) 表头行内容(跳过) + // e) 纯数字序号 + 空格 + 编码(如 "5 500101004001") + + const ITEM_START = /^\d+(\.\d+)+\s/; // 1.1 或 1.1.1 等序号 + const CODE_INLINE = /(?:^|\s)(\d{9,12}|(? 
raw.startsWith(m + ' ') || raw.startsWith(m + '\u3000'))) return true; + return false; + } + + for (const raw of rawLines) { + if (!raw || PAGE_MARK.test(raw)) continue; + if (HEADER_RE.test(raw) || HEADER_KW.test(raw)) continue; + if (/^(元)|^款章节号|^备注$|^第\d+页/.test(raw)) continue; + + if (isNewLineTrigger(raw)) { + if (currentLine) logicLines.push(currentLine); + currentLine = raw; + } else if (CODE_INLINE.test(raw) && raw.length > 15) { + // 行内包含编码且够长(像是完整的表格行)→ 也开新行 + if (currentLine) logicLines.push(currentLine); + currentLine = raw; + } else { + // 续行(项目特征折行等短文本) + // 安全阀:已合并行过长时强制切断,防止整页吞并 + if (currentLine && currentLine.length > 300) { + logicLines.push(currentLine); + currentLine = raw; + } else { + currentLine = currentLine ? currentLine + ' ' + raw : raw; + } + } + } + if (currentLine) logicLines.push(currentLine); + + console.log(`[BillWorker] 合并后 ${logicLines.length} 条逻辑行(原始 ${rawLines.length} 行)`); + // 打印前5条逻辑行供调试 + for (let i = 0; i < Math.min(5, logicLines.length); i++) { + console.log(`[BillWorker] L${i}: ${logicLines[i].substring(0, 120)}`); + } + + const categories = []; + let curCat = null, curItem = null; + + // 编码匹配:支持行内任意位置的9-12位数字或B编码(排除 GB/DB 等标准号前缀) + const CODE_RE = /(? 
u.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')); + const UNIT_RE = new RegExp(`(?:^|\\s)(${unitEscaped.join('|')})(?=\\s|\\d|$)`); + const SKIP_RE = /合\s*计|小\s*计|本页小计|总\s*计|价税合计/; + + for (const line of logicLines) { + if (SKIP_RE.test(line)) continue; + + // 去掉行首的序号部分("1.1.1.1.5 " 或 "5 " 等纯序号前缀) + let stripped = line.replace(/^\d+(\.\d+)*\s+/, '').trim(); + if (!stripped) stripped = line.trim(); + if (!stripped) continue; + + const cm = stripped.match(CODE_RE); + if (cm) { + if (curItem && curCat) curCat.items.push(curItem); + if (!curCat) { curCat = { name: '未分类', items: [] }; categories.push(curCat); } + + const code = cm[1]; + let rest = stripped.substring(cm.index + cm[0].length).trim(); + let name = '', unit = '', quantity = '', spec = ''; + + const unitMatch = rest.match(UNIT_RE); + if (unitMatch) { + const ui = rest.indexOf(unitMatch[0]); + let rawName = rest.substring(0, ui).trim(); + unit = unitMatch[1]; + const afterUnit = rest.substring(ui + unitMatch[0].length).trim(); + const qm = afterUnit.match(/^([\d,.]+)/); + if (qm) { + quantity = qm[1]; + // 提取 quantity 之后的尾部文本,跳过纯数字字段(综合单价、合价等) + let tail = afterUnit.substring(qm.index + qm[0].length).trim(); + if (tail) { + const tailTokens = tail.split(/\s+/); + let si = 0; + while (si < tailTokens.length && /^[\d,.%\-]+$/.test(tailTokens[si])) si++; + const specTail = tailTokens.slice(si).join(' ').trim(); + if (specTail) spec = specTail; + } + } + // 分离 rawName 中的"项目名称"和内联"项目特征" + const ns = splitNameAndSpec(rawName); + name = ns.name; + if (ns.spec) spec = ns.spec + (spec ? 
';' + spec : ''); + } else { + const tokens = rest.split(/\s+/).filter(t => t); + let foundUnitIdx = -1; + for (let ti = tokens.length - 1; ti >= 1; ti--) { + if (UNIT_SET.has(tokens[ti])) { foundUnitIdx = ti; break; } + } + if (foundUnitIdx >= 1) { + const rawNameStr = tokens.slice(0, foundUnitIdx).join(' '); + const ns = splitNameAndSpec(rawNameStr); + name = ns.name; + if (ns.spec) spec = ns.spec; + unit = tokens[foundUnitIdx]; + const afterTokens = tokens.slice(foundUnitIdx + 1); + if (afterTokens.length && /^[\d,.]+$/.test(afterTokens[0])) { + quantity = afterTokens[0]; + let si = 1; + while (si < afterTokens.length && /^[\d,.%\-]+$/.test(afterTokens[si])) si++; + const specTail = afterTokens.slice(si).join(' ').trim(); + if (specTail) spec = spec ? spec + ';' + specTail : specTail; + } + } else { + name = rest; + } + } + + name = name.replace(/\s+/g, '').trim(); + for (const u of UNIT_TOKENS) { + if (name.endsWith(u) && name.length > u.length) { + unit = unit || u; + name = name.substring(0, name.length - u.length); + break; + } + } + + curItem = { code, name, unit, quantity, spec }; + continue; + } + + // ── 回退:无标准编码但有 "名称 单位 数量" 结构 → 也视为清单项 ── + // 常见于措施项目、未编码的补充清单项 + if (!cm && stripped.length > 4) { + const uniMatch = stripped.match(UNIT_RE); + if (uniMatch) { + const ui = stripped.indexOf(uniMatch[0]); + const beforeUnit = stripped.substring(0, ui).trim(); + const afterUnit = stripped.substring(ui + uniMatch[0].length).trim(); + const hasQty = /^[\d,.]+/.test(afterUnit); + // 名称 2-50 字、含中文、有数量、不是分部标题 + if (beforeUnit.length >= 2 && beforeUnit.length <= 50 && hasQty + && /[\u4e00-\u9fff]/.test(beforeUnit)) { + if (curItem && curCat) curCat.items.push(curItem); + if (!curCat) { curCat = { name: '未分类', items: [] }; categories.push(curCat); } + const unit = uniMatch[1]; + const qm = afterUnit.match(/^([\d,.]+)/); + const quantity = qm ? 
qm[1] : ''; + const ns = splitNameAndSpec(beforeUnit); + const name = ns.name.replace(/\s+/g, '').trim(); + const spec = ns.spec || ''; + curItem = { code: '', name, unit, quantity, spec }; + continue; + } + } + } + + // 分部标题判断:不含编码、较短的文本、含工程关键字 + // 关键守卫:如果行里有计量单位,说明是清单项,不是标题 + if (stripped.length > 2 && stripped.length < 60 && !CODE_RE.test(stripped)) { + if (UNIT_RE.test(stripped) && /\d+\.?\d*\s*$/.test(stripped)) { + if (curItem) curItem.spec = curItem.spec ? curItem.spec + ';' + stripped : stripped; + continue; + } + if (isCatTitle(stripped) && !UNIT_RE.test(stripped) && !isFeeCatTitle(stripped)) { + if (curItem && curCat) { curCat.items.push(curItem); curItem = null; } + const cleanTitle = stripped.replace(/\s+(座|个|项|处|m|km|段|条)\s+\d+[\d.]*\s*$/, '').trim(); + curCat = { name: cleanTitle, items: [] }; + categories.push(curCat); + continue; + } + } + + if (/^[一二三四五六七八九十]+\s/.test(stripped) || /^([一二三四五六七八九十\d]+)/.test(stripped)) { + // 中文序号标题也需要排除费用类 + const cleanTitle = stripped.replace(/\s+(座|个|项|处)\s+\d+[\d.]*\s*$/, '').trim(); + if (isFeeCatTitle(cleanTitle)) { + // 费用类标题:跳过,不建分部(其下的行会作为续行处理) + continue; + } + if (curItem && curCat) { curCat.items.push(curItem); curItem = null; } + curCat = { name: cleanTitle, items: [] }; + categories.push(curCat); + continue; + } + + if (curItem && stripped.length > 1) { + curItem.spec = curItem.spec ? 
curItem.spec + ';' + stripped : stripped; + } + } + + if (curItem && curCat) curCat.items.push(curItem); + + // 过滤费用项:只保留需要写入技术标的施工清单项 + let feeFiltered = 0; + for (const cat of categories) { + if (cat.items) { + const before = cat.items.length; + cat.items = cat.items.filter(it => !isFeeItem(it.name)); + feeFiltered += before - cat.items.length; + } + } + if (feeFiltered > 0) console.log(`[BillWorker] 费用项过滤: 移除 ${feeFiltered} 项`); + + // ========== 按项目名称合并(核心去重,大幅减少清单项数量)========== + // 规则:同一分部内,name 相同的清单项合并为一条 + // - code: 保留第一个非空编码 + // - unit: 保留第一个非空单位 + // - quantity: 尝试数值求和,否则用分号拼接 + // - spec: 去重后用分号拼接(截断过长的) + let totalBeforeMerge = 0, totalAfterMerge = 0; + for (const cat of categories) { + if (!cat.items || !cat.items.length) continue; + totalBeforeMerge += cat.items.length; + + const nameMap = new Map(); // name → merged item + for (const item of cat.items) { + const key = (item.name || '').replace(/\s+/g, '').trim(); + if (!key) continue; + + if (!nameMap.has(key)) { + nameMap.set(key, { + code: item.code || '', + name: item.name, + unit: item.unit || '', + quantity: item.quantity || '', + spec: item.spec || '', + _count: 1, + _quantities: item.quantity ? [item.quantity] : [], + _specs: item.spec ? [item.spec] : [], + }); + } else { + const m = nameMap.get(key); + m._count++; + // code: 取第一个非空的 + if (!m.code && item.code) m.code = item.code; + // unit: 取第一个非空的 + if (!m.unit && item.unit) m.unit = item.unit; + // quantity: 收集所有 + if (item.quantity) m._quantities.push(item.quantity); + // spec: 收集不重复的 + if (item.spec && !m._specs.includes(item.spec)) { + m._specs.push(item.spec); + } + } + } + + // 后处理:合成最终字段 + const merged = []; + for (const [, m] of nameMap) { + // quantity: 尝试数值求和 + if (m._quantities.length > 1) { + const nums = m._quantities.map(q => parseFloat(q.replace(/,/g, ''))); + if (nums.every(n => !isNaN(n))) { + const sum = nums.reduce((a, b) => a + b, 0); + m.quantity = sum % 1 === 0 ? 
String(sum) : sum.toFixed(2); + } else { + m.quantity = m._quantities.join('; '); + } + } else if (m._quantities.length === 1) { + m.quantity = m._quantities[0]; + } + // spec: 拼接去重后的 spec,每条最多120字 + if (m._specs.length > 0) { + const trimmed = m._specs.map(s => s.length > 120 ? s.substring(0, 120) + '...' : s); + m.spec = trimmed.join('; '); + // 总 spec 上限 300 字 + if (m.spec.length > 300) m.spec = m.spec.substring(0, 300) + '...'; + } + // 清理临时字段 + delete m._count; delete m._quantities; delete m._specs; + merged.push(m); + } + cat.items = merged; + totalAfterMerge += merged.length; + } + + const mergedCount = totalBeforeMerge - totalAfterMerge; + if (mergedCount > 0) { + console.log(`[BillWorker] 按名称合并: ${totalBeforeMerge} → ${totalAfterMerge} 项(合并 ${mergedCount} 个重复项)`); + } + + const valid = categories.filter(c => c.items && c.items.length > 0); + const totalItems = valid.reduce((s, c) => s + c.items.length, 0); + const withSpec = valid.reduce((s, c) => s + c.items.filter(it => it.spec).length, 0); + const withCode = valid.reduce((s, c) => s + c.items.filter(it => it.code).length, 0); + console.log(`[BillWorker] 最终结果: ${valid.length} 分部, ${totalItems} 清单项 (${withCode} 有编码, ${withSpec} 有spec)`); + // 打印前 3 个 item 供调试 + let debugCount = 0; + for (const cat of valid) { + for (const it of cat.items) { + if (debugCount < 3) { + console.log(`[BillWorker] 样例: [${it.code}] ${it.name} | ${it.unit} | qty=${it.quantity} | spec=${(it.spec||'').substring(0, 80)}`); + debugCount++; + } + } + } + + return { + project_summary: { remark: `本地解析:${valid.length} 个分部,${totalItems} 个清单项(合并前 ${totalBeforeMerge} 项)` }, + categories: valid, + }; +} + +/** + * 判断清单项是否为"费用项"(非施工内容,不写入技术标) + * 如:安全文明措施费、规费、税金、暂列金额等 + */ +function isFeeItem(name) { + if (!name) return false; + const n = name.replace(/\s+/g, ''); + + // ── 1. 
精确匹配 ── + const EXACT = [ + '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工', + '总承包服务费', '企业管理费', '甲供材料保管费', '价税合计', + ]; + if (EXACT.includes(n)) return true; + + // ── 2. 包含匹配:措施费/规费/保险/行政类 ── + const FEE_KW = [ + '安全文明', '文明施工费', '环境保护费', '临时设施费', + '夜间施工增加费', '夜间施工费', + '冬雨季施工增加费', '冬雨季施工费', + '二次搬运费', '大型机械设备进出场', '大型机械进出场', + '施工排水降水', '排水降水费', + '已完工程及设备保护', '已完工程保护费', + '工程排污费', '社会保障费', '住房公积金', + '工伤保险', '劳动保险', '意外伤害保险', '建筑工程保险', + '城市维护建设税', '城市建设维护税', + '教育费附加', '地方教育附加', + '材料暂估', '专业工程暂估', + '超高施工增加费', '安全防护费', + '措施项目费', '其他项目费', '不可竞争费', + ]; + for (const kw of FEE_KW) { + if (n.includes(kw)) return true; + } + + return false; +} + +/** + * 将 rawName 中的"项目名称"与内联"项目特征描述"分离 + * 例: "土方开挖 1.土壤类别:普通土" → { name: "土方开挖", spec: "1.土壤类别:普通土" } + */ +function splitNameAndSpec(rawName) { + if (!rawName) return { name: '', spec: '' }; + // Pattern 1: 数字+点+中文(如 "1.土壤类别" "2、强度等级") + const m = rawName.match(/\d+[.、.)\uFF09]\s*[\u4e00-\u9fff]/); + if (m && m.index > 0) { + return { + name: rawName.substring(0, m.index).trim(), + spec: rawName.substring(m.index).trim() + }; + } + // Pattern 2: 特征关键字+冒号(如 "材质:" "规格:") + const SPEC_KW_RE = /(材质|规格|型号|品牌|颜色|尺寸|厚度|直径|管径|强度|等级|类别|类型|做法|要求|标准|内容|工作内容|土壤|含量|配合比|工艺|方式|形式|范围|部位|位置|高度|宽度|长度|深度|坡度|截面|跨度|运距|开挖|回填|混凝土|钢筋|压实)[::]/; + const kw = rawName.match(SPEC_KW_RE); + if (kw && kw.index > 0) { + return { + name: rawName.substring(0, kw.index).trim(), + spec: rawName.substring(kw.index).trim() + }; + } + // Pattern 3: 括号开头的特征描述 "(1)" "(1)" + const paren = rawName.match(/[((]\d+[))]/); + if (paren && paren.index > 0) { + return { + name: rawName.substring(0, paren.index).trim(), + spec: rawName.substring(paren.index).trim() + }; + } + return { name: rawName, spec: '' }; +} + +function isCatTitle(text) { + const KW = [ + '土建','建筑','结构','装饰','装修','安装','给排水','暖通','空调','通风', + '电气','强电','弱电','消防','智能化','幕墙','门窗','园林','绿化','景观', + '市政','道路','桥梁','管网','基础','地基','桩基','主体','屋面','防水', + 
'保温','钢结构','排水','给水','照明','动力','防雷','电梯','人防','室外', + '附属','分部','工程','措施','清单','土石方','混凝土','砌筑','模板','脚手架', + '水利','河道','管道','阀门','设备','仪表','自动化','通信','网络', + '拆除','外墙','内墙','楼地面','天棚','吊顶','栏杆','屋顶','涂料','抹灰', + '廊道','阀门井','蓄水池','泵站','供水','引水','水源','渠道','闸门', + '围栏','警示','检修','管线','配电','水池','水塔','取水','净水', + ]; + return KW.some(k => text.includes(k)); +} + +/** + * 判断分部标题是否为"费用类"(不应创建分部分类) + * 如:规费、税金、措施项目费、其他项目费 等非施工类分部 + */ +function isFeeCatTitle(text) { + if (!text) return false; + const t = text.replace(/\s+/g, ''); + // 精确匹配整个标题 + const EXACT = [ + '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工', + '总承包服务费', '企业管理费', '价税合计', + '措施项目费', '其他项目费', '不可竞争费', + ]; + if (EXACT.includes(t)) return true; + // 包含匹配 + const FEE_CAT_KW = [ + '措施项目费', '其他项目费', '不可竞争费', + '规费汇总', '税金汇总', '费率', '费用汇总', '费用合计', + '暂列金额', '暂估价', '计日工', '总承包服务费', + '安全文明施工费', '社会保障费', '住房公积金', + '工伤保险', '教育费附加', '城市维护建设税', + ]; + for (const kw of FEE_CAT_KW) { + if (t.includes(kw)) return true; + } + return false; +} diff --git a/build.bat b/build.bat new file mode 100644 index 0000000..eb607e2 --- /dev/null +++ b/build.bat @@ -0,0 +1,95 @@ +@echo off +chcp 65001 >nul 2>&1 +setlocal + +echo ============================================================ +echo BidPartner - Build Desktop EXE +echo ============================================================ +echo. + +:: ── 1. Check Python ──────────────────────────────────────────────────────── +python --version >nul 2>&1 +if errorlevel 1 ( + echo [ERROR] Python not found. Please install Python 3.9+. + pause & exit /b 1 +) + +:: ── 2. Install / upgrade PyInstaller ─────────────────────────────────────── +echo [Step 1/4] Installing PyInstaller... +pip install --quiet --upgrade pyinstaller +if errorlevel 1 ( + echo [ERROR] Failed to install PyInstaller. + pause & exit /b 1 +) + +:: ── 3. Install project dependencies (if not already installed) ───────────── +echo [Step 2/4] Checking dependencies... 
pip install --quiet -r requirements.txt
if errorlevel 1 (
    echo [ERROR] Failed to install dependencies.
    pause & exit /b 1
)

:: ── 4. Sanitize settings.json - REMOVE API KEYS before build ───────────────
:: The placeholder settings must actually replace data\settings.json while
:: PyInstaller runs; writing them only to a temp file leaves the real keys in
:: place for the build. The real file is backed up first and restored below.
echo [Step 3/4] Sanitizing settings (removing API keys from build)...
if not exist "data" mkdir "data"

:: A leftover backup means a previous build was interrupted after the swap:
:: the backup holds the real settings, so put it back before backing up again.
if exist "data\settings.json.bak" (
    copy /y "data\settings.json.bak" "data\settings.json" >nul
    del /f /q "data\settings.json.bak" >nul 2>&1
)

set HAD_SETTINGS=0
if exist "data\settings.json" (
    rem Back up real settings; abort if that fails so they are never lost
    copy /y "data\settings.json" "data\settings.json.bak" >nul
    if errorlevel 1 (
        echo [ERROR] Failed to back up data\settings.json.
        pause & exit /b 1
    )
    set HAD_SETTINGS=1
)

:: Write a clean settings file with no real keys
(
    echo {
    echo   "model_provider": "deepseek",
    echo   "qwen_api_key": "sk-your-qwen-key",
    echo   "qwen_model": "qwen3.6-plus",
    echo   "openai_api_key": "sk-your-openai-key",
    echo   "openai_model": "gpt-4o",
    echo   "deepseek_api_key": "sk-your-deepseek-key",
    echo   "deepseek_model": "deepseek-chat",
    echo   "max_concurrent": 5,
    echo   "content_volume": "standard"
    echo }
) > "data\settings_clean.tmp"

:: Swap the clean file in so the build only ever sees placeholder keys
copy /y "data\settings_clean.tmp" "data\settings.json" >nul
if errorlevel 1 (
    echo [ERROR] Failed to write sanitized data\settings.json.
    if "%HAD_SETTINGS%"=="1" (
        copy /y "data\settings.json.bak" "data\settings.json" >nul
        del /f /q "data\settings.json.bak" >nul 2>&1
    )
    del /f /q "data\settings_clean.tmp" >nul 2>&1
    pause & exit /b 1
)

:: ── 5. Build ────────────────────────────────────────────────────────────────
echo [Step 4/4] Building EXE with PyInstaller...
echo           (This may take 3-10 minutes on first run)
echo.

:: Clean previous build artifacts
if exist "build" rd /s /q "build" >nul 2>&1
if exist "dist\BidPartner" rd /s /q "dist\BidPartner" >nul 2>&1

pyinstaller bid_partner.spec --noconfirm
set BUILD_RESULT=%errorlevel%

:: ── Restore real settings ───────────────────────────────────────────────────
:: Runs on both success and failure of the build (BUILD_RESULT is checked after).
if "%HAD_SETTINGS%"=="1" (
    copy /y "data\settings.json.bak" "data\settings.json" >nul
    del /f /q "data\settings.json.bak" >nul 2>&1
) else (
    rem There was no settings file before the build; remove the placeholder
    del /f /q "data\settings.json" >nul 2>&1
)
del /f /q "data\settings_clean.tmp" >nul 2>&1

if %BUILD_RESULT% neq 0 (
    echo.
    echo [ERROR] PyInstaller build failed. See output above for details.
    pause & exit /b 1
)

:: ── 6. Result ───────────────────────────────────────────────────────────────
echo.
echo ============================================================
echo   Build SUCCESSFUL!
echo   Output: dist\BidPartner\bid_partner.exe
echo ============================================================
echo.
"""Application configuration: runtime paths, AI provider settings, limits.

Most values can be overridden through environment variables of the same name;
the literals below are the defaults used when a variable is not set.
"""
import os
import sys


def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* as an int.

    An unset, empty or whitespace-only value yields *default*, so every
    integer setting treats "empty" the same way. A non-empty value that is
    not a valid integer still raises ``ValueError``, so misconfiguration is
    reported instead of being silently replaced.
    """
    raw = os.environ.get(name, '').strip()
    return int(raw) if raw else default


# When running as a PyInstaller bundle:
#   sys._MEIPASS        → read-only bundle dir (templates, static, prompts)
#   sys.executable dir  → writable dir next to the .exe (data, settings, db)
if getattr(sys, 'frozen', False):
    _BUNDLE_DIR = sys._MEIPASS                    # bundled app files
    BASE_DIR = os.path.dirname(sys.executable)    # writable runtime dir
else:
    _BUNDLE_DIR = os.path.dirname(os.path.abspath(__file__))
    BASE_DIR = _BUNDLE_DIR

DATA_DIR = os.path.join(BASE_DIR, 'data')
UPLOAD_DIR = os.path.join(DATA_DIR, 'uploads')
EXPORT_DIR = os.path.join(DATA_DIR, 'exports')
KNOWLEDGE_DIR = os.path.join(DATA_DIR, 'knowledge')
DB_PATH = os.path.join(DATA_DIR, 'projects.db')
CHROMA_DIR = os.path.join(DATA_DIR, 'chroma')
PROMPTS_DIR = os.path.join(_BUNDLE_DIR, 'prompts')

# ==================== AI model configuration ====================
# Provider selection. Settings exist below for:
# 'openai' | 'qwen' | 'deepseek' | 'ollama' | 'doubao' | 'kimi'
# NOTE(review): which of these values the AI client actually accepts is
# decided elsewhere; confirm before relying on the last two.
MODEL_PROVIDER = os.environ.get('MODEL_PROVIDER', 'qwen')

# OpenAI
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'sk-your-openai-key')
OPENAI_MODEL = os.environ.get('OPENAI_MODEL', 'gpt-4.1')
OPENAI_BASE_URL = os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1')

# Alibaba Cloud Qwen (DashScope, OpenAI-compatible endpoint)
QWEN_API_KEY = os.environ.get('QWEN_API_KEY', 'sk-your-qwen-key')
QWEN_MODEL = os.environ.get('QWEN_MODEL', 'qwen3.6-plus')
QWEN_BASE_URL = os.environ.get('QWEN_BASE_URL', 'https://dashscope.aliyuncs.com/compatible-mode/v1')

# DeepSeek
DEEPSEEK_API_KEY = os.environ.get('DEEPSEEK_API_KEY', 'sk-your-deepseek-key')
DEEPSEEK_MODEL = os.environ.get('DEEPSEEK_MODEL', 'deepseek-chat')
DEEPSEEK_BASE_URL = os.environ.get('DEEPSEEK_BASE_URL', 'https://api.deepseek.com/v1')

# Local Ollama (OpenAI-compatible endpoint)
OLLAMA_BASE_URL = os.environ.get('OLLAMA_BASE_URL', 'http://localhost:11434/v1')
OLLAMA_MODEL = os.environ.get('OLLAMA_MODEL', 'qwen3:8b')

# Doubao / Volcano Engine (ByteDance, OpenAI-compatible endpoint)
DOUBAO_API_KEY = os.environ.get('DOUBAO_API_KEY', 'sk-your-doubao-key')
DOUBAO_MODEL = os.environ.get('DOUBAO_MODEL', 'doubao-1-5-pro-32k')
DOUBAO_BASE_URL = os.environ.get('DOUBAO_BASE_URL', 'https://ark.cn-beijing.volces.com/api/v3')

# Kimi / Moonshot AI (OpenAI-compatible endpoint, supports embeddings)
KIMI_API_KEY = os.environ.get('KIMI_API_KEY', 'sk-your-kimi-key')
KIMI_MODEL = os.environ.get('KIMI_MODEL', 'moonshot-v1-32k')
KIMI_BASE_URL = os.environ.get('KIMI_BASE_URL', 'https://api.moonshot.cn/v1')

# Embedding models
OPENAI_EMBEDDING_MODEL = 'text-embedding-3-small'
QWEN_EMBEDDING_MODEL = 'text-embedding-v3'
KIMI_EMBEDDING_MODEL = 'moonshot-v1-embedding'

# ==================== Application configuration ====================
MAX_FILE_SIZE_MB = 50
ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx'}
# The fallback literal is public in the repository; set SECRET_KEY in the
# environment for any deployment where this key has to stay private.
SECRET_KEY = os.environ.get('SECRET_KEY') or 'bidhuo-partner-secret-2024'

# ==================== Generation configuration ====================
MAX_RETRIES = 3
REQUEST_TIMEOUT = _env_int('REQUEST_TIMEOUT', 180)
# Outline generation sends a long prompt and expects a large reply, so it gets
# a longer read timeout (seconds) to avoid the client disconnecting early.
OUTLINE_REQUEST_TIMEOUT = _env_int('OUTLINE_REQUEST_TIMEOUT', 300)
CHUNK_SIZE = 2000        # knowledge-base chunk size (characters)
CHUNK_OVERLAP = 200      # overlap between consecutive chunks
TOP_K_KNOWLEDGE = 3      # number of knowledge-base hits to retrieve
MAX_CONCURRENT_SECTIONS = _env_int('MAX_CONCURRENT_SECTIONS', 5)  # sections generated concurrently
CONTENT_VOLUME = os.environ.get('CONTENT_VOLUME', 'standard')     # volume tier: concise / standard / detailed / full
TARGET_PAGES = _env_int('TARGET_PAGES', 0)                        # target page count (0 = disabled)
PAGE_CHAR_ESTIMATE = _env_int('PAGE_CHAR_ESTIMATE', 700)          # rough characters-per-page estimate
b/data/attachment_section_rules.json @@ -0,0 +1,22 @@ +{ + "_meta": "附件类章节:stack_charts_only 为默认,叶节点按 diagram 意图栈只输出 [FIGURE]/[TABLE] 块、无叙述正文;full 为长文;single_chart_only 为栈顶单块。修改后重启生效。", + "_field_docs": { + "title_regex": "标题任一则正则匹配即视为附件节(Python re 语法)", + "table_hint_keywords": "标题含此类子串且双开关均开时倾向表格", + "figure_hint_keywords": "标题含此类子串且双开关均开时倾向图示", + "default_kind_when_ambiguous": "双开且标题无倾向词时的默认:figure 或 table", + "attachment_leaf_body_mode": "stack_charts_only:意图栈只生成图/表块;full:与常规章节相同长文;single_chart_only:仅栈顶一块图或表" + }, + "schema_version": 1, + "attachment_leaf_body_mode": "stack_charts_only", + "title_regex": [ + "附件\\s*[一二三四五六七八九十0-9A-Za-z、::.]", + "附\\s*图", + "附\\s*表", + "附\\s*件\\s*\\(", + "^\\s*[\\d一二三四五六七八九十\\..、]+\\s*附件" + ], + "table_hint_keywords": ["附表", "一览表", "清单表", "表", "统计表", "明细表"], + "figure_hint_keywords": ["附图", "示意图", "平面图", "流程图", "布置图", "组织图", "横道"], + "default_kind_when_ambiguous": "table" +} diff --git a/data/diagram_intent_rules.json b/data/diagram_intent_rules.json new file mode 100644 index 0000000..f8d3119 --- /dev/null +++ b/data/diagram_intent_rules.json @@ -0,0 +1,51 @@ +{ + "_meta": "章节级图/表意图:标题与大纲窗口关键词计分,阈值入栈,按栈序拼接图示/表格生成规范。修改后重启服务生效。", + "_field_docs": { + "threshold_figure": "图示倾向分达到此值才入栈", + "threshold_table": "表格倾向分达到此值才入栈", + "title_weight": "标题命中的权重乘子", + "context_weight": "大纲上下文窗口命中的权重乘子", + "outline_context_lines": "before/after 为相对匹配行上下扩展行数", + "stack_order_when_both": "figure_first | table_first | score_desc(两者同时入栈时的顺序,栈顶为 index 0)", + "figure_keywords": "字符串或 {text,weight} 对象列表", + "table_keywords": "同上" + }, + "schema_version": 1, + "threshold_figure": 1.0, + "threshold_table": 1.0, + "title_weight": 1.0, + "context_weight": 0.6, + "outline_context_lines": {"before": 4, "after": 6}, + "stack_order_when_both": "score_desc", + "figure_keywords": [ + {"text": "组织", "weight": 1.0}, + {"text": "架构", "weight": 1.0}, + {"text": "流程", "weight": 1.2}, + {"text": "工序", "weight": 1.0}, + {"text": "进度", "weight": 1.2}, + {"text": "横道", 
"weight": 1.5}, + {"text": "网络图", "weight": 1.5}, + {"text": "平面", "weight": 1.0}, + {"text": "布置", "weight": 0.8}, + {"text": "监测", "weight": 0.8}, + {"text": "示意", "weight": 0.8}, + {"text": "应急", "weight": 0.8} + ], + "table_keywords": [ + {"text": "一览表", "weight": 1.5}, + {"text": "人员", "weight": 1.0}, + {"text": "配置", "weight": 0.8}, + {"text": "设备", "weight": 1.0}, + {"text": "机械", "weight": 0.9}, + {"text": "劳动力", "weight": 1.2}, + {"text": "工种", "weight": 1.0}, + {"text": "检验", "weight": 1.0}, + {"text": "验收", "weight": 0.9}, + {"text": "材料", "weight": 1.0}, + {"text": "供应", "weight": 0.9}, + {"text": "风险", "weight": 1.0}, + {"text": "措施", "weight": 0.6}, + {"text": "清单", "weight": 0.8}, + {"text": "计划", "weight": 0.7} + ] +} diff --git a/data/exports/20260420测试海东技术标_20260420_180450.docx b/data/exports/20260420测试海东技术标_20260420_180450.docx new file mode 100644 index 0000000..99fc638 Binary files /dev/null and b/data/exports/20260420测试海东技术标_20260420_180450.docx differ diff --git a/data/exports/30260420投标技术文档_20260420_170252.docx b/data/exports/30260420投标技术文档_20260420_170252.docx new file mode 100644 index 0000000..d134a23 Binary files /dev/null and b/data/exports/30260420投标技术文档_20260420_170252.docx differ diff --git a/data/exports/A2121212_20260421_103738.docx b/data/exports/A2121212_20260421_103738.docx new file mode 100644 index 0000000..c659ef4 Binary files /dev/null and b/data/exports/A2121212_20260421_103738.docx differ diff --git a/data/exports/A666_20260422_143004.docx b/data/exports/A666_20260422_143004.docx new file mode 100644 index 0000000..dc6cfb7 Binary files /dev/null and b/data/exports/A666_20260422_143004.docx differ diff --git a/data/exports/A666_20260422_153137.docx b/data/exports/A666_20260422_153137.docx new file mode 100644 index 0000000..e2c309d Binary files /dev/null and b/data/exports/A666_20260422_153137.docx differ diff --git a/data/exports/A666_20260422_160459.docx b/data/exports/A666_20260422_160459.docx new file mode 100644 index 
0000000..0f83a32 Binary files /dev/null and b/data/exports/A666_20260422_160459.docx differ diff --git a/data/projects.db b/data/projects.db new file mode 100644 index 0000000..e4d0a16 Binary files /dev/null and b/data/projects.db differ diff --git a/data/settings.json b/data/settings.json new file mode 100644 index 0000000..2b3d244 --- /dev/null +++ b/data/settings.json @@ -0,0 +1,24 @@ +{ + "model_provider": "qwen", + "qwen_api_key": "sk-999173b3ca7f425a97cc4b12a2d3575f", + "qwen_model": "qwen3.6-plus", + "qwen_base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", + "openai_api_key": "sk-your-openai-key", + "openai_model": "gpt-4.1", + "openai_base_url": "https://api.openai.com/v1", + "deepseek_api_key": "sk-your-deepseek-key", + "deepseek_model": "deepseek-chat", + "deepseek_base_url": "https://api.deepseek.com/v1", + "ollama_base_url": "http://localhost:11434/v1", + "ollama_model": "qwen3:8b", + "doubao_api_key": "sk-your-doubao-key", + "doubao_model": "doubao-1-5-pro-32k", + "doubao_base_url": "https://ark.cn-beijing.volces.com/api/v3", + "kimi_api_key": "sk-your-kimi-key", + "kimi_model": "moonshot-v1-32k", + "kimi_base_url": "https://api.moonshot.cn/v1", + "max_concurrent": 10, + "content_volume": "full", + "target_pages": 120, + "page_char_estimate": 700 +} \ No newline at end of file diff --git a/data/uploads/10_boq_工程量清单.pdf b/data/uploads/10_boq_工程量清单.pdf new file mode 100644 index 0000000..24c7709 Binary files /dev/null and b/data/uploads/10_boq_工程量清单.pdf differ diff --git a/data/uploads/10_招标文件正文.pdf b/data/uploads/10_招标文件正文.pdf new file mode 100644 index 0000000..30df632 Binary files /dev/null and b/data/uploads/10_招标文件正文.pdf differ diff --git a/data/uploads/11_boq_工程量清单.pdf b/data/uploads/11_boq_工程量清单.pdf new file mode 100644 index 0000000..24c7709 Binary files /dev/null and b/data/uploads/11_boq_工程量清单.pdf differ diff --git a/data/uploads/11_招标文件正文.pdf b/data/uploads/11_招标文件正文.pdf new file mode 100644 index 0000000..30df632 Binary files 
/dev/null and b/data/uploads/11_招标文件正文.pdf differ diff --git a/data/uploads/12_boq_工程量清单.pdf b/data/uploads/12_boq_工程量清单.pdf new file mode 100644 index 0000000..24c7709 Binary files /dev/null and b/data/uploads/12_boq_工程量清单.pdf differ diff --git a/data/uploads/12_招标文件正文.pdf b/data/uploads/12_招标文件正文.pdf new file mode 100644 index 0000000..30df632 Binary files /dev/null and b/data/uploads/12_招标文件正文.pdf differ diff --git a/data/uploads/15_boq_工程量清单.pdf b/data/uploads/15_boq_工程量清单.pdf new file mode 100644 index 0000000..24c7709 Binary files /dev/null and b/data/uploads/15_boq_工程量清单.pdf differ diff --git a/data/uploads/15_招标文件正文.pdf b/data/uploads/15_招标文件正文.pdf new file mode 100644 index 0000000..30df632 Binary files /dev/null and b/data/uploads/15_招标文件正文.pdf differ diff --git a/data/uploads/16_boq_工程量清单.pdf b/data/uploads/16_boq_工程量清单.pdf new file mode 100644 index 0000000..24c7709 Binary files /dev/null and b/data/uploads/16_boq_工程量清单.pdf differ diff --git a/data/uploads/16_招标文件正文.pdf b/data/uploads/16_招标文件正文.pdf new file mode 100644 index 0000000..30df632 Binary files /dev/null and b/data/uploads/16_招标文件正文.pdf differ diff --git a/data/uploads/17_boq_工程量清单.pdf b/data/uploads/17_boq_工程量清单.pdf new file mode 100644 index 0000000..24c7709 Binary files /dev/null and b/data/uploads/17_boq_工程量清单.pdf differ diff --git a/data/uploads/17_招标文件正文.pdf b/data/uploads/17_招标文件正文.pdf new file mode 100644 index 0000000..30df632 Binary files /dev/null and b/data/uploads/17_招标文件正文.pdf differ diff --git a/data/uploads/1_boq_工程量清单1.pdf b/data/uploads/1_boq_工程量清单1.pdf new file mode 100644 index 0000000..e129e5a Binary files /dev/null and b/data/uploads/1_boq_工程量清单1.pdf differ diff --git a/data/uploads/1_招标文件正文1.pdf b/data/uploads/1_招标文件正文1.pdf new file mode 100644 index 0000000..136700e Binary files /dev/null and b/data/uploads/1_招标文件正文1.pdf differ diff --git a/data/uploads/2_boq_工程量清单1.pdf b/data/uploads/2_boq_工程量清单1.pdf new file mode 100644 index 0000000..e129e5a Binary files /dev/null 
and b/data/uploads/2_boq_工程量清单1.pdf differ diff --git a/data/uploads/2_招标文件正文1.pdf b/data/uploads/2_招标文件正文1.pdf new file mode 100644 index 0000000..136700e Binary files /dev/null and b/data/uploads/2_招标文件正文1.pdf differ diff --git a/data/uploads/3_boq_工程量清单1.pdf b/data/uploads/3_boq_工程量清单1.pdf new file mode 100644 index 0000000..e129e5a Binary files /dev/null and b/data/uploads/3_boq_工程量清单1.pdf differ diff --git a/data/uploads/3_招标文件正文1.pdf b/data/uploads/3_招标文件正文1.pdf new file mode 100644 index 0000000..136700e Binary files /dev/null and b/data/uploads/3_招标文件正文1.pdf differ diff --git a/data/uploads/4_boq_工程量清单1.pdf b/data/uploads/4_boq_工程量清单1.pdf new file mode 100644 index 0000000..e129e5a Binary files /dev/null and b/data/uploads/4_boq_工程量清单1.pdf differ diff --git a/data/uploads/4_招标文件正文1.pdf b/data/uploads/4_招标文件正文1.pdf new file mode 100644 index 0000000..136700e Binary files /dev/null and b/data/uploads/4_招标文件正文1.pdf differ diff --git a/data/uploads/5_boq_工程量清单1.pdf b/data/uploads/5_boq_工程量清单1.pdf new file mode 100644 index 0000000..e129e5a Binary files /dev/null and b/data/uploads/5_boq_工程量清单1.pdf differ diff --git a/data/uploads/5_招标文件正文1.pdf b/data/uploads/5_招标文件正文1.pdf new file mode 100644 index 0000000..136700e Binary files /dev/null and b/data/uploads/5_招标文件正文1.pdf differ diff --git a/data/uploads/6_boq_工程量清单.pdf b/data/uploads/6_boq_工程量清单.pdf new file mode 100644 index 0000000..24c7709 Binary files /dev/null and b/data/uploads/6_boq_工程量清单.pdf differ diff --git a/data/uploads/6_招标文件正文.pdf b/data/uploads/6_招标文件正文.pdf new file mode 100644 index 0000000..30df632 Binary files /dev/null and b/data/uploads/6_招标文件正文.pdf differ diff --git a/data/uploads/7_boq_工程量清单1.pdf b/data/uploads/7_boq_工程量清单1.pdf new file mode 100644 index 0000000..e129e5a Binary files /dev/null and b/data/uploads/7_boq_工程量清单1.pdf differ diff --git a/data/uploads/7_招标文件正文1.pdf b/data/uploads/7_招标文件正文1.pdf new file mode 100644 index 0000000..136700e Binary files /dev/null and 
b/data/uploads/7_招标文件正文1.pdf differ diff --git a/data/uploads/8_boq_工程量清单.pdf b/data/uploads/8_boq_工程量清单.pdf new file mode 100644 index 0000000..24c7709 Binary files /dev/null and b/data/uploads/8_boq_工程量清单.pdf differ diff --git a/data/uploads/8_招标文件正文.pdf b/data/uploads/8_招标文件正文.pdf new file mode 100644 index 0000000..30df632 Binary files /dev/null and b/data/uploads/8_招标文件正文.pdf differ diff --git a/data/uploads/9_boq_工程量清单.pdf b/data/uploads/9_boq_工程量清单.pdf new file mode 100644 index 0000000..24c7709 Binary files /dev/null and b/data/uploads/9_boq_工程量清单.pdf differ diff --git a/data/uploads/9_招标文件正文.pdf b/data/uploads/9_招标文件正文.pdf new file mode 100644 index 0000000..30df632 Binary files /dev/null and b/data/uploads/9_招标文件正文.pdf differ diff --git a/data/word_allocation_rules.json b/data/word_allocation_rules.json new file mode 100644 index 0000000..307ef8b --- /dev/null +++ b/data/word_allocation_rules.json @@ -0,0 +1,30 @@ +{ + "_meta": "字数分配约束规则:与「标书篇幅预期」四档的 base/core 配合,按技术评分项权重与章节标题相关性分配各叶节点最低字数与提示词中的评分要点提示。修改后重启服务生效;字段说明见同文件 _field_docs。", + "_field_docs": { + "schema_version": "规则文件版本号,解析时可做迁移", + "alpha": "0~1,评分驱动强度;越大则高分相关章节越接近 core、低相关越接近 base", + "budget_mode": "target_pages:启用目标页数且 TARGET_PAGES>0 时,全书叶节点目标总字数为 TARGET_PAGES*PAGE_CHAR_ESTIMATE;无技术评分时叶节均分该总预算。未启用页数时无评分则返回 None。anchor_mean:N*(base+core)/2;anchor_base:N*base", + "per_section_floor": "单节 min_chars 下限(不低于此整数)", + "per_section_cap": "单节 min_chars 上限(不超过 core 时可设为 core 或略高)", + "relevance.method": "keyword_overlap:标题与评分项名称/关键词的字面重叠度", + "relevance.min_rating_weight": "忽略权重低于此值的评分项(减少噪声)", + "rating_parse": "预留;解析器内置多形态 rating_json,无需在此配置", + "prompt.top_k_rating_items": "写入本节字数说明中的相关评分项名称条数上限", + "max_tokens_scale": "若为 true,按 min_chars/base 比例缩放本段 max_tokens(仍受模型上限约束)" + }, + "schema_version": 1, + "alpha": 0.85, + "budget_mode": "target_pages", + "per_section_floor": null, + "per_section_cap": null, + "relevance": { + "method": "keyword_overlap", + "min_rating_weight": 0.01 + }, + 
def _find_free_port(start: int = 5000, attempts: int = 20) -> int:
    """Return the first port in ``[start, start + attempts)`` that can be bound.

    Each candidate is probed by binding a TCP socket on 127.0.0.1. The probe
    socket is closed before returning, so the port is only known to have been
    free at probe time. If no candidate can be bound (or ``attempts`` is 0),
    ``start`` is returned anyway and the server is left to report the error.
    """
    candidate = start
    limit = start + attempts
    while candidate < limit:
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            probe.bind(('127.0.0.1', candidate))
        except OSError:
            # Port is taken (or otherwise unusable); try the next one.
            candidate += 1
        else:
            return candidate
        finally:
            probe.close()
    return start
# ── Main window (tkinter) ────────────────────────────────────────────────────
def _run_gui():
    """Show the launcher window and block until it is closed.

    The window displays a status line, the local URL and two buttons. A daemon
    thread waits for the server via ``_wait_for_server()``; when the server
    answers, the URL is shown, the "open browser" button is enabled and the
    browser is opened once automatically. On timeout the status line points
    the user to the log file.

    tkinter is imported inside the function, so a missing or broken tkinter
    raises here and the caller can fall back to another mode.
    """
    import tkinter as tk
    from tkinter import ttk, font as tkfont

    URL = f'http://127.0.0.1:{PORT}'

    root = tk.Tk()
    root.title('标伙伴 · AI标书助手')
    root.geometry('400x220')
    root.resizable(False, False)
    root.configure(bg='#f5f5f5')

    # ── Title ──
    title_font = tkfont.Font(family='微软雅黑', size=14, weight='bold')
    tk.Label(root, text='标伙伴 · AI 标书助手', font=title_font,
             bg='#f5f5f5', fg='#1a1a2e').pack(pady=(22, 4))

    # ── Status line ──
    status_var = tk.StringVar(value='正在启动服务,请稍候…')
    status_lbl = tk.Label(root, textvariable=status_var,
                          font=('微软雅黑', 10), bg='#f5f5f5', fg='#555')
    status_lbl.pack(pady=4)

    # ── URL link ──
    url_lbl = tk.Label(root, text='', font=('Consolas', 10),
                       bg='#f5f5f5', fg='#1a73e8', cursor='hand2')
    url_lbl.pack(pady=2)
    # The label is styled as a link (hand cursor), so a left click opens the
    # URL. An empty event sequence is not a valid binding; '<Button-1>' is the
    # left mouse button press.
    url_lbl.bind('<Button-1>', lambda _: webbrowser.open(URL))

    # ── Buttons ──
    btn_frame = tk.Frame(root, bg='#f5f5f5')
    btn_frame.pack(pady=18)

    open_btn = ttk.Button(btn_frame, text='打开浏览器',
                          command=lambda: webbrowser.open(URL),
                          state='disabled', width=14)
    open_btn.pack(side='left', padx=8)

    quit_btn = ttk.Button(btn_frame, text='退出程序',
                          command=root.destroy, width=10)
    quit_btn.pack(side='left', padx=8)

    # ── Footer ──
    tk.Label(root, text='单机版 · 本地运行 · 数据不上传',
             font=('微软雅黑', 8), bg='#f5f5f5', fg='#aaa').pack(pady=(0, 10))

    # ── Background polling; the UI is updated once the server is ready ──
    def _on_ready():
        status_var.set('服务已就绪 ✓')
        status_lbl.config(fg='#2e7d32')
        url_lbl.config(text=URL)
        open_btn.config(state='normal')
        webbrowser.open(URL)

    def _on_timeout():
        status_var.set('启动超时,请查看 bid_partner.log')
        status_lbl.config(fg='#c62828')

    def _check():
        # Runs in a worker thread; widget updates are handed back to the Tk
        # event loop through root.after instead of touching widgets here.
        if _wait_for_server():
            root.after(0, _on_ready)
        else:
            root.after(0, _on_timeout)

    threading.Thread(target=_check, daemon=True).start()
    root.mainloop()
def check_compliance(db_path: str, project_id: int) -> dict:
    """Run the AI compliance check for one project and return the report.

    Reads the project's technical rating requirements and the generated
    sections (status ``'done'``, ordered by ``order_index``) from the SQLite
    database at *db_path*, sends both to the AI client and parses the JSON
    object it returns.

    Args:
        db_path: Path of the SQLite database file.
        project_id: Project whose bid content is checked.

    Returns:
        The parsed report dict on success. On any failure a dict with an
        ``'error'`` message; when the AI reply cannot be parsed as a JSON
        object it also carries the reply text under ``'raw'``.

    Raises:
        sqlite3.Error: only if the database cannot be opened at all; every
        later failure is reported through the returned dict.
    """
    # Bound up front so the JSONDecodeError handler can always reference it,
    # even if the error is raised before the AI reply has been assigned.
    raw = ''
    conn = sqlite3.connect(db_path)
    try:
        try:
            cur = conn.cursor()

            # Tender requirements for this project.
            cur.execute(
                "SELECT summary, rating_requirements FROM tender_data WHERE project_id=?",
                (project_id,)
            )
            td = cur.fetchone()
            if not td:
                return {'error': '尚未解析招标文件'}

            # Only the technical rating requirements are the baseline; the
            # summary may contain commercial content and is not used.
            requirements = (td[1] or '').strip()
            if not requirements:
                return {'error': '尚未提取技术评分要求,请先完成步骤一的招标文件解析'}

            # Generated sections, in document order.
            cur.execute(
                "SELECT section_title, content FROM bid_sections WHERE project_id=? AND status='done' ORDER BY order_index",
                (project_id,)
            )
            rows = cur.fetchall()
            if not rows:
                return {'error': '尚未生成标书内容,请先生成'}
        finally:
            # All reads are done; release the connection before the AI call
            # so the database is not held open while waiting for the reply.
            conn.close()

        # First 500 characters of each section, flattened to one line.
        content_str = '\n'.join(
            f"【{title}】{(content or '')[:500].replace(chr(10), ' ')}"
            for title, content in rows
        )

        # Ask the AI; both inputs are truncated to bound the prompt size.
        prompt = CHECK_PROMPT.format(requirements=requirements[:3000], content=content_str[:6000])
        raw = ai_client.chat(prompt, temperature=0.2, max_tokens=2048)

        # Strip Markdown code fences, then keep the outermost {...} span.
        raw = re.sub(r'```(?:json)?\s*', '', raw).replace('```', '').strip()
        m = re.search(r'\{[\s\S]*\}', raw)
        if m:
            raw = m.group(0)
        result = json.loads(raw)
        if not isinstance(result, dict):
            # Valid JSON but not an object: callers expect a report dict.
            logger.error('合规检查结果解析失败: 顶层不是 JSON 对象')
            return {'error': 'AI 返回格式异常: 顶层不是 JSON 对象', 'raw': raw}
        return result

    except json.JSONDecodeError as e:
        logger.error('合规检查结果解析失败: %s', e)
        return {'error': f'AI 返回格式异常: {e}', 'raw': raw}
    except Exception as e:
        logger.exception('合规检查失败')
        return {'error': str(e)}
+
+
+
+ + + +
+
+ 标伙伴 + AI 标书助手 +
+
+
+ + +
+
+
+ + +
+ + +
+

篇幅目标(按页数粗略换算)请进入某标书项目,在 步骤1「解析」 中设置:100/150/200/250/300 页、自定义、保存页数设置、使用原档位、当前页等,保存后用于后续章节生成。

+
+ + + + + + + + + +
+ + +
+
+

新建标书项目

+
+ + +
+
+ + +
+
+
+ + +
+
+

AI 模型配置

+ +
+ +
+ + + + + + +
+
+ + + + + + + + + + + + + + + + +
+

+ 全稿「目标总页数」在已打开的标书项目中,到 步骤1「解析」 里设置,与下方「每节字数档」是两项不同设置。 +

+ +
+ + + + +
+
+ + +
+ +
+ + +
+
+ 保守(1路) + 推荐(3-5路) + 激进(10路) +
+
+ +
+ + +
+
+
+ + + + +
+

© 标书老崔

+

本工具仅限学习交流免费使用,生成的技术方案请人工核对。本工具不会在任何平台售卖,请注意甄别。

+
+ + diff --git a/templates/project.html b/templates/project.html new file mode 100644 index 0000000..2b3cf3f --- /dev/null +++ b/templates/project.html @@ -0,0 +1,2356 @@ + + + + + +{{ project.name }} · 标伙伴 + + + + + + + + +
+
+ + + + + +
+
+ + + +
+

{{ project.name }}

+
+ + + + +
+ + +
+ + + + + +
+ + +
+ + +
+

+ 1 + 上传招标文件 +

+
+ + + + +

拖拽文件到此处,或

+ +

支持 PDF、DOC、DOCX,最大 50MB

+
+ + +
+
+
+ + + +
+
+

+

已上传

+
+
+ +
+ + +
+
+
+ 上传中... +
+
+
+ + +
+
+
+

篇幅目标(按页数粗略换算)

+

100/150/200/250/300 页或自定义,保存后用于后续章节生成。在此「保存页数设置」会同步全局配置并写入本标书项目。

+
+ +
+
+ + +
+
+ 自定义页数 + + + 当前: +
+ +
+ + +
+
+ + + +

工程量清单导入 (可选)解析后可联动招标内容,让生成内容包含准确工程量

+
+ + +
+ + + + +

拖拽清单文件到此处,或点击选择

+

支持 Excel(xlsx/xls)、CSV、PDF、Word,最大 50MB

+
+ + +
+
+
+ + + +
+
+

+

+
+
+
+ + + + +
+
+ + +
+
+ 上传中... +
+ + +
+ + + + +
+ + +
+ + +
+ + +
+
+

+ + 工程量清单摘要 +

+
+ + +
+
+ +
+ +
+ +

修改后点击"保存",将在生成章节内容时作为工程量参考

+
+
+
+ + +
+
+
+ +
+
+ + + + +
+
+ + +
+ + +
+
+
+

标书类型

+

解析完成后自动识别为工程类 / 服务类 / 货物类;步骤 3 生成章节将套用对应写作模板(施工组织 / 服务方案 / 供货方案)。识别有误可在此修正。

+
+
+ + +
+
+
+ + +
+
+

+ + 招标文件摘要 +

+
+ + +
+
+ +
+ +
+ +

修改后点击"保存",将作为生成大纲的依据

+
+
+ + +
+
+

+ + 技术评分要求 +

+
+ + +
+
+ +
+ +
+ +

修改后点击"保存",将作为生成大纲的依据(只保留技术评分,删除商务/价格评分内容)

+
+
+ +
+ +
+ + +
+
+

+ 2 + 生成标书大纲 +

+ +
+

请先完成招标文件解析

+ +
+ +
+

AI 将根据招标文件摘要和技术评分标准,生成结构化的四级标书目录。

+ +
+

控长与章节数(建议按顺序执行)

+
    +
  1. 定总节数 N:在大纲中合并/删除不需要的编号行,点「保存并更新章节」;步骤 3「生成」中「共 N 个章节」、左侧章节目录与这里的 N 一致,由本步大纲内容决定。
  2. +
  3. 用页数约束小目 AI 树:在「步骤1 解析」中「上传」与「清单」之间的「篇幅目标」设页;使用「AI 自动填充小章节」时一般勿勾选「不限制小章节」;系统将按目标页对小章节条数作映射与限幅。
  4. +
  5. 全稿再回调:到「生成」用「一键并发生成」后看总字数/估算;若仍过长,回本页大纲继续并节/删行、保存,再回「生成」重跑。
  6. +
+
+ +
+

+ 目标页数、不限制小章节在 + + (上传与清单之间)的「篇幅目标」中设置。 +

+
+ + +

+ 当前用于生成的大纲:约 字, 个章节 +

+
+ + +
+
+ AI 正在生成标书大纲,通常需要 30-60 秒... +
+
+ + +
+
+

+ 大纲 + 预览 + 编辑 + ( 个章节) +

+
+ + + + +
+
+ + +
+

「AI 自动填充小章节」与目标页

+

+ +
+ + +
+ +
+ + +
+
+ + + + 保存后将按新大纲重新划分章节,已生成的章节正文内容将被清除,需重新生成。编辑时请保持层级编号格式(一、1.1、1.1.1…)不变。 +
+ +

格式示例:第一行为标书名称,章节用"一、""1.1""1.1.1""1.1.1.1"等格式

+
+
+
+
+ + +
+
+
+

+ 3 + 章节内容生成 +

+
+ + + + + + +
+
+ +
+

篇幅与章节数

+

+ 目标页数只在 + + 「上传」与「清单」之间的「篇幅目标」中配置。本页仅并发生成与查看进度。列表「共 N 节」与大纲行数相关,需并节/删行时回步骤2 大纲。 +

+
+ + +
+
+
+ + + + 暗标模式 + + +
+ +
+ +
+

+ 暗标要求将附加到每个章节的 AI 生成规范中,AI 必须严格遵守。适用于评标文件不得暴露投标人身份的项目。 +

+ +
+ 常用预设: + + + + + + +
+ +

+ 填写后点击「保存暗标要求」,再点击「一键并发生成」或单章节「AI 生成」,暗标规则将自动注入 AI 提示词。 +

+
+
+ 启用暗标模式后,可设置禁止 AI 在生成内容中暴露投标人身份的具体规则。 +
+
+ + +
+
+
+ + + + 图表生成模式 +
+ +
+ +
+ + + + +
+ +

+ 设置后点击「保存设置」,再点击「一键并发生成」或单章节「AI 生成」,图表将自动生成并嵌入正文;导出 Word 文档时自动渲染为带标题的图示块和正式表格。 +

+
+ + +
+
+
+ 生成进度 + + + + + + 路并发 + +
+ +
+
+
+
+
+
+ + + 已完成 + + + + 生成中 + + + + 失败 + +
+ 个章节 +
+
+ +
+

请先生成标书大纲

+ +
+ + +
+ +
+
+
+ + +
+
+

从左侧章节列表或内容列表中选择一个章节

+ +
+ +
+ +
+
+

+
+
+ +
+ + +
+ +
+ + + +
+ +
+ +
+
+
+ + +
+
+ AI 正在生成内容,请稍候... +
+ + +
+ +
+

章节引言

+

+
+ +
+ +
+ +
+
+ + + +

暂无内容,点击"AI 生成"或切换"对话生成"模式

+
+
+
+
+ + +
+ +
+ + +
+
+ AI +
+
+
+ + + +
+
+
+
+ + +
+
+ + +
+

点击 AI 回复下方的「采用此内容 → 填入编辑框」将内容写入编辑器,再点「保存」完成。

+
+
+
+
+ + +
+
+

+ 4 + 合规性检查 +

+

AI 将对照招标要求检查标书内容的覆盖情况,给出改进建议。

+ + + + +
+ + +
+

+ + 技术暗标格式清标 +

+

对导出的技术暗标 HTML 做版式规则检查(身份信息、标题/正文、目录、图表位置、颜色、页边距等),不调用大模型。请粘贴完整 HTML 或选择本地 .html 文件。

+

说明:检查依赖内联 style;从 Word 另存为「网页」或含完整样式的 HTML 效果最佳;纯标签无样式时部分项易判为不通过。

+ +
+ +
+ + +
+ +
+ +
+
+ + +
+ + +
+
+
+
+ + + +
+
+

企业知识库

+

上传历史标书,AI 生成时自动检索企业优势内容

+
+
+ +
+ + + + 语义向量检索 · 个文本块 + + + + + 关键词检索模式 · 个文本块 + +
+
+ + +
+

💡 当前使用关键词检索

+

DeepSeek / Ollama 暂不提供 Embedding API,知识库将以关键词匹配方式检索相关内容。 + 切换为 Qwen 或 OpenAI 模型(在首页 AI 配置中设置)可启用更精准的语义向量检索。

+
+
+ + +
+

+ + + + 添加知识文档 +

+ + + + +

+ 推荐上传:历史技术方案、同类项目标书、企业资质简介、施工工法说明等。
+ 上传后 AI 在生成章节内容时将自动检索相关片段作为写作参考。 +

+
+ + +
+

+ + + + 已上传文件 + +

+ + + + + + + + +
+ + + +

知识库暂无文件

+

上传历史标书后,AI 生成内容时将自动引用

+
+
+ + +
+

+ + + + 使用说明 +

+
    +
  • + 1 + 上传企业历史技术标书、施工方案、资质简介等文档(支持 PDF/DOC/DOCX) +
  • +
  • + 2 + 系统自动将文档切分并向量化入库(首次入库需等待 AI 处理完成) +
  • +
  • + 3 + 生成章节内容(步骤 3)时,系统将自动检索知识库中最相关的段落供 AI 参考写作 +
  • +
  • + 4 + 知识库为全局共享,对所有项目均有效;可随时添加或删除文档 +
  • +
+
+ +
+ +
+
+ + + + + + + +
+

© 标书老崔

+

本工具仅限学习交流免费使用,生成的技术方案请人工核对。本工具不会在任何平台售卖,请注意甄别。

+
+ + diff --git a/tests/fixtures/dark_bid_report_sample.json b/tests/fixtures/dark_bid_report_sample.json new file mode 100644 index 0000000..11aecd2 --- /dev/null +++ b/tests/fixtures/dark_bid_report_sample.json @@ -0,0 +1,13 @@ +{ + "overall": false, + "details": [ + {"rule": "身份信息隐藏", "passed": true, "message": "未发现投标人身份信息"}, + {"rule": "标题格式", "passed": false, "message": "部分标题字号/字体/颜色/下划线不符合要求"}, + {"rule": "正文格式", "passed": false, "message": "部分正文段落格式不符合要求"}, + {"rule": "目录要求", "passed": true, "message": "目录符合无页码、无页眉页脚要求"}, + {"rule": "图表规范", "passed": false, "message": "正文中发现2个图表或附件内图表文字格式错误"}, + {"rule": "颜色与装饰", "passed": true, "message": "无彩色文字、无下划线、无着重号"}, + {"rule": "页面设置", "passed": false, "message": "页面边距或纸张方向不符合要求"} + ], + "violations": [] +} diff --git a/tests/test_attachment_section.py b/tests/test_attachment_section.py new file mode 100644 index 0000000..13f438b --- /dev/null +++ b/tests/test_attachment_section.py @@ -0,0 +1,95 @@ +"""附件类章节识别与单图/单表类型选择。""" +import unittest + +from utils import attachment_section as att + + +class TestIsAttachment(unittest.TestCase): + def test_positive(self): + r = att.load_attachment_rules() + self.assertTrue(att.is_attachment_only_section('附件一:施工平面布置', r)) + self.assertTrue(att.is_attachment_only_section('附图 组织机构', r)) + self.assertTrue(att.is_attachment_only_section('附表 人员一览', r)) + + def test_negative(self): + r = att.load_attachment_rules() + self.assertFalse(att.is_attachment_only_section('施工组织设计', r)) + self.assertFalse(att.is_attachment_only_section('', r)) + + +class TestPickKind(unittest.TestCase): + def test_only_figure_switch(self): + r = att.DEFAULT_ATTACHMENT_RULES + self.assertEqual( + att.pick_single_figure_or_table('附件一:xxx', True, False, r), + 'figure', + ) + + def test_only_table_switch(self): + r = att.DEFAULT_ATTACHMENT_RULES + self.assertEqual( + att.pick_single_figure_or_table('附件一:xxx', False, True, r), + 'table', + ) + + def test_both_off(self): + self.assertIsNone( + 
att.pick_single_figure_or_table('附件一', False, False, None), + ) + + def test_table_hint(self): + r = att.DEFAULT_ATTACHMENT_RULES + k = att.pick_single_figure_or_table('附件三 工程量一览表', True, True, r) + self.assertEqual(k, 'table') + + def test_figure_hint(self): + r = att.DEFAULT_ATTACHMENT_RULES + k = att.pick_single_figure_or_table('附图 施工平面示意图', True, True, r) + self.assertEqual(k, 'figure') + + def test_default_ambiguous(self): + r = dict(att.DEFAULT_ATTACHMENT_RULES) + r['default_kind_when_ambiguous'] = 'table' + k = att.pick_single_figure_or_table('附件五 其他资料', True, True, r) + self.assertEqual(k, 'table') + + +class TestAttachmentBodyMode(unittest.TestCase): + def test_default_stack_charts_only(self): + r = att.DEFAULT_ATTACHMENT_RULES + self.assertEqual(att.attachment_leaf_body_mode(r), 'stack_charts_only') + self.assertTrue(att.use_attachment_stack_charts_body(r)) + self.assertFalse(att.use_attachment_single_chart_only_body(r)) + self.assertFalse(att.use_attachment_full_body(r)) + + def test_full_mode(self): + r = dict(att.DEFAULT_ATTACHMENT_RULES) + r['attachment_leaf_body_mode'] = 'full' + self.assertEqual(att.attachment_leaf_body_mode(r), 'full') + self.assertTrue(att.use_attachment_full_body(r)) + self.assertFalse(att.use_attachment_stack_charts_body(r)) + + def test_single_chart_only(self): + r = dict(att.DEFAULT_ATTACHMENT_RULES) + r['attachment_leaf_body_mode'] = 'single_chart_only' + self.assertTrue(att.use_attachment_single_chart_only_body(r)) + self.assertTrue(att.use_attachment_stack_charts_body(r)) + + +class TestExpandOutlineSkip(unittest.TestCase): + def test_should_skip_attachment(self): + self.assertTrue(att.should_skip_expand_subchapters('附件一:平面图')) + self.assertTrue(att.should_skip_expand_subchapters('附图 示意')) + + def test_should_skip_normal_chapter(self): + self.assertFalse(att.should_skip_expand_subchapters('施工组织设计')) + self.assertFalse(att.should_skip_expand_subchapters('质量管理体系与措施')) + + def test_parse_attachment_label(self): + 
self.assertEqual(att.parse_attachment_label('附件一:平面图'), '一') + self.assertEqual(att.parse_attachment_label('附件2 承诺书'), '2') + self.assertEqual(att.parse_attachment_label('附图 总平面'), '附图') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_bill_analysis.py b/tests/test_bill_analysis.py new file mode 100644 index 0000000..b2a2167 --- /dev/null +++ b/tests/test_bill_analysis.py @@ -0,0 +1,52 @@ +"""工程量清单本地分析单元测试。""" +import unittest + +from utils.bill_analysis import ( + analyze_boq_pages, + filter_bill_pages, + parse_bill_text, +) + + +class TestParseBillText(unittest.TestCase): + def test_code_name_unit_qty(self): + text = '010101001001 挖土方 m3 100.5 土壤类别:三类土' + r = parse_bill_text(text) + self.assertIn('categories', r) + self.assertTrue(r['categories']) + cat = r['categories'][0] + self.assertEqual(cat['name'], '未分类') + self.assertEqual(len(cat['items']), 1) + it = cat['items'][0] + self.assertEqual(it['code'], '010101001001') + self.assertIn('挖土', it['name']) + self.assertEqual(it['unit'], 'm3') + self.assertEqual(it['quantity'], '100.5') + + def test_hierarchical_line_prefix(self): + text = '1.1 010101001001 基础开挖 m3 50' + r = parse_bill_text(text) + it = r['categories'][0]['items'][0] + self.assertEqual(it['code'], '010101001001') + + +class TestFilterBillPages(unittest.TestCase): + def test_two_pages_gap_fill(self): + p0 = '目录 前言' + p1 = '分部分项工程量清单\n项目编码 项目名称 工程量\n010101001001 项 m3 1' + p2 = '续表无表头\n010101002001 土 m2 2' + p3 = '规费 税金 社会保险费 住房公积金 其他说明' + pages, meta = filter_bill_pages([p0, p1, p2, p3]) + self.assertEqual(meta['total_pages'], 4) + self.assertGreaterEqual(len(pages), 2) + merged = '\n'.join(pages) + self.assertIn('010101001001', merged) + self.assertIn('010101002001', merged) + + def test_analyze_scanned_empty(self): + r = analyze_boq_pages(['', ' ', '']) + self.assertTrue(r.get('scanned')) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_dark_bid_format_check.py b/tests/test_dark_bid_format_check.py 
new file mode 100644 index 0000000..28ac5a3 --- /dev/null +++ b/tests/test_dark_bid_format_check.py @@ -0,0 +1,63 @@ +"""技术暗标 HTML 格式检查:结构校验与极简用例(标准库 unittest)。""" +import json +import os +import sys +import unittest + +# 保证可 `python tests/test_*.py` 从项目根导入 `modules` +_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _ROOT not in sys.path: + sys.path.insert(0, _ROOT) + +from modules.dark_bid_format_check import check_technical_bid # noqa: E402 + + +def _sample_schema_path(): + return os.path.join(os.path.dirname(__file__), "fixtures", "dark_bid_report_sample.json") + + +class TestDarkBidFormatCheck(unittest.TestCase): + def test_sample_fixture_keys(self): + with open(_sample_schema_path(), encoding="utf-8") as f: + sample = json.load(f) + self.assertIn("overall", sample) + self.assertIn("details", sample) + self.assertIn("violations", sample) + for d in sample["details"]: + self.assertTrue({"rule", "passed", "message"}.issubset(d.keys())) + + def test_check_returns_structure(self): + html = """ +
第一章 概述
+

标题

+

+ 正文内容示例。

+ """ + r = check_technical_bid(html) + self.assertIsInstance(r["overall"], bool) + self.assertEqual(len(r["details"]), 7) + rules = [x["rule"] for x in r["details"]] + self.assertIn("身份信息隐藏", rules) + self.assertIn("标题格式", rules) + + def test_empty_html(self): + r = check_technical_bid("") + self.assertFalse(r["overall"]) + + def test_identity_fail_on_company(self): + html = ( + "

我公司参与投标

" + "
x
" + "" + "" + ) + r = check_technical_bid(html) + id_rule = next(x for x in r["details"] if x["rule"] == "身份信息隐藏") + self.assertFalse(id_rule["passed"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_diagram_intent.py b/tests/test_diagram_intent.py new file mode 100644 index 0000000..47a4779 --- /dev/null +++ b/tests/test_diagram_intent.py @@ -0,0 +1,121 @@ +"""图表意图栈与特征计分。""" +import unittest + +from utils import diagram_intent as di + + +def _base_rules() -> dict: + return { + 'schema_version': 1, + 'threshold_figure': 1.0, + 'threshold_table': 1.0, + 'title_weight': 1.0, + 'context_weight': 0.6, + 'outline_context_lines': {'before': 2, 'after': 2}, + 'stack_order_when_both': 'score_desc', + 'figure_keywords': [ + {'text': '进度', 'weight': 1.2}, + {'text': '横道', 'weight': 1.5}, + ], + 'table_keywords': [ + {'text': '一览表', 'weight': 1.5}, + {'text': '人员', 'weight': 1.0}, + ], + } + + +class TestScoreFigureTable(unittest.TestCase): + def test_figure_higher_on_progress(self): + r = _base_rules() + f, t = di.score_figure_table('施工进度与横道计划', '', r) + self.assertGreater(f, t) + + def test_table_higher_on_roster(self): + r = _base_rules() + f, t = di.score_figure_table('主要管理人员配置一览表', '', r) + self.assertGreater(t, f) + + +class TestBuildStack(unittest.TestCase): + def test_gate_figure_off(self): + r = _base_rules() + st = di.build_stack(5.0, 5.0, r, enable_figure=False, enable_table=True) + self.assertEqual(len(st), 1) + self.assertEqual(st[0].kind, 'table') + + def test_score_desc_order(self): + r = dict(_base_rules()) + r['stack_order_when_both'] = 'score_desc' + st = di.build_stack(3.0, 1.0, r, True, True) + self.assertEqual(len(st), 2) + self.assertEqual(st[0].kind, 'figure') + self.assertGreater(st[0].score, st[1].score) + + def test_figure_first(self): + r = dict(_base_rules()) + r['stack_order_when_both'] = 'figure_first' + st = di.build_stack(2.0, 5.0, r, True, True) + self.assertEqual(st[0].kind, 'figure') + self.assertEqual(st[1].kind, 
'table') + + def test_below_threshold_empty(self): + r = dict(_base_rules()) + r['threshold_figure'] = 10.0 + r['threshold_table'] = 10.0 + st = di.build_stack(1.0, 1.0, r, True, True) + self.assertEqual(st, []) + + +class TestOutlineWindow(unittest.TestCase): + def test_finds_title_line(self): + outline = '一、总则\n二、进度\n 2.1 横道计划\n三、尾' + w = di.extract_outline_window(outline, '2.1 横道计划', 1, 1) + self.assertIn('横道', w) + + def test_fallback_prefix(self): + w = di.extract_outline_window('abc' * 400, '不存在的标题', 2, 2) + self.assertTrue(len(w) > 0) + + +class TestAgentRender(unittest.TestCase): + def test_render_non_empty_when_match(self): + r = dict(_base_rules()) + r['threshold_figure'] = 0.5 + r['threshold_table'] = 0.5 + agent = di.DiagramIntentAgent(r) + s = agent.render_for_section( + '施工进度横道图编制说明', + '大纲\n进度\n横道', + True, + True, + ) + self.assertIn('图示生成规范', s) + self.assertIn('本节图表生成优先级', s) + + def test_render_empty_when_scores_low(self): + r = dict(_base_rules()) + r['threshold_figure'] = 100.0 + r['threshold_table'] = 100.0 + agent = di.DiagramIntentAgent(r) + s = agent.render_for_section('无关标题', '无关', True, True) + self.assertEqual(s, '') + + +class TestStackHelpers(unittest.TestCase): + def test_stack_compact_labels(self): + st = [ + di.DiagramIntent('figure', 1.0, 't'), + di.DiagramIntent('table', 1.0, 't'), + ] + lab = di.stack_compact_labels(st) + self.assertEqual(len(lab), 2) + self.assertIn('[FIGURE]', lab[0]) + + def test_make_fallback_stack(self): + st = di.make_fallback_stack('figure') + self.assertEqual(len(st), 1) + self.assertEqual(st[0].kind, 'figure') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_outline_numbering.py b/tests/test_outline_numbering.py new file mode 100644 index 0000000..c0f462a --- /dev/null +++ b/tests/test_outline_numbering.py @@ -0,0 +1,44 @@ +"""目录号格式化与大纲带号写回。""" +import os +import sys +import unittest + +_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _ROOT not in sys.path: 
+ sys.path.insert(0, _ROOT) + +from modules.generator import _parse_outline, _sections_to_outline_text # noqa: E402 +from utils.outline_numbering import format_heading_display, int_to_chinese_numeral # noqa: E402 + + +class TestOutlineNumbering(unittest.TestCase): + def test_int_to_chinese(self): + self.assertEqual(int_to_chinese_numeral(1), "一") + self.assertEqual(int_to_chinese_numeral(10), "十") + self.assertEqual(int_to_chinese_numeral(11), "十一") + self.assertEqual(int_to_chinese_numeral(23), "二十三") + + def test_format_heading(self): + self.assertEqual(format_heading_display(1, "3", "总体"), "三、总体") + self.assertEqual(format_heading_display(2, "1.2", "子节"), "1.2 子节") + + def test_sections_to_outline_text_has_numbers(self): + sections = [ + {"level": 1, "title": "第一章", "number": "1"}, + {"level": 2, "title": "小节", "number": "1.1"}, + ] + text = _sections_to_outline_text("某项目技术标书", sections) + self.assertIn("某项目技术标书", text) + self.assertIn("一、第一章", text) + self.assertIn("1.1 小节", text) + + def test_parse_roundtrip_numbered_outline(self): + raw = "标书标题\n一、第一章\n1.1 节A\n" + _, sections, normalized = _parse_outline(raw) + self.assertGreaterEqual(len(sections), 2) + self.assertIn("一、第一章", normalized) + self.assertIn("1.1 节A", normalized) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_parse_outline.py b/tests/test_parse_outline.py new file mode 100644 index 0000000..de4d4fb --- /dev/null +++ b/tests/test_parse_outline.py @@ -0,0 +1,24 @@ +"""大纲解析:1.1 类编号不得被误拆成一级 1 与 title '.1 标题'。""" +import unittest + +from modules.generator import _parse_outline + + +class TestParseOutline(unittest.TestCase): + def test_11_stays_single_section(self): + text = "某某项目标书标题\n1.1 沟槽开挖与支护\n1.2 排降水\n" + _, sections, _ = _parse_outline(text) + self.assertEqual(len(sections), 2, [s.get('number') for s in sections]) + for s in sections: + if s.get('level') == 1: + self.assertFalse( + (s.get('title') or '').lstrip().startswith('.'), + '不得出现一级章节 title 以 .1 开头(误将 1.1 拆成 1 与 
.1 标题)', + ) + titles = ' '.join(s['title'] for s in sections) + self.assertIn('沟槽', titles) + self.assertIn('排降', titles) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_volume_chapters.py b/tests/test_volume_chapters.py new file mode 100644 index 0000000..f359a5f --- /dev/null +++ b/tests/test_volume_chapters.py @@ -0,0 +1,86 @@ +"""目标页数与一级篇章区间。""" +import random +import unittest + +from utils import volume_chapters as vc + + +class TestVolumeChapters(unittest.TestCase): + def test_top_level_default_pages_zero(self): + lo, hi = vc.top_level_chapter_range_from_pages(0) + self.assertEqual((lo, hi), (8, 10)) + + def test_ranges_match_effective_volume_bands(self): + self.assertEqual(vc.top_level_chapter_range_from_pages(100), (6, 8)) + self.assertEqual(vc.top_level_chapter_range_from_pages(125), (6, 8)) + self.assertEqual(vc.top_level_chapter_range_from_pages(150), (8, 10)) + self.assertEqual(vc.top_level_chapter_range_from_pages(200), (10, 12)) + self.assertEqual(vc.top_level_chapter_range_from_pages(300), (12, 16)) + + def test_hint_default_no_pages(self): + h = vc.outline_chapter_count_hint(0, 'standard') + self.assertIn('8-10', h) + self.assertIn('不超过10', h) + + def test_hint_with_pages(self): + h = vc.outline_chapter_count_hint(150, 'standard', 700) + self.assertIn('约 8–10', h) + self.assertIn('150', h) + self.assertIn('105000', h) # 150×700 总字目标 + self.assertIn('过细', h) + + def test_subchapter_base_anchor_points(self): + self.assertAlmostEqual(vc.subchapter_total_base_from_pages(100), 78.0, places=5) + self.assertAlmostEqual(vc.subchapter_total_base_from_pages(300), 212.0, places=5) + self.assertEqual(vc.SUBCHAPTER_PAGES_SLOPE, 0.67) + self.assertEqual(vc.SUBCHAPTER_PAGES_INTERCEPT, 11.0) + + def test_subchapter_jitter_bounds_78_anchor(self): + """100 页基线 78 章,±10% 严格为 [70, 86]。""" + self.assertEqual(vc.subchapter_jitter_bounds(78.0), (70, 86)) + + def test_subchapter_jitter_bounds_300_pages(self): + 
self.assertEqual(vc.subchapter_jitter_bounds(212.0), (191, 233)) + + def test_allocate_subchapters_to_mains(self): + self.assertEqual(vc.allocate_subchapters_to_mains(10, 3), [4, 3, 3]) + self.assertEqual(vc.allocate_subchapters_to_mains(0, 3), [0, 0, 0]) + self.assertEqual(vc.allocate_subchapters_to_mains(5, 2), [3, 2]) + self.assertEqual(vc.allocate_subchapters_to_mains(7, 0), []) + # n < k 时多出的主章 quota 为 0 + a = vc.allocate_subchapters_to_mains(70, 100) + self.assertEqual(len(a), 100) + self.assertEqual(sum(a), 70) + self.assertEqual(a.count(1), 70) + self.assertEqual(a.count(0), 30) + + def test_subchapter_effective_respects_k_floor_and_jitter(self): + # round(78 * u) for u in [0.9, 1.1] stays in [70, 86] for 78.0 base + for seed in range(800): + n = vc.subchapter_total_effective(100, 1, random.Random(seed)) + self.assertGreaterEqual(n, 70) + self.assertLessEqual(n, 86) + # 主章数很大时,总条数仍须在 [70, 86](不得被 max(n,k) 抬到数百) + for seed in range(20): + nk = vc.subchapter_total_effective(100, 500, random.Random(seed)) + self.assertGreaterEqual(nk, 70, msg=f'seed={seed}') + self.assertLessEqual(nk, 86, msg=f'seed={seed}') + + def test_subchapter_effective_zero_pages(self): + self.assertEqual(vc.subchapter_total_effective(0, 5), 0) + self.assertEqual(vc.subchapter_total_effective(100, 0), 0) + + def test_resolve_expand_target_pages(self): + self.assertEqual(vc.resolve_expand_target_pages(None, True, 100, 200), 0) + self.assertEqual(vc.resolve_expand_target_pages(200, False, 100, 50), 200) + self.assertEqual(vc.resolve_expand_target_pages(0, False, 80, 0), 80) + self.assertEqual(vc.resolve_expand_target_pages(0, False, 0, 50), 50) + self.assertEqual( + vc.resolve_expand_target_pages(0, False, 0, 0), + vc.EXPAND_OUTLINE_DEFAULT_TARGET_PAGES, + ) + self.assertEqual(vc.EXPAND_OUTLINE_DEFAULT_TARGET_PAGES, 100) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_word_allocation.py b/tests/test_word_allocation.py new file mode 100644 index 0000000..f144964 
--- /dev/null +++ b/tests/test_word_allocation.py @@ -0,0 +1,170 @@ +"""字数分配与 rating_json 解析单元测试。""" +import json +import unittest + +import config as cfg + +from utils import word_allocation as wa + + +class TestParseRatingJson(unittest.TestCase): + def test_canonical_items(self): + raw = json.dumps( + { + 'items': [ + {'id': 'T1', 'name': '施工方案', 'weight': 30, 'keywords': ['工艺']}, + {'id': 'T2', 'name': '质量保证', 'weight': 10, 'keywords': []}, + ], + 'notes': '', + }, + ensure_ascii=False, + ) + items = wa.parse_rating_json(raw) + self.assertEqual(len(items), 2) + names = {x['name'] for x in items} + self.assertIn('施工方案', names) + self.assertIn('质量保证', names) + wmap = {x['name']: x['weight'] for x in items} + self.assertEqual(wmap['施工方案'], 30.0) + + def test_malformed_returns_empty(self): + self.assertEqual(wa.parse_rating_json('not json'), []) + self.assertEqual(wa.parse_rating_json(''), []) + + +class TestComputeLeafAllocations(unittest.TestCase): + def test_none_when_no_rating_and_not_target_pages_budget(self): + leaves = [{'id': 1, 'section_title': '一、总体方案'}] + rules = dict(wa.DEFAULT_RULES) + rules['budget_mode'] = 'anchor_mean' + self.assertIsNone( + wa.compute_leaf_allocations('standard', leaves, '', rules) + ) + + def test_uniform_when_no_rating_but_target_pages(self): + """无技术评分时仍按目标页均分 B=页×每页字,全稿不随节数 N 线性爆量。""" + leaves = [ + {'id': 1, 'section_title': 'A'}, + {'id': 2, 'section_title': 'B'}, + ] + rules = dict(wa.DEFAULT_RULES) + rules['budget_mode'] = 'target_pages' + old_tp = cfg.TARGET_PAGES + old_pce = cfg.PAGE_CHAR_ESTIMATE + try: + cfg.TARGET_PAGES = 100 + cfg.PAGE_CHAR_ESTIMATE = 700 + out = wa.compute_leaf_allocations('standard', leaves, '', rules) + finally: + cfg.TARGET_PAGES = old_tp + cfg.PAGE_CHAR_ESTIMATE = old_pce + self.assertIsNotNone(out) + s = out[1]['target_chars'] + out[2]['target_chars'] + self.assertEqual(s, 100 * 700) + self.assertEqual(out[1]['target_chars'], out[2]['target_chars']) + + def 
test_monotonicity_high_weight_match(self): + rating = json.dumps( + { + 'items': [ + {'name': '施工组织设计', 'weight': 50, 'keywords': ['进度']}, + {'name': '页眉页脚规范', 'weight': 2, 'keywords': []}, + ] + }, + ensure_ascii=False, + ) + leaves = [ + {'id': 10, 'section_title': '3.1 施工组织设计与进度计划'}, + {'id': 11, 'section_title': '9.9 页眉格式说明'}, + ] + rules = dict(wa.DEFAULT_RULES) + rules['alpha'] = 0.95 + out = wa.compute_leaf_allocations('standard', leaves, rating, rules) + self.assertIsNotNone(out) + t_high = out[10]['target_chars'] + t_low = out[11]['target_chars'] + self.assertGreaterEqual(t_high, t_low, '强匹配高分项的章节应不低于弱匹配章节') + self.assertIn('施工组织设计', out[10]['word_count_spec']) + + def test_budget_anchor_mean(self): + rating = json.dumps( + {'items': [{'name': '技术部分', 'weight': 100}]}, + ensure_ascii=False, + ) + leaves = [ + {'id': 1, 'section_title': 'A'}, + {'id': 2, 'section_title': 'B'}, + {'id': 3, 'section_title': 'C'}, + ] + rules = dict(wa.DEFAULT_RULES) + rules['budget_mode'] = 'anchor_mean' + rules['alpha'] = 0.0 + old_tp = getattr(cfg, 'TARGET_PAGES', 0) + setattr(cfg, 'TARGET_PAGES', 0) + try: + out = wa.compute_leaf_allocations('standard', leaves, rating, rules) + finally: + setattr(cfg, 'TARGET_PAGES', old_tp) + self.assertIsNotNone(out) + base, core, _, _ = wa.VOLUME_PRESETS['standard'] + expect = int(round(len(leaves) * (base + core) / 2.0)) + s = sum(out[i]['target_chars'] for i in (1, 2, 3)) + self.assertEqual(s, expect) + + def test_budget_target_pages(self): + rating = json.dumps( + {'items': [{'name': '技术部分', 'weight': 100}]}, + ensure_ascii=False, + ) + leaves = [ + {'id': 1, 'section_title': 'A'}, + {'id': 2, 'section_title': 'B'}, + ] + rules = dict(wa.DEFAULT_RULES) + rules['budget_mode'] = 'target_pages' + rules['alpha'] = 0.0 + old_tp = cfg.TARGET_PAGES + old_pce = cfg.PAGE_CHAR_ESTIMATE + try: + cfg.TARGET_PAGES = 100 + cfg.PAGE_CHAR_ESTIMATE = 700 + out = wa.compute_leaf_allocations('standard', leaves, rating, rules) + finally: + 
cfg.TARGET_PAGES = old_tp + cfg.PAGE_CHAR_ESTIMATE = old_pce + self.assertIsNotNone(out) + expect = 100 * 700 + s = sum(out[i]['target_chars'] for i in (1, 2)) + self.assertEqual(s, expect) + + def test_budget_target_pages_falls_back_when_pages_zero(self): + rating = json.dumps( + {'items': [{'name': '技术部分', 'weight': 100}]}, + ensure_ascii=False, + ) + leaves = [ + {'id': 1, 'section_title': 'A'}, + {'id': 2, 'section_title': 'B'}, + ] + rules = dict(wa.DEFAULT_RULES) + rules['budget_mode'] = 'target_pages' + rules['alpha'] = 0.0 + old_tp = cfg.TARGET_PAGES + try: + cfg.TARGET_PAGES = 0 + out = wa.compute_leaf_allocations('standard', leaves, rating, rules) + finally: + cfg.TARGET_PAGES = old_tp + self.assertIsNotNone(out) + base, core, _, _ = wa.VOLUME_PRESETS['standard'] + expect = int(round(len(leaves) * (base + core) / 2.0)) + s = sum(out[i]['target_chars'] for i in (1, 2)) + self.assertEqual(s, expect) + + def test_continuation_threshold(self): + self.assertEqual(wa.continuation_threshold(2000), 1300) + self.assertEqual(wa.continuation_threshold(100), 200) + + +if __name__ == '__main__': + unittest.main() diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1 @@ + diff --git a/utils/ai_client.py b/utils/ai_client.py new file mode 100644 index 0000000..9b6aea1 --- /dev/null +++ b/utils/ai_client.py @@ -0,0 +1,252 @@ +""" +AI API 调用封装,支持 OpenAI、阿里云通义千问、DeepSeek、Ollama(均兼容 OpenAI SDK) +""" +import re +import time +import logging +from openai import OpenAI +import config + +logger = logging.getLogger(__name__) + +PROVIDER_NAMES = { + 'qwen': '通义千问 (Qwen)', + 'deepseek': 'DeepSeek', + 'openai': 'OpenAI', + 'ollama': 'Ollama 本地', + 'doubao': '豆包 (Doubao)', + 'kimi': 'Kimi (Moonshot)', +} + +PROVIDER_LINKS = { + 'qwen': 'https://dashscope.aliyun.com/', + 'deepseek': 'https://platform.deepseek.com/', + 'openai': 'https://platform.openai.com/', + 'ollama': 'https://ollama.com/', + 
'doubao': 'https://console.volcengine.com/ark/', + 'kimi': 'https://platform.moonshot.cn/', +} + + +def _check_api_key(): + """调用前预检 API Key,无效时直接抛出友好提示,不做无意义的重试""" + provider = config.MODEL_PROVIDER + + # Ollama 本地无需 API Key,跳过检查 + if provider == 'ollama': + return + + name = PROVIDER_NAMES.get(provider, provider) + link = PROVIDER_LINKS.get(provider, '') + + if provider == 'qwen': + key = config.QWEN_API_KEY + elif provider == 'deepseek': + key = config.DEEPSEEK_API_KEY + elif provider == 'doubao': + key = config.DOUBAO_API_KEY + elif provider == 'kimi': + key = config.KIMI_API_KEY + else: + key = config.OPENAI_API_KEY + + if not key or key.startswith('sk-your'): + raise RuntimeError( + f'尚未配置 {name} 的 API Key。' + f'请点击右上角设置按钮,选择"{name}"并填入有效的 API Key。' + f'申请地址:{link}' + ) + + +def _get_client() -> OpenAI: + """根据 MODEL_PROVIDER 返回对应的 OpenAI 兼容客户端""" + if config.MODEL_PROVIDER == 'qwen': + return OpenAI(api_key=config.QWEN_API_KEY, base_url=config.QWEN_BASE_URL) + if config.MODEL_PROVIDER == 'deepseek': + return OpenAI(api_key=config.DEEPSEEK_API_KEY, base_url=config.DEEPSEEK_BASE_URL) + if config.MODEL_PROVIDER == 'ollama': + return OpenAI(api_key='ollama', base_url=config.OLLAMA_BASE_URL) + if config.MODEL_PROVIDER == 'doubao': + return OpenAI(api_key=config.DOUBAO_API_KEY, base_url=config.DOUBAO_BASE_URL) + if config.MODEL_PROVIDER == 'kimi': + return OpenAI(api_key=config.KIMI_API_KEY, base_url=config.KIMI_BASE_URL) + return OpenAI(api_key=config.OPENAI_API_KEY, base_url=config.OPENAI_BASE_URL) + + +def _get_model() -> str: + if config.MODEL_PROVIDER == 'qwen': + return config.QWEN_MODEL + if config.MODEL_PROVIDER == 'deepseek': + return config.DEEPSEEK_MODEL + if config.MODEL_PROVIDER == 'ollama': + return config.OLLAMA_MODEL + if config.MODEL_PROVIDER == 'doubao': + return config.DOUBAO_MODEL + if config.MODEL_PROVIDER == 'kimi': + return config.KIMI_MODEL + return config.OPENAI_MODEL + + +def _clean_response(text: str) -> str: + """ + 过滤推理模型(DeepSeek R1 / 
QwQ 等)输出的 ... 思考过程标签, + 只保留最终正文内容,避免思考链污染标书正文。 + """ + # 去除 ... 块(含跨行内容) + text = re.sub(r'[\s\S]*?', '', text, flags=re.IGNORECASE) + return text.strip() + + +def _is_auth_error(e: Exception) -> bool: + """判断是否为认证错误(401 / invalid_api_key),无需重试""" + # 优先用 openai 原生异常类型判断 + try: + from openai import AuthenticationError, PermissionDeniedError + if isinstance(e, (AuthenticationError, PermissionDeniedError)): + return True + except ImportError: + pass + # 兜底:字符串匹配 + err_str = str(e).lower() + return ('401' in err_str or 'invalid_api_key' in err_str + or 'incorrect api key' in err_str or 'authentication' in err_str) + + +# OpenAI o 系列推理模型:不支持 temperature,max_tokens 需用 max_completion_tokens +_OPENAI_REASONING_MODELS = {'o1', 'o1-mini', 'o1-pro', 'o3', 'o3-mini', 'o3-pro', 'o4-mini'} + + +def _build_chat_kwargs( + model: str, + messages: list, + temperature: float, + max_tokens: int, + request_timeout: float | None = None, +) -> dict: + """ + 根据模型类型构建 chat.completions.create 的参数字典。 + OpenAI o 系列推理模型不接受 temperature,且使用 max_completion_tokens 替代 max_tokens。 + """ + base_model = model.split(':')[0] # 去掉 ollama tag 后缀 + is_reasoning = base_model in _OPENAI_REASONING_MODELS + + to = request_timeout if request_timeout is not None else config.REQUEST_TIMEOUT + kwargs = { + 'model': model, + 'messages': messages, + 'timeout': to, + } + if is_reasoning: + kwargs['max_completion_tokens'] = max_tokens + else: + kwargs['temperature'] = temperature + kwargs['max_tokens'] = max_tokens + return kwargs + + +def chat( + prompt: str, + system: str = '你是一位专业的投标文件撰写专家。', + temperature: float = 0.7, + max_tokens: int = 8192, + retries: int = None, + request_timeout: float | None = None, +) -> str: + """ + 调用 AI 接口,返回文本响应。 + 认证错误立即终止;其他错误指数退避重试。 + 自动兼容 OpenAI o 系列推理模型的参数差异。 + """ + _check_api_key() + + max_retries = retries if retries is not None else config.MAX_RETRIES + client = _get_client() + model = _get_model() + provider = config.MODEL_PROVIDER + name = PROVIDER_NAMES.get(provider, 
provider) + + messages = [ + {'role': 'system', 'content': system}, + {'role': 'user', 'content': prompt}, + ] + + for attempt in range(max_retries): + try: + kwargs = _build_chat_kwargs( + model, messages, temperature, max_tokens, request_timeout=request_timeout + ) + resp = client.chat.completions.create(**kwargs) + return _clean_response(resp.choices[0].message.content.strip()) + except Exception as e: + if _is_auth_error(e): + raise RuntimeError( + f'{name} API Key 无效或已过期,请在设置中重新配置。' + f'申请地址:{PROVIDER_LINKS.get(provider, "")}' + ) from e + + wait = 2 ** attempt + logger.warning(f'AI 请求失败 (第{attempt+1}次),{wait}s 后重试: {e}') + if attempt < max_retries - 1: + time.sleep(wait) + else: + raise RuntimeError(f'AI 接口调用失败(已重试 {max_retries} 次): {e}') from e + + return '' + + +def chat_with_history(system: str, messages: list, + temperature: float = 0.7, max_tokens: int = 4096) -> str: + """ + 多轮对话接口,支持完整历史上下文,用于对话式章节生成。 + messages 格式:[{'role': 'user'|'assistant', 'content': str}, ...] + """ + _check_api_key() + + client = _get_client() + model = _get_model() + provider = config.MODEL_PROVIDER + name = PROVIDER_NAMES.get(provider, provider) + + full_messages = [{'role': 'system', 'content': system}] + messages + + for attempt in range(config.MAX_RETRIES): + try: + kwargs = _build_chat_kwargs(model, full_messages, temperature, max_tokens) + resp = client.chat.completions.create(**kwargs) + return _clean_response(resp.choices[0].message.content.strip()) + except Exception as e: + if _is_auth_error(e): + raise RuntimeError( + f'{name} API Key 无效或已过期,请在设置中重新配置。' + f'申请地址:{PROVIDER_LINKS.get(provider, "")}' + ) from e + wait = 2 ** attempt + logger.warning(f'对话 AI 请求失败 (第{attempt+1}次),{wait}s 后重试: {e}') + if attempt < config.MAX_RETRIES - 1: + time.sleep(wait) + else: + raise RuntimeError(f'AI 接口调用失败(已重试 {config.MAX_RETRIES} 次): {e}') from e + + return '' + + +def get_embeddings(texts: list[str]) -> list[list[float]]: + """获取文本嵌入向量。 + 支持 Qwen、OpenAI、Kimi;DeepSeek / Ollama / 豆包 
暂不提供 Embedding API。 + """ + provider = config.MODEL_PROVIDER + if provider in ('deepseek', 'ollama', 'doubao'): + raise NotImplementedError( + f'{PROVIDER_NAMES.get(provider)} 暂不支持 Embedding API,知识库将使用关键词检索降级' + ) + + client = _get_client() + if provider == 'qwen': + model = config.QWEN_EMBEDDING_MODEL + elif provider == 'kimi': + model = config.KIMI_EMBEDDING_MODEL + else: + model = config.OPENAI_EMBEDDING_MODEL + + resp = client.embeddings.create(model=model, input=texts) + return [item.embedding for item in resp.data] diff --git a/utils/attachment_section.py b/utils/attachment_section.py new file mode 100644 index 0000000..6df16f4 --- /dev/null +++ b/utils/attachment_section.py @@ -0,0 +1,186 @@ +""" +附件类章节识别:标题匹配、expand_outline 跳过、以及正文模式(完整正文 vs 仅单图单表)。 +""" +from __future__ import annotations + +import json +import logging +import os +import re +from typing import Any, Dict, List, Optional + +import config + +logger = logging.getLogger(__name__) + +DEFAULT_ATTACHMENT_RULES: Dict[str, Any] = { + 'schema_version': 1, + 'title_regex': [ + r'附件\s*[一二三四五六七八九十0-9A-Za-z、::.]', + r'附\s*图', + r'附\s*表', + r'附\s*件\s*\(', + r'^\s*[\d一二三四五六七八九十\..、]+\s*附件', + ], + 'table_hint_keywords': [ + '附表', '一览表', '清单表', '统计表', '明细表', + ], + 'figure_hint_keywords': [ + '附图', '示意图', '平面图', '流程图', '布置图', '组织图', '横道', + ], + 'default_kind_when_ambiguous': 'table', + # stack_charts_only:默认,意图栈只输出 [FIGURE]/[TABLE] 无正文;full:长文;single_chart_only:栈顶仅一块 + 'attachment_leaf_body_mode': 'stack_charts_only', +} + + +def attachment_rules_path() -> str: + return os.path.join(config.DATA_DIR, 'attachment_section_rules.json') + + +def load_attachment_rules(path: Optional[str] = None) -> Dict[str, Any]: + p = path or attachment_rules_path() + data = dict(DEFAULT_ATTACHMENT_RULES) + if not os.path.isfile(p): + return data + try: + with open(p, encoding='utf-8') as f: + raw = json.load(f) + if isinstance(raw, dict): + for k, v in raw.items(): + if k.startswith('_'): + continue + data[k] = v + except 
Exception as e: + logger.warning('加载 attachment_section_rules.json 失败,使用内置默认: %s', e) + return data + + +def attachment_leaf_body_mode(rules: Optional[Dict[str, Any]] = None) -> str: + """ + 附件叶节点正文策略: + stack_charts_only(默认)、full(完整技术正文)、single_chart_only(栈顶仅一块图或表)。 + """ + r = rules or get_attachment_rules_cached() + mode = (r.get('attachment_leaf_body_mode') or 'stack_charts_only').strip().lower() + if mode in ('single_chart_only', 'stack_charts_only', 'full'): + return mode + return 'stack_charts_only' + + +def use_attachment_stack_charts_body(rules: Optional[Dict[str, Any]] = None) -> bool: + """附件走「意图栈仅图/表、无长文」路径(含 single_chart_only 的单栈顶版本)。""" + m = attachment_leaf_body_mode(rules) + return m in ('stack_charts_only', 'single_chart_only') + + +def use_attachment_single_chart_only_body(rules: Optional[Dict[str, Any]] = None) -> bool: + return attachment_leaf_body_mode(rules) == 'single_chart_only' + + +def use_attachment_full_body(rules: Optional[Dict[str, Any]] = None) -> bool: + return attachment_leaf_body_mode(rules) == 'full' + + +def is_attachment_only_section(section_title: str, rules: Optional[Dict[str, Any]] = None) -> bool: + """标题是否属于附件类(附图/附表/附件N 等),用于 expand_outline 跳过与正文分支。""" + t = (section_title or '').strip() + if not t: + return False + r = rules or load_attachment_rules() + patterns: List[str] = list(r.get('title_regex') or []) + for pat in patterns: + try: + if re.search(pat, t): + return True + except re.error: + logger.warning('无效 attachment title_regex,已跳过: %s', pat[:80]) + return False + + +def pick_single_figure_or_table( + section_title: str, + enable_figure: bool, + enable_table: bool, + rules: Optional[Dict[str, Any]] = None, +) -> Optional[str]: + """ + 返回 'figure' | 'table' | None。 + 两开关均关返回 None;仅开一个则取对应类型。 + """ + if not enable_figure and not enable_table: + return None + if enable_figure and not enable_table: + return 'figure' + if enable_table and not enable_figure: + return 'table' + + r = rules or load_attachment_rules() + t 
= (section_title or '') + tbl_kw = list(r.get('table_hint_keywords') or []) + fig_kw = list(r.get('figure_hint_keywords') or []) + # 单独「表」字易误判,仅当同时存在附表类或与其它词组合时再偏表 + for kw in tbl_kw: + if kw and kw in t: + return 'table' + for kw in fig_kw: + if kw and kw in t: + return 'figure' + # 泛「表」在附件语境下常见 + if '表' in t and '图' not in t: + return 'table' + if '图' in t and '表' not in t: + return 'figure' + + default = (r.get('default_kind_when_ambiguous') or 'table').strip().lower() + if default == 'figure': + return 'figure' + return 'table' + + +_cached_rules: Optional[Dict[str, Any]] = None + + +def get_attachment_rules_cached() -> Dict[str, Any]: + global _cached_rules + if _cached_rules is None: + _cached_rules = load_attachment_rules() + return _cached_rules + + +def should_skip_expand_subchapters(title: str) -> bool: + """ + AI 自动填充小章节(expand_outline)时:附件类一级主章不调用子章节生成。 + 判定与 is_attachment_only_section 一致。 + """ + return is_attachment_only_section(title, get_attachment_rules_cached()) + + +# 从标题中提取「附件几」等标签,用于日志 +_ATTACHMENT_LABEL_RE = re.compile( + r'附件\s*[::]?\s*([一二三四五六七八九十百0-9A-Za-z]+)', +) +_ATTACHMENT_FIG_TBL_RE = re.compile(r'附\s*[图表]\s*([一二三四五六七八九十百0-9]*)') + + +def parse_attachment_label(title: str) -> Optional[str]: + t = (title or '').strip() + if not t: + return None + m = _ATTACHMENT_LABEL_RE.search(t) + if m: + return m.group(1).strip() or None + m2 = _ATTACHMENT_FIG_TBL_RE.search(t) + if m2: + rest = (m2.group(1) or '').strip() + if rest: + return rest + matched = m2.group(0) + if '图' in matched: + return '附图' + return '附表' + if re.search(r'附\s*图', t): + return '附图' + if re.search(r'附\s*表', t): + return '附表' + return None diff --git a/utils/bill_analysis.py b/utils/bill_analysis.py new file mode 100644 index 0000000..1a3c90d --- /dev/null +++ b/utils/bill_analysis.py @@ -0,0 +1,577 @@ +""" +工程量清单本地分析(从 bill-worker.js Phase 2/3 移植)。 +Phase 2:按页关键字筛选清单页;Phase 3:正则解析分部与清单项。 +""" +from __future__ import annotations + +import logging +import re +from typing 
import Any + +logger = logging.getLogger(__name__) + +BILL_KW = ['项目编码', '项目名称', '工程量', '计量单位', '综合单价', '清单编码'] +SEC_KW = ['分部分项', '分类分项', '措施项目', '其他项目', '工程量清单计价'] +FEE_PAGE_KW = [ + '规费', '税金', '社会保险费', '住房公积金', '养老保险', + '工伤保险', '失业保险', '医疗保险', '教育费附加', '城市维护建设税', +] + +ITEM_START = re.compile(r'^\d+(\.\d+)+\s') +CODE_INLINE = re.compile(r'(?:^|\s)(\d{9,12}|(? str: + def repl(m: re.Match) -> str: + a, b, c, d = m.group(1), m.group(2), m.group(3), m.group(4) or '' + combined = a + b + c + d + if 9 <= len(combined) <= 12: + return combined + return m.group(0) + + return _DASH_CODE.sub(repl, line) + + +def is_fee_item(name: str) -> bool: + if not name: + return False + n = re.sub(r'\s+', '', name) + if n in _EXACT_FEE_ITEM: + return True + for kw in _FEE_KW: + if kw in n: + return True + return False + + +def split_name_and_spec(raw_name: str) -> tuple[str, str]: + if not raw_name: + return '', '' + m = re.search(r'\d+[.、.)\uFF09]\s*[\u4e00-\u9fff]', raw_name) + if m and m.start() > 0: + return raw_name[:m.start()].strip(), raw_name[m.start():].strip() + kw = _SPEC_KW_RE.search(raw_name) + if kw and kw.start() > 0: + return raw_name[:kw.start()].strip(), raw_name[kw.start():].strip() + paren = re.search(r'[((]\d+[))]', raw_name) + if paren and paren.start() > 0: + return raw_name[:paren.start()].strip(), raw_name[paren.start():].strip() + return raw_name, '' + + +def is_cat_title(text: str) -> bool: + return any(k in text for k in _CAT_KW) + + +def is_fee_cat_title(text: str) -> bool: + if not text: + return False + t = re.sub(r'\s+', '', text) + if t in _EXACT_FEE_CAT: + return True + for kw in _FEE_CAT_KW: + if kw in t: + return True + return False + + +def _is_new_line_trigger(raw: str) -> bool: + if ITEM_START.match(raw): + return True + if CODE_START_RE.match(raw): + return True + if SEQ_CODE_RE.match(raw): + return True + for m in CATEGORY_MARKERS: + if raw.startswith(m + ' ') or raw.startswith(m + '\u3000'): + return True + return False + + +def 
parse_bill_text(text: str) -> dict[str, Any]: + raw_lines = [] + for l in text.split('\n'): + line = l.replace('\t', ' ').strip() + line = _fold_dash_codes(line) + raw_lines.append(line) + + logic_lines: list[str] = [] + current_line = '' + + for raw in raw_lines: + if not raw or PAGE_MARK.match(raw): + continue + if HEADER_RE.match(raw) or HEADER_KW.match(raw): + continue + if re.match(r'^(元)|^款章节号|^备注$|^第\d+页', raw): + continue + + if _is_new_line_trigger(raw): + if current_line: + logic_lines.append(current_line) + current_line = raw + elif CODE_INLINE.search(raw) and len(raw) > 15: + if current_line: + logic_lines.append(current_line) + current_line = raw + else: + if current_line and len(current_line) > 300: + logic_lines.append(current_line) + current_line = raw + else: + current_line = current_line + ' ' + raw if current_line else raw + if current_line: + logic_lines.append(current_line) + + logger.debug('合并后 %s 条逻辑行(原始 %s 行)', len(logic_lines), len(raw_lines)) + + categories: list[dict[str, Any]] = [] + cur_cat: dict[str, Any] | None = None + cur_item: dict[str, Any] | None = None + + for line in logic_lines: + if SKIP_RE.search(line): + continue + + # 行首序号:多级如「1.1.1.1 」;或「1–4 位序号 + 空格 + 9 位以上编码」。 + # 避免误删「行首即 9–12 位清单编码 + 空格」整段(JS 原 \d+(\.\d+)* 会吞掉编码)。 + stripped = line.strip() + m_hier = re.match(r'^\d+(?:\.\d+)+\s+', stripped) + if m_hier: + stripped = stripped[m_hier.end():].strip() + elif re.match(r'^\d{1,4}\s+\d{9}', stripped): + stripped = re.sub(r'^\d{1,4}\s+', '', stripped, count=1).strip() + if not stripped: + stripped = line.strip() + if not stripped: + continue + + cm = CODE_RE.search(stripped) + if cm: + if cur_item and cur_cat: + cur_cat['items'].append(cur_item) + if not cur_cat: + cur_cat = {'name': '未分类', 'items': []} + categories.append(cur_cat) + + code = cm.group(1) + rest = stripped[cm.end():].strip() + name, unit, quantity, spec = '', '', '', '' + + unit_match = UNIT_RE.search(rest) + if unit_match: + ui = 
rest.find(unit_match.group(0)) + raw_name = rest[:ui].strip() + unit = unit_match.group(1) + after_unit = rest[ui + len(unit_match.group(0)):].strip() + qm = re.match(r'^([\d,.]+)', after_unit) + if qm: + quantity = qm.group(1) + tail = after_unit[qm.end():].strip() + if tail: + tail_tokens = tail.split() + si = 0 + while si < len(tail_tokens) and re.match(r'^[\d,.%\-]+$', tail_tokens[si]): + si += 1 + spec_tail = ' '.join(tail_tokens[si:]).strip() + if spec_tail: + spec = spec_tail + ns_name, ns_spec = split_name_and_spec(raw_name) + name = ns_name + if ns_spec: + spec = ns_spec + (';' + spec if spec else '') + else: + tokens = [t for t in rest.split() if t] + found_unit_idx = -1 + for ti in range(len(tokens) - 1, 0, -1): + if tokens[ti] in UNIT_SET: + found_unit_idx = ti + break + if found_unit_idx >= 1: + raw_name_str = ' '.join(tokens[:found_unit_idx]) + ns_name, ns_spec = split_name_and_spec(raw_name_str) + name = ns_name + if ns_spec: + spec = ns_spec + unit = tokens[found_unit_idx] + after_tokens = tokens[found_unit_idx + 1:] + if after_tokens and re.match(r'^[\d,.]+$', after_tokens[0]): + quantity = after_tokens[0] + si = 1 + while si < len(after_tokens) and re.match(r'^[\d,.%\-]+$', after_tokens[si]): + si += 1 + spec_tail = ' '.join(after_tokens[si:]).strip() + if spec_tail: + spec = spec + ';' + spec_tail if spec else spec_tail + else: + name = rest + + name = re.sub(r'\s+', '', name).strip() + for u in UNIT_TOKENS: + if name.endswith(u) and len(name) > len(u): + unit = unit or u + name = name[: len(name) - len(u)] + break + + cur_item = {'code': code, 'name': name, 'unit': unit, 'quantity': quantity, 'spec': spec} + continue + + if len(stripped) > 4: + uni_match = UNIT_RE.search(stripped) + if uni_match: + ui = stripped.find(uni_match.group(0)) + before_unit = stripped[:ui].strip() + after_unit = stripped[ui + len(uni_match.group(0)):].strip() + has_qty = bool(re.match(r'^[\d,.]+', after_unit)) + if ( + 2 <= len(before_unit) <= 50 + and has_qty + and 
re.search(r'[\u4e00-\u9fff]', before_unit) + ): + if cur_item and cur_cat: + cur_cat['items'].append(cur_item) + if not cur_cat: + cur_cat = {'name': '未分类', 'items': []} + categories.append(cur_cat) + unit_fb = uni_match.group(1) + qm = re.match(r'^([\d,.]+)', after_unit) + quantity_fb = qm.group(1) if qm else '' + ns_name, ns_spec = split_name_and_spec(before_unit) + name_fb = re.sub(r'\s+', '', ns_name).strip() + spec_fb = ns_spec or '' + cur_item = {'code': '', 'name': name_fb, 'unit': unit_fb, 'quantity': quantity_fb, 'spec': spec_fb} + continue + + if 2 < len(stripped) < 60 and not CODE_RE.search(stripped): + if UNIT_RE.search(stripped) and re.search(r'\d+\.?\d*\s*$', stripped): + if cur_item: + cur_item['spec'] = (cur_item.get('spec') or '') + ( + ';' + stripped if cur_item.get('spec') else stripped + ) + continue + if is_cat_title(stripped) and not UNIT_RE.search(stripped) and not is_fee_cat_title(stripped): + if cur_item and cur_cat: + cur_cat['items'].append(cur_item) + cur_item = None + clean_title = re.sub( + r'\s+(座|个|项|处|m|km|段|条)\s+\d+[\d.]*\s*$', '', stripped + ).strip() + cur_cat = {'name': clean_title, 'items': []} + categories.append(cur_cat) + continue + + if re.match(r'^[一二三四五六七八九十]+\s', stripped) or re.match( + r'^([一二三四五六七八九十\d]+)', stripped + ): + clean_title = re.sub(r'\s+(座|个|项|处)\s+\d+[\d.]*\s*$', '', stripped).strip() + if is_fee_cat_title(clean_title): + continue + if cur_item and cur_cat: + cur_cat['items'].append(cur_item) + cur_item = None + cur_cat = {'name': clean_title, 'items': []} + categories.append(cur_cat) + continue + + if cur_item and len(stripped) > 1: + cur_item['spec'] = (cur_item.get('spec') or '') + ( + ';' + stripped if cur_item.get('spec') else stripped + ) + + if cur_item and cur_cat: + cur_cat['items'].append(cur_item) + + fee_filtered = 0 + for cat in categories: + if cat.get('items'): + before = len(cat['items']) + cat['items'] = [it for it in cat['items'] if not is_fee_item(it.get('name', ''))] + fee_filtered += 
before - len(cat['items']) + if fee_filtered: + logger.debug('费用项过滤: 移除 %s 项', fee_filtered) + + total_before_merge = 0 + total_after_merge = 0 + for cat in categories: + items = cat.get('items') or [] + if not items: + continue + total_before_merge += len(items) + name_map: dict[str, dict[str, Any]] = {} + for item in items: + key = re.sub(r'\s+', '', (item.get('name') or '')).strip() + if not key: + continue + if key not in name_map: + name_map[key] = { + 'code': item.get('code') or '', + 'name': item['name'], + 'unit': item.get('unit') or '', + 'quantity': item.get('quantity') or '', + 'spec': item.get('spec') or '', + '_quantities': [item['quantity']] if item.get('quantity') else [], + '_specs': [item['spec']] if item.get('spec') else [], + } + else: + m = name_map[key] + if not m['code'] and item.get('code'): + m['code'] = item['code'] + if not m['unit'] and item.get('unit'): + m['unit'] = item['unit'] + if item.get('quantity'): + m['_quantities'].append(item['quantity']) + if item.get('spec') and item['spec'] not in m['_specs']: + m['_specs'].append(item['spec']) + + merged_items: list[dict[str, str]] = [] + for m in name_map.values(): + qlist = m['_quantities'] + if len(qlist) > 1: + nums = [] + ok = True + for q in qlist: + try: + nums.append(float(q.replace(',', ''))) + except ValueError: + ok = False + break + if ok: + s = sum(nums) + m['quantity'] = str(int(s)) if s % 1 == 0 else f'{s:.2f}' + else: + m['quantity'] = '; '.join(qlist) + elif len(qlist) == 1: + m['quantity'] = qlist[0] + + if m['_specs']: + trimmed = [s[:120] + '...' if len(s) > 120 else s for s in m['_specs']] + m['spec'] = '; '.join(trimmed) + if len(m['spec']) > 300: + m['spec'] = m['spec'][:300] + '...' 
+ for k in ('_quantities', '_specs'): + m.pop(k, None) + merged_items.append( + {k: m[k] for k in ('code', 'name', 'unit', 'quantity', 'spec')} + ) + cat['items'] = merged_items + total_after_merge += len(merged_items) + + merged_count = total_before_merge - total_after_merge + if merged_count > 0: + logger.debug('按名称合并: %s → %s 项', total_before_merge, total_after_merge) + + valid = [c for c in categories if c.get('items')] + total_items = sum(len(c['items']) for c in valid) + logger.debug( + '最终结果: %s 分部, %s 清单项', len(valid), total_items + ) + + return { + 'project_summary': { + 'remark': f'本地解析:{len(valid)} 个分部,{total_items} 个清单项(合并前 {total_before_merge} 项)', + }, + 'categories': valid, + } + + +def filter_bill_pages(page_texts: list[str]) -> tuple[list[str], dict[str, Any]]: + """ + 从按页文本中筛选工程量清单相关页;返回 (bill_page_texts, meta)。 + """ + n = len(page_texts) + meta: dict[str, Any] = {'total_pages': n, 'scanned': False, 'no_bill_pages': False} + + total_chars = sum(len(t or '') for t in page_texts) + if total_chars < 50: + meta['scanned'] = True + meta['reason'] = 'noText' + return [], meta + + bill_flags = [False] * n + for i, t in enumerate(page_texts): + if not (t or '').strip(): + continue + t = t or '' + h_hits = sum(1 for k in BILL_KW if k in t) + s_hit = any(k in t for k in SEC_KW) + has_code = bool(re.search(r'\d{9}', t)) + if h_hits >= 2 or s_hit or has_code: + bill_flags[i] = True + + first_bill = next((i for i, f in enumerate(bill_flags) if f), -1) + last_bill = max((i for i, f in enumerate(bill_flags) if f), default=-1) + if first_bill >= 0 and last_bill > first_bill: + for i in range(first_bill, last_bill + 1): + if bill_flags[i]: + continue + t = page_texts[i] or '' + if not t.strip() or len(t.strip()) <= 30: + continue + fee_hits = sum(1 for kw in FEE_PAGE_KW if kw in t) + if fee_hits >= 2 and not re.search(r'\d{9}', t): + continue + bill_flags[i] = True + + bill_texts = [page_texts[i] for i in range(n) if bill_flags[i]] + if not bill_texts: + 
meta['no_bill_pages'] = True + + meta['bill_page_indices'] = [i for i in range(n) if bill_flags[i]] + meta['bill_pages'] = len(bill_texts) + return bill_texts, meta + + +def analyze_boq_pages(page_texts: list[str]) -> dict[str, Any]: + """ + 串联筛选 + parse_bill_text;返回结构含 _meta,供持久化与前端。 + """ + total_pages = len(page_texts) + total_chars = sum(len(t or '') for t in page_texts) + + if total_chars < 50: + return { + 'scanned': True, + 'reason': 'noText', + 'totalPages': total_pages, + 'project_summary': {'remark': '文本过少,疑似扫描件或未提取到文字'}, + 'categories': [], + '_meta': { + 'method': 'python-local', + 'total_pages': total_pages, + 'bill_pages': 0, + }, + } + + bill_texts, fmeta = filter_bill_pages(page_texts) + if not bill_texts: + return { + 'scanned': False, + 'no_bill_pages': True, + 'totalPages': total_pages, + 'project_summary': {'remark': '未识别到清单相关页面'}, + 'categories': [], + '_meta': { + 'method': 'python-local', + 'total_pages': total_pages, + 'bill_pages': 0, + **{k: fmeta[k] for k in ('no_bill_pages',) if k in fmeta}, + }, + } + + merged = '\n'.join(bill_texts) + parsed = parse_bill_text(merged) + return { + 'scanned': False, + **parsed, + '_meta': { + 'method': 'python-local', + 'total_pages': total_pages, + 'bill_pages': len(bill_texts), + 'bill_page_indices': fmeta.get('bill_page_indices', []), + }, + } + + +def categories_to_prompt_appendix( + analysis: dict[str, Any], + max_chars: int = 3000, + max_per_cat: int = 40, +) -> str: + """将本地解析结果压成短文本,注入 AI 摘要提示词。""" + cats = analysis.get('categories') or [] + lines: list[str] = [] + for cat in cats: + name = cat.get('name', '') + items = cat.get('items') or [] + lines.append(f'【{name}】') + for it in items[:max_per_cat]: + code = it.get('code') or '-' + n = it.get('name') or '' + u = it.get('unit') or '' + q = it.get('quantity') or '' + lines.append(f' {code} {n} {u} {q}'.strip()) + if len(items) > max_per_cat: + lines.append(f' …共 {len(items)} 条,此处省略其余') + text = '\n'.join(lines).strip() + if len(text) > 
"""
Bill-of-quantities (BOQ) parsing: extract structured text from
Excel / CSV / PDF / Word files.
"""
import csv
import logging
import re
from pathlib import Path

logger = logging.getLogger(__name__)

# Maximum number of characters returned by extract_boq_text (the text is cut
# before it is sent to the AI for summarising).
MAX_BOQ_CHARS = 12000


def _extract_boq_raw(file_path: str) -> str:
    """Extract the complete, untruncated text of a BOQ file.

    Raises:
        ValueError: The file extension is not one of
            xlsx/xls/csv/pdf/docx/doc.
    """
    ext = Path(file_path).suffix.lower()
    if ext in ('.xlsx', '.xls'):
        return _extract_excel(file_path)
    if ext == '.csv':
        return _extract_csv(file_path)
    if ext == '.pdf':
        from utils.file_utils import _extract_pdf
        return _extract_pdf(file_path)
    if ext == '.docx':
        from utils.file_utils import _extract_docx
        return _extract_docx(file_path)
    if ext == '.doc':
        from utils.file_utils import _extract_doc
        return _extract_doc(file_path)
    raise ValueError(f'不支持的文件格式 {ext},请使用 xlsx/xls/csv/pdf/docx/doc')


def extract_boq_text(file_path: str) -> str:
    """Extract structured text from a BOQ file, capped at MAX_BOQ_CHARS.

    Supported extensions: .xlsx / .xls / .csv / .pdf / .docx / .doc

    Raises:
        ValueError: Unsupported file extension.
        RuntimeError: The underlying parser failed.
    """
    return _extract_boq_raw(file_path)[:MAX_BOQ_CHARS]


def extract_boq_pages(file_path: str) -> list[str]:
    """Return the BOQ text split into "pages".

    PDF files yield one element per page. Excel/CSV/Word files yield a
    single element holding the full text; it is not cut to MAX_BOQ_CHARS,
    so page-level analysis sees the whole document just as it does for PDF.
    """
    ext = Path(file_path).suffix.lower()
    if ext == '.pdf':
        from utils.file_utils import extract_pdf_pages
        return extract_pdf_pages(file_path)
    text = _extract_boq_raw(file_path)
    return [text] if text else ['']


# ─── Excel ────────────────────────────────────────────────────────────────

def _extract_excel(file_path: str) -> str:
    """Read every sheet of a workbook into pipe-separated text.

    Legacy ``.xls`` files are routed straight to the xlrd-based reader
    because openpyxl only understands the OOXML formats. The same reader is
    used when openpyxl is not installed.

    Raises:
        RuntimeError: The workbook could not be parsed.
    """
    if Path(file_path).suffix.lower() == '.xls':
        return _extract_xls_fallback(file_path)
    try:
        import openpyxl
    except ImportError:
        return _extract_xls_fallback(file_path)

    try:
        wb = openpyxl.load_workbook(file_path, data_only=True, read_only=True)
        try:
            parts = []
            for name in wb.sheetnames:
                block = _sheet_to_text(wb[name], name)
                if block.strip():
                    parts.append(block)
            return '\n\n'.join(parts)
        finally:
            # Read-only workbooks keep the file handle open until closed.
            wb.close()
    except Exception as e:
        raise RuntimeError(f'Excel 解析失败:{e}') from e


def _sheet_to_text(ws, sheet_name: str) -> str:
    """Render one sheet as pipe-separated lines.

    Rows and columns that are entirely empty are dropped. The first line is
    the sheet name wrapped in 【】. Returns '' for an empty sheet.
    """
    raw_rows = []
    for row in ws.iter_rows(values_only=True):
        cells = ['' if v is None else str(v).strip() for v in row]
        if any(cells):
            raw_rows.append(cells)

    if not raw_rows:
        return ''

    # Pad every row to the same width so columns can be inspected by index.
    max_cols = max(len(r) for r in raw_rows)
    raw_rows = [r + [''] * (max_cols - len(r)) for r in raw_rows]

    # Keep only the columns that hold at least one non-empty cell.
    active_cols = [j for j in range(max_cols) if any(r[j] for r in raw_rows)]
    if not active_cols:
        return ''

    lines = [f'【{sheet_name}】']
    for row in raw_rows:
        line = ' | '.join(row[j] for j in active_cols)
        if line.replace('|', '').strip():
            lines.append(line)
    return '\n'.join(lines)


def _extract_xls_fallback(file_path: str) -> str:
    """Read a legacy .xls workbook with xlrd (requires xlrd<2).

    Raises:
        RuntimeError: xlrd is unavailable or the file cannot be parsed.
    """
    try:
        import xlrd  # type: ignore
        wb = xlrd.open_workbook(file_path)
        parts = []
        for sheet in wb.sheets():
            lines = [f'【{sheet.name}】']
            for rx in range(sheet.nrows):
                cells = [str(sheet.cell_value(rx, cx)).strip()
                         for cx in range(sheet.ncols)]
                line = ' | '.join(c for c in cells if c)
                if line:
                    lines.append(line)
            parts.append('\n'.join(lines))
        return '\n\n'.join(parts)
    except Exception as e:
        raise RuntimeError(f'.xls 解析失败,请另存为 .xlsx 后重试:{e}') from e


# ─── CSV ─────────────────────────────────────────────────────────────────

def _extract_csv(file_path: str) -> str:
    """Read a CSV file as pipe-separated lines, trying several encodings.

    Empty cells and empty rows are dropped.

    Raises:
        RuntimeError: The file could not be read or parsed.
    """
    # NOTE(review): 'latin-1' decodes any byte sequence, so the final
    # "unsupported encoding" error below is effectively unreachable and an
    # unknown encoding yields mojibake instead. Kept as a best-effort fallback.
    encodings = ['utf-8-sig', 'gbk', 'utf-8', 'gb18030', 'latin-1']
    for enc in encodings:
        try:
            lines = []
            with open(file_path, 'r', encoding=enc, newline='') as f:
                for row in csv.reader(f):
                    line = ' | '.join(c.strip() for c in row if c.strip())
                    if line:
                        lines.append(line)
            return '\n'.join(lines)
        except (UnicodeDecodeError, UnicodeError):
            continue
        except Exception as e:
            raise RuntimeError(f'CSV 解析失败:{e}') from e
    raise RuntimeError('CSV 文件编码不支持,请另存为 UTF-8 格式后重试')
b/utils/diagram_intent.py new file mode 100644 index 0000000..9da7e48 --- /dev/null +++ b/utils/diagram_intent.py @@ -0,0 +1,283 @@ +""" +章节级图/表意图:字符特征 + 大纲上下文窗口计分,栈式优先级,驱动提示词附加段。 +""" +from __future__ import annotations + +import json +import logging +import os +import re +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Sequence, Tuple + +import config +from utils import prompts as P + +logger = logging.getLogger(__name__) + +DEFAULT_DIAGRAM_RULES: Dict[str, Any] = { + 'schema_version': 1, + 'threshold_figure': 1.0, + 'threshold_table': 1.0, + 'title_weight': 1.0, + 'context_weight': 0.6, + 'outline_context_lines': {'before': 4, 'after': 6}, + 'stack_order_when_both': 'score_desc', + 'figure_keywords': [], + 'table_keywords': [], +} + + +def diagram_rules_path() -> str: + return os.path.join(config.DATA_DIR, 'diagram_intent_rules.json') + + +def load_diagram_rules(path: Optional[str] = None) -> Dict[str, Any]: + """加载规则 JSON;文件缺失或解析失败时返回内置默认。""" + p = path or diagram_rules_path() + data = dict(DEFAULT_DIAGRAM_RULES) + if not os.path.isfile(p): + return data + try: + with open(p, encoding='utf-8') as f: + raw = json.load(f) + if isinstance(raw, dict): + for k, v in raw.items(): + if k.startswith('_'): + continue + if k == 'outline_context_lines' and isinstance(v, dict): + data['outline_context_lines'] = { + **data.get('outline_context_lines', {}), + **v, + } + else: + data[k] = v + except Exception as e: + logger.warning('加载 diagram_intent_rules.json 失败,使用内置默认: %s', e) + return data + + +def _normalize_keyword_entries(raw: Any) -> List[Tuple[str, float]]: + out: List[Tuple[str, float]] = [] + if not isinstance(raw, list): + return out + for item in raw: + if isinstance(item, str) and item.strip(): + out.append((item.strip(), 1.0)) + elif isinstance(item, dict): + t = (item.get('text') or item.get('pattern') or '').strip() + if not t: + continue + w = float(item.get('weight', 1.0)) + out.append((t, w)) + return out + + +def 
def _score_text(text: str, entries: Sequence[Tuple[str, float]]) -> float:
    """Sum the weights of every keyword that occurs in ``text``."""
    if not text or not entries:
        return 0.0
    return sum((w for kw, w in entries if kw in text), 0.0)


DiagramKind = str  # 'figure' | 'table'


@dataclass(frozen=True)
class DiagramIntent:
    """One planned figure/table block together with its score."""

    kind: str      # 'figure' or 'table'
    score: float   # combined title + context score
    sources: str   # label describing where the score came from


# Top of the stack is index 0 and takes effect first.
DiagramStack = List[DiagramIntent]


def score_figure_table(
    title: str,
    context_snippet: str,
    rules: Dict[str, Any],
) -> Tuple[float, float]:
    """Score the title and the context separately, then combine by weight.

    Returns:
        ``(figure_score, table_score)``.
    """
    fig_kw = _normalize_keyword_entries(rules.get('figure_keywords'))
    tbl_kw = _normalize_keyword_entries(rules.get('table_keywords'))
    tw = float(rules.get('title_weight', 1.0))
    cw = float(rules.get('context_weight', 0.6))
    t = title or ''
    c = context_snippet or ''
    fig = tw * _score_text(t, fig_kw) + cw * _score_text(c, fig_kw)
    tbl = tw * _score_text(t, tbl_kw) + cw * _score_text(c, tbl_kw)
    return fig, tbl


def extract_outline_window(
    outline_text: str,
    section_title: str,
    before: int,
    after: int,
    fallback_chars: int = 1200,
) -> str:
    """Return the outline lines surrounding a section title.

    The first outline line that contains the title (or the title with its
    leading serial number removed) anchors the window; ``before`` lines above
    and ``after`` lines below are included. When the title cannot be located
    the first ``fallback_chars`` characters of the outline are returned.
    """
    if not outline_text or not section_title:
        return (outline_text or '')[:fallback_chars]
    title_stripped = section_title.strip()
    if not title_stripped:
        return outline_text[:fallback_chars]
    lines = outline_text.splitlines()
    idx = -1

    # Prefer a whole-title match; otherwise compare with serials removed.
    def _strip_serial(s: str) -> str:
        return re.sub(r'^\s*[\d一二三四五六七八九十]+[、..\s]+', '', s).strip()

    core = _strip_serial(title_stripped)
    for i, line in enumerate(lines):
        line_s = line.strip()
        if title_stripped in line_s or (core and core in _strip_serial(line_s)):
            idx = i
            break
        if core and core in line_s:
            idx = i
            break
    if idx < 0:
        return outline_text[:fallback_chars]
    lo = max(0, idx - max(0, before))
    hi = min(len(lines), idx + max(0, after) + 1)
    return '\n'.join(lines[lo:hi])


def build_stack(
    fig_score: float,
    tbl_score: float,
    rules: Dict[str, Any],
    enable_figure: bool,
    enable_table: bool,
) -> DiagramStack:
    """Build the priority stack of figure/table intents for one section.

    A kind enters the stack when its switch is on and its score reaches the
    configured threshold. When both qualify, ``stack_order_when_both``
    decides the order: ``figure_first``, ``table_first`` or (default and any
    other value) ``score_desc``. The setting is matched case-insensitively.
    """
    tf = float(rules.get('threshold_figure', 1.0))
    tt = float(rules.get('threshold_table', 1.0))
    mode = (rules.get('stack_order_when_both') or 'score_desc').strip().lower()

    intents: List[DiagramIntent] = []
    if enable_figure and fig_score >= tf:
        intents.append(DiagramIntent('figure', fig_score, 'title+context'))
    if enable_table and tbl_score >= tt:
        intents.append(DiagramIntent('table', tbl_score, 'title+context'))
    if len(intents) <= 1:
        return intents

    # At this point intents is exactly [figure, table].
    if mode == 'figure_first':
        return intents
    if mode == 'table_first':
        return intents[::-1]
    # score_desc: higher score on top; the sort is stable, so a tie keeps
    # the figure first.
    return sorted(intents, key=lambda x: -x.score)


def stack_compact_labels(stack: DiagramStack) -> List[str]:
    """Return the human-readable label of each stack entry, in stack order."""
    labels: List[str] = []
    for it in stack:
        if it.kind == 'figure':
            labels.append('图示([FIGURE] 块)')
        else:
            labels.append('表格([TABLE] 块)')
    return labels


def make_fallback_stack(kind: str) -> DiagramStack:
    """Return a one-entry stack of the given kind; unknown kinds become 'table'."""
    k = (kind or '').strip().lower()
    if k not in ('figure', 'table'):
        k = 'table'
    return [DiagramIntent(k, 1.0, 'fallback')]


def stack_to_addon(stack: DiagramStack) -> str:
    """Render the prompt add-on for a stack.

    The result is the priority preamble followed by the figure/table
    specification text of each entry in stack order; '' for an empty stack.
    """
    if not stack:
        return ''
    # Labels come from the shared helper so both outputs always agree.
    parts: List[str] = [P.diagram_priority_preamble(stack_compact_labels(stack))]
    for it in stack:
        if it.kind == 'figure':
            parts.append(P.get_figure_addon())
        else:
            parts.append(P.get_table_addon())
    return ''.join(parts)
load_diagram_rules() + + @classmethod + def load_default(cls) -> 'DiagramIntentAgent': + return cls(load_diagram_rules()) + + def plan( + self, + section_title: str, + outline_text: str, + enable_figure: bool, + enable_table: bool, + ) -> DiagramStack: + r = self.rules + oc = r.get('outline_context_lines') or {} + before = int(oc.get('before', 4)) + after = int(oc.get('after', 6)) + ctx = extract_outline_window( + outline_text, section_title, before, after, + ) + fig_s, tbl_s = score_figure_table(section_title, ctx, r) + return build_stack(fig_s, tbl_s, r, enable_figure, enable_table) + + def render_for_section( + self, + section_title: str, + outline_text: str, + enable_figure: bool, + enable_table: bool, + ) -> str: + if not enable_figure and not enable_table: + return '' + stack = self.plan( + section_title, outline_text, enable_figure, enable_table, + ) + return stack_to_addon(stack) + + +# 模块级默认实例,供 generator 单次调用 +_default_agent: Optional[DiagramIntentAgent] = None + + +def get_diagram_agent() -> DiagramIntentAgent: + global _default_agent + if _default_agent is None: + _default_agent = DiagramIntentAgent.load_default() + return _default_agent + + +def invalidate_diagram_agent_cache() -> None: + global _default_agent + _default_agent = None diff --git a/utils/file_utils.py b/utils/file_utils.py new file mode 100644 index 0000000..c23ea3d --- /dev/null +++ b/utils/file_utils.py @@ -0,0 +1,205 @@ +""" +文件处理工具:从 PDF / Word 文件中提取纯文本 +""" +import os +import logging +from pathlib import Path + +logger = logging.getLogger(__name__) + + +def extract_text(file_path: str) -> str: + """ + 根据文件扩展名提取文本。 + 支持 .pdf / .docx / .doc + """ + path = Path(file_path) + ext = path.suffix.lower() + + if ext == '.pdf': + return _extract_pdf(file_path) + elif ext == '.docx': + return _extract_docx(file_path) + elif ext == '.doc': + return _extract_doc(file_path) + else: + raise ValueError(f'不支持的文件类型: {ext}') + + +def _extract_pdf(file_path: str) -> str: + """提取 PDF 文本,优先使用 pypdf,回退到 
def extract_pdf_pages(file_path: str) -> list[str]:
    """Extract PDF text page by page (used for BOQ page filtering).

    pypdf is tried first, one stripped string per page. When pypdf fails or
    every page is empty, pdfminer extracts the whole document, which is
    returned as a single-element list.

    Raises:
        RuntimeError: pdfminer extraction failed as well.
    """
    pages: list[str] = []
    try:
        from pypdf import PdfReader
        reader = PdfReader(file_path)
        for page in reader.pages:
            text = page.extract_text()
            pages.append((text or '').strip())
        if any(pages):
            return pages
    except Exception as e:
        logger.warning(f'pypdf 按页提取失败: {e},尝试 pdfminer')

    try:
        from pdfminer.high_level import extract_text as pm_extract
        blob = (pm_extract(file_path) or '').strip()
        return [blob] if blob else ['']
    except Exception as e:
        logger.error(f'pdfminer 提取失败: {e}')
        raise RuntimeError(f'PDF 文本提取失败: {e}') from e


def _extract_docx(file_path: str) -> str:
    """Extract text from a .docx document with python-docx.

    Non-blank paragraphs come first, followed by one line per table row
    (non-empty cell texts joined by a space).

    Raises:
        RuntimeError: The document could not be read.
    """
    try:
        from docx import Document
        doc = Document(file_path)
        parts = []
        for para in doc.paragraphs:
            if para.text.strip():
                parts.append(para.text)
        for table in doc.tables:
            for row in table.rows:
                row_texts = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                if row_texts:
                    parts.append(' '.join(row_texts))
        return '\n'.join(parts)
    except Exception as e:
        logger.error(f'.docx 提取失败: {e}')
        raise RuntimeError(f'Word 文本提取失败: {e}') from e


def _extract_doc(file_path: str) -> str:
    """Extract text from a legacy .doc file.

    Strategies are tried in order:
      1. win32com (Windows with Microsoft Word installed)
      2. LibreOffice command-line conversion to text
      3. python-docx, for files that are really OOXML saved with a .doc name

    Raises:
        RuntimeError: Every strategy failed; the message asks the user to
            re-save the file as .docx.
    """
    abs_path = str(Path(file_path).resolve())

    # ── Strategy 1: win32com (Windows + Word) ────────────────────────────
    try:
        import win32com.client
        import pythoncom
        pythoncom.CoInitialize()
        word = None
        try:
            word = win32com.client.Dispatch('Word.Application')
            word.Visible = False
            doc = word.Documents.Open(abs_path, ReadOnly=True)
            text = doc.Range().Text
            doc.Close(False)
            logger.info(f'.doc 通过 win32com 提取成功: {file_path}')
            return text or ''
        finally:
            if word:
                try:
                    word.Quit()
                except Exception:
                    # Best effort: a failed Quit must not mask the result.
                    pass
            pythoncom.CoUninitialize()
    except ImportError:
        logger.info('pywin32 未安装,跳过 win32com 方案')
    except Exception as e:
        logger.warning(f'win32com 提取 .doc 失败: {e}')

    # ── Strategy 2: LibreOffice command line ─────────────────────────────
    try:
        import shutil
        import subprocess
        import tempfile
        tmp_dir = tempfile.mkdtemp()
        try:
            for soffice_cmd in ('soffice', 'libreoffice'):
                try:
                    result = subprocess.run(
                        [soffice_cmd, '--headless', '--convert-to', 'txt:Text',
                         '--outdir', tmp_dir, abs_path],
                        capture_output=True, text=True, timeout=60,
                    )
                    if result.returncode == 0:
                        txt_file = os.path.join(tmp_dir, Path(file_path).stem + '.txt')
                        if os.path.exists(txt_file):
                            with open(txt_file, 'r', encoding='utf-8', errors='ignore') as f:
                                content = f.read()
                            logger.info(f'.doc 通过 LibreOffice 提取成功: {file_path}')
                            return content
                except FileNotFoundError:
                    # This executable name is not on PATH; try the next one.
                    continue
                except subprocess.TimeoutExpired:
                    logger.warning('LibreOffice 转换超时')
                    break
        finally:
            # Always remove the scratch directory. Cleanup errors are ignored
            # so they cannot discard a result that was already read.
            shutil.rmtree(tmp_dir, ignore_errors=True)
    except Exception as e:
        logger.warning(f'LibreOffice 提取 .doc 失败: {e}')

    # ── Strategy 3: python-docx compatibility attempt ────────────────────
    try:
        result = _extract_docx(file_path)
        if result.strip():
            logger.info(f'.doc 通过 python-docx 兼容读取成功: {file_path}')
            return result
    except Exception as e:
        logger.warning(f'python-docx 兼容读取 .doc 失败: {e}')

    raise RuntimeError(
        '无法读取 .doc 格式文件。请在 Word 中打开该文件,'
        '选择「另存为」→「Word 文档 (.docx)」后重新上传。'
    )
def truncate_text(text: str, max_chars: int = 60000) -> str:
    """Cut over-long text so it stays within the AI token budget.

    Text longer than ``max_chars`` is cut to its first ``max_chars``
    characters and a truncation notice is appended.
    """
    if len(text) <= max_chars:
        return text
    return text[:max_chars] + '\n\n...[文档内容已截断,仅展示前段]'


def split_text_chunks(text: str, chunk_size: int = 2000, overlap: int = 200) -> list[str]:
    """Split text into fixed-size, overlapping chunks (used by the knowledge base).

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one. Splitting stops as soon as a chunk reaches the end of the text, so
    no trailing chunk that is wholly contained in its predecessor is emitted.

    Args:
        text: Text to split; '' yields ``[]``.
        chunk_size: Maximum chunk length; must be positive.
        overlap: Characters shared by consecutive chunks; must be smaller
            than ``chunk_size``.

    Raises:
        ValueError: ``chunk_size`` is not positive, or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be positive')
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError('overlap must be smaller than chunk_size')

    chunks = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        chunks.append(text[start:end])
        if end == total:
            break
        start += step
    return chunks


def allowed_file(filename: str) -> bool:
    """Return True when the filename has a pdf/doc/docx extension (case-insensitive)."""
    allowed = {'pdf', 'doc', 'docx'}
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed


def safe_filename(filename: str) -> str:
    """Return the filename with every character outside word characters, CJK, '.' and '-' replaced by '_'."""
    import re
    name = re.sub(r'[^\w\u4e00-\u9fff.\-]', '_', filename)
    return name


def int_to_chinese_numeral(n: int) -> str:
    """Convert a positive integer below 1000 to Chinese numerals.

    Examples: 1 -> 一, 10 -> 十, 11 -> 十一, 99 -> 九十九, 100 -> 一百,
    101 -> 一百零一, 110 -> 一百一十. Non-positive values and values of 1000
    or more are returned as their decimal string.
    """
    if n <= 0:
        return str(n)
    digits = "零一二三四五六七八九"
    if n < 10:
        return digits[n]
    if n == 10:
        return "十"
    if n < 20:
        return "十" + (digits[n % 10] if n % 10 else "")
    if n < 100:
        t, o = divmod(n, 10)
        s = digits[t] + "十"
        if o:
            s += digits[o]
        return s
    if n < 1000:
        h, r = divmod(n, 100)
        s = digits[h] + "百"
        if r == 0:
            return s
        if r < 10:
            return s + "零" + digits[r]
        if r < 20:
            # After a hundreds digit the tens digit is spelled out:
            # 110 is 一百一十, not 一百十.
            return s + "一" + int_to_chinese_numeral(r)
        return s + int_to_chinese_numeral(r)
    # Chapter numbers of 1000 or more are not expected; fall back to digits.
    return str(n)
return f"{int_to_chinese_numeral(idx)}、{title}" + return f"{sn} {title}".strip() diff --git a/utils/prompts.py b/utils/prompts.py new file mode 100644 index 0000000..3248229 --- /dev/null +++ b/utils/prompts.py @@ -0,0 +1,1069 @@ +""" +所有提示词模板(已内嵌,打包后不暴露明文文件) +""" +from typing import Optional + +import config +from utils import volume_chapters as _vc + +# ── 内嵌提示词常量 ───────────────────────────────────────────────────────── + +PROJECT_SUMMARY = """\ +- 角色:招标文件编写专家,精通招标文件结构化、摘要编写 + +- 任务:根据用户提供的项目招标文件内容,生成一份专业、清晰的结构化摘要 + +- 要求: + + 一、摘要框架 + 1. 项目概况 + - 项目名称 + - 建设地点 + - 工程性质(新建/改建/扩建) + - 核心建设内容 + - 关键工程量指标 + - 特殊施工工艺(如顶管/盾构等) + - 项目概况 + + 2. 技术要求体系 + - 专业监测要求(分项列出核心监测指标) + - 技术标准规范 + - 质量管控要点 + - 特殊工艺标准 + + 3. 交付物矩阵 + - 阶段性成果清单(含时间节点) + - 最终交付文件要求 + - 成果验收标准 + - 备案审批流程 + + 4. 商务条款摘要 + - 合同期限 + - 支付结构 + - 报价约束条件 + - 违约条款要点 + - 知识产权约定 + + 5. 资质要求矩阵 + - 企业资质门槛 + - 人员资格要求 + - 设备配置标准 + - 同类项目经验 + + 6. 评标要素体系 + - 技术评分维度 + - 商务评分权重 + - 否决性条款 + - 实质性条款 + - 围标识别机制 + + + 二、处理规范 + 1. 信息抽取规则: + - 采用三级信息提炼法(关键数据→技术参数→约束条件) + - 识别并标注法定强制性条款(★号条款) + - 提取特殊工艺参数(例如顶管直径、沉井尺寸等) + + 2. 结构化呈现要求: + - 使用Markdown分级标题系统 + - 技术参数格式化处理 + - 流程节点采用时间轴呈现 + - 关键数据突出显示(例如预算金额、最高限价) + + 3. 专业术语处理: + - 保持行业术语准确性 + - 工程计量单位标准化转换 + - 法律条款原文引述 + + 三、输出示例 + 1.确保包含但不仅限于: + - 项目背景的技术参数分解 + - 监测要求的分类归纳 + - 成果交付的阶段性要求 + - 商务条款的要点提炼 + + 四、质量保障 + 1. 完整性核查清单: + - 验证五证要求(资质/业绩/人员/设备/资金) + - 检查三大核心条款(技术/商务/法律) + - 确认关键日期节点(工期/交付期/质保期) + + 2. 风险提示机制: + - 标注异常约束条款 + - 识别排他性要求 + - 提示潜在履约风险点 + +请严格按照上述结构化框架处理输入的招标文件,生成专业、准确、易读的项目摘要报告。 +输出内容需符合工程领域专业规范,重点数据需二次核验确保准确性。 +严格按照招标文件的内容,确保输出内容的完整性。 +直接给出摘要,禁止说明和引导词。 + +- 用户提供的招标文件内容如下: + {bid_document} +""" + +RATING_REQUIREMENTS = """\ +- 角色:招标文件信息提取专家,精通技术评分/技术评审要求的提取 + +- 任务:请严格按照以下步骤分析提供的招标文件内容,**仅提取技术评分标准**,完整输出所有技术评分细则: + +- 重要限制(必须遵守): + ★ 只提取"技术评分"/"技术评审"部分,禁止提取商务评分、价格评分、资质评分、报价等非技术内容 + ★ 若招标文件包含商务/价格评分,直接忽略,不得出现在输出中 + +- 步骤与要求: + + 1. **结构解析** + - 识别文件整体结构,定位"技术评分"/"技术评审要求"章节 + - 标注技术评分的总权重占比(如出现,如"技术分占60%") + - 跳过并忽略商务评分、价格评分、资质评审等非技术评分章节 + + 2. 
**技术评分要素提取** + 对"技术评分"板块进行完整深度解析: + - 提取全部技术评分细项,不能省略任何子项 + - 明确列出量化指标(如"ISO认证+3分"、"项目经验每年加1分") + - 区分强制性条款(必须满足项/否决项)与竞争性条款(择优评分项) + - 标注特殊技术要求(技术方案、实施能力、技术创新、服务响应等) + - 标注每个评分项的分值/权重 + + 3. **异常识别** + - 标出技术评分中表述模糊的评分项(如"酌情加分""优/良/差等级") + - 识别可能存在的矛盾条款 + - 提示隐藏的技术得分点 + + 4. **结果呈现** + 参考以下示例输出markdown结构化格式: + + # 技术评分细则(技术分共XX分) + + ## 一、技术方案(XX分) + ### 1.1 方案设计(XX分) + → 要求:…… + → 评分标准:…… + + ## 二、实施能力(XX分) + (继续展开...) + +请严格按照上述结构化框架处理输入的招标文件,生成专业、准确的项目技术评分要求。 +严格按照招标文件的内容,确保输出内容的完整性,禁止虚构或补充文件未提及的内容。 +直接输出技术评分要求,禁止说明和引导词。 + +- 招标文件内容如下: + {bid_document} +""" + +RATING_JSON = """\ +- 任务:从工程项目招标文件中提取技术评分要求,并以严格的JSON格式输出。 + +- 要求: + 必须生成完整有效的JSON对象,不使用JSON之外的文本说明 + 数值类型字段不添加单位符号 + 包含所有的评分项及其权重分配 + 特殊说明字段仅在存在否决条款(强制性条款)时出现 + +- 输出结构(必须严格遵守根字段名与数组名,便于后续章节字数与要点映射): + { + "items": [ + { + "id": "唯一短标识,如 T01", + "name": "评分项名称(与招标文件表述一致或精简概括)", + "weight": 数值型权重或分值(如 10 表示 10 分或 10%), + "keywords": ["与本项相关的可选关键词1", "关键词2"] + } + ], + "notes": "可选:否决条款、阶梯得分等特殊说明;无则写空字符串" + } + +- 技术评分要求内容如下: + {tech_rating}\ +""" + +OUTLINES = """\ +- 角色:技术标书架构师 +- 任务:生成适配技术评分标准的技术标书目录 +- 输出要求: + 采用四级嵌套编码体系(X.X.X.X)下实现按需分层 + 直接给出生成的目录,禁止解释和引导词 + 编号硬性规则:一级标题必须为「中文数字+顿号+标题」(例:一、总体方案),二级及以下必须为「阿拉伯数字多级编号+半角空格+标题」(例:1.1 子节标题);每个目录行必须带完整目录号,禁止输出无编号的纯标题行。 + +- 约束控制: + 根据项目生成标书的名称,如"XXXX项目技术标书" + {chapter_count_hint} + 章节颗粒度与评分指标权重正相关 + 技术实施类章节必须达到四级深度,管理保障类章节允许三级结构 + 同级节点数量必须有波动区间:技术方案类(4-7)、实施保障类(2-4)、创新应用类(1-3) + 目录的章节不能缺少包含以下关键词的内容: + - 对本项目的了解和分析 + - 项目工作重难点分析 + - 项目实施方案 + - 服务进度保障措施 + - 服务质量保障方案 + - 合理化建议 + - 服务承诺及处罚措施 + 目录不包含成本和预算内容,但要平衡项目预算、技术可行性以及技术的专业度 + +- 示例输出: + + 花岭新城BIM项目技术标书 + 一、总体实施方案 +  1.1 项目理解与需求分析 +   1.1.1 项目概述 +     1.1.1.1 建设地点及规模 +     1.1.1.2 工程地质勘察报告 +     1.1.1.3 抗震设防烈度与防火等级 +     1.1.1.4 建筑结构形式与建筑面积分布 +   1.1.2 项目背景 +     1.1.2.1 核心宗旨与目标 +     1.1.2.2 地理位置与项目规模 +   1.1.3 项目目标 +     1.1.3.1 就业机会与基础设施提升 +     1.1.3.2 乡村振兴与经济增长 +   1.1.4 项目特点 +     1.1.4.1 框筒结构抗震性能 +     1.1.4.2 分阶段工程地质勘察 +     1.1.4.3 功能区域多样化 + + 二、建筑设计 +  2.1 主要设计依据 +     2.1.1 国家标准与规范 +     2.1.2 
行业标准与图集 +  2.2 建筑结构设计 +     2.2.1 结构形式 +     2.2.2 结构材料 +     2.2.3 结构布局 +     2.2.4 结构经济指标 +     2.2.5 结构细节设计 +  2.3 建筑功能布局 +     2.3.1 C1#楼(厂房) +       2.3.1.1 功能分区明确 +       2.3.1.2 流线优化与安全性 +     2.3.2 配电房 +       2.3.2.1 设计目标与设备布置 +       2.3.2.2 空间规划与电气主接线方案 +     2.3.3 外廊及架空建筑 +       2.3.3.1 功能区域与景观设计 +       2.3.3.2 光照与通风优化 +  2.4 建筑材料选用 +  2.5 建筑外观设计 +  2.6 建筑室内布局 +     2.6.1 功能分区与设计要点 +  2.7 建筑安全和消防设计 +     2.7.1 建筑安全体系 +     2.7.2 消防系统设计 +  2.8 建筑节能设计 +     2.8.1 节能措施与绿色建材 +     2.8.2 雨水收集系统 + + 三、结构设计 +  3.1 结构形式 +  3.2 结构材料 +     3.2.1 混凝土与钢材选用 +  3.3 结构布局 +     3.3.1 结构柱网与通风疏散通道 +  3.4 结构经济指标 +     3.4.1 抗震设计要求与用材控制 +  3.5 结构细节设计 +     3.5.1 基础设计与钢结构细节 +     3.5.2 混凝土结构与抗震设计 +  3.6 结构分析与计算 + + 四、给排水设计 +  4.1 引言 +  4.2 供水系统设计 +     4.2.1 供水管道与消防水源 +     4.2.2 节水设计与雨水收集 +  4.3 排水系统设计 +     4.3.1 排水管道与雨水管理 +     4.3.2 污水处理与分流制度 +  4.4 给排水设备选择 +  4.5 细节设计 +  4.6 监测与维护 + + 五、暖通设计 +  5.1 引言 +  5.2 供暖系统设计 +     5.2.1 供暖方式与设备选择 +     5.2.2 温度控制系统 +  5.3 通风系统设计 +     5.3.1 通风方式与设备选择 +     5.3.2 空气质量控制 +  5.4 空调系统设计 +     5.4.1 空调方式与设备选择 +     5.4.2 温湿度控制系统 +  5.5 热水系统设计 +  5.6 细节设计与监测维护 + + + 六、BIM设计 +  6.1 项目总图与单体建筑设计 +  6.2 道路与排水设计 +  6.3 电气系统设计 +  6.4 绿化设计 +  6.5 BIM协同设计与施工管理 +  6.6 数据管理与培训支持 + + 七、设计说明 +  7.1 项目设计依据 +  7.2 设计原则 +  7.3 结构经济合理化 +  7.4 建筑功能分区 +  7.5 设计细节要求 + + 八、合理化建议 +  8.1 建筑专业合理化建议 +  8.2 结构专业合理化建议 +  8.3 给排水专业合理化建议 +  8.4 暖通专业合理化建议 +  8.5 BIM专业合理化建议 + 8.6 技术和工艺方面的建议 + 8.7 成本和预算方面的建议 + 8.8 时间和进度方面的建议 + 8.9 施工质量管理方面的建议 + 8.10 质量和安全方面的建议 + 8.11 环境和可持续性方面的建议 + + 九、施工进度安排 +  9.1 施工进度安排 +  9.2 施工进度跟踪与管理 +  9.3 施工质量管理 +  9.4 施工现场管理 +  9.5 施工结项与验收 + + 十、本项目工作重点难点分析 +  10.1 工程特点与设计工作难点 +  10.2 重点与难点分析 +  10.3 综合解决措施 + + +- 招标文件内容: +{document_text}\ +""" + +OUTLINES_WITH_RATING = """\ +- 角色:技术标书架构师 +- 任务:生成适配技术评分标准的技术标书目录 +- 输出要求: + 采用四级嵌套编码体系(X.X.X.X)下实现按需分层 + 直接给出生成的目录,禁止解释和引导词 + 编号硬性规则:一级标题必须为「中文数字+顿号+标题」,二级及以下必须为「阿拉伯数字多级编号+半角空格+标题」;每个目录行必须带完整目录号,禁止无编号的纯标题行。 + +- 约束控制: + 根据项目生成标书的名称,如"XXXX项目技术标书" + {chapter_count_hint} + 
目录的章节必须按照技术评分标准的项目生成,题目应包括技术评分项目中的关键词: + 章节颗粒度与评分指标权重正相关 + 技术方案类章节必须达到四级深度,管理保障类章节允许三级结构 + 同级节点数量必须有波动区间:技术方案类(4-7)、实施保障类(2-4)、创新应用类(1-3) + 目录禁止包含报价、团队、资质、文件等商务性质的章节 + +- 示例输出: + + 花岭新城BIM项目技术标书 + 一、总体实施方案 +  1.1 项目理解与需求分析 +   1.1.1 项目概述 +     1.1.1.1 建设地点及规模 +     1.1.1.2 工程地质勘察报告 +     1.1.1.3 抗震设防烈度与防火等级 +     1.1.1.4 建筑结构形式与建筑面积分布 +   1.1.2 项目背景 +     1.1.2.1 核心宗旨与目标 +     1.1.2.2 地理位置与项目规模 +   1.1.3 项目目标 +     1.1.3.1 就业机会与基础设施提升 +     1.1.3.2 乡村振兴与经济增长 +   1.1.4 项目特点 +     1.1.4.1 框筒结构抗震性能 +     1.1.4.2 分阶段工程地质勘察 +     1.1.4.3 功能区域多样化 + + 二、建筑设计 +  2.1 主要设计依据 +     2.1.1 国家标准与规范 +     2.1.2 行业标准与图集 +  2.2 建筑结构设计 +     2.2.1 结构形式 +     2.2.2 结构材料 +     2.2.3 结构布局 +     2.2.4 结构经济指标 +     2.2.5 结构细节设计 +  2.3 建筑功能布局 +     2.3.1 C1#楼(厂房) +       2.3.1.1 功能分区明确 +       2.3.1.2 流线优化与安全性 +     2.3.2 配电房 +       2.3.2.1 设计目标与设备布置 +       2.3.2.2 空间规划与电气主接线方案 +     2.3.3 外廊及架空建筑 +       2.3.3.1 功能区域与景观设计 +       2.3.3.2 光照与通风优化 +  2.4 建筑材料选用 +  2.5 建筑外观设计 +  2.6 建筑室内布局 +     2.6.1 功能分区与设计要点 +  2.7 建筑安全和消防设计 +     2.7.1 建筑安全体系 +     2.7.2 消防系统设计 +  2.8 建筑节能设计 +     2.8.1 节能措施与绿色建材 +     2.8.2 雨水收集系统 + + 三、结构设计 +  3.1 结构形式 +  3.2 结构材料 +     3.2.1 混凝土与钢材选用 +  3.3 结构布局 +     3.3.1 结构柱网与通风疏散通道 +  3.4 结构经济指标 +     3.4.1 抗震设计要求与用材控制 +  3.5 结构细节设计 +     3.5.1 基础设计与钢结构细节 +     3.5.2 混凝土结构与抗震设计 +  3.6 结构分析与计算 + + 四、给排水设计 +  4.1 引言 +  4.2 供水系统设计 +     4.2.1 供水管道与消防水源 +     4.2.2 节水设计与雨水收集 +  4.3 排水系统设计 +     4.3.1 排水管道与雨水管理 +     4.3.2 污水处理与分流制度 +  4.4 给排水设备选择 +  4.5 细节设计 +  4.6 监测与维护 + + 五、暖通设计 +  5.1 引言 +  5.2 供暖系统设计 +     5.2.1 供暖方式与设备选择 +     5.2.2 温度控制系统 +  5.3 通风系统设计 +     5.3.1 通风方式与设备选择 +     5.3.2 空气质量控制 +  5.4 空调系统设计 +     5.4.1 空调方式与设备选择 +     5.4.2 温湿度控制系统 +  5.5 热水系统设计 +  5.6 细节设计与监测维护 + + + 六、BIM设计 +  6.1 项目总图与单体建筑设计 +  6.2 道路与排水设计 +  6.3 电气系统设计 +  6.4 绿化设计 +  6.5 BIM协同设计与施工管理 +  6.6 数据管理与培训支持 + + 七、设计说明 +  7.1 项目设计依据 +  7.2 设计原则 +  7.3 结构经济合理化 +  7.4 建筑功能分区 +  7.5 设计细节要求 + + 八、合理化建议 +  8.1 建筑专业合理化建议 +  8.2 结构专业合理化建议 +  8.3 给排水专业合理化建议 +  8.4 暖通专业合理化建议 +  8.5 
BIM专业合理化建议 + 8.6 技术和工艺方面的建议 + 8.7 成本和预算方面的建议 + 8.8 时间和进度方面的建议 + 8.9 施工质量管理方面的建议 + 8.10 质量和安全方面的建议 + 8.11 环境和可持续性方面的建议 + + 九、施工进度安排 +  9.1 施工进度安排 +  9.2 施工进度跟踪与管理 +  9.3 施工质量管理 +  9.4 施工现场管理 +  9.5 施工结项与验收 + + 十、本项目工作重点难点分析 +  10.1 工程特点与设计工作难点 +  10.2 重点与难点分析 +  10.3 综合解决措施 + + +- 招标文件摘要: +{summary} + +- 技术评分标准: +{rating}\ +""" + +CHAPTER_OUTLINE = """\ +- 角色:技术标书架构师 + +- 能力: + - 单章节深度解构能力 + - 跨章节协同规划视野 + - 评分权重动态分配策略 + +- 任务:根据招标文件概要、章节主题、评分要求,生成结构化的技术标书该章节的目录 + +- 输出要求: + - 采用四级嵌套编码体系(X.X.X.X)确保章节颗粒度可控 + - 只输出子章节,不输出主章节标题,不要解释和引导词 + - 编号必须从 X.1 开始递增,禁止出现 X.0、X.0.1、01 等编号 + - 允许纯文本输出,不使用 markdown 代码块 +{sub_chapter_count_hint} + +- 示例输出,以"服务进度保障措施"为例: + 二、智慧物流系统全生命周期进度保障 +  2.1 基于BIM的进度协同管理平台 +   2.1.1 多级进度计划耦合模型 +    2.1.1.1 WBS-Milestone映射矩阵 +    2.1.1.2 Primavera P6进度基线 +   2.1.2 资源约束进度优化算法 +    2.1.2.1 基于CPM的缓冲区间动态分配 +    2.1.2.2 资源平滑度R=0.92 + +- 招标文件概要: + {summary} + +- 章节主题: + {chapter} + +- 评分要求: + {score}\ +""" + +SECTION_DETAILS = """\ +【字数硬性要求】 +{word_count_spec} +注意:字数须由实质性方案内容支撑,禁止用重复背景、空洞承诺或复述招标要求来凑字数。 + +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +角色:资深工程投标技术方案撰写专家 +任务:以执行方视角,针对本章节标题所对应的工作内容,撰写具体可操作的技术方案正文。 + +【写作铁律】 +▌写方案,不写回应——开门见山描述具体做法,把招标参数直接融入方案 + × 禁止:"根据招标文件要求,我方将……""针对贵方提出的XXX要求,我方承诺……" +▌不重申已知信息(最常见废稿场景) + 禁止在正文中出现项目名称/建设单位/建设地点/合同工期等基本信息; + 禁止将工程量数字("X条渠道""X公里""X座""X台""X万平方米"等)引入各章节开头 + 作为背景铺垫——这类数字只在"项目概况"章节出现一次,其他章节直接展开专业内容 +▌不复述招标参数——技术规格、工程量、服务数量均已知,直接体现在方案中 +▌不虚构优越参数——招标文件规定的参数/数量/规格是上限基准,不得无依据地写成"优于要求" + × 禁止:招标要求10台,方案里写"我方投入15台以确保万无一失"(无根据拔高) + × 禁止:招标要求C30混凝土,方案里写"我方采用C35以体现高标准"(无依据提升规格) + √ 正确:按招标要求的数量/规格如实描述,竞争力体现在工艺方法和管理措施上 +▌不虚构优越参数——招标文件规定的参数/数量/规格如实描述,不得无依据拔高 + × 禁止:招标要求10台 → 方案写"我方投入15台"(无根据) + × 禁止:招标要求C30混凝土 → 方案写"我方采用C35体现高标准"(无依据) + √ 如需体现竞争力,在工艺方法、管理制度、响应速度等维度展开,不在规格数量上自行拔高 +▌不用套话——禁用:高度重视、全力以赴、竭诚服务、确保圆满完成、综上所述、通过以上措施 +▌格式——纯文本,段落空行分隔,列举用(1)(2)(3),不用markdown符号 + +【参考背景(仅供理解语境,禁止复述到正文中)】 +- 项目概要: +{summary} + +- 标书目录: +{outline} + +【本次撰写的章节标题】 +{subsection_title} + +直接输出正文,不含标题行,不含任何说明语。\ +""" + +SCORING_RULES = """\ 
+"你是一名专业的招标文件分析师,请按照以下步骤处理用户提供的项目招标文件内容: + +1. **结构识别** +- 仔细解析文件结构,定位'评分标准'、'评审办法'、'投标人须知'等关键章节 +- 特别注意包含'分值'、'评分项'、'权重'等关键词的段落 + +2. **核心要素提取** +- 系统提取以下要素形成结构化表格: + │ 类别 │ 评分项名称 │ 分值权重 │ 具体要求 │ 否决条款 │ +- 分类标准: + ● 技术部分(方案设计、实施能力、技术创新等) + ● 商务部分(资质证明、业绩案例、团队经验等) + ● 价格部分(报价合理性、计价方式等) + ● 其他专项(售后服务、本地化服务等) + +3. **深度分析** +- 计算权重配比(示例:技术60% = 方案设计30% + 实施能力20% + 创新10%) +- 识别否决性条款(如"▲"标记项或特定强制要求) +- 标注特殊评分规则:阶梯得分、区间赋分、横向比较等机制 + +4. **风险提示** +- 标出易被忽视的得分点(如ISO认证、专利数量等) +- 识别矛盾条款(如总分值≠100%的情况) +- 提示资质门槛要求(注册资金、特定资质证书等) + +5. **输出格式** +采用Markdown输出以下结构: +\`\`\`markdown +# 招标评分要点汇总 + +## 核心指标配比 +- 总评分构成:技术分(__%)+ 商务分(__%)+ 价格分(__%) + +## 详细评分矩阵 +| 类别 | 评分项 | 分值 | 具体要求 | 关键指标 | +|------|-------|-----|---------|---------| +| ... | ... | ... | ... | ... | + +## 重点提示 +⚠️ 否决条款:列出所有一票否决项 +💡 得分要点:突出3-5个高权重核心指标 +⏱️ 时间节点:标注与评分相关的时限要求 +\`\`\` +请先确认理解任务要求,待用户提供招标文件内容后执行分析。"\ +""" + +# ── 来自 section_detail.py 的提示词 ──────────────────────────────────────── + +GEN_LEAF_DETAIL_PROMT = """\ +【字数硬性要求】 +{word_count_spec} +注意:字数须由实质性方案内容支撑,禁止用重复背景、空洞承诺或复述招标要求来凑字数。 + +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +角色:资深工程投标技术方案撰写专家 +任务:以执行方视角,针对本章节标题所对应的工作内容,撰写具体可操作的技术方案正文。 + +【写作方式——铁律,违反即视为废稿】 + +▌写方案,不写回应 +× 错误:"根据招标文件要求,我方将……" +× 错误:"针对贵方提出的XXX要求,我方承诺……" +× 错误:"招标文件明确规定了……对此,我方将……" +√ 正确:开门见山写具体做法,把招标参数直接融入方案中 + +▌不重申已知信息(最常见废稿场景) +× 禁止:在正文中出现项目名称、建设单位、建设地点、合同工期等基本信息 +× 禁止:将招标文件中的具体工程量数字(如"X条渠道""X公里""X座建筑物""X台设备" + "X万平方米"等)引入到本章节开头作为背景铺垫——这类数字只能在"项目概况/背景" + 章节里出现一次,质量管理、安全措施、进度计划、技术方案等专业章节一律直接展开 +× 禁止:重复其他章节已经出现过的项目背景介绍段落 + +▌不虚构优越参数 +× 禁止:招标要求10台 → 写成"我方投入15台以确保万无一失"(无依据拔高数量) +× 禁止:招标要求C30混凝土 → 写成"我方采用C35体现高标准"(无依据提升规格) +× 禁止:招标方规定了参数/工程量 → 写成"我方承诺优于招标要求"(空洞吹捧) +√ 如需体现竞争力,在工艺方法、管理精细度、响应时效等维度展开,不在规格数量上无依据拔高 + +▌不用空话套话 +× 禁用:"高度重视""全力以赴""竭诚服务""确保圆满完成""我方将严格按照" +× 禁用:"综上所述""首先其次再次""通过以上措施" +× 禁用:以"……"或"等"结尾的列举 + +▌能概括的简洁写,有细节的展开写 +- 原则性的管理制度可一段简洁描述 +- 操作步骤、技术参数、人员配置、时间节点等有实质内容的须逐条详细展开 +- 每项措施给出具体方法或量化指标,不写"我方将采取有效措施确保"类句子 + +▌格式 +- 纯文本,段落间空行分隔 +- 列举用(1)(2)(3),不用markdown符号,不用"首先其次" + 
+【参考背景(仅供理解项目语境,禁止复述到正文中)】 +- 项目概要: +{summary} + +- 标书目录(用于理解本章节在全书中的定位): +{outline} + +【本次撰写的章节标题】 +{title} + +开始撰写,直接输出正文,不含标题行,不含任何说明语。\ +""" + +GEN_SECTION_INTRODUCTION_PROMT = """\ +- 角色:资深投标文件撰写专家 +- 任务:为章节撰写简短开篇引言(100~200字),直接点明本章的核心做法或服务重点 +- 使用"我方"自称,禁止套话,禁止复述招标要求,禁止重写项目背景,纯文本输出 +- 若本章内容不需要引言可直接输出空白 + +- 项目概要(仅供参考,禁止复述): +{summary} + +- 技术标书目录: +{outline} + +- 章节标题: +{title}\ +""" + + +# ── 对外接口函数 ──────────────────────────────────────────────────────────── + +def get_project_summary_prompt(bid_document: str) -> str: + return PROJECT_SUMMARY.replace('{bid_document}', bid_document) + + +def get_rating_requirements_prompt(bid_document: str) -> str: + return RATING_REQUIREMENTS.replace('{bid_document}', bid_document) + + +def get_rating_json_prompt(tech_rating: str) -> str: + return RATING_JSON.replace('{tech_rating}', tech_rating) + + +def get_outlines_prompt(document_text: str) -> str: + pages = int(getattr(config, 'TARGET_PAGES', 0) or 0) + cv = getattr(config, 'CONTENT_VOLUME', 'standard') + pce = int(getattr(config, 'PAGE_CHAR_ESTIMATE', 700) or 700) + hint = _vc.outline_chapter_count_hint(pages, cv, pce) + return ( + OUTLINES.replace('{chapter_count_hint}', hint) + .replace('{document_text}', document_text) + ) + + +def get_outlines_with_rating_prompt(summary: str, rating: str) -> str: + pages = int(getattr(config, 'TARGET_PAGES', 0) or 0) + cv = getattr(config, 'CONTENT_VOLUME', 'standard') + pce = int(getattr(config, 'PAGE_CHAR_ESTIMATE', 700) or 700) + hint = _vc.outline_chapter_count_hint_with_rating_variant(pages, cv, pce) + return ( + OUTLINES_WITH_RATING.replace('{chapter_count_hint}', hint) + .replace('{summary}', summary) + .replace('{rating}', rating) + ) + + +def get_chapter_outline_prompt( + summary: str, + chapter: str, + score: str, + max_subchapters: Optional[int] = None, +) -> str: + if max_subchapters is not None and max_subchapters > 0: + sub_chapter_count_hint = ( + f' - 子章节条数:本主章节下各级编号子章节行总数应控制在约 {max_subchapters} 条以内,' + 
f'可依据技术内容在合理范围略作筛选,但输出条数不得超过 {max_subchapters} 条。' + ) + else: + sub_chapter_count_hint = '' + return ( + CHAPTER_OUTLINE.replace('{summary}', summary) + .replace('{chapter}', chapter) + .replace('{score}', score) + .replace('{sub_chapter_count_hint}', sub_chapter_count_hint) + ) + + +BOQ_SUMMARY = """\ +- 角色:工程量清单分析专家 + +- 任务:从以下工程量清单数据中提取关键工程信息,生成结构化摘要,供技术标书章节写作使用。 + +- 提取重点: + 1. 主要分部分项工程类别(土建、安装、装饰、市政、绿化等) + 2. 每类工程的核心工程内容与数量(保留单位和数量值) + 3. 主要材料、设备的规格和数量 + 4. 关键施工工艺或特殊要求(如有) + +- 输出格式: + - 按工程类别分段输出,每类列举3-8个代表性工程量项 + - 保留量化数据(数量+单位),例如:"混凝土浇筑 C30 约 800m³" + - 突出与技术标书密切相关的工程内容 + - 不输出单价、金额、合计等商务数据 + - 总字数控制在 600-1200 字 + +- 工程量清单原始文本如下: +{boq_text} + +- 以下为本地规则解析得到的结构化清单附录(分部、编码、名称、单位、工程量);若为空则仅依据上文原始文本: +{boq_structured} +""" + + +def get_boq_summary_prompt(boq_text: str, boq_structured: str = '') -> str: + return ( + BOQ_SUMMARY.replace('{boq_text}', boq_text) + .replace('{boq_structured}', boq_structured or '(无本地结构化附录)') + ) + + +def get_section_detail_prompt(summary: str, outline: str, title: str, + word_count_spec: str = '', + boq_summary: str = '', + tender_kind: str = 'engineering') -> str: + """章节正文提示词。按 tender_kind 选用工程/服务/货物模板(见 utils.tender_kind_sections)。""" + from utils.tender_kind_sections import build_section_detail_prompt, normalize_tender_kind + + if not word_count_spec: + word_count_spec = ( + '- 一般小节:不少于 2000 字;核心技术/重点评分章节:不少于 4000 字\n' + '- 字数须由实质方案内容支撑,禁止用重复项目背景或复述招标要求凑字数\n' + '- 有实质细节的展开写,原则性描述可简洁处理,不强求堆砌篇幅' + ) + return build_section_detail_prompt( + normalize_tender_kind(tender_kind), + summary, + outline, + title, + word_count_spec, + boq_summary, + ) + + +def get_section_intro_prompt(summary: str, outline: str, title: str) -> str: + if not GEN_SECTION_INTRODUCTION_PROMT: + return '' + return GEN_SECTION_INTRODUCTION_PROMT.replace('{summary}', summary).replace('{outline}', outline).replace('{title}', title) + + +def diagram_priority_preamble(labels: list) -> str: + """ + 本节图表生成顺序说明;labels 顺序与意图栈栈顶一致(index 0 优先级最高)。 + """ + if not labels: 
+ return '' + body = '\n'.join(f' {i + 1}) {name}' for i, name in enumerate(labels)) + return f""" + +【本节图表生成优先级(须按序在正文中体现:先完成高优先级项,再写下一项)】 +{body} +""" + + +def get_attachment_single_chart_prompt( + summary: str, + outline_text: str, + title: str, + kind: str, +) -> str: + """ + 附件类章节:仅输出一个 [FIGURE] 或 [TABLE] 块,禁止长技术方案正文。 + kind: 'figure' | 'table' + """ + summ = (summary or '')[:2000] + outl = (outline_text or '')[:1500] + if kind == 'figure': + spec = """▌输出格式(必须遵守,不得改括号与标签名): +[FIGURE:具体图示标题] +(用简洁文字或 ASCII 符号绘制,与标题一致) +[/FIGURE] +除上述块外不要输出技术方案段落;可在此之前写 1~2 句过渡语(可选)。""" + else: + spec = """▌输出格式(必须遵守,不得改括号与标签名): +[TABLE:具体表格标题] +| 列名1 | 列名2 | 列名3 | +|-------|-------|-------| +| 数据行 | +[/TABLE] +除上述块外不要输出技术方案段落;可在此之前写 1~2 句过渡语(可选)。""" + return f"""【附件项】本节仅生成一个图示或表格,禁止撰写长技术标正文。 + +【章节标题】 +{title} + +【项目摘要(可摘录入图/表,勿展开论述)】 +{summ} + +【大纲上下文(节选)】 +{outl} + +【任务】 +{spec} +图示/表格须紧扣章节标题与摘要,不得使用 XXX、待定 等占位;列数 4~6 为宜(表格)。""" + + +def get_attachment_chart_disabled_prompt(title: str) -> str: + """项目未开启图/表开关时,附件节的简短说明(非长文)。""" + return f"""【附件项】章节标题:{title} + +当前项目未开启「图示」或「表格」生成开关。请仅输出一两句说明:本附件内容需在项目设置中开启图示或表格后由系统自动生成;不要写技术方案正文,不要编造 [FIGURE] 或 [TABLE] 块。""" + + +def get_attachment_chapter_emphasis_hint() -> str: + """附件类叶节点在 full 模式下追加:要求展开大纲子节与量化内容,不得只写单图单表。""" + return """ + +【附件类章节(须写全篇实质内容)】 +须按大纲层级与项目摘要展开:进度计划、工程量与节点、机械设备与劳动力、布置与临时用地、合规与环保等须具体可核验;可结合工程量清单摘要。若项目已开启图示/表格,可在正文合适位置插入与本节一致的 [FIGURE] 或 [TABLE],但不得仅用单图/单表代替正文。""" + + +# 附件 stack_charts_only / single_chart_only:专用 system,避免 BID_WRITING_SYSTEM 诱发作答长文 +ATTACHMENT_STACK_CHARTS_SYSTEM = ( + '你只输出附件所需的图示块与/或表格块;禁止撰写段落性技术说明、小节标题与项目背景复述。' + '除 [FIGURE:…]…[/FIGURE]、[TABLE:…]…[/TABLE] 块内文字与表格外,不得出现其他正文;块内须扣题、可引用摘要要点,勿套通用模板。' +) + + +def get_attachment_stack_charts_prompt( + summary: str, + outline_text: str, + title: str, + stack_labels: list, +) -> str: + """ + 附件叶节点:按意图栈顺序仅输出 FIGURE/TABLE 块;stack_labels 与 diagram_intent.stack_compact_labels 一致。 + """ + summ = (summary or '')[:1500] + outl = (outline_text or 
'')[:1200] + priority_block = diagram_priority_preamble(stack_labels) if stack_labels else '' + return f"""【附件项·仅输出图或表块(最高优先级)】 + +【章节标题】{title} + +【项目摘要(仅供块内摘录,勿写摘要式段落)】 +{summ} + +【大纲上下文(节选)】 +{outl} +{priority_block} +▌按上列优先级顺序,各输出对应类型的一块(先完成①再②): +图示块: +[FIGURE:须具体反映本节标题的图示名] +(正文仅用 ASCII/简洁中文示意) +[/FIGURE] + +表格块: +[TABLE:须具体反映本节标题的表名] +| 列1 | 列2 | 列3 | +|-----|-----|-----| +| 数据 | +[/TABLE] + +▌禁止:编号小节、过渡段、复述招标要求、与块无关的任何正文。块内数据须与标题及摘要一致,禁用 XXX、待定。""" + + +def get_figure_addon() -> str: + """启用"图"模式时,追加到章节生成提示词末尾的图示生成规范""" + return """ + +【图示生成规范(必须遵守)】 +在正文适当位置根据本章节具体内容自动插入图示,图示内容必须与所写章节紧密对应,严禁套用与本章无关的通用模板。 + +▌标记格式(不得修改括号和斜杠,标题须具体反映图示内容): +[FIGURE:具体图示标题] +图示内容(用文字、ASCII 符号绘制) +[/FIGURE] + +▌四类触发场景及示例: + +① 组织机构类(涉及管理架构、项目班组、质量/安全/监测机构等)→ 树形图 +[FIGURE:本项目质量管理组织架构图] +项目经理 +├── 技术负责人 ──→ 专职质检员(2人)、测量员(2人) +├── 施工队长 ──→ 土建作业班(8人)、安装班(4人) +└── 安全负责人 ──→ 专职安全员(1人)、消防员(1人) +[/FIGURE] + +② 流程类(涉及施工工序、管理流程、验收程序、应急响应等)→ 流程图 +[FIGURE:监测数据处理与预警响应流程图] +现场采集 ──→ 质检复核 ──→ 数据入库 + ↓ 超阈值 + 预警分级判断 + ↓ 黄色预警 ↓ 红色预警 + 加密监测频次 立即暂停施工 + 应急响应 + ↓ 恢复正常 + 出具监测日报 ──→ 提交建设单位 +[/FIGURE] + +③ 进度计划类(涉及工期安排、里程碑节点、施工阶段等)→ 横道进度图 +[FIGURE:本项目施工进度计划示意图] +第 1- 2 周 ██ 施工准备(人员进场、测量放线、物资备货) +第 3- 6 周 ████ 土方开挖及基础处理 +第 7-11 周 ████████ 主体结构施工 +第12-14 周 ██████ 机电安装及调试 +第15-16 周 ████ 装饰收尾及自检 +第17 周 ██ 竣工验收及资料移交 +[/FIGURE] + +④ 平面布置类(涉及施工现场、监测点位、管线布置等)→ 示意平面图 +[FIGURE:施工现场平面布置示意图] +┌───────────────────────────────────────────┐ +│ [出入口/门卫] [材料堆场] [钢筋加工棚] │ +│ │ +│ [主施工区 A 段] [主施工区 B 段] │ +│ │ +│ [办公/会议室] [宿舍区] [设备停放场地] │ +└───────────────────────────────────────────┘ +[/FIGURE] + +▌执行要求: +- 每章节最多插入 2~3 个图示,按需插入,勿为凑数而强行添加 +- 图示标题须具体,如"本项目安全管理组织架构图"而非"组织架构图" +- 每个图示前后各须有至少一段正文说明,不得孤立出现 +- 图示中的岗位、人数、节点须结合本章节正文内容填写,不得留有"XXX"等占位符""" + + +def get_table_addon() -> str: + """启用"表"模式时,追加到章节生成提示词末尾的表格生成规范""" + return """ + +【表格生成规范(必须遵守)】 +在正文适当位置根据本章节具体内容自动插入表格,表格数据须结合本章节实际内容填写,严禁套用与本章无关的通用模板。 + +▌标记格式(不得修改括号和斜杠,标题须具体反映表格内容): +[TABLE:具体表格标题] +| 列名1 | 列名2 | 列名3 | +|-------|-------|-------| +| 数据1 | 数据2 | 数据3 | 
+[/TABLE] + +▌六类触发场景及示例: + +① 人员配置类(涉及项目管理团队、专业人员配置等) +[TABLE:本项目主要管理人员配置一览表] +| 序号 | 岗位 | 拟派人数 | 资质要求 | 主要职责 | +|------|------|---------|---------|---------| +| 1 | 项目经理 | 1 | 一级建造师,从业 10 年以上 | 全面统筹项目实施 | +| 2 | 技术负责人 | 1 | 高级工程师,从业 8 年以上 | 技术方案与质量管控 | +| 3 | 安全负责人 | 1 | 注册安全工程师,具备安全 C 证 | 安全生产管理 | +| 4 | 专职质检员 | 2 | 质检员证,从业 5 年以上 | 过程质量检验与记录 | +[/TABLE] + +② 设备投入类(涉及施工机械、监测仪器、工具设备等) +[TABLE:主要施工设备及仪器投入一览表] +| 序号 | 设备名称 | 规格型号 | 数量 | 状态 | 主要用途 | +|------|---------|---------|------|------|---------| +| 1 | 全站仪 | 徕卡 TS16 | 2 台 | 自有 | 平面及高程测量 | +| 2 | 静力水准仪 | BGK-4700 | 8 套 | 自有 | 沉降自动化监测 | +| 3 | 挖掘机 | 卡特 320D | 2 台 | 租赁 | 基坑开挖 | +[/TABLE] + +③ 劳动力计划类(涉及各工种、各阶段人数安排等) +[TABLE:劳动力配置计划表] +| 工种 | 准备阶段(人) | 施工高峰期(人) | 收尾阶段(人) | 备注 | +|------|-------------|---------------|-------------|------| +| 测量工 | 4 | 6 | 2 | 含 1 名高级测量技师 | +| 土建工 | 8 | 20 | 6 | 持证特殊工种优先 | +| 安装工 | 0 | 10 | 4 | 含持证电工、焊工 | +[/TABLE] + +④ 质量/安全检查类(涉及关键工序验收、安全巡检等) +[TABLE:关键工序质量检验项目一览表] +| 序号 | 检验项目 | 检验方法 | 检验频率 | 合格标准 | 责任人 | +|------|---------|---------|---------|---------|------| +| 1 | 基础轴线偏差 | 全站仪复测 | 每道工序 | ≤±5mm | 测量员 | +| 2 | 混凝土强度 | 试块取样 | 每 50m³ | ≥C30 | 质检员 | +[/TABLE] + +⑤ 材料供应类(涉及主要材料规格、用量计划等) +[TABLE:主要材料供应计划表] +| 序号 | 材料名称 | 规格 | 计划用量 | 供应商 | 进场时间 | +|------|---------|------|---------|------|---------| +| 1 | 商品混凝土 | C30 | 约 800m³ | 本地搅拌站 | 第 5 周 | +| 2 | 钢筋 | HRB400Φ16-25 | 约 60t | 资质合规厂商 | 第 4 周 | +[/TABLE] + +⑥ 风险/应急类(涉及风险识别、应急预案等) +[TABLE:主要施工风险及应对措施一览表] +| 风险类型 | 诱因 | 等级 | 预防措施 | 应急响应 | +|---------|------|------|---------|---------| +| 基坑坍塌 | 降雨渗水 | 高 | 坡面防护+排水沟 | 立即撤场+加固 | +| 管线破坏 | 机械误挖 | 中 | 人工开挖保护区 | 停工+抢修 | +[/TABLE] + +▌执行要求: +- 表格数据须根据本章节正文内容填写,不得使用"XXX""待定"等占位符 +- 表格列数控制在 4~6 列,行数视内容而定,不强求凑满 +- 每张表格前后各须有至少一段正文说明 +- 每章节最多插入 2~3 张表格,按需插入""" diff --git a/utils/settings.py b/utils/settings.py new file mode 100644 index 0000000..56365d9 --- /dev/null +++ b/utils/settings.py @@ -0,0 +1,141 @@ +""" +配置持久化:将用户在界面中设置的 API Key 等配置保存到 data/settings.json, +服务重启后自动恢复,不再每次重启都丢失 Key。 +""" 
import json
import logging
import os

logger = logging.getLogger(__name__)

_SETTINGS_PATH: str = ''  # injected by app.py during start-up through init()


def init(settings_path: str):
    """Record the path of the settings JSON file used by load() and save()."""
    global _SETTINGS_PATH
    _SETTINGS_PATH = settings_path


def load(cfg) -> None:
    """Restore persisted settings onto *cfg*, then apply environment overrides.

    When no settings path is configured or the file does not exist, only the
    environment overrides are applied. A file that cannot be read or parsed
    is logged as a warning and treated the same way, so start-up never fails
    because of a damaged settings file.
    """
    if not _SETTINGS_PATH or not os.path.exists(_SETTINGS_PATH):
        _apply_env_overrides(cfg)
        return
    try:
        with open(_SETTINGS_PATH, 'r', encoding='utf-8') as f:
            data = json.load(f)

        _apply(cfg, data)
        _apply_env_overrides(cfg)
        logger.info(f'已从 {_SETTINGS_PATH} 恢复配置,当前 provider={cfg.MODEL_PROVIDER}')
    except Exception as e:
        # Best effort by design: fall back to environment values only.
        logger.warning(f'加载配置文件失败: {e}')
        _apply_env_overrides(cfg)


# (environment variable name, attribute name on cfg) pairs for API keys.
_ENV_API_KEYS = (
    ('QWEN_API_KEY', 'QWEN_API_KEY'),
    ('OPENAI_API_KEY', 'OPENAI_API_KEY'),
    ('DEEPSEEK_API_KEY', 'DEEPSEEK_API_KEY'),
    ('DOUBAO_API_KEY', 'DOUBAO_API_KEY'),
    ('KIMI_API_KEY', 'KIMI_API_KEY'),
)


def _apply_env_overrides(cfg) -> None:
    """Let environment variables take precedence over values already on *cfg*.

    MODEL_PROVIDER and each API key listed in _ENV_API_KEYS are copied onto
    *cfg* when the variable holds a usable value. Values are stripped BEFORE
    they are validated, so that:

    - a blank or whitespace-only variable never replaces a configured key
      with an empty string, and
    - a placeholder such as ' sk-your-key' is rejected even when it carries
      leading whitespace.
    """
    provider = (os.environ.get('MODEL_PROVIDER') or '').strip()
    if provider:
        cfg.MODEL_PROVIDER = provider
    for env_name, attr in _ENV_API_KEYS:
        value = (os.environ.get(env_name) or '').strip()
        # 'sk-your...' is the template placeholder, not a real key.
        if value and not value.startswith('sk-your'):
            setattr(cfg, attr, value)
'doubao_base_url': cfg.DOUBAO_BASE_URL, + 'kimi_api_key': cfg.KIMI_API_KEY, + 'kimi_model': cfg.KIMI_MODEL, + 'kimi_base_url': cfg.KIMI_BASE_URL, + 'max_concurrent': cfg.MAX_CONCURRENT_SECTIONS, + 'content_volume': cfg.CONTENT_VOLUME, + 'target_pages': getattr(cfg, 'TARGET_PAGES', 0), + 'page_char_estimate': getattr(cfg, 'PAGE_CHAR_ESTIMATE', 700), + } + try: + os.makedirs(os.path.dirname(_SETTINGS_PATH), exist_ok=True) + with open(_SETTINGS_PATH, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=2) + except Exception as e: + logger.warning(f'保存配置文件失败: {e}') + + +def _apply(cfg, data: dict) -> None: + """将 dict 中的值安全地写回 config 模块""" + str_fields = { + 'model_provider': 'MODEL_PROVIDER', + 'qwen_api_key': 'QWEN_API_KEY', + 'qwen_model': 'QWEN_MODEL', + 'qwen_base_url': 'QWEN_BASE_URL', + 'openai_api_key': 'OPENAI_API_KEY', + 'openai_model': 'OPENAI_MODEL', + 'openai_base_url': 'OPENAI_BASE_URL', + 'deepseek_api_key': 'DEEPSEEK_API_KEY', + 'deepseek_model': 'DEEPSEEK_MODEL', + 'deepseek_base_url': 'DEEPSEEK_BASE_URL', + 'ollama_base_url': 'OLLAMA_BASE_URL', + 'ollama_model': 'OLLAMA_MODEL', + 'doubao_api_key': 'DOUBAO_API_KEY', + 'doubao_model': 'DOUBAO_MODEL', + 'doubao_base_url': 'DOUBAO_BASE_URL', + 'kimi_api_key': 'KIMI_API_KEY', + 'kimi_model': 'KIMI_MODEL', + 'kimi_base_url': 'KIMI_BASE_URL', + } + for key, attr in str_fields.items(): + val = data.get(key) + if val and isinstance(val, str): + setattr(cfg, attr, val) + + if 'max_concurrent' in data: + try: + v = int(data['max_concurrent']) + cfg.MAX_CONCURRENT_SECTIONS = max(1, min(v, 20)) + except (ValueError, TypeError): + pass + + valid_volumes = ('concise', 'standard', 'detailed', 'full') + vol = data.get('content_volume') + if vol and vol in valid_volumes: + cfg.CONTENT_VOLUME = vol + + if 'target_pages' in data: + try: + cfg.TARGET_PAGES = max(0, int(data['target_pages'])) + except (ValueError, TypeError): + pass + + if 'page_char_estimate' in data: + try: + 
cfg.PAGE_CHAR_ESTIMATE = max(300, min(3000, int(data['page_char_estimate']))) + except (ValueError, TypeError): + pass diff --git a/utils/tender_kind_sections.py b/utils/tender_kind_sections.py new file mode 100644 index 0000000..12875f3 --- /dev/null +++ b/utils/tender_kind_sections.py @@ -0,0 +1,278 @@ +""" +按招标文件类型(工程 / 服务 / 货物)区分的章节正文生成提示词模板。 +与 modules.generator.BID_WRITING_SYSTEM 配合使用;自称以系统铁律为准,统一用「我方」。 +""" +import re +from typing import Optional + +VALID_TENDER_KINDS = frozenset({'engineering', 'service', 'goods'}) + +DEFAULT_WORD_COUNT_SPEC = ( + '- 一般小节:不少于 2000 字;核心技术/重点评分章节:不少于 4000 字\n' + '- 字数须由实质方案内容支撑,禁止用重复项目背景或复述招标要求凑字数\n' + '- 有实质细节的展开写,原则性描述可简洁处理;通过流程、节点、比选、管控展开满足篇幅' +) + +TENDER_KIND_CLASSIFY = """\ +你是一名招标文件分类专家。根据以下招标文件摘录,判断本项目技术标书应采用的「写作模板类型」。 + +只输出以下三个英文单词之一,不要输出任何其他文字、标点、换行或解释: +engineering +service +goods + +含义: +- engineering:工程施工类(建筑、市政、公路、水利、装修、园林、拆除等,以现场施工组织、工艺、机械、进度网络为主) +- service:服务类(咨询、设计、监理、运维、物业、保洁、餐饮配送、培训、安保、技术服务等,以人力/智力交付、流程、SLA 为主) +- goods:货物类(设备、材料、车辆、家具、软硬件供货等,以产品规格、供货、质保、验收为主;含附带安装指导仍以供货为主可归此类) + +判定规则: +若主要为施工安装且涉及土建/结构/施工机械与工期,归为 engineering。 +若主要为服务过程、人员驻场、响应时效与服务质量体系,归为 service。 +若主要为产品技术规格、供货批次、出厂检验与到货验收,归为 goods。 +若施工与供货并重,以现场施工量与工期为主则 engineering,以设备物资交付为主则 goods。 + +【招标文件摘录】 +{excerpt} +""" + + +def get_tender_kind_classify_prompt(excerpt: str) -> str: + return TENDER_KIND_CLASSIFY.replace('{excerpt}', excerpt or '') + + +def parse_tender_kind_response(response: str) -> str: + """从模型返回中解析出 engineering / service / goods,失败则 engineering。""" + if not response: + return 'engineering' + tokens = re.sub(r'[^a-zA-Z]+', ' ', response).lower().split() + for w in tokens: + if w in VALID_TENDER_KINDS: + return w + low = response.lower() + for k in ('engineering', 'service', 'goods'): + if k in low: + return k + return 'engineering' + + +def normalize_tender_kind(kind: Optional[str]) -> str: + k = (kind or '').strip().lower() + return k if k in VALID_TENDER_KINDS else 'engineering' + + +# ── 工程类 
─────────────────────────────────────────────────────────────── + +SECTION_DETAILS_ENGINEERING = """\ +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +- 角色:资深工程施工组织设计专家 +- 任务:撰写通用型工程施工组织设计技术章节 + +【核心定位】 +- 通用施工模板,适用于建筑、市政、公路、水利等工程施工类项目 +- 聚焦:施工方案、工艺方法、机械设备、进度计划、质量安全控制 +- 正文为可直接提交的成稿语句:凡招标文件概要或工程量清单摘要已给出的工程量、地质、工期、指标等,可如实融入叙述;未给出的具体数值、型号、台数、吨位等,一律用通顺的中文概括表达(如"相应规格""与进度及作业面相匹配的台套""符合设计及规范要求的能级"),不得使用方括号或待填项留白 + +【内容特征】 +- 施工工艺描述到"方法层面";可引用规范条文名称或编号(如"应符合JTG/T 3610要求");无依据处不写臆造数字 +- 设备与资源配置:写清设备类别与用途,用"按工况与设计要求选配相应规格与数量""满足流水作业与峰值强度需要"等概括句式,禁止出现"[型号][数量]台"类占位 +- 进度计划使用相对阶段("施工准备期"、"主体施工期")而非具体日期 +- 技术措施可提供多方案比选,用"视地质与水文条件选用适宜工艺"等自然语言衔接现场条件,禁止方括号待填 + +【未定参数的写法(替代一切占位符)】 +- 工程规模与结构:用"本工程相应单体与线路区段""按设计结构形式与跨度条件"等概括,不罗列未提供的具体数字 +- 技术参数:已见于招标/清单的写具体值;未见者写"按设计强度等级与验收标准执行""压实度与分层厚度满足规范及设计要求" +- 机械与劳动力:写"配置满足峰值强度与关键线路需要的机械组合""劳动力按施工阶段动态投入并保持关键岗位持证齐备" +- 时间节点:写"在招标工期内划分准备、主体、收尾阶段并设置可控里程碑",无具体日历则不用臆造周数 + +【行文规范】 +- 自称统一用「我方」,禁用「我们」「本公司」 +- 招标人称「招标方」或「建设单位」 +- 禁止前导句和AI套话(综上所述、高度重视等) +- 列举用(1)(2)(3),禁用"首先其次" +- 纯文本输出,段落间空行分隔 + +【防过拟合约束】 +- 不绑定具体地名与局地气候细节,改为"结合项目环境与季节特点采取针对性措施" +- 不绑定特定施工方法(如不说"必须用旋挖钻",改为"根据地质选用适宜桩基工艺") +- 使用弹性表述:"按设计要求"、"视现场情况"、"符合规范规定" + +【字数要求】 +{word_count_spec} +- 通过展开多方案比选、详细工艺流程、管控节点来满足篇幅 + +【输入】 +- 招标文件概要:{summary} +- 标书目录:{outline} +- 子小节标题:{subsection_title} + +直接输出正文,不含标题和解释。""" + + +# ── 服务类 ─────────────────────────────────────────────────────────────── + +SECTION_DETAILS_SERVICE = """\ +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +- 角色:资深服务方案架构师 +- 任务:撰写通用型服务项目实施方案 + +【核心定位】 +- 通用服务模板,适用于咨询服务、运维服务、技术服务、物业管理、培训服务等 +- 聚焦:服务方案、实施流程、人员配置、质量保障、响应机制、服务标准 +- 严禁出现工程施工技术参数(如混凝土标号、压实度等) +- 正文为成稿:招标/采购文件已载明的服务范围、人数、响应时限、到场要求等可如实写入;未载明的不得用方括号待填,改用"按采购文件与服务等级要求配置""满足驻场与高峰时段人力需要""建立分级响应与升级机制"等概括表述写清含义 + +【内容特征】 +- 服务流程:按"接收需求→分析评估→方案制定→实施执行→验收交付→持续改进"框架展开 +- 人员配置:强调专业资质与岗位角色齐全,用"配备满足本项目服务范围与关键岗位持证要求的人员力量""项目经理及骨干具备相应执业或认证资格"等完整句子,禁止"[资质][岗位][数量]名"式占位 +- 质量保障:使用服务体系标准(如ISO 9001、ITIL、ITSS)而非工程规范 +- 响应机制:写清"受理—分派—处理—回访/关闭"闭环;时限已见于招标文件的写具体值,未见者写"按招标文件及行业通行服务等级划分响应与处理时限,并设置升级与应急通道" 
+- 服务标准:可引用SLA框架,用自然语言描述指标层级与考核方式,禁止用方括号代替指标 + +【未定参数的写法】 +- 服务范围与对象:用"采购文件约定的服务内容与交付边界""服务对象规模与业务场景按项目实际确定"等概括 +- 人员与资源:用"与峰值并发与服务等级相匹配的人力与工具配置" +- 场地与备件:用"按需设置服务场所与备件储备,保障连续性与可用性目标" + +【行文规范】 +- 自称统一用「我方」,禁用「我们」「本公司」 +- 招标人称「招标方」「采购人」或「甲方」 +- 禁止前导句和AI套话 +- 列举用(1)(2)(3),禁用"首先其次" +- 纯文本输出,段落间空行分隔 +- 强调"服务承诺"与"保障措施"的可执行性,避免空泛 + +【防过拟合约束】 +- 不预设具体行业细节(如不说"针对医院HIS系统",改为"针对采购人业务系统与数据环境") +- 服务方案提供"标准模块+可选配置"结构("基础服务包包含...,增值服务可选...") +- 使用"结合采购人行业特点与监管要求""参照同类项目成熟实践"等弹性表述 + +【内容禁区】 +- 禁止出现:施工工艺、材料设备技术参数、工程量计算、施工机械配置 +- 禁止出现:建筑结构、土木工程技术措施 + +【字数要求】 +{word_count_spec} +- 通过详细描述服务流程节点、人员职责分工、质量检查点、应急预案来满足篇幅 + +【输入】 +- 招标文件概要:{summary} +- 标书目录:{outline} +- 子小节标题:{subsection_title} + +直接输出正文,不含标题和解释。""" + + +# ── 货物类 ─────────────────────────────────────────────────────────────── + +SECTION_DETAILS_GOODS = """\ +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +- 角色:资深供货方案技术专家 +- 任务:撰写通用型货物采购项目技术响应方案 + +【核心定位】 +- 通用供货模板,适用于设备采购、材料供应、系统集成、软件采购等 +- 聚焦:产品技术规格、供货方案、质量保证、安装调试(如有)、售后服务 +- 正文为成稿:采购文件、技术规范书或清单中已列明的型号、数量、指标、交货期、质保期等可如实响应;未列明的不得臆造优于招标的数字,亦不得用方括号待填;用"不低于采购文件对应条款""满足招标文件列明的性能与符合性要求""供货批次与到货节奏与现场安装计划相衔接"等概括语言写全句 + +【内容特征】 +- 技术规格:按"指标项—符合性说明"展开;已给出阈值的照写;未给出的写"满足招标文件技术指标与检测方法要求""与同类应用场景主流水平相当且不降低实质性响应" +- 产品描述:强调功能特性、可靠性与标准符合性,避免绑定特定品牌(除非招标文件指定) +- 供货方案:分阶段描述(签约后组织生产或备货、出厂检验、运输与到货验收);具体天数仅在有依据时写出,否则用"按合同与采购文件约定的供货周期执行" +- 质量保障:强调"出厂检验+第三方检测(如要求)+质保期服务"分层体系 +- 售后服务:写清质保责任边界、备件与技术支持渠道;时长以招标为准,无则写"按采购文件及国家相关规定执行" + +【未定参数的写法】 +- 性能与容量:用"满足采购文件规定的处理能力/精度/兼容性等关键指标" +- 数量与批次:用"与合同清单及现场需求匹配的供货批次与配套件配置" +- 服务时效:用"建立可追踪的报修、响应与闭环机制,时限不低于采购文件要求" + +【行文规范】 +- 自称统一用「我方」,禁用「我们」「本公司」 +- 招标人称「招标方」「采购人」或「甲方」 +- 禁止前导句和AI套话 +- 列举用(1)(2)(3),禁用"首先其次" +- 纯文本输出,段落间空行分隔 +- 技术描述客观准确,避免夸大(不用"最先进"、"行业第一",改用"符合国家标准或采购文件引用标准的要求""满足招标文件实质性条款") + +【防过拟合约束】 +- 不绑定特定品牌(如不说"采用华为服务器",改为"提供满足采购文件性能与安全要求的服务器设备") +- 无具体数值依据时,不写虚构的"≥某数值",改为对符合性与可检测性的承诺 +- 供货方案考虑多种交付场景(国内供货、进口设备、定制生产等)时,用自然语言比较路径优劣与适用条件 + +【内容禁区】 +- 禁止出现:施工组织、安装工艺(除非含安装服务)、土建工程、人员现场施工配置 +- 禁止出现:工程管理流程(如施工进度网络图) + +【字数要求】 +{word_count_spec} +- 
通过详细展开技术参数说明、供货流程节点、质量检验程序、售后服务细则来满足篇幅 + +【输入】 +- 招标文件概要:{summary} +- 标书目录:{outline} +- 子小节标题:{subsection_title} + +直接输出正文,不含标题和解释。""" + + +def build_section_detail_prompt( + kind: str, + summary: str, + outline: str, + title: str, + word_count_spec: str = '', + boq_summary: str = '', +) -> str: + k = normalize_tender_kind(kind) + if k == 'service': + base = SECTION_DETAILS_SERVICE + elif k == 'goods': + base = SECTION_DETAILS_GOODS + else: + base = SECTION_DETAILS_ENGINEERING + + wc = word_count_spec.strip() or DEFAULT_WORD_COUNT_SPEC + text = base.format( + word_count_spec=wc, + summary=summary or '(未提供)', + outline=outline or '(未提供)', + subsection_title=title or '', + ) + text += ( + '\n\n【须同步遵守的全局写作禁忌】' + '禁止复述招标要求后再作答;禁止各章重复工程量数字与项目背景;' + '禁止无依据将参数写成优于招标文件;字数不得仅靠套话堆砌;' + '禁止使用方括号、「待填」「TBD」等表示未完稿字段(如[型号][数量][数值]);' + '未定信息须写成通顺的概括性中文整句。' + '若本任务提示词末尾另有「图示/表格」专用输出规范,其中的结构化标记按该规范执行,' + '不视为待填占位。' + ) + + if boq_summary.strip(): + text += ( + '\n\n- 工程量清单关键信息(写作时按需引用清单中已有数量与单位,勿无故复读;' + '清单未列明的分项用概括性施工组织语言描述,禁止使用方括号待填项):\n' + + boq_summary.strip() + ) + return text + + +# 对话模式:按类型追加的系统说明片段(与 app.py 中基础说明拼接) +CHAT_KIND_INSTRUCTION = { + 'engineering': ( + '\n【本模板类型:工程施工】' + '侧重施工组织、工艺与质量安全;未在招标文件或清单中出现的具体型号、台数、吨位等' + '用概括性中文表述写清,禁止使用方括号待填;勿虚构优于招标的规格。' + ), + 'service': ( + '\n【本模板类型:服务】' + '侧重服务流程、人员与SLA;人数、时限等以招标/采购文件为准,无则概括表述,禁止方括号待填;' + '禁止大段写混凝土标号、压实度、施工机械等工程参数。' + ), + 'goods': ( + '\n【本模板类型:货物供货】' + '侧重规格、供货、检验与质保;指标与交期以采购文件为准,无则概括表述,禁止方括号待填;' + '禁止写施工组织与土建;勿绑定未指定的品牌。' + ), +} diff --git a/utils/volume_chapters.py b/utils/volume_chapters.py new file mode 100644 index 0000000..46764d8 --- /dev/null +++ b/utils/volume_chapters.py @@ -0,0 +1,173 @@ +""" +目标页数与一级篇章数量区间:阈值与 generator._effective_volume 一致。 + +小章节(自动填充子目录行)总条数:与「目标页数」线性映射,见 subchapter_total_* 与 +allocate_subchapters_to_main *。 +""" +from __future__ import annotations + +import random +from typing import List, Optional, Tuple + +# 与 modules.generator._effective_volume 页数分界一致 +PAGE_VOLUME_THRESHOLDS = 
(125, 175, 225) + +# 各篇幅档位对应的一级篇章数量 [min, max](与页数映射表一致) +TOP_LEVEL_CHAPTER_RANGES = { + 'concise': (6, 8), + 'standard': (8, 10), + 'detailed': (10, 12), + 'full': (12, 16), +} + +# 小章节总条数 = slope * pages + intercept(过点 100->78, 300->212) +SUBCHAPTER_PAGES_SLOPE = 0.67 +SUBCHAPTER_PAGES_INTERCEPT = 11.0 +SUBCHAPTER_JITTER_LOW = 0.9 +SUBCHAPTER_JITTER_HIGH = 1.1 +# expand 在请求/库/配置均未给出页数时,按 100 页 ≈ 基线 78 章 ±10%,避免小章节失控到数百 +EXPAND_OUTLINE_DEFAULT_TARGET_PAGES = 100 + + +def subchapter_total_base_from_pages(pages: int) -> float: + return SUBCHAPTER_PAGES_SLOPE * float(pages) + SUBCHAPTER_PAGES_INTERCEPT + + +def subchapter_jitter_bounds(n_base: float) -> Tuple[int, int]: + """ + 对线性基线 N_base 的严格 ±10% 整数闭区间 [lo, hi](用于全标小章节行总数抽样后夹紧)。 + 例:N_base=78(约 100 页)→ lo=70, hi=86。 + """ + lo = max(1, int(round(n_base * SUBCHAPTER_JITTER_LOW))) + hi = max(lo, int(round(n_base * SUBCHAPTER_JITTER_HIGH))) + return lo, hi + + +def subchapter_total_effective( + pages: int, + k: int, + rng: Optional[random.Random] = None, +) -> int: + """ + 在目标页数 P 下,对一次「小章节自动填充」抽样的子章节行总数上界(全标合计)。 + 先按 N_base(P)=0.67*P+11 与 U~Uniform(0.9,1.1) 取整,再**严格夹紧**到 [round(N_base*0.9), round(N_base*1.1)], + 故 100 页时锚定 78±10% → 恒在 70–86 条(在仅受随机影响时)。 + + 不再用 max(n, k) 抬升总数:主章数 k 很大时若强行「每章至少 1 条」会把 N 抬到 300+,与 78±10% 目标冲突。 + 当 n < k 时由 allocate_subchapters_to_mains 将额度优先分给部分主章,其余主章 quota 为 0(该次不填小章)。 + pages<=0 或 k<=0 时返回 0(调用方不应在 TARGET_PAGES>0 且可扩展主章>0 之外使用)。 + """ + if pages <= 0 or k <= 0: + return 0 + r = rng if rng is not None else random.Random() + n_base = subchapter_total_base_from_pages(pages) + lo, hi = subchapter_jitter_bounds(n_base) + n = int(round(n_base * r.uniform(SUBCHAPTER_JITTER_LOW, SUBCHAPTER_JITTER_HIGH))) + n = min(max(n, lo), hi) + return n + + +def allocate_subchapters_to_mains(n: int, k: int) -> List[int]: + """ + 将整数 n 均分到 k 个主章:前 n%k 个主章得 floor+1,其余得 floor;k=0 返回 []。 + """ + if k <= 0: + return [] + n = max(0, n) + q, r = n // k, n % k + return [q + 1] * r + [q] * (k - r) + + +def 
def volume_key_from_target_pages(pages: int, content_volume_default: str = 'standard') -> str:
    """Map a target page count to a volume tier key.

    Non-positive *pages* returns *content_volume_default*; otherwise the first
    tier whose ceiling in ``PAGE_VOLUME_THRESHOLDS`` is not exceeded is
    returned, with ``'full'`` above the last ceiling.
    """
    if pages <= 0:
        return content_volume_default
    tiers = ('concise', 'standard', 'detailed')
    for ceiling, tier in zip(PAGE_VOLUME_THRESHOLDS, tiers):
        if pages <= ceiling:
            return tier
    return 'full'


def top_level_chapter_range_from_pages(
    pages: int,
    content_volume_default: str = 'standard',
) -> Tuple[int, int]:
    """Return the ``(lo, hi)`` range of top-level chapters for *pages*.

    Without a target page count (``pages <= 0``) the ``'standard'`` range
    (8-10 chapters) is used, regardless of *content_volume_default*.
    """
    if pages <= 0:
        tier = 'standard'
    else:
        tier = volume_key_from_target_pages(pages, content_volume_default)
    return TOP_LEVEL_CHAPTER_RANGES[tier]


def outline_chapter_count_hint(
    pages: int,
    content_volume_default: str = 'standard',
    page_char_estimate: int = 700,
) -> str:
    """Build the chapter-count constraint sentence for the outline prompt.

    With a positive *pages* the sentence carries the tier's chapter range and
    the overall character target (``pages * page_char_estimate``); otherwise
    the fixed "8-10 chapters" wording is returned.
    """
    chars_per_page = max(1, int(page_char_estimate or 700))
    if pages <= 0:
        return '总的章节数应该控制在8-10个,一级篇章总数不超过10个'
    low, high = top_level_chapter_range_from_pages(pages, content_volume_default)
    total_chars = int(round(pages * chars_per_page))
    sentence = [
        f'总的章节数应该控制在约 {low}–{high} 个,一级篇章总数不超过 {high} 个',
        f'(目标约 {pages} 页,按目标页数映射的篇幅档位估算)。',
        f'全稿正文字量规模需与总目标约 {total_chars} 字',
        f'({pages} 页×约每页 {chars_per_page} 字的粗略换算计)同量级,目录层次与末级小节目不宜过细,',
        '避免叶节数过多时单节篇幅过薄、难以成文。',
    ]
    return ''.join(sentence)


def outline_chapter_count_hint_with_rating_variant(
    pages: int,
    content_volume_default: str = 'standard',
    page_char_estimate: int = 700,
) -> str:
    """Build the same kind of constraint for the rating-based outline template.

    This is the tighter wording: it states only the chapter range and the
    overall character target.
    """
    chars_per_page = max(1, int(page_char_estimate or 700))
    if pages <= 0:
        return '总的章节数应该控制在8-10个,不超过10个'
    low, high = top_level_chapter_range_from_pages(pages, content_volume_default)
    total_chars = int(round(pages * chars_per_page))
    sentence = [
        f'总的章节数应该控制在约 {low}–{high} 个,不超过{high} 个',
        f'(目标约 {pages} 页,按目标页数映射的篇幅档位估算)',
        f'全稿正文字量约与总目标 {total_chars} 字同量级,末级子目不宜过细',
    ]
    return ''.join(sentence)
logger = logging.getLogger(__name__)

# Per volume tier: (base chars, core chars, display label, preset max tokens).
# NOTE(review): the original comment says this must stay in sync with
# modules/generator.VOLUME_PRESETS.
VOLUME_PRESETS: Dict[str, Tuple[int, int, str, int]] = {
    'concise': (1200, 2500, '精简版', 5000),
    'standard': (2000, 4000, '标准版', 8000),
    'detailed': (3000, 5500, '详细版', 12000),
    'full': (4000, 7000, '充实版', 16000),
}

# Upper bound applied to max_tokens for each model provider.
_PROVIDER_TOKEN_LIMITS = {
    'deepseek': 8192,
    'qwen': 8192,
    'openai': 16384,
    'ollama': 8192,
    'doubao': 8192,
    'kimi': 8192,
}

# Built-in rules used when the JSON file is missing or unreadable.
DEFAULT_RULES: Dict[str, Any] = {
    'schema_version': 1,
    'alpha': 0.85,
    'budget_mode': 'target_pages',
    'per_section_floor': None,
    'per_section_cap': None,
    'relevance': {'method': 'keyword_overlap', 'min_rating_weight': 0.01},
    'rating_parse': {},
    'prompt': {'top_k_rating_items': 4, 'intro_line': ''},
    'max_tokens_scale': False,
}


def rules_path() -> str:
    """Return the default location of ``word_allocation_rules.json``."""
    return os.path.join(config.DATA_DIR, 'word_allocation_rules.json')


def load_rules(path: Optional[str] = None) -> Dict[str, Any]:
    """Load the allocation rules, layered over ``DEFAULT_RULES``.

    Top-level keys from the file replace the defaults, except that
    ``relevance`` and ``prompt`` are merged key by key when the file gives a
    dict. Keys starting with ``_`` are ignored. A missing or unreadable file
    yields the defaults (a warning is logged for read or parse failures).

    The result is always an independent deep copy, so callers may mutate it
    without affecting ``DEFAULT_RULES`` or later calls.

    Args:
        path: Rules file to read; defaults to ``rules_path()``.
    """
    import copy  # local import: only this function needs it

    # A shallow dict() copy would share the nested 'relevance' / 'prompt' /
    # 'rating_parse' dicts with the module-level defaults.
    data = copy.deepcopy(DEFAULT_RULES)
    p = path or rules_path()
    if not os.path.isfile(p):
        return data
    try:
        with open(p, encoding='utf-8') as f:
            raw = json.load(f)
    except Exception as e:  # best effort by design: fall back to defaults
        logger.warning('加载 word_allocation_rules.json 失败,使用内置默认: %s', e)
        return data
    if not isinstance(raw, dict):
        return data
    for k, v in raw.items():
        if k.startswith('_'):
            continue
        if k in ('relevance', 'prompt') and isinstance(v, dict):
            data[k] = {**data.get(k, {}), **v}
        else:
            data[k] = v
    return data
def _as_float(x: Any, default: float = 0.0) -> float:
    """Coerce *x* to float, returning *default* when that is not possible.

    Numbers are converted directly. Strings are stripped of everything except
    digits, ``.`` and ``-`` before parsing (so ``'30分'`` becomes 30.0).
    """
    if isinstance(x, (int, float)):
        return float(x)
    if not isinstance(x, str):
        return default
    digits = re.sub(r'[^\d.\-]', '', x)
    try:
        return float(digits) if digits else default
    except ValueError:
        return default


def _item_name(d: Dict[str, Any]) -> str:
    """Return the first non-blank string among the known name keys, else ''."""
    for field in ('name', 'title', 'item_name', '评分项', '评分项名称', 'indicator'):
        candidate = d.get(field)
        if isinstance(candidate, str):
            stripped = candidate.strip()
            if stripped:
                return stripped
    return ''


def _item_weight(d: Dict[str, Any]) -> float:
    """Return the first positive value among the known weight keys, else 1.0."""
    for field in ('weight', 'score', '分值', 'max_score', '满分', 'points'):
        if field not in d:
            continue
        parsed = _as_float(d.get(field), 0.0)
        if parsed > 0:
            return parsed
    return 1.0


def _collect_rating_dicts(obj: Any, acc: List[Dict[str, Any]]) -> None:
    """Append every dict nested in *obj* to *acc*, depth-first, parents first."""
    pending = [obj]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            acc.append(node)
            # Reversed so the first child is popped (and so visited) first.
            pending.extend(reversed(list(node.values())))
        elif isinstance(node, list):
            pending.extend(reversed(node))


def parse_rating_json(raw: Optional[str]) -> List[Dict[str, Any]]:
    """Extract rating items from a ``rating_json`` string.

    Every dict anywhere in the JSON that has a usable name (two characters or
    more) becomes one item; names are de-duplicated case-insensitively,
    keeping the first occurrence.

    Returns:
        A list of ``{'name': str, 'weight': float, 'keywords': List[str]}``;
        empty when *raw* is not a non-blank string or is not valid JSON.
    """
    if not isinstance(raw, str) or not raw.strip():
        return []
    try:
        root = json.loads(raw.strip())
    except json.JSONDecodeError:
        return []

    found: List[Dict[str, Any]] = []
    _collect_rating_dicts(root, found)

    result: List[Dict[str, Any]] = []
    known = set()
    for entry in found:
        label = _item_name(entry)
        if len(label) < 2:
            continue
        dedup_key = label.lower()
        if dedup_key in known:
            continue
        known.add(dedup_key)

        hints = entry.get('keywords') or entry.get('keyword') or entry.get('要点')
        if isinstance(hints, list):
            keywords = [
                str(h).strip()
                for h in hints
                if isinstance(h, (str, int, float)) and str(h).strip()
            ]
        elif isinstance(hints, str) and hints.strip():
            keywords = [hints.strip()]
        else:
            keywords = []
        result.append({'name': label, 'weight': _item_weight(entry), 'keywords': keywords})

    return result
def _title_tokens(title: str) -> List[str]:
    """Split a section title into match tokens.

    Digits, whitespace and common punctuation act as separators; pieces
    shorter than two characters are dropped. Runs of two or more CJK
    characters found in the raw title are appended when not already present.
    """
    if not title:
        return []
    cleaned = re.sub(r'[\s\d..、,,;;::/\\()()【】\[\]「」]+', ' ', title)
    tokens = [piece for piece in cleaned.split() if len(piece) >= 2]
    for run in re.findall(r'[\u4e00-\u9fff]{2,}', title):
        if run not in tokens:
            tokens.append(run)
    return tokens


def _overlap_score(title: str, item: Dict[str, Any]) -> float:
    """Score in [0, 1] of how well a rating *item* matches a section *title*.

    The base score is the share of title tokens found in the item's name and
    keywords. It is raised to at least 0.85 when the name and title contain
    one another, and to at least 0.7 when a keyword of two or more characters
    occurs in the title.
    """
    tokens = _title_tokens(title)
    if not tokens:
        return 0.0
    name = item['name']
    keywords = item.get('keywords') or []
    haystack = name + ''.join(keywords)
    matched = sum(1 for tok in tokens if tok and tok in haystack)
    score = matched / max(len(tokens), 1)
    if name in title or title in name:
        score = max(score, 0.85)
    if any(isinstance(kw, str) and len(kw) >= 2 and kw in title for kw in keywords):
        score = max(score, 0.7)
    return min(1.0, score)


def _raw_utilities(
    leaves: List[Dict[str, Any]],
    items: List[Dict[str, Any]],
    min_w: float,
) -> Tuple[List[float], List[List[Tuple[str, float]]]]:
    """Compute a normalised utility per leaf section plus its top rating items.

    For each leaf, ``u_i = sum_j weight_j * overlap_ij`` over the items whose
    weight is at least *min_w* (all items when none qualify). Utilities are
    divided by their maximum; if every utility is zero they are all set to 1.0.

    Returns:
        ``(u, top_lists)`` where ``top_lists[i]`` holds up to 12
        ``(item name, contribution)`` pairs, largest contribution first.
    """
    eligible = [it for it in items if it['weight'] >= min_w] or items
    utilities: List[float] = []
    top_lists: List[List[Tuple[str, float]]] = []

    for leaf in leaves:
        title = leaf.get('section_title') or ''
        scored: List[Tuple[str, float]] = []
        total = 0.0
        for it in eligible:
            contrib = it['weight'] * _overlap_score(title, it)
            if contrib > 0:
                scored.append((it['name'], contrib))
                total += contrib
        scored.sort(key=lambda pair: -pair[1])
        utilities.append(total)
        top_lists.append(scored[:12])

    peak = max(utilities) if utilities else 0.0
    if peak <= 0:
        utilities = [1.0] * len(leaves)
    else:
        utilities = [value / peak for value in utilities]
    return utilities, top_lists
def _clamp_int(x: int, lo: int, hi: int) -> int:
    """Clamp *x* into the closed interval [lo, hi]."""
    return max(lo, min(hi, x))


def _water_adjust(
    targets: List[int],
    budget: int,
    floor_v: int,
    cap_v: int,
    priority: List[float],
) -> List[int]:
    """Fit per-section targets into [floor_v, cap_v] with a sum near *budget*.

    The bounds are repaired first: they are swapped if inverted, the floor is
    lowered when ``n * floor_v`` exceeds the budget, and the cap is raised
    when ``n * cap_v`` cannot reach it. Targets are then clamped. Any surplus
    is handed out greedily to the highest-priority sections until each hits
    the cap; any deficit is taken from the lowest-priority sections until
    each hits the floor. Ties are broken by position.

    The earlier implementation moved one character per step and rescanned the
    sections on every step, costing O(|delta| * n). This version gives the
    same result with one pass over the sorted sections.

    Args:
        targets: Desired size per section.
        budget: Desired total.
        floor_v: Lower bound per section.
        cap_v: Upper bound per section.
        priority: One priority per section; higher grows first, shrinks last.

    Returns:
        The adjusted sizes. The sum can differ from *budget* only when the
        bounds leave no room (for example a budget below one per section).
    """
    n = len(targets)
    if n == 0:
        return []
    if floor_v > cap_v:
        floor_v, cap_v = cap_v, floor_v
    if n * floor_v > budget:
        floor_v = max(1, budget // n)
    if n * cap_v < budget:
        cap_v = max(floor_v, (budget + n - 1) // n)

    cur = [_clamp_int(t, floor_v, cap_v) for t in targets]
    delta = budget - sum(cur)

    if delta > 0:
        # Surplus: fill the highest-priority sections up to the cap.
        for i in sorted(range(n), key=lambda idx: -priority[idx]):
            room = cap_v - cur[i]
            if room <= 0:
                continue
            step = min(room, delta)
            cur[i] += step
            delta -= step
            if delta == 0:
                break
    elif delta < 0:
        # Deficit: drain the lowest-priority sections down to the floor.
        for i in sorted(range(n), key=lambda idx: priority[idx]):
            slack = cur[i] - floor_v
            if slack <= 0:
                continue
            step = min(slack, -delta)
            cur[i] -= step
            delta += step
            if delta == 0:
                break
    return cur
def compute_leaf_allocations(
    volume_key: str,
    leaves: List[Dict[str, Any]],
    rating_raw: Optional[str],
    rules: Optional[Dict[str, Any]] = None,
) -> Optional[Dict[int, Dict[str, Any]]]:
    """Compute ``target_chars``, ``word_count_spec`` and ``max_tokens`` per leaf.

    When rating items can be parsed from *rating_raw*, each leaf's size is
    driven by how well its title matches them. Without rating items the total
    budget (target pages x characters per page) is split evenly, but only if
    the rules use ``budget_mode == 'target_pages'`` and a positive
    ``config.TARGET_PAGES`` is set; otherwise ``None`` is returned so the
    caller can fall back to its older logic.

    Args:
        volume_key: Key into ``VOLUME_PRESETS``; unknown keys use 'standard'.
        leaves: ``[{'id': int, 'section_title': str}, ...]``.
        rating_raw: JSON text with the technical rating items.
        rules: Allocation rules; loaded with ``load_rules()`` when falsy.

    Returns:
        ``{leaf id: {'target_chars', 'word_count_spec', 'max_tokens'}}``,
        ``{}`` for no leaves, or ``None`` as described above.
    """
    rules = rules or load_rules()
    if not leaves:
        return {}

    base, core, _, preset_tokens = VOLUME_PRESETS.get(
        volume_key, VOLUME_PRESETS['standard']
    )
    floor_cfg = rules.get('per_section_floor')
    cap_cfg = rules.get('per_section_cap')
    floor_v = int(floor_cfg) if floor_cfg is not None else int(base * 0.5)
    cap_v = int(cap_cfg) if cap_cfg is not None else core
    floor_v = min(floor_v, cap_v)

    # A rules file may set these keys to null; treat that like "not set"
    # instead of crashing on float(None) / None.get(...).
    alpha_cfg = rules.get('alpha')
    alpha = 0.85 if alpha_cfg is None else float(alpha_cfg)
    alpha = max(0.0, min(1.0, alpha))
    relevance_cfg = rules.get('relevance')
    if not isinstance(relevance_cfg, dict):
        relevance_cfg = {}
    min_w = float(relevance_cfg.get('min_rating_weight', 0.01))

    n = len(leaves)
    mode = (rules.get('budget_mode') or 'anchor_mean').strip()
    pages_cfg = int(getattr(config, 'TARGET_PAGES', 0) or 0)
    pce = max(1, int(getattr(config, 'PAGE_CHAR_ESTIMATE', 700) or 700))
    page_budgeted = mode == 'target_pages' and pages_cfg > 0
    if page_budgeted:
        budget = int(round(pages_cfg * pce))
    elif mode == 'anchor_base':
        budget = int(round(n * base))
    else:
        budget = int(round(n * (base + core) / 2.0))

    items = parse_rating_json(rating_raw)
    if not items:
        if not page_budgeted:
            return None
        # No rating items: every leaf starts from the midpoint, equal priority.
        u = [1.0] * n
        top_lists: List[List[Tuple[str, float]]] = [[] for _ in range(n)]
        mid = 0.5 * (base + core)
        raw_float = [float(mid)] * n
    else:
        u, top_lists = _raw_utilities(leaves, items, min_w)
        band = core - base
        # Blend the utility with a neutral 0.5 so alpha < 1 dampens the spread.
        raw_float = [
            base + band * (alpha * u[i] + (1.0 - alpha) * 0.5) for i in range(n)
        ]

    targets = [int(round(x)) for x in raw_float]
    adjusted = _water_adjust(targets, budget, floor_v, cap_v, u)

    provider = getattr(config, 'MODEL_PROVIDER', 'openai')
    tok_limit = _PROVIDER_TOKEN_LIMITS.get(provider, 8192)
    base_max_tok = min(preset_tokens, tok_limit)
    scale_tokens = bool(rules.get('max_tokens_scale', False))

    prompt_cfg = rules.get('prompt')
    if not isinstance(prompt_cfg, dict):
        prompt_cfg = {}
    # A negative value would make the slice below drop items from the end.
    top_k = max(0, int(prompt_cfg.get('top_k_rating_items', 4)))
    intro = (prompt_cfg.get('intro_line') or '').strip() or (
        '本节须对下列技术评分要点作实质展开(结合工艺、流程、标准与可验证措施,禁止空泛承诺与复述招标文件):'
    )

    out: Dict[int, Dict[str, Any]] = {}
    for i, leaf in enumerate(leaves):
        sid = int(leaf['id'])
        min_chars = max(1, adjusted[i])
        contribs = top_lists[i][:top_k]
        if contribs:
            lines = '\n'.join(f' · {name}' for name, _ in contribs)
            spec = (
                f'- 字数硬性要求(必须达到,不达标将续写补足):本节正文不少于 {min_chars} 字\n'
                f'- {intro}\n{lines}\n'
                f'- 内容须由可检验的技术与管理措施支撑,禁止堆砌套话与重复背景'
            )
        else:
            spec = (
                f'- 字数硬性要求(必须达到,不达标将续写补足):本节正文不少于 {min_chars} 字\n'
                f'- 须紧扣章节标题与标书目录定位,充分展开可执行方案细节\n'
                f'- 内容须由可检验的技术与管理措施支撑,禁止堆砌套话与重复背景'
            )

        max_tok = base_max_tok
        if scale_tokens and base > 0:
            # Scale with the section size, between 1024 and the provider limit.
            max_tok = int(min(tok_limit, max(1024, base_max_tok * min_chars / base)))

        out[sid] = {
            'target_chars': min_chars,
            'word_count_spec': spec,
            'max_tokens': max_tok,
        }
    return out
def continuation_threshold(target_chars: int) -> int:
    """Return the length at which continuation writing may stop.

    This is 65% of *target_chars*, truncated to an int, and never below 200.
    NOTE(review): the original docstring says this mirrors
    generator._get_min_chars - confirm against that function.
    """
    scaled = target_chars * 0.65
    return int(scaled) if scaled > 200 else 200
// ========== 1. Identity check: the anonymous bid must not reveal the bidder ==========
/**
 * Scan the document text and images for bidder identity information
 * (company name, address, real names of key staff, logos) and record the
 * outcome through addResult.
 */
function checkIdentityInfo() {
    // jsdom does not implement innerText (it needs layout), so the old code
    // tested the regexes against `undefined` and never found anything.
    // textContent is the supported way to read text.
    const bodyText = (document.body && document.body.textContent) || '';

    // Company name patterns (extendable). No /g flag: each regex is used for
    // a single test(), so the stateful lastIndex is not wanted.
    const companyPattern = /(?:我公司|本公司|[((]?[A-Za-z\u4e00-\u9fa5]+(?:集团|股份|有限|责任|公司)[))]?)/;
    // Address patterns (province / city / district / road / number ...).
    const addrPattern = /(?:省|市|区|县|镇|路|街|大道|号|大厦|楼|层)[\u4e00-\u9fa50-9]+/;
    // Real names after a role title; the placeholder names 甲, 乙, ... are allowed.
    const namePattern = /(?:总监理工程师|专业监理工程师|技术负责人|项目经理)[::]\s*[^甲乙丙丁戊己庚辛壬癸\s]{2,4}(?=[,。;\s]|$)/;

    const foundCompany = companyPattern.test(bodyText);
    const foundAddr = addrPattern.test(bodyText);
    const foundRealName = namePattern.test(bodyText);

    // Images: look for company marks in alt, title and src. The original only
    // read alt and src although its comment named title; 徽标 (logo) is added
    // next to the existing 微标, which looks like a typo.
    const logoWords = /logo|商标|徽标|微标|公司|品牌/i;
    const hasLogo = Array.from(document.querySelectorAll('img')).some(img => {
        const alt = img.alt || '';
        const title = img.title || '';
        const src = img.src || '';
        return logoWords.test(alt) || logoWords.test(title) || /logo/i.test(src);
    });

    const passed = !(foundCompany || foundAddr || foundRealName || hasLogo);
    addResult('身份信息隐藏', passed,
        passed ? '未发现投标人身份信息' : '发现投标人身份信息(公司名/地址/真实姓名/商标)');
}
// ========== 2. Heading format (size-3 SimHei, not italic, no underline, black) ==========
/**
 * Check every heading element and report those whose computed style does
 * not match the required heading format.
 */
function checkHeadings() {
    // h1-h6, anything with role="heading", or the common heading class names.
    const candidates = Array.from(
        document.querySelectorAll('h1, h2, h3, h4, h5, h6, [role="heading"], .heading, .title')
    );

    const isCompliant = (node) => {
        const size = getStyle(node, 'font-size');
        const family = getStyle(node, 'font-family').toLowerCase();
        const slant = getStyle(node, 'font-style');
        const decoration = getStyle(node, 'text-decoration');
        const colour = getStyle(node, 'color');
        const weight = getStyle(node, 'font-weight');

        // Size 3 is about 16pt (21.33px); a 3px deviation is accepted.
        if (Math.abs(parseFloat(size) - 21.33) > 3) return false;
        if (!['黑体', 'simhei', 'microsoft yahei'].some(name => family.includes(name))) return false;
        if (slant === 'italic') return false;
        if (decoration.includes('underline')) return false;
        if (colour !== 'rgb(0, 0, 0)' && colour !== '#000000') return false;
        // Headings are expected to be bold, so normal weight fails.
        if (weight === '400' || weight === 'normal') return false;
        return true;
    };

    const offenders = candidates.filter(node => !isCompliant(node));
    const ok = offenders.length === 0;
    addResult('标题格式', ok,
        ok ? '所有标题符合三号黑体要求' : '部分标题字号/字体/颜色/下划线不符合要求', offenders);
}
// ========== 3. Body text format (size-4 SimSun, 2-char indent, 26pt line height, black) ==========
/**
 * Check every body-text element and report those whose computed style does
 * not match the required body format.
 */
function checkBodyText() {
    const bodyElements = document.querySelectorAll('p, div, span, li, td, th');
    // Headings, page headers/footers and the table of contents are not body text.
    const excludeSelectors = 'h1, h2, h3, h4, h5, h6, .header, .footer, .toc, .目录';
    const invalidElements = [];

    bodyElements.forEach(el => {
        // closest() also skips elements nested inside an excluded container
        // (for example a <span> inside an <h1>), which matches() did not.
        if (el.closest(excludeSelectors)) return;
        // jsdom does not implement innerText; el.innerText.trim() threw a
        // TypeError and aborted the whole report.
        const text = (el.textContent || '').trim();
        if (text.length === 0) return;

        const fontSize = getStyle(el, 'font-size');
        const fontFamily = getStyle(el, 'font-family').toLowerCase();
        const color = getStyle(el, 'color');
        const textIndent = getStyle(el, 'text-indent');
        const lineHeight = getStyle(el, 'line-height');
        const textDecoration = getStyle(el, 'text-decoration');
        const fontWeight = getStyle(el, 'font-weight');
        const fontStyle = getStyle(el, 'font-style');

        // Size 4 is about 14pt (18.67px).
        const sizeOk = Math.abs(parseFloat(fontSize) - 18.67) <= 2;
        const fontOk = fontFamily.includes('宋体') || fontFamily.includes('simsun') || fontFamily.includes('serif');
        const colorOk = color === 'rgb(0, 0, 0)' || color === '#000000';
        // First-line indent of two characters.
        // NOTE(review): this compares the bare number, so it assumes the
        // value is expressed in em (2em) - confirm for px-based exports.
        const indent = parseFloat(textIndent);
        const indentOk = indent >= 1.8 && indent <= 2.2;
        // Fixed line height of 26pt (34.67px).
        const lineHeightOk = Math.abs(parseFloat(lineHeight) - 34.67) <= 2;
        const decorOk = !textDecoration.includes('underline');
        const weightOk = fontWeight === '400' || fontWeight === 'normal';
        const styleOk = fontStyle !== 'italic';

        if (!(sizeOk && fontOk && colorOk && indentOk && lineHeightOk && decorOk && weightOk && styleOk)) {
            invalidElements.push(el);
        }
    });

    const allValid = invalidElements.length === 0;
    addResult('正文格式', allValid,
        allValid ? '所有正文符合四号宋体/缩进/行距/颜色要求' : '部分正文段落格式不符合要求', invalidElements);
}
// ========== 4. Table of contents (no page numbers, no header/footer) ==========
/**
 * Check that a table of contents exists and that it carries neither page
 * numbers nor header/footer elements.
 */
function checkTOC() {
    const tocElements = document.querySelectorAll('.toc, .table-of-contents, .目录, [role="directory"]');
    let noPageNumbers = true;
    let noHeaderFooter = true;

    tocElements.forEach(toc => {
        // jsdom does not implement innerText; reading it gave undefined and
        // the .trim() below threw a TypeError.
        const text = toc.textContent || '';
        // Page numbers: a number at the very end, or after dot leaders.
        if (/\d+$/.test(text.trim()) || /\.{2,}\s*\d+/.test(text)) {
            noPageNumbers = false;
        }
        // Header/footer elements inside the table of contents.
        if (toc.querySelector('.header, .footer, .page-header, .page-footer')) {
            noHeaderFooter = false;
        }
    });

    // A table of contents is treated as mandatory: none found is a failure.
    if (tocElements.length === 0) {
        addResult('目录要求', false, '未检测到目录,请确保包含目录且目录无页码无页眉页脚');
    } else {
        const passed = noPageNumbers && noHeaderFooter;
        addResult('目录要求', passed,
            passed ? '目录符合无页码、无页眉页脚要求' : '目录中存在页码或页眉页脚');
    }
}
// ========== 6. Colours and decorations (no coloured text, underline or emphasis marks) ==========
/**
 * Walk every element and report coloured text, underlines and
 * emphasis-mark-like bottom borders.
 */
function checkColorsAndDecorations() {
    const allElements = document.querySelectorAll('*');
    const colorViolations = [];
    const decorationViolations = [];

    allElements.forEach(el => {
        const color = getStyle(el, 'color');
        if (color !== 'rgb(0, 0, 0)' && color !== '#000000' && color !== 'black') {
            // jsdom does not implement innerText; el.innerText.trim() threw a
            // TypeError for the first non-black element.
            if ((el.textContent || '').trim().length > 0) colorViolations.push(el);
        }
        const textDecor = getStyle(el, 'text-decoration');
        if (textDecor.includes('underline')) decorationViolations.push(el);
        // Emphasis marks are usually drawn with pseudo-elements or a bottom
        // border; only the border style is inspected here.
        // NOTE(review): bordered table cells would also match - confirm this
        // is intended.
        const borderBottom = getStyle(el, 'border-bottom-style');
        if (borderBottom === 'solid' || borderBottom === 'dotted') {
            decorationViolations.push(el);
        }
    });

    const passed = colorViolations.length === 0 && decorationViolations.length === 0;
    addResult('颜色与装饰', passed,
        passed ? '无彩色文字、无下划线、无着重号' :
        `发现${colorViolations.length}处彩色文字,${decorationViolations.length}处下划线/着重号`,
        [...colorViolations, ...decorationViolations]);
}
// ========== 7. Page setup (A4 portrait, page margins) ==========
/**
 * Check paper size, orientation and margins, using the @page rule when one
 * is present and the body margin otherwise.
 */
function checkPageSetup() {
    let pageValid = true;
    let landscape = false;
    let marginTop, marginBottom, marginLeft, marginRight;

    // CSSRule is not a Node.js global. The bare reference threw a
    // ReferenceError that the try/catch swallowed, so no @page rule was ever
    // found. Take the constant from the jsdom window, with the value defined
    // by the CSSOM specification (6) as a fallback.
    const cssRuleCtor = dom.window.CSSRule;
    const PAGE_RULE = (cssRuleCtor && cssRuleCtor.PAGE_RULE) || 6;

    let pageRule = null;
    for (const sheet of styleSheets) {
        let rules;
        try {
            rules = sheet.cssRules || sheet.rules;
        } catch (e) {
            continue; // sheet not readable (cross-origin restriction)
        }
        for (const rule of Array.from(rules || [])) {
            if (rule.type === PAGE_RULE) {
                pageRule = rule.style;
                break;
            }
        }
    }

    if (pageRule) {
        marginTop = pageRule.marginTop;
        marginBottom = pageRule.marginBottom;
        marginLeft = pageRule.marginLeft;
        marginRight = pageRule.marginRight;
        const size = String(pageRule.size || '').trim().toLowerCase();
        if (size) {
            // Accept "a4" and "a4 portrait"; reject other paper sizes.
            if (!size.split(/\s+/).includes('a4')) pageValid = false;
            landscape = size.includes('landscape');
        }
    } else {
        // Fall back to the body margin shorthand (top right bottom left).
        const bodyStyle = getStyle(document.body, 'margin');
        if (bodyStyle) {
            const margins = bodyStyle.trim().split(/\s+/);
            marginTop = margins[0];
            marginRight = margins[1] || margins[0];
            marginBottom = margins[2] || margins[0];
            marginLeft = margins[3] || marginRight;
        } else {
            pageValid = false;
        }
    }

    const topOk = marginTop === '2.54cm' || parseFloat(marginTop) === 2.54;
    const bottomOk = marginBottom === '2.54cm' || parseFloat(marginBottom) === 2.54;
    const leftOk = marginLeft === '3.18cm' || parseFloat(marginLeft) === 3.18;
    const rightOk = marginRight === '3.18cm' || parseFloat(marginRight) === 3.18;

    // The original read documentElement.style.width, which is empty for any
    // normal export, so the page was always reported as landscape and the
    // rule could never pass. Orientation now comes from the @page size
    // keyword, and pageValid (previously computed but ignored) takes part in
    // the verdict.
    const passed = pageValid && topOk && bottomOk && leftOk && rightOk && !landscape;
    addResult('页面设置', passed,
        passed ? '页面设置符合A4纵向/边距要求' : '页面边距或纸张方向不符合要求');
}
'页面设置符合A4纵向/边距要求' : '页面边距或纸张方向不符合要求'); + } + + // 执行所有检查 + checkIdentityInfo(); + checkHeadings(); + checkBodyText(); + checkTOC(); + checkChartsAndTables(); + checkColorsAndDecorations(); + checkPageSetup(); + + return results; +} + +// 导出函数供外部使用 +module.exports = { checkTechnicalBid }; + +// ==================== 使用示例 ==================== +// 假设已有htmlContent(技术暗标HTML字符串) +// const report = checkTechnicalBid(htmlContent); +// console.log(JSON.stringify(report, null, 2)); \ No newline at end of file diff --git a/清标数据.json b/清标数据.json new file mode 100644 index 0000000..8827a11 --- /dev/null +++ b/清标数据.json @@ -0,0 +1,63 @@ + +{ + "overall": false, + "details": [ + { + "rule": "身份信息隐藏", + "passed": true, + "message": "未发现投标人身份信息" + }, + { + "rule": "标题格式", + "passed": false, + "message": "部分标题字号/字体/颜色/下划线不符合要求" + }, + { + "rule": "正文格式", + "passed": false, + "message": "部分正文段落格式不符合要求" + }, + { + "rule": "目录要求", + "passed": true, + "message": "目录符合无页码、无页眉页脚要求" + }, + { + "rule": "图表规范", + "passed": false, + "message": "正文中发现2个图表或附件内图表文字格式错误" + }, + { + "rule": "颜色与装饰", + "passed": true, + "message": "无彩色文字、无下划线、无着重号" + }, + { + "rule": "页面设置", + "passed": false, + "message": "页面边距或纸张方向不符合要求" + } + ], + "violations": [ + { + "rule": "标题格式", + "message": "部分标题字号/字体/颜色/下划线不符合要求", + "elements": ["

第一章 项目概况

"] + }, + { + "rule": "正文格式", + "message": "部分正文段落格式不符合要求", + "elements": ["

投标人应..."] + }, + { + "rule": "图表规范", + "message": "正文中发现2个图表或附件内图表文字格式错误", + "elements": ["", "...
"] + }, + { + "rule": "页面设置", + "message": "页面边距或纸张方向不符合要求", + "elements": [] + } + ] + } \ No newline at end of file