提交版本
This commit is contained in:
commit
003af9dc9c
1
.deps_installed
Normal file
1
.deps_installed
Normal file
@ -0,0 +1 @@
|
|||||||
|
|
||||||
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
.env
|
||||||
|
*.log
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
10
.idea/.gitignore
generated
vendored
Normal file
10
.idea/.gitignore
generated
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
# 默认忽略的文件
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# 基于编辑器的 HTTP 客户端请求
|
||||||
|
/httpRequests/
|
||||||
|
# 已忽略包含查询文件的默认文件夹
|
||||||
|
/queries/
|
||||||
|
# Datasource local storage ignored files
|
||||||
|
/dataSources/
|
||||||
|
/dataSources.local.xml
|
||||||
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
||||||
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/tech-bid-manage20260422.iml" filepath="$PROJECT_DIR$/.idea/tech-bid-manage20260422.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
16
.idea/tech-bid-manage20260422.iml
generated
Normal file
16
.idea/tech-bid-manage20260422.iml
generated
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$" />
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
<component name="TemplatesService">
|
||||||
|
<option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
|
||||||
|
<option name="TEMPLATE_FOLDERS">
|
||||||
|
<list>
|
||||||
|
<option value="$MODULE_DIR$/templates" />
|
||||||
|
</list>
|
||||||
|
</option>
|
||||||
|
</component>
|
||||||
|
</module>
|
||||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
113
README.md
Normal file
113
README.md
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
# 标伙伴 · AI 标书助手
|
||||||
|
|
||||||
|
基于大模型的智能标书生成工具(单机版),支持解析招标文件、自动生成技术标书、导出 Word 文档。
|
||||||
|
|
||||||
|
## 快速开始
|
||||||
|
|
||||||
|
### 方式一:双击启动(Windows)
|
||||||
|
|
||||||
|
直接双击 `start.bat`,首次运行会自动安装依赖。
|
||||||
|
|
||||||
|
### 方式二:命令行启动
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. 安装依赖
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
# 2. 启动应用
|
||||||
|
python app.py
|
||||||
|
```
|
||||||
|
|
||||||
|
浏览器访问 **http://localhost:5000**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 配置 API Key
|
||||||
|
|
||||||
|
首次使用前,点击右上角 ⚙️ 设置图标,选择模型提供商并填入 API Key:
|
||||||
|
|
||||||
|
| 提供商 | 推荐模型 | 申请地址 |
|
||||||
|
|--------|---------|---------|
|
||||||
|
| 通义千问 | qwen-max | https://dashscope.aliyun.com/ |
|
||||||
|
| DeepSeek | deepseek-chat (V3) | https://platform.deepseek.com/ |
|
||||||
|
| OpenAI | gpt-4o | https://platform.openai.com/ |
|
||||||
|
|
||||||
|
> **DeepSeek 说明**:deepseek-chat (V3) 性价比极高,推荐用于生产环境。
|
||||||
|
> 由于 DeepSeek 暂不提供 Embedding API,使用知识库功能时会自动回退到本地 sentence-transformers 模型(首次使用需下载约 90MB)。
|
||||||
|
|
||||||
|
也可通过环境变量配置:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 通义千问
|
||||||
|
set QWEN_API_KEY=sk-xxxxxxxx
|
||||||
|
set MODEL_PROVIDER=qwen
|
||||||
|
|
||||||
|
# DeepSeek
|
||||||
|
set DEEPSEEK_API_KEY=sk-xxxxxxxx
|
||||||
|
set MODEL_PROVIDER=deepseek
|
||||||
|
|
||||||
|
python app.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 使用流程
|
||||||
|
|
||||||
|
1. **新建项目** → 输入项目名称
|
||||||
|
2. **上传招标文件** → 支持 PDF / DOC / DOCX
|
||||||
|
3. **AI 解析** → 自动提取评分要求、资质条件、商务条款
|
||||||
|
4. **生成大纲** → 按评分权重生成四级章节目录
|
||||||
|
5. **生成内容** → 逐章节或一键全部生成
|
||||||
|
6. **合规检查** → 对照招标要求检验覆盖情况
|
||||||
|
7. **导出 Word** → 专业排版,直接使用
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 目录结构
|
||||||
|
|
||||||
|
```
|
||||||
|
autorfp/
|
||||||
|
├── app.py # Flask 主程序
|
||||||
|
├── config.py # 配置文件
|
||||||
|
├── requirements.txt # Python 依赖
|
||||||
|
├── start.bat # Windows 一键启动
|
||||||
|
├── prompts/ # AI 提示词模板
|
||||||
|
├── modules/ # 功能模块
|
||||||
|
│ ├── parser.py # 招标文件解析
|
||||||
|
│ ├── generator.py # 标书内容生成
|
||||||
|
│ ├── checker.py # 合规检查
|
||||||
|
│ ├── exporter.py # Word 导出
|
||||||
|
│ └── knowledge.py # 企业知识库
|
||||||
|
├── utils/ # 工具函数
|
||||||
|
│ ├── ai_client.py # AI API 封装
|
||||||
|
│ ├── file_utils.py # 文件处理
|
||||||
|
│ └── prompts.py # 提示词加载
|
||||||
|
├── templates/ # HTML 模板
|
||||||
|
├── static/ # 静态资源
|
||||||
|
└── data/ # 数据目录(自动创建)
|
||||||
|
├── projects.db # SQLite 数据库
|
||||||
|
├── uploads/ # 上传的招标文件
|
||||||
|
├── exports/ # 导出的标书
|
||||||
|
├── knowledge/ # 知识库文件
|
||||||
|
└── chroma/ # 向量数据库
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 企业知识库
|
||||||
|
|
||||||
|
在项目页面切换到「知识库」标签,上传历史标书文件。
|
||||||
|
系统会自动将文件分块存入向量数据库,生成内容时自动检索相关片段,让 AI 更好地体现企业优势。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 常见问题
|
||||||
|
|
||||||
|
**Q: 解析速度很慢?**
|
||||||
|
A: 招标文件越长耗时越长,通常 30-120 秒。建议使用 qwen-max 或 gpt-4o。
|
||||||
|
|
||||||
|
**Q: 内容生成失败?**
|
||||||
|
A: 检查 API Key 是否正确,以及账户余额是否充足。
|
||||||
|
|
||||||
|
**Q: 导出的 Word 文件乱码?**
|
||||||
|
A: 请使用 Microsoft Word 2016 及以上版本打开。
|
||||||
118
bid_partner.spec
Normal file
118
bid_partner.spec
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
# -*- mode: python ; coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
PyInstaller spec for 标伙伴 · AI标书助手
|
||||||
|
Build: pyinstaller bid_partner.spec
|
||||||
|
|
||||||
|
知识库改用 SQLite + 纯 Python 向量存储,已不依赖 ChromaDB,打包更小。
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
from PyInstaller.utils.hooks import collect_all, collect_data_files
|
||||||
|
|
||||||
|
block_cipher = None
|
||||||
|
|
||||||
|
# ── Collect complex packages ─────────────────────────────────────────────────
|
||||||
|
openai_datas, openai_bins, openai_hidden = collect_all('openai')
|
||||||
|
pydantic_datas, pydantic_bins, pydantic_hidden = collect_all('pydantic')
|
||||||
|
|
||||||
|
# tiktoken data (BPE vocab files)
|
||||||
|
tiktoken_datas = collect_data_files('tiktoken')
|
||||||
|
|
||||||
|
a = Analysis(
|
||||||
|
['launcher.py'],
|
||||||
|
pathex=['.'],
|
||||||
|
binaries=openai_bins + pydantic_bins,
|
||||||
|
datas=[
|
||||||
|
# ── App assets (read-only, go into _MEIPASS) ──
|
||||||
|
('templates', 'templates'),
|
||||||
|
('static', 'static'),
|
||||||
|
# ── Package data ──
|
||||||
|
*openai_datas,
|
||||||
|
*pydantic_datas,
|
||||||
|
*tiktoken_datas,
|
||||||
|
],
|
||||||
|
hiddenimports=[
|
||||||
|
# Flask / Werkzeug
|
||||||
|
'flask', 'flask_cors', 'werkzeug', 'werkzeug.serving',
|
||||||
|
'werkzeug.routing', 'werkzeug.middleware.proxy_fix',
|
||||||
|
'jinja2', 'jinja2.ext',
|
||||||
|
# SQLite (stdlib, always present)
|
||||||
|
'sqlite3',
|
||||||
|
# OpenAI
|
||||||
|
*openai_hidden,
|
||||||
|
# Pydantic
|
||||||
|
*pydantic_hidden,
|
||||||
|
# Document processing
|
||||||
|
'PyPDF2', 'pypdf', 'pypdf.errors',
|
||||||
|
'pdfminer', 'pdfminer.high_level', 'pdfminer.layout',
|
||||||
|
'pdfminer.pdfpage', 'pdfminer.pdfinterp', 'pdfminer.converter',
|
||||||
|
'docx', 'docx.oxml', 'docx.oxml.ns', 'docx.shared',
|
||||||
|
'docx.enum', 'docx.enum.text', 'docx.enum.style',
|
||||||
|
'python_docx',
|
||||||
|
# tiktoken
|
||||||
|
'tiktoken', 'tiktoken.core', 'tiktoken.model',
|
||||||
|
'tiktoken_ext', 'tiktoken_ext.openai_public',
|
||||||
|
# Network / encoding
|
||||||
|
'requests', 'chardet', 'httpx', 'httpcore',
|
||||||
|
'anyio', 'anyio.streams', 'anyio.streams.memory',
|
||||||
|
'sniffio', 'certifi',
|
||||||
|
# Stdlib extras
|
||||||
|
'importlib.metadata', 'importlib.resources',
|
||||||
|
'pkg_resources', 'json', 'math', 'threading',
|
||||||
|
# Local project modules (explicitly include all)
|
||||||
|
'config', 'app',
|
||||||
|
'utils', 'utils.ai_client', 'utils.file_utils',
|
||||||
|
'utils.prompts', 'utils.settings', 'utils.boq_parser', 'utils.bill_analysis',
|
||||||
|
'modules', 'modules.parser', 'modules.generator',
|
||||||
|
'modules.checker', 'modules.exporter', 'modules.knowledge',
|
||||||
|
],
|
||||||
|
hookspath=[],
|
||||||
|
hooksconfig={},
|
||||||
|
runtime_hooks=[],
|
||||||
|
excludes=[
|
||||||
|
# Heavy packages not used in this app
|
||||||
|
'matplotlib', 'pandas', 'scipy', 'numpy',
|
||||||
|
'IPython', 'jupyter', 'notebook',
|
||||||
|
'PIL', 'Pillow',
|
||||||
|
'cv2', 'torch', 'tensorflow',
|
||||||
|
'pytest', 'unittest',
|
||||||
|
# ChromaDB 及其依赖(已移除,改用 SQLite 内置存储)
|
||||||
|
'chromadb', 'hnswlib', 'posthog', 'pypika',
|
||||||
|
'mmh3', 'overrides', 'monotonic',
|
||||||
|
'sentence_transformers', 'onnxruntime',
|
||||||
|
],
|
||||||
|
win_no_prefer_redirects=False,
|
||||||
|
win_private_assemblies=False,
|
||||||
|
cipher=block_cipher,
|
||||||
|
noarchive=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
|
||||||
|
|
||||||
|
exe = EXE(
|
||||||
|
pyz,
|
||||||
|
a.scripts,
|
||||||
|
[],
|
||||||
|
exclude_binaries=True,
|
||||||
|
name='bid_partner',
|
||||||
|
debug=False,
|
||||||
|
bootloader_ignore_signals=False,
|
||||||
|
strip=False,
|
||||||
|
upx=False,
|
||||||
|
console=False, # no black console window — GUI launcher takes over
|
||||||
|
disable_windowed_traceback=False,
|
||||||
|
argv_emulation=False,
|
||||||
|
target_arch=None,
|
||||||
|
codesign_identity=None,
|
||||||
|
entitlements_file=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
coll = COLLECT(
|
||||||
|
exe,
|
||||||
|
a.binaries,
|
||||||
|
a.zipfiles,
|
||||||
|
a.datas,
|
||||||
|
strip=False,
|
||||||
|
upx=False,
|
||||||
|
upx_exclude=[],
|
||||||
|
name='BidPartner',
|
||||||
|
)
|
||||||
672
bill-worker.js
Normal file
672
bill-worker.js
Normal file
@ -0,0 +1,672 @@
|
|||||||
|
/**
|
||||||
|
* bill-worker.js — PDF 清单解析调度器(Worker Thread)
|
||||||
|
*
|
||||||
|
* 架构(v3 — SharedArrayBuffer 零拷贝):
|
||||||
|
* Phase 1 — 并行文本提取
|
||||||
|
* 将 PDF 数据写入 SharedArrayBuffer(一次分配,所有子线程共享读)
|
||||||
|
* 启动 N 个 page-worker,每个负责固定 20 页
|
||||||
|
*
|
||||||
|
* Phase 2 — 清单页筛选 + 文本解析(纯正则,毫秒级)
|
||||||
|
* 汇总全部页面文本 → 关键字筛选清单页 → 多行合并 → 逐行解析
|
||||||
|
*/
|
||||||
|
'use strict';
|
||||||
|
const { parentPort } = require('worker_threads');
|
||||||
|
const { Worker } = require('worker_threads');
|
||||||
|
const path = require('path');
|
||||||
|
|
||||||
|
const PAGES_PER_CHUNK = 20;
|
||||||
|
|
||||||
|
parentPort.on('message', async (msg) => {
|
||||||
|
if (msg.type !== 'parse') return;
|
||||||
|
const t0 = Date.now();
|
||||||
|
try {
|
||||||
|
// 立即做一次干净的拷贝,确保拥有独立的 ArrayBuffer
|
||||||
|
const raw = msg.buffer;
|
||||||
|
const buf = Buffer.alloc(raw.byteLength);
|
||||||
|
Buffer.from(raw).copy(buf);
|
||||||
|
|
||||||
|
if (buf.length === 0) {
|
||||||
|
parentPort.postMessage({ type: 'done', ok: false, error: '收到空 PDF 数据' });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── 获取总页数 ──
|
||||||
|
const pdfjsModule = await import('pdfjs-dist/build/pdf.mjs');
|
||||||
|
const pdfjsLib = pdfjsModule.default || pdfjsModule;
|
||||||
|
// 给 pdfjs 一份独立拷贝(pdfjs 内部可能 detach buffer)
|
||||||
|
const pdfData = new Uint8Array(buf.length);
|
||||||
|
buf.copy(Buffer.from(pdfData.buffer));
|
||||||
|
const pdf = await pdfjsLib.getDocument({ data: pdfData, isEvalSupported: false }).promise;
|
||||||
|
const totalPages = pdf.numPages;
|
||||||
|
|
||||||
|
// ── 将 PDF 数据写入 SharedArrayBuffer(一次分配,所有子线程共享读)──
|
||||||
|
const sab = new SharedArrayBuffer(buf.length);
|
||||||
|
const sabView = new Uint8Array(sab);
|
||||||
|
buf.copy(Buffer.from(sabView.buffer)); // 从独立 buf 拷贝到共享内存
|
||||||
|
|
||||||
|
const workerCount = Math.ceil(totalPages / PAGES_PER_CHUNK);
|
||||||
|
console.log(`[BillWorker] PDF ${totalPages} 页, ${workerCount} 路并行 (SharedArrayBuffer ${(buf.length/1024/1024).toFixed(1)}MB)`);
|
||||||
|
|
||||||
|
// Phase 1: 并行文本提取
|
||||||
|
const pageTexts = await parallelExtract(sab, buf.length, totalPages, workerCount);
|
||||||
|
const t1 = Date.now();
|
||||||
|
|
||||||
|
const extractedCount = pageTexts.filter(t => t.length > 0).length;
|
||||||
|
console.log(`[BillWorker] Phase1 完成: ${t1 - t0}ms, ${extractedCount}/${totalPages} 页有文本`);
|
||||||
|
|
||||||
|
// 扫描件判断
|
||||||
|
const totalChars = pageTexts.reduce((s, t) => s + t.length, 0);
|
||||||
|
if (totalChars < 50) {
|
||||||
|
parentPort.postMessage({ type: 'done', ok: true, data: { scanned: true, reason: 'noText', totalPages } });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 2: 筛选清单页(宽松策略 + 连续页补全)
|
||||||
|
const BILL_KW = ['项目编码', '项目名称', '工程量', '计量单位', '综合单价', '清单编码'];
|
||||||
|
const SEC_KW = ['分部分项', '分类分项', '措施项目', '其他项目', '工程量清单计价'];
|
||||||
|
// 第一轮:标记确定的清单页
|
||||||
|
const billFlags = new Array(pageTexts.length).fill(false);
|
||||||
|
for (let i = 0; i < pageTexts.length; i++) {
|
||||||
|
const t = pageTexts[i];
|
||||||
|
if (!t.trim()) continue;
|
||||||
|
const hHits = BILL_KW.filter(k => t.includes(k)).length;
|
||||||
|
const sHit = SEC_KW.some(k => t.includes(k));
|
||||||
|
const hasCode = /\d{9}/.test(t);
|
||||||
|
// 放宽:有9位编码即可(不再要求同时命中表头关键字)
|
||||||
|
if (hHits >= 2 || sHit || hasCode) {
|
||||||
|
billFlags[i] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 第二轮:连续页补全 — 两个清单页之间的非空页也视为清单页(续页无表头)
|
||||||
|
// 但排除纯费用/税金页面(它们不含施工清单项)
|
||||||
|
const FEE_PAGE_KW = ['规费', '税金', '社会保险费', '住房公积金', '养老保险',
|
||||||
|
'工伤保险', '失业保险', '医疗保险', '教育费附加', '城市维护建设税'];
|
||||||
|
const firstBill = billFlags.indexOf(true);
|
||||||
|
const lastBill = billFlags.lastIndexOf(true);
|
||||||
|
if (firstBill >= 0 && lastBill > firstBill) {
|
||||||
|
for (let i = firstBill; i <= lastBill; i++) {
|
||||||
|
if (!billFlags[i] && pageTexts[i] && pageTexts[i].trim().length > 30) {
|
||||||
|
const t = pageTexts[i];
|
||||||
|
const feeHits = FEE_PAGE_KW.filter(kw => t.includes(kw)).length;
|
||||||
|
// 命中 2+ 个费用关键字且没有9位工程编码 → 纯费用页,排除
|
||||||
|
if (feeHits >= 2 && !/\d{9}/.test(t)) continue;
|
||||||
|
billFlags[i] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const billTexts = [];
|
||||||
|
for (let i = 0; i < pageTexts.length; i++) {
|
||||||
|
if (billFlags[i]) billTexts.push(pageTexts[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!billTexts.length) {
|
||||||
|
parentPort.postMessage({ type: 'done', ok: true, data: { scanned: false, noBillPages: true, totalPages } });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`[BillWorker] ${totalPages} 页 → ${billTexts.length} 页清单 (原始识别 ${billFlags.filter(f=>f).length - (lastBill - firstBill >= 0 ? 0 : 0)} / 补全后 ${billTexts.length})`);
|
||||||
|
|
||||||
|
// Phase 3: 文本解析
|
||||||
|
const merged = billTexts.join('\n');
|
||||||
|
const parsed = parseBillText(merged);
|
||||||
|
const t2 = Date.now();
|
||||||
|
console.log(`[BillWorker] Phase2+3: ${t2 - t1}ms, 总耗时: ${t2 - t0}ms`);
|
||||||
|
|
||||||
|
parentPort.postMessage({
|
||||||
|
type: 'done', ok: true,
|
||||||
|
data: {
|
||||||
|
scanned: false,
|
||||||
|
...parsed,
|
||||||
|
_meta: {
|
||||||
|
method: 'local-parallel',
|
||||||
|
workers: workerCount,
|
||||||
|
billPages: billTexts.length,
|
||||||
|
totalPages,
|
||||||
|
extractMs: t1 - t0,
|
||||||
|
parseMs: t2 - t1,
|
||||||
|
totalMs: t2 - t0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} catch (err) {
|
||||||
|
console.error('[BillWorker] 错误:', err.message);
|
||||||
|
parentPort.postMessage({ type: 'done', ok: false, error: err.message });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// ================================================================
|
||||||
|
// Phase 1: 多 Worker 并行提取(SharedArrayBuffer 零拷贝)
|
||||||
|
// ================================================================
|
||||||
|
|
||||||
|
function parallelExtract(sab, dataLength, totalPages, workerCount) {
|
||||||
|
return new Promise((resolve) => {
|
||||||
|
const workerPath = path.join(__dirname, 'page-worker.js');
|
||||||
|
const allPageTexts = new Array(totalPages).fill('');
|
||||||
|
const workerStatus = new Array(workerCount).fill('pending'); // pending, done, failed
|
||||||
|
let resolved = false;
|
||||||
|
|
||||||
|
const checkComplete = () => {
|
||||||
|
if (resolved) return;
|
||||||
|
const doneCount = workerStatus.filter(s => s === 'done' || s === 'failed').length;
|
||||||
|
if (doneCount >= workerCount) {
|
||||||
|
resolved = true;
|
||||||
|
// 检查是否有失败的worker,打印警告
|
||||||
|
const failedCount = workerStatus.filter(s => s === 'failed').length;
|
||||||
|
if (failedCount > 0) {
|
||||||
|
console.warn(`[BillWorker] ${failedCount}/${workerCount} 个worker失败,可能导致部分页面无内容`);
|
||||||
|
}
|
||||||
|
resolve(allPageTexts);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
for (let i = 0; i < workerCount; i++) {
|
||||||
|
const startPage = i * PAGES_PER_CHUNK + 1;
|
||||||
|
const endPage = Math.min((i + 1) * PAGES_PER_CHUNK, totalPages);
|
||||||
|
|
||||||
|
// workerData 传 SharedArrayBuffer(跨线程共享,不会被清空)
|
||||||
|
const w = new Worker(workerPath, {
|
||||||
|
workerData: { sab, dataLength, startPage, endPage }
|
||||||
|
});
|
||||||
|
|
||||||
|
let workerDone = false;
|
||||||
|
|
||||||
|
const markDone = (status) => {
|
||||||
|
if (workerDone) return;
|
||||||
|
workerDone = true;
|
||||||
|
workerStatus[i] = status;
|
||||||
|
checkComplete();
|
||||||
|
};
|
||||||
|
|
||||||
|
w.on('message', (msg) => {
|
||||||
|
if (msg.ok && msg.results) {
|
||||||
|
for (const r of msg.results) {
|
||||||
|
allPageTexts[r.page - 1] = r.text;
|
||||||
|
}
|
||||||
|
markDone('done');
|
||||||
|
} else if (!msg.ok) {
|
||||||
|
console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 失败: ${msg.error}`);
|
||||||
|
markDone('failed');
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
w.on('error', (err) => {
|
||||||
|
console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 异常: ${err.message}`);
|
||||||
|
markDone('failed');
|
||||||
|
});
|
||||||
|
|
||||||
|
w.on('exit', (code) => {
|
||||||
|
// exit 在 message 之后触发,但如果 worker 崩溃没发 message 则在这里兜底
|
||||||
|
if (code !== 0 && !workerDone) {
|
||||||
|
console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 意外退出(code=${code})`);
|
||||||
|
markDone('failed');
|
||||||
|
} else if (!workerDone) {
|
||||||
|
markDone('done');
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (workerCount <= 0) {
|
||||||
|
resolved = true;
|
||||||
|
resolve(allPageTexts);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// ================================================================
|
||||||
|
// Phase 3: 清单文本解析(纯正则 + 字符串处理,毫秒级)
|
||||||
|
// ================================================================
|
||||||
|
|
||||||
|
function parseBillText(text) {
|
||||||
|
const rawLines = text.split(/\n/).map(l => {
|
||||||
|
let line = l.replace(/\t/g, ' ').trim();
|
||||||
|
// 规范化带横杠的编码:如 "010-101-001-001" → "010101001001"
|
||||||
|
line = line.replace(/(\d{2,4})[-‐–](\d{2,4})[-‐–](\d{2,4})(?:[-‐–](\d{2,4}))?/g,
|
||||||
|
(m, a, b, c, d) => {
|
||||||
|
const combined = a + b + c + (d || '');
|
||||||
|
return (combined.length >= 9 && combined.length <= 12) ? combined : m;
|
||||||
|
});
|
||||||
|
return line;
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── Step 1: 多行合并成逻辑行 ──
|
||||||
|
// pdfjs 按 Y 坐标分行,表格一行通常 = 一条文本行
|
||||||
|
// 但有时 项目特征/名称 会折行,需要合并
|
||||||
|
//
|
||||||
|
// 新逻辑行的起始标志(任一命中即切断):
|
||||||
|
// a) 序号模式:1.1.1.1.5 开头
|
||||||
|
// b) 清单编码:9-12位数字 或 B+5-6位数字 开头
|
||||||
|
// c) 中文大标题:一 二 三 ... 或 (一)(二)...
|
||||||
|
// d) 表头行内容(跳过)
|
||||||
|
// e) 纯数字序号 + 空格 + 编码(如 "5 500101004001")
|
||||||
|
|
||||||
|
const ITEM_START = /^\d+(\.\d+)+\s/; // 1.1 或 1.1.1 等序号
|
||||||
|
const CODE_INLINE = /(?:^|\s)(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s/; // 行内含清单编码(排除 GB/DB 等标准号)
|
||||||
|
const CODE_START_RE = /^(\d{9,12}|B\d{5,6})\s/; // 行首就是清单编码(行首 B 不会有前缀字母)
|
||||||
|
const SEQ_CODE_RE = /^\d{1,4}\s+(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s/; // "序号 编码"格式
|
||||||
|
const PAGE_MARK = /^--\s*\d+\s+of\s+\d+\s*--/;
|
||||||
|
const HEADER_RE = /^序号\s+(项目编码|项目名称)/;
|
||||||
|
const HEADER_KW = /^(项目编码|项目名称|清单编码|计量单位|综合单价|工程量|合\s*价|金额|序号)\s/;
|
||||||
|
const CATEGORY_MARKERS = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
|
||||||
|
'(一)', '(二)', '(三)', '(四)', '(五)'];
|
||||||
|
|
||||||
|
const logicLines = [];
|
||||||
|
let currentLine = '';
|
||||||
|
|
||||||
|
function isNewLineTrigger(raw) {
|
||||||
|
if (ITEM_START.test(raw)) return true;
|
||||||
|
if (CODE_START_RE.test(raw)) return true;
|
||||||
|
if (SEQ_CODE_RE.test(raw)) return true;
|
||||||
|
if (CATEGORY_MARKERS.some(m => raw.startsWith(m + ' ') || raw.startsWith(m + '\u3000'))) return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const raw of rawLines) {
|
||||||
|
if (!raw || PAGE_MARK.test(raw)) continue;
|
||||||
|
if (HEADER_RE.test(raw) || HEADER_KW.test(raw)) continue;
|
||||||
|
if (/^(元)|^款章节号|^备注$|^第\d+页/.test(raw)) continue;
|
||||||
|
|
||||||
|
if (isNewLineTrigger(raw)) {
|
||||||
|
if (currentLine) logicLines.push(currentLine);
|
||||||
|
currentLine = raw;
|
||||||
|
} else if (CODE_INLINE.test(raw) && raw.length > 15) {
|
||||||
|
// 行内包含编码且够长(像是完整的表格行)→ 也开新行
|
||||||
|
if (currentLine) logicLines.push(currentLine);
|
||||||
|
currentLine = raw;
|
||||||
|
} else {
|
||||||
|
// 续行(项目特征折行等短文本)
|
||||||
|
// 安全阀:已合并行过长时强制切断,防止整页吞并
|
||||||
|
if (currentLine && currentLine.length > 300) {
|
||||||
|
logicLines.push(currentLine);
|
||||||
|
currentLine = raw;
|
||||||
|
} else {
|
||||||
|
currentLine = currentLine ? currentLine + ' ' + raw : raw;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (currentLine) logicLines.push(currentLine);
|
||||||
|
|
||||||
|
console.log(`[BillWorker] 合并后 ${logicLines.length} 条逻辑行(原始 ${rawLines.length} 行)`);
|
||||||
|
// 打印前5条逻辑行供调试
|
||||||
|
for (let i = 0; i < Math.min(5, logicLines.length); i++) {
|
||||||
|
console.log(`[BillWorker] L${i}: ${logicLines[i].substring(0, 120)}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const categories = [];
|
||||||
|
let curCat = null, curItem = null;
|
||||||
|
|
||||||
|
// 编码匹配:支持行内任意位置的9-12位数字或B编码(排除 GB/DB 等标准号前缀)
|
||||||
|
const CODE_RE = /(?<![A-Za-z])(\d{9,12}|(?<![A-Za-z])B\d{5,6})/;
|
||||||
|
const UNIT_TOKENS = ['m³','m²','m3','m2','km','hm2','㎡','㎥','t','kg',
|
||||||
|
'个','台','套','组','根','块','片','张','只','吨','项',
|
||||||
|
'处','座','件','段','条','把','扇','口','圈','道','孔',
|
||||||
|
'对','副','樘','方','延m','株','棵','m'];
|
||||||
|
const UNIT_SET = new Set(UNIT_TOKENS);
|
||||||
|
const unitEscaped = UNIT_TOKENS.map(u => u.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
|
||||||
|
const UNIT_RE = new RegExp(`(?:^|\\s)(${unitEscaped.join('|')})(?=\\s|\\d|$)`);
|
||||||
|
const SKIP_RE = /合\s*计|小\s*计|本页小计|总\s*计|价税合计/;
|
||||||
|
|
||||||
|
for (const line of logicLines) {
|
||||||
|
if (SKIP_RE.test(line)) continue;
|
||||||
|
|
||||||
|
// 去掉行首的序号部分("1.1.1.1.5 " 或 "5 " 等纯序号前缀)
|
||||||
|
let stripped = line.replace(/^\d+(\.\d+)*\s+/, '').trim();
|
||||||
|
if (!stripped) stripped = line.trim();
|
||||||
|
if (!stripped) continue;
|
||||||
|
|
||||||
|
const cm = stripped.match(CODE_RE);
|
||||||
|
if (cm) {
|
||||||
|
if (curItem && curCat) curCat.items.push(curItem);
|
||||||
|
if (!curCat) { curCat = { name: '未分类', items: [] }; categories.push(curCat); }
|
||||||
|
|
||||||
|
const code = cm[1];
|
||||||
|
let rest = stripped.substring(cm.index + cm[0].length).trim();
|
||||||
|
let name = '', unit = '', quantity = '', spec = '';
|
||||||
|
|
||||||
|
const unitMatch = rest.match(UNIT_RE);
|
||||||
|
if (unitMatch) {
|
||||||
|
const ui = rest.indexOf(unitMatch[0]);
|
||||||
|
let rawName = rest.substring(0, ui).trim();
|
||||||
|
unit = unitMatch[1];
|
||||||
|
const afterUnit = rest.substring(ui + unitMatch[0].length).trim();
|
||||||
|
const qm = afterUnit.match(/^([\d,.]+)/);
|
||||||
|
if (qm) {
|
||||||
|
quantity = qm[1];
|
||||||
|
// 提取 quantity 之后的尾部文本,跳过纯数字字段(综合单价、合价等)
|
||||||
|
let tail = afterUnit.substring(qm.index + qm[0].length).trim();
|
||||||
|
if (tail) {
|
||||||
|
const tailTokens = tail.split(/\s+/);
|
||||||
|
let si = 0;
|
||||||
|
while (si < tailTokens.length && /^[\d,.%\-]+$/.test(tailTokens[si])) si++;
|
||||||
|
const specTail = tailTokens.slice(si).join(' ').trim();
|
||||||
|
if (specTail) spec = specTail;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 分离 rawName 中的"项目名称"和内联"项目特征"
|
||||||
|
const ns = splitNameAndSpec(rawName);
|
||||||
|
name = ns.name;
|
||||||
|
if (ns.spec) spec = ns.spec + (spec ? ';' + spec : '');
|
||||||
|
} else {
|
||||||
|
const tokens = rest.split(/\s+/).filter(t => t);
|
||||||
|
let foundUnitIdx = -1;
|
||||||
|
for (let ti = tokens.length - 1; ti >= 1; ti--) {
|
||||||
|
if (UNIT_SET.has(tokens[ti])) { foundUnitIdx = ti; break; }
|
||||||
|
}
|
||||||
|
if (foundUnitIdx >= 1) {
|
||||||
|
const rawNameStr = tokens.slice(0, foundUnitIdx).join(' ');
|
||||||
|
const ns = splitNameAndSpec(rawNameStr);
|
||||||
|
name = ns.name;
|
||||||
|
if (ns.spec) spec = ns.spec;
|
||||||
|
unit = tokens[foundUnitIdx];
|
||||||
|
const afterTokens = tokens.slice(foundUnitIdx + 1);
|
||||||
|
if (afterTokens.length && /^[\d,.]+$/.test(afterTokens[0])) {
|
||||||
|
quantity = afterTokens[0];
|
||||||
|
let si = 1;
|
||||||
|
while (si < afterTokens.length && /^[\d,.%\-]+$/.test(afterTokens[si])) si++;
|
||||||
|
const specTail = afterTokens.slice(si).join(' ').trim();
|
||||||
|
if (specTail) spec = spec ? spec + ';' + specTail : specTail;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
name = rest;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
name = name.replace(/\s+/g, '').trim();
|
||||||
|
for (const u of UNIT_TOKENS) {
|
||||||
|
if (name.endsWith(u) && name.length > u.length) {
|
||||||
|
unit = unit || u;
|
||||||
|
name = name.substring(0, name.length - u.length);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
curItem = { code, name, unit, quantity, spec };
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── 回退:无标准编码但有 "名称 单位 数量" 结构 → 也视为清单项 ──
|
||||||
|
// 常见于措施项目、未编码的补充清单项
|
||||||
|
if (!cm && stripped.length > 4) {
|
||||||
|
const uniMatch = stripped.match(UNIT_RE);
|
||||||
|
if (uniMatch) {
|
||||||
|
const ui = stripped.indexOf(uniMatch[0]);
|
||||||
|
const beforeUnit = stripped.substring(0, ui).trim();
|
||||||
|
const afterUnit = stripped.substring(ui + uniMatch[0].length).trim();
|
||||||
|
const hasQty = /^[\d,.]+/.test(afterUnit);
|
||||||
|
// 名称 2-50 字、含中文、有数量、不是分部标题
|
||||||
|
if (beforeUnit.length >= 2 && beforeUnit.length <= 50 && hasQty
|
||||||
|
&& /[\u4e00-\u9fff]/.test(beforeUnit)) {
|
||||||
|
if (curItem && curCat) curCat.items.push(curItem);
|
||||||
|
if (!curCat) { curCat = { name: '未分类', items: [] }; categories.push(curCat); }
|
||||||
|
const unit = uniMatch[1];
|
||||||
|
const qm = afterUnit.match(/^([\d,.]+)/);
|
||||||
|
const quantity = qm ? qm[1] : '';
|
||||||
|
const ns = splitNameAndSpec(beforeUnit);
|
||||||
|
const name = ns.name.replace(/\s+/g, '').trim();
|
||||||
|
const spec = ns.spec || '';
|
||||||
|
curItem = { code: '', name, unit, quantity, spec };
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 分部标题判断:不含编码、较短的文本、含工程关键字
|
||||||
|
// 关键守卫:如果行里有计量单位,说明是清单项,不是标题
|
||||||
|
if (stripped.length > 2 && stripped.length < 60 && !CODE_RE.test(stripped)) {
|
||||||
|
if (UNIT_RE.test(stripped) && /\d+\.?\d*\s*$/.test(stripped)) {
|
||||||
|
if (curItem) curItem.spec = curItem.spec ? curItem.spec + ';' + stripped : stripped;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (isCatTitle(stripped) && !UNIT_RE.test(stripped) && !isFeeCatTitle(stripped)) {
|
||||||
|
if (curItem && curCat) { curCat.items.push(curItem); curItem = null; }
|
||||||
|
const cleanTitle = stripped.replace(/\s+(座|个|项|处|m|km|段|条)\s+\d+[\d.]*\s*$/, '').trim();
|
||||||
|
curCat = { name: cleanTitle, items: [] };
|
||||||
|
categories.push(curCat);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (/^[一二三四五六七八九十]+\s/.test(stripped) || /^([一二三四五六七八九十\d]+)/.test(stripped)) {
|
||||||
|
// 中文序号标题也需要排除费用类
|
||||||
|
const cleanTitle = stripped.replace(/\s+(座|个|项|处)\s+\d+[\d.]*\s*$/, '').trim();
|
||||||
|
if (isFeeCatTitle(cleanTitle)) {
|
||||||
|
// 费用类标题:跳过,不建分部(其下的行会作为续行处理)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (curItem && curCat) { curCat.items.push(curItem); curItem = null; }
|
||||||
|
curCat = { name: cleanTitle, items: [] };
|
||||||
|
categories.push(curCat);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (curItem && stripped.length > 1) {
|
||||||
|
curItem.spec = curItem.spec ? curItem.spec + ';' + stripped : stripped;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (curItem && curCat) curCat.items.push(curItem);
|
||||||
|
|
||||||
|
// 过滤费用项:只保留需要写入技术标的施工清单项
|
||||||
|
let feeFiltered = 0;
|
||||||
|
for (const cat of categories) {
|
||||||
|
if (cat.items) {
|
||||||
|
const before = cat.items.length;
|
||||||
|
cat.items = cat.items.filter(it => !isFeeItem(it.name));
|
||||||
|
feeFiltered += before - cat.items.length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (feeFiltered > 0) console.log(`[BillWorker] 费用项过滤: 移除 ${feeFiltered} 项`);
|
||||||
|
|
||||||
|
// ========== 按项目名称合并(核心去重,大幅减少清单项数量)==========
|
||||||
|
// 规则:同一分部内,name 相同的清单项合并为一条
|
||||||
|
// - code: 保留第一个非空编码
|
||||||
|
// - unit: 保留第一个非空单位
|
||||||
|
// - quantity: 尝试数值求和,否则用分号拼接
|
||||||
|
// - spec: 去重后用分号拼接(截断过长的)
|
||||||
|
let totalBeforeMerge = 0, totalAfterMerge = 0;
|
||||||
|
for (const cat of categories) {
|
||||||
|
if (!cat.items || !cat.items.length) continue;
|
||||||
|
totalBeforeMerge += cat.items.length;
|
||||||
|
|
||||||
|
const nameMap = new Map(); // name → merged item
|
||||||
|
for (const item of cat.items) {
|
||||||
|
const key = (item.name || '').replace(/\s+/g, '').trim();
|
||||||
|
if (!key) continue;
|
||||||
|
|
||||||
|
if (!nameMap.has(key)) {
|
||||||
|
nameMap.set(key, {
|
||||||
|
code: item.code || '',
|
||||||
|
name: item.name,
|
||||||
|
unit: item.unit || '',
|
||||||
|
quantity: item.quantity || '',
|
||||||
|
spec: item.spec || '',
|
||||||
|
_count: 1,
|
||||||
|
_quantities: item.quantity ? [item.quantity] : [],
|
||||||
|
_specs: item.spec ? [item.spec] : [],
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
const m = nameMap.get(key);
|
||||||
|
m._count++;
|
||||||
|
// code: 取第一个非空的
|
||||||
|
if (!m.code && item.code) m.code = item.code;
|
||||||
|
// unit: 取第一个非空的
|
||||||
|
if (!m.unit && item.unit) m.unit = item.unit;
|
||||||
|
// quantity: 收集所有
|
||||||
|
if (item.quantity) m._quantities.push(item.quantity);
|
||||||
|
// spec: 收集不重复的
|
||||||
|
if (item.spec && !m._specs.includes(item.spec)) {
|
||||||
|
m._specs.push(item.spec);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 后处理:合成最终字段
|
||||||
|
const merged = [];
|
||||||
|
for (const [, m] of nameMap) {
|
||||||
|
// quantity: 尝试数值求和
|
||||||
|
if (m._quantities.length > 1) {
|
||||||
|
const nums = m._quantities.map(q => parseFloat(q.replace(/,/g, '')));
|
||||||
|
if (nums.every(n => !isNaN(n))) {
|
||||||
|
const sum = nums.reduce((a, b) => a + b, 0);
|
||||||
|
m.quantity = sum % 1 === 0 ? String(sum) : sum.toFixed(2);
|
||||||
|
} else {
|
||||||
|
m.quantity = m._quantities.join('; ');
|
||||||
|
}
|
||||||
|
} else if (m._quantities.length === 1) {
|
||||||
|
m.quantity = m._quantities[0];
|
||||||
|
}
|
||||||
|
// spec: 拼接去重后的 spec,每条最多120字
|
||||||
|
if (m._specs.length > 0) {
|
||||||
|
const trimmed = m._specs.map(s => s.length > 120 ? s.substring(0, 120) + '...' : s);
|
||||||
|
m.spec = trimmed.join('; ');
|
||||||
|
// 总 spec 上限 300 字
|
||||||
|
if (m.spec.length > 300) m.spec = m.spec.substring(0, 300) + '...';
|
||||||
|
}
|
||||||
|
// 清理临时字段
|
||||||
|
delete m._count; delete m._quantities; delete m._specs;
|
||||||
|
merged.push(m);
|
||||||
|
}
|
||||||
|
cat.items = merged;
|
||||||
|
totalAfterMerge += merged.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
const mergedCount = totalBeforeMerge - totalAfterMerge;
|
||||||
|
if (mergedCount > 0) {
|
||||||
|
console.log(`[BillWorker] 按名称合并: ${totalBeforeMerge} → ${totalAfterMerge} 项(合并 ${mergedCount} 个重复项)`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const valid = categories.filter(c => c.items && c.items.length > 0);
|
||||||
|
const totalItems = valid.reduce((s, c) => s + c.items.length, 0);
|
||||||
|
const withSpec = valid.reduce((s, c) => s + c.items.filter(it => it.spec).length, 0);
|
||||||
|
const withCode = valid.reduce((s, c) => s + c.items.filter(it => it.code).length, 0);
|
||||||
|
console.log(`[BillWorker] 最终结果: ${valid.length} 分部, ${totalItems} 清单项 (${withCode} 有编码, ${withSpec} 有spec)`);
|
||||||
|
// 打印前 3 个 item 供调试
|
||||||
|
let debugCount = 0;
|
||||||
|
for (const cat of valid) {
|
||||||
|
for (const it of cat.items) {
|
||||||
|
if (debugCount < 3) {
|
||||||
|
console.log(`[BillWorker] 样例: [${it.code}] ${it.name} | ${it.unit} | qty=${it.quantity} | spec=${(it.spec||'').substring(0, 80)}`);
|
||||||
|
debugCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
project_summary: { remark: `本地解析:${valid.length} 个分部,${totalItems} 个清单项(合并前 ${totalBeforeMerge} 项)` },
|
||||||
|
categories: valid,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 判断清单项是否为"费用项"(非施工内容,不写入技术标)
|
||||||
|
* 如:安全文明措施费、规费、税金、暂列金额等
|
||||||
|
*/
|
||||||
|
/**
 * Decide whether a BOQ line item is a pure "fee" entry (taxes, levies,
 * provisional sums, insurance, administrative charges, ...) rather than
 * actual construction work. Fee entries are excluded from the technical bid.
 * @param {string} name - raw item name (may contain whitespace)
 * @returns {boolean} true when the item is a fee-only entry
 */
function isFeeItem(name) {
    if (!name) return false;
    const normalized = name.replace(/\s+/g, '');

    // ── 1. Exact-title matches ──
    const EXACT_TITLES = [
        '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
        '总承包服务费', '企业管理费', '甲供材料保管费', '价税合计',
    ];
    if (EXACT_TITLES.indexOf(normalized) !== -1) return true;

    // ── 2. Substring matches: measure fees / levies / insurance / admin items ──
    const FEE_KEYWORDS = [
        '安全文明', '文明施工费', '环境保护费', '临时设施费',
        '夜间施工增加费', '夜间施工费',
        '冬雨季施工增加费', '冬雨季施工费',
        '二次搬运费', '大型机械设备进出场', '大型机械进出场',
        '施工排水降水', '排水降水费',
        '已完工程及设备保护', '已完工程保护费',
        '工程排污费', '社会保障费', '住房公积金',
        '工伤保险', '劳动保险', '意外伤害保险', '建筑工程保险',
        '城市维护建设税', '城市建设维护税',
        '教育费附加', '地方教育附加',
        '材料暂估', '专业工程暂估',
        '超高施工增加费', '安全防护费',
        '措施项目费', '其他项目费', '不可竞争费',
    ];
    return FEE_KEYWORDS.some(kw => normalized.includes(kw));
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 将 rawName 中的"项目名称"与内联"项目特征描述"分离
|
||||||
|
* 例: "土方开挖 1.土壤类别:普通土" → { name: "土方开挖", spec: "1.土壤类别:普通土" }
|
||||||
|
*/
|
||||||
|
/**
 * Split a raw BOQ item name into the item name proper and an inline
 * "feature description" (spec) section.
 * e.g. "土方开挖 1.土壤类别:普通土" → { name: "土方开挖", spec: "1.土壤类别:普通土" }
 * @param {string} rawName - item name possibly carrying inline spec text
 * @returns {{name: string, spec: string}}
 */
function splitNameAndSpec(rawName) {
    if (!rawName) return { name: '', spec: '' };

    // Helper: cut the string at `idx`, trimming both halves.
    const cutAt = (idx) => ({
        name: rawName.substring(0, idx).trim(),
        spec: rawName.substring(idx).trim()
    });

    // Pattern 1: numbered feature list, e.g. "1.土壤类别" / "2、强度等级"
    const numbered = rawName.match(/\d+[.、.)\uFF09]\s*[\u4e00-\u9fff]/);
    if (numbered && numbered.index > 0) return cutAt(numbered.index);

    // Pattern 2: feature keyword followed by a colon, e.g. "材质:" "规格:"
    const SPEC_KW_RE = /(材质|规格|型号|品牌|颜色|尺寸|厚度|直径|管径|强度|等级|类别|类型|做法|要求|标准|内容|工作内容|土壤|含量|配合比|工艺|方式|形式|范围|部位|位置|高度|宽度|长度|深度|坡度|截面|跨度|运距|开挖|回填|混凝土|钢筋|压实)[::]/;
    const kwHit = rawName.match(SPEC_KW_RE);
    if (kwHit && kwHit.index > 0) return cutAt(kwHit.index);

    // Pattern 3: parenthesised numbering that opens the spec, "(1)" / "(1)"
    const parenHit = rawName.match(/[((]\d+[))]/);
    if (parenHit && parenHit.index > 0) return cutAt(parenHit.index);

    // No inline spec detected: the whole string is the name.
    return { name: rawName, spec: '' };
}
|
||||||
|
|
||||||
|
/**
 * Heuristic: does `text` look like a construction division/category title?
 * Matches a keyword list covering civil, finishing, MEP, municipal and
 * water-works vocabulary.
 * @param {string} text - candidate title (may be null/undefined/empty)
 * @returns {boolean}
 */
function isCatTitle(text) {
    // Guard: null/undefined previously threw on text.includes();
    // isFeeCatTitle already guards the same way.
    if (!text) return false;
    const KW = [
        '土建','建筑','结构','装饰','装修','安装','给排水','暖通','空调','通风',
        '电气','强电','弱电','消防','智能化','幕墙','门窗','园林','绿化','景观',
        '市政','道路','桥梁','管网','基础','地基','桩基','主体','屋面','防水',
        '保温','钢结构','排水','给水','照明','动力','防雷','电梯','人防','室外',
        '附属','分部','工程','措施','清单','土石方','混凝土','砌筑','模板','脚手架',
        '水利','河道','管道','阀门','设备','仪表','自动化','通信','网络',
        '拆除','外墙','内墙','楼地面','天棚','吊顶','栏杆','屋顶','涂料','抹灰',
        '廊道','阀门井','蓄水池','泵站','供水','引水','水源','渠道','闸门',
        '围栏','警示','检修','管线','配电','水池','水塔','取水','净水',
    ];
    return KW.some(k => text.includes(k));
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 判断分部标题是否为"费用类"(不应创建分部分类)
|
||||||
|
* 如:规费、税金、措施项目费、其他项目费 等非施工类分部
|
||||||
|
*/
|
||||||
|
/**
 * Decide whether a division title is a "fee" category (taxes, levies,
 * provisional sums, ...) that should NOT produce a construction division.
 * @param {string} text - division title (whitespace ignored)
 * @returns {boolean} true when the title is fee-like
 */
function isFeeCatTitle(text) {
    if (!text) return false;
    const title = text.replace(/\s+/g, '');

    // Whole-title exact matches.
    const EXACT_TITLES = [
        '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
        '总承包服务费', '企业管理费', '价税合计',
        '措施项目费', '其他项目费', '不可竞争费',
    ];
    if (EXACT_TITLES.indexOf(title) !== -1) return true;

    // Substring matches.
    const PARTIAL_KEYWORDS = [
        '措施项目费', '其他项目费', '不可竞争费',
        '规费汇总', '税金汇总', '费率', '费用汇总', '费用合计',
        '暂列金额', '暂估价', '计日工', '总承包服务费',
        '安全文明施工费', '社会保障费', '住房公积金',
        '工伤保险', '教育费附加', '城市维护建设税',
    ];
    return PARTIAL_KEYWORDS.some(kw => title.includes(kw));
}
|
||||||
95
build.bat
Normal file
95
build.bat
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
@echo off
chcp 65001 >nul 2>&1
setlocal

echo ============================================================
echo BidPartner - Build Desktop EXE
echo ============================================================
echo.

:: ── 1. Check Python ────────────────────────────────────────────────────────
python --version >nul 2>&1
if errorlevel 1 (
    echo [ERROR] Python not found. Please install Python 3.9+.
    pause & exit /b 1
)

:: ── 2. Install / upgrade PyInstaller ───────────────────────────────────────
echo [Step 1/4] Installing PyInstaller...
pip install --quiet --upgrade pyinstaller
if errorlevel 1 (
    echo [ERROR] Failed to install PyInstaller.
    pause & exit /b 1
)

:: ── 3. Install project dependencies (if not already installed) ─────────────
echo [Step 2/4] Checking dependencies...
pip install --quiet -r requirements.txt
if errorlevel 1 (
    echo [ERROR] Failed to install dependencies.
    pause & exit /b 1
)

:: ── 4. Sanitize settings.json - REMOVE API KEYS before build ───────────────
echo [Step 3/4] Sanitizing settings (removing API keys from build)...
if exist "data\settings.json" (
    :: Back up real settings
    copy /y "data\settings.json" "data\settings.json.bak" >nul
)
:: Write a clean settings file with no real keys
(
echo {
echo "model_provider": "deepseek",
echo "qwen_api_key": "sk-your-qwen-key",
echo "qwen_model": "qwen3.6-plus",
echo "openai_api_key": "sk-your-openai-key",
echo "openai_model": "gpt-4o",
echo "deepseek_api_key": "sk-your-deepseek-key",
echo "deepseek_model": "deepseek-chat",
echo "max_concurrent": 5,
echo "content_volume": "standard"
echo }
) > "data\settings_clean.tmp"
:: BUGFIX: swap the sanitized file into place. Previously the clean temp
:: file was written but never copied over data\settings.json, so the real
:: API keys were still bundled into the EXE.
copy /y "data\settings_clean.tmp" "data\settings.json" >nul

:: ── 5. Build ────────────────────────────────────────────────────────────────
echo [Step 4/4] Building EXE with PyInstaller...
echo (This may take 3-10 minutes on first run)
echo.

:: Clean previous build artifacts
if exist "build" rd /s /q "build" >nul 2>&1
if exist "dist\BidPartner" rd /s /q "dist\BidPartner" >nul 2>&1

pyinstaller bid_partner.spec --noconfirm
set BUILD_RESULT=%errorlevel%

:: ── Restore real settings ───────────────────────────────────────────────────
if exist "data\settings.json.bak" (
    copy /y "data\settings.json.bak" "data\settings.json" >nul
    del /f /q "data\settings.json.bak" >nul 2>&1
)
del /f /q "data\settings_clean.tmp" >nul 2>&1

if %BUILD_RESULT% neq 0 (
    echo.
    echo [ERROR] PyInstaller build failed. See output above for details.
    pause & exit /b 1
)

:: ── 6. Result ───────────────────────────────────────────────────────────────
echo.
echo ============================================================
echo Build SUCCESSFUL!
echo Output: dist\BidPartner\bid_partner.exe
echo ============================================================
echo.
echo The 'dist\BidPartner' folder is your distributable package.
echo Users only need this folder - no Python installation required.
echo Each user must set their own API key in the app settings.
echo.

:: Open the output folder
explorer "dist\BidPartner" >nul 2>&1

endlocal
pause
|
||||||
102
config.py
Normal file
102
config.py
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
# When running as a PyInstaller bundle:
|
||||||
|
# sys._MEIPASS → read-only bundle dir (templates, static, prompts)
|
||||||
|
# sys.executable dir → writable dir next to the .exe (data, settings, db)
|
||||||
|
if getattr(sys, 'frozen', False):
|
||||||
|
_BUNDLE_DIR = sys._MEIPASS # bundled app files
|
||||||
|
BASE_DIR = os.path.dirname(sys.executable) # writable runtime dir
|
||||||
|
else:
|
||||||
|
_BUNDLE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
BASE_DIR = _BUNDLE_DIR
|
||||||
|
|
||||||
|
DATA_DIR = os.path.join(BASE_DIR, 'data')
|
||||||
|
UPLOAD_DIR = os.path.join(DATA_DIR, 'uploads')
|
||||||
|
EXPORT_DIR = os.path.join(DATA_DIR, 'exports')
|
||||||
|
KNOWLEDGE_DIR= os.path.join(DATA_DIR, 'knowledge')
|
||||||
|
DB_PATH = os.path.join(DATA_DIR, 'projects.db')
|
||||||
|
CHROMA_DIR = os.path.join(DATA_DIR, 'chroma')
|
||||||
|
PROMPTS_DIR = os.path.join(_BUNDLE_DIR, 'prompts')
|
||||||
|
|
||||||
|
# ==================== AI 模型配置 ====================
|
||||||
|
# 模型选择:'openai' | 'qwen' | 'deepseek' | 'ollama'
|
||||||
|
MODEL_PROVIDER = os.environ.get('MODEL_PROVIDER', 'qwen')
|
||||||
|
|
||||||
|
# OpenAI
|
||||||
|
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'sk-your-openai-key')
|
||||||
|
OPENAI_MODEL = os.environ.get('OPENAI_MODEL', 'gpt-4.1')
|
||||||
|
OPENAI_BASE_URL = os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1')
|
||||||
|
|
||||||
|
# 阿里云通义千问
|
||||||
|
QWEN_API_KEY = os.environ.get('QWEN_API_KEY', 'sk-your-qwen-key')
|
||||||
|
QWEN_MODEL = os.environ.get('QWEN_MODEL', 'qwen3.6-plus')
|
||||||
|
QWEN_BASE_URL = os.environ.get('QWEN_BASE_URL', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
|
||||||
|
|
||||||
|
# DeepSeek
|
||||||
|
DEEPSEEK_API_KEY = os.environ.get('DEEPSEEK_API_KEY', 'sk-your-deepseek-key')
|
||||||
|
DEEPSEEK_MODEL = os.environ.get('DEEPSEEK_MODEL', 'deepseek-chat')
|
||||||
|
DEEPSEEK_BASE_URL = os.environ.get('DEEPSEEK_BASE_URL', 'https://api.deepseek.com/v1')
|
||||||
|
|
||||||
|
# Ollama 本地(OpenAI 兼容接口)
|
||||||
|
OLLAMA_BASE_URL = os.environ.get('OLLAMA_BASE_URL', 'http://localhost:11434/v1')
|
||||||
|
OLLAMA_MODEL = os.environ.get('OLLAMA_MODEL', 'qwen3:8b')
|
||||||
|
|
||||||
|
# 豆包 / 火山引擎(字节跳动,OpenAI 兼容接口)
|
||||||
|
DOUBAO_API_KEY = os.environ.get('DOUBAO_API_KEY', 'sk-your-doubao-key')
|
||||||
|
DOUBAO_MODEL = os.environ.get('DOUBAO_MODEL', 'doubao-1-5-pro-32k')
|
||||||
|
DOUBAO_BASE_URL = os.environ.get('DOUBAO_BASE_URL', 'https://ark.cn-beijing.volces.com/api/v3')
|
||||||
|
|
||||||
|
# Kimi / Moonshot AI(OpenAI 兼容接口,支持 Embedding)
|
||||||
|
KIMI_API_KEY = os.environ.get('KIMI_API_KEY', 'sk-your-kimi-key')
|
||||||
|
KIMI_MODEL = os.environ.get('KIMI_MODEL', 'moonshot-v1-32k')
|
||||||
|
KIMI_BASE_URL = os.environ.get('KIMI_BASE_URL', 'https://api.moonshot.cn/v1')
|
||||||
|
|
||||||
|
# Embedding 模型
|
||||||
|
OPENAI_EMBEDDING_MODEL = 'text-embedding-3-small'
|
||||||
|
QWEN_EMBEDDING_MODEL = 'text-embedding-v3'
|
||||||
|
KIMI_EMBEDDING_MODEL = 'moonshot-v1-embedding'
|
||||||
|
|
||||||
|
# ==================== 应用配置 ====================
|
||||||
|
MAX_FILE_SIZE_MB = 50
|
||||||
|
ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx'}
|
||||||
|
SECRET_KEY = 'bidhuo-partner-secret-2024'
|
||||||
|
|
||||||
|
# ==================== 生成配置 ====================
|
||||||
|
MAX_RETRIES = 3
|
||||||
|
REQUEST_TIMEOUT = int(os.environ.get('REQUEST_TIMEOUT', '180'))
|
||||||
|
# 大纲生成单次提示词长、输出大,适当延长读超时(秒),避免接口未返回即被客户端断开
|
||||||
|
OUTLINE_REQUEST_TIMEOUT = int(os.environ.get('OUTLINE_REQUEST_TIMEOUT', '300'))
|
||||||
|
CHUNK_SIZE = 2000 # 知识库文本分块大小(字符数)
|
||||||
|
CHUNK_OVERLAP = 200 # 分块重叠大小
|
||||||
|
TOP_K_KNOWLEDGE = 3 # 知识库检索数量
|
||||||
|
CONTENT_VOLUME = os.environ.get('CONTENT_VOLUME', 'standard') # 篇幅档位: concise / standard / detailed / full
|
||||||
|
TARGET_PAGES = int(os.environ.get('TARGET_PAGES', '0') or '0') # 目标页数(0=不启用)
|
||||||
|
PAGE_CHAR_ESTIMATE = int(os.environ.get('PAGE_CHAR_ESTIMATE', '700') or '700') # 粗略每页字数估算
|
||||||
|
|
||||||
|
# ==================== 并发控制 (极速优化核心) ====================
|
||||||
|
# 全局LLM调用上限,防止Qwen等云API被限流。默认20,与用户要求对齐。
|
||||||
|
LLM_CONCURRENCY_LIMIT = int(os.environ.get('LLM_CONCURRENCY_LIMIT', '20'))
|
||||||
|
_llm_semaphore = threading.Semaphore(LLM_CONCURRENCY_LIMIT)
|
||||||
|
|
||||||
|
@contextmanager
def llm_call():
    """Global LLM-call semaphore context manager (cap: LLM_CONCURRENCY_LIMIT).

    Every ai_client.chat / embedding call must run inside this context.
    Acquisition waits at most 60 s to avoid deadlocks; for the Qwen cloud
    API a tiny random jitter is inserted to dodge 429 rate limits.

    Raises:
        TimeoutError: when the semaphore cannot be acquired within 60 s.
    """
    if not _llm_semaphore.acquire(blocking=True, timeout=60.0):
        raise TimeoutError(f"LLM并发已达上限({LLM_CONCURRENCY_LIMIT}),请稍后重试")
    try:
        # Qwen is RPM-sensitive: a 0-0.08 s jitter spreads out request bursts.
        if MODEL_PROVIDER == 'qwen':
            time.sleep(random.uniform(0, 0.08))
        yield
    finally:
        _llm_semaphore.release()
|
||||||
|
|
||||||
|
# 更新默认并发章节数,支持更高上限(UI后续同步)
|
||||||
|
MAX_CONCURRENT_SECTIONS = int(os.environ.get('MAX_CONCURRENT_SECTIONS', '12'))
|
||||||
22
data/attachment_section_rules.json
Normal file
22
data/attachment_section_rules.json
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
{
|
||||||
|
"_meta": "附件类章节:stack_charts_only 为默认,叶节点按 diagram 意图栈只输出 [FIGURE]/[TABLE] 块、无叙述正文;full 为长文;single_chart_only 为栈顶单块。修改后重启生效。",
|
||||||
|
"_field_docs": {
|
||||||
|
"title_regex": "标题任一则正则匹配即视为附件节(Python re 语法)",
|
||||||
|
"table_hint_keywords": "标题含此类子串且双开关均开时倾向表格",
|
||||||
|
"figure_hint_keywords": "标题含此类子串且双开关均开时倾向图示",
|
||||||
|
"default_kind_when_ambiguous": "双开且标题无倾向词时的默认:figure 或 table",
|
||||||
|
"attachment_leaf_body_mode": "stack_charts_only:意图栈只生成图/表块;full:与常规章节相同长文;single_chart_only:仅栈顶一块图或表"
|
||||||
|
},
|
||||||
|
"schema_version": 1,
|
||||||
|
"attachment_leaf_body_mode": "stack_charts_only",
|
||||||
|
"title_regex": [
|
||||||
|
"附件\\s*[一二三四五六七八九十0-9A-Za-z、::.]",
|
||||||
|
"附\\s*图",
|
||||||
|
"附\\s*表",
|
||||||
|
"附\\s*件\\s*\\(",
|
||||||
|
"^\\s*[\\d一二三四五六七八九十\\..、]+\\s*附件"
|
||||||
|
],
|
||||||
|
"table_hint_keywords": ["附表", "一览表", "清单表", "表", "统计表", "明细表"],
|
||||||
|
"figure_hint_keywords": ["附图", "示意图", "平面图", "流程图", "布置图", "组织图", "横道"],
|
||||||
|
"default_kind_when_ambiguous": "table"
|
||||||
|
}
|
||||||
51
data/diagram_intent_rules.json
Normal file
51
data/diagram_intent_rules.json
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
{
|
||||||
|
"_meta": "章节级图/表意图:标题与大纲窗口关键词计分,阈值入栈,按栈序拼接图示/表格生成规范。修改后重启服务生效。",
|
||||||
|
"_field_docs": {
|
||||||
|
"threshold_figure": "图示倾向分达到此值才入栈",
|
||||||
|
"threshold_table": "表格倾向分达到此值才入栈",
|
||||||
|
"title_weight": "标题命中的权重乘子",
|
||||||
|
"context_weight": "大纲上下文窗口命中的权重乘子",
|
||||||
|
"outline_context_lines": "before/after 为相对匹配行上下扩展行数",
|
||||||
|
"stack_order_when_both": "figure_first | table_first | score_desc(两者同时入栈时的顺序,栈顶为 index 0)",
|
||||||
|
"figure_keywords": "字符串或 {text,weight} 对象列表",
|
||||||
|
"table_keywords": "同上"
|
||||||
|
},
|
||||||
|
"schema_version": 1,
|
||||||
|
"threshold_figure": 1.0,
|
||||||
|
"threshold_table": 1.0,
|
||||||
|
"title_weight": 1.0,
|
||||||
|
"context_weight": 0.6,
|
||||||
|
"outline_context_lines": {"before": 4, "after": 6},
|
||||||
|
"stack_order_when_both": "score_desc",
|
||||||
|
"figure_keywords": [
|
||||||
|
{"text": "组织", "weight": 1.0},
|
||||||
|
{"text": "架构", "weight": 1.0},
|
||||||
|
{"text": "流程", "weight": 1.2},
|
||||||
|
{"text": "工序", "weight": 1.0},
|
||||||
|
{"text": "进度", "weight": 1.2},
|
||||||
|
{"text": "横道", "weight": 1.5},
|
||||||
|
{"text": "网络图", "weight": 1.5},
|
||||||
|
{"text": "平面", "weight": 1.0},
|
||||||
|
{"text": "布置", "weight": 0.8},
|
||||||
|
{"text": "监测", "weight": 0.8},
|
||||||
|
{"text": "示意", "weight": 0.8},
|
||||||
|
{"text": "应急", "weight": 0.8}
|
||||||
|
],
|
||||||
|
"table_keywords": [
|
||||||
|
{"text": "一览表", "weight": 1.5},
|
||||||
|
{"text": "人员", "weight": 1.0},
|
||||||
|
{"text": "配置", "weight": 0.8},
|
||||||
|
{"text": "设备", "weight": 1.0},
|
||||||
|
{"text": "机械", "weight": 0.9},
|
||||||
|
{"text": "劳动力", "weight": 1.2},
|
||||||
|
{"text": "工种", "weight": 1.0},
|
||||||
|
{"text": "检验", "weight": 1.0},
|
||||||
|
{"text": "验收", "weight": 0.9},
|
||||||
|
{"text": "材料", "weight": 1.0},
|
||||||
|
{"text": "供应", "weight": 0.9},
|
||||||
|
{"text": "风险", "weight": 1.0},
|
||||||
|
{"text": "措施", "weight": 0.6},
|
||||||
|
{"text": "清单", "weight": 0.8},
|
||||||
|
{"text": "计划", "weight": 0.7}
|
||||||
|
]
|
||||||
|
}
|
||||||
BIN
data/exports/20260420测试海东技术标_20260420_180450.docx
Normal file
BIN
data/exports/20260420测试海东技术标_20260420_180450.docx
Normal file
Binary file not shown.
BIN
data/exports/30260420投标技术文档_20260420_170252.docx
Normal file
BIN
data/exports/30260420投标技术文档_20260420_170252.docx
Normal file
Binary file not shown.
BIN
data/exports/A2121212_20260421_103738.docx
Normal file
BIN
data/exports/A2121212_20260421_103738.docx
Normal file
Binary file not shown.
BIN
data/exports/A666_20260422_143004.docx
Normal file
BIN
data/exports/A666_20260422_143004.docx
Normal file
Binary file not shown.
BIN
data/exports/A666_20260422_153137.docx
Normal file
BIN
data/exports/A666_20260422_153137.docx
Normal file
Binary file not shown.
BIN
data/exports/A666_20260422_160459.docx
Normal file
BIN
data/exports/A666_20260422_160459.docx
Normal file
Binary file not shown.
BIN
data/projects.db
Normal file
BIN
data/projects.db
Normal file
Binary file not shown.
BIN
data/projects.db-shm
Normal file
BIN
data/projects.db-shm
Normal file
Binary file not shown.
BIN
data/projects.db-wal
Normal file
BIN
data/projects.db-wal
Normal file
Binary file not shown.
24
data/settings.json
Normal file
24
data/settings.json
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
"model_provider": "qwen",
|
||||||
|
"qwen_api_key": "sk-999173b3ca7f425a97cc4b12a2d3575f",
|
||||||
|
"qwen_model": "qwen3.6-plus",
|
||||||
|
"qwen_base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
|
||||||
|
"openai_api_key": "sk-your-openai-key",
|
||||||
|
"openai_model": "gpt-4.1",
|
||||||
|
"openai_base_url": "https://api.openai.com/v1",
|
||||||
|
"deepseek_api_key": "sk-your-deepseek-key",
|
||||||
|
"deepseek_model": "deepseek-chat",
|
||||||
|
"deepseek_base_url": "https://api.deepseek.com/v1",
|
||||||
|
"ollama_base_url": "http://localhost:11434/v1",
|
||||||
|
"ollama_model": "qwen3:8b",
|
||||||
|
"doubao_api_key": "sk-your-doubao-key",
|
||||||
|
"doubao_model": "doubao-1-5-pro-32k",
|
||||||
|
"doubao_base_url": "https://ark.cn-beijing.volces.com/api/v3",
|
||||||
|
"kimi_api_key": "sk-your-kimi-key",
|
||||||
|
"kimi_model": "moonshot-v1-32k",
|
||||||
|
"kimi_base_url": "https://api.moonshot.cn/v1",
|
||||||
|
"max_concurrent": 10,
|
||||||
|
"content_volume": "full",
|
||||||
|
"target_pages": 120,
|
||||||
|
"page_char_estimate": 700
|
||||||
|
}
|
||||||
23
data/style_presets.json
Normal file
23
data/style_presets.json
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
{
|
||||||
|
"standard": {
|
||||||
|
"bodyMode": "text_only",
|
||||||
|
"figureEnabled": false,
|
||||||
|
"tableEnabled": true,
|
||||||
|
"bodyFont": "宋体",
|
||||||
|
"bodySize": "小四",
|
||||||
|
"bodyLineSpacing": 1.5,
|
||||||
|
"headingFont": "黑体",
|
||||||
|
"headingSize": "三号",
|
||||||
|
"footerFont": "宋体",
|
||||||
|
"footerSize": "五号",
|
||||||
|
"pageCountTarget": 100,
|
||||||
|
"margins": {
|
||||||
|
"top": 2.54,
|
||||||
|
"bottom": 2.54,
|
||||||
|
"left": 3.18,
|
||||||
|
"right": 3.18
|
||||||
|
},
|
||||||
|
"headerText": "标桥AI编标",
|
||||||
|
"footerText": "第 X 页 / 共 Y 页"
|
||||||
|
}
|
||||||
|
}
|
||||||
BIN
data/uploads/10_boq_工程量清单.pdf
Normal file
BIN
data/uploads/10_boq_工程量清单.pdf
Normal file
Binary file not shown.
BIN
data/uploads/10_招标文件正文.pdf
Normal file
BIN
data/uploads/10_招标文件正文.pdf
Normal file
Binary file not shown.
BIN
data/uploads/11_boq_工程量清单.pdf
Normal file
BIN
data/uploads/11_boq_工程量清单.pdf
Normal file
Binary file not shown.
BIN
data/uploads/11_招标文件正文.pdf
Normal file
BIN
data/uploads/11_招标文件正文.pdf
Normal file
Binary file not shown.
BIN
data/uploads/12_boq_工程量清单.pdf
Normal file
BIN
data/uploads/12_boq_工程量清单.pdf
Normal file
Binary file not shown.
BIN
data/uploads/12_招标文件正文.pdf
Normal file
BIN
data/uploads/12_招标文件正文.pdf
Normal file
Binary file not shown.
BIN
data/uploads/15_boq_工程量清单.pdf
Normal file
BIN
data/uploads/15_boq_工程量清单.pdf
Normal file
Binary file not shown.
BIN
data/uploads/15_招标文件正文.pdf
Normal file
BIN
data/uploads/15_招标文件正文.pdf
Normal file
Binary file not shown.
BIN
data/uploads/16_boq_工程量清单.pdf
Normal file
BIN
data/uploads/16_boq_工程量清单.pdf
Normal file
Binary file not shown.
BIN
data/uploads/16_招标文件正文.pdf
Normal file
BIN
data/uploads/16_招标文件正文.pdf
Normal file
Binary file not shown.
BIN
data/uploads/17_boq_工程量清单.pdf
Normal file
BIN
data/uploads/17_boq_工程量清单.pdf
Normal file
Binary file not shown.
BIN
data/uploads/17_招标文件正文.pdf
Normal file
BIN
data/uploads/17_招标文件正文.pdf
Normal file
Binary file not shown.
BIN
data/uploads/19_boq_工程量清单.pdf
Normal file
BIN
data/uploads/19_boq_工程量清单.pdf
Normal file
Binary file not shown.
BIN
data/uploads/19_招标文件正文.pdf
Normal file
BIN
data/uploads/19_招标文件正文.pdf
Normal file
Binary file not shown.
BIN
data/uploads/1_boq_工程量清单1.pdf
Normal file
BIN
data/uploads/1_boq_工程量清单1.pdf
Normal file
Binary file not shown.
BIN
data/uploads/1_招标文件正文1.pdf
Normal file
BIN
data/uploads/1_招标文件正文1.pdf
Normal file
Binary file not shown.
BIN
data/uploads/20_boq_工程量清单.pdf
Normal file
BIN
data/uploads/20_boq_工程量清单.pdf
Normal file
Binary file not shown.
BIN
data/uploads/20_招标文件正文.pdf
Normal file
BIN
data/uploads/20_招标文件正文.pdf
Normal file
Binary file not shown.
BIN
data/uploads/2_boq_工程量清单1.pdf
Normal file
BIN
data/uploads/2_boq_工程量清单1.pdf
Normal file
Binary file not shown.
BIN
data/uploads/2_招标文件正文1.pdf
Normal file
BIN
data/uploads/2_招标文件正文1.pdf
Normal file
Binary file not shown.
BIN
data/uploads/3_boq_工程量清单1.pdf
Normal file
BIN
data/uploads/3_boq_工程量清单1.pdf
Normal file
Binary file not shown.
BIN
data/uploads/3_招标文件正文1.pdf
Normal file
BIN
data/uploads/3_招标文件正文1.pdf
Normal file
Binary file not shown.
BIN
data/uploads/4_boq_工程量清单1.pdf
Normal file
BIN
data/uploads/4_boq_工程量清单1.pdf
Normal file
Binary file not shown.
BIN
data/uploads/4_招标文件正文1.pdf
Normal file
BIN
data/uploads/4_招标文件正文1.pdf
Normal file
Binary file not shown.
BIN
data/uploads/5_boq_工程量清单1.pdf
Normal file
BIN
data/uploads/5_boq_工程量清单1.pdf
Normal file
Binary file not shown.
BIN
data/uploads/5_招标文件正文1.pdf
Normal file
BIN
data/uploads/5_招标文件正文1.pdf
Normal file
Binary file not shown.
BIN
data/uploads/6_boq_工程量清单.pdf
Normal file
BIN
data/uploads/6_boq_工程量清单.pdf
Normal file
Binary file not shown.
BIN
data/uploads/6_招标文件正文.pdf
Normal file
BIN
data/uploads/6_招标文件正文.pdf
Normal file
Binary file not shown.
BIN
data/uploads/7_boq_工程量清单1.pdf
Normal file
BIN
data/uploads/7_boq_工程量清单1.pdf
Normal file
Binary file not shown.
BIN
data/uploads/7_招标文件正文1.pdf
Normal file
BIN
data/uploads/7_招标文件正文1.pdf
Normal file
Binary file not shown.
BIN
data/uploads/8_boq_工程量清单.pdf
Normal file
BIN
data/uploads/8_boq_工程量清单.pdf
Normal file
Binary file not shown.
BIN
data/uploads/8_招标文件正文.pdf
Normal file
BIN
data/uploads/8_招标文件正文.pdf
Normal file
Binary file not shown.
BIN
data/uploads/9_boq_工程量清单.pdf
Normal file
BIN
data/uploads/9_boq_工程量清单.pdf
Normal file
Binary file not shown.
BIN
data/uploads/9_招标文件正文.pdf
Normal file
BIN
data/uploads/9_招标文件正文.pdf
Normal file
Binary file not shown.
30
data/word_allocation_rules.json
Normal file
30
data/word_allocation_rules.json
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
{
|
||||||
|
"_meta": "字数分配约束规则:与「标书篇幅预期」四档的 base/core 配合,按技术评分项权重与章节标题相关性分配各叶节点最低字数与提示词中的评分要点提示。修改后重启服务生效;字段说明见同文件 _field_docs。",
|
||||||
|
"_field_docs": {
|
||||||
|
"schema_version": "规则文件版本号,解析时可做迁移",
|
||||||
|
"alpha": "0~1,评分驱动强度;越大则高分相关章节越接近 core、低相关越接近 base",
|
||||||
|
"budget_mode": "target_pages:启用目标页数且 TARGET_PAGES>0 时,全书叶节点目标总字数为 TARGET_PAGES*PAGE_CHAR_ESTIMATE;无技术评分时叶节均分该总预算。未启用页数时无评分则返回 None。anchor_mean:N*(base+core)/2;anchor_base:N*base",
|
||||||
|
"per_section_floor": "单节 min_chars 下限(不低于此整数)",
|
||||||
|
"per_section_cap": "单节 min_chars 上限(不超过 core 时可设为 core 或略高)",
|
||||||
|
"relevance.method": "keyword_overlap:标题与评分项名称/关键词的字面重叠度",
|
||||||
|
"relevance.min_rating_weight": "忽略权重低于此值的评分项(减少噪声)",
|
||||||
|
"rating_parse": "预留;解析器内置多形态 rating_json,无需在此配置",
|
||||||
|
"prompt.top_k_rating_items": "写入本节字数说明中的相关评分项名称条数上限",
|
||||||
|
"max_tokens_scale": "若为 true,按 min_chars/base 比例缩放本段 max_tokens(仍受模型上限约束)"
|
||||||
|
},
|
||||||
|
"schema_version": 1,
|
||||||
|
"alpha": 0.85,
|
||||||
|
"budget_mode": "target_pages",
|
||||||
|
"per_section_floor": null,
|
||||||
|
"per_section_cap": null,
|
||||||
|
"relevance": {
|
||||||
|
"method": "keyword_overlap",
|
||||||
|
"min_rating_weight": 0.01
|
||||||
|
},
|
||||||
|
"rating_parse": {},
|
||||||
|
"prompt": {
|
||||||
|
"top_k_rating_items": 4,
|
||||||
|
"intro_line": "本节须对下列技术评分要点作实质展开(结合工艺、流程、标准与可验证措施,禁止空泛承诺与复述招标文件):"
|
||||||
|
},
|
||||||
|
"max_tokens_scale": false
|
||||||
|
}
|
||||||
172
launcher.py
Normal file
172
launcher.py
Normal file
@ -0,0 +1,172 @@
|
|||||||
|
"""
|
||||||
|
标伙伴 · AI标书助手 — 桌面启动器
|
||||||
|
运行此文件 (或打包后的 bid_partner.exe) 即可自动启动本地服务并打开浏览器。
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import socket
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
import webbrowser
|
||||||
|
import urllib.request
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
# ── 找可用端口 ──────────────────────────────────────────────────────────────
|
||||||
|
def _find_free_port(start: int = 5000, attempts: int = 20) -> int:
|
||||||
|
for port in range(start, start + attempts):
|
||||||
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||||
|
try:
|
||||||
|
s.bind(('127.0.0.1', port))
|
||||||
|
return port
|
||||||
|
except OSError:
|
||||||
|
continue
|
||||||
|
return start # 最坏情况:直接用 5000,让 Flask 报错
|
||||||
|
|
||||||
|
|
||||||
|
PORT = _find_free_port()
|
||||||
|
|
||||||
|
|
||||||
|
# ── 日志 ────────────────────────────────────────────────────────────────────
|
||||||
|
def _setup_logging():
    """Route INFO+ logs to bid_partner.log next to the exe (frozen) or this file (dev)."""
    frozen = getattr(sys, 'frozen', False)
    base_dir = os.path.dirname(sys.executable if frozen else os.path.abspath(__file__))
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
        handlers=[
            logging.FileHandler(
                os.path.join(base_dir, 'bid_partner.log'),
                encoding='utf-8',
                mode='a',
            ),
        ],
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── 启动 Flask 服务 ─────────────────────────────────────────────────────────
|
||||||
|
def _start_server():
    """Run the Flask app on 127.0.0.1:PORT; failures are logged, never raised.

    Runs in a daemon thread, so any startup exception must be captured here —
    there is no caller to propagate it to.
    """
    try:
        import app as flask_app
        flask_app.init_db()
        flask_app.app.run(
            host='127.0.0.1', port=PORT,
            debug=False, threaded=True, use_reloader=False,
        )
    except Exception as e:
        logging.getLogger('launcher').error(f'服务启动失败: {e}', exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
|
# ── 等待服务就绪 ─────────────────────────────────────────────────────────────
|
||||||
|
def _wait_for_server(timeout: int = 60) -> bool:
    """Poll http://127.0.0.1:PORT until it answers or *timeout* seconds elapse.

    Returns True as soon as any HTTP response comes back, False on timeout.
    """
    endpoint = f'http://127.0.0.1:{PORT}'
    stop_at = time.time() + timeout
    while time.time() < stop_at:
        try:
            urllib.request.urlopen(endpoint, timeout=1)
        except Exception:
            time.sleep(0.4)  # not up yet — back off briefly and retry
        else:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
# ── 主界面 (tkinter) ─────────────────────────────────────────────────────────
|
||||||
|
def _run_gui():
    """Show the tkinter launcher window and block in the Tk main loop.

    Builds a small status window (title, status line, clickable URL, open /
    quit buttons), then polls the local server from a daemon thread and
    flips the UI to "ready" (or "timed out") via ``root.after``.
    """
    # Local imports so a missing/broken tkinter only fails here; main()
    # catches the exception and falls back to _run_headless().
    import tkinter as tk
    from tkinter import ttk, font as tkfont

    URL = f'http://127.0.0.1:{PORT}'

    root = tk.Tk()
    root.title('标伙伴 · AI标书助手')
    root.geometry('400x220')
    root.resizable(False, False)
    root.configure(bg='#f5f5f5')

    # ── Title ──
    title_font = tkfont.Font(family='微软雅黑', size=14, weight='bold')
    tk.Label(root, text='标伙伴 · AI 标书助手', font=title_font,
             bg='#f5f5f5', fg='#1a1a2e').pack(pady=(22, 4))

    # ── Status line (updated by the poller callbacks below) ──
    status_var = tk.StringVar(value='正在启动服务,请稍候…')
    status_lbl = tk.Label(root, textvariable=status_var,
                          font=('微软雅黑', 10), bg='#f5f5f5', fg='#555')
    status_lbl.pack(pady=4)

    # ── Clickable URL link (text filled in once the server is ready) ──
    url_lbl = tk.Label(root, text='', font=('Consolas', 10),
                       bg='#f5f5f5', fg='#1a73e8', cursor='hand2')
    url_lbl.pack(pady=2)
    url_lbl.bind('<Button-1>', lambda _: webbrowser.open(URL))

    # ── Button row ──
    btn_frame = tk.Frame(root, bg='#f5f5f5')
    btn_frame.pack(pady=18)

    # Disabled until the server answers; enabled in _on_ready().
    open_btn = ttk.Button(btn_frame, text='打开浏览器',
                          command=lambda: webbrowser.open(URL),
                          state='disabled', width=14)
    open_btn.pack(side='left', padx=8)

    quit_btn = ttk.Button(btn_frame, text='退出程序',
                          command=root.destroy, width=10)
    quit_btn.pack(side='left', padx=8)

    # ── Version / privacy note ──
    tk.Label(root, text='单机版 · 本地运行 · 数据不上传',
             font=('微软雅黑', 8), bg='#f5f5f5', fg='#aaa').pack(pady=(0, 10))

    # ── Background poll: update the UI once the server is ready ──
    def _on_ready():
        # Runs on the Tk thread (scheduled via root.after).
        status_var.set('服务已就绪 ✓')
        status_lbl.config(fg='#2e7d32')
        url_lbl.config(text=URL)
        open_btn.config(state='normal')
        webbrowser.open(URL)

    def _on_timeout():
        status_var.set('启动超时,请查看 bid_partner.log')
        status_lbl.config(fg='#c62828')

    def _check():
        # Daemon thread: blocks in _wait_for_server, then hands the UI
        # update back to the Tk thread with after(0, ...).
        if _wait_for_server():
            root.after(0, _on_ready)
        else:
            root.after(0, _on_timeout)

    threading.Thread(target=_check, daemon=True).start()
    root.mainloop()
|
||||||
|
|
||||||
|
|
||||||
|
# ── 无图形模式(仅控制台) ────────────────────────────────────────────────────
|
||||||
|
def _run_headless():
    """Console fallback: wait for the server, open the browser, block on Ctrl+C."""
    url = f'http://127.0.0.1:{PORT}'
    print(f'[标伙伴] Starting server on port {PORT} ...')
    if not _wait_for_server():
        print('[标伙伴] Server did not start within 60 s. Check bid_partner.log.')
        return
    print(f'[标伙伴] Ready → {url}')
    webbrowser.open(url)
    # Block until the user interrupts with Ctrl+C.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print('[标伙伴] Shutting down.')
|
||||||
|
|
||||||
|
|
||||||
|
# ── 入口 ─────────────────────────────────────────────────────────────────────
|
||||||
|
def main():
    """Program entry point: start the backend server, then a GUI front end.

    Falls back to console (headless) mode when the GUI cannot start,
    e.g. on machines without a display or without tkinter.
    """
    _setup_logging()

    # Daemon thread: the HTTP server dies together with the UI process.
    server_thread = threading.Thread(target=_start_server, daemon=True)
    server_thread.start()

    try:
        _run_gui()
    except Exception:
        # Fix: the failure reason was previously discarded silently;
        # record it before degrading to console mode.
        import logging
        logging.getLogger(__name__).exception('GUI unavailable, falling back to headless mode')
        _run_headless()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: launch the local server and the launcher UI.
if __name__ == '__main__':
    main()
|
||||||
1
modules/__init__.py
Normal file
1
modules/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
|
||||||
98
modules/checker.py
Normal file
98
modules/checker.py
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
"""
|
||||||
|
合规检查模块:检查生成的标书是否响应了招标关键要求
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
from utils import ai_client
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
CHECK_PROMPT = """你是一位专业的投标文件技术审核专家。请对照以下【技术评分要求】,检查【标书技术内容】的覆盖情况,输出技术合规检查报告。
|
||||||
|
|
||||||
|
重要限制(必须遵守):
|
||||||
|
★ 本次检查范围仅限技术内容,包括:技术方案、实施能力、技术指标、质量保障、人员配置、技术创新等
|
||||||
|
★ 严禁将商务评分、价格评分、资质评分、报价、合同条款、付款方式等商务内容纳入检查项
|
||||||
|
★ 若技术评分要求中混有商务条款,直接忽略,不得作为检查项输出
|
||||||
|
|
||||||
|
【技术评分要求】
|
||||||
|
{requirements}
|
||||||
|
|
||||||
|
【标书技术内容(各章节摘要)】
|
||||||
|
{content}
|
||||||
|
|
||||||
|
请输出以下格式的 JSON,每个 item 均为技术评分项,不含任何商务内容:
|
||||||
|
{{
|
||||||
|
"overall_score": 85,
|
||||||
|
"status": "良好",
|
||||||
|
"items": [
|
||||||
|
{{
|
||||||
|
"requirement": "技术评分要求描述",
|
||||||
|
"covered": true,
|
||||||
|
"note": "说明"
|
||||||
|
}}
|
||||||
|
],
|
||||||
|
"missing_points": ["未覆盖的技术要点1", "未覆盖的技术要点2"],
|
||||||
|
"suggestions": ["技术内容改进建议1", "技术内容改进建议2"]
|
||||||
|
}}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def check_compliance(db_path: str, project_id: int) -> dict:
    """Run the AI compliance check for one project.

    Reads the extracted technical scoring requirements and the generated bid
    sections from SQLite, asks the AI to compare them, and returns the parsed
    report dict.  On any failure a ``{'error': ...}`` dict is returned
    instead of raising.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Fetch the tender data for this project.
        cur = conn.cursor()
        cur.execute(
            "SELECT summary, rating_requirements FROM tender_data WHERE project_id=?",
            (project_id,)
        )
        td = cur.fetchone()
        if not td:
            return {'error': '尚未解析招标文件'}

        # Only the technical scoring requirements serve as the baseline;
        # the summary may contain commercial content and is deliberately ignored.
        requirements = (td[1] or '').strip()
        if not requirements:
            return {'error': '尚未提取技术评分要求,请先完成步骤一的招标文件解析'}

        # Collect the finished sections (first 500 chars of each, flattened).
        cur.execute(
            "SELECT section_title, content FROM bid_sections WHERE project_id=? AND status='done' ORDER BY order_index",
            (project_id,)
        )
        rows = cur.fetchall()
        if not rows:
            return {'error': '尚未生成标书内容,请先生成'}

        content_parts = []
        for title, content in rows:
            snippet = (content or '')[:500].replace('\n', ' ')
            content_parts.append(f"【{title}】{snippet}")
        content_str = '\n'.join(content_parts)

        # Call the AI (both inputs truncated to keep the prompt bounded).
        prompt = CHECK_PROMPT.format(requirements=requirements[:3000], content=content_str[:6000])
        raw = ai_client.chat(prompt, temperature=0.2, max_tokens=2048)

        # Strip Markdown code fences, then parse the outermost JSON object.
        raw = re.sub(r'```(?:json)?\s*', '', raw).replace('```', '').strip()
        m = re.search(r'\{[\s\S]*\}', raw)
        if m:
            raw = m.group(0)
        result = json.loads(raw)
        return result

    # JSONDecodeError can only fire after `raw` is bound, so echoing it back
    # to the caller here is safe.
    except json.JSONDecodeError as e:
        logger.error(f'合规检查结果解析失败: {e}')
        return {'error': f'AI 返回格式异常: {e}', 'raw': raw}
    except Exception as e:
        logger.exception('合规检查失败')
        return {'error': str(e)}
    finally:
        conn.close()
|
||||||
635
modules/dark_bid_format_check.py
Normal file
635
modules/dark_bid_format_check.py
Normal file
@ -0,0 +1,635 @@
|
|||||||
|
"""
|
||||||
|
技术暗标 HTML 格式检查(由 清标工具.js 迁移,不依赖浏览器/jsdom)。
|
||||||
|
仅解析内联 style 与文档内 <style> 中的 @page 简单规则;无内联样式时部分项可能判为不符合。
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
# 1pt ≈ 96/72 px (CSS standard conversion).
_PT_PX = 96.0 / 72.0

# Chinese typographic sizes: 三号 = 16pt, 四号 = 14pt, 五号 = 10.5pt;
# required line height is 26pt.  All converted to px once, up front.
_TARGET_H = 16 * _PT_PX  # heading target, 21.333...
_TARGET_BODY = 14 * _PT_PX
_TARGET_LH = 26 * _PT_PX
_TARGET_FIG = 10.5 * _PT_PX
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_style_attr(style: str | None) -> dict[str, str]:
|
||||||
|
if not style or not style.strip():
|
||||||
|
return {}
|
||||||
|
out: dict[str, str] = {}
|
||||||
|
for part in style.split(";"):
|
||||||
|
part = part.strip()
|
||||||
|
if ":" not in part:
|
||||||
|
continue
|
||||||
|
k, v = part.split(":", 1)
|
||||||
|
k, v = k.strip().lower(), v.strip()
|
||||||
|
if k:
|
||||||
|
out[k] = v
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _num(s: str) -> float:
|
||||||
|
try:
|
||||||
|
return float(re.sub(r"[^\d.\-]", "", s) or "nan")
|
||||||
|
except ValueError:
|
||||||
|
return float("nan")
|
||||||
|
|
||||||
|
|
||||||
|
def _length_to_px(val: str, font_size_px: float | None = None) -> float:
    """Convert a CSS length (font-size / line-height …) to an approximate px
    float, matching what JS getComputedStyle reports in px."""
    val = (val or "").strip().lower()
    if not val or val in ("normal", "inherit", "initial"):
        return float("nan")
    # Bare integer: treated as px (unitless line-height is handled by callers).
    if val.isdigit():
        return float(val)
    m = re.match(r"^([\d.]+)\s*(pt|px|em|rem)?\s*$", val)
    if not m:
        # Unknown unit (cm, %, ch, …): return the leading number as-is.
        m2 = re.match(r"^([\d.]+)", val)
        return float(m2.group(1)) if m2 else float("nan")
    n, unit = float(m.group(1)), (m.group(2) or "px")
    if unit == "pt":
        return n * _PT_PX
    if unit == "px":
        return n
    # `font_size_px == font_size_px` is a NaN check: NaN != NaN.
    if unit in ("em", "rem") and font_size_px and font_size_px == font_size_px:
        return n * font_size_px
    if unit in ("em", "rem"):
        return n  # no font size available: return the raw em count (used by text-indent checks)
    return n
|
||||||
|
|
||||||
|
|
||||||
|
def _indent_value(style: dict[str, str], font_size_px: float) -> float:
|
||||||
|
"""与 JS 中 parseFloat(textIndent) 对齐:'2em' -> 2.0;'2ch' 等取首数字段。"""
|
||||||
|
raw = (style.get("text-indent") or "").strip()
|
||||||
|
if not raw:
|
||||||
|
return float("nan")
|
||||||
|
if "em" in raw.lower():
|
||||||
|
m = re.search(r"([\d.]+)\s*em", raw, re.I)
|
||||||
|
return float(m.group(1)) if m else _num(raw)
|
||||||
|
# px 转 em 近似
|
||||||
|
px = _length_to_px(raw, font_size_px)
|
||||||
|
if px == px and font_size_px > 0:
|
||||||
|
return px / font_size_px
|
||||||
|
return _num(raw)
|
||||||
|
|
||||||
|
|
||||||
|
def _color_normalized(style: dict[str, str]) -> str:
|
||||||
|
c = (style.get("color") or "").strip().lower()
|
||||||
|
if not c:
|
||||||
|
return ""
|
||||||
|
c = c.replace(" ", "")
|
||||||
|
if c in ("#000", "#000000", "black", "rgb(0,0,0)"):
|
||||||
|
return "rgb(0, 0, 0)"
|
||||||
|
m = re.match(r"rgb\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)", c)
|
||||||
|
if m:
|
||||||
|
r, g, b = int(m.group(1)), int(m.group(2)), int(m.group(3))
|
||||||
|
if r == 0 and g == 0 and b == 0:
|
||||||
|
return "rgb(0, 0, 0)"
|
||||||
|
return c
|
||||||
|
return c
|
||||||
|
|
||||||
|
|
||||||
|
def _el_style_dict(tag: Tag) -> dict[str, str]:
    """Inline style of *tag* as a property dict; {} when absent."""
    raw = tag.get("style")
    # bs4 may hand back a list for multi-valued attributes; flatten it first.
    if isinstance(raw, list):
        raw = ";".join(raw)
    if isinstance(raw, str):
        return _parse_style_attr(raw)
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_inline_property(tag: Tag, prop: str) -> str:
    """Value of one inline-style property on *tag* ('' when not set)."""
    return _el_style_dict(tag).get(prop.lower(), "")
|
||||||
|
|
||||||
|
|
||||||
|
def _outer_html_sample(tag: Tag, limit: int = 200) -> str:
|
||||||
|
s = str(tag)
|
||||||
|
return s[:limit] if len(s) > limit else s
|
||||||
|
|
||||||
|
|
||||||
|
def _is_under(node: Tag | None, ancestor: Tag | None) -> bool:
|
||||||
|
if node is None or ancestor is None:
|
||||||
|
return False
|
||||||
|
p: Tag | None = node
|
||||||
|
while p is not None:
|
||||||
|
if p is ancestor:
|
||||||
|
return True
|
||||||
|
p = p.parent
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _body_text(soup: BeautifulSoup) -> str:
    """Newline-joined text of <body>, or of the whole soup when body is
    missing or empty."""
    body = soup.body
    target = soup if not body else body
    return target.get_text("\n", strip=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_page_margins_from_html(raw_html: str) -> dict[str, str] | None:
|
||||||
|
"""从 <style> 中粗提取 @page 块内 margin 与 size。"""
|
||||||
|
for m in re.finditer(
|
||||||
|
r"@page\s*\{([^}]+)\}",
|
||||||
|
raw_html,
|
||||||
|
re.I | re.DOTALL,
|
||||||
|
):
|
||||||
|
block = m.group(1)
|
||||||
|
msh = re.search(r"margin\s*:\s*([^;]+);", block, re.I)
|
||||||
|
if msh:
|
||||||
|
return {"shorthand": msh.group(1).strip()}
|
||||||
|
margins: dict[str, str] = {}
|
||||||
|
for name, key in (
|
||||||
|
(r"margin-top\s*:\s*([^;]+)", "top"),
|
||||||
|
(r"margin-bottom\s*:\s*([^;]+)", "bottom"),
|
||||||
|
(r"margin-left\s*:\s*([^;]+)", "left"),
|
||||||
|
(r"margin-right\s*:\s*([^;]+)", "right"),
|
||||||
|
(r"size\s*:\s*([^;]+)", "size"),
|
||||||
|
):
|
||||||
|
mm = re.search(name, block, re.I)
|
||||||
|
if mm:
|
||||||
|
margins[key] = mm.group(1).strip()
|
||||||
|
if margins:
|
||||||
|
return margins
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def check_technical_bid(html_content: str) -> dict[str, Any]:
    """Run the format check on an anonymized ("dark bid") technical HTML.

    Returns the same structure as 清标数据.json: ``overall`` (bool),
    ``details`` (per-rule pass/fail), ``violations`` (failed rules with
    offending element snippets).
    """
    results: dict[str, Any] = {
        "overall": True,
        "details": [],
        "violations": [],
    }

    def add_result(
        rule_name: str,
        passed: bool,
        message: str,
        elements: list[Tag] | None = None,
    ) -> None:
        """Record one rule outcome; a failure also flips the overall flag."""
        results["details"].append(
            {"rule": rule_name, "passed": passed, "message": message}
        )
        if not passed:
            results["overall"] = False
            el_snips: list[str] = []
            for el in elements or []:
                if isinstance(el, Tag):
                    el_snips.append(_outer_html_sample(el))
            results["violations"].append(
                {"rule": rule_name, "message": message, "elements": el_snips}
            )

    if not (html_content or "").strip():
        add_result("身份信息隐藏", False, "HTML 内容为空", [])
        return results

    raw_html = html_content
    soup = BeautifulSoup(html_content, "lxml")
    # Fragments without <body> are re-wrapped so soup.body always exists.
    if not soup.body:
        soup = BeautifulSoup(f"<html><body>{html_content}</body></html>", "lxml")

    body = soup.body
    if not body:
        add_result("身份信息隐藏", False, "无法解析 body", [])
        return results

    # ---- 1. Identity concealment ----
    body_text = _body_text(soup)
    # Company names (suffix-based), with optional parentheses around the name.
    company_pattern = re.compile(
        r"(?:我公司|本公司|[((]?[A-Za-z\u4e00-\u9fa5]+(?:集团|股份|有限|责任|公司)[))]?)"
    )
    # Street-address fragments.
    addr_pattern = re.compile(
        r"(?:省|市|区|县|镇|路|街|大道|号|大厦|楼|层)[\u4e00-\u9fa50-9]+"
    )
    # Real person names after role labels; the 甲乙丙… exclusion lets
    # placeholder names ("甲某") through.
    name_pattern = re.compile(
        r"(?:总监理工程师|专业监理工程师|技术负责人|项目经理)[::]\s*"
        r"[^甲乙丙丁戊己庚辛壬癸\s]{2,4}(?=[,。;\s]|$)"
    )
    found_company = bool(company_pattern.search(body_text))
    found_addr = bool(addr_pattern.search(body_text))
    found_name = bool(name_pattern.search(body_text))
    has_logo = False
    for img in soup.find_all("img"):
        if not isinstance(img, Tag):
            continue
        alt = (img.get("alt") or "") + ""
        src = (img.get("src") or "") + ""
        if re.search(r"logo|商标|微标|公司|品牌", alt, re.I) or re.search(
            r"logo", src, re.I
        ):
            has_logo = True
            break
    passed_id = not (
        found_company or found_addr or found_name or has_logo
    )
    add_result(
        "身份信息隐藏",
        passed_id,
        "未发现投标人身份信息"
        if passed_id
        else "发现投标人身份信息(公司名/地址/真实姓名/商标)",
    )

    def heading_style_ok(tag: Tag) -> bool:
        """Heading rule: 三号 (16pt) SimHei, black, bold, no italic/underline."""
        st = _el_style_dict(tag)
        fs_raw = st.get("font-size", "")
        fs_px = _length_to_px(fs_raw)
        # Plain 'em' (not 'rem') is resolved against the 16px browser default.
        if "em" in (fs_raw or "").lower() and "rem" not in (fs_raw or "").lower():
            fs_px = _num(fs_raw) * 16.0
        size_ok = abs(fs_px - _TARGET_H) <= 3
        fam = (st.get("font-family") or "").lower()
        font_ok = "黑体" in fam or "simhei" in fam or "microsoft yahei" in fam
        font_style = (st.get("font-style") or "").lower()
        style_ok = font_style != "italic"
        text_dec = (st.get("text-decoration") or "").lower()
        decor_ok = "underline" not in text_dec
        cr = (st.get("color") or "").strip().lower()
        if not cr or cr in ("inherit", "initial"):
            color_ok = True
        else:
            cn = _color_normalized(st)
            color_ok = cn == "rgb(0, 0, 0)" or cr in (
                "#000",
                "#000000",
                "black",
                "rgb(0,0,0)",
            )
        fw = (st.get("font-weight") or "400").lower()
        weight_ok = fw not in ("400", "normal")
        # Headings must declare an explicit size; inherited sizes fail.
        if not st.get("font-size"):
            size_ok = False
        return (
            size_ok
            and font_ok
            and style_ok
            and decor_ok
            and color_ok
            and weight_ok
        )

    # ---- 2. Heading format ----
    heading_tags: list[Tag] = []
    for sel in ("h1", "h2", "h3", "h4", "h5", "h6"):
        heading_tags.extend(soup.find_all(sel))
    for t in soup.find_all(attrs={"role": "heading"}):
        if isinstance(t, Tag):
            heading_tags.append(t)
    for t in soup.select(".heading, .title"):
        if isinstance(t, Tag) and t not in heading_tags:
            heading_tags.append(t)

    invalid_h: list[Tag] = []
    for h in heading_tags:
        if not isinstance(h, Tag):
            continue
        if not heading_style_ok(h):
            invalid_h.append(h)
    h_ok = len(invalid_h) == 0
    add_result(
        "标题格式",
        h_ok,
        "所有标题符合三号黑体要求"
        if h_ok
        else "部分标题字号/字体/颜色/下划线不符合要求",
        invalid_h,
    )

    def body_el_ok(el: Tag) -> bool:
        """Body rule: 四号 (14pt) SimSun, black, 2em indent, 26pt line height,
        no bold/italic/underline.  Headings, header/footer/TOC containers and
        empty elements are exempt."""
        st = _el_style_dict(el)
        if el.name in ("h1", "h2", "h3", "h4", "h5", "h6"):
            return True
        cls = " ".join(el.get("class", [])) if el.get("class") else ""
        if any(
            x in cls
            for x in ("header", "footer", "toc", "目录", "table-of-contents")
        ):
            return True
        text = el.get_text(strip=True)
        if not text:
            return True
        fs_raw = st.get("font-size", "")
        font_px = _length_to_px(fs_raw)
        # No inline size at all -> cannot verify -> fail.
        if not fs_raw:
            return False
        size_ok = abs(font_px - _TARGET_BODY) <= 2
        fam = (st.get("font-family") or "").lower()
        font_ok = "宋体" in fam or "simsun" in fam or "serif" in fam
        col = st.get("color", "")
        color_ok = (not col) or _color_normalized(st) == "rgb(0, 0, 0)" or col.lower() in (
            "#000",
            "#000000",
            "black",
            "rgb(0,0,0)",
        )
        ind = _indent_value(st, font_px)
        # `ind == ind` rejects NaN; indent must be ~2em.
        indent_ok = ind == ind and 1.8 <= ind <= 2.2
        lh_raw = (st.get("line-height") or "").strip()
        if not lh_raw:
            line_ok = False
        else:
            if "pt" in lh_raw or "px" in lh_raw:
                lh_px = _length_to_px(lh_raw, font_px)
            elif re.match(r"^[\d.]+$", lh_raw):
                # Unitless line-height multiplies the font size.
                lh_px = float(lh_raw) * font_px
            else:
                lh_px = _length_to_px(lh_raw, font_px)
            line_ok = abs(lh_px - _TARGET_LH) <= 2
        tdec = (st.get("text-decoration") or "").lower()
        decor_ok = "underline" not in tdec
        fw = (st.get("font-weight") or "400").lower()
        weight_ok = fw in ("400", "normal", "")
        fst = (st.get("font-style") or "").lower()
        style_ok = fst != "italic"
        return (
            size_ok
            and font_ok
            and color_ok
            and indent_ok
            and line_ok
            and decor_ok
            and weight_ok
            and style_ok
        )

    exclude_set = {
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
    }
    invalid_body: list[Tag] = []
    for el in soup.find_all(["p", "div", "span", "li", "td", "th"]):
        if not isinstance(el, Tag):
            continue
        if el.name in exclude_set:
            continue
        if "header" in " ".join(el.get("class", [])):
            continue
        if "footer" in " ".join(el.get("class", [])):
            continue
        if "toc" in " ".join(el.get("class", [])) or "目录" in " ".join(
            el.get("class", [])
        ):
            continue
        if not el.get_text(strip=True):
            continue
        if not body_el_ok(el):
            invalid_body.append(el)

    b_ok = len(invalid_body) == 0
    add_result(
        "正文格式",
        b_ok,
        "所有正文符合四号宋体/缩进/行距/颜色要求"
        if b_ok
        else "部分正文段落格式不符合要求",
        invalid_body,
    )

    # ---- 4. Table of contents ----
    toc_els: list[Tag] = []
    for cls in ("toc", "table-of-contents", "目录"):
        for t in soup.find_all(class_=cls):
            if isinstance(t, Tag) and t not in toc_els:
                toc_els.append(t)
    for t in soup.find_all(attrs={"role": "directory"}):
        if isinstance(t, Tag) and t not in toc_els:
            toc_els.append(t)

    if not toc_els:
        add_result("目录要求", False, "未检测到目录,请确保包含目录且目录无页码无页眉页脚")
    else:
        no_pn = True
        no_hf = True
        for toc in toc_els:
            text = toc.get_text("\n", strip=True)
            lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
            for line in lines:
                # A TOC line ending in digits may carry a page number; a
                # dot-leader followed by digits ("....12") confirms it.
                # NOTE(review): the nested checks below all reduce to the
                # same dot-leader test repeated — behavior is preserved here,
                # but this could collapse to a single `if`.
                if re.search(r"\d+\s*$", line) and re.search(r"\d$", line):
                    if re.search(r"\.{2,}\s*\d+", line) or re.match(
                        r"^.*\d$", line
                    ):
                        if re.search(r"\.{2,}\s*\d+", line):
                            no_pn = False
                    if re.search(r"\.{2,}\s*\d+", line):
                        no_pn = False
            if toc.find(class_=re.compile("header|page-header", re.I)):
                no_hf = False
            if toc.find(class_=re.compile("footer|page-footer", re.I)):
                no_hf = False
        t_ok = no_pn and no_hf
        add_result(
            "目录要求",
            t_ok,
            "目录符合无页码、无页眉页脚要求"
            if t_ok
            else "目录中存在页码或页眉页脚",
        )

    # ---- 5. Figures / appendix (legal selectors only) ----
    appendix: Tag | None = None
    for sel in (
        "#appendix",
        ".appendix",
        ".attachment",
        '[id*="附件"]',
        '[class*="附件"]',
        '[class*="附表"]',
    ):
        hit = soup.select_one(sel)
        if hit and isinstance(hit, Tag):
            appendix = hit
            break

    # Any table/img/figure/chart outside the appendix container is illegal.
    illegal: list[Tag] = []
    for tbl in soup.find_all("table"):
        if isinstance(tbl, Tag) and not _is_under(tbl, appendix):
            illegal.append(tbl)
    for im in soup.find_all("img"):
        if isinstance(im, Tag) and not _is_under(im, appendix):
            illegal.append(im)
    for el in soup.find_all("figure"):
        if isinstance(el, Tag) and not _is_under(el, appendix):
            illegal.append(el)
    for el in soup.find_all(class_="chart"):
        if isinstance(el, Tag) and not _is_under(el, appendix) and el not in illegal:
            illegal.append(el)

    # Inside the appendix, chart text must be 五号 (10.5pt) SimSun, black.
    chart_text_valid = True
    if appendix:
        for el in appendix.select("table, td, th, figcaption, .chart-text"):
            if not isinstance(el, Tag):
                continue
            st = _el_style_dict(el)
            if not st.get("font-size"):
                continue
            fs = _length_to_px(st.get("font-size", ""))
            size_ok = abs(fs - _TARGET_FIG) <= 1.5
            fam = (st.get("font-family") or "").lower()
            font_ok = "宋体" in fam or "simsun" in fam
            c_raw = (st.get("color") or "").strip()
            if c_raw and c_raw.lower() not in ("inherit", "initial"):
                c_ok = _color_normalized(st) == "rgb(0, 0, 0)" or c_raw.lower() in (
                    "#000",
                    "#000000",
                    "black",
                    "rgb(0,0,0)",
                )
            else:
                c_ok = True
            if not (size_ok and font_ok and c_ok):
                chart_text_valid = False

    c_ok2 = len(illegal) == 0 and chart_text_valid
    add_result(
        "图表规范",
        c_ok2,
        "图表仅出现在附件/附表内,且图表文字符合五号宋体"
        if c_ok2
        else f"正文中发现{len(illegal)}个图表或附件内图表文字格式错误",
        illegal,
    )

    # ---- 6. Color and decoration ----
    color_v: list[Tag] = []
    decor_v: list[Tag] = []
    for el in soup.find_all(True):
        if not isinstance(el, Tag):
            continue
        st = _el_style_dict(el)
        # Fast skip: nothing color/decoration related declared inline.
        if not st.get("color") and not st.get("text-decoration") and not st.get(
            "border-bottom-style"
        ):
            continue
        col = (st.get("color") or "").strip().lower()
        if col and col not in (
            "inherit",
            "initial",
            "",
            "#000",
            "#000000",
            "black",
            "rgb(0,0,0)",
            "rgb(0, 0, 0)",
        ):
            if _color_normalized(st) and _color_normalized(st) != "rgb(0, 0, 0)":
                if el.get_text(strip=True):
                    color_v.append(el)
        tdec = (st.get("text-decoration") or "").lower()
        if "underline" in tdec and el.get_text(strip=True):
            decor_v.append(el)
        # A solid/dotted bottom border acts like an underline / emphasis mark.
        bbs = (st.get("border-bottom-style") or "").lower()
        if bbs in ("solid", "dotted") and el.get_text(strip=True):
            decor_v.append(el)
    col_ok = len(color_v) == 0 and len(decor_v) == 0
    add_result(
        "颜色与装饰",
        col_ok,
        "无彩色文字、无下划线、无着重号"
        if col_ok
        else f"发现{len(color_v)}处彩色文字,{len(decor_v)}处下划线/着重号",
        (color_v + decor_v)[:20],
    )

    # ---- 7. Page setup ----
    # NOTE(review): `page_valid` is assigned below but never read by the
    # final verdict — `page_ok` alone decides; likely leftover.
    page_valid = True
    margin_top = margin_bottom = margin_left = margin_right = None
    page_info = _parse_page_margins_from_html(raw_html)
    # Word often emits @page inside <style>; already parsed from raw_html.
    if page_info and "shorthand" in page_info:
        # e.g. margin: 2.54cm 3.18cm
        parts = page_info["shorthand"].split()
        if len(parts) >= 4:
            margin_top, margin_right, margin_bottom, margin_left = (
                parts[0],
                parts[1],
                parts[2],
                parts[3],
            )
        elif len(parts) == 2:
            margin_top = margin_bottom = parts[0]
            margin_left = margin_right = parts[1]
    elif page_info:
        margin_top = page_info.get("top")
        margin_bottom = page_info.get("bottom")
        margin_left = page_info.get("left")
        margin_right = page_info.get("right")

    # Fallback 1: shorthand margin on <body>.
    bst = _el_style_dict(body) if body else {}
    mraw = bst.get("margin", "")
    if mraw and not margin_top:
        margins = mraw.split()
        if len(margins) >= 1:
            margin_top = margins[0]
        if len(margins) >= 2:
            margin_right = margins[1]
        if len(margins) >= 3:
            margin_bottom = margins[2]
        if len(margins) >= 4:
            margin_left = margins[3]
        else:
            margin_left = margin_right

    # Fallback 2: individual margin-* properties on <body>.
    if not margin_top and body:
        margin_top = _get_inline_property(body, "margin-top")
        margin_bottom = _get_inline_property(body, "margin-bottom")
        margin_left = _get_inline_property(body, "margin-left")
        margin_right = _get_inline_property(body, "margin-right")

    if not any([margin_top, margin_bottom, margin_left, margin_right]) and not page_info:
        page_valid = False

    def m_ok(
        m: str | None,
        target: float,
    ) -> bool:
        """True when the numeric part of margin *m* is within 0.01 of *target*.

        NOTE(review): the 'cm' branch and the fallback branch are identical —
        units other than cm are compared as if they were cm.
        """
        if not m:
            return False
        s = m.strip()
        if "cm" in s:
            return abs(_num(s) - target) < 0.01
        return abs(_num(s) - target) < 0.01

    top_ok = m_ok(margin_top, 2.54) or (
        (margin_top or "") in ("2.54cm", "1in")
    )
    bottom_ok = m_ok(margin_bottom, 2.54) or (
        (margin_bottom or "") in ("2.54cm", "1in")
    )
    left_ok = m_ok(margin_left, 3.18) or (margin_left or "").startswith("3.18")
    right_ok = m_ok(margin_right, 3.18) or (margin_right or "").startswith("3.18")

    # Orientation heuristic: an explicit fixed width on <html> is treated as
    # landscape.
    html_tag = soup.find("html")
    w = _get_inline_property(html_tag, "width") if isinstance(html_tag, Tag) else ""  # type: ignore[arg-type]
    page_orientation = "横向" if w and w != "auto" and "%" not in w else "纵向"
    page_ok = bool(
        top_ok
        and bottom_ok
        and left_ok
        and right_ok
        and (page_orientation != "横向" or w in ("", "auto"))
    )
    if not margin_top:
        page_ok = False

    add_result(
        "页面设置",
        page_ok,
        "页面设置符合A4纵向/边距要求"
        if page_ok
        else "页面边距或纸张方向不符合要求",
    )

    return results
|
||||||
463
modules/exporter.py
Normal file
463
modules/exporter.py
Normal file
@ -0,0 +1,463 @@
|
|||||||
|
"""
|
||||||
|
Word 文档导出模块
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from docx import Document
|
||||||
|
from docx.shared import Pt, Cm, RGBColor
|
||||||
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||||
|
from docx.oxml import OxmlElement
|
||||||
|
from docx.oxml.ns import qn
|
||||||
|
|
||||||
|
import config
|
||||||
|
from utils.outline_numbering import format_heading_display
|
||||||
|
from utils.style_manager import get_preset, apply_preset_to_document
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Outline level -> (Word built-in style name, font size in pt, bold flag).
# NOTE(review): consumed by _build_document (not fully visible here) —
# confirm size/bold are actually applied in addition to the style name.
LEVEL_STYLES = {
    1: ('Heading 1', 16, True),
    2: ('Heading 2', 14, True),
    3: ('Heading 3', 13, False),
    4: ('Heading 4', 12, False),
}
|
||||||
|
|
||||||
|
|
||||||
|
def export_to_word(db_path: str, project_id: int, style_preset_name='standard') -> str:
    """Build the bid Word document and save it under ``config.EXPORT_DIR``.

    Args:
        db_path: Path to the project SQLite database.
        project_id: Primary key of the project to export.
        style_preset_name: Style preset chosen on the home page
            (「文件样式设置」).

    Returns:
        The generated file name (not the full path).

    Raises:
        ValueError: If the project does not exist.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Project name (also the fallback document title stem).
        cur = conn.cursor()
        cur.execute("SELECT name FROM projects WHERE id=?", (project_id,))
        project = cur.fetchone()
        if not project:
            raise ValueError(f'项目 {project_id} 不存在')
        project_name = project[0]

        # Bid title: first line of the parsed outline when available.
        cur.execute("SELECT outline FROM tender_data WHERE project_id=?", (project_id,))
        td = cur.fetchone()
        bid_title = project_name + '技术标书'
        if td and td[0]:
            first_line = td[0].strip().split('\n')[0].strip()
            if first_line:
                bid_title = first_line

        # All sections in document order.
        cur.execute('''
            SELECT section_number, section_title, level, is_leaf, content, intro_content
            FROM bid_sections
            WHERE project_id=?
            ORDER BY order_index
        ''', (project_id,))
        sections = cur.fetchall()

        preset = get_preset(style_preset_name)
        doc = _build_document(bid_title, sections, preset)

        # Save with a timestamped, filesystem-safe name.
        os.makedirs(config.EXPORT_DIR, exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # Fix: the previous filter tested membership in the literal string
        # '._- \u4e00-\u9fff', which is NOT a character range — it only
        # whitelisted the two boundary CJK characters.  CJK letters already
        # pass isalnum(), so the explicit "range" was misleading dead code.
        safe_name = ''.join(c for c in project_name if c.isalnum() or c in '._- ')
        filename = f'{safe_name}_{timestamp}.docx'
        filepath = os.path.join(config.EXPORT_DIR, filename)
        doc.save(filepath)
        logger.info(f'导出完成: {filepath} (使用样式预设: {style_preset_name})')
        return filename

    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
DISCLAIMER_TEXT = """\
|
||||||
|
免责声明
|
||||||
|
|
||||||
|
本工具仅供学习交流免费使用,所生成的技术方案不可直接用于投标,请务必人工核对。本工具不会通过任何平台进行销售,请用户注意辨别真伪。在您开始使用本AI标书制作服务之前,请认真阅读并同意以下关键条款。一旦您继续使用,即表示您已充分理解并认可本提示的全部内容。
|
||||||
|
|
||||||
|
服务定位
|
||||||
|
本工具为单机使用的AI标书辅助工具,旨在帮助您生成标书的参考素材。您需对最终自己编写的标书文件承担全部责任,包括审核、修改内容,确保其符合相关法律法规及项目要求。
|
||||||
|
|
||||||
|
准确性免责
|
||||||
|
本人不对AI生成内容的完全准确性与完整性作任何保证。您有义务自行核实所有关键信息,并自行承担因使用本工具所引发的一切后果。
|
||||||
|
|
||||||
|
标书风险
|
||||||
|
本工具所生成的素材文件仅作参考。若您使用(包括引用、修改或二次创作),需自行承担由此可能导致的废标、侵权等全部风险与责任,本人不承担任何相关责任。
|
||||||
|
|
||||||
|
责任限制
|
||||||
|
任何情形下,本人均不对因使用本服务而造成的任何直接、间接或衍生损失(例如利润损失、业务中断、数据丢失等)承担法律责任。
|
||||||
|
|
||||||
|
其他事项
|
||||||
|
本人保留随时修改或终止本服务的权利。本提示的解释及争议解决,均适用中华人民共和国法律。\
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _add_disclaimer_page(doc: Document) -> None:
    """Insert the disclaimer page at the start of the document."""
    # Page title: 16pt bold SimHei, centered.
    title_para = doc.add_paragraph()
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_run = title_para.add_run('免责声明')
    title_run.font.size = Pt(16)
    title_run.font.bold = True
    title_run.font.name = '黑体'
    # The East-Asian font must be set on the rPr element explicitly,
    # otherwise Word falls back to the Latin font for CJK glyphs.
    title_run._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')

    doc.add_paragraph()

    # Body paragraphs: skip the "免责声明" line and the blank line after it,
    # since the title was rendered separately above.
    body_lines = DISCLAIMER_TEXT.split('\n')[2:]
    for line in body_lines:
        p = doc.add_paragraph()
        stripped = line.strip()
        # Heuristic: a non-empty line without leading (full-width) space is a
        # sub-heading.  NOTE(review): this relies on body paragraphs in
        # DISCLAIMER_TEXT carrying leading indentation — confirm the constant
        # actually has it, otherwise every paragraph renders as a bold heading.
        is_section_title = bool(stripped) and not line.startswith(' ') and not line.startswith('\u3000')
        run = p.add_run(stripped if stripped else '')
        if is_section_title and stripped:
            run.font.bold = True
            run.font.size = Pt(11)
        else:
            run.font.size = Pt(10.5)
        run.font.name = 'Times New Roman'
        run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
        p.paragraph_format.space_after = Pt(4)
        _set_line_spacing_15(p)

    doc.add_page_break()
|
||||||
|
|
||||||
|
|
||||||
|
def _add_toc_tree_page(doc: Document, sections: list) -> None:
    """Insert a static tree-style table of contents after the title page.

    Entries are indented by heading level; this is plain text, not a Word
    TOC field, so it never needs a field refresh.
    """
    heading_para = doc.add_paragraph()
    heading_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    heading_run = heading_para.add_run('目录')
    heading_run.font.size = Pt(16)
    heading_run.font.bold = True
    heading_run.font.name = '黑体'
    heading_run._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')
    doc.add_paragraph()

    for entry in sections:
        number, caption, depth, _, _, _ = entry
        depth = min(int(depth), 4)
        display = format_heading_display(depth, str(number or ''), str(caption or ''))
        para = doc.add_paragraph()
        # 0.75 cm of extra indent per level below the first.
        para.paragraph_format.left_indent = Cm(0.75 * max(0, depth - 1))
        para.paragraph_format.space_after = Pt(3)
        entry_run = para.add_run(display)
        entry_run.font.size = Pt(12)
        entry_run.font.name = '宋体'
        entry_run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')

    doc.add_page_break()
|
||||||
|
|
||||||
|
|
||||||
|
def _build_document(bid_title: str, sections, preset=None) -> Document:
    """Assemble the full Word document: disclaimer page, title page,
    tree-style TOC page, then every section heading plus its intro/content.

    Args:
        bid_title: Title rendered on the title page.
        sections:  Iterable of 6-tuples
                   (section_number, title, level, is_leaf, content, intro).
        preset:    Style preset dict; defaults to the 'standard' preset.

    Returns:
        The populated python-docx Document.
    """
    if preset is None:
        preset = get_preset('standard')

    doc = Document()

    # ── Page setup (preset overrides the hardcoded A4 defaults) ─────────
    section_obj = doc.sections[0]
    m = preset.get('margins_cm', {'top': 2.5, 'bottom': 2.5, 'left': 3.0, 'right': 2.5})
    section_obj.page_width = Cm(21)      # A4 portrait
    section_obj.page_height = Cm(29.7)
    section_obj.left_margin = Cm(m.get('left', 3.0))
    section_obj.right_margin = Cm(m.get('right', 2.5))
    section_obj.top_margin = Cm(m.get('top', 2.5))
    section_obj.bottom_margin = Cm(m.get('bottom', 2.5))

    # Header / footer text from preset (reuses the default paragraph).
    if preset.get('header_text'):
        header = section_obj.header
        if header.paragraphs:
            p = header.paragraphs[0]
            p.text = preset['header_text']
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    if preset.get('footer_text'):
        footer = section_obj.footer
        if footer.paragraphs:
            p = footer.paragraphs[0]
            p.text = preset['footer_text']
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # ── Disclaimer page (first page) ────────────────────────────────────
    _add_disclaimer_page(doc)

    # ── Title page ──────────────────────────────────────────────────────
    title_para = doc.add_paragraph()
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_run = title_para.add_run(bid_title)
    title_run.font.size = Pt(22)
    title_run.font.bold = True
    title_run.font.color.rgb = RGBColor(0x1a, 0x56, 0xdb)  # accent blue
    title_run.font.name = preset.get('heading_font', '黑体')
    title_run._element.rPr.rFonts.set(qn('w:eastAsia'), preset.get('heading_font', '黑体'))

    doc.add_paragraph()

    date_para = doc.add_paragraph()
    date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    date_run = date_para.add_run(datetime.now().strftime('%Y年%m月'))
    date_run.font.size = Pt(14)
    date_run.font.name = '宋体'
    date_run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')

    doc.add_page_break()

    # ── Tree TOC page (after title page, before the body) ───────────────
    _add_toc_tree_page(doc, sections)

    # ── Section content ─────────────────────────────────────────────────
    for row in sections:
        section_number, title, level, is_leaf, content, intro = row
        level = min(int(level), 4)  # headings are capped at level 4

        # Heading with the full numbered caption, styled per preset.
        heading_text = format_heading_display(level, str(section_number or ''), str(title or ''))
        heading = doc.add_heading(level=level)
        heading.clear()
        run = heading.add_run(heading_text)
        _, font_size, bold = LEVEL_STYLES.get(level, ('Heading 4', 12, False))
        run.font.size = Pt(preset.get(f'heading{level}_size_pt', font_size))
        run.font.bold = bold
        run.font.name = preset.get('heading_font', '黑体' if level <= 2 else '宋体')
        run._element.rPr.rFonts.set(qn('w:eastAsia'), preset.get('heading_font', '黑体' if level <= 2 else '宋体'))

        # Chapter introduction (non-leaf nodes).
        if intro and intro.strip():
            _add_body_paragraphs(doc, intro, preset)

        # Body content (leaf nodes).
        if content and content.strip():
            _add_body_paragraphs(doc, content, preset)

    return doc
|
||||||
|
|
||||||
|
|
||||||
|
def _set_line_spacing_15(paragraph):
    """Set a paragraph to 1.5x line spacing (WD_LINE_SPACING.MULTIPLE x 1.5).

    Works on the raw ``w:spacing`` element so an existing spacing node is
    reused instead of a duplicate being appended.
    """
    # Use the module-level qn (already imported and used throughout this
    # file) instead of re-importing it locally on every call.
    pPr = paragraph._element.get_or_add_pPr()
    spacing = pPr.find(qn('w:spacing'))
    if spacing is None:
        spacing = OxmlElement('w:spacing')
        pPr.append(spacing)
    # lineRule 'auto' with line=360 twips: 240 (single spacing) x 1.5.
    spacing.set(qn('w:line'), '360')
    spacing.set(qn('w:lineRule'), 'auto')
|
||||||
|
|
||||||
|
|
||||||
|
# ── 图/表标记解析 ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_BLOCK_PATTERN = re.compile(
|
||||||
|
r'\[FIGURE:([^\]]+)\](.*?)\[/FIGURE\]'
|
||||||
|
r'|\[TABLE:([^\]]+)\](.*?)\[/TABLE\]',
|
||||||
|
re.DOTALL
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _split_content_blocks(text: str) -> list:
|
||||||
|
"""
|
||||||
|
将章节正文拆分为有序内容块列表:
|
||||||
|
{'type': 'text', 'content': '...'}
|
||||||
|
{'type': 'figure', 'title': '...', 'content': '...'}
|
||||||
|
{'type': 'table', 'title': '...', 'content': '...'}
|
||||||
|
"""
|
||||||
|
blocks = []
|
||||||
|
last = 0
|
||||||
|
for m in _BLOCK_PATTERN.finditer(text):
|
||||||
|
if m.start() > last:
|
||||||
|
blocks.append({'type': 'text', 'content': text[last:m.start()]})
|
||||||
|
if m.group(1) is not None:
|
||||||
|
blocks.append({'type': 'figure',
|
||||||
|
'title': m.group(1).strip(),
|
||||||
|
'content': m.group(2).strip()})
|
||||||
|
else:
|
||||||
|
blocks.append({'type': 'table',
|
||||||
|
'title': m.group(3).strip(),
|
||||||
|
'content': m.group(4).strip()})
|
||||||
|
last = m.end()
|
||||||
|
if last < len(text):
|
||||||
|
blocks.append({'type': 'text', 'content': text[last:]})
|
||||||
|
return blocks
|
||||||
|
|
||||||
|
|
||||||
|
def _set_para_shading(para, hex_fill: str):
    """Apply a background fill colour to a paragraph via a w:shd element."""
    shading = OxmlElement('w:shd')
    for attr, value in (('w:val', 'clear'), ('w:color', 'auto'), ('w:fill', hex_fill)):
        shading.set(qn(attr), value)
    para._element.get_or_add_pPr().append(shading)
|
||||||
|
|
||||||
|
|
||||||
|
def _set_cell_bg(cell, hex_fill: str):
    """Apply a background fill colour to a table cell via a w:shd element."""
    shading = OxmlElement('w:shd')
    for attr, value in (('w:val', 'clear'), ('w:color', 'auto'), ('w:fill', hex_fill)):
        shading.set(qn(attr), value)
    cell._tc.get_or_add_tcPr().append(shading)
|
||||||
|
|
||||||
|
|
||||||
|
def _set_cell_padding(cell, pt_value: float):
    """Set equal inner padding on all four sides of a table cell (in points)."""
    margins = OxmlElement('w:tcMar')
    twips = str(int(pt_value * 20))  # pt → twips (1 pt = 20 twips)
    for side in ('top', 'left', 'bottom', 'right'):
        edge = OxmlElement(f'w:{side}')
        edge.set(qn('w:w'), twips)
        edge.set(qn('w:type'), 'dxa')
        margins.append(edge)
    cell._tc.get_or_add_tcPr().append(margins)
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_set_eastasia(run, font_name: str):
    """Set the run's East-Asian (w:eastAsia) font, swallowing any failure.

    Reading ``run.font.size`` first forces python-docx to materialise the
    rPr element so the rFonts access below does not hit a missing node.
    """
    _ = run.font.size
    try:
        rpr = run._element.rPr
        rpr.rFonts.set(qn('w:eastAsia'), font_name)
    except Exception:
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def _add_block_caption(doc: Document, prefix: str, title: str):
    """Add a centered, bold caption line for a figure or table."""
    caption = doc.add_paragraph()
    caption.alignment = WD_ALIGN_PARAGRAPH.CENTER
    fmt = caption.paragraph_format
    fmt.space_before = Pt(8)
    fmt.space_after = Pt(3)
    caption_run = caption.add_run(f'{prefix}:{title}')
    caption_run.font.bold = True
    caption_run.font.size = Pt(11)
    caption_run.font.name = 'Times New Roman'
    _safe_set_eastasia(caption_run, '黑体')
|
||||||
|
|
||||||
|
|
||||||
|
def _add_figure_block(doc: Document, title: str, content: str):
    """Render figure content as a bordered, shaded text-diagram box.

    A single-cell 'Table Grid' table supplies the four-sided border, which
    looks more polished than a shaded paragraph alone.
    """
    _add_block_caption(doc, '图', title)

    box = doc.add_table(rows=1, cols=1)
    box.style = 'Table Grid'
    cell = box.cell(0, 0)
    _set_cell_bg(cell, 'EFF3FB')  # light blue-grey fill
    _set_cell_padding(cell, 5)    # 5pt inner padding

    for idx, text_line in enumerate(content.split('\n')):
        if idx:
            para = cell.add_paragraph()
        else:
            # Reuse (and empty) the cell's default first paragraph.
            para = cell.paragraphs[0]
            para.clear()
        para.paragraph_format.space_before = Pt(0)
        para.paragraph_format.space_after = Pt(1)
        run = para.add_run(text_line if text_line else ' ')
        run.font.size = Pt(9.5)
        run.font.name = 'Courier New'
        _safe_set_eastasia(run, '宋体')

    # Spacer paragraph after the diagram box.
    spacer = doc.add_paragraph()
    spacer.paragraph_format.space_after = Pt(8)
|
||||||
|
|
||||||
|
|
||||||
|
def _add_word_table(doc: Document, title: str, content: str):
    """Parse a Markdown table and render it as a styled Word table.

    Falls back to plain text (still with the caption) when no Markdown
    table rows can be parsed from *content*. Ragged rows are padded to
    the widest row's column count.
    """
    # Parse markdown lines, dropping separator rows such as |---|---|.
    raw_rows = []
    for line in content.strip().split('\n'):
        line = line.strip()
        if not line:
            continue
        if re.match(r'^\|[\s\-:| ]+\|$', line):
            continue  # separator row
        if line.startswith('|') and line.endswith('|'):
            cells = [c.strip() for c in line[1:-1].split('|')]
            raw_rows.append(cells)

    if not raw_rows:
        # No valid rows parsed: degrade to plain text.
        _add_block_caption(doc, '表', title)
        _add_plain_text(doc, content)
        return

    # Pad ragged rows to a uniform column count.
    col_count = max(len(r) for r in raw_rows)
    rows = [r + [''] * (col_count - len(r)) for r in raw_rows]

    _add_block_caption(doc, '表', title)

    table = doc.add_table(rows=len(rows), cols=col_count)
    table.style = 'Table Grid'

    for i, row_data in enumerate(rows):
        for j, cell_text in enumerate(row_data):
            cell = table.cell(i, j)
            para = cell.paragraphs[0]
            para.clear()
            # Header row centered and bold; body rows left-aligned.
            para.alignment = WD_ALIGN_PARAGRAPH.CENTER if i == 0 else WD_ALIGN_PARAGRAPH.LEFT
            run = para.add_run(cell_text)
            run.font.size = Pt(10)
            run.font.bold = (i == 0)
            run.font.name = 'Times New Roman'
            _safe_set_eastasia(run, '宋体')
            if i == 0:
                _set_cell_bg(cell, 'D6E4F7')  # light blue header fill

    # Spacer paragraph after the table.
    sp = doc.add_paragraph()
    sp.paragraph_format.space_after = Pt(6)
|
||||||
|
|
||||||
|
|
||||||
|
def _add_plain_text(doc: Document, text: str, preset=None):
    """Add plain body paragraphs (internal helper).

    Font family and size come from *preset* (defaults to the 'standard'
    preset); blank lines are skipped; each paragraph gets a first-line
    indent and 1.5x line spacing.
    """
    if preset is None:
        preset = get_preset('standard')
    body_size = Pt(preset.get('body_size_pt', 12))
    ascii_font = preset.get('body_font', 'Times New Roman')
    east_font = preset.get('body_font', '宋体')
    for raw_line in text.split('\n'):
        content = raw_line.strip()
        if not content:
            continue
        para = doc.add_paragraph()
        para.paragraph_format.first_line_indent = Pt(24)
        para.paragraph_format.space_after = Pt(6)
        _set_line_spacing_15(para)
        run = para.add_run(content)
        run.font.size = body_size
        run.font.name = ascii_font
        _safe_set_eastasia(run, east_font)
|
||||||
|
|
||||||
|
|
||||||
|
def _add_body_paragraphs(doc: Document, text: str, preset=None):
    """Render body text, dispatching [FIGURE:...] / [TABLE:...] markers to
    their dedicated renderers.

    Either renderer can be disabled via the preset's figure_enabled /
    table_enabled flags, in which case the marker's inner content falls
    through to plain text.
    """
    if preset is None:
        preset = get_preset('standard')
    for piece in _split_content_blocks(text):
        kind = piece['type']
        if kind == 'figure' and preset.get('figure_enabled', True):
            _add_figure_block(doc, piece['title'], piece['content'])
        elif kind == 'table' and preset.get('table_enabled', True):
            _add_word_table(doc, piece['title'], piece['content'])
        else:
            _add_plain_text(doc, piece['content'], preset)
|
||||||
|
|
||||||
|
|
||||||
1212
modules/generator.py
Normal file
1212
modules/generator.py
Normal file
File diff suppressed because it is too large
Load Diff
288
modules/knowledge.py
Normal file
288
modules/knowledge.py
Normal file
@ -0,0 +1,288 @@
|
|||||||
|
"""
|
||||||
|
企业知识库模块(无外部向量库依赖)
|
||||||
|
|
||||||
|
存储后端:SQLite(与主数据库共用同一文件)
|
||||||
|
- knowledge_vectors 表:文本块 + JSON 向量
|
||||||
|
- knowledge_files 表:文件元数据(已在 app.py init_db 中建立)
|
||||||
|
|
||||||
|
检索策略:
|
||||||
|
Qwen / OpenAI provider → Embedding API + 余弦相似度(语义检索)
|
||||||
|
DeepSeek / Ollama → SQL LIKE 关键词检索(降级)
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import threading
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import config
|
||||||
|
from utils.file_utils import extract_text, split_text_chunks
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# File names currently being ingested in the background; polled by the
# frontend so it can show an "in progress" state.
_processing_files: set = set()
_processing_lock = threading.Lock()

# Chunks per Embedding API batch request (keeps individual requests small).
_EMBED_BATCH = 16
|
||||||
|
|
||||||
|
|
||||||
|
# ─── 数据库 ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _conn() -> sqlite3.Connection:
    """Open a connection to the shared application SQLite database."""
    return sqlite3.connect(config.DB_PATH)
|
||||||
|
|
||||||
|
|
||||||
|
def _init_tables(cur: sqlite3.Cursor) -> None:
|
||||||
|
"""确保向量块表存在并创建优化索引(极速检索)。knowledge_files 已由 app.py init_db 创建"""
|
||||||
|
cur.execute('''
|
||||||
|
CREATE TABLE IF NOT EXISTS knowledge_vectors (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
file_name TEXT NOT NULL,
|
||||||
|
chunk_idx INTEGER NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
embedding TEXT,
|
||||||
|
UNIQUE(file_name, chunk_idx)
|
||||||
|
)
|
||||||
|
''')
|
||||||
|
# 优化索引:加速LIKE查询和向量检索时的文本过滤
|
||||||
|
cur.execute('CREATE INDEX IF NOT EXISTS idx_kv_file ON knowledge_vectors(file_name)')
|
||||||
|
cur.execute('CREATE INDEX IF NOT EXISTS idx_kv_text ON knowledge_vectors(text)') # helps FTS/LIKE
|
||||||
|
cur.execute('PRAGMA optimize') # SQLite auto-optimization
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Embedding API ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _get_embeddings_batch(texts: list[str]) -> list[list[float] | None]:
|
||||||
|
"""
|
||||||
|
调用 ai_client.get_embeddings (复用全局 semaphore 和客户端逻辑)。
|
||||||
|
不支持 Embedding 的 provider 返回全 None 列表。优化了并发控制。
|
||||||
|
"""
|
||||||
|
if not texts:
|
||||||
|
return []
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 使用统一 ai_client 接口,确保全局LLM semaphore生效,避免重复客户端创建
|
||||||
|
from utils import ai_client
|
||||||
|
embeddings = ai_client.get_embeddings(texts)
|
||||||
|
return embeddings
|
||||||
|
except Exception as e:
|
||||||
|
if "NotImplementedError" in str(type(e).__name__) or "不支持" in str(e):
|
||||||
|
logger.info('Embedding provider不支持,降级到关键词检索')
|
||||||
|
return [None] * len(texts)
|
||||||
|
logger.warning(f'Embedding API 调用失败,将使用关键词检索降级: {e}')
|
||||||
|
return [None] * len(texts)
|
||||||
|
|
||||||
|
|
||||||
|
def _cosine(a: list[float], b: list[float]) -> float:
|
||||||
|
"""纯 Python 余弦相似度,无需 numpy"""
|
||||||
|
dot = sum(x * y for x, y in zip(a, b))
|
||||||
|
na = math.sqrt(sum(x * x for x in a))
|
||||||
|
nb = math.sqrt(sum(x * x for x in b))
|
||||||
|
return dot / (na * nb) if na and nb else 0.0
|
||||||
|
|
||||||
|
|
||||||
|
# ─── 公开接口 ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def is_available() -> dict:
    """Report knowledge-base status; the store itself is always available
    (no external dependencies).

    Returns a dict with:
        available:   always True
        doc_count:   number of stored text chunks
        processing:  file names currently being ingested
        search_mode: 'vector' (semantic) or 'keyword' (degraded)
        error:       present only when status probing itself failed
    """
    with _processing_lock:
        processing = list(_processing_files)

    try:
        db = _conn()
        cur = db.cursor()
        _init_tables(cur)
        db.commit()

        cur.execute('SELECT COUNT(*) FROM knowledge_vectors')
        doc_count = cur.fetchone()[0]

        # Any stored vector means the Embedding API has worked at least once.
        cur.execute('SELECT 1 FROM knowledge_vectors WHERE embedding IS NOT NULL LIMIT 1')
        has_embedding = cur.fetchone() is not None

        db.close()

        provider = getattr(config, 'MODEL_PROVIDER', '')
        # Providers expected to expose an embedding endpoint.
        can_embed = provider in ('qwen', 'openai', 'kimi')
        mode = 'vector' if (has_embedding or can_embed) else 'keyword'

        return {
            'available': True,
            'doc_count': doc_count,
            'processing': processing,
            'search_mode': mode,
        }
    except Exception as e:
        # Degrade gracefully: still "available", but in keyword mode.
        return {
            'available': True,
            'doc_count': 0,
            'processing': processing,
            'search_mode': 'keyword',
            'error': str(e),
        }
|
||||||
|
|
||||||
|
|
||||||
|
def add_file(file_path: str, db_path: str) -> dict:
    """Chunk a file, batch-embed the chunks, and store them in SQLite.

    Runs on a background thread; ``_processing_files`` lets the frontend
    poll an "in progress" state. Re-adding a same-named file replaces its
    previous chunks.

    NOTE(review): *db_path* is currently unused — the connection comes
    from _conn() (config.DB_PATH); confirm whether it should be honoured.

    Returns:
        {'success': True, 'chunks': n} or {'success': False, 'error': msg}.
    """
    file_name = os.path.basename(file_path)
    with _processing_lock:
        _processing_files.add(file_name)

    try:
        text = extract_text(file_path)
        chunks = split_text_chunks(text, config.CHUNK_SIZE, config.CHUNK_OVERLAP)
        if not chunks:
            return {'success': False, 'error': '文件内容为空,无法入库'}

        # Batch embeddings (effective for Qwen/OpenAI providers; otherwise
        # every entry is None and retrieval degrades to keyword mode).
        embeddings: list[list[float] | None] = []
        for i in range(0, len(chunks), _EMBED_BATCH):
            batch = chunks[i:i + _EMBED_BATCH]
            embeddings.extend(_get_embeddings_batch(batch))

        db = _conn()
        try:
            cur = db.cursor()
            _init_tables(cur)

            # Drop stale chunks from any previously ingested same-named file.
            cur.execute('DELETE FROM knowledge_vectors WHERE file_name=?', (file_name,))

            for idx, (chunk, emb) in enumerate(zip(chunks, embeddings)):
                emb_json = json.dumps(emb) if emb is not None else None
                cur.execute(
                    'INSERT INTO knowledge_vectors (file_name, chunk_idx, text, embedding) VALUES (?,?,?,?)',
                    (file_name, idx, chunk, emb_json),
                )

            # Upsert the file-level metadata record.
            cur.execute('''
                INSERT OR REPLACE INTO knowledge_files (file_name, file_path, chunk_count, added_at)
                VALUES (?, ?, ?, ?)
            ''', (file_name, file_path, len(chunks), datetime.now()))

            db.commit()
        finally:
            db.close()

        logger.info(f'知识库入库完成: {file_name},{len(chunks)} 块'
                    f'{"(含向量)" if any(e is not None for e in embeddings) else "(关键词模式)"}')
        return {'success': True, 'chunks': len(chunks)}

    except Exception as e:
        logger.exception('知识库添加文件失败')
        return {'success': False, 'error': str(e)}
    finally:
        # Always clear the in-progress flag, success or failure.
        with _processing_lock:
            _processing_files.discard(file_name)
|
||||||
|
|
||||||
|
|
||||||
|
def search(query: str, top_k: int = None) -> list[str]:
    """Retrieve the text chunks most relevant to *query*.

    Strategy:
        - Vector mode: embed the query, rank the (up to 500 most recent)
          embedded chunks by cosine similarity.
        - Keyword fallback (providers without an Embedding API): SQL LIKE
          over up to 6 filtered query words.

    Returns at most *top_k* chunk texts; [] on error or empty store.
    """
    if top_k is None:
        top_k = config.TOP_K_KNOWLEDGE

    try:
        db = _conn()
        try:
            cur = db.cursor()
            _init_tables(cur)
            db.commit()

            cur.execute('SELECT COUNT(*) FROM knowledge_vectors')
            if cur.fetchone()[0] == 0:
                return []

            # ── Semantic (vector) retrieval ──────────────────────────────
            q_embs = _get_embeddings_batch([query])
            q_emb = q_embs[0] if q_embs else None

            if q_emb is not None:
                # Cap the scan at 500 rows (newest first) so a large store
                # does not trigger a full-table similarity pass.
                cur.execute(
                    '''SELECT text, embedding FROM knowledge_vectors
                       WHERE embedding IS NOT NULL
                       ORDER BY id DESC LIMIT 500'''
                )
                rows = cur.fetchall()
                if rows:
                    scored: list[tuple[float, str]] = []
                    for text, emb_json in rows:
                        try:
                            emb = json.loads(emb_json)
                            scored.append((_cosine(q_emb, emb), text))
                        except Exception:
                            continue  # skip undecodable embeddings
                    # NOTE: equal scores fall back to comparing the text.
                    scored.sort(reverse=True)
                    return [t for _, t in scored[:top_k]]

            # ── Keyword fallback (DeepSeek / Ollama: no Embedding API) ──
            # Drop purely numeric/ordinal tokens (e.g. "1.2", "一、") that
            # would match unrelated passages.
            import re as _re
            _num_pat = _re.compile(r'^[\d\.\-、一二三四五六七八九十]+$')
            words = [
                w.strip() for w in query.split()
                if len(w.strip()) > 1 and not _num_pat.match(w.strip())
            ][:6]
            if not words:
                # Nothing usable to match on: return the first chunks.
                cur.execute('SELECT text FROM knowledge_vectors LIMIT ?', (top_k,))
                return [r[0] for r in cur.fetchall()]

            conditions = ' OR '.join(['text LIKE ?' for _ in words])
            params = [f'%{w}%' for w in words] + [top_k]
            cur.execute(
                f'SELECT text FROM knowledge_vectors WHERE {conditions} LIMIT ?', params
            )
            return [r[0] for r in cur.fetchall()]

        finally:
            db.close()

    except Exception as e:
        logger.error(f'知识库检索失败: {e}')
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def list_files(db_path: str) -> list[dict]:
    """List files already ingested into the knowledge base.

    Returns [] on any failure (missing table, unreadable *db_path*, ...).
    """
    try:
        db = sqlite3.connect(db_path)
        cur = db.cursor()
        cur.execute(
            'SELECT file_name, chunk_count, added_at FROM knowledge_files ORDER BY added_at DESC'
        )
        records = cur.fetchall()
        db.close()
        return [
            {'name': name, 'chunks': chunks, 'added_at': added}
            for name, chunks, added in records
        ]
    except Exception:
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def delete_file(file_name: str, db_path: str) -> dict:
    """Remove every trace of *file_name* from the knowledge base
    (vector chunks and file metadata alike)."""
    try:
        db = _conn()
        cur = db.cursor()
        _init_tables(cur)
        cur.execute('DELETE FROM knowledge_vectors WHERE file_name=?', (file_name,))
        cur.execute('DELETE FROM knowledge_files WHERE file_name=?', (file_name,))
        db.commit()
        db.close()
    except Exception as e:
        logger.exception('知识库删除文件失败')
        return {'success': False, 'error': str(e)}
    return {'success': True}
|
||||||
206
modules/parser.py
Normal file
206
modules/parser.py
Normal file
@ -0,0 +1,206 @@
|
|||||||
|
"""
|
||||||
|
招标文件解析模块
|
||||||
|
流程:提取文本 → 生成摘要 → 提取评分要求 → 结构化JSON
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from datetime import datetime
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
from utils import ai_client, prompts as P
|
||||||
|
from utils.file_utils import extract_text, truncate_text
|
||||||
|
from utils.tender_kind_sections import (
|
||||||
|
get_tender_kind_classify_prompt,
|
||||||
|
parse_tender_kind_response,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_boq_file(db_path: str, project_id: int, file_path: str, file_name: str) -> None:
    """Background thread: parse a bill-of-quantities (BOQ) file, run the
    local structural analysis, generate an AI summary, then persist.

    boq_status transitions: none → parsing → done / error.
    """
    from utils.bill_analysis import analyze_boq_pages, categories_to_prompt_appendix
    from utils.boq_parser import extract_boq_pages

    conn = sqlite3.connect(db_path)
    try:
        _set_boq_status(conn, project_id, 'parsing', '正在提取工程量清单文本...')

        page_texts = extract_boq_pages(file_path)
        boq_text = '\n'.join(page_texts).strip()
        if not boq_text:
            raise ValueError('未能从文件中提取到有效内容,请检查文件格式')

        _set_boq_status(conn, project_id, 'parsing', '正在本地解析清单结构...')
        analysis = analyze_boq_pages(page_texts)
        boq_analysis_json = json.dumps(analysis, ensure_ascii=False)

        # Build the structured appendix only when real bill pages were found
        # (skipped for scanned documents or documents without bill pages).
        structured = ''
        if not analysis.get('scanned') and not analysis.get('no_bill_pages'):
            structured = categories_to_prompt_appendix(analysis)

        _set_boq_status(conn, project_id, 'parsing', '正在生成工程量清单摘要...')

        # Cap the prompt input at 10k chars to stay within model limits.
        summary_prompt = P.get_boq_summary_prompt(boq_text[:10000], structured)
        boq_summary = ai_client.chat(summary_prompt, temperature=0.2, max_tokens=2048)

        cur = conn.cursor()
        # Persist results; stored boq_text is truncated to 12k chars.
        cur.execute('''
            UPDATE tender_data
            SET boq_file_name=?, boq_text=?, boq_summary=?, boq_analysis_json=?,
                boq_status='done', boq_error='', updated_at=?
            WHERE project_id=?
        ''', (file_name, boq_text[:12000], boq_summary, boq_analysis_json, datetime.now(), project_id))
        conn.commit()
        logger.info(f'项目 {project_id} 工程量清单解析完成')

    except Exception as e:
        logger.exception(f'工程量清单解析失败 project_id={project_id}')
        _set_boq_status(conn, project_id, 'error', str(e))
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _set_boq_status(conn, project_id, status, message=''):
|
||||||
|
cur = conn.cursor()
|
||||||
|
cur.execute('''
|
||||||
|
UPDATE tender_data SET boq_status=?, boq_error=?, updated_at=?
|
||||||
|
WHERE project_id=?
|
||||||
|
''', (status, message, datetime.now(), project_id))
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_tender_file(db_path: str, project_id: int, file_path: str, file_name: str) -> None:
    """Background thread: parse a tender document and persist the results.

    Speed optimisation: after text extraction, three independent AI tasks
    (summary, rating requirements, tender-kind classification) run in
    parallel; the JSON structuring step that depends on the rating text
    runs afterwards. A global semaphore caps total concurrency at <=20.

    status transitions: pending → parsing → done / error.
    """
    conn = sqlite3.connect(db_path)
    try:
        _set_status(conn, project_id, 'parsing', '正在提取文件文本...')

        # 1. Extract raw text (I/O/CPU bound; fast enough serially).
        raw_text = extract_text(file_path)
        raw_text = truncate_text(raw_text, 60000)
        # Short excerpt is enough for the classification prompt.
        excerpt = (raw_text or '')[:15000]

        _set_status(conn, project_id, 'parsing', '并行生成摘要、评分要求和类型识别...')

        # Three independent AI tasks prepared for parallel execution.
        def _run_summary():
            prompt = P.get_project_summary_prompt(raw_text)
            return ai_client.chat(prompt, temperature=0.3, max_tokens=4096)

        def _run_rating():
            prompt = P.get_rating_requirements_prompt(raw_text)
            return ai_client.chat(prompt, temperature=0.2, max_tokens=4096)

        def _run_kind():
            prompt = get_tender_kind_classify_prompt(excerpt)
            raw = ai_client.chat(prompt, temperature=0.1, max_tokens=32)
            return parse_tender_kind_response(raw)

        # Bounded pool; the global ai_client semaphore caps total load.
        with ThreadPoolExecutor(max_workers=3, thread_name_prefix='parse') as executor:
            future_summary = executor.submit(_run_summary)
            future_rating = executor.submit(_run_rating)
            future_kind = executor.submit(_run_kind)

            summary = future_summary.result()
            rating_md = future_rating.result()
            tender_kind = future_kind.result()

        logger.info(f'项目 {project_id} 招标文件类型识别为: {tender_kind}')

        _set_status(conn, project_id, 'parsing', '正在结构化评分数据...')

        # 4. JSON structuring depends on rating_md, so it runs sequentially.
        rating_json_prompt = P.get_rating_json_prompt(rating_md)
        rating_json_raw = ai_client.chat(rating_json_prompt, temperature=0.1, max_tokens=2048)
        rating_json_str = _clean_json(rating_json_raw)

        # Persist all parse results in one upsert.
        _upsert_tender_data(conn, project_id, file_name, raw_text,
                            summary, rating_md, rating_json_str, tender_kind)

        # Deep integration: persist diagram/anon settings to projects table (auto-defaults)
        # Future: add AI extraction prompt for diagram intent and anon rules from raw_text
        cur = conn.cursor()
        cur.execute('''
            UPDATE projects SET
                enable_figure = COALESCE(enable_figure, 1),
                enable_table = COALESCE(enable_table, 1),
                anon_requirements = COALESCE(anon_requirements, '不得出现投标人身份信息')
            WHERE id = ?
        ''', (project_id,))
        conn.commit()

        _set_status(conn, project_id, 'done', '解析完成(已同步生成设置)')
        logger.info(f'项目 {project_id} 招标文件解析完成(并行加速完成,生成设置已打通)')

    except Exception as e:
        logger.exception(f'解析失败 project_id={project_id}')
        _set_status(conn, project_id, 'error', str(e))
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── 内部工具 ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _set_status(conn, project_id, status, message=''):
|
||||||
|
cur = conn.cursor()
|
||||||
|
cur.execute('''
|
||||||
|
INSERT INTO tender_data (project_id, status, error_message)
|
||||||
|
VALUES (?, ?, ?)
|
||||||
|
ON CONFLICT(project_id) DO UPDATE SET status=?, error_message=?, updated_at=?
|
||||||
|
''', (project_id, status, message, status, message, datetime.now()))
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def _upsert_tender_data(conn, project_id, file_name, raw_text,
|
||||||
|
summary, rating_md, rating_json_str,
|
||||||
|
tender_kind: str = 'engineering'):
|
||||||
|
cur = conn.cursor()
|
||||||
|
cur.execute('''
|
||||||
|
INSERT INTO tender_data
|
||||||
|
(project_id, file_name, raw_text, summary, rating_requirements, rating_json,
|
||||||
|
tender_kind, status, error_message)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, 'done', '')
|
||||||
|
ON CONFLICT(project_id) DO UPDATE SET
|
||||||
|
file_name=?, raw_text=?, summary=?, rating_requirements=?,
|
||||||
|
rating_json=?, tender_kind=?, status='done', error_message='', updated_at=?
|
||||||
|
''', (
|
||||||
|
project_id, file_name, raw_text, summary, rating_md, rating_json_str, tender_kind,
|
||||||
|
file_name, raw_text, summary, rating_md, rating_json_str, tender_kind, datetime.now()
|
||||||
|
))
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_json(raw: str) -> str:
|
||||||
|
"""尝试从 AI 返回中提取 JSON 字符串"""
|
||||||
|
# 去除 markdown 代码块
|
||||||
|
raw = re.sub(r'```(?:json)?\s*', '', raw)
|
||||||
|
raw = raw.replace('```', '').strip()
|
||||||
|
# 验证是否是有效 JSON
|
||||||
|
try:
|
||||||
|
json.loads(raw)
|
||||||
|
return raw
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
# 尝试提取 { ... } 部分
|
||||||
|
m = re.search(r'\{[\s\S]*\}', raw)
|
||||||
|
if m:
|
||||||
|
candidate = m.group(0)
|
||||||
|
try:
|
||||||
|
json.loads(candidate)
|
||||||
|
return candidate
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return raw
|
||||||
36
prompts/chapter_outline.txt
Normal file
36
prompts/chapter_outline.txt
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
- 角色:技术标书架构师
|
||||||
|
|
||||||
|
- 能力:
|
||||||
|
- 单章节深度解构能力
|
||||||
|
- 跨章节协同规划视野
|
||||||
|
- 评分权重动态分配策略
|
||||||
|
|
||||||
|
- 任务:根据招标文件概要、章节主题、评分要求,生成结构化的技术标书该章节的目录
|
||||||
|
|
||||||
|
- 输出要求:
|
||||||
|
- 采用四级嵌套编码体系(X.X.X.X)确保章节颗粒度可控
|
||||||
|
- 直接给出生成的章节大纲,禁止解释和引导词
|
||||||
|
- markdown格式输出
|
||||||
|
|
||||||
|
|
||||||
|
- 示例输出,以"服务进度保障措施"为例:
|
||||||
|
二、智慧物流系统全生命周期进度保障
|
||||||
|
2.1 基于BIM的进度协同管理平台
|
||||||
|
2.1.1 多级进度计划耦合模型
|
||||||
|
2.1.1.1 WBS-Milestone映射矩阵
|
||||||
|
2.1.1.2 Primavera P6进度基线
|
||||||
|
2.1.2 资源约束进度优化算法
|
||||||
|
2.1.2.1 基于CPM的缓冲区间动态分配
|
||||||
|
2.1.2.2 资源平滑度R=0.92
|
||||||
|
|
||||||
|
- 招标文件概要:
|
||||||
|
{summary}
|
||||||
|
|
||||||
|
- 章节主题:
|
||||||
|
{chapter}
|
||||||
|
|
||||||
|
- 评分要求:
|
||||||
|
{score}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
158
prompts/outlines.txt
Normal file
158
prompts/outlines.txt
Normal file
@ -0,0 +1,158 @@
|
|||||||
|
- 角色:技术标书架构师
|
||||||
|
- 任务:生成适配技术评分标准的技术标书目录
|
||||||
|
- 输出要求:
|
||||||
|
采用四级嵌套编码体系(X.X.X.X)下实现按需分层
|
||||||
|
直接给出生成的目录,禁止解释和引导词
|
||||||
|
|
||||||
|
- 约束控制:
|
||||||
|
根据项目生成标书的名称,如“XXXX项目技术标书”
|
||||||
|
总的章节数应该控制在8-10个
|
||||||
|
章节颗粒度与评分指标权重正相关
|
||||||
|
技术实施类章节必须达到四级深度,管理保障类章节允许三级结构
|
||||||
|
同级节点数量必须有波动区间:技术方案类(4-7)、实施保障类(2-4)、创新应用类(1-3)
|
||||||
|
目录的章节不能缺少包含以下关键词的内容:
|
||||||
|
- 对本项目的了解和分析
|
||||||
|
- 项目工作重难点分析
|
||||||
|
- 项目实施方案
|
||||||
|
- 服务进度保障措施
|
||||||
|
- 服务质量保障方案
|
||||||
|
- 合理化建议
|
||||||
|
- 服务承诺及处罚措施
|
||||||
|
目录不包含成本和预算内容,但要平衡项目预算、技术可行性以及技术的专业度
|
||||||
|
|
||||||
|
- 示例输出:
|
||||||
|
<example>
|
||||||
|
花岭新城BIM项目技术标书
|
||||||
|
一、总体实施方案
|
||||||
|
1.1 项目理解与需求分析
|
||||||
|
1.1.1 项目概述
|
||||||
|
1.1.1.1 建设地点及规模
|
||||||
|
1.1.1.2 工程地质勘察报告
|
||||||
|
1.1.1.3 抗震设防烈度与防火等级
|
||||||
|
1.1.1.4 建筑结构形式与建筑面积分布
|
||||||
|
1.1.2 项目背景
|
||||||
|
1.1.2.1 核心宗旨与目标
|
||||||
|
1.1.2.2 地理位置与项目规模
|
||||||
|
1.1.3 项目目标
|
||||||
|
1.1.3.1 就业机会与基础设施提升
|
||||||
|
1.1.3.2 乡村振兴与经济增长
|
||||||
|
1.1.4 项目特点
|
||||||
|
1.1.4.1 框筒结构抗震性能
|
||||||
|
1.1.4.2 分阶段工程地质勘察
|
||||||
|
1.1.4.3 功能区域多样化
|
||||||
|
|
||||||
|
二、建筑设计
|
||||||
|
2.1 主要设计依据
|
||||||
|
2.1.1 国家标准与规范
|
||||||
|
2.1.2 行业标准与图集
|
||||||
|
2.2 建筑结构设计
|
||||||
|
2.2.1 结构形式
|
||||||
|
2.2.2 结构材料
|
||||||
|
2.2.3 结构布局
|
||||||
|
2.2.4 结构经济指标
|
||||||
|
2.2.5 结构细节设计
|
||||||
|
2.3 建筑功能布局
|
||||||
|
2.3.1 C1#楼(厂房)
|
||||||
|
2.3.1.1 功能分区明确
|
||||||
|
2.3.1.2 流线优化与安全性
|
||||||
|
2.3.2 配电房
|
||||||
|
2.3.2.1 设计目标与设备布置
|
||||||
|
2.3.2.2 空间规划与电气主接线方案
|
||||||
|
2.3.3 外廊及架空建筑
|
||||||
|
2.3.3.1 功能区域与景观设计
|
||||||
|
2.3.3.2 光照与通风优化
|
||||||
|
2.4 建筑材料选用
|
||||||
|
2.5 建筑外观设计
|
||||||
|
2.6 建筑室内布局
|
||||||
|
2.6.1 功能分区与设计要点
|
||||||
|
2.7 建筑安全和消防设计
|
||||||
|
2.7.1 建筑安全体系
|
||||||
|
2.7.2 消防系统设计
|
||||||
|
2.8 建筑节能设计
|
||||||
|
2.8.1 节能措施与绿色建材
|
||||||
|
2.8.2 雨水收集系统
|
||||||
|
|
||||||
|
三、结构设计
|
||||||
|
3.1 结构形式
|
||||||
|
3.2 结构材料
|
||||||
|
3.2.1 混凝土与钢材选用
|
||||||
|
3.3 结构布局
|
||||||
|
3.3.1 结构柱网与通风疏散通道
|
||||||
|
3.4 结构经济指标
|
||||||
|
3.4.1 抗震设计要求与用材控制
|
||||||
|
3.5 结构细节设计
|
||||||
|
3.5.1 基础设计与钢结构细节
|
||||||
|
3.5.2 混凝土结构与抗震设计
|
||||||
|
3.6 结构分析与计算
|
||||||
|
|
||||||
|
四、给排水设计
|
||||||
|
4.1 引言
|
||||||
|
4.2 供水系统设计
|
||||||
|
4.2.1 供水管道与消防水源
|
||||||
|
4.2.2 节水设计与雨水收集
|
||||||
|
4.3 排水系统设计
|
||||||
|
4.3.1 排水管道与雨水管理
|
||||||
|
4.3.2 污水处理与分流制度
|
||||||
|
4.4 给排水设备选择
|
||||||
|
4.5 细节设计
|
||||||
|
4.6 监测与维护
|
||||||
|
|
||||||
|
五、暖通设计
|
||||||
|
5.1 引言
|
||||||
|
5.2 供暖系统设计
|
||||||
|
5.2.1 供暖方式与设备选择
|
||||||
|
5.2.2 温度控制系统
|
||||||
|
5.3 通风系统设计
|
||||||
|
5.3.1 通风方式与设备选择
|
||||||
|
5.3.2 空气质量控制
|
||||||
|
5.4 空调系统设计
|
||||||
|
5.4.1 空调方式与设备选择
|
||||||
|
5.4.2 温湿度控制系统
|
||||||
|
5.5 热水系统设计
|
||||||
|
5.6 细节设计与监测维护
|
||||||
|
|
||||||
|
|
||||||
|
六、BIM设计
|
||||||
|
6.1 项目总图与单体建筑设计
|
||||||
|
6.2 道路与排水设计
|
||||||
|
6.3 电气系统设计
|
||||||
|
6.4 绿化设计
|
||||||
|
6.5 BIM协同设计与施工管理
|
||||||
|
6.6 数据管理与培训支持
|
||||||
|
|
||||||
|
七、设计说明
|
||||||
|
7.1 项目设计依据
|
||||||
|
7.2 设计原则
|
||||||
|
7.3 结构经济合理化
|
||||||
|
7.4 建筑功能分区
|
||||||
|
7.5 设计细节要求
|
||||||
|
|
||||||
|
八、合理化建议
|
||||||
|
8.1 建筑专业合理化建议
|
||||||
|
8.2 结构专业合理化建议
|
||||||
|
8.3 给排水专业合理化建议
|
||||||
|
8.4 暖通专业合理化建议
|
||||||
|
8.5 BIM专业合理化建议
|
||||||
|
8.6 技术和工艺方面的建议
|
||||||
|
8.7 成本和预算方面的建议
|
||||||
|
8.8 时间和进度方面的建议
|
||||||
|
8.9 施工质量管理方面的建议
|
||||||
|
8.10 质量和安全方面的建议
|
||||||
|
8.11 环境和可持续性方面的建议
|
||||||
|
|
||||||
|
九、施工进度安排
|
||||||
|
9.1 施工进度安排
|
||||||
|
9.2 施工进度跟踪与管理
|
||||||
|
9.3 施工质量管理
|
||||||
|
9.4 施工现场管理
|
||||||
|
9.5 施工结项与验收
|
||||||
|
|
||||||
|
十、本项目工作重点难点分析
|
||||||
|
10.1 工程特点与设计工作难点
|
||||||
|
10.2 重点与难点分析
|
||||||
|
10.3 综合解决措施
|
||||||
|
</example>
|
||||||
|
|
||||||
|
- 招标文件内容:
|
||||||
|
{document_text}
|
||||||
|
"""
|
||||||
155
prompts/outlines_with_rating.txt
Normal file
155
prompts/outlines_with_rating.txt
Normal file
@ -0,0 +1,155 @@
|
|||||||
|
- 角色:技术标书架构师
|
||||||
|
- 任务:生成适配技术评分标准的技术标书目录
|
||||||
|
- 输出要求:
|
||||||
|
采用四级嵌套编码体系(X.X.X.X)下实现按需分层
|
||||||
|
直接给出生成的目录,禁止解释和引导词
|
||||||
|
|
||||||
|
- 约束控制:
|
||||||
|
根据项目生成标书的名称,如“XXXX项目技术标书”
|
||||||
|
总的章节数应该控制在8-10个,不超过10个
|
||||||
|
目录的章节必须按照技术评分标准的项目生成,题目应包括技术评分项目中的关键词:
|
||||||
|
章节颗粒度与评分指标权重正相关
|
||||||
|
技术方案类章节必须达到四级深度,管理保障类章节允许三级结构
|
||||||
|
同级节点数量必须有波动区间:技术方案类(4-7)、实施保障类(2-4)、创新应用类(1-3)
|
||||||
|
目录禁止包含报价、团队、资质、文件等商务性质的章节
|
||||||
|
|
||||||
|
- 示例输出:
|
||||||
|
<example>
|
||||||
|
花岭新城BIM项目技术标书
|
||||||
|
一、总体实施方案
|
||||||
|
1.1 项目理解与需求分析
|
||||||
|
1.1.1 项目概述
|
||||||
|
1.1.1.1 建设地点及规模
|
||||||
|
1.1.1.2 工程地质勘察报告
|
||||||
|
1.1.1.3 抗震设防烈度与防火等级
|
||||||
|
1.1.1.4 建筑结构形式与建筑面积分布
|
||||||
|
1.1.2 项目背景
|
||||||
|
1.1.2.1 核心宗旨与目标
|
||||||
|
1.1.2.2 地理位置与项目规模
|
||||||
|
1.1.3 项目目标
|
||||||
|
1.1.3.1 就业机会与基础设施提升
|
||||||
|
1.1.3.2 乡村振兴与经济增长
|
||||||
|
1.1.4 项目特点
|
||||||
|
1.1.4.1 框筒结构抗震性能
|
||||||
|
1.1.4.2 分阶段工程地质勘察
|
||||||
|
1.1.4.3 功能区域多样化
|
||||||
|
|
||||||
|
二、建筑设计
|
||||||
|
2.1 主要设计依据
|
||||||
|
2.1.1 国家标准与规范
|
||||||
|
2.1.2 行业标准与图集
|
||||||
|
2.2 建筑结构设计
|
||||||
|
2.2.1 结构形式
|
||||||
|
2.2.2 结构材料
|
||||||
|
2.2.3 结构布局
|
||||||
|
2.2.4 结构经济指标
|
||||||
|
2.2.5 结构细节设计
|
||||||
|
2.3 建筑功能布局
|
||||||
|
2.3.1 C1#楼(厂房)
|
||||||
|
2.3.1.1 功能分区明确
|
||||||
|
2.3.1.2 流线优化与安全性
|
||||||
|
2.3.2 配电房
|
||||||
|
2.3.2.1 设计目标与设备布置
|
||||||
|
2.3.2.2 空间规划与电气主接线方案
|
||||||
|
2.3.3 外廊及架空建筑
|
||||||
|
2.3.3.1 功能区域与景观设计
|
||||||
|
2.3.3.2 光照与通风优化
|
||||||
|
2.4 建筑材料选用
|
||||||
|
2.5 建筑外观设计
|
||||||
|
2.6 建筑室内布局
|
||||||
|
2.6.1 功能分区与设计要点
|
||||||
|
2.7 建筑安全和消防设计
|
||||||
|
2.7.1 建筑安全体系
|
||||||
|
2.7.2 消防系统设计
|
||||||
|
2.8 建筑节能设计
|
||||||
|
2.8.1 节能措施与绿色建材
|
||||||
|
2.8.2 雨水收集系统
|
||||||
|
|
||||||
|
三、结构设计
|
||||||
|
3.1 结构形式
|
||||||
|
3.2 结构材料
|
||||||
|
3.2.1 混凝土与钢材选用
|
||||||
|
3.3 结构布局
|
||||||
|
3.3.1 结构柱网与通风疏散通道
|
||||||
|
3.4 结构经济指标
|
||||||
|
3.4.1 抗震设计要求与用材控制
|
||||||
|
3.5 结构细节设计
|
||||||
|
3.5.1 基础设计与钢结构细节
|
||||||
|
3.5.2 混凝土结构与抗震设计
|
||||||
|
3.6 结构分析与计算
|
||||||
|
|
||||||
|
四、给排水设计
|
||||||
|
4.1 引言
|
||||||
|
4.2 供水系统设计
|
||||||
|
4.2.1 供水管道与消防水源
|
||||||
|
4.2.2 节水设计与雨水收集
|
||||||
|
4.3 排水系统设计
|
||||||
|
4.3.1 排水管道与雨水管理
|
||||||
|
4.3.2 污水处理与分流制度
|
||||||
|
4.4 给排水设备选择
|
||||||
|
4.5 细节设计
|
||||||
|
4.6 监测与维护
|
||||||
|
|
||||||
|
五、暖通设计
|
||||||
|
5.1 引言
|
||||||
|
5.2 供暖系统设计
|
||||||
|
5.2.1 供暖方式与设备选择
|
||||||
|
5.2.2 温度控制系统
|
||||||
|
5.3 通风系统设计
|
||||||
|
5.3.1 通风方式与设备选择
|
||||||
|
5.3.2 空气质量控制
|
||||||
|
5.4 空调系统设计
|
||||||
|
5.4.1 空调方式与设备选择
|
||||||
|
5.4.2 温湿度控制系统
|
||||||
|
5.5 热水系统设计
|
||||||
|
5.6 细节设计与监测维护
|
||||||
|
|
||||||
|
|
||||||
|
六、BIM设计
|
||||||
|
6.1 项目总图与单体建筑设计
|
||||||
|
6.2 道路与排水设计
|
||||||
|
6.3 电气系统设计
|
||||||
|
6.4 绿化设计
|
||||||
|
6.5 BIM协同设计与施工管理
|
||||||
|
6.6 数据管理与培训支持
|
||||||
|
|
||||||
|
七、设计说明
|
||||||
|
7.1 项目设计依据
|
||||||
|
7.2 设计原则
|
||||||
|
7.3 结构经济合理化
|
||||||
|
7.4 建筑功能分区
|
||||||
|
7.5 设计细节要求
|
||||||
|
|
||||||
|
八、合理化建议
|
||||||
|
8.1 建筑专业合理化建议
|
||||||
|
8.2 结构专业合理化建议
|
||||||
|
8.3 给排水专业合理化建议
|
||||||
|
8.4 暖通专业合理化建议
|
||||||
|
8.5 BIM专业合理化建议
|
||||||
|
8.6 技术和工艺方面的建议
|
||||||
|
8.7 成本和预算方面的建议
|
||||||
|
8.8 时间和进度方面的建议
|
||||||
|
8.9 施工质量管理方面的建议
|
||||||
|
8.10 质量和安全方面的建议
|
||||||
|
8.11 环境和可持续性方面的建议
|
||||||
|
|
||||||
|
九、施工进度安排
|
||||||
|
9.1 施工进度安排
|
||||||
|
9.2 施工进度跟踪与管理
|
||||||
|
9.3 施工质量管理
|
||||||
|
9.4 施工现场管理
|
||||||
|
9.5 施工结项与验收
|
||||||
|
|
||||||
|
十、本项目工作重点难点分析
|
||||||
|
10.1 工程特点与设计工作难点
|
||||||
|
10.2 重点与难点分析
|
||||||
|
10.3 综合解决措施
|
||||||
|
</example>
|
||||||
|
|
||||||
|
- 招标文件摘要:
|
||||||
|
{summary}
|
||||||
|
|
||||||
|
- 技术评分标准:
|
||||||
|
{rating}
|
||||||
|
|
||||||
|
"""
|
||||||
92
prompts/project_summary.txt
Normal file
92
prompts/project_summary.txt
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
- 角色:招标文件编写专家,精通招标文件结构化、摘要编写
|
||||||
|
|
||||||
|
- 任务:根据用户提供的项目招标文件内容,生成一份专业、清晰的结构化摘要
|
||||||
|
|
||||||
|
- 要求:
|
||||||
|
|
||||||
|
一、摘要框架
|
||||||
|
1. 项目概况
|
||||||
|
- 项目名称
|
||||||
|
- 建设地点
|
||||||
|
- 工程性质(新建/改建/扩建)
|
||||||
|
- 核心建设内容
|
||||||
|
- 关键工程量指标
|
||||||
|
- 特殊施工工艺(如顶管/盾构等)
|
||||||
|
- 项目概况
|
||||||
|
|
||||||
|
2. 技术要求体系
|
||||||
|
- 专业监测要求(分项列出核心监测指标)
|
||||||
|
- 技术标准规范
|
||||||
|
- 质量管控要点
|
||||||
|
- 特殊工艺标准
|
||||||
|
|
||||||
|
3. 交付物矩阵
|
||||||
|
- 阶段性成果清单(含时间节点)
|
||||||
|
- 最终交付文件要求
|
||||||
|
- 成果验收标准
|
||||||
|
- 备案审批流程
|
||||||
|
|
||||||
|
4. 商务条款摘要
|
||||||
|
- 合同期限
|
||||||
|
- 支付结构
|
||||||
|
- 报价约束条件
|
||||||
|
- 违约条款要点
|
||||||
|
- 知识产权约定
|
||||||
|
|
||||||
|
5. 资质要求矩阵
|
||||||
|
- 企业资质门槛
|
||||||
|
- 人员资格要求
|
||||||
|
- 设备配置标准
|
||||||
|
- 同类项目经验
|
||||||
|
|
||||||
|
6. 评标要素体系
|
||||||
|
- 技术评分维度
|
||||||
|
- 商务评分权重
|
||||||
|
- 否决性条款
|
||||||
|
- 实质性条款
|
||||||
|
- 围标识别机制
|
||||||
|
|
||||||
|
|
||||||
|
二、处理规范
|
||||||
|
1. 信息抽取规则:
|
||||||
|
- 采用三级信息提炼法(关键数据→技术参数→约束条件)
|
||||||
|
- 识别并标注法定强制性条款(★号条款)
|
||||||
|
- 提取特殊工艺参数(例如顶管直径、沉井尺寸等)
|
||||||
|
|
||||||
|
2. 结构化呈现要求:
|
||||||
|
- 使用Markdown分级标题系统
|
||||||
|
- 技术参数格式化处理
|
||||||
|
- 流程节点采用时间轴呈现
|
||||||
|
- 关键数据突出显示(例如预算金额、最高限价)
|
||||||
|
|
||||||
|
3. 专业术语处理:
|
||||||
|
- 保持行业术语准确性
|
||||||
|
- 工程计量单位标准化转换
|
||||||
|
- 法律条款原文引述
|
||||||
|
|
||||||
|
三、输出示例
|
||||||
|
1.确保包含但不仅限于:
|
||||||
|
- 项目背景的技术参数分解
|
||||||
|
- 监测要求的分类归纳
|
||||||
|
- 成果交付的阶段性要求
|
||||||
|
- 商务条款的要点提炼
|
||||||
|
|
||||||
|
四、质量保障
|
||||||
|
1. 完整性核查清单:
|
||||||
|
- 验证五证要求(资质/业绩/人员/设备/资金)
|
||||||
|
- 检查三大核心条款(技术/商务/法律)
|
||||||
|
- 确认关键日期节点(工期/交付期/质保期)
|
||||||
|
|
||||||
|
2. 风险提示机制:
|
||||||
|
- 标注异常约束条款
|
||||||
|
- 识别排他性要求
|
||||||
|
- 提示潜在履约风险点
|
||||||
|
|
||||||
|
请严格按照上述结构化框架处理输入的招标文件,生成专业、准确、易读的项目摘要报告。
|
||||||
|
输出内容需符合工程领域专业规范,重点数据需二次核验确保准确性。
|
||||||
|
严格按照招标文件的内容,确保输出内容的完整性。
|
||||||
|
直接给出摘要,禁止说明和引导词。
|
||||||
|
|
||||||
|
- 用户提供的招标文件内容如下:
|
||||||
|
{bid_document}
|
||||||
|
|
||||||
23
prompts/rating_json.txt
Normal file
23
prompts/rating_json.txt
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
- 任务:从工程项目招标文件中提取技术评分要求,并以严格的JSON格式输出。
|
||||||
|
|
||||||
|
- 要求:
|
||||||
|
必须生成完整有效的JSON对象,不使用JSON之外的文本说明
|
||||||
|
数值类型字段不添加单位符号
|
||||||
|
包含所有的评分项及其权重分配
|
||||||
|
特殊说明字段仅在存在否决条款(强制性条款)时出现
|
||||||
|
|
||||||
|
- 输出结构(必须严格遵守根字段名与数组名,便于后续章节字数与要点映射):
|
||||||
|
{
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"id": "唯一短标识,如 T01",
|
||||||
|
"name": "评分项名称(与招标文件表述一致或精简概括)",
|
||||||
|
"weight": 数值型权重或分值(如 10 表示 10 分或 10%),
|
||||||
|
"keywords": ["与本项相关的可选关键词1", "关键词2"]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"notes": "可选:否决条款、阶梯得分等特殊说明;无则写空字符串"
|
||||||
|
}
|
||||||
|
|
||||||
|
- 技术评分要求内容如下:
|
||||||
|
{tech_rating}
|
||||||
46
prompts/rating_requirements copy.txt
Normal file
46
prompts/rating_requirements copy.txt
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
- 角色:招标文件信息提取专家,精通技术评分/技术评审要求的提取
|
||||||
|
|
||||||
|
- 任务:请严格按照以下步骤分析提供的招标文件内容,并完整提取所有技术评分标准:
|
||||||
|
|
||||||
|
- 步骤与要求:
|
||||||
|
|
||||||
|
1. **结构解析**
|
||||||
|
- 首先识别文件整体结构,仅提取“技术评分”/“技术评审”部分
|
||||||
|
- 标注评分大类的权重占比(如出现)
|
||||||
|
|
||||||
|
2. **要素提取**
|
||||||
|
对“技术评分”板块进行深度解析,要求:
|
||||||
|
- 提取评分的全部细节,不能省略
|
||||||
|
- 明确列出技术评分的标准,如有(如"ISO认证+3分"、"项目经验每年加1分")
|
||||||
|
|
||||||
|
3. **结果呈现样例**
|
||||||
|
参考以下示例输出markdown结构化格式:
|
||||||
|
|
||||||
|
# 招标技术评分细则
|
||||||
|
|
||||||
|
## 技术评分(80分)
|
||||||
|
- 对本项目的了解和分析(12分)
|
||||||
|
→ 对本项目的理解与项目背景把握准确,对本项目特点、实施目标和定位内容详尽,完全满足项目需要,科学、合理、针对性强、合理可行的,得12分;对本项目的理解与项目背景有一定把握,对本项目特点、实施目标和定位有阐述说明,基本可行的,得8分;对本项目的理解与项目背景把握片面,对本项目特点、实施目标和定位理解有较大偏差,可行性较差的,得4分;未提供不得分。
|
||||||
|
→ 合理可行指:( 1)完全响应采购需求;( 2)相关内容的表述具有针对性,全面、具体。
|
||||||
|
→ 基本可行指:( 1)响应采购需求有微小偏差;( 2)相关 内容的表述有一定的层次性、针对性,但全面性不够。
|
||||||
|
→ 可行性较差指:( 1)响应采购需求有较大偏差;( 2)相 关内容的表述针对性弱、全面性方面欠缺较大。
|
||||||
|
- 项目工作重难点分析(12分)
|
||||||
|
→ 根据供应商针对本项目工作重难点分析与解决方案的科学性、合理性且满足项目实际情况进行评分,项目工作重难点分析到位、有针对性、完全符合项目实际情况,对应的解决方案合理可行的,得12分;
|
||||||
|
项目工作重难点内容基本准确、针对性一般、基本符合项目实际,对应的解决方案基本可行的,得8分;
|
||||||
|
项目工作重难点分析一般,对应的解决方案一般、可行性较差的,得4分;未提供不得分。
|
||||||
|
→ 合理可行指:( 1)完全响应采购需求;( 2)相关内容的表述具有针对性,全面、具体。
|
||||||
|
→ 基本可行指:( 1)响应采购需求有微小偏差;( 2)相关 内容的表述有一定的层次性、针对性,但全面性不够。
|
||||||
|
→ 可行性较差指:( 1)响应采购需求有较大偏差;( 2)相 关内容的表述针对性弱、全面性方面欠缺较大。
|
||||||
|
- 项目实施方案(12分)
|
||||||
|
(继续展开...)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
请严格按照上述结构化框架处理输入的招标文件,生成专业、准确的项目技术评分/评审要求。
|
||||||
|
严格按照招标文件的内容,确保输出内容的完整性。
|
||||||
|
直接输出评分/评审要求,禁止说明和引导词。
|
||||||
|
|
||||||
|
- 招标文件内容如下:
|
||||||
|
{bid_document}
|
||||||
|
|
||||||
|
|
||||||
43
prompts/rating_requirements.txt
Normal file
43
prompts/rating_requirements.txt
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
- 角色:招标文件信息提取专家,精通技术评分/技术评审要求的提取
|
||||||
|
|
||||||
|
- 任务:请严格按照以下步骤分析提供的招标文件内容,并完整提取所有技术评分标准:
|
||||||
|
|
||||||
|
- 步骤与要求:
|
||||||
|
|
||||||
|
1. **结构解析**
|
||||||
|
- 首先识别文件整体结构,仅提取“技术评分”/“技术评审要求”部分
|
||||||
|
- 标注评分大类的权重占比(如出现)
|
||||||
|
|
||||||
|
2. **要素提取**
|
||||||
|
对“技术评分”板块进行深度解析,要求:
|
||||||
|
- 提取评分的全部细节,不能省略
|
||||||
|
- 明确列出量化指标,如有(如"ISO认证+3分"、"项目经验每年加1分")
|
||||||
|
- 区分强制性条款(必须满足项)与竞争性条款(择优评分项),如有
|
||||||
|
- 标注特殊要求(本地化服务、专利数量、团队资质等),如有
|
||||||
|
|
||||||
|
3. **异常识别**
|
||||||
|
- 标出表述模糊的评分项(如"酌情加分""优/良/差等级")
|
||||||
|
- 识别可能存在的矛盾条款
|
||||||
|
- 提示需要注意的隐藏评分点(如投标格式错误扣分项)
|
||||||
|
|
||||||
|
4. **结果呈现样例**
|
||||||
|
参考以下示例输出markdown结构化格式:
|
||||||
|
|
||||||
|
# 招标技术评分细则
|
||||||
|
|
||||||
|
## 技术评分(50%)
|
||||||
|
- 系统架构设计(20%)
|
||||||
|
→ 要求:支持分布式部署(未满足直接废标)
|
||||||
|
→ 加分项:采用微服务架构+3分
|
||||||
|
(继续展开...)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
请严格按照上述结构化框架处理输入的招标文件,生成专业、准确的项目技术评分要求。
|
||||||
|
严格按照招标文件的内容,确保输出内容的完整性。
|
||||||
|
直接输出评分要求,禁止说明和引导词。
|
||||||
|
|
||||||
|
- 招标文件内容如下:
|
||||||
|
{bid_document}
|
||||||
|
|
||||||
|
|
||||||
45
prompts/scoring_rules.txt
Normal file
45
prompts/scoring_rules.txt
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
"你是一名专业的招标文件分析师,请按照以下步骤处理用户提供的项目招标文件内容:
|
||||||
|
|
||||||
|
1. **结构识别**
|
||||||
|
- 仔细解析文件结构,定位'评分标准'、'评审办法'、'投标人须知'等关键章节
|
||||||
|
- 特别注意包含'分值'、'评分项'、'权重'等关键词的段落
|
||||||
|
|
||||||
|
2. **核心要素提取**
|
||||||
|
- 系统提取以下要素形成结构化表格:
|
||||||
|
│ 类别 │ 评分项名称 │ 分值权重 │ 具体要求 │ 否决条款 │
|
||||||
|
- 分类标准:
|
||||||
|
● 技术部分(方案设计、实施能力、技术创新等)
|
||||||
|
● 商务部分(资质证明、业绩案例、团队经验等)
|
||||||
|
● 价格部分(报价合理性、计价方式等)
|
||||||
|
● 其他专项(售后服务、本地化服务等)
|
||||||
|
|
||||||
|
3. **深度分析**
|
||||||
|
- 计算权重配比(示例:技术60% = 方案设计30% + 实施能力20% + 创新10%)
|
||||||
|
- 识别否决性条款(如"▲"标记项或特定强制要求)
|
||||||
|
- 标注特殊评分规则:阶梯得分、区间赋分、横向比较等机制
|
||||||
|
|
||||||
|
4. **风险提示**
|
||||||
|
- 标出易被忽视的得分点(如ISO认证、专利数量等)
|
||||||
|
- 识别矛盾条款(如总分值≠100%的情况)
|
||||||
|
- 提示资质门槛要求(注册资金、特定资质证书等)
|
||||||
|
|
||||||
|
5. **输出格式**
|
||||||
|
采用Markdown输出以下结构:
|
||||||
|
```markdown
|
||||||
|
# 招标评分要点汇总
|
||||||
|
|
||||||
|
## 核心指标配比
|
||||||
|
- 总评分构成:技术分(__%)+ 商务分(__%)+ 价格分(__%)
|
||||||
|
|
||||||
|
## 详细评分矩阵
|
||||||
|
| 类别 | 评分项 | 分值 | 具体要求 | 关键指标 |
|
||||||
|
|------|-------|-----|---------|---------|
|
||||||
|
| ... | ... | ... | ... | ... |
|
||||||
|
|
||||||
|
## 重点提示
|
||||||
|
⚠️ 否决条款:列出所有一票否决项
|
||||||
|
💡 得分要点:突出3-5个高权重核心指标
|
||||||
|
⏱️ 时间节点:标注与评分相关的时限要求
|
||||||
|
```
|
||||||
|
请先确认理解任务要求,待用户提供招标文件内容后执行分析。"
|
||||||
|
|
||||||
47
prompts/section_detail.py
Normal file
47
prompts/section_detail.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
GEN_LEAF_DETAIL_PROMT = """
|
||||||
|
【最重要的要求——字数】
|
||||||
|
{word_count_spec}
|
||||||
|
|
||||||
|
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||||
|
- 角色:资深投标文件撰写专家
|
||||||
|
- 任务:根据招标文件概要、标书目录、子小节标题,撰写该子小节的正文
|
||||||
|
|
||||||
|
【行文规范】
|
||||||
|
- 投标方自称统一用"我方",禁用"我们""本公司"
|
||||||
|
- 招标人统一称"招标方"或"建设单位"
|
||||||
|
- 禁止前导句:"本章节对应……""本小节主要说明……""以下将从……方面说明"等——开头直接写实质内容
|
||||||
|
- 禁止AI套话:综上所述、首先其次再次、我们深信、高度重视、全力以赴、不断优化、稳步推进、通过以上措施
|
||||||
|
- 用具体数据/标准编号/人员配置替代空洞承诺
|
||||||
|
- 列举用(1)(2)(3)编号,禁止"首先其次"连接;禁止"等"作结尾
|
||||||
|
- 纯文本输出,禁用markdown符号,段落间空行分隔
|
||||||
|
- 直接输出正文,不含标题和解释
|
||||||
|
|
||||||
|
【输入信息】
|
||||||
|
- 招标文件概要:
|
||||||
|
{summary}
|
||||||
|
|
||||||
|
- 技术标书目录:
|
||||||
|
{outline}
|
||||||
|
|
||||||
|
- 待撰写的子小节标题:
|
||||||
|
{title}
|
||||||
|
|
||||||
|
再次强调:篇幅是最核心的质量指标。内容必须充分展开,每个技术要点都要详细阐述实施方法、技术参数、人员安排或设备配置。绝不可以概括性一笔带过。
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
GEN_SECTION_INTRODUCTION_PROMT = """
|
||||||
|
- 角色:资深投标文件撰写专家
|
||||||
|
- 任务:为章节撰写简短开篇引言(100~200字),点明核心主题与招标要求的对应关系
|
||||||
|
- 使用"我方"自称,禁止套话和前导解释句,纯文本输出
|
||||||
|
- 若无需过渡可输出空白
|
||||||
|
|
||||||
|
- 招标文件概要:
|
||||||
|
{summary}
|
||||||
|
|
||||||
|
- 技术标书目录:
|
||||||
|
{outline}
|
||||||
|
|
||||||
|
- 章节标题:
|
||||||
|
{title}
|
||||||
|
"""
|
||||||
28
prompts/section_details.txt
Normal file
28
prompts/section_details.txt
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
【最重要的要求——字数】
|
||||||
|
{word_count_spec}
|
||||||
|
|
||||||
|
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||||
|
- 角色:资深投标文件撰写专家
|
||||||
|
- 任务:根据招标文件概要、标书目录、子小节标题,撰写该子小节的正文
|
||||||
|
|
||||||
|
【行文规范】
|
||||||
|
- 投标方自称在"我方""我们""本公司"之间随机选用
|
||||||
|
- 招标人统一称"招标方"或"建设单位"
|
||||||
|
- 禁止前导句:"本章节对应……""本小节主要说明……""以下将从……方面说明"等——开头直接写实质内容
|
||||||
|
- 禁止AI套话:综上所述、首先其次再次、我们深信、高度重视、全力以赴、不断优化、稳步推进、通过以上措施
|
||||||
|
- 用具体数据/标准编号/人员配置替代空洞承诺
|
||||||
|
- 列举用(1)(2)(3)编号,禁止"首先其次"连接;禁止"等"作结尾
|
||||||
|
- 纯文本输出,禁用markdown符号,段落间空行分隔
|
||||||
|
- 直接输出正文,不含标题和解释
|
||||||
|
|
||||||
|
【输入信息】
|
||||||
|
- 招标文件概要:
|
||||||
|
{summary}
|
||||||
|
|
||||||
|
- 技术标书目录:
|
||||||
|
{outline}
|
||||||
|
|
||||||
|
- 待撰写的子小节标题:
|
||||||
|
{subsection_title}
|
||||||
|
|
||||||
|
再次强调:篇幅是最核心的质量指标。内容必须充分展开,每个技术要点都要详细阐述实施方法、技术参数、人员安排或设备配置。绝不可以概括性一笔带过。
|
||||||
12
requirements.txt
Normal file
12
requirements.txt
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
Flask==3.0.3
|
||||||
|
flask-cors==4.0.1
|
||||||
|
PyPDF2==3.0.1
|
||||||
|
python-docx==1.1.2
|
||||||
|
openai==1.52.0
|
||||||
|
Werkzeug==3.0.4
|
||||||
|
requests==2.32.3
|
||||||
|
chardet==5.2.0
|
||||||
|
pypdf==4.3.1
|
||||||
|
pdfminer.six==20231228
|
||||||
|
beautifulsoup4==4.12.3
|
||||||
|
lxml==5.3.0
|
||||||
39
start.bat
Normal file
39
start.bat
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
@echo off
REM BidPartner launcher: verifies Python, installs dependencies once
REM (marker file .deps_installed), then opens the browser and starts the app.
title BidPartner - AI Bid Assistant

echo.
echo ============================================
echo BidPartner - AI Bid Writing Tool
echo ============================================
echo.

REM Run from the script's own directory regardless of invocation cwd.
cd /d "%~dp0"

REM Abort early when no Python interpreter is on PATH.
python --version >nul 2>&1
if %errorlevel% neq 0 (
echo [ERROR] Python not found. Please install Python 3.9+
pause
exit /b 1
)

REM One-time dependency install, guarded by the .deps_installed marker.
if not exist "%~dp0.deps_installed" (
echo Installing dependencies...
pip install -r requirements.txt
if %errorlevel% neq 0 (
echo [ERROR] Failed to install dependencies.
pause
exit /b 1
)
echo.> "%~dp0.deps_installed"
echo Dependencies installed successfully.
)

echo Starting server...
echo Open browser: http://localhost:5000
echo Press Ctrl+C to stop
echo.

REM Open the UI, then run the Flask app in the foreground.
start "" "http://localhost:5000"
python app.py

pause
|
||||||
89
static/style.css
Normal file
89
static/style.css
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
/* BidPartner (标伙伴) custom styles */

/* Scrollbar cosmetics (WebKit browsers only) */
::-webkit-scrollbar {
    width: 6px;
    height: 6px;
}
::-webkit-scrollbar-track {
    background: #f1f5f9;
    border-radius: 3px;
}
::-webkit-scrollbar-thumb {
    background: #cbd5e1;
    border-radius: 3px;
}
::-webkit-scrollbar-thumb:hover {
    background: #94a3b8;
}

/* Left sidebar holding the chapter tree: thinner scrollbar */
.sidebar-fixed::-webkit-scrollbar {
    width: 4px;
}

/* Body-copy typography for generated bid text (SimSun, serif fallbacks) */
.prose-content {
    font-family: 'SimSun', '宋体', 'Times New Roman', serif;
    line-height: 1.9;
    color: #374151;
}

/* Entry animation for newly rendered panels */
@keyframes fadeIn {
    from { opacity: 0; transform: translateY(8px); }
    to { opacity: 1; transform: translateY(0); }
}
.fade-in {
    animation: fadeIn 0.25s ease-out;
}

/* Tables rendered from markdown (rating-requirements display) */
.markdown-table table {
    width: 100%;
    border-collapse: collapse;
    font-size: 13px;
}
.markdown-table th {
    background: #f8fafc;
    font-weight: 600;
    color: #475569;
    padding: 8px 12px;
    border: 1px solid #e2e8f0;
    text-align: left;
}
.markdown-table td {
    padding: 7px 12px;
    border: 1px solid #e2e8f0;
    color: #334155;
}
.markdown-table tr:nth-child(even) td {
    background: #f8fafc;
}

/* Step indicator: highlight for the active step */
.step-active {
    background: #2563eb;
    color: #fff;
    box-shadow: 0 2px 8px rgba(37,99,235,.35);
}

/* Drag-over highlight for the file-upload drop zone */
.drop-active {
    border-color: #3b82f6 !important;
    background: #eff6ff !important;
}

/* Vertical guide line marking chapter nesting depth */
.section-indent-line {
    border-left: 2px solid #e2e8f0;
    margin-left: 8px;
    padding-left: 8px;
}

/* Print: hide chrome, keep only the document body */
@media print {
    header, nav, aside, button { display: none !important; }
    main { padding: 0 !important; }
    .bg-white { box-shadow: none !important; border: none !important; }
}
|
||||||
1226
templates/index.html
Normal file
1226
templates/index.html
Normal file
File diff suppressed because it is too large
Load Diff
2308
templates/project.html
Normal file
2308
templates/project.html
Normal file
File diff suppressed because it is too large
Load Diff
13
tests/fixtures/dark_bid_report_sample.json
vendored
Normal file
13
tests/fixtures/dark_bid_report_sample.json
vendored
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
{
|
||||||
|
"overall": false,
|
||||||
|
"details": [
|
||||||
|
{"rule": "身份信息隐藏", "passed": true, "message": "未发现投标人身份信息"},
|
||||||
|
{"rule": "标题格式", "passed": false, "message": "部分标题字号/字体/颜色/下划线不符合要求"},
|
||||||
|
{"rule": "正文格式", "passed": false, "message": "部分正文段落格式不符合要求"},
|
||||||
|
{"rule": "目录要求", "passed": true, "message": "目录符合无页码、无页眉页脚要求"},
|
||||||
|
{"rule": "图表规范", "passed": false, "message": "正文中发现2个图表或附件内图表文字格式错误"},
|
||||||
|
{"rule": "颜色与装饰", "passed": true, "message": "无彩色文字、无下划线、无着重号"},
|
||||||
|
{"rule": "页面设置", "passed": false, "message": "页面边距或纸张方向不符合要求"}
|
||||||
|
],
|
||||||
|
"violations": []
|
||||||
|
}
|
||||||
95
tests/test_attachment_section.py
Normal file
95
tests/test_attachment_section.py
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
"""附件类章节识别与单图/单表类型选择。"""
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from utils import attachment_section as att
|
||||||
|
|
||||||
|
|
||||||
|
class TestIsAttachment(unittest.TestCase):
    """Recognition of attachment-only section titles (附件/附图/附表)."""

    def test_positive(self):
        # Titles prefixed with 附件/附图/附表 are attachment-only sections.
        r = att.load_attachment_rules()
        self.assertTrue(att.is_attachment_only_section('附件一:施工平面布置', r))
        self.assertTrue(att.is_attachment_only_section('附图 组织机构', r))
        self.assertTrue(att.is_attachment_only_section('附表 人员一览', r))

    def test_negative(self):
        # Ordinary chapter titles and the empty string are not attachments.
        r = att.load_attachment_rules()
        self.assertFalse(att.is_attachment_only_section('施工组织设计', r))
        self.assertFalse(att.is_attachment_only_section('', r))
|
||||||
|
|
||||||
|
|
||||||
|
class TestPickKind(unittest.TestCase):
    """Choice between a single figure or a single table for attachments.

    The two boolean arguments appear to be figure/table enable switches
    (named after the tests) — confirm against pick_single_figure_or_table.
    """

    def test_only_figure_switch(self):
        # Only the figure switch on -> 'figure' regardless of title.
        r = att.DEFAULT_ATTACHMENT_RULES
        self.assertEqual(
            att.pick_single_figure_or_table('附件一:xxx', True, False, r),
            'figure',
        )

    def test_only_table_switch(self):
        # Only the table switch on -> 'table' regardless of title.
        r = att.DEFAULT_ATTACHMENT_RULES
        self.assertEqual(
            att.pick_single_figure_or_table('附件一:xxx', False, True, r),
            'table',
        )

    def test_both_off(self):
        # Both switches off -> None (nothing to render).
        self.assertIsNone(
            att.pick_single_figure_or_table('附件一', False, False, None),
        )

    def test_table_hint(self):
        # "一览表" in the title steers the choice to a table.
        r = att.DEFAULT_ATTACHMENT_RULES
        k = att.pick_single_figure_or_table('附件三 工程量一览表', True, True, r)
        self.assertEqual(k, 'table')

    def test_figure_hint(self):
        # "附图/示意图" in the title steers the choice to a figure.
        r = att.DEFAULT_ATTACHMENT_RULES
        k = att.pick_single_figure_or_table('附图 施工平面示意图', True, True, r)
        self.assertEqual(k, 'figure')

    def test_default_ambiguous(self):
        # Ambiguous titles fall back to default_kind_when_ambiguous.
        r = dict(att.DEFAULT_ATTACHMENT_RULES)
        r['default_kind_when_ambiguous'] = 'table'
        k = att.pick_single_figure_or_table('附件五 其他资料', True, True, r)
        self.assertEqual(k, 'table')
|
||||||
|
|
||||||
|
|
||||||
|
class TestAttachmentBodyMode(unittest.TestCase):
    """Resolution of the attachment leaf-body rendering mode."""

    def test_default_stack_charts_only(self):
        # Default rules resolve to 'stack_charts_only'.
        r = att.DEFAULT_ATTACHMENT_RULES
        self.assertEqual(att.attachment_leaf_body_mode(r), 'stack_charts_only')
        self.assertTrue(att.use_attachment_stack_charts_body(r))
        self.assertFalse(att.use_attachment_single_chart_only_body(r))
        self.assertFalse(att.use_attachment_full_body(r))

    def test_full_mode(self):
        # Explicit 'full' mode turns off the stacked-charts body.
        r = dict(att.DEFAULT_ATTACHMENT_RULES)
        r['attachment_leaf_body_mode'] = 'full'
        self.assertEqual(att.attachment_leaf_body_mode(r), 'full')
        self.assertTrue(att.use_attachment_full_body(r))
        self.assertFalse(att.use_attachment_stack_charts_body(r))

    def test_single_chart_only(self):
        # 'single_chart_only' also implies the stacked-charts body predicate.
        r = dict(att.DEFAULT_ATTACHMENT_RULES)
        r['attachment_leaf_body_mode'] = 'single_chart_only'
        self.assertTrue(att.use_attachment_single_chart_only_body(r))
        self.assertTrue(att.use_attachment_stack_charts_body(r))
|
||||||
|
|
||||||
|
|
||||||
|
class TestExpandOutlineSkip(unittest.TestCase):
    """Skipping sub-chapter expansion for attachment chapters."""

    def test_should_skip_attachment(self):
        # Attachment chapters get no generated sub-chapters.
        self.assertTrue(att.should_skip_expand_subchapters('附件一:平面图'))
        self.assertTrue(att.should_skip_expand_subchapters('附图 示意'))

    def test_should_skip_normal_chapter(self):
        # Regular chapters are still expanded.
        self.assertFalse(att.should_skip_expand_subchapters('施工组织设计'))
        self.assertFalse(att.should_skip_expand_subchapters('质量管理体系与措施'))

    def test_parse_attachment_label(self):
        # Label is the ordinal following 附件, or the literal 附图 marker.
        self.assertEqual(att.parse_attachment_label('附件一:平面图'), '一')
        self.assertEqual(att.parse_attachment_label('附件2 承诺书'), '2')
        self.assertEqual(att.parse_attachment_label('附图 总平面'), '附图')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    unittest.main()  # allow running this test file directly
|
||||||
52
tests/test_bill_analysis.py
Normal file
52
tests/test_bill_analysis.py
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
"""工程量清单本地分析单元测试。"""
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from utils.bill_analysis import (
|
||||||
|
analyze_boq_pages,
|
||||||
|
filter_bill_pages,
|
||||||
|
parse_bill_text,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestParseBillText(unittest.TestCase):
    """Parsing flat BOQ (bill of quantities) text lines into items."""

    def test_code_name_unit_qty(self):
        # A "code name unit qty remark" line lands in the default
        # '未分类' (uncategorized) bucket with all fields captured.
        text = '010101001001 挖土方 m3 100.5 土壤类别:三类土'
        r = parse_bill_text(text)
        self.assertIn('categories', r)
        self.assertTrue(r['categories'])
        cat = r['categories'][0]
        self.assertEqual(cat['name'], '未分类')
        self.assertEqual(len(cat['items']), 1)
        it = cat['items'][0]
        self.assertEqual(it['code'], '010101001001')
        self.assertIn('挖土', it['name'])
        self.assertEqual(it['unit'], 'm3')
        self.assertEqual(it['quantity'], '100.5')

    def test_hierarchical_line_prefix(self):
        # A leading outline number (1.1) must not be mistaken for the code.
        text = '1.1 010101001001 基础开挖 m3 50'
        r = parse_bill_text(text)
        it = r['categories'][0]['items'][0]
        self.assertEqual(it['code'], '010101001001')
|
||||||
|
|
||||||
|
|
||||||
|
class TestFilterBillPages(unittest.TestCase):
    """Selecting BOQ pages out of a larger page list."""

    def test_two_pages_gap_fill(self):
        # A header page plus its continuation page (no table header)
        # are both kept; TOC and fee/tax pages around them are dropped.
        p0 = '目录 前言'
        p1 = '分部分项工程量清单\n项目编码 项目名称 工程量\n010101001001 项 m3 1'
        p2 = '续表无表头\n010101002001 土 m2 2'
        p3 = '规费 税金 社会保险费 住房公积金 其他说明'
        pages, meta = filter_bill_pages([p0, p1, p2, p3])
        self.assertEqual(meta['total_pages'], 4)
        self.assertGreaterEqual(len(pages), 2)
        merged = '\n'.join(pages)
        self.assertIn('010101001001', merged)
        self.assertIn('010101002001', merged)

    def test_analyze_scanned_empty(self):
        # All-empty page texts indicate a scanned (image-only) document.
        r = analyze_boq_pages(['', ' ', ''])
        self.assertTrue(r.get('scanned'))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    unittest.main()  # allow running this test file directly
|
||||||
63
tests/test_dark_bid_format_check.py
Normal file
63
tests/test_dark_bid_format_check.py
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
"""技术暗标 HTML 格式检查:结构校验与极简用例(标准库 unittest)。"""
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
# 保证可 `python tests/test_*.py` 从项目根导入 `modules`
|
||||||
|
_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
if _ROOT not in sys.path:
|
||||||
|
sys.path.insert(0, _ROOT)
|
||||||
|
|
||||||
|
from modules.dark_bid_format_check import check_technical_bid # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
def _sample_schema_path():
    """Return the path of the bundled sample report fixture JSON."""
    return os.path.join(os.path.dirname(__file__), "fixtures", "dark_bid_report_sample.json")
|
||||||
|
|
||||||
|
|
||||||
|
class TestDarkBidFormatCheck(unittest.TestCase):
    """Structural checks on the dark-bid HTML format checker report."""

    def test_sample_fixture_keys(self):
        # The shipped fixture must expose the report's three top-level keys
        # and the per-rule detail shape.
        with open(_sample_schema_path(), encoding="utf-8") as f:
            sample = json.load(f)
        self.assertIn("overall", sample)
        self.assertIn("details", sample)
        self.assertIn("violations", sample)
        for d in sample["details"]:
            self.assertTrue({"rule", "passed", "message"}.issubset(d.keys()))

    def test_check_returns_structure(self):
        # A minimal well-formed bid page: the checker returns a boolean
        # overall flag and exactly 7 rule entries, including these two.
        html = """<!DOCTYPE html><html><head><style>
@page { margin: 2.54cm 3.18cm 2.54cm 3.18cm; size: A4; }
</style></head><body style="margin:2.54cm 3.18cm">
<div class="toc">第一章 概述</div>
<h2 style="font-size:16pt;font-family:SimHei;font-weight:bold;color:#000">标题</h2>
<p style="font-size:14pt;font-family:SimSun;line-height:26pt;text-indent:2em;color:#000">
正文内容示例。</p>
</body></html>"""
        r = check_technical_bid(html)
        self.assertIsInstance(r["overall"], bool)
        self.assertEqual(len(r["details"]), 7)
        rules = [x["rule"] for x in r["details"]]
        self.assertIn("身份信息隐藏", rules)
        self.assertIn("标题格式", rules)

    def test_empty_html(self):
        # Empty input can never pass overall.
        r = check_technical_bid("")
        self.assertFalse(r["overall"])

    def test_identity_fail_on_company(self):
        # Mentioning "我公司" leaks bidder identity -> identity rule fails.
        html = (
            "<html><body><p style='font-size:14pt;font-family:SimSun;"
            "line-height:26pt;text-indent:2em;color:#000'>我公司参与投标</p>"
            "<div class='toc'>x</div>"
            "<style>@page{margin:2.54cm 3.18cm 2.54cm 3.18cm}</style>"
            "</body></html>"
        )
        r = check_technical_bid(html)
        id_rule = next(x for x in r["details"] if x["rule"] == "身份信息隐藏")
        self.assertFalse(id_rule["passed"])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
121
tests/test_diagram_intent.py
Normal file
121
tests/test_diagram_intent.py
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
"""图表意图栈与特征计分。"""
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from utils import diagram_intent as di
|
||||||
|
|
||||||
|
|
||||||
|
def _base_rules() -> dict:
|
||||||
|
return {
|
||||||
|
'schema_version': 1,
|
||||||
|
'threshold_figure': 1.0,
|
||||||
|
'threshold_table': 1.0,
|
||||||
|
'title_weight': 1.0,
|
||||||
|
'context_weight': 0.6,
|
||||||
|
'outline_context_lines': {'before': 2, 'after': 2},
|
||||||
|
'stack_order_when_both': 'score_desc',
|
||||||
|
'figure_keywords': [
|
||||||
|
{'text': '进度', 'weight': 1.2},
|
||||||
|
{'text': '横道', 'weight': 1.5},
|
||||||
|
],
|
||||||
|
'table_keywords': [
|
||||||
|
{'text': '一览表', 'weight': 1.5},
|
||||||
|
{'text': '人员', 'weight': 1.0},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreFigureTable(unittest.TestCase):
|
||||||
|
def test_figure_higher_on_progress(self):
|
||||||
|
r = _base_rules()
|
||||||
|
f, t = di.score_figure_table('施工进度与横道计划', '', r)
|
||||||
|
self.assertGreater(f, t)
|
||||||
|
|
||||||
|
def test_table_higher_on_roster(self):
|
||||||
|
r = _base_rules()
|
||||||
|
f, t = di.score_figure_table('主要管理人员配置一览表', '', r)
|
||||||
|
self.assertGreater(t, f)
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildStack(unittest.TestCase):
|
||||||
|
def test_gate_figure_off(self):
|
||||||
|
r = _base_rules()
|
||||||
|
st = di.build_stack(5.0, 5.0, r, enable_figure=False, enable_table=True)
|
||||||
|
self.assertEqual(len(st), 1)
|
||||||
|
self.assertEqual(st[0].kind, 'table')
|
||||||
|
|
||||||
|
def test_score_desc_order(self):
|
||||||
|
r = dict(_base_rules())
|
||||||
|
r['stack_order_when_both'] = 'score_desc'
|
||||||
|
st = di.build_stack(3.0, 1.0, r, True, True)
|
||||||
|
self.assertEqual(len(st), 2)
|
||||||
|
self.assertEqual(st[0].kind, 'figure')
|
||||||
|
self.assertGreater(st[0].score, st[1].score)
|
||||||
|
|
||||||
|
def test_figure_first(self):
|
||||||
|
r = dict(_base_rules())
|
||||||
|
r['stack_order_when_both'] = 'figure_first'
|
||||||
|
st = di.build_stack(2.0, 5.0, r, True, True)
|
||||||
|
self.assertEqual(st[0].kind, 'figure')
|
||||||
|
self.assertEqual(st[1].kind, 'table')
|
||||||
|
|
||||||
|
def test_below_threshold_empty(self):
|
||||||
|
r = dict(_base_rules())
|
||||||
|
r['threshold_figure'] = 10.0
|
||||||
|
r['threshold_table'] = 10.0
|
||||||
|
st = di.build_stack(1.0, 1.0, r, True, True)
|
||||||
|
self.assertEqual(st, [])
|
||||||
|
|
||||||
|
|
||||||
|
class TestOutlineWindow(unittest.TestCase):
|
||||||
|
def test_finds_title_line(self):
|
||||||
|
outline = '一、总则\n二、进度\n 2.1 横道计划\n三、尾'
|
||||||
|
w = di.extract_outline_window(outline, '2.1 横道计划', 1, 1)
|
||||||
|
self.assertIn('横道', w)
|
||||||
|
|
||||||
|
def test_fallback_prefix(self):
|
||||||
|
w = di.extract_outline_window('abc' * 400, '不存在的标题', 2, 2)
|
||||||
|
self.assertTrue(len(w) > 0)
|
||||||
|
|
||||||
|
|
||||||
|
class TestAgentRender(unittest.TestCase):
|
||||||
|
def test_render_non_empty_when_match(self):
|
||||||
|
r = dict(_base_rules())
|
||||||
|
r['threshold_figure'] = 0.5
|
||||||
|
r['threshold_table'] = 0.5
|
||||||
|
agent = di.DiagramIntentAgent(r)
|
||||||
|
s = agent.render_for_section(
|
||||||
|
'施工进度横道图编制说明',
|
||||||
|
'大纲\n进度\n横道',
|
||||||
|
True,
|
||||||
|
True,
|
||||||
|
)
|
||||||
|
self.assertIn('图示生成规范', s)
|
||||||
|
self.assertIn('本节图表生成优先级', s)
|
||||||
|
|
||||||
|
def test_render_empty_when_scores_low(self):
|
||||||
|
r = dict(_base_rules())
|
||||||
|
r['threshold_figure'] = 100.0
|
||||||
|
r['threshold_table'] = 100.0
|
||||||
|
agent = di.DiagramIntentAgent(r)
|
||||||
|
s = agent.render_for_section('无关标题', '无关', True, True)
|
||||||
|
self.assertEqual(s, '')
|
||||||
|
|
||||||
|
|
||||||
|
class TestStackHelpers(unittest.TestCase):
|
||||||
|
def test_stack_compact_labels(self):
|
||||||
|
st = [
|
||||||
|
di.DiagramIntent('figure', 1.0, 't'),
|
||||||
|
di.DiagramIntent('table', 1.0, 't'),
|
||||||
|
]
|
||||||
|
lab = di.stack_compact_labels(st)
|
||||||
|
self.assertEqual(len(lab), 2)
|
||||||
|
self.assertIn('[FIGURE]', lab[0])
|
||||||
|
|
||||||
|
def test_make_fallback_stack(self):
|
||||||
|
st = di.make_fallback_stack('figure')
|
||||||
|
self.assertEqual(len(st), 1)
|
||||||
|
self.assertEqual(st[0].kind, 'figure')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
44
tests/test_outline_numbering.py
Normal file
44
tests/test_outline_numbering.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
"""目录号格式化与大纲带号写回。"""
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
if _ROOT not in sys.path:
|
||||||
|
sys.path.insert(0, _ROOT)
|
||||||
|
|
||||||
|
from modules.generator import _parse_outline, _sections_to_outline_text # noqa: E402
|
||||||
|
from utils.outline_numbering import format_heading_display, int_to_chinese_numeral # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
class TestOutlineNumbering(unittest.TestCase):
|
||||||
|
def test_int_to_chinese(self):
|
||||||
|
self.assertEqual(int_to_chinese_numeral(1), "一")
|
||||||
|
self.assertEqual(int_to_chinese_numeral(10), "十")
|
||||||
|
self.assertEqual(int_to_chinese_numeral(11), "十一")
|
||||||
|
self.assertEqual(int_to_chinese_numeral(23), "二十三")
|
||||||
|
|
||||||
|
def test_format_heading(self):
|
||||||
|
self.assertEqual(format_heading_display(1, "3", "总体"), "三、总体")
|
||||||
|
self.assertEqual(format_heading_display(2, "1.2", "子节"), "1.2 子节")
|
||||||
|
|
||||||
|
def test_sections_to_outline_text_has_numbers(self):
|
||||||
|
sections = [
|
||||||
|
{"level": 1, "title": "第一章", "number": "1"},
|
||||||
|
{"level": 2, "title": "小节", "number": "1.1"},
|
||||||
|
]
|
||||||
|
text = _sections_to_outline_text("某项目技术标书", sections)
|
||||||
|
self.assertIn("某项目技术标书", text)
|
||||||
|
self.assertIn("一、第一章", text)
|
||||||
|
self.assertIn("1.1 小节", text)
|
||||||
|
|
||||||
|
def test_parse_roundtrip_numbered_outline(self):
|
||||||
|
raw = "标书标题\n一、第一章\n1.1 节A\n"
|
||||||
|
_, sections, normalized = _parse_outline(raw)
|
||||||
|
self.assertGreaterEqual(len(sections), 2)
|
||||||
|
self.assertIn("一、第一章", normalized)
|
||||||
|
self.assertIn("1.1 节A", normalized)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
24
tests/test_parse_outline.py
Normal file
24
tests/test_parse_outline.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
"""大纲解析:1.1 类编号不得被误拆成一级 1 与 title '.1 标题'。"""
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from modules.generator import _parse_outline
|
||||||
|
|
||||||
|
|
||||||
|
class TestParseOutline(unittest.TestCase):
|
||||||
|
def test_11_stays_single_section(self):
|
||||||
|
text = "某某项目标书标题\n1.1 沟槽开挖与支护\n1.2 排降水\n"
|
||||||
|
_, sections, _ = _parse_outline(text)
|
||||||
|
self.assertEqual(len(sections), 2, [s.get('number') for s in sections])
|
||||||
|
for s in sections:
|
||||||
|
if s.get('level') == 1:
|
||||||
|
self.assertFalse(
|
||||||
|
(s.get('title') or '').lstrip().startswith('.'),
|
||||||
|
'不得出现一级章节 title 以 .1 开头(误将 1.1 拆成 1 与 .1 标题)',
|
||||||
|
)
|
||||||
|
titles = ' '.join(s['title'] for s in sections)
|
||||||
|
self.assertIn('沟槽', titles)
|
||||||
|
self.assertIn('排降', titles)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
86
tests/test_volume_chapters.py
Normal file
86
tests/test_volume_chapters.py
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
"""目标页数与一级篇章区间。"""
|
||||||
|
import random
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from utils import volume_chapters as vc
|
||||||
|
|
||||||
|
|
||||||
|
class TestVolumeChapters(unittest.TestCase):
|
||||||
|
def test_top_level_default_pages_zero(self):
|
||||||
|
lo, hi = vc.top_level_chapter_range_from_pages(0)
|
||||||
|
self.assertEqual((lo, hi), (8, 10))
|
||||||
|
|
||||||
|
def test_ranges_match_effective_volume_bands(self):
|
||||||
|
self.assertEqual(vc.top_level_chapter_range_from_pages(100), (6, 8))
|
||||||
|
self.assertEqual(vc.top_level_chapter_range_from_pages(125), (6, 8))
|
||||||
|
self.assertEqual(vc.top_level_chapter_range_from_pages(150), (8, 10))
|
||||||
|
self.assertEqual(vc.top_level_chapter_range_from_pages(200), (10, 12))
|
||||||
|
self.assertEqual(vc.top_level_chapter_range_from_pages(300), (12, 16))
|
||||||
|
|
||||||
|
def test_hint_default_no_pages(self):
|
||||||
|
h = vc.outline_chapter_count_hint(0, 'standard')
|
||||||
|
self.assertIn('8-10', h)
|
||||||
|
self.assertIn('不超过10', h)
|
||||||
|
|
||||||
|
def test_hint_with_pages(self):
|
||||||
|
h = vc.outline_chapter_count_hint(150, 'standard', 700)
|
||||||
|
self.assertIn('约 8–10', h)
|
||||||
|
self.assertIn('150', h)
|
||||||
|
self.assertIn('105000', h) # 150×700 总字目标
|
||||||
|
self.assertIn('过细', h)
|
||||||
|
|
||||||
|
def test_subchapter_base_anchor_points(self):
|
||||||
|
self.assertAlmostEqual(vc.subchapter_total_base_from_pages(100), 78.0, places=5)
|
||||||
|
self.assertAlmostEqual(vc.subchapter_total_base_from_pages(300), 212.0, places=5)
|
||||||
|
self.assertEqual(vc.SUBCHAPTER_PAGES_SLOPE, 0.67)
|
||||||
|
self.assertEqual(vc.SUBCHAPTER_PAGES_INTERCEPT, 11.0)
|
||||||
|
|
||||||
|
def test_subchapter_jitter_bounds_78_anchor(self):
|
||||||
|
"""100 页基线 78 章,±10% 严格为 [70, 86]。"""
|
||||||
|
self.assertEqual(vc.subchapter_jitter_bounds(78.0), (70, 86))
|
||||||
|
|
||||||
|
def test_subchapter_jitter_bounds_300_pages(self):
|
||||||
|
self.assertEqual(vc.subchapter_jitter_bounds(212.0), (191, 233))
|
||||||
|
|
||||||
|
def test_allocate_subchapters_to_mains(self):
|
||||||
|
self.assertEqual(vc.allocate_subchapters_to_mains(10, 3), [4, 3, 3])
|
||||||
|
self.assertEqual(vc.allocate_subchapters_to_mains(0, 3), [0, 0, 0])
|
||||||
|
self.assertEqual(vc.allocate_subchapters_to_mains(5, 2), [3, 2])
|
||||||
|
self.assertEqual(vc.allocate_subchapters_to_mains(7, 0), [])
|
||||||
|
# n < k 时多出的主章 quota 为 0
|
||||||
|
a = vc.allocate_subchapters_to_mains(70, 100)
|
||||||
|
self.assertEqual(len(a), 100)
|
||||||
|
self.assertEqual(sum(a), 70)
|
||||||
|
self.assertEqual(a.count(1), 70)
|
||||||
|
self.assertEqual(a.count(0), 30)
|
||||||
|
|
||||||
|
def test_subchapter_effective_respects_k_floor_and_jitter(self):
|
||||||
|
# round(78 * u) for u in [0.9, 1.1] stays in [70, 86] for 78.0 base
|
||||||
|
for seed in range(800):
|
||||||
|
n = vc.subchapter_total_effective(100, 1, random.Random(seed))
|
||||||
|
self.assertGreaterEqual(n, 70)
|
||||||
|
self.assertLessEqual(n, 86)
|
||||||
|
# 主章数很大时,总条数仍须在 [70, 86](不得被 max(n,k) 抬到数百)
|
||||||
|
for seed in range(20):
|
||||||
|
nk = vc.subchapter_total_effective(100, 500, random.Random(seed))
|
||||||
|
self.assertGreaterEqual(nk, 70, msg=f'seed={seed}')
|
||||||
|
self.assertLessEqual(nk, 86, msg=f'seed={seed}')
|
||||||
|
|
||||||
|
def test_subchapter_effective_zero_pages(self):
|
||||||
|
self.assertEqual(vc.subchapter_total_effective(0, 5), 0)
|
||||||
|
self.assertEqual(vc.subchapter_total_effective(100, 0), 0)
|
||||||
|
|
||||||
|
def test_resolve_expand_target_pages(self):
|
||||||
|
self.assertEqual(vc.resolve_expand_target_pages(None, True, 100, 200), 0)
|
||||||
|
self.assertEqual(vc.resolve_expand_target_pages(200, False, 100, 50), 200)
|
||||||
|
self.assertEqual(vc.resolve_expand_target_pages(0, False, 80, 0), 80)
|
||||||
|
self.assertEqual(vc.resolve_expand_target_pages(0, False, 0, 50), 50)
|
||||||
|
self.assertEqual(
|
||||||
|
vc.resolve_expand_target_pages(0, False, 0, 0),
|
||||||
|
vc.EXPAND_OUTLINE_DEFAULT_TARGET_PAGES,
|
||||||
|
)
|
||||||
|
self.assertEqual(vc.EXPAND_OUTLINE_DEFAULT_TARGET_PAGES, 100)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
170
tests/test_word_allocation.py
Normal file
170
tests/test_word_allocation.py
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
"""字数分配与 rating_json 解析单元测试。"""
|
||||||
|
import json
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import config as cfg
|
||||||
|
|
||||||
|
from utils import word_allocation as wa
|
||||||
|
|
||||||
|
|
||||||
|
class TestParseRatingJson(unittest.TestCase):
|
||||||
|
def test_canonical_items(self):
|
||||||
|
raw = json.dumps(
|
||||||
|
{
|
||||||
|
'items': [
|
||||||
|
{'id': 'T1', 'name': '施工方案', 'weight': 30, 'keywords': ['工艺']},
|
||||||
|
{'id': 'T2', 'name': '质量保证', 'weight': 10, 'keywords': []},
|
||||||
|
],
|
||||||
|
'notes': '',
|
||||||
|
},
|
||||||
|
ensure_ascii=False,
|
||||||
|
)
|
||||||
|
items = wa.parse_rating_json(raw)
|
||||||
|
self.assertEqual(len(items), 2)
|
||||||
|
names = {x['name'] for x in items}
|
||||||
|
self.assertIn('施工方案', names)
|
||||||
|
self.assertIn('质量保证', names)
|
||||||
|
wmap = {x['name']: x['weight'] for x in items}
|
||||||
|
self.assertEqual(wmap['施工方案'], 30.0)
|
||||||
|
|
||||||
|
def test_malformed_returns_empty(self):
|
||||||
|
self.assertEqual(wa.parse_rating_json('not json'), [])
|
||||||
|
self.assertEqual(wa.parse_rating_json(''), [])
|
||||||
|
|
||||||
|
|
||||||
|
class TestComputeLeafAllocations(unittest.TestCase):
|
||||||
|
def test_none_when_no_rating_and_not_target_pages_budget(self):
|
||||||
|
leaves = [{'id': 1, 'section_title': '一、总体方案'}]
|
||||||
|
rules = dict(wa.DEFAULT_RULES)
|
||||||
|
rules['budget_mode'] = 'anchor_mean'
|
||||||
|
self.assertIsNone(
|
||||||
|
wa.compute_leaf_allocations('standard', leaves, '', rules)
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_uniform_when_no_rating_but_target_pages(self):
|
||||||
|
"""无技术评分时仍按目标页均分 B=页×每页字,全稿不随节数 N 线性爆量。"""
|
||||||
|
leaves = [
|
||||||
|
{'id': 1, 'section_title': 'A'},
|
||||||
|
{'id': 2, 'section_title': 'B'},
|
||||||
|
]
|
||||||
|
rules = dict(wa.DEFAULT_RULES)
|
||||||
|
rules['budget_mode'] = 'target_pages'
|
||||||
|
old_tp = cfg.TARGET_PAGES
|
||||||
|
old_pce = cfg.PAGE_CHAR_ESTIMATE
|
||||||
|
try:
|
||||||
|
cfg.TARGET_PAGES = 100
|
||||||
|
cfg.PAGE_CHAR_ESTIMATE = 700
|
||||||
|
out = wa.compute_leaf_allocations('standard', leaves, '', rules)
|
||||||
|
finally:
|
||||||
|
cfg.TARGET_PAGES = old_tp
|
||||||
|
cfg.PAGE_CHAR_ESTIMATE = old_pce
|
||||||
|
self.assertIsNotNone(out)
|
||||||
|
s = out[1]['target_chars'] + out[2]['target_chars']
|
||||||
|
self.assertEqual(s, 100 * 700)
|
||||||
|
self.assertEqual(out[1]['target_chars'], out[2]['target_chars'])
|
||||||
|
|
||||||
|
def test_monotonicity_high_weight_match(self):
|
||||||
|
rating = json.dumps(
|
||||||
|
{
|
||||||
|
'items': [
|
||||||
|
{'name': '施工组织设计', 'weight': 50, 'keywords': ['进度']},
|
||||||
|
{'name': '页眉页脚规范', 'weight': 2, 'keywords': []},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
ensure_ascii=False,
|
||||||
|
)
|
||||||
|
leaves = [
|
||||||
|
{'id': 10, 'section_title': '3.1 施工组织设计与进度计划'},
|
||||||
|
{'id': 11, 'section_title': '9.9 页眉格式说明'},
|
||||||
|
]
|
||||||
|
rules = dict(wa.DEFAULT_RULES)
|
||||||
|
rules['alpha'] = 0.95
|
||||||
|
out = wa.compute_leaf_allocations('standard', leaves, rating, rules)
|
||||||
|
self.assertIsNotNone(out)
|
||||||
|
t_high = out[10]['target_chars']
|
||||||
|
t_low = out[11]['target_chars']
|
||||||
|
self.assertGreaterEqual(t_high, t_low, '强匹配高分项的章节应不低于弱匹配章节')
|
||||||
|
self.assertIn('施工组织设计', out[10]['word_count_spec'])
|
||||||
|
|
||||||
|
def test_budget_anchor_mean(self):
|
||||||
|
rating = json.dumps(
|
||||||
|
{'items': [{'name': '技术部分', 'weight': 100}]},
|
||||||
|
ensure_ascii=False,
|
||||||
|
)
|
||||||
|
leaves = [
|
||||||
|
{'id': 1, 'section_title': 'A'},
|
||||||
|
{'id': 2, 'section_title': 'B'},
|
||||||
|
{'id': 3, 'section_title': 'C'},
|
||||||
|
]
|
||||||
|
rules = dict(wa.DEFAULT_RULES)
|
||||||
|
rules['budget_mode'] = 'anchor_mean'
|
||||||
|
rules['alpha'] = 0.0
|
||||||
|
old_tp = getattr(cfg, 'TARGET_PAGES', 0)
|
||||||
|
setattr(cfg, 'TARGET_PAGES', 0)
|
||||||
|
try:
|
||||||
|
out = wa.compute_leaf_allocations('standard', leaves, rating, rules)
|
||||||
|
finally:
|
||||||
|
setattr(cfg, 'TARGET_PAGES', old_tp)
|
||||||
|
self.assertIsNotNone(out)
|
||||||
|
base, core, _, _ = wa.VOLUME_PRESETS['standard']
|
||||||
|
expect = int(round(len(leaves) * (base + core) / 2.0))
|
||||||
|
s = sum(out[i]['target_chars'] for i in (1, 2, 3))
|
||||||
|
self.assertEqual(s, expect)
|
||||||
|
|
||||||
|
def test_budget_target_pages(self):
|
||||||
|
rating = json.dumps(
|
||||||
|
{'items': [{'name': '技术部分', 'weight': 100}]},
|
||||||
|
ensure_ascii=False,
|
||||||
|
)
|
||||||
|
leaves = [
|
||||||
|
{'id': 1, 'section_title': 'A'},
|
||||||
|
{'id': 2, 'section_title': 'B'},
|
||||||
|
]
|
||||||
|
rules = dict(wa.DEFAULT_RULES)
|
||||||
|
rules['budget_mode'] = 'target_pages'
|
||||||
|
rules['alpha'] = 0.0
|
||||||
|
old_tp = cfg.TARGET_PAGES
|
||||||
|
old_pce = cfg.PAGE_CHAR_ESTIMATE
|
||||||
|
try:
|
||||||
|
cfg.TARGET_PAGES = 100
|
||||||
|
cfg.PAGE_CHAR_ESTIMATE = 700
|
||||||
|
out = wa.compute_leaf_allocations('standard', leaves, rating, rules)
|
||||||
|
finally:
|
||||||
|
cfg.TARGET_PAGES = old_tp
|
||||||
|
cfg.PAGE_CHAR_ESTIMATE = old_pce
|
||||||
|
self.assertIsNotNone(out)
|
||||||
|
expect = 100 * 700
|
||||||
|
s = sum(out[i]['target_chars'] for i in (1, 2))
|
||||||
|
self.assertEqual(s, expect)
|
||||||
|
|
||||||
|
def test_budget_target_pages_falls_back_when_pages_zero(self):
|
||||||
|
rating = json.dumps(
|
||||||
|
{'items': [{'name': '技术部分', 'weight': 100}]},
|
||||||
|
ensure_ascii=False,
|
||||||
|
)
|
||||||
|
leaves = [
|
||||||
|
{'id': 1, 'section_title': 'A'},
|
||||||
|
{'id': 2, 'section_title': 'B'},
|
||||||
|
]
|
||||||
|
rules = dict(wa.DEFAULT_RULES)
|
||||||
|
rules['budget_mode'] = 'target_pages'
|
||||||
|
rules['alpha'] = 0.0
|
||||||
|
old_tp = cfg.TARGET_PAGES
|
||||||
|
try:
|
||||||
|
cfg.TARGET_PAGES = 0
|
||||||
|
out = wa.compute_leaf_allocations('standard', leaves, rating, rules)
|
||||||
|
finally:
|
||||||
|
cfg.TARGET_PAGES = old_tp
|
||||||
|
self.assertIsNotNone(out)
|
||||||
|
base, core, _, _ = wa.VOLUME_PRESETS['standard']
|
||||||
|
expect = int(round(len(leaves) * (base + core) / 2.0))
|
||||||
|
s = sum(out[i]['target_chars'] for i in (1, 2))
|
||||||
|
self.assertEqual(s, expect)
|
||||||
|
|
||||||
|
def test_continuation_threshold(self):
|
||||||
|
self.assertEqual(wa.continuation_threshold(2000), 1300)
|
||||||
|
self.assertEqual(wa.continuation_threshold(100), 200)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
1
utils/__init__.py
Normal file
1
utils/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
|
||||||
259
utils/ai_client.py
Normal file
259
utils/ai_client.py
Normal file
@ -0,0 +1,259 @@
|
|||||||
|
"""
|
||||||
|
AI API 调用封装,支持 OpenAI、阿里云通义千问、DeepSeek、Ollama(均兼容 OpenAI SDK)
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
from openai import OpenAI
|
||||||
|
import config
|
||||||
|
from contextlib import contextmanager # for type hints if needed
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
PROVIDER_NAMES = {
|
||||||
|
'qwen': '通义千问 (Qwen)',
|
||||||
|
'deepseek': 'DeepSeek',
|
||||||
|
'openai': 'OpenAI',
|
||||||
|
'ollama': 'Ollama 本地',
|
||||||
|
'doubao': '豆包 (Doubao)',
|
||||||
|
'kimi': 'Kimi (Moonshot)',
|
||||||
|
}
|
||||||
|
|
||||||
|
PROVIDER_LINKS = {
|
||||||
|
'qwen': 'https://dashscope.aliyun.com/',
|
||||||
|
'deepseek': 'https://platform.deepseek.com/',
|
||||||
|
'openai': 'https://platform.openai.com/',
|
||||||
|
'ollama': 'https://ollama.com/',
|
||||||
|
'doubao': 'https://console.volcengine.com/ark/',
|
||||||
|
'kimi': 'https://platform.moonshot.cn/',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _check_api_key():
|
||||||
|
"""调用前预检 API Key,无效时直接抛出友好提示,不做无意义的重试"""
|
||||||
|
provider = config.MODEL_PROVIDER
|
||||||
|
|
||||||
|
# Ollama 本地无需 API Key,跳过检查
|
||||||
|
if provider == 'ollama':
|
||||||
|
return
|
||||||
|
|
||||||
|
name = PROVIDER_NAMES.get(provider, provider)
|
||||||
|
link = PROVIDER_LINKS.get(provider, '')
|
||||||
|
|
||||||
|
if provider == 'qwen':
|
||||||
|
key = config.QWEN_API_KEY
|
||||||
|
elif provider == 'deepseek':
|
||||||
|
key = config.DEEPSEEK_API_KEY
|
||||||
|
elif provider == 'doubao':
|
||||||
|
key = config.DOUBAO_API_KEY
|
||||||
|
elif provider == 'kimi':
|
||||||
|
key = config.KIMI_API_KEY
|
||||||
|
else:
|
||||||
|
key = config.OPENAI_API_KEY
|
||||||
|
|
||||||
|
if not key or key.startswith('sk-your'):
|
||||||
|
raise RuntimeError(
|
||||||
|
f'尚未配置 {name} 的 API Key。'
|
||||||
|
f'请点击右上角设置按钮,选择"{name}"并填入有效的 API Key。'
|
||||||
|
f'申请地址:{link}'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_client() -> OpenAI:
|
||||||
|
"""根据 MODEL_PROVIDER 返回对应的 OpenAI 兼容客户端"""
|
||||||
|
if config.MODEL_PROVIDER == 'qwen':
|
||||||
|
return OpenAI(api_key=config.QWEN_API_KEY, base_url=config.QWEN_BASE_URL)
|
||||||
|
if config.MODEL_PROVIDER == 'deepseek':
|
||||||
|
return OpenAI(api_key=config.DEEPSEEK_API_KEY, base_url=config.DEEPSEEK_BASE_URL)
|
||||||
|
if config.MODEL_PROVIDER == 'ollama':
|
||||||
|
return OpenAI(api_key='ollama', base_url=config.OLLAMA_BASE_URL)
|
||||||
|
if config.MODEL_PROVIDER == 'doubao':
|
||||||
|
return OpenAI(api_key=config.DOUBAO_API_KEY, base_url=config.DOUBAO_BASE_URL)
|
||||||
|
if config.MODEL_PROVIDER == 'kimi':
|
||||||
|
return OpenAI(api_key=config.KIMI_API_KEY, base_url=config.KIMI_BASE_URL)
|
||||||
|
return OpenAI(api_key=config.OPENAI_API_KEY, base_url=config.OPENAI_BASE_URL)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_model() -> str:
|
||||||
|
if config.MODEL_PROVIDER == 'qwen':
|
||||||
|
return config.QWEN_MODEL
|
||||||
|
if config.MODEL_PROVIDER == 'deepseek':
|
||||||
|
return config.DEEPSEEK_MODEL
|
||||||
|
if config.MODEL_PROVIDER == 'ollama':
|
||||||
|
return config.OLLAMA_MODEL
|
||||||
|
if config.MODEL_PROVIDER == 'doubao':
|
||||||
|
return config.DOUBAO_MODEL
|
||||||
|
if config.MODEL_PROVIDER == 'kimi':
|
||||||
|
return config.KIMI_MODEL
|
||||||
|
return config.OPENAI_MODEL
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_response(text: str) -> str:
|
||||||
|
"""
|
||||||
|
过滤推理模型(DeepSeek R1 / QwQ 等)输出的 <think>...</think> 思考过程标签,
|
||||||
|
只保留最终正文内容,避免思考链污染标书正文。
|
||||||
|
"""
|
||||||
|
# 去除 <think>...</think> 块(含跨行内容)
|
||||||
|
text = re.sub(r'<think>[\s\S]*?</think>', '', text, flags=re.IGNORECASE)
|
||||||
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _is_auth_error(e: Exception) -> bool:
|
||||||
|
"""判断是否为认证错误(401 / invalid_api_key),无需重试"""
|
||||||
|
# 优先用 openai 原生异常类型判断
|
||||||
|
try:
|
||||||
|
from openai import AuthenticationError, PermissionDeniedError
|
||||||
|
if isinstance(e, (AuthenticationError, PermissionDeniedError)):
|
||||||
|
return True
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
# 兜底:字符串匹配
|
||||||
|
err_str = str(e).lower()
|
||||||
|
return ('401' in err_str or 'invalid_api_key' in err_str
|
||||||
|
or 'incorrect api key' in err_str or 'authentication' in err_str)
|
||||||
|
|
||||||
|
|
||||||
|
# OpenAI o 系列推理模型:不支持 temperature,max_tokens 需用 max_completion_tokens
|
||||||
|
_OPENAI_REASONING_MODELS = {'o1', 'o1-mini', 'o1-pro', 'o3', 'o3-mini', 'o3-pro', 'o4-mini'}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_chat_kwargs(
|
||||||
|
model: str,
|
||||||
|
messages: list,
|
||||||
|
temperature: float,
|
||||||
|
max_tokens: int,
|
||||||
|
request_timeout: float | None = None,
|
||||||
|
) -> dict:
|
||||||
|
"""
|
||||||
|
根据模型类型构建 chat.completions.create 的参数字典。
|
||||||
|
OpenAI o 系列推理模型不接受 temperature,且使用 max_completion_tokens 替代 max_tokens。
|
||||||
|
"""
|
||||||
|
base_model = model.split(':')[0] # 去掉 ollama tag 后缀
|
||||||
|
is_reasoning = base_model in _OPENAI_REASONING_MODELS
|
||||||
|
|
||||||
|
to = request_timeout if request_timeout is not None else config.REQUEST_TIMEOUT
|
||||||
|
kwargs = {
|
||||||
|
'model': model,
|
||||||
|
'messages': messages,
|
||||||
|
'timeout': to,
|
||||||
|
}
|
||||||
|
if is_reasoning:
|
||||||
|
kwargs['max_completion_tokens'] = max_tokens
|
||||||
|
else:
|
||||||
|
kwargs['temperature'] = temperature
|
||||||
|
kwargs['max_tokens'] = max_tokens
|
||||||
|
return kwargs
|
||||||
|
|
||||||
|
|
||||||
|
def chat(
|
||||||
|
prompt: str,
|
||||||
|
system: str = '你是一位专业的投标文件撰写专家。',
|
||||||
|
temperature: float = 0.7,
|
||||||
|
max_tokens: int = 8192,
|
||||||
|
retries: int = None,
|
||||||
|
request_timeout: float | None = None,
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
调用 AI 接口,返回文本响应。
|
||||||
|
认证错误立即终止;其他错误指数退避重试。
|
||||||
|
自动兼容 OpenAI o 系列推理模型的参数差异。
|
||||||
|
所有调用受全局LLM_SEMAPHORE(上限20)保护,实现极速并发优化。
|
||||||
|
"""
|
||||||
|
_check_api_key()
|
||||||
|
|
||||||
|
max_retries = retries if retries is not None else config.MAX_RETRIES
|
||||||
|
client = _get_client()
|
||||||
|
model = _get_model()
|
||||||
|
provider = config.MODEL_PROVIDER
|
||||||
|
name = PROVIDER_NAMES.get(provider, provider)
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{'role': 'system', 'content': system},
|
||||||
|
{'role': 'user', 'content': prompt},
|
||||||
|
]
|
||||||
|
|
||||||
|
for attempt in range(max_retries):
|
||||||
|
try:
|
||||||
|
with config.llm_call(): # 全局并发控制,上限20
|
||||||
|
kwargs = _build_chat_kwargs(
|
||||||
|
model, messages, temperature, max_tokens, request_timeout=request_timeout
|
||||||
|
)
|
||||||
|
resp = client.chat.completions.create(**kwargs)
|
||||||
|
return _clean_response(resp.choices[0].message.content.strip())
|
||||||
|
except Exception as e:
|
||||||
|
if _is_auth_error(e):
|
||||||
|
raise RuntimeError(
|
||||||
|
f'{name} API Key 无效或已过期,请在设置中重新配置。'
|
||||||
|
f'申请地址:{PROVIDER_LINKS.get(provider, "")}'
|
||||||
|
) from e
|
||||||
|
|
||||||
|
wait = 2 ** attempt
|
||||||
|
logger.warning(f'AI 请求失败 (第{attempt+1}次),{wait}s 后重试: {e}')
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
time.sleep(wait)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f'AI 接口调用失败(已重试 {max_retries} 次): {e}') from e
|
||||||
|
|
||||||
|
return ''
|
||||||
|
|
||||||
|
|
||||||
|
def chat_with_history(system: str, messages: list,
|
||||||
|
temperature: float = 0.7, max_tokens: int = 4096) -> str:
|
||||||
|
"""
|
||||||
|
多轮对话接口,支持完整历史上下文,用于对话式章节生成。
|
||||||
|
messages 格式:[{'role': 'user'|'assistant', 'content': str}, ...]
|
||||||
|
受全局LLM_SEMAPHORE保护。
|
||||||
|
"""
|
||||||
|
_check_api_key()
|
||||||
|
|
||||||
|
client = _get_client()
|
||||||
|
model = _get_model()
|
||||||
|
provider = config.MODEL_PROVIDER
|
||||||
|
name = PROVIDER_NAMES.get(provider, provider)
|
||||||
|
|
||||||
|
full_messages = [{'role': 'system', 'content': system}] + messages
|
||||||
|
|
||||||
|
for attempt in range(config.MAX_RETRIES):
|
||||||
|
try:
|
||||||
|
with config.llm_call(): # 全局并发控制
|
||||||
|
kwargs = _build_chat_kwargs(model, full_messages, temperature, max_tokens)
|
||||||
|
resp = client.chat.completions.create(**kwargs)
|
||||||
|
return _clean_response(resp.choices[0].message.content.strip())
|
||||||
|
except Exception as e:
|
||||||
|
if _is_auth_error(e):
|
||||||
|
raise RuntimeError(
|
||||||
|
f'{name} API Key 无效或已过期,请在设置中重新配置。'
|
||||||
|
f'申请地址:{PROVIDER_LINKS.get(provider, "")}'
|
||||||
|
) from e
|
||||||
|
wait = 2 ** attempt
|
||||||
|
logger.warning(f'对话 AI 请求失败 (第{attempt+1}次),{wait}s 后重试: {e}')
|
||||||
|
if attempt < config.MAX_RETRIES - 1:
|
||||||
|
time.sleep(wait)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f'AI 接口调用失败(已重试 {config.MAX_RETRIES} 次): {e}') from e
|
||||||
|
|
||||||
|
return ''
|
||||||
|
|
||||||
|
|
||||||
|
def get_embeddings(texts: list[str]) -> list[list[float]]:
|
||||||
|
"""获取文本嵌入向量。
|
||||||
|
支持 Qwen、OpenAI、Kimi;DeepSeek / Ollama / 豆包 暂不提供 Embedding API。
|
||||||
|
受全局LLM_SEMAPHORE保护(嵌入调用计入并发上限)。
|
||||||
|
"""
|
||||||
|
provider = config.MODEL_PROVIDER
|
||||||
|
if provider in ('deepseek', 'ollama', 'doubao'):
|
||||||
|
raise NotImplementedError(
|
||||||
|
f'{PROVIDER_NAMES.get(provider)} 暂不支持 Embedding API,知识库将使用关键词检索降级'
|
||||||
|
)
|
||||||
|
|
||||||
|
client = _get_client()
|
||||||
|
if provider == 'qwen':
|
||||||
|
model = config.QWEN_EMBEDDING_MODEL
|
||||||
|
elif provider == 'kimi':
|
||||||
|
model = config.KIMI_EMBEDDING_MODEL
|
||||||
|
else:
|
||||||
|
model = config.OPENAI_EMBEDDING_MODEL
|
||||||
|
|
||||||
|
with config.llm_call(): # 嵌入也受并发限制
|
||||||
|
resp = client.embeddings.create(model=model, input=texts)
|
||||||
|
return [item.embedding for item in resp.data]
|
||||||
186
utils/attachment_section.py
Normal file
186
utils/attachment_section.py
Normal file
@ -0,0 +1,186 @@
|
|||||||
|
"""
|
||||||
|
附件类章节识别:标题匹配、expand_outline 跳过、以及正文模式(完整正文 vs 仅单图单表)。
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
import config
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
DEFAULT_ATTACHMENT_RULES: Dict[str, Any] = {
|
||||||
|
'schema_version': 1,
|
||||||
|
'title_regex': [
|
||||||
|
r'附件\s*[一二三四五六七八九十0-9A-Za-z、::.]',
|
||||||
|
r'附\s*图',
|
||||||
|
r'附\s*表',
|
||||||
|
r'附\s*件\s*\(',
|
||||||
|
r'^\s*[\d一二三四五六七八九十\..、]+\s*附件',
|
||||||
|
],
|
||||||
|
'table_hint_keywords': [
|
||||||
|
'附表', '一览表', '清单表', '统计表', '明细表',
|
||||||
|
],
|
||||||
|
'figure_hint_keywords': [
|
||||||
|
'附图', '示意图', '平面图', '流程图', '布置图', '组织图', '横道',
|
||||||
|
],
|
||||||
|
'default_kind_when_ambiguous': 'table',
|
||||||
|
# stack_charts_only:默认,意图栈只输出 [FIGURE]/[TABLE] 无正文;full:长文;single_chart_only:栈顶仅一块
|
||||||
|
'attachment_leaf_body_mode': 'stack_charts_only',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def attachment_rules_path() -> str:
|
||||||
|
return os.path.join(config.DATA_DIR, 'attachment_section_rules.json')
|
||||||
|
|
||||||
|
|
||||||
|
def load_attachment_rules(path: Optional[str] = None) -> Dict[str, Any]:
|
||||||
|
p = path or attachment_rules_path()
|
||||||
|
data = dict(DEFAULT_ATTACHMENT_RULES)
|
||||||
|
if not os.path.isfile(p):
|
||||||
|
return data
|
||||||
|
try:
|
||||||
|
with open(p, encoding='utf-8') as f:
|
||||||
|
raw = json.load(f)
|
||||||
|
if isinstance(raw, dict):
|
||||||
|
for k, v in raw.items():
|
||||||
|
if k.startswith('_'):
|
||||||
|
continue
|
||||||
|
data[k] = v
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning('加载 attachment_section_rules.json 失败,使用内置默认: %s', e)
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def attachment_leaf_body_mode(rules: Optional[Dict[str, Any]] = None) -> str:
|
||||||
|
"""
|
||||||
|
附件叶节点正文策略:
|
||||||
|
stack_charts_only(默认)、full(完整技术正文)、single_chart_only(栈顶仅一块图或表)。
|
||||||
|
"""
|
||||||
|
r = rules or get_attachment_rules_cached()
|
||||||
|
mode = (r.get('attachment_leaf_body_mode') or 'stack_charts_only').strip().lower()
|
||||||
|
if mode in ('single_chart_only', 'stack_charts_only', 'full'):
|
||||||
|
return mode
|
||||||
|
return 'stack_charts_only'
|
||||||
|
|
||||||
|
|
||||||
|
def use_attachment_stack_charts_body(rules: Optional[Dict[str, Any]] = None) -> bool:
|
||||||
|
"""附件走「意图栈仅图/表、无长文」路径(含 single_chart_only 的单栈顶版本)。"""
|
||||||
|
m = attachment_leaf_body_mode(rules)
|
||||||
|
return m in ('stack_charts_only', 'single_chart_only')
|
||||||
|
|
||||||
|
|
||||||
|
def use_attachment_single_chart_only_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """True when the attachment body should contain only the single stack-top chart."""
    mode = attachment_leaf_body_mode(rules)
    return mode == 'single_chart_only'
|
||||||
|
|
||||||
|
|
||||||
|
def use_attachment_full_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """True when attachment leaf sections get a complete technical body."""
    mode = attachment_leaf_body_mode(rules)
    return mode == 'full'
|
||||||
|
|
||||||
|
|
||||||
|
def is_attachment_only_section(section_title: str, rules: Optional[Dict[str, Any]] = None) -> bool:
    """Whether *section_title* is an attachment-type heading (附图/附表/附件N …).

    Used by expand_outline to skip sub-chapter generation and by the body
    branch.  Invalid regexes in the rules file are logged and skipped.
    """
    title = (section_title or '').strip()
    if not title:
        return False
    cfg = rules or load_attachment_rules()
    for pattern in list(cfg.get('title_regex') or []):
        try:
            hit = re.search(pattern, title)
        except re.error:
            logger.warning('无效 attachment title_regex,已跳过: %s', pattern[:80])
            continue
        if hit:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def pick_single_figure_or_table(
    section_title: str,
    enable_figure: bool,
    enable_table: bool,
    rules: Optional[Dict[str, Any]] = None,
) -> Optional[str]:
    """
    Decide the single chart kind for an attachment section.

    Returns ``'figure'``, ``'table'`` or ``None``: both switches off gives
    None; exactly one on gives that kind; otherwise the title is matched
    against hint keywords, then bare 图/表 characters, finally falling back
    to ``default_kind_when_ambiguous``.
    """
    if not (enable_figure or enable_table):
        return None
    if enable_figure != enable_table:
        return 'figure' if enable_figure else 'table'

    cfg = rules or load_attachment_rules()
    title = section_title or ''
    table_hints = list(cfg.get('table_hint_keywords') or [])
    figure_hints = list(cfg.get('figure_hint_keywords') or [])
    # A lone 「表」 character is error-prone; explicit hint keywords win first.
    if any(kw and kw in title for kw in table_hints):
        return 'table'
    if any(kw and kw in title for kw in figure_hints):
        return 'figure'
    # A generic 表/图 character is common in attachment context.
    has_table_char = '表' in title
    has_figure_char = '图' in title
    if has_table_char and not has_figure_char:
        return 'table'
    if has_figure_char and not has_table_char:
        return 'figure'

    default = (cfg.get('default_kind_when_ambiguous') or 'table').strip().lower()
    return 'figure' if default == 'figure' else 'table'
|
||||||
|
|
||||||
|
|
||||||
|
# Lazily populated module-level cache for the attachment rules.
_cached_rules: Optional[Dict[str, Any]] = None


def get_attachment_rules_cached() -> Dict[str, Any]:
    """Return the attachment rules, reading them from disk only once."""
    global _cached_rules
    if _cached_rules is None:
        _cached_rules = load_attachment_rules()
    return _cached_rules
|
||||||
|
|
||||||
|
|
||||||
|
def should_skip_expand_subchapters(title: str) -> bool:
    """
    During AI outline expansion (expand_outline): attachment-type top-level
    chapters get no generated sub-chapters.  Uses the same criterion as
    ``is_attachment_only_section``.
    """
    rules = get_attachment_rules_cached()
    return is_attachment_only_section(title, rules)
|
||||||
|
|
||||||
|
|
||||||
|
# Extract an "附件N"-style label from a title, for logging.
_ATTACHMENT_LABEL_RE = re.compile(
    r'附件\s*[::]?\s*([一二三四五六七八九十百0-9A-Za-z]+)',
)
# Matches 附图/附表 with an optional numeral suffix (group 1 may be empty).
_ATTACHMENT_FIG_TBL_RE = re.compile(r'附\s*[图表]\s*([一二三四五六七八九十百0-9]*)')
|
||||||
|
|
||||||
|
|
||||||
|
def parse_attachment_label(title: str) -> Optional[str]:
    """Extract a loggable attachment label from *title*.

    Returns the numeral after 附件 (e.g. '三'), the numeral after 附图/附表,
    the literal '附图'/'附表' when no numeral follows, or None when the
    title is empty or not attachment-like.
    """
    text = (title or '').strip()
    if not text:
        return None
    label = _ATTACHMENT_LABEL_RE.search(text)
    if label:
        return label.group(1).strip() or None
    fig_tbl = _ATTACHMENT_FIG_TBL_RE.search(text)
    if fig_tbl:
        numeral = (fig_tbl.group(1) or '').strip()
        if numeral:
            return numeral
        return '附图' if '图' in fig_tbl.group(0) else '附表'
    # Defensive fallbacks for spaced variants; the regex above already
    # covers these in practice.
    if re.search(r'附\s*图', text):
        return '附图'
    if re.search(r'附\s*表', text):
        return '附表'
    return None
|
||||||
577
utils/bill_analysis.py
Normal file
577
utils/bill_analysis.py
Normal file
@ -0,0 +1,577 @@
|
|||||||
|
"""
|
||||||
|
工程量清单本地分析(从 bill-worker.js Phase 2/3 移植)。
|
||||||
|
Phase 2:按页关键字筛选清单页;Phase 3:正则解析分部与清单项。
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Page-level header keywords that mark a bill-of-quantities page.
BILL_KW = ['项目编码', '项目名称', '工程量', '计量单位', '综合单价', '清单编码']
# Section-title keywords of bill chapters.
SEC_KW = ['分部分项', '分类分项', '措施项目', '其他项目', '工程量清单计价']
# Keywords typical of fee/tax summary pages (used to exclude such pages).
FEE_PAGE_KW = [
    '规费', '税金', '社会保险费', '住房公积金', '养老保险',
    '工伤保险', '失业保险', '医疗保险', '教育费附加', '城市维护建设税',
]

# Hierarchical item number at line start, e.g. "1.2.3 ".
ITEM_START = re.compile(r'^\d+(\.\d+)+\s')
# A 9-12 digit (or B-prefixed) list code appearing mid-line.
CODE_INLINE = re.compile(r'(?:^|\s)(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s')
# Same code anchored at the very start of a line.
CODE_START_RE = re.compile(r'^(\d{9,12}|B\d{5,6})\s')
# Sequence number (1-4 digits) followed by a list code.
SEQ_CODE_RE = re.compile(r'^\d{1,4}\s+(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s')
# Page separator emitted by the extractor, e.g. "-- 3 of 12 --".
PAGE_MARK = re.compile(r'^--\s*\d+\s+of\s+\d+\s*--')
# Table header rows to drop before parsing.
HEADER_RE = re.compile(r'^序号\s+(项目编码|项目名称)')
HEADER_KW = re.compile(
    r'^(项目编码|项目名称|清单编码|计量单位|综合单价|工程量|合\s*价|金额|序号)\s'
)
# Chinese numerals that may open a category heading line.
CATEGORY_MARKERS = [
    '一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
    '(一)', '(二)', '(三)', '(四)', '(五)',
]

# Code: inline 9-12 digit number or B-code (letter prefixes such as GB excluded).
CODE_RE = re.compile(r'(?<![A-Za-z])(\d{9,12}|(?<![A-Za-z])B\d{5,6})')

# Measurement-unit tokens recognised after an item name.
UNIT_TOKENS = [
    'm³', 'm²', 'm3', 'm2', 'km', 'hm2', '㎡', '㎥', 't', 'kg',
    '个', '台', '套', '组', '根', '块', '片', '张', '只', '吨', '项',
    '处', '座', '件', '段', '条', '把', '扇', '口', '圈', '道', '孔',
    '对', '副', '樘', '方', '延m', '株', '棵', 'm',
]
UNIT_SET = frozenset(UNIT_TOKENS)
_unit_escaped = [re.escape(u) for u in UNIT_TOKENS]
# A unit token delimited by whitespace (or line start) and followed by
# whitespace, a digit or end-of-string.
UNIT_RE = re.compile(r'(?:^|\s)(' + '|'.join(_unit_escaped) + r')(?=\s|\d|$)')

# Subtotal / grand-total rows to skip entirely.
SKIP_RE = re.compile(r'合\s*计|小\s*计|本页小计|总\s*计|价税合计')

# Dash-separated code fragments (hyphen / U+2010 / en dash), e.g. "0102-0304-05(-06)".
_DASH_CODE = re.compile(
    r'(\d{2,4})[-‐–](\d{2,4})[-‐–](\d{2,4})(?:[-‐–](\d{2,4}))?'
)

# Item names that are exactly a fee/tax entry (filtered out of the bill).
_EXACT_FEE_ITEM = frozenset([
    '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
    '总承包服务费', '企业管理费', '甲供材料保管费', '价税合计',
])
# Substrings that mark a fee-type item name.
_FEE_KW = [
    '安全文明', '文明施工费', '环境保护费', '临时设施费',
    '夜间施工增加费', '夜间施工费',
    '冬雨季施工增加费', '冬雨季施工费',
    '二次搬运费', '大型机械设备进出场', '大型机械进出场',
    '施工排水降水', '排水降水费',
    '已完工程及设备保护', '已完工程保护费',
    '工程排污费', '社会保障费', '住房公积金',
    '工伤保险', '劳动保险', '意外伤害保险', '建筑工程保险',
    '城市维护建设税', '城市建设维护税',
    '教育费附加', '地方教育附加',
    '材料暂估', '专业工程暂估',
    '超高施工增加费', '安全防护费',
    '措施项目费', '其他项目费', '不可竞争费',
]

# Substrings that mark a genuine engineering-category title.
_CAT_KW = [
    '土建', '建筑', '结构', '装饰', '装修', '安装', '给排水', '暖通', '空调', '通风',
    '电气', '强电', '弱电', '消防', '智能化', '幕墙', '门窗', '园林', '绿化', '景观',
    '市政', '道路', '桥梁', '管网', '基础', '地基', '桩基', '主体', '屋面', '防水',
    '保温', '钢结构', '排水', '给水', '照明', '动力', '防雷', '电梯', '人防', '室外',
    '附属', '分部', '工程', '措施', '清单', '土石方', '混凝土', '砌筑', '模板', '脚手架',
    '水利', '河道', '管道', '阀门', '设备', '仪表', '自动化', '通信', '网络',
    '拆除', '外墙', '内墙', '楼地面', '天棚', '吊顶', '栏杆', '屋顶', '涂料', '抹灰',
    '廊道', '阀门井', '蓄水池', '泵站', '供水', '引水', '水源', '渠道', '闸门',
    '围栏', '警示', '检修', '管线', '配电', '水池', '水塔', '取水', '净水',
]

# Category titles that are exactly a fee section.
_EXACT_FEE_CAT = frozenset([
    '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
    '总承包服务费', '企业管理费', '价税合计',
    '措施项目费', '其他项目费', '不可竞争费',
])
# Substrings that mark a fee-type category title.
_FEE_CAT_KW = [
    '措施项目费', '其他项目费', '不可竞争费',
    '规费汇总', '税金汇总', '费率', '费用汇总', '费用合计',
    '暂列金额', '暂估价', '计日工', '总承包服务费',
    '安全文明施工费', '社会保障费', '住房公积金',
    '工伤保险', '教育费附加', '城市维护建设税',
]

# Spec-attribute prefixes ("材质:", "规格:" …) used to split name from spec.
_SPEC_KW_RE = re.compile(
    r'(材质|规格|型号|品牌|颜色|尺寸|厚度|直径|管径|强度|等级|类别|类型|做法|要求|标准|内容|工作内容|土壤|含量|配合比|工艺|方式|形式|范围|部位|位置|高度|宽度|长度|深度|坡度|截面|跨度|运距|开挖|回填|混凝土|钢筋|压实)[::]'
)
|
||||||
|
|
||||||
|
|
||||||
|
def _fold_dash_codes(line: str) -> str:
    """Join dash-separated code fragments ("0102-0304-0506") into one code
    when the concatenation is 9-12 digits long; otherwise leave untouched."""
    def _join(match: re.Match) -> str:
        combined = ''.join(g or '' for g in match.groups())
        return combined if 9 <= len(combined) <= 12 else match.group(0)

    return _DASH_CODE.sub(_join, line)
|
||||||
|
|
||||||
|
|
||||||
|
def is_fee_item(name: str) -> bool:
    """True when *name* is a fee/tax line item (exact match or keyword hit)."""
    if not name:
        return False
    compact = re.sub(r'\s+', '', name)
    if compact in _EXACT_FEE_ITEM:
        return True
    return any(kw in compact for kw in _FEE_KW)
|
||||||
|
|
||||||
|
|
||||||
|
def split_name_and_spec(raw_name: str) -> tuple[str, str]:
    """Split an item cell into ``(name, spec)``.

    Splits at the first numbered sub-clause ("1、…"), spec-attribute prefix
    ("规格:…") or parenthesised number — whichever rule matches first and
    not at position 0; otherwise the whole string is the name.
    """
    if not raw_name:
        return '', ''
    numbered = re.search(r'\d+[.、.)\uFF09]\s*[\u4e00-\u9fff]', raw_name)
    if numbered and numbered.start() > 0:
        cut = numbered.start()
        return raw_name[:cut].strip(), raw_name[cut:].strip()
    attr = _SPEC_KW_RE.search(raw_name)
    if attr and attr.start() > 0:
        cut = attr.start()
        return raw_name[:cut].strip(), raw_name[cut:].strip()
    paren = re.search(r'[((]\d+[))]', raw_name)
    if paren and paren.start() > 0:
        cut = paren.start()
        return raw_name[:cut].strip(), raw_name[cut:].strip()
    return raw_name, ''
|
||||||
|
|
||||||
|
|
||||||
|
def is_cat_title(text: str) -> bool:
    """True when *text* contains any engineering-category keyword."""
    for kw in _CAT_KW:
        if kw in text:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def is_fee_cat_title(text: str) -> bool:
    """True when *text* names a fee section (exact match or keyword hit)."""
    if not text:
        return False
    compact = re.sub(r'\s+', '', text)
    return compact in _EXACT_FEE_CAT or any(kw in compact for kw in _FEE_CAT_KW)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_new_line_trigger(raw: str) -> bool:
    """Whether *raw* begins a new logical line: a hierarchical item number,
    a list code, a sequence+code pair, or a category marker followed by an
    (ASCII or ideographic) space."""
    if ITEM_START.match(raw) or CODE_START_RE.match(raw) or SEQ_CODE_RE.match(raw):
        return True
    return any(
        raw.startswith(mark + ' ') or raw.startswith(mark + '\u3000')
        for mark in CATEGORY_MARKERS
    )
|
||||||
|
|
||||||
|
|
||||||
|
def parse_bill_text(text: str) -> dict[str, Any]:
    """Parse merged bill-of-quantities text into categories and items.

    Phase 3 of the local analysis (ported from bill-worker.js): lines are
    first merged into "logical lines", then scanned for codes, units,
    quantities and category headings; fee items are filtered out and
    duplicate names merged (quantities summed when numeric).

    Returns a dict with ``project_summary`` (a remark string) and
    ``categories`` (each ``{'name': str, 'items': [dict, ...]}``).
    """
    # --- normalise raw lines: tabs to spaces, fold dash-split codes ---
    raw_lines = []
    for l in text.split('\n'):
        line = l.replace('\t', ' ').strip()
        line = _fold_dash_codes(line)
        raw_lines.append(line)

    # --- merge physical lines into logical lines ---
    logic_lines: list[str] = []
    current_line = ''

    for raw in raw_lines:
        # Drop blanks, page markers and table header rows.
        if not raw or PAGE_MARK.match(raw):
            continue
        if HEADER_RE.match(raw) or HEADER_KW.match(raw):
            continue
        if re.match(r'^(元)|^款章节号|^备注$|^第\d+页', raw):
            continue

        if _is_new_line_trigger(raw):
            # A new item/category start flushes the buffered line.
            if current_line:
                logic_lines.append(current_line)
            current_line = raw
        elif CODE_INLINE.search(raw) and len(raw) > 15:
            # A substantial line carrying an inline code also starts fresh.
            if current_line:
                logic_lines.append(current_line)
            current_line = raw
        else:
            # Continuation: append, but cap runaway lines at ~300 chars.
            if current_line and len(current_line) > 300:
                logic_lines.append(current_line)
                current_line = raw
            else:
                current_line = current_line + ' ' + raw if current_line else raw
    if current_line:
        logic_lines.append(current_line)

    logger.debug('合并后 %s 条逻辑行(原始 %s 行)', len(logic_lines), len(raw_lines))

    # --- scan logical lines into categories / items ---
    categories: list[dict[str, Any]] = []
    cur_cat: dict[str, Any] | None = None
    cur_item: dict[str, Any] | None = None

    for line in logic_lines:
        if SKIP_RE.search(line):
            continue

        # Leading sequence numbers: multi-level like "1.1.1.1 ", or a 1-4
        # digit sequence number followed by a 9+ digit code.  Avoid eating a
        # 9-12 digit list code at line start (the JS original \d+(\.\d+)*
        # would swallow the code itself).
        stripped = line.strip()
        m_hier = re.match(r'^\d+(?:\.\d+)+\s+', stripped)
        if m_hier:
            stripped = stripped[m_hier.end():].strip()
        elif re.match(r'^\d{1,4}\s+\d{9}', stripped):
            stripped = re.sub(r'^\d{1,4}\s+', '', stripped, count=1).strip()
        if not stripped:
            stripped = line.strip()
        if not stripped:
            continue

        # --- a list code on the line starts a new item ---
        cm = CODE_RE.search(stripped)
        if cm:
            if cur_item and cur_cat:
                cur_cat['items'].append(cur_item)
            if not cur_cat:
                # No category seen yet: park items under a default bucket.
                cur_cat = {'name': '未分类', 'items': []}
                categories.append(cur_cat)

            code = cm.group(1)
            rest = stripped[cm.end():].strip()
            name, unit, quantity, spec = '', '', '', ''

            unit_match = UNIT_RE.search(rest)
            if unit_match:
                # name | unit | quantity | trailing spec, split at the unit.
                ui = rest.find(unit_match.group(0))
                raw_name = rest[:ui].strip()
                unit = unit_match.group(1)
                after_unit = rest[ui + len(unit_match.group(0)):].strip()
                qm = re.match(r'^([\d,.]+)', after_unit)
                if qm:
                    quantity = qm.group(1)
                    tail = after_unit[qm.end():].strip()
                    if tail:
                        # Skip price-like numeric tokens, keep textual spec.
                        tail_tokens = tail.split()
                        si = 0
                        while si < len(tail_tokens) and re.match(r'^[\d,.%\-]+$', tail_tokens[si]):
                            si += 1
                        spec_tail = ' '.join(tail_tokens[si:]).strip()
                        if spec_tail:
                            spec = spec_tail
                ns_name, ns_spec = split_name_and_spec(raw_name)
                name = ns_name
                if ns_spec:
                    spec = ns_spec + (';' + spec if spec else '')
            else:
                # No whitespace-delimited unit: look for a bare unit token
                # scanning from the right.
                tokens = [t for t in rest.split() if t]
                found_unit_idx = -1
                for ti in range(len(tokens) - 1, 0, -1):
                    if tokens[ti] in UNIT_SET:
                        found_unit_idx = ti
                        break
                if found_unit_idx >= 1:
                    raw_name_str = ' '.join(tokens[:found_unit_idx])
                    ns_name, ns_spec = split_name_and_spec(raw_name_str)
                    name = ns_name
                    if ns_spec:
                        spec = ns_spec
                    unit = tokens[found_unit_idx]
                    after_tokens = tokens[found_unit_idx + 1:]
                    if after_tokens and re.match(r'^[\d,.]+$', after_tokens[0]):
                        quantity = after_tokens[0]
                        si = 1
                        while si < len(after_tokens) and re.match(r'^[\d,.%\-]+$', after_tokens[si]):
                            si += 1
                        spec_tail = ' '.join(after_tokens[si:]).strip()
                        if spec_tail:
                            spec = spec + ';' + spec_tail if spec else spec_tail
                else:
                    name = rest

            # Compact the name; peel a unit glued onto its tail.
            name = re.sub(r'\s+', '', name).strip()
            for u in UNIT_TOKENS:
                if name.endswith(u) and len(name) > len(u):
                    unit = unit or u
                    name = name[: len(name) - len(u)]
                    break

            cur_item = {'code': code, 'name': name, 'unit': unit, 'quantity': quantity, 'spec': spec}
            continue

        # --- code-less fallback: "name unit quantity" lines ---
        if len(stripped) > 4:
            uni_match = UNIT_RE.search(stripped)
            if uni_match:
                ui = stripped.find(uni_match.group(0))
                before_unit = stripped[:ui].strip()
                after_unit = stripped[ui + len(uni_match.group(0)):].strip()
                has_qty = bool(re.match(r'^[\d,.]+', after_unit))
                if (
                    2 <= len(before_unit) <= 50
                    and has_qty
                    and re.search(r'[\u4e00-\u9fff]', before_unit)
                ):
                    if cur_item and cur_cat:
                        cur_cat['items'].append(cur_item)
                    if not cur_cat:
                        cur_cat = {'name': '未分类', 'items': []}
                        categories.append(cur_cat)
                    unit_fb = uni_match.group(1)
                    qm = re.match(r'^([\d,.]+)', after_unit)
                    quantity_fb = qm.group(1) if qm else ''
                    ns_name, ns_spec = split_name_and_spec(before_unit)
                    name_fb = re.sub(r'\s+', '', ns_name).strip()
                    spec_fb = ns_spec or ''
                    cur_item = {'code': '', 'name': name_fb, 'unit': unit_fb, 'quantity': quantity_fb, 'spec': spec_fb}
                    continue

        # --- short code-less lines: spec continuation or category title ---
        if 2 < len(stripped) < 60 and not CODE_RE.search(stripped):
            if UNIT_RE.search(stripped) and re.search(r'\d+\.?\d*\s*$', stripped):
                # Looks like "unit quantity" remnants: append to current spec.
                if cur_item:
                    cur_item['spec'] = (cur_item.get('spec') or '') + (
                        ';' + stripped if cur_item.get('spec') else stripped
                    )
                continue
            if is_cat_title(stripped) and not UNIT_RE.search(stripped) and not is_fee_cat_title(stripped):
                # New engineering-category heading.
                if cur_item and cur_cat:
                    cur_cat['items'].append(cur_item)
                cur_item = None
                clean_title = re.sub(
                    r'\s+(座|个|项|处|m|km|段|条)\s+\d+[\d.]*\s*$', '', stripped
                ).strip()
                cur_cat = {'name': clean_title, 'items': []}
                categories.append(cur_cat)
                continue

            # Chinese-numeral heading fallback (e.g. "三 ..."); fee sections
            # are skipped outright.
            if re.match(r'^[一二三四五六七八九十]+\s', stripped) or re.match(
                r'^([一二三四五六七八九十\d]+)', stripped
            ):
                clean_title = re.sub(r'\s+(座|个|项|处)\s+\d+[\d.]*\s*$', '', stripped).strip()
                if is_fee_cat_title(clean_title):
                    continue
                if cur_item and cur_cat:
                    cur_cat['items'].append(cur_item)
                cur_item = None
                cur_cat = {'name': clean_title, 'items': []}
                categories.append(cur_cat)
                continue

        # Anything else rides along as spec text of the current item.
        if cur_item and len(stripped) > 1:
            cur_item['spec'] = (cur_item.get('spec') or '') + (
                ';' + stripped if cur_item.get('spec') else stripped
            )

    if cur_item and cur_cat:
        cur_cat['items'].append(cur_item)

    # --- drop fee/tax items ---
    fee_filtered = 0
    for cat in categories:
        if cat.get('items'):
            before = len(cat['items'])
            cat['items'] = [it for it in cat['items'] if not is_fee_item(it.get('name', ''))]
            fee_filtered += before - len(cat['items'])
    if fee_filtered:
        logger.debug('费用项过滤: 移除 %s 项', fee_filtered)

    # --- merge duplicate item names within each category ---
    total_before_merge = 0
    total_after_merge = 0
    for cat in categories:
        items = cat.get('items') or []
        if not items:
            continue
        total_before_merge += len(items)
        name_map: dict[str, dict[str, Any]] = {}
        for item in items:
            key = re.sub(r'\s+', '', (item.get('name') or '')).strip()
            if not key:
                continue
            if key not in name_map:
                name_map[key] = {
                    'code': item.get('code') or '',
                    'name': item['name'],
                    'unit': item.get('unit') or '',
                    'quantity': item.get('quantity') or '',
                    'spec': item.get('spec') or '',
                    '_quantities': [item['quantity']] if item.get('quantity') else [],
                    '_specs': [item['spec']] if item.get('spec') else [],
                }
            else:
                # Fill gaps from later duplicates; collect quantities/specs.
                m = name_map[key]
                if not m['code'] and item.get('code'):
                    m['code'] = item['code']
                if not m['unit'] and item.get('unit'):
                    m['unit'] = item['unit']
                if item.get('quantity'):
                    m['_quantities'].append(item['quantity'])
                if item.get('spec') and item['spec'] not in m['_specs']:
                    m['_specs'].append(item['spec'])

        merged_items: list[dict[str, str]] = []
        for m in name_map.values():
            qlist = m['_quantities']
            if len(qlist) > 1:
                # Sum quantities when all are numeric, else join as text.
                nums = []
                ok = True
                for q in qlist:
                    try:
                        nums.append(float(q.replace(',', '')))
                    except ValueError:
                        ok = False
                        break
                if ok:
                    s = sum(nums)
                    m['quantity'] = str(int(s)) if s % 1 == 0 else f'{s:.2f}'
                else:
                    m['quantity'] = '; '.join(qlist)
            elif len(qlist) == 1:
                m['quantity'] = qlist[0]

            if m['_specs']:
                # Per-spec cap 120 chars, overall cap 300.
                trimmed = [s[:120] + '...' if len(s) > 120 else s for s in m['_specs']]
                m['spec'] = '; '.join(trimmed)
                if len(m['spec']) > 300:
                    m['spec'] = m['spec'][:300] + '...'
            for k in ('_quantities', '_specs'):
                m.pop(k, None)
            merged_items.append(
                {k: m[k] for k in ('code', 'name', 'unit', 'quantity', 'spec')}
            )
        cat['items'] = merged_items
        total_after_merge += len(merged_items)

    merged_count = total_before_merge - total_after_merge
    if merged_count > 0:
        logger.debug('按名称合并: %s → %s 项', total_before_merge, total_after_merge)

    valid = [c for c in categories if c.get('items')]
    total_items = sum(len(c['items']) for c in valid)
    logger.debug(
        '最终结果: %s 分部, %s 清单项', len(valid), total_items
    )

    return {
        'project_summary': {
            'remark': f'本地解析:{len(valid)} 个分部,{total_items} 个清单项(合并前 {total_before_merge} 项)',
        },
        'categories': valid,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def filter_bill_pages(page_texts: list[str]) -> tuple[list[str], dict[str, Any]]:
    """
    Select the bill-of-quantities pages from per-page texts.

    Returns ``(bill_page_texts, meta)`` where *meta* records page counts,
    whether the document looks like a scan, and the selected page indices.
    """
    page_count = len(page_texts)
    meta: dict[str, Any] = {'total_pages': page_count, 'scanned': False, 'no_bill_pages': False}

    if sum(len(t or '') for t in page_texts) < 50:
        # Practically no extractable text: likely a scanned PDF.
        meta['scanned'] = True
        meta['reason'] = 'noText'
        return [], meta

    # Flag pages with >=2 header keywords, a section keyword, or a 9-digit code.
    flags = [False] * page_count
    for idx, page in enumerate(page_texts):
        page = page or ''
        if not page.strip():
            continue
        header_hits = sum(1 for kw in BILL_KW if kw in page)
        section_hit = any(kw in page for kw in SEC_KW)
        code_hit = re.search(r'\d{9}', page) is not None
        flags[idx] = header_hits >= 2 or section_hit or code_hit

    # Fill the gap between the first and last flagged page, skipping
    # near-empty pages and fee/tax summary pages without codes.
    marked = [i for i, flagged in enumerate(flags) if flagged]
    if marked and marked[-1] > marked[0]:
        for idx in range(marked[0], marked[-1] + 1):
            if flags[idx]:
                continue
            page = page_texts[idx] or ''
            if len(page.strip()) <= 30:
                continue
            fee_hits = sum(1 for kw in FEE_PAGE_KW if kw in page)
            if fee_hits >= 2 and not re.search(r'\d{9}', page):
                continue
            flags[idx] = True

    selected = [i for i in range(page_count) if flags[i]]
    bill_texts = [page_texts[i] for i in selected]
    if not bill_texts:
        meta['no_bill_pages'] = True

    meta['bill_page_indices'] = selected
    meta['bill_pages'] = len(bill_texts)
    return bill_texts, meta
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_boq_pages(page_texts: list[str]) -> dict[str, Any]:
    """
    Chain page filtering and ``parse_bill_text``.

    The result carries a ``_meta`` block for persistence and the frontend.
    """
    total_pages = len(page_texts)

    if sum(len(t or '') for t in page_texts) < 50:
        # No extractable text at all — most likely a scanned document.
        return {
            'scanned': True,
            'reason': 'noText',
            'totalPages': total_pages,
            'project_summary': {'remark': '文本过少,疑似扫描件或未提取到文字'},
            'categories': [],
            '_meta': {
                'method': 'python-local',
                'total_pages': total_pages,
                'bill_pages': 0,
            },
        }

    bill_texts, fmeta = filter_bill_pages(page_texts)
    if not bill_texts:
        inner_meta: dict[str, Any] = {
            'method': 'python-local',
            'total_pages': total_pages,
            'bill_pages': 0,
        }
        if 'no_bill_pages' in fmeta:
            inner_meta['no_bill_pages'] = fmeta['no_bill_pages']
        return {
            'scanned': False,
            'no_bill_pages': True,
            'totalPages': total_pages,
            'project_summary': {'remark': '未识别到清单相关页面'},
            'categories': [],
            '_meta': inner_meta,
        }

    parsed = parse_bill_text('\n'.join(bill_texts))
    return {
        'scanned': False,
        **parsed,
        '_meta': {
            'method': 'python-local',
            'total_pages': total_pages,
            'bill_pages': len(bill_texts),
            'bill_page_indices': fmeta.get('bill_page_indices', []),
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
def categories_to_prompt_appendix(
    analysis: dict[str, Any],
    max_chars: int = 3000,
    max_per_cat: int = 40,
) -> str:
    """Compress the local parse result into a short appendix for the AI
    summary prompt; per-category item count and total length are capped."""
    out: list[str] = []
    for cat in analysis.get('categories') or []:
        items = cat.get('items') or []
        out.append(f"【{cat.get('name', '')}】")
        for item in items[:max_per_cat]:
            code = item.get('code') or '-'
            item_name = item.get('name') or ''
            item_unit = item.get('unit') or ''
            item_qty = item.get('quantity') or ''
            out.append(f'  {code} {item_name} {item_unit} {item_qty}'.strip())
        if len(items) > max_per_cat:
            out.append(f'  …共 {len(items)} 条,此处省略其余')
    text = '\n'.join(out).strip()
    if len(text) > max_chars:
        text = text[:max_chars] + '\n…(附录已截断)'
    return text
|
||||||
138
utils/boq_parser.py
Normal file
138
utils/boq_parser.py
Normal file
@ -0,0 +1,138 @@
|
|||||||
|
"""
|
||||||
|
工程量清单解析模块:从 Excel / CSV / PDF / Word 文件中提取结构化文本。
|
||||||
|
"""
|
||||||
|
import csv
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Maximum number of returned characters (text is truncated before being
# handed to the AI for summarisation).
MAX_BOQ_CHARS = 12000
|
||||||
|
|
||||||
|
|
||||||
|
def extract_boq_text(file_path: str) -> str:
    """
    Extract raw structured text from a bill-of-quantities file.

    Supported: .xlsx / .xls / .csv / .pdf / .docx / .doc.  Output is capped
    at MAX_BOQ_CHARS for downstream AI summarisation.

    Raises:
        ValueError: for an unsupported file extension.
    """
    ext = Path(file_path).suffix.lower()
    if ext == '.csv':
        text = _extract_csv(file_path)
    elif ext in ('.xlsx', '.xls'):
        text = _extract_excel(file_path)
    elif ext == '.pdf':
        from utils.file_utils import _extract_pdf
        text = _extract_pdf(file_path)
    elif ext == '.docx':
        from utils.file_utils import _extract_docx
        text = _extract_docx(file_path)
    elif ext == '.doc':
        from utils.file_utils import _extract_doc
        text = _extract_doc(file_path)
    else:
        raise ValueError(f'不支持的文件格式 {ext},请使用 xlsx/xls/csv/pdf/docx/doc')

    return text[:MAX_BOQ_CHARS]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_boq_pages(file_path: str) -> list[str]:
    """
    Return the bill text split by "page": one element per PDF page;
    Excel/CSV/Word yield a single-element list with the full text.
    """
    if Path(file_path).suffix.lower() == '.pdf':
        from utils.file_utils import extract_pdf_pages
        return extract_pdf_pages(file_path)
    full_text = extract_boq_text(file_path)
    return [full_text] if full_text else ['']
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Excel ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _extract_excel(file_path: str) -> str:
    """Render every sheet of an Excel workbook as pipe-delimited text.

    Falls back to the xlrd-based reader when openpyxl is unavailable.

    Raises:
        RuntimeError: when the workbook cannot be parsed.
    """
    try:
        import openpyxl
        workbook = openpyxl.load_workbook(file_path, data_only=True, read_only=True)
        blocks = []
        for sheet_name in workbook.sheetnames:
            rendered = _sheet_to_text(workbook[sheet_name], sheet_name)
            if rendered.strip():
                blocks.append(rendered)
        workbook.close()
        return '\n\n'.join(blocks)
    except ImportError:
        # openpyxl is missing: try the legacy xlrd path.
        return _extract_xls_fallback(file_path)
    except Exception as e:
        raise RuntimeError(f'Excel 解析失败:{e}') from e
|
||||||
|
|
||||||
|
|
||||||
|
def _sheet_to_text(ws, sheet_name: str) -> str:
|
||||||
|
"""将一个 Sheet 转为管道分隔文本,自动过滤全空行和全空列。"""
|
||||||
|
raw_rows = []
|
||||||
|
for row in ws.iter_rows(values_only=True):
|
||||||
|
cells = ['' if v is None else str(v).strip() for v in row]
|
||||||
|
if any(cells):
|
||||||
|
raw_rows.append(cells)
|
||||||
|
|
||||||
|
if not raw_rows:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
# 对齐列数
|
||||||
|
max_cols = max(len(r) for r in raw_rows)
|
||||||
|
raw_rows = [r + [''] * (max_cols - len(r)) for r in raw_rows]
|
||||||
|
|
||||||
|
# 找出有内容的列索引
|
||||||
|
active_cols = [j for j in range(max_cols)
|
||||||
|
if any(raw_rows[i][j] for i in range(len(raw_rows)))]
|
||||||
|
if not active_cols:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
lines = [f'【{sheet_name}】']
|
||||||
|
for row in raw_rows:
|
||||||
|
line = ' | '.join(row[j] for j in active_cols)
|
||||||
|
if line.replace('|', '').strip():
|
||||||
|
lines.append(line)
|
||||||
|
return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_xls_fallback(file_path: str) -> str:
    """Legacy ``.xls`` reader via xlrd (requires xlrd<2).

    Raises:
        RuntimeError: when xlrd is missing or parsing fails.
    """
    try:
        import xlrd  # type: ignore
        blocks = []
        for sheet in xlrd.open_workbook(file_path).sheets():
            rendered = [f'【{sheet.name}】']
            for rx in range(sheet.nrows):
                row_cells = [str(sheet.cell_value(rx, cx)).strip()
                             for cx in range(sheet.ncols)]
                joined = ' | '.join(c for c in row_cells if c)
                if joined:
                    rendered.append(joined)
            blocks.append('\n'.join(rendered))
        return '\n\n'.join(blocks)
    except Exception as e:
        raise RuntimeError(f'.xls 解析失败,请另存为 .xlsx 后重试:{e}') from e
|
||||||
|
|
||||||
|
|
||||||
|
# ─── CSV ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _extract_csv(file_path: str) -> str:
|
||||||
|
encodings = ['utf-8-sig', 'gbk', 'utf-8', 'gb18030', 'latin-1']
|
||||||
|
for enc in encodings:
|
||||||
|
try:
|
||||||
|
lines = []
|
||||||
|
with open(file_path, 'r', encoding=enc, newline='') as f:
|
||||||
|
for row in csv.reader(f):
|
||||||
|
line = ' | '.join(c.strip() for c in row if c.strip())
|
||||||
|
if line:
|
||||||
|
lines.append(line)
|
||||||
|
return '\n'.join(lines)
|
||||||
|
except (UnicodeDecodeError, UnicodeError):
|
||||||
|
continue
|
||||||
|
except Exception as e:
|
||||||
|
raise RuntimeError(f'CSV 解析失败:{e}') from e
|
||||||
|
raise RuntimeError('CSV 文件编码不支持,请另存为 UTF-8 格式后重试')
|
||||||
283
utils/diagram_intent.py
Normal file
283
utils/diagram_intent.py
Normal file
@ -0,0 +1,283 @@
|
|||||||
|
"""
|
||||||
|
章节级图/表意图:字符特征 + 大纲上下文窗口计分,栈式优先级,驱动提示词附加段。
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
||||||
|
|
||||||
|
import config
|
||||||
|
from utils import prompts as P
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Built-in defaults for the diagram-intent rules; individual keys are
# overridden by data/diagram_intent_rules.json when present.
DEFAULT_DIAGRAM_RULES: Dict[str, Any] = {
    'schema_version': 1,
    # Minimum score for a figure / table intent to fire.
    'threshold_figure': 1.0,
    'threshold_table': 1.0,
    # Relative weight of keyword hits in the title vs. the outline context.
    'title_weight': 1.0,
    'context_weight': 0.6,
    # How many outline lines around the section feed the context score.
    'outline_context_lines': {'before': 4, 'after': 6},
    # When both intents fire, stack order is by descending score.
    'stack_order_when_both': 'score_desc',
    'figure_keywords': [],
    'table_keywords': [],
}
|
||||||
|
|
||||||
|
|
||||||
|
def diagram_rules_path() -> str:
    """Absolute path of the diagram-intent rules JSON under the data dir."""
    filename = 'diagram_intent_rules.json'
    return os.path.join(config.DATA_DIR, filename)
|
||||||
|
|
||||||
|
|
||||||
|
def load_diagram_rules(path: Optional[str] = None) -> Dict[str, Any]:
    """Load the rules JSON; fall back to built-in defaults when the file is
    missing or cannot be parsed.

    Keys starting with '_' are treated as in-file comments and skipped;
    'outline_context_lines' is merged key-wise instead of replaced wholesale.
    """
    rules_file = path or diagram_rules_path()
    merged = dict(DEFAULT_DIAGRAM_RULES)
    if not os.path.isfile(rules_file):
        return merged
    try:
        with open(rules_file, encoding='utf-8') as fh:
            loaded = json.load(fh)
        if isinstance(loaded, dict):
            for key, value in loaded.items():
                if key.startswith('_'):
                    continue  # '_'-prefixed keys act as comments
                if key == 'outline_context_lines' and isinstance(value, dict):
                    base = merged.get('outline_context_lines', {})
                    merged['outline_context_lines'] = {**base, **value}
                else:
                    merged[key] = value
    except Exception as e:
        logger.warning('加载 diagram_intent_rules.json 失败,使用内置默认: %s', e)
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_keyword_entries(raw: Any) -> List[Tuple[str, float]]:
|
||||||
|
out: List[Tuple[str, float]] = []
|
||||||
|
if not isinstance(raw, list):
|
||||||
|
return out
|
||||||
|
for item in raw:
|
||||||
|
if isinstance(item, str) and item.strip():
|
||||||
|
out.append((item.strip(), 1.0))
|
||||||
|
elif isinstance(item, dict):
|
||||||
|
t = (item.get('text') or item.get('pattern') or '').strip()
|
||||||
|
if not t:
|
||||||
|
continue
|
||||||
|
w = float(item.get('weight', 1.0))
|
||||||
|
out.append((t, w))
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _score_text(text: str, entries: Sequence[Tuple[str, float]]) -> float:
|
||||||
|
if not text or not entries:
|
||||||
|
return 0.0
|
||||||
|
s = 0.0
|
||||||
|
for kw, w in entries:
|
||||||
|
if kw in text:
|
||||||
|
s += w
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
# Alias for readability only; the runtime value is a plain str.
DiagramKind = str  # 'figure' | 'table'


@dataclass(frozen=True)
class DiagramIntent:
    """One detected drawing intent for a section (immutable)."""

    kind: str      # 'figure' or 'table'
    score: float   # combined title + context keyword score
    sources: str   # provenance tag, e.g. 'title+context' or 'fallback'


# Stack top = index 0; earlier entries take priority when rendering.
DiagramStack = List[DiagramIntent]
|
||||||
|
|
||||||
|
|
||||||
|
def score_figure_table(
    title: str,
    context_snippet: str,
    rules: Dict[str, Any],
) -> Tuple[float, float]:
    """Score figure and table intent separately, blending title and context
    keyword hits with their configured weights.

    Returns (figure_score, table_score).
    """
    title_text = title or ''
    context_text = context_snippet or ''
    title_weight = float(rules.get('title_weight', 1.0))
    context_weight = float(rules.get('context_weight', 0.6))

    def _combined(entries) -> float:
        # Title matches count for more than matches in the outline window.
        return (title_weight * _score_text(title_text, entries)
                + context_weight * _score_text(context_text, entries))

    figure_score = _combined(_normalize_keyword_entries(rules.get('figure_keywords')))
    table_score = _combined(_normalize_keyword_entries(rules.get('table_keywords')))
    return figure_score, table_score
|
||||||
|
|
||||||
|
|
||||||
|
def extract_outline_window(
    outline_text: str,
    section_title: str,
    before: int,
    after: int,
    fallback_chars: int = 1200,
) -> str:
    """Locate the section-title line in the outline and return the window of
    lines around it; fall back to the outline's leading *fallback_chars*
    characters when the title cannot be found or either input is empty.
    """
    if not outline_text or not section_title:
        return (outline_text or '')[:fallback_chars]
    wanted = section_title.strip()
    if not wanted:
        return outline_text[:fallback_chars]

    def _strip_serial(s: str) -> str:
        # Drop a leading "1、" / "三." style numbering prefix.
        return re.sub(r'^\s*[\d一二三四五六七八九十]+[、..\s]+', '', s).strip()

    core = _strip_serial(wanted)
    rows = outline_text.splitlines()
    hit = -1
    for pos, raw in enumerate(rows):
        candidate = raw.strip()
        # Match the full title, or (after de-numbering the query) the core
        # against either the de-numbered or the raw outline line.
        if (wanted in candidate
                or (core and (core in _strip_serial(candidate) or core in candidate))):
            hit = pos
            break
    if hit < 0:
        return outline_text[:fallback_chars]
    start = max(0, hit - max(0, before))
    stop = min(len(rows), hit + max(0, after) + 1)
    return '\n'.join(rows[start:stop])
|
||||||
|
|
||||||
|
|
||||||
|
def build_stack(
    fig_score: float,
    tbl_score: float,
    rules: Dict[str, Any],
    enable_figure: bool,
    enable_table: bool,
) -> DiagramStack:
    """Turn raw scores into an ordered intent stack (top = index 0).

    An intent is emitted only when its feature flag is on and its score
    reaches the configured threshold.  When both survive, the configured
    'stack_order_when_both' mode decides which one lands on top.
    """
    fig_threshold = float(rules.get('threshold_figure', 1.0))
    tbl_threshold = float(rules.get('threshold_table', 1.0))
    mode = (rules.get('stack_order_when_both') or 'score_desc').strip()

    candidates: List[DiagramIntent] = []
    if enable_figure and fig_score >= fig_threshold:
        candidates.append(DiagramIntent('figure', fig_score, 'title+context'))
    if enable_table and tbl_score >= tbl_threshold:
        candidates.append(DiagramIntent('table', tbl_score, 'title+context'))
    if len(candidates) <= 1:
        return candidates

    first, second = candidates
    if mode == 'figure_first':
        return [first, second] if first.kind == 'figure' else [second, first]
    if mode == 'table_first':
        return [first, second] if first.kind == 'table' else [second, first]
    # 'score_desc': higher score wins the stack top.
    return sorted(candidates, key=lambda intent: -intent.score)
|
||||||
|
|
||||||
|
|
||||||
|
def stack_compact_labels(stack: DiagramStack) -> List[str]:
    """Compact per-intent labels; wording matches the labels built in
    stack_to_addon, for prompts that emit attachment-only blocks."""
    return [
        '图示([FIGURE] 块)' if intent.kind == 'figure' else '表格([TABLE] 块)'
        for intent in stack
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def make_fallback_stack(kind: str) -> DiagramStack:
    """When the stack is empty but something must be generated, build a
    single-intent placeholder ('table' unless 'figure' is explicitly asked)."""
    normalized = (kind or '').strip().lower()
    if normalized not in ('figure', 'table'):
        normalized = 'table'
    return [DiagramIntent(normalized, 1.0, 'fallback')]
|
||||||
|
|
||||||
|
|
||||||
|
def stack_to_addon(stack: DiagramStack) -> str:
    """Render the prompt add-on for a stack: a priority preamble followed by
    the full figure/table spec text for each intent, in stack order.

    Returns '' for an empty stack.  Label wording is delegated to
    stack_compact_labels so the two stay consistent by construction
    (previously the label list was duplicated inline here).
    """
    if not stack:
        return ''
    parts: List[str] = [P.diagram_priority_preamble(stack_compact_labels(stack))]
    for intent in stack:
        if intent.kind == 'figure':
            parts.append(P.get_figure_addon())
        else:
            parts.append(P.get_table_addon())
    return ''.join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class DiagramIntentAgent:
    """Rule-driven agent: computes the intent stack for a single section and
    renders the extra prompt text it implies."""

    def __init__(self, rules: Optional[Dict[str, Any]] = None) -> None:
        # Fall back to the on-disk rules (or built-in defaults) when no
        # explicit rule dict is supplied.
        self.rules = rules or load_diagram_rules()

    @classmethod
    def load_default(cls) -> 'DiagramIntentAgent':
        """Alternate constructor reading the default rules file."""
        return cls(load_diagram_rules())

    def plan(
        self,
        section_title: str,
        outline_text: str,
        enable_figure: bool,
        enable_table: bool,
    ) -> DiagramStack:
        """Score the section against its outline context window and build
        the resulting intent stack."""
        rules = self.rules
        window_cfg = rules.get('outline_context_lines') or {}
        context = extract_outline_window(
            outline_text,
            section_title,
            int(window_cfg.get('before', 4)),
            int(window_cfg.get('after', 6)),
        )
        figure_score, table_score = score_figure_table(section_title, context, rules)
        return build_stack(figure_score, table_score, rules, enable_figure, enable_table)

    def render_for_section(
        self,
        section_title: str,
        outline_text: str,
        enable_figure: bool,
        enable_table: bool,
    ) -> str:
        """Render the prompt add-on for one section; '' when both diagram
        kinds are disabled."""
        if not enable_figure and not enable_table:
            return ''
        return stack_to_addon(
            self.plan(section_title, outline_text, enable_figure, enable_table)
        )
|
||||||
|
|
||||||
|
|
||||||
|
# Lazily-created module-level default instance, shared across generator calls.
_default_agent: Optional[DiagramIntentAgent] = None


def get_diagram_agent() -> DiagramIntentAgent:
    """Return the shared agent, creating it on first use."""
    global _default_agent
    if _default_agent is None:
        _default_agent = DiagramIntentAgent.load_default()
    return _default_agent


def invalidate_diagram_agent_cache() -> None:
    """Drop the cached agent so the next call reloads the rules file."""
    global _default_agent
    _default_agent = None
|
||||||
213
utils/file_utils.py
Normal file
213
utils/file_utils.py
Normal file
@ -0,0 +1,213 @@
|
|||||||
|
"""
|
||||||
|
文件处理工具:从 PDF / Word 文件中提取纯文本
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text(file_path: str) -> str:
    """Extract plain text from a document, dispatching on file extension.

    Supports .pdf / .docx / .doc; raises ValueError for anything else.
    """
    suffix = Path(file_path).suffix.lower()
    if suffix == '.pdf':
        return _extract_pdf(file_path)
    if suffix == '.docx':
        return _extract_docx(file_path)
    if suffix == '.doc':
        return _extract_doc(file_path)
    raise ValueError(f'不支持的文件类型: {suffix}')
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_pdf(file_path: str) -> str:
    """Extract PDF text, preferring pypdf and falling back to pdfminer.

    pypdf output is used only when it yields non-blank text; otherwise
    (or on any pypdf failure) pdfminer gets a try.  Raises RuntimeError
    when both backends fail.
    """
    try:
        from pypdf import PdfReader
        reader = PdfReader(file_path)
        parts = []
        for page in reader.pages:
            text = page.extract_text()
            if text:
                parts.append(text)
        result = '\n'.join(parts)
        if result.strip():
            return result
        # Fall through: a scanned/image-only PDF parses but yields no text.
    except Exception as e:
        logger.warning(f'pypdf 提取失败: {e},尝试 pdfminer')

    try:
        from pdfminer.high_level import extract_text as pm_extract
        result = pm_extract(file_path)
        return result or ''
    except Exception as e:
        logger.error(f'pdfminer 提取失败: {e}')
        # Chain the cause so the backend error stays visible to callers.
        raise RuntimeError(f'PDF 文本提取失败: {e}') from e
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pdf_pages(file_path: str) -> list[str]:
    """Extract PDF text page by page (used to filter bill-of-quantities pages).

    PDFs with more than 5 pages are extracted in parallel with a small
    thread pool (executor.map preserves page order); smaller files are
    faster sequentially.  When no page yields any text, falls back to a
    single-blob pdfminer extraction.  Raises RuntimeError when both
    backends fail.
    """
    try:
        from pypdf import PdfReader
        reader = PdfReader(file_path)
        if len(reader.pages) <= 5:
            # Sequential is cheaper than thread startup for small files.
            pages = [(page.extract_text() or '').strip() for page in reader.pages]
        else:
            def _extract_page(page):
                return (page.extract_text() or '').strip()

            # executor.map keeps input order, so page indices stay aligned.
            with ThreadPoolExecutor(max_workers=4) as executor:
                pages = list(executor.map(_extract_page, reader.pages))
        if any(pages):
            return pages
        # Every page blank — likely a scanned PDF; give pdfminer a chance.
    except Exception as e:
        logger.warning(f'pypdf 按页提取失败: {e},尝试 pdfminer')

    try:
        from pdfminer.high_level import extract_text as pm_extract
        blob = (pm_extract(file_path) or '').strip()
        return [blob] if blob else ['']
    except Exception as e:
        logger.error(f'pdfminer 提取失败: {e}')
        # Chain the cause so the backend error stays visible to callers.
        raise RuntimeError(f'PDF 文本提取失败: {e}') from e
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_docx(file_path: str) -> str:
    """Extract text from a .docx document via python-docx.

    Collects non-blank paragraphs, then flattens each table row into a
    single space-joined line.  Raises RuntimeError on any failure.
    """
    try:
        from docx import Document
        doc = Document(file_path)
        parts = []
        for para in doc.paragraphs:
            if para.text.strip():
                parts.append(para.text)
        for table in doc.tables:
            for row in table.rows:
                row_texts = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                if row_texts:
                    parts.append(' '.join(row_texts))
        return '\n'.join(parts)
    except Exception as e:
        logger.error(f'.docx 提取失败: {e}')
        # Chain the cause so the python-docx error stays visible.
        raise RuntimeError(f'Word 文本提取失败: {e}') from e
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_doc(file_path: str) -> str:
    """Extract text from a legacy .doc file, trying in priority order:

    1. win32com (Windows with Microsoft Word installed — most accurate)
    2. LibreOffice headless command-line conversion
    3. python-docx compatibility read (some XML-saved pseudo-.doc files work)

    Raises RuntimeError asking the user to re-save as .docx when all fail.
    """
    abs_path = str(Path(file_path).resolve())

    # ── Strategy 1: win32com (Windows + Word) ────────────────────────────
    try:
        import win32com.client
        import pythoncom
        pythoncom.CoInitialize()
        word = None
        try:
            word = win32com.client.Dispatch('Word.Application')
            word.Visible = False
            doc = word.Documents.Open(abs_path, ReadOnly=True)
            text = doc.Range().Text
            doc.Close(False)
            logger.info(f'.doc 通过 win32com 提取成功: {file_path}')
            return text or ''
        finally:
            if word:
                try:
                    word.Quit()
                except Exception:
                    pass  # best-effort shutdown; Word may already be gone
            pythoncom.CoUninitialize()
    except ImportError:
        logger.info('pywin32 未安装,跳过 win32com 方案')
    except Exception as e:
        logger.warning(f'win32com 提取 .doc 失败: {e}')

    # ── Strategy 2: LibreOffice command line ─────────────────────────────
    try:
        import subprocess
        import tempfile
        # TemporaryDirectory (instead of mkdtemp) so the conversion output
        # is removed automatically — mkdtemp leaked one directory per call.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for soffice_cmd in ('soffice', 'libreoffice'):
                try:
                    result = subprocess.run(
                        [soffice_cmd, '--headless', '--convert-to', 'txt:Text',
                         '--outdir', tmp_dir, abs_path],
                        capture_output=True, text=True, timeout=60,
                    )
                    if result.returncode == 0:
                        txt_file = os.path.join(tmp_dir, Path(file_path).stem + '.txt')
                        if os.path.exists(txt_file):
                            with open(txt_file, 'r', encoding='utf-8', errors='ignore') as f:
                                content = f.read()
                            logger.info(f'.doc 通过 LibreOffice 提取成功: {file_path}')
                            return content
                except FileNotFoundError:
                    continue  # this binary name not installed — try the next
                except subprocess.TimeoutExpired:
                    logger.warning('LibreOffice 转换超时')
                    break
    except Exception as e:
        logger.warning(f'LibreOffice 提取 .doc 失败: {e}')

    # ── Strategy 3: python-docx compatibility (XML-format pseudo .doc) ───
    try:
        result = _extract_docx(file_path)
        if result.strip():
            logger.info(f'.doc 通过 python-docx 兼容读取成功: {file_path}')
            return result
    except Exception as e:
        logger.warning(f'python-docx 兼容读取 .doc 失败: {e}')

    raise RuntimeError(
        '无法读取 .doc 格式文件。请在 Word 中打开该文件,'
        '选择「另存为」→「Word 文档 (.docx)」后重新上传。'
    )
|
||||||
|
|
||||||
|
|
||||||
|
def truncate_text(text: str, max_chars: int = 60000) -> str:
    """Clip over-long text to *max_chars* to stay under AI token limits,
    appending a truncation notice when clipping occurs."""
    if len(text) > max_chars:
        return text[:max_chars] + '\n\n...[文档内容已截断,仅展示前段]'
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def split_text_chunks(text: str, chunk_size: int = 2000, overlap: int = 200) -> list[str]:
    """Split text into fixed-size overlapping chunks (for the knowledge base).

    Consecutive chunks share *overlap* characters.  The advance step is
    clamped to at least 1: the original `start += chunk_size - overlap`
    looped forever whenever overlap >= chunk_size.
    """
    if not text:
        return []
    # Guard against a non-positive step (overlap >= chunk_size).
    step = max(1, chunk_size - overlap)
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += step
    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def allowed_file(filename: str) -> bool:
    """Return True when *filename* has an accepted extension (pdf/doc/docx)."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in {'pdf', 'doc', 'docx'}
|
||||||
|
|
||||||
|
|
||||||
|
def safe_filename(filename: str) -> str:
    """Sanitize a filename: keep word characters, CJK ideographs, dots and
    dashes; replace everything else with '_'."""
    import re
    return re.sub(r'[^\w\u4e00-\u9fff.\-]', '_', filename)
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user