提交版本

This commit is contained in:
wanghui 2026-04-23 14:36:26 +08:00
commit 003af9dc9c
110 changed files with 14829 additions and 0 deletions

1
.deps_installed Normal file
View File

@ -0,0 +1 @@

6
.gitignore vendored Normal file
View File

@ -0,0 +1,6 @@
.env
*.log
__pycache__/
*.pyc
.venv/
venv/

10
.idea/.gitignore generated vendored Normal file
View File

@ -0,0 +1,10 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/
# 已忽略包含查询文件的默认文件夹
/queries/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

8
.idea/modules.xml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/tech-bid-manage20260422.iml" filepath="$PROJECT_DIR$/.idea/tech-bid-manage20260422.iml" />
</modules>
</component>
</project>

16
.idea/tech-bid-manage20260422.iml generated Normal file
View File

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
<option name="TEMPLATE_FOLDERS">
<list>
<option value="$MODULE_DIR$/templates" />
</list>
</option>
</component>
</module>

6
.idea/vcs.xml generated Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

113
README.md Normal file
View File

@ -0,0 +1,113 @@
# 标伙伴 · AI 标书助手
基于大模型的智能标书生成工具(单机版),支持解析招标文件、自动生成技术标书、导出 Word 文档。
## 快速开始
### 方式一:双击启动(Windows)
直接双击 `start.bat`,首次运行会自动安装依赖。
### 方式二:命令行启动
```bash
# 1. 安装依赖
pip install -r requirements.txt
# 2. 启动应用
python app.py
```
浏览器访问 **http://localhost:5000**
---
## 配置 API Key
首次使用前,点击右上角 ⚙️ 设置图标,选择模型提供商并填入 API Key
| 提供商 | 推荐模型 | 申请地址 |
|--------|---------|---------|
| 通义千问 | qwen-max | https://dashscope.aliyun.com/ |
| DeepSeek | deepseek-chat (V3) | https://platform.deepseek.com/ |
| OpenAI | gpt-4o | https://platform.openai.com/ |
> **DeepSeek 说明**:deepseek-chat (V3) 性价比极高,推荐用于生产环境。
> 由于 DeepSeek 暂不提供 Embedding API使用知识库功能时会自动回退到本地 sentence-transformers 模型(首次使用需下载约 90MB
也可通过环境变量配置:
```bash
# 通义千问
set QWEN_API_KEY=sk-xxxxxxxx
set MODEL_PROVIDER=qwen
# DeepSeek
set DEEPSEEK_API_KEY=sk-xxxxxxxx
set MODEL_PROVIDER=deepseek
python app.py
```
---
## 使用流程
1. **新建项目** → 输入项目名称
2. **上传招标文件** → 支持 PDF / DOC / DOCX
3. **AI 解析** → 自动提取评分要求、资质条件、商务条款
4. **生成大纲** → 按评分权重生成四级章节目录
5. **生成内容** → 逐章节或一键全部生成
6. **合规检查** → 对照招标要求检验覆盖情况
7. **导出 Word** → 专业排版,直接使用
---
## 目录结构
```
autorfp/
├── app.py # Flask 主程序
├── config.py # 配置文件
├── requirements.txt # Python 依赖
├── start.bat # Windows 一键启动
├── prompts/ # AI 提示词模板
├── modules/ # 功能模块
│ ├── parser.py # 招标文件解析
│ ├── generator.py # 标书内容生成
│ ├── checker.py # 合规检查
│ ├── exporter.py # Word 导出
│ └── knowledge.py # 企业知识库
├── utils/ # 工具函数
│ ├── ai_client.py # AI API 封装
│ ├── file_utils.py # 文件处理
│ └── prompts.py # 提示词加载
├── templates/ # HTML 模板
├── static/ # 静态资源
└── data/ # 数据目录(自动创建)
├── projects.db # SQLite 数据库
├── uploads/ # 上传的招标文件
├── exports/ # 导出的标书
├── knowledge/ # 知识库文件
└── chroma/ # 向量数据库
```
---
## 企业知识库
在项目页面切换到「知识库」标签,上传历史标书文件。
系统会自动将文件分块存入向量数据库,生成内容时自动检索相关片段,让 AI 更好地体现企业优势。
---
## 常见问题
**Q: 解析速度很慢?**
A: 招标文件越长耗时越长,通常 30-120 秒。建议使用 qwen-max 或 gpt-4o。
**Q: 内容生成失败?**
A: 检查 API Key 是否正确,以及账户余额是否充足。
**Q: 导出的 Word 文件乱码?**
A: 请使用 Microsoft Word 2016 及以上版本打开。

1201
app.py Normal file

File diff suppressed because it is too large Load Diff

118
bid_partner.spec Normal file
View File

@ -0,0 +1,118 @@
# -*- mode: python ; coding: utf-8 -*-
"""
PyInstaller spec for BidPartner (AI bid-document assistant).
Build with: pyinstaller bid_partner.spec

The knowledge base now uses SQLite plus a pure-Python vector store, so the
build no longer depends on ChromaDB and the resulting bundle is smaller.
"""
import os
from PyInstaller.utils.hooks import collect_all, collect_data_files

block_cipher = None

# ── Collect complex packages ─────────────────────────────────────────────────
openai_datas, openai_bins, openai_hidden = collect_all('openai')
pydantic_datas, pydantic_bins, pydantic_hidden = collect_all('pydantic')
# tiktoken data (BPE vocab files)
tiktoken_datas = collect_data_files('tiktoken')

a = Analysis(
    ['launcher.py'],
    pathex=['.'],
    binaries=openai_bins + pydantic_bins,
    datas=[
        # ── App assets (read-only, go into _MEIPASS) ──
        ('templates', 'templates'),
        ('static', 'static'),
        # ── Package data ──
        *openai_datas,
        *pydantic_datas,
        *tiktoken_datas,
    ],
    hiddenimports=[
        # Flask / Werkzeug
        'flask', 'flask_cors', 'werkzeug', 'werkzeug.serving',
        'werkzeug.routing', 'werkzeug.middleware.proxy_fix',
        'jinja2', 'jinja2.ext',
        # SQLite (stdlib, always present)
        'sqlite3',
        # OpenAI
        *openai_hidden,
        # Pydantic
        *pydantic_hidden,
        # Document processing
        'PyPDF2', 'pypdf', 'pypdf.errors',
        'pdfminer', 'pdfminer.high_level', 'pdfminer.layout',
        'pdfminer.pdfpage', 'pdfminer.pdfinterp', 'pdfminer.converter',
        'docx', 'docx.oxml', 'docx.oxml.ns', 'docx.shared',
        'docx.enum', 'docx.enum.text', 'docx.enum.style',
        'python_docx',
        # tiktoken
        'tiktoken', 'tiktoken.core', 'tiktoken.model',
        'tiktoken_ext', 'tiktoken_ext.openai_public',
        # Network / encoding
        'requests', 'chardet', 'httpx', 'httpcore',
        'anyio', 'anyio.streams', 'anyio.streams.memory',
        'sniffio', 'certifi',
        # Stdlib extras
        'importlib.metadata', 'importlib.resources',
        'pkg_resources', 'json', 'math', 'threading',
        # Local project modules (explicitly include all)
        'config', 'app',
        'utils', 'utils.ai_client', 'utils.file_utils',
        'utils.prompts', 'utils.settings', 'utils.boq_parser', 'utils.bill_analysis',
        'modules', 'modules.parser', 'modules.generator',
        'modules.checker', 'modules.exporter', 'modules.knowledge',
    ],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[
        # Heavy packages not used in this app
        'matplotlib', 'pandas', 'scipy', 'numpy',
        'IPython', 'jupyter', 'notebook',
        'PIL', 'Pillow',
        'cv2', 'torch', 'tensorflow',
        'pytest', 'unittest',
        # ChromaDB and its dependencies (removed; replaced by built-in SQLite storage)
        'chromadb', 'hnswlib', 'posthog', 'pypika',
        'mmh3', 'overrides', 'monotonic',
        'sentence_transformers', 'onnxruntime',
    ],
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)

pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name='bid_partner',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=False,
    console=False,  # no black console window — GUI launcher takes over
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)

coll = COLLECT(
    exe,
    a.binaries,
    a.zipfiles,
    a.datas,
    strip=False,
    upx=False,
    upx_exclude=[],
    name='BidPartner',
)

672
bill-worker.js Normal file
View File

@ -0,0 +1,672 @@
/**
* bill-worker.js PDF 清单解析调度器Worker Thread
*
* 架构v3 SharedArrayBuffer 零拷贝
* Phase 1 并行文本提取
* PDF 数据写入 SharedArrayBuffer一次分配所有子线程共享读
* 启动 N page-worker每个负责固定 20
*
* Phase 2 清单页筛选 + 文本解析纯正则毫秒级
* 汇总全部页面文本 关键字筛选清单页 多行合并 逐行解析
*/
'use strict';
const { parentPort } = require('worker_threads');
const { Worker } = require('worker_threads');
const path = require('path');
const PAGES_PER_CHUNK = 20;
// Main entry: the parent thread sends {type:'parse', buffer:<PDF bytes>};
// we reply with exactly one {type:'done', ok, data|error} message.
parentPort.on('message', async (msg) => {
    if (msg.type !== 'parse') return;
    const t0 = Date.now();
    try {
        // Make a clean copy immediately so we own an independent ArrayBuffer.
        const raw = msg.buffer;
        const buf = Buffer.alloc(raw.byteLength);
        Buffer.from(raw).copy(buf);
        if (buf.length === 0) {
            parentPort.postMessage({ type: 'done', ok: false, error: '收到空 PDF 数据' });
            return;
        }
        // ── Get the total page count ──
        const pdfjsModule = await import('pdfjs-dist/build/pdf.mjs');
        const pdfjsLib = pdfjsModule.default || pdfjsModule;
        // Give pdfjs its own private copy (pdfjs may detach the buffer internally).
        const pdfData = new Uint8Array(buf.length);
        buf.copy(Buffer.from(pdfData.buffer));
        const pdf = await pdfjsLib.getDocument({ data: pdfData, isEvalSupported: false }).promise;
        const totalPages = pdf.numPages;
        // ── Copy the PDF bytes into a SharedArrayBuffer (allocated once,
        //    read by every child worker without further copies) ──
        const sab = new SharedArrayBuffer(buf.length);
        const sabView = new Uint8Array(sab);
        buf.copy(Buffer.from(sabView.buffer)); // copy from the private buf into shared memory
        const workerCount = Math.ceil(totalPages / PAGES_PER_CHUNK);
        console.log(`[BillWorker] PDF ${totalPages} 页, ${workerCount} 路并行 (SharedArrayBuffer ${(buf.length/1024/1024).toFixed(1)}MB)`);
        // Phase 1: parallel text extraction.
        const pageTexts = await parallelExtract(sab, buf.length, totalPages, workerCount);
        const t1 = Date.now();
        const extractedCount = pageTexts.filter(t => t.length > 0).length;
        console.log(`[BillWorker] Phase1 完成: ${t1 - t0}ms, ${extractedCount}/${totalPages} 页有文本`);
        // Scanned-document check: almost no extractable text at all.
        const totalChars = pageTexts.reduce((s, t) => s + t.length, 0);
        if (totalChars < 50) {
            parentPort.postMessage({ type: 'done', ok: true, data: { scanned: true, reason: 'noText', totalPages } });
            return;
        }
        // Phase 2: select bill-of-quantities pages (lenient match + gap filling).
        const BILL_KW = ['项目编码', '项目名称', '工程量', '计量单位', '综合单价', '清单编码'];
        const SEC_KW = ['分部分项', '分类分项', '措施项目', '其他项目', '工程量清单计价'];
        // Pass 1: flag pages that are definitely bill pages.
        const billFlags = new Array(pageTexts.length).fill(false);
        for (let i = 0; i < pageTexts.length; i++) {
            const t = pageTexts[i];
            if (!t.trim()) continue;
            const hHits = BILL_KW.filter(k => t.includes(k)).length;
            const sHit = SEC_KW.some(k => t.includes(k));
            const hasCode = /\d{9}/.test(t);
            // Lenient rule: a 9-digit item code alone is enough; header
            // keywords are no longer required at the same time.
            if (hHits >= 2 || sHit || hasCode) {
                billFlags[i] = true;
            }
        }
        // Pass 2: gap filling — non-empty pages between two bill pages are
        // also treated as bill pages (continuation pages carry no header),
        // but pure fee/tax pages are excluded (they hold no work items).
        const FEE_PAGE_KW = ['规费', '税金', '社会保险费', '住房公积金', '养老保险',
            '工伤保险', '失业保险', '医疗保险', '教育费附加', '城市维护建设税'];
        const firstBill = billFlags.indexOf(true);
        const lastBill = billFlags.lastIndexOf(true);
        if (firstBill >= 0 && lastBill > firstBill) {
            for (let i = firstBill; i <= lastBill; i++) {
                if (!billFlags[i] && pageTexts[i] && pageTexts[i].trim().length > 30) {
                    const t = pageTexts[i];
                    const feeHits = FEE_PAGE_KW.filter(kw => t.includes(kw)).length;
                    // 2+ fee keywords and no 9-digit work code → pure fee page, skip.
                    if (feeHits >= 2 && !/\d{9}/.test(t)) continue;
                    billFlags[i] = true;
                }
            }
        }
        const billTexts = [];
        for (let i = 0; i < pageTexts.length; i++) {
            if (billFlags[i]) billTexts.push(pageTexts[i]);
        }
        if (!billTexts.length) {
            parentPort.postMessage({ type: 'done', ok: true, data: { scanned: false, noBillPages: true, totalPages } });
            return;
        }
        // NOTE(review): the "(lastBill - firstBill >= 0 ? 0 : 0)" term below
        // always evaluates to 0, so the "原始识别" count printed here equals
        // the post-fill count — looks like a leftover from an earlier formula.
        console.log(`[BillWorker] ${totalPages} 页 → ${billTexts.length} 页清单 (原始识别 ${billFlags.filter(f=>f).length - (lastBill - firstBill >= 0 ? 0 : 0)} / 补全后 ${billTexts.length})`);
        // Phase 3: text parsing.
        const merged = billTexts.join('\n');
        const parsed = parseBillText(merged);
        const t2 = Date.now();
        console.log(`[BillWorker] Phase2+3: ${t2 - t1}ms, 总耗时: ${t2 - t0}ms`);
        parentPort.postMessage({
            type: 'done', ok: true,
            data: {
                scanned: false,
                ...parsed,
                _meta: {
                    method: 'local-parallel',
                    workers: workerCount,
                    billPages: billTexts.length,
                    totalPages,
                    extractMs: t1 - t0,
                    parseMs: t2 - t1,
                    totalMs: t2 - t0,
                }
            }
        });
    } catch (err) {
        console.error('[BillWorker] 错误:', err.message);
        parentPort.postMessage({ type: 'done', ok: false, error: err.message });
    }
});
// ================================================================
// Phase 1: multi-worker parallel extraction (SharedArrayBuffer, zero-copy)
// ================================================================
/**
 * Extract per-page text in parallel across page-worker threads.
 *
 * @param {SharedArrayBuffer} sab  PDF bytes shared read-only with all workers
 * @param {number} dataLength      byte length of the PDF data inside sab
 * @param {number} totalPages      page count of the document
 * @param {number} workerCount     number of page-worker threads to spawn
 * @returns {Promise<string[]>} one string per page ('' for pages that failed);
 *          the promise always resolves — worker failures only log a warning.
 */
function parallelExtract(sab, dataLength, totalPages, workerCount) {
    return new Promise((resolve) => {
        const workerPath = path.join(__dirname, 'page-worker.js');
        const allPageTexts = new Array(totalPages).fill('');
        const workerStatus = new Array(workerCount).fill('pending'); // pending, done, failed
        let resolved = false;
        // Resolve once every worker has reported either 'done' or 'failed'.
        const checkComplete = () => {
            if (resolved) return;
            const doneCount = workerStatus.filter(s => s === 'done' || s === 'failed').length;
            if (doneCount >= workerCount) {
                resolved = true;
                // Warn if any worker failed (some pages may have no content).
                const failedCount = workerStatus.filter(s => s === 'failed').length;
                if (failedCount > 0) {
                    console.warn(`[BillWorker] ${failedCount}/${workerCount} 个worker失败可能导致部分页面无内容`);
                }
                resolve(allPageTexts);
            }
        };
        for (let i = 0; i < workerCount; i++) {
            const startPage = i * PAGES_PER_CHUNK + 1;
            const endPage = Math.min((i + 1) * PAGES_PER_CHUNK, totalPages);
            // workerData carries the SharedArrayBuffer: shared across threads,
            // not detached/cleared by the structured-clone transfer.
            const w = new Worker(workerPath, {
                workerData: { sab, dataLength, startPage, endPage }
            });
            let workerDone = false;
            // Each worker is marked exactly once, whichever event fires first.
            const markDone = (status) => {
                if (workerDone) return;
                workerDone = true;
                workerStatus[i] = status;
                checkComplete();
            };
            w.on('message', (msg) => {
                if (msg.ok && msg.results) {
                    for (const r of msg.results) {
                        allPageTexts[r.page - 1] = r.text;
                    }
                    markDone('done');
                } else if (!msg.ok) {
                    console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 失败: ${msg.error}`);
                    markDone('failed');
                }
            });
            w.on('error', (err) => {
                console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 异常: ${err.message}`);
                markDone('failed');
            });
            w.on('exit', (code) => {
                // 'exit' fires after 'message'; this is the safety net for a
                // worker that crashed without ever posting a message.
                if (code !== 0 && !workerDone) {
                    console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 意外退出(code=${code})`);
                    markDone('failed');
                } else if (!workerDone) {
                    markDone('done');
                }
            });
        }
        // Degenerate case: zero workers (e.g. a 0-page PDF) — resolve at once.
        if (workerCount <= 0) {
            resolved = true;
            resolve(allPageTexts);
        }
    });
}
// ================================================================
// Phase 3: bill text parsing (pure regex + string handling, milliseconds)
// ================================================================
/**
 * Parse the merged text of all bill pages into categories and items.
 *
 * @param {string} text merged text of the selected bill pages
 * @returns {{project_summary: {remark: string},
 *            categories: Array<{name: string,
 *              items: Array<{code,name,unit,quantity,spec}>}>}}
 */
function parseBillText(text) {
    const rawLines = text.split(/\n/).map(l => {
        let line = l.replace(/\t/g, ' ').trim();
        // Normalize dashed codes: "010-101-001-001" → "010101001001".
        line = line.replace(/(\d{2,4})[-](\d{2,4})[-](\d{2,4})(?:[-](\d{2,4}))?/g,
            (m, a, b, c, d) => {
                const combined = a + b + c + (d || '');
                return (combined.length >= 9 && combined.length <= 12) ? combined : m;
            });
        return line;
    });
    // ── Step 1: merge physical lines into logical rows ──
    // pdfjs splits text by Y coordinate, so one table row is usually one
    // line, but name/feature cells sometimes wrap and must be re-merged.
    //
    // A new logical row starts when any of these hits:
    //   a) ordinal pattern at line start: 1.1 / 1.1.1.5 ...
    //   b) bill code at line start: 9–12 digits, or B + 5–6 digits
    //   c) Chinese section heading: 一 二 三 ... or (一)(二)...
    //   d) table header row content (skipped entirely)
    //   e) plain ordinal + space + code (e.g. "5 500101004001")
    const ITEM_START = /^\d+(\.\d+)+\s/; // ordinals like 1.1 or 1.1.1
    const CODE_INLINE = /(?:^|\s)(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s/; // code anywhere in line (lookbehind excludes GB/DB standard numbers)
    const CODE_START_RE = /^(\d{9,12}|B\d{5,6})\s/; // code at line start (a leading B cannot have a letter prefix)
    const SEQ_CODE_RE = /^\d{1,4}\s+(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s/; // "ordinal code" format
    const PAGE_MARK = /^--\s*\d+\s+of\s+\d+\s*--/;
    const HEADER_RE = /^序号\s+(项目编码|项目名称)/;
    const HEADER_KW = /^(项目编码|项目名称|清单编码|计量单位|综合单价|工程量|合\s*价|金额|序号)\s/;
    const CATEGORY_MARKERS = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
        '(一)', '(二)', '(三)', '(四)', '(五)'];
    const logicLines = [];
    let currentLine = '';
    // True when `raw` must begin a new logical row.
    function isNewLineTrigger(raw) {
        if (ITEM_START.test(raw)) return true;
        if (CODE_START_RE.test(raw)) return true;
        if (SEQ_CODE_RE.test(raw)) return true;
        if (CATEGORY_MARKERS.some(m => raw.startsWith(m + ' ') || raw.startsWith(m + '\u3000'))) return true;
        return false;
    }
    for (const raw of rawLines) {
        if (!raw || PAGE_MARK.test(raw)) continue;
        if (HEADER_RE.test(raw) || HEADER_KW.test(raw)) continue;
        if (/^(元)|^款章节号|^备注$|^第\d+页/.test(raw)) continue;
        if (isNewLineTrigger(raw)) {
            if (currentLine) logicLines.push(currentLine);
            currentLine = raw;
        } else if (CODE_INLINE.test(raw) && raw.length > 15) {
            // Contains a code and is long enough to look like a complete
            // table row → also start a new logical row.
            if (currentLine) logicLines.push(currentLine);
            currentLine = raw;
        } else {
            // Continuation line (wrapped feature text and other short text).
            // Safety valve: force a cut once the merged row grows too long,
            // so a whole page can never be swallowed into one row.
            if (currentLine && currentLine.length > 300) {
                logicLines.push(currentLine);
                currentLine = raw;
            } else {
                currentLine = currentLine ? currentLine + ' ' + raw : raw;
            }
        }
    }
    if (currentLine) logicLines.push(currentLine);
    console.log(`[BillWorker] 合并后 ${logicLines.length} 条逻辑行(原始 ${rawLines.length} 行)`);
    // Dump the first 5 logical rows for debugging.
    for (let i = 0; i < Math.min(5, logicLines.length); i++) {
        console.log(`[BillWorker] L${i}: ${logicLines[i].substring(0, 120)}`);
    }
    const categories = [];
    let curCat = null, curItem = null;
    // Code match: 9–12 digits or a B-code anywhere in the row (the lookbehind
    // again excludes GB/DB standard-document numbers).
    const CODE_RE = /(?<![A-Za-z])(\d{9,12}|(?<![A-Za-z])B\d{5,6})/;
    const UNIT_TOKENS = ['m³','m²','m3','m2','km','hm2','㎡','㎥','t','kg',
        '个','台','套','组','根','块','片','张','只','吨','项',
        '处','座','件','段','条','把','扇','口','圈','道','孔',
        '对','副','樘','方','延m','株','棵','m'];
    const UNIT_SET = new Set(UNIT_TOKENS);
    const unitEscaped = UNIT_TOKENS.map(u => u.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
    const UNIT_RE = new RegExp(`(?:^|\\s)(${unitEscaped.join('|')})(?=\\s|\\d|$)`);
    const SKIP_RE = /合\s*计|小\s*计|本页小计|总\s*计|价税合计/;
    for (const line of logicLines) {
        if (SKIP_RE.test(line)) continue;
        // Strip the leading ordinal ("1.1.1.1.5 " or a bare "5 " prefix).
        let stripped = line.replace(/^\d+(\.\d+)*\s+/, '').trim();
        if (!stripped) stripped = line.trim();
        if (!stripped) continue;
        const cm = stripped.match(CODE_RE);
        if (cm) {
            // Row with a bill code → a new item starts here.
            if (curItem && curCat) curCat.items.push(curItem);
            if (!curCat) { curCat = { name: '未分类', items: [] }; categories.push(curCat); }
            const code = cm[1];
            let rest = stripped.substring(cm.index + cm[0].length).trim();
            let name = '', unit = '', quantity = '', spec = '';
            const unitMatch = rest.match(UNIT_RE);
            if (unitMatch) {
                const ui = rest.indexOf(unitMatch[0]);
                let rawName = rest.substring(0, ui).trim();
                unit = unitMatch[1];
                const afterUnit = rest.substring(ui + unitMatch[0].length).trim();
                const qm = afterUnit.match(/^([\d,.]+)/);
                if (qm) {
                    quantity = qm[1];
                    // Take the text after the quantity, skipping purely
                    // numeric fields (unit price, total price, ...).
                    let tail = afterUnit.substring(qm.index + qm[0].length).trim();
                    if (tail) {
                        const tailTokens = tail.split(/\s+/);
                        let si = 0;
                        while (si < tailTokens.length && /^[\d,.%\-]+$/.test(tailTokens[si])) si++;
                        const specTail = tailTokens.slice(si).join(' ').trim();
                        if (specTail) spec = specTail;
                    }
                }
                // Split rawName into item name vs inline feature description.
                const ns = splitNameAndSpec(rawName);
                name = ns.name;
                if (ns.spec) spec = ns.spec + (spec ? ';' + spec : '');
            } else {
                // No unit matched by regex — fall back to token scanning.
                const tokens = rest.split(/\s+/).filter(t => t);
                let foundUnitIdx = -1;
                for (let ti = tokens.length - 1; ti >= 1; ti--) {
                    if (UNIT_SET.has(tokens[ti])) { foundUnitIdx = ti; break; }
                }
                if (foundUnitIdx >= 1) {
                    const rawNameStr = tokens.slice(0, foundUnitIdx).join(' ');
                    const ns = splitNameAndSpec(rawNameStr);
                    name = ns.name;
                    if (ns.spec) spec = ns.spec;
                    unit = tokens[foundUnitIdx];
                    const afterTokens = tokens.slice(foundUnitIdx + 1);
                    if (afterTokens.length && /^[\d,.]+$/.test(afterTokens[0])) {
                        quantity = afterTokens[0];
                        let si = 1;
                        while (si < afterTokens.length && /^[\d,.%\-]+$/.test(afterTokens[si])) si++;
                        const specTail = afterTokens.slice(si).join(' ').trim();
                        if (specTail) spec = spec ? spec + ';' + specTail : specTail;
                    }
                } else {
                    name = rest;
                }
            }
            name = name.replace(/\s+/g, '').trim();
            // If the name still ends with a unit token, peel it off.
            for (const u of UNIT_TOKENS) {
                if (name.endsWith(u) && name.length > u.length) {
                    unit = unit || u;
                    name = name.substring(0, name.length - u.length);
                    break;
                }
            }
            curItem = { code, name, unit, quantity, spec };
            continue;
        }
        // ── Fallback: no standard code but a "name unit quantity" shape →
        //    still treat as a bill item (common for measure items and
        //    supplementary items that carry no code).
        if (!cm && stripped.length > 4) {
            const uniMatch = stripped.match(UNIT_RE);
            if (uniMatch) {
                const ui = stripped.indexOf(uniMatch[0]);
                const beforeUnit = stripped.substring(0, ui).trim();
                const afterUnit = stripped.substring(ui + uniMatch[0].length).trim();
                const hasQty = /^[\d,.]+/.test(afterUnit);
                // Name 2–50 chars, contains CJK, has a quantity — and is not
                // a section heading.
                if (beforeUnit.length >= 2 && beforeUnit.length <= 50 && hasQty
                    && /[\u4e00-\u9fff]/.test(beforeUnit)) {
                    if (curItem && curCat) curCat.items.push(curItem);
                    if (!curCat) { curCat = { name: '未分类', items: [] }; categories.push(curCat); }
                    const unit = uniMatch[1];
                    const qm = afterUnit.match(/^([\d,.]+)/);
                    const quantity = qm ? qm[1] : '';
                    const ns = splitNameAndSpec(beforeUnit);
                    const name = ns.name.replace(/\s+/g, '').trim();
                    const spec = ns.spec || '';
                    curItem = { code: '', name, unit, quantity, spec };
                    continue;
                }
            }
        }
        // Section-heading test: no code, fairly short text, contains an
        // engineering keyword. Key guard: a row that carries a measurement
        // unit is an item, not a heading.
        if (stripped.length > 2 && stripped.length < 60 && !CODE_RE.test(stripped)) {
            if (UNIT_RE.test(stripped) && /\d+\.?\d*\s*$/.test(stripped)) {
                if (curItem) curItem.spec = curItem.spec ? curItem.spec + ';' + stripped : stripped;
                continue;
            }
            if (isCatTitle(stripped) && !UNIT_RE.test(stripped) && !isFeeCatTitle(stripped)) {
                if (curItem && curCat) { curCat.items.push(curItem); curItem = null; }
                const cleanTitle = stripped.replace(/\s+(座|个|项|处|m|km|段|条)\s+\d+[\d.]*\s*$/, '').trim();
                curCat = { name: cleanTitle, items: [] };
                categories.push(curCat);
                continue;
            }
        }
        if (/^[一二三四五六七八九十]+\s/.test(stripped) || /^[一二三四五六七八九十\d]+/.test(stripped)) {
            // Chinese-numbered headings — fee-type headings are skipped too.
            const cleanTitle = stripped.replace(/\s+(座|个|项|处)\s+\d+[\d.]*\s*$/, '').trim();
            if (isFeeCatTitle(cleanTitle)) {
                // Fee heading: skip, do not open a category (its lines fall
                // through and are treated as continuations).
                continue;
            }
            if (curItem && curCat) { curCat.items.push(curItem); curItem = null; }
            curCat = { name: cleanTitle, items: [] };
            categories.push(curCat);
            continue;
        }
        // Everything else: append to the current item's feature text.
        if (curItem && stripped.length > 1) {
            curItem.spec = curItem.spec ? curItem.spec + ';' + stripped : stripped;
        }
    }
    if (curItem && curCat) curCat.items.push(curItem);
    // Filter fee items: keep only construction items that belong in the
    // technical proposal.
    let feeFiltered = 0;
    for (const cat of categories) {
        if (cat.items) {
            const before = cat.items.length;
            cat.items = cat.items.filter(it => !isFeeItem(it.name));
            feeFiltered += before - cat.items.length;
        }
    }
    if (feeFiltered > 0) console.log(`[BillWorker] 费用项过滤: 移除 ${feeFiltered}`);
    // ========== Merge by item name (core dedup, shrinks the item count) ====
    // Rule: within one category, items with the same name merge into one:
    //   - code: first non-empty code wins
    //   - unit: first non-empty unit wins
    //   - quantity: numeric sum when possible, else joined with semicolons
    //   - spec: deduplicated then joined (overlong entries truncated)
    let totalBeforeMerge = 0, totalAfterMerge = 0;
    for (const cat of categories) {
        if (!cat.items || !cat.items.length) continue;
        totalBeforeMerge += cat.items.length;
        const nameMap = new Map(); // name → merged item
        for (const item of cat.items) {
            const key = (item.name || '').replace(/\s+/g, '').trim();
            if (!key) continue;
            if (!nameMap.has(key)) {
                nameMap.set(key, {
                    code: item.code || '',
                    name: item.name,
                    unit: item.unit || '',
                    quantity: item.quantity || '',
                    spec: item.spec || '',
                    _count: 1,
                    _quantities: item.quantity ? [item.quantity] : [],
                    _specs: item.spec ? [item.spec] : [],
                });
            } else {
                const m = nameMap.get(key);
                m._count++;
                // code: first non-empty wins
                if (!m.code && item.code) m.code = item.code;
                // unit: first non-empty wins
                if (!m.unit && item.unit) m.unit = item.unit;
                // quantity: collect them all
                if (item.quantity) m._quantities.push(item.quantity);
                // spec: collect distinct values only
                if (item.spec && !m._specs.includes(item.spec)) {
                    m._specs.push(item.spec);
                }
            }
        }
        // Post-processing: materialize the final fields.
        const merged = [];
        for (const [, m] of nameMap) {
            // quantity: try summing numerically
            if (m._quantities.length > 1) {
                const nums = m._quantities.map(q => parseFloat(q.replace(/,/g, '')));
                if (nums.every(n => !isNaN(n))) {
                    const sum = nums.reduce((a, b) => a + b, 0);
                    m.quantity = sum % 1 === 0 ? String(sum) : sum.toFixed(2);
                } else {
                    m.quantity = m._quantities.join('; ');
                }
            } else if (m._quantities.length === 1) {
                m.quantity = m._quantities[0];
            }
            // spec: join the deduplicated specs, each capped at 120 chars
            if (m._specs.length > 0) {
                const trimmed = m._specs.map(s => s.length > 120 ? s.substring(0, 120) + '...' : s);
                m.spec = trimmed.join('; ');
                // overall spec cap: 300 chars
                if (m.spec.length > 300) m.spec = m.spec.substring(0, 300) + '...';
            }
            // Drop the temporary accumulator fields.
            delete m._count; delete m._quantities; delete m._specs;
            merged.push(m);
        }
        cat.items = merged;
        totalAfterMerge += merged.length;
    }
    const mergedCount = totalBeforeMerge - totalAfterMerge;
    if (mergedCount > 0) {
        console.log(`[BillWorker] 按名称合并: ${totalBeforeMerge}${totalAfterMerge} 项(合并 ${mergedCount} 个重复项)`);
    }
    const valid = categories.filter(c => c.items && c.items.length > 0);
    const totalItems = valid.reduce((s, c) => s + c.items.length, 0);
    const withSpec = valid.reduce((s, c) => s + c.items.filter(it => it.spec).length, 0);
    const withCode = valid.reduce((s, c) => s + c.items.filter(it => it.code).length, 0);
    console.log(`[BillWorker] 最终结果: ${valid.length} 分部, ${totalItems} 清单项 (${withCode} 有编码, ${withSpec} 有spec)`);
    // Dump the first 3 items for debugging.
    let debugCount = 0;
    for (const cat of valid) {
        for (const it of cat.items) {
            if (debugCount < 3) {
                console.log(`[BillWorker] 样例: [${it.code}] ${it.name} | ${it.unit} | qty=${it.quantity} | spec=${(it.spec||'').substring(0, 80)}`);
                debugCount++;
            }
        }
    }
    return {
        project_summary: { remark: `本地解析:${valid.length} 个分部,${totalItems} 个清单项(合并前 ${totalBeforeMerge} 项)` },
        categories: valid,
    };
}
/**
* 判断清单项是否为"费用项"非施工内容不写入技术标
* 安全文明措施费规费税金暂列金额等
*/
function isFeeItem(name) {
if (!name) return false;
const n = name.replace(/\s+/g, '');
// ── 1. 精确匹配 ──
const EXACT = [
'规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
'总承包服务费', '企业管理费', '甲供材料保管费', '价税合计',
];
if (EXACT.includes(n)) return true;
// ── 2. 包含匹配:措施费/规费/保险/行政类 ──
const FEE_KW = [
'安全文明', '文明施工费', '环境保护费', '临时设施费',
'夜间施工增加费', '夜间施工费',
'冬雨季施工增加费', '冬雨季施工费',
'二次搬运费', '大型机械设备进出场', '大型机械进出场',
'施工排水降水', '排水降水费',
'已完工程及设备保护', '已完工程保护费',
'工程排污费', '社会保障费', '住房公积金',
'工伤保险', '劳动保险', '意外伤害保险', '建筑工程保险',
'城市维护建设税', '城市建设维护税',
'教育费附加', '地方教育附加',
'材料暂估', '专业工程暂估',
'超高施工增加费', '安全防护费',
'措施项目费', '其他项目费', '不可竞争费',
];
for (const kw of FEE_KW) {
if (n.includes(kw)) return true;
}
return false;
}
/**
* rawName 中的"项目名称"与内联"项目特征描述"分离
* : "土方开挖 1.土壤类别:普通土" { name: "土方开挖", spec: "1.土壤类别:普通土" }
*/
function splitNameAndSpec(rawName) {
if (!rawName) return { name: '', spec: '' };
// Pattern 1: 数字+点+中文(如 "1.土壤类别" "2、强度等级"
const m = rawName.match(/\d+[.、.)\uFF09]\s*[\u4e00-\u9fff]/);
if (m && m.index > 0) {
return {
name: rawName.substring(0, m.index).trim(),
spec: rawName.substring(m.index).trim()
};
}
// Pattern 2: 特征关键字+冒号(如 "材质:" "规格:"
const SPEC_KW_RE = /(材质|规格|型号|品牌|颜色|尺寸|厚度|直径|管径|强度|等级|类别|类型|做法|要求|标准|内容|工作内容|土壤|含量|配合比|工艺|方式|形式|范围|部位|位置|高度|宽度|长度|深度|坡度|截面|跨度|运距|开挖|回填|混凝土|钢筋|压实)[:]/;
const kw = rawName.match(SPEC_KW_RE);
if (kw && kw.index > 0) {
return {
name: rawName.substring(0, kw.index).trim(),
spec: rawName.substring(kw.index).trim()
};
}
// Pattern 3: 括号开头的特征描述 "1" "(1)"
const paren = rawName.match(/[(]\d+[)]/);
if (paren && paren.index > 0) {
return {
name: rawName.substring(0, paren.index).trim(),
spec: rawName.substring(paren.index).trim()
};
}
return { name: rawName, spec: '' };
}
function isCatTitle(text) {
const KW = [
'土建','建筑','结构','装饰','装修','安装','给排水','暖通','空调','通风',
'电气','强电','弱电','消防','智能化','幕墙','门窗','园林','绿化','景观',
'市政','道路','桥梁','管网','基础','地基','桩基','主体','屋面','防水',
'保温','钢结构','排水','给水','照明','动力','防雷','电梯','人防','室外',
'附属','分部','工程','措施','清单','土石方','混凝土','砌筑','模板','脚手架',
'水利','河道','管道','阀门','设备','仪表','自动化','通信','网络',
'拆除','外墙','内墙','楼地面','天棚','吊顶','栏杆','屋顶','涂料','抹灰',
'廊道','阀门井','蓄水池','泵站','供水','引水','水源','渠道','闸门',
'围栏','警示','检修','管线','配电','水池','水塔','取水','净水',
];
return KW.some(k => text.includes(k));
}
/**
* 判断分部标题是否为"费用类"不应创建分部分类
* 规费税金措施项目费其他项目费 等非施工类分部
*/
function isFeeCatTitle(text) {
if (!text) return false;
const t = text.replace(/\s+/g, '');
// 精确匹配整个标题
const EXACT = [
'规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
'总承包服务费', '企业管理费', '价税合计',
'措施项目费', '其他项目费', '不可竞争费',
];
if (EXACT.includes(t)) return true;
// 包含匹配
const FEE_CAT_KW = [
'措施项目费', '其他项目费', '不可竞争费',
'规费汇总', '税金汇总', '费率', '费用汇总', '费用合计',
'暂列金额', '暂估价', '计日工', '总承包服务费',
'安全文明施工费', '社会保障费', '住房公积金',
'工伤保险', '教育费附加', '城市维护建设税',
];
for (const kw of FEE_CAT_KW) {
if (t.includes(kw)) return true;
}
return false;
}

95
build.bat Normal file
View File

@ -0,0 +1,95 @@
@echo off
:: Build script for the BidPartner desktop EXE (PyInstaller).
:: Fixes vs. previous revision:
::   1. The sanitized settings file is now actually installed before the
::      build — previously "data\settings_clean.tmp" was written and then
::      deleted without ever being used, so the sanitization step did
::      nothing and real API keys could travel with the build tree.
::   2. "::"-style comments inside parenthesized blocks replaced with "rem"
::      (a ":" label inside "( )" can break cmd.exe parsing).
::   3. "data" directory is created if missing before writing into it.
chcp 65001 >nul 2>&1
setlocal
echo ============================================================
echo BidPartner - Build Desktop EXE
echo ============================================================
echo.
:: ── 1. Check Python ────────────────────────────────────────────────────────
python --version >nul 2>&1
if errorlevel 1 (
    echo [ERROR] Python not found. Please install Python 3.9+.
    pause & exit /b 1
)
:: ── 2. Install / upgrade PyInstaller ───────────────────────────────────────
echo [Step 1/4] Installing PyInstaller...
pip install --quiet --upgrade pyinstaller
if errorlevel 1 (
    echo [ERROR] Failed to install PyInstaller.
    pause & exit /b 1
)
:: ── 3. Install project dependencies (if not already installed) ─────────────
echo [Step 2/4] Checking dependencies...
pip install --quiet -r requirements.txt
if errorlevel 1 (
    echo [ERROR] Failed to install dependencies.
    pause & exit /b 1
)
:: ── 4. Sanitize settings.json - REMOVE API KEYS before build ───────────────
echo [Step 3/4] Sanitizing settings (removing API keys from build)...
if not exist "data" mkdir "data"
if exist "data\settings.json" (
    rem Back up real settings
    copy /y "data\settings.json" "data\settings.json.bak" >nul
)
:: Write a clean settings file with no real keys
(
echo {
echo "model_provider": "deepseek",
echo "qwen_api_key": "sk-your-qwen-key",
echo "qwen_model": "qwen3.6-plus",
echo "openai_api_key": "sk-your-openai-key",
echo "openai_model": "gpt-4o",
echo "deepseek_api_key": "sk-your-deepseek-key",
echo "deepseek_model": "deepseek-chat",
echo "max_concurrent": 5,
echo "content_volume": "standard"
echo }
) > "data\settings_clean.tmp"
:: BUGFIX: install the sanitized settings so the build only ever sees
:: placeholder keys (the real file was backed up above).
copy /y "data\settings_clean.tmp" "data\settings.json" >nul
:: ── 5. Build ────────────────────────────────────────────────────────────────
echo [Step 4/4] Building EXE with PyInstaller...
echo (This may take 3-10 minutes on first run)
echo.
:: Clean previous build artifacts
if exist "build" rd /s /q "build" >nul 2>&1
if exist "dist\BidPartner" rd /s /q "dist\BidPartner" >nul 2>&1
pyinstaller bid_partner.spec --noconfirm
set BUILD_RESULT=%errorlevel%
:: ── Restore real settings ───────────────────────────────────────────────────
if exist "data\settings.json.bak" (
    copy /y "data\settings.json.bak" "data\settings.json" >nul
    del /f /q "data\settings.json.bak" >nul 2>&1
) else (
    rem No original settings existed - remove the placeholder file we created
    del /f /q "data\settings.json" >nul 2>&1
)
del /f /q "data\settings_clean.tmp" >nul 2>&1
if %BUILD_RESULT% neq 0 (
    echo.
    echo [ERROR] PyInstaller build failed. See output above for details.
    pause & exit /b 1
)
:: ── 6. Result ───────────────────────────────────────────────────────────────
echo.
echo ============================================================
echo Build SUCCESSFUL!
echo Output: dist\BidPartner\bid_partner.exe
echo ============================================================
echo.
echo The 'dist\BidPartner' folder is your distributable package.
echo Users only need this folder - no Python installation required.
echo Each user must set their own API key in the app settings.
echo.
:: Open the output folder
explorer "dist\BidPartner" >nul 2>&1
endlocal
pause

102
config.py Normal file
View File

@ -0,0 +1,102 @@
import os
import sys
import threading
import random
import time
from contextlib import contextmanager

# When running as a PyInstaller bundle:
#   sys._MEIPASS        → read-only bundle dir (templates, static, prompts)
#   sys.executable dir  → writable dir next to the .exe (data, settings, db)
if getattr(sys, 'frozen', False):
    _BUNDLE_DIR = sys._MEIPASS                    # bundled app files
    BASE_DIR = os.path.dirname(sys.executable)    # writable runtime dir
else:
    _BUNDLE_DIR = os.path.dirname(os.path.abspath(__file__))
    BASE_DIR = _BUNDLE_DIR

# Runtime data layout (all under the writable BASE_DIR).
DATA_DIR = os.path.join(BASE_DIR, 'data')
UPLOAD_DIR = os.path.join(DATA_DIR, 'uploads')
EXPORT_DIR = os.path.join(DATA_DIR, 'exports')
KNOWLEDGE_DIR = os.path.join(DATA_DIR, 'knowledge')
DB_PATH = os.path.join(DATA_DIR, 'projects.db')
CHROMA_DIR = os.path.join(DATA_DIR, 'chroma')
PROMPTS_DIR = os.path.join(_BUNDLE_DIR, 'prompts')

# ==================== AI model configuration ====================
# Provider selection: 'openai' | 'qwen' | 'deepseek' | 'ollama'
MODEL_PROVIDER = os.environ.get('MODEL_PROVIDER', 'qwen')
# OpenAI
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'sk-your-openai-key')
OPENAI_MODEL = os.environ.get('OPENAI_MODEL', 'gpt-4.1')
OPENAI_BASE_URL = os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1')
# Alibaba Cloud Tongyi Qianwen (Qwen)
QWEN_API_KEY = os.environ.get('QWEN_API_KEY', 'sk-your-qwen-key')
QWEN_MODEL = os.environ.get('QWEN_MODEL', 'qwen3.6-plus')
QWEN_BASE_URL = os.environ.get('QWEN_BASE_URL', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
# DeepSeek
DEEPSEEK_API_KEY = os.environ.get('DEEPSEEK_API_KEY', 'sk-your-deepseek-key')
DEEPSEEK_MODEL = os.environ.get('DEEPSEEK_MODEL', 'deepseek-chat')
DEEPSEEK_BASE_URL = os.environ.get('DEEPSEEK_BASE_URL', 'https://api.deepseek.com/v1')
# Ollama (local, OpenAI-compatible API)
OLLAMA_BASE_URL = os.environ.get('OLLAMA_BASE_URL', 'http://localhost:11434/v1')
OLLAMA_MODEL = os.environ.get('OLLAMA_MODEL', 'qwen3:8b')
# Doubao / Volcengine (ByteDance, OpenAI-compatible API)
DOUBAO_API_KEY = os.environ.get('DOUBAO_API_KEY', 'sk-your-doubao-key')
DOUBAO_MODEL = os.environ.get('DOUBAO_MODEL', 'doubao-1-5-pro-32k')
DOUBAO_BASE_URL = os.environ.get('DOUBAO_BASE_URL', 'https://ark.cn-beijing.volces.com/api/v3')
# Kimi / Moonshot AI (OpenAI-compatible, supports embeddings)
KIMI_API_KEY = os.environ.get('KIMI_API_KEY', 'sk-your-kimi-key')
KIMI_MODEL = os.environ.get('KIMI_MODEL', 'moonshot-v1-32k')
KIMI_BASE_URL = os.environ.get('KIMI_BASE_URL', 'https://api.moonshot.cn/v1')
# Embedding models
OPENAI_EMBEDDING_MODEL = 'text-embedding-3-small'
QWEN_EMBEDDING_MODEL = 'text-embedding-v3'
KIMI_EMBEDDING_MODEL = 'moonshot-v1-embedding'

# ==================== Application configuration ====================
MAX_FILE_SIZE_MB = 50
ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx'}
# NOTE(review): hard-coded secret — acceptable for a single-machine app,
# but should come from the environment if this is ever deployed.
SECRET_KEY = 'bidhuo-partner-secret-2024'

# ==================== Generation configuration ====================
MAX_RETRIES = 3
REQUEST_TIMEOUT = int(os.environ.get('REQUEST_TIMEOUT', '180'))
# Outline generation uses long prompts and large outputs; extend the read
# timeout (seconds) so the client is not disconnected before the API returns.
OUTLINE_REQUEST_TIMEOUT = int(os.environ.get('OUTLINE_REQUEST_TIMEOUT', '300'))
CHUNK_SIZE = 2000        # knowledge-base text chunk size (characters)
CHUNK_OVERLAP = 200      # overlap between adjacent chunks
TOP_K_KNOWLEDGE = 3      # number of knowledge chunks retrieved per query
CONTENT_VOLUME = os.environ.get('CONTENT_VOLUME', 'standard')  # volume tier: concise / standard / detailed / full
TARGET_PAGES = int(os.environ.get('TARGET_PAGES', '0') or '0')  # target page count (0 = disabled)
PAGE_CHAR_ESTIMATE = int(os.environ.get('PAGE_CHAR_ESTIMATE', '700') or '700')  # rough characters-per-page estimate

# ==================== Concurrency control ====================
# Global cap on concurrent LLM calls, protecting cloud APIs (Qwen etc.)
# from rate limiting. Default 20 per user requirement.
LLM_CONCURRENCY_LIMIT = int(os.environ.get('LLM_CONCURRENCY_LIMIT', '20'))
_llm_semaphore = threading.Semaphore(LLM_CONCURRENCY_LIMIT)
@contextmanager
def llm_call():
    """Context manager gating every LLM call behind the global semaphore.

    All ai_client.chat / embedding calls must run inside this context.
    Waits at most 60 s for a free slot (prevents deadlock), then releases
    the slot on exit. For the Qwen provider a tiny random delay is added
    to spread out bursts and avoid 429 rate limiting.
    """
    if not _llm_semaphore.acquire(blocking=True, timeout=60.0):
        raise TimeoutError(f"LLM并发已达上限({LLM_CONCURRENCY_LIMIT}),请稍后重试")
    try:
        if MODEL_PROVIDER == 'qwen':
            # Qwen is RPM-sensitive: 0-0.08 s of jitter de-synchronizes callers.
            time.sleep(random.uniform(0, 0.08))
        yield
    finally:
        _llm_semaphore.release()
# Default number of sections generated concurrently (UI will expose higher caps later).
MAX_CONCURRENT_SECTIONS = int(os.environ.get('MAX_CONCURRENT_SECTIONS', '12'))

View File

@ -0,0 +1,22 @@
{
"_meta": "附件类章节stack_charts_only 为默认,叶节点按 diagram 意图栈只输出 [FIGURE]/[TABLE] 块、无叙述正文full 为长文single_chart_only 为栈顶单块。修改后重启生效。",
"_field_docs": {
"title_regex": "标题任一则正则匹配即视为附件节Python re 语法)",
"table_hint_keywords": "标题含此类子串且双开关均开时倾向表格",
"figure_hint_keywords": "标题含此类子串且双开关均开时倾向图示",
"default_kind_when_ambiguous": "双开且标题无倾向词时的默认figure 或 table",
"attachment_leaf_body_mode": "stack_charts_only意图栈只生成图/表块full与常规章节相同长文single_chart_only仅栈顶一块图或表"
},
"schema_version": 1,
"attachment_leaf_body_mode": "stack_charts_only",
"title_regex": [
"附件\\s*[一二三四五六七八九十0-9A-Za-z、:.]",
"附\\s*图",
"附\\s*表",
"附\\s*件\\s*\\(",
"^\\s*[\\d一二三四五六七八九十\\..、]+\\s*附件"
],
"table_hint_keywords": ["附表", "一览表", "清单表", "表", "统计表", "明细表"],
"figure_hint_keywords": ["附图", "示意图", "平面图", "流程图", "布置图", "组织图", "横道"],
"default_kind_when_ambiguous": "table"
}

View File

@ -0,0 +1,51 @@
{
"_meta": "章节级图/表意图:标题与大纲窗口关键词计分,阈值入栈,按栈序拼接图示/表格生成规范。修改后重启服务生效。",
"_field_docs": {
"threshold_figure": "图示倾向分达到此值才入栈",
"threshold_table": "表格倾向分达到此值才入栈",
"title_weight": "标题命中的权重乘子",
"context_weight": "大纲上下文窗口命中的权重乘子",
"outline_context_lines": "before/after 为相对匹配行上下扩展行数",
"stack_order_when_both": "figure_first | table_first | score_desc两者同时入栈时的顺序栈顶为 index 0",
"figure_keywords": "字符串或 {text,weight} 对象列表",
"table_keywords": "同上"
},
"schema_version": 1,
"threshold_figure": 1.0,
"threshold_table": 1.0,
"title_weight": 1.0,
"context_weight": 0.6,
"outline_context_lines": {"before": 4, "after": 6},
"stack_order_when_both": "score_desc",
"figure_keywords": [
{"text": "组织", "weight": 1.0},
{"text": "架构", "weight": 1.0},
{"text": "流程", "weight": 1.2},
{"text": "工序", "weight": 1.0},
{"text": "进度", "weight": 1.2},
{"text": "横道", "weight": 1.5},
{"text": "网络图", "weight": 1.5},
{"text": "平面", "weight": 1.0},
{"text": "布置", "weight": 0.8},
{"text": "监测", "weight": 0.8},
{"text": "示意", "weight": 0.8},
{"text": "应急", "weight": 0.8}
],
"table_keywords": [
{"text": "一览表", "weight": 1.5},
{"text": "人员", "weight": 1.0},
{"text": "配置", "weight": 0.8},
{"text": "设备", "weight": 1.0},
{"text": "机械", "weight": 0.9},
{"text": "劳动力", "weight": 1.2},
{"text": "工种", "weight": 1.0},
{"text": "检验", "weight": 1.0},
{"text": "验收", "weight": 0.9},
{"text": "材料", "weight": 1.0},
{"text": "供应", "weight": 0.9},
{"text": "风险", "weight": 1.0},
{"text": "措施", "weight": 0.6},
{"text": "清单", "weight": 0.8},
{"text": "计划", "weight": 0.7}
]
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
data/projects.db Normal file

Binary file not shown.

BIN
data/projects.db-shm Normal file

Binary file not shown.

BIN
data/projects.db-wal Normal file

Binary file not shown.

24
data/settings.json Normal file
View File

@ -0,0 +1,24 @@
{
"model_provider": "qwen",
"qwen_api_key": "sk-999173b3ca7f425a97cc4b12a2d3575f",
"qwen_model": "qwen3.6-plus",
"qwen_base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
"openai_api_key": "sk-your-openai-key",
"openai_model": "gpt-4.1",
"openai_base_url": "https://api.openai.com/v1",
"deepseek_api_key": "sk-your-deepseek-key",
"deepseek_model": "deepseek-chat",
"deepseek_base_url": "https://api.deepseek.com/v1",
"ollama_base_url": "http://localhost:11434/v1",
"ollama_model": "qwen3:8b",
"doubao_api_key": "sk-your-doubao-key",
"doubao_model": "doubao-1-5-pro-32k",
"doubao_base_url": "https://ark.cn-beijing.volces.com/api/v3",
"kimi_api_key": "sk-your-kimi-key",
"kimi_model": "moonshot-v1-32k",
"kimi_base_url": "https://api.moonshot.cn/v1",
"max_concurrent": 10,
"content_volume": "full",
"target_pages": 120,
"page_char_estimate": 700
}

23
data/style_presets.json Normal file
View File

@ -0,0 +1,23 @@
{
"standard": {
"bodyMode": "text_only",
"figureEnabled": false,
"tableEnabled": true,
"bodyFont": "宋体",
"bodySize": "小四",
"bodyLineSpacing": 1.5,
"headingFont": "黑体",
"headingSize": "三号",
"footerFont": "宋体",
"footerSize": "五号",
"pageCountTarget": 100,
"margins": {
"top": 2.54,
"bottom": 2.54,
"left": 3.18,
"right": 3.18
},
"headerText": "标桥AI编标",
"footerText": "第 X 页 / 共 Y 页"
}
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,30 @@
{
"_meta": "字数分配约束规则:与「标书篇幅预期」四档的 base/core 配合,按技术评分项权重与章节标题相关性分配各叶节点最低字数与提示词中的评分要点提示。修改后重启服务生效;字段说明见同文件 _field_docs。",
"_field_docs": {
"schema_version": "规则文件版本号,解析时可做迁移",
"alpha": "0~1评分驱动强度越大则高分相关章节越接近 core、低相关越接近 base",
"budget_mode": "target_pages启用目标页数且 TARGET_PAGES>0 时,全书叶节点目标总字数为 TARGET_PAGES*PAGE_CHAR_ESTIMATE无技术评分时叶节均分该总预算。未启用页数时无评分则返回 None。anchor_meanN*(base+core)/2anchor_baseN*base",
"per_section_floor": "单节 min_chars 下限(不低于此整数)",
"per_section_cap": "单节 min_chars 上限(不超过 core 时可设为 core 或略高)",
"relevance.method": "keyword_overlap标题与评分项名称/关键词的字面重叠度",
"relevance.min_rating_weight": "忽略权重低于此值的评分项(减少噪声)",
"rating_parse": "预留;解析器内置多形态 rating_json无需在此配置",
"prompt.top_k_rating_items": "写入本节字数说明中的相关评分项名称条数上限",
"max_tokens_scale": "若为 true按 min_chars/base 比例缩放本段 max_tokens仍受模型上限约束"
},
"schema_version": 1,
"alpha": 0.85,
"budget_mode": "target_pages",
"per_section_floor": null,
"per_section_cap": null,
"relevance": {
"method": "keyword_overlap",
"min_rating_weight": 0.01
},
"rating_parse": {},
"prompt": {
"top_k_rating_items": 4,
"intro_line": "本节须对下列技术评分要点作实质展开(结合工艺、流程、标准与可验证措施,禁止空泛承诺与复述招标文件):"
},
"max_tokens_scale": false
}

172
launcher.py Normal file
View File

@ -0,0 +1,172 @@
"""
标伙伴 · AI标书助手 桌面启动器
运行此文件 (或打包后的 bid_partner.exe) 即可自动启动本地服务并打开浏览器
"""
import os
import sys
import socket
import threading
import time
import webbrowser
import urllib.request
import logging
# ── 找可用端口 ──────────────────────────────────────────────────────────────
def _find_free_port(start: int = 5000, attempts: int = 20) -> int:
for port in range(start, start + attempts):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
s.bind(('127.0.0.1', port))
return port
except OSError:
continue
return start # 最坏情况:直接用 5000让 Flask 报错
PORT = _find_free_port()
# ── 日志 ────────────────────────────────────────────────────────────────────
def _setup_logging():
    """Route all launcher logs to bid_partner.log next to the exe (frozen) or script."""
    if getattr(sys, 'frozen', False):
        # PyInstaller bundle: place the log beside the executable.
        base_dir = os.path.dirname(sys.executable)
    else:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    handler = logging.FileHandler(
        os.path.join(base_dir, 'bid_partner.log'), encoding='utf-8', mode='a'
    )
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
        handlers=[handler],
    )
# ── 启动 Flask 服务 ─────────────────────────────────────────────────────────
def _start_server():
    """Run the embedded Flask app on 127.0.0.1:PORT; failures are logged, never raised."""
    log = logging.getLogger('launcher')
    try:
        import app as flask_app
        flask_app.init_db()
        flask_app.app.run(
            host='127.0.0.1',
            port=PORT,
            threaded=True,
            debug=False,
            use_reloader=False,
        )
    except Exception as e:
        log.error(f'服务启动失败: {e}', exc_info=True)
# ── 等待服务就绪 ─────────────────────────────────────────────────────────────
def _wait_for_server(timeout: int = 60) -> bool:
    """Poll the local server until it answers or *timeout* seconds elapse."""
    endpoint = f'http://127.0.0.1:{PORT}'
    stop_at = time.time() + timeout
    while True:
        if time.time() >= stop_at:
            return False
        try:
            urllib.request.urlopen(endpoint, timeout=1)
        except Exception:
            time.sleep(0.4)
        else:
            return True
# ── 主界面 (tkinter) ─────────────────────────────────────────────────────────
def _run_gui():
    """Show the small tkinter launcher window; open the browser once the server is up.

    Raises (caught by main()) when tkinter is unavailable, in which case the
    headless console mode is used instead.
    """
    import tkinter as tk
    from tkinter import ttk, font as tkfont
    URL = f'http://127.0.0.1:{PORT}'
    root = tk.Tk()
    root.title('标伙伴 · AI标书助手')
    root.geometry('400x220')
    root.resizable(False, False)
    root.configure(bg='#f5f5f5')
    # ── Title ──
    title_font = tkfont.Font(family='微软雅黑', size=14, weight='bold')
    tk.Label(root, text='标伙伴 · AI 标书助手', font=title_font,
             bg='#f5f5f5', fg='#1a1a2e').pack(pady=(22, 4))
    # ── Status line ──
    status_var = tk.StringVar(value='正在启动服务,请稍候…')
    status_lbl = tk.Label(root, textvariable=status_var,
                          font=('微软雅黑', 10), bg='#f5f5f5', fg='#555')
    status_lbl.pack(pady=4)
    # ── Clickable URL ──
    url_lbl = tk.Label(root, text='', font=('Consolas', 10),
                       bg='#f5f5f5', fg='#1a73e8', cursor='hand2')
    url_lbl.pack(pady=2)
    url_lbl.bind('<Button-1>', lambda _: webbrowser.open(URL))
    # ── Buttons ──
    btn_frame = tk.Frame(root, bg='#f5f5f5')
    btn_frame.pack(pady=18)
    open_btn = ttk.Button(btn_frame, text='打开浏览器',
                          command=lambda: webbrowser.open(URL),
                          state='disabled', width=14)
    open_btn.pack(side='left', padx=8)
    quit_btn = ttk.Button(btn_frame, text='退出程序',
                          command=root.destroy, width=10)
    quit_btn.pack(side='left', padx=8)
    # ── Footer note ──
    tk.Label(root, text='单机版 · 本地运行 · 数据不上传',
             font=('微软雅黑', 8), bg='#f5f5f5', fg='#aaa').pack(pady=(0, 10))
    # ── Background poll: update the UI once the server answers ──
    def _on_ready():
        status_var.set('服务已就绪 ✓')
        status_lbl.config(fg='#2e7d32')
        url_lbl.config(text=URL)
        open_btn.config(state='normal')
        webbrowser.open(URL)
    def _on_timeout():
        status_var.set('启动超时,请查看 bid_partner.log')
        status_lbl.config(fg='#c62828')
    def _check():
        # root.after(0, ...) marshals the UI update back onto the Tk main thread.
        if _wait_for_server():
            root.after(0, _on_ready)
        else:
            root.after(0, _on_timeout)
    threading.Thread(target=_check, daemon=True).start()
    root.mainloop()
# ── 无图形模式(仅控制台) ────────────────────────────────────────────────────
def _run_headless():
    """Console-only fallback used when the tkinter GUI cannot start."""
    print(f'[标伙伴] Starting server on port {PORT} ...')
    if not _wait_for_server():
        print('[标伙伴] Server did not start within 60 s. Check bid_partner.log.')
        return
    print(f'[标伙伴] Ready → http://127.0.0.1:{PORT}')
    webbrowser.open(f'http://127.0.0.1:{PORT}')
    try:
        # Block the main thread until the user hits Ctrl+C.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print('[标伙伴] Shutting down.')
# ── 入口 ─────────────────────────────────────────────────────────────────────
def main():
    """Launcher entry point: start the server thread, then the GUI (or console fallback)."""
    _setup_logging()
    threading.Thread(target=_start_server, daemon=True).start()
    try:
        _run_gui()
    except Exception:
        # tkinter missing or no display available: fall back to console mode.
        _run_headless()
if __name__ == '__main__':
main()

1
modules/__init__.py Normal file
View File

@ -0,0 +1 @@

98
modules/checker.py Normal file
View File

@ -0,0 +1,98 @@
"""
合规检查模块检查生成的标书是否响应了招标关键要求
"""
import json
import logging
import re
import sqlite3
from utils import ai_client
logger = logging.getLogger(__name__)
CHECK_PROMPT = """你是一位专业的投标文件技术审核专家。请对照以下【技术评分要求】,检查【标书技术内容】的覆盖情况,输出技术合规检查报告。
重要限制必须遵守
本次检查范围仅限技术内容包括技术方案实施能力技术指标质量保障人员配置技术创新等
严禁将商务评分价格评分资质评分报价合同条款付款方式等商务内容纳入检查项
若技术评分要求中混有商务条款直接忽略不得作为检查项输出
技术评分要求
{requirements}
标书技术内容各章节摘要
{content}
请输出以下格式的 JSON每个 item 均为技术评分项不含任何商务内容
{{
"overall_score": 85,
"status": "良好",
"items": [
{{
"requirement": "技术评分要求描述",
"covered": true,
"note": "说明"
}}
],
"missing_points": ["未覆盖的技术要点1", "未覆盖的技术要点2"],
"suggestions": ["技术内容改进建议1", "技术内容改进建议2"]
}}
"""
def check_compliance(db_path: str, project_id: int) -> dict:
    """Run the AI compliance check for *project_id* against its technical rating requirements.

    Reads the parsed tender requirements and the generated bid sections from
    SQLite, asks the LLM for a coverage report, and returns the parsed JSON
    dict. On any failure a dict with an ``error`` key is returned instead of
    raising.

    Fix: ``raw`` is now initialized before the try block — previously, if a
    ``json.JSONDecodeError`` escaped before ``raw`` was assigned, the handler
    itself raised ``NameError`` and masked the real error.
    """
    conn = sqlite3.connect(db_path)
    raw = ''  # defined up-front so the JSONDecodeError handler can always echo it
    try:
        # Tender requirements for this project.
        cur = conn.cursor()
        cur.execute(
            "SELECT summary, rating_requirements FROM tender_data WHERE project_id=?",
            (project_id,)
        )
        td = cur.fetchone()
        if not td:
            return {'error': '尚未解析招标文件'}
        # Only the technical rating requirements serve as the baseline; the
        # summary may contain commercial terms that must stay out of scope.
        requirements = (td[1] or '').strip()
        if not requirements:
            return {'error': '尚未提取技术评分要求,请先完成步骤一的招标文件解析'}
        # Collect generated section contents (first 500 chars of each).
        cur.execute(
            "SELECT section_title, content FROM bid_sections WHERE project_id=? AND status='done' ORDER BY order_index",
            (project_id,)
        )
        rows = cur.fetchall()
        if not rows:
            return {'error': '尚未生成标书内容,请先生成'}
        content_parts = []
        for title, content in rows:
            snippet = (content or '')[:500].replace('\n', ' ')
            content_parts.append(f"{title}{snippet}")
        content_str = '\n'.join(content_parts)
        # Ask the model, truncating both sides to keep the prompt bounded.
        prompt = CHECK_PROMPT.format(requirements=requirements[:3000], content=content_str[:6000])
        raw = ai_client.chat(prompt, temperature=0.2, max_tokens=2048)
        # Strip Markdown fences and isolate the outermost JSON object.
        raw = re.sub(r'```(?:json)?\s*', '', raw).replace('```', '').strip()
        m = re.search(r'\{[\s\S]*\}', raw)
        if m:
            raw = m.group(0)
        result = json.loads(raw)
        return result
    except json.JSONDecodeError as e:
        logger.error(f'合规检查结果解析失败: {e}')
        return {'error': f'AI 返回格式异常: {e}', 'raw': raw}
    except Exception as e:
        logger.exception('合规检查失败')
        return {'error': str(e)}
    finally:
        conn.close()

View File

@ -0,0 +1,635 @@
"""
技术暗标 HTML 格式检查 清标工具.js 迁移不依赖浏览器/jsdom
仅解析内联 style 与文档内 <style> 中的 @page 简单规则无内联样式时部分项可能判为不符合
"""
from __future__ import annotations
import re
from typing import Any
from bs4 import BeautifulSoup, Tag
# 1pt ≈ 96/72 px (CSS reference pixel)
_PT_PX = 96.0 / 72.0
# Chinese type sizes: 三号 = 16pt, 四号 = 14pt, 五号 = 10.5pt; line height 26pt.
_TARGET_H = 16 * _PT_PX  # heading size in px (≈ 21.33)
_TARGET_BODY = 14 * _PT_PX  # body text size in px
_TARGET_LH = 26 * _PT_PX  # body line height in px
_TARGET_FIG = 10.5 * _PT_PX  # figure/table text size in px
def _parse_style_attr(style: str | None) -> dict[str, str]:
if not style or not style.strip():
return {}
out: dict[str, str] = {}
for part in style.split(";"):
part = part.strip()
if ":" not in part:
continue
k, v = part.split(":", 1)
k, v = k.strip().lower(), v.strip()
if k:
out[k] = v
return out
def _num(s: str) -> float:
try:
return float(re.sub(r"[^\d.\-]", "", s) or "nan")
except ValueError:
return float("nan")
def _length_to_px(val: str, font_size_px: float | None = None) -> float:
"""将 font-size / line-height 等长度转为近似 px 浮点,用于与 JS 中 getComputedStyle(px) 对齐。"""
val = (val or "").strip().lower()
if not val or val in ("normal", "inherit", "initial"):
return float("nan")
if val.isdigit():
return float(val)
m = re.match(r"^([\d.]+)\s*(pt|px|em|rem)?\s*$", val)
if not m:
m2 = re.match(r"^([\d.]+)", val)
return float(m2.group(1)) if m2 else float("nan")
n, unit = float(m.group(1)), (m.group(2) or "px")
if unit == "pt":
return n * _PT_PX
if unit == "px":
return n
if unit in ("em", "rem") and font_size_px and font_size_px == font_size_px:
return n * font_size_px
if unit in ("em", "rem"):
return n # 无字号时仅返回 em 数,供 text-indent 等判断
return n
def _indent_value(style: dict[str, str], font_size_px: float) -> float:
"""与 JS 中 parseFloat(textIndent) 对齐:'2em' -> 2.0'2ch' 等取首数字段。"""
raw = (style.get("text-indent") or "").strip()
if not raw:
return float("nan")
if "em" in raw.lower():
m = re.search(r"([\d.]+)\s*em", raw, re.I)
return float(m.group(1)) if m else _num(raw)
# px 转 em 近似
px = _length_to_px(raw, font_size_px)
if px == px and font_size_px > 0:
return px / font_size_px
return _num(raw)
def _color_normalized(style: dict[str, str]) -> str:
c = (style.get("color") or "").strip().lower()
if not c:
return ""
c = c.replace(" ", "")
if c in ("#000", "#000000", "black", "rgb(0,0,0)"):
return "rgb(0, 0, 0)"
m = re.match(r"rgb\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)", c)
if m:
r, g, b = int(m.group(1)), int(m.group(2)), int(m.group(3))
if r == 0 and g == 0 and b == 0:
return "rgb(0, 0, 0)"
return c
return c
def _el_style_dict(tag: Tag) -> dict[str, str]:
    """Inline style of *tag* as a {property: value} dict (empty when absent)."""
    attr = tag.get("style")
    if isinstance(attr, str):
        return _parse_style_attr(attr)
    if isinstance(attr, list):
        # bs4 may return multi-valued attributes as a list of strings.
        return _parse_style_attr(";".join(attr))
    return {}
def _get_inline_property(tag: Tag, prop: str) -> str:
    """Value of one inline CSS property on *tag*, '' when not set."""
    return _el_style_dict(tag).get(prop.lower(), "")
def _outer_html_sample(tag: Tag, limit: int = 200) -> str:
s = str(tag)
return s[:limit] if len(s) > limit else s
def _is_under(node: Tag | None, ancestor: Tag | None) -> bool:
if node is None or ancestor is None:
return False
p: Tag | None = node
while p is not None:
if p is ancestor:
return True
p = p.parent
return False
def _body_text(soup: BeautifulSoup) -> str:
    """Newline-joined visible text of <body>, or of the whole soup when body is missing/empty."""
    container = soup.body
    if not container:
        return soup.get_text("\n", strip=True)
    return container.get_text("\n", strip=True)
def _parse_page_margins_from_html(raw_html: str) -> dict[str, str] | None:
"""从 <style> 中粗提取 @page 块内 margin 与 size。"""
for m in re.finditer(
r"@page\s*\{([^}]+)\}",
raw_html,
re.I | re.DOTALL,
):
block = m.group(1)
msh = re.search(r"margin\s*:\s*([^;]+);", block, re.I)
if msh:
return {"shorthand": msh.group(1).strip()}
margins: dict[str, str] = {}
for name, key in (
(r"margin-top\s*:\s*([^;]+)", "top"),
(r"margin-bottom\s*:\s*([^;]+)", "bottom"),
(r"margin-left\s*:\s*([^;]+)", "left"),
(r"margin-right\s*:\s*([^;]+)", "right"),
(r"size\s*:\s*([^;]+)", "size"),
):
mm = re.search(name, block, re.I)
if mm:
margins[key] = mm.group(1).strip()
if margins:
return margins
return None
def check_technical_bid(html_content: str) -> dict[str, Any]:
    """
    Run the formatting checks for an anonymised ("dark") technical bid HTML.

    Only inline ``style`` attributes and simple ``@page`` rules inside
    ``<style>`` blocks are inspected; documents without inline styles may
    fail some checks by default.

    Returns a dict matching 清标数据.json: ``overall`` (bool), ``details``
    (one entry per rule) and ``violations`` (failed rules with element
    snippets).
    """
    results: dict[str, Any] = {
        "overall": True,
        "details": [],
        "violations": [],
    }

    def add_result(
        rule_name: str,
        passed: bool,
        message: str,
        elements: list[Tag] | None = None,
    ) -> None:
        # Record one rule outcome; any failure flips the overall flag and
        # captures truncated HTML snippets of the offending elements.
        results["details"].append(
            {"rule": rule_name, "passed": passed, "message": message}
        )
        if not passed:
            results["overall"] = False
            el_snips: list[str] = []
            for el in elements or []:
                if isinstance(el, Tag):
                    el_snips.append(_outer_html_sample(el))
            results["violations"].append(
                {"rule": rule_name, "message": message, "elements": el_snips}
            )

    if not (html_content or "").strip():
        add_result("身份信息隐藏", False, "HTML 内容为空", [])
        return results
    raw_html = html_content
    soup = BeautifulSoup(html_content, "lxml")
    if not soup.body:
        # Fragment without <body>: wrap it so body-level queries still work.
        soup = BeautifulSoup(f"<html><body>{html_content}</body></html>", "lxml")
    body = soup.body
    if not body:
        add_result("身份信息隐藏", False, "无法解析 body", [])
        return results
    # ---- 1. Identity hiding (no company names, addresses, persons, logos) ----
    body_text = _body_text(soup)
    company_pattern = re.compile(
        r"(?:我公司|本公司|[(]?[A-Za-z\u4e00-\u9fa5]+(?:集团|股份|有限|责任|公司)[)]?)"
    )
    addr_pattern = re.compile(
        r"(?:省|市|区|县|镇|路|街|大道|号|大厦|楼|层)[\u4e00-\u9fa50-9]+"
    )
    # Role labels followed by a 2-4 char real name (heavenly-stem placeholders excluded).
    name_pattern = re.compile(
        r"(?:总监理工程师|专业监理工程师|技术负责人|项目经理)[:]\s*"
        r"[^甲乙丙丁戊己庚辛壬癸\s]{2,4}(?=[,。;\s]|$)"
    )
    found_company = bool(company_pattern.search(body_text))
    found_addr = bool(addr_pattern.search(body_text))
    found_name = bool(name_pattern.search(body_text))
    has_logo = False
    for img in soup.find_all("img"):
        if not isinstance(img, Tag):
            continue
        alt = (img.get("alt") or "") + ""
        src = (img.get("src") or "") + ""
        if re.search(r"logo|商标|微标|公司|品牌", alt, re.I) or re.search(
            r"logo", src, re.I
        ):
            has_logo = True
            break
    passed_id = not (
        found_company or found_addr or found_name or has_logo
    )
    add_result(
        "身份信息隐藏",
        passed_id,
        "未发现投标人身份信息"
        if passed_id
        else "发现投标人身份信息(公司名/地址/真实姓名/商标)",
    )

    def heading_style_ok(tag: Tag) -> bool:
        # Heading rule: 三号 (16pt) 黑体, bold, black, no italics/underline.
        st = _el_style_dict(tag)
        fs_raw = st.get("font-size", "")
        fs_px = _length_to_px(fs_raw)
        if "em" in (fs_raw or "").lower() and "rem" not in (fs_raw or "").lower():
            # Plain em on a heading: approximate against the 16px browser default.
            fs_px = _num(fs_raw) * 16.0
        size_ok = abs(fs_px - _TARGET_H) <= 3
        fam = (st.get("font-family") or "").lower()
        font_ok = "黑体" in fam or "simhei" in fam or "microsoft yahei" in fam
        font_style = (st.get("font-style") or "").lower()
        style_ok = font_style != "italic"
        text_dec = (st.get("text-decoration") or "").lower()
        decor_ok = "underline" not in text_dec
        cr = (st.get("color") or "").strip().lower()
        if not cr or cr in ("inherit", "initial"):
            color_ok = True
        else:
            cn = _color_normalized(st)
            color_ok = cn == "rgb(0, 0, 0)" or cr in (
                "#000",
                "#000000",
                "black",
                "rgb(0,0,0)",
            )
        fw = (st.get("font-weight") or "400").lower()
        weight_ok = fw not in ("400", "normal")
        if not st.get("font-size"):
            # A missing inline font-size cannot be verified and counts as a failure.
            size_ok = False
        return (
            size_ok
            and font_ok
            and style_ok
            and decor_ok
            and color_ok
            and weight_ok
        )

    # ---- 2. Headings (h1-h6, role=heading, .heading/.title) ----
    heading_tags: list[Tag] = []
    for sel in ("h1", "h2", "h3", "h4", "h5", "h6"):
        heading_tags.extend(soup.find_all(sel))
    for t in soup.find_all(attrs={"role": "heading"}):
        if isinstance(t, Tag):
            heading_tags.append(t)
    for t in soup.select(".heading, .title"):
        if isinstance(t, Tag) and t not in heading_tags:
            heading_tags.append(t)
    invalid_h: list[Tag] = []
    for h in heading_tags:
        if not isinstance(h, Tag):
            continue
        if not heading_style_ok(h):
            invalid_h.append(h)
    h_ok = len(invalid_h) == 0
    add_result(
        "标题格式",
        h_ok,
        "所有标题符合三号黑体要求"
        if h_ok
        else "部分标题字号/字体/颜色/下划线不符合要求",
        invalid_h,
    )

    def body_el_ok(el: Tag) -> bool:
        # Body rule: 四号 (14pt) 宋体, 2em first-line indent, 26pt line height,
        # black, no bold/italic/underline. Headings and header/footer/TOC pass.
        st = _el_style_dict(el)
        if el.name in ("h1", "h2", "h3", "h4", "h5", "h6"):
            return True
        cls = " ".join(el.get("class", [])) if el.get("class") else ""
        if any(
            x in cls
            for x in ("header", "footer", "toc", "目录", "table-of-contents")
        ):
            return True
        text = el.get_text(strip=True)
        if not text:
            return True
        fs_raw = st.get("font-size", "")
        font_px = _length_to_px(fs_raw)
        if not fs_raw:
            return False
        size_ok = abs(font_px - _TARGET_BODY) <= 2
        fam = (st.get("font-family") or "").lower()
        font_ok = "宋体" in fam or "simsun" in fam or "serif" in fam
        col = st.get("color", "")
        color_ok = (not col) or _color_normalized(st) == "rgb(0, 0, 0)" or col.lower() in (
            "#000",
            "#000000",
            "black",
            "rgb(0,0,0)",
        )
        ind = _indent_value(st, font_px)
        # NaN-safe check: ind == ind rejects a missing text-indent.
        indent_ok = ind == ind and 1.8 <= ind <= 2.2
        lh_raw = (st.get("line-height") or "").strip()
        if not lh_raw:
            line_ok = False
        else:
            if "pt" in lh_raw or "px" in lh_raw:
                lh_px = _length_to_px(lh_raw, font_px)
            elif re.match(r"^[\d.]+$", lh_raw):
                # Unitless line-height multiplies the element font size.
                lh_px = float(lh_raw) * font_px
            else:
                lh_px = _length_to_px(lh_raw, font_px)
            line_ok = abs(lh_px - _TARGET_LH) <= 2
        tdec = (st.get("text-decoration") or "").lower()
        decor_ok = "underline" not in tdec
        fw = (st.get("font-weight") or "400").lower()
        weight_ok = fw in ("400", "normal", "")
        fst = (st.get("font-style") or "").lower()
        style_ok = fst != "italic"
        return (
            size_ok
            and font_ok
            and color_ok
            and indent_ok
            and line_ok
            and decor_ok
            and weight_ok
            and style_ok
        )

    # ---- 3. Body paragraphs ----
    exclude_set = {
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
    }
    invalid_body: list[Tag] = []
    for el in soup.find_all(["p", "div", "span", "li", "td", "th"]):
        if not isinstance(el, Tag):
            continue
        if el.name in exclude_set:
            continue
        if "header" in " ".join(el.get("class", [])):
            continue
        if "footer" in " ".join(el.get("class", [])):
            continue
        if "toc" in " ".join(el.get("class", [])) or "目录" in " ".join(
            el.get("class", [])
        ):
            continue
        if not el.get_text(strip=True):
            continue
        if not body_el_ok(el):
            invalid_body.append(el)
    b_ok = len(invalid_body) == 0
    add_result(
        "正文格式",
        b_ok,
        "所有正文符合四号宋体/缩进/行距/颜色要求"
        if b_ok
        else "部分正文段落格式不符合要求",
        invalid_body,
    )
    # ---- 4. Table of contents ----
    toc_els: list[Tag] = []
    for cls in ("toc", "table-of-contents", "目录"):
        for t in soup.find_all(class_=cls):
            if isinstance(t, Tag) and t not in toc_els:
                toc_els.append(t)
    for t in soup.find_all(attrs={"role": "directory"}):
        if isinstance(t, Tag) and t not in toc_els:
            toc_els.append(t)
    if not toc_els:
        add_result("目录要求", False, "未检测到目录,请确保包含目录且目录无页码无页眉页脚")
    else:
        no_pn = True
        no_hf = True
        for toc in toc_els:
            text = toc.get_text("\n", strip=True)
            lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
            for line in lines:
                # NOTE(review): the nesting below is redundant — only the
                # dotted-leader test (r"\.{2,}\s*\d+") ever clears no_pn; the
                # outer digit checks merely gate it. Kept byte-identical.
                if re.search(r"\d+\s*$", line) and re.search(r"\d$", line):
                    if re.search(r"\.{2,}\s*\d+", line) or re.match(
                        r"^.*\d$", line
                    ):
                        if re.search(r"\.{2,}\s*\d+", line):
                            no_pn = False
                    if re.search(r"\.{2,}\s*\d+", line):
                        no_pn = False
            if toc.find(class_=re.compile("header|page-header", re.I)):
                no_hf = False
            if toc.find(class_=re.compile("footer|page-footer", re.I)):
                no_hf = False
        t_ok = no_pn and no_hf
        add_result(
            "目录要求",
            t_ok,
            "目录符合无页码、无页眉页脚要求"
            if t_ok
            else "目录中存在页码或页眉页脚",
        )
    # ---- 5. Figures / appendix (charts allowed only inside the appendix) ----
    appendix: Tag | None = None
    for sel in (
        "#appendix",
        ".appendix",
        ".attachment",
        '[id*="附件"]',
        '[class*="附件"]',
        '[class*="附表"]',
    ):
        hit = soup.select_one(sel)
        if hit and isinstance(hit, Tag):
            appendix = hit
            break
    illegal: list[Tag] = []
    for tbl in soup.find_all("table"):
        if isinstance(tbl, Tag) and not _is_under(tbl, appendix):
            illegal.append(tbl)
    for im in soup.find_all("img"):
        if isinstance(im, Tag) and not _is_under(im, appendix):
            illegal.append(im)
    for el in soup.find_all("figure"):
        if isinstance(el, Tag) and not _is_under(el, appendix):
            illegal.append(el)
    for el in soup.find_all(class_="chart"):
        if isinstance(el, Tag) and not _is_under(el, appendix) and el not in illegal:
            illegal.append(el)
    chart_text_valid = True
    if appendix:
        # Text inside appendix charts must be 五号 (10.5pt) 宋体, black.
        for el in appendix.select("table, td, th, figcaption, .chart-text"):
            if not isinstance(el, Tag):
                continue
            st = _el_style_dict(el)
            if not st.get("font-size"):
                continue
            fs = _length_to_px(st.get("font-size", ""))
            size_ok = abs(fs - _TARGET_FIG) <= 1.5
            fam = (st.get("font-family") or "").lower()
            font_ok = "宋体" in fam or "simsun" in fam
            c_raw = (st.get("color") or "").strip()
            if c_raw and c_raw.lower() not in ("inherit", "initial"):
                c_ok = _color_normalized(st) == "rgb(0, 0, 0)" or c_raw.lower() in (
                    "#000",
                    "#000000",
                    "black",
                    "rgb(0,0,0)",
                )
            else:
                c_ok = True
            if not (size_ok and font_ok and c_ok):
                chart_text_valid = False
    c_ok2 = len(illegal) == 0 and chart_text_valid
    add_result(
        "图表规范",
        c_ok2,
        "图表仅出现在附件/附表内,且图表文字符合五号宋体"
        if c_ok2
        else f"正文中发现{len(illegal)}个图表或附件内图表文字格式错误",
        illegal,
    )
    # ---- 6. Color and decoration (no colored text, underline, emphasis marks) ----
    color_v: list[Tag] = []
    decor_v: list[Tag] = []
    for el in soup.find_all(True):
        if not isinstance(el, Tag):
            continue
        st = _el_style_dict(el)
        if not st.get("color") and not st.get("text-decoration") and not st.get(
            "border-bottom-style"
        ):
            continue
        col = (st.get("color") or "").strip().lower()
        if col and col not in (
            "inherit",
            "initial",
            "",
            "#000",
            "#000000",
            "black",
            "rgb(0,0,0)",
            "rgb(0, 0, 0)",
        ):
            if _color_normalized(st) and _color_normalized(st) != "rgb(0, 0, 0)":
                if el.get_text(strip=True):
                    color_v.append(el)
        tdec = (st.get("text-decoration") or "").lower()
        if "underline" in tdec and el.get_text(strip=True):
            decor_v.append(el)
        bbs = (st.get("border-bottom-style") or "").lower()
        if bbs in ("solid", "dotted") and el.get_text(strip=True):
            decor_v.append(el)
    col_ok = len(color_v) == 0 and len(decor_v) == 0
    add_result(
        "颜色与装饰",
        col_ok,
        "无彩色文字、无下划线、无着重号"
        if col_ok
        else f"发现{len(color_v)}处彩色文字,{len(decor_v)}处下划线/着重号",
        (color_v + decor_v)[:20],
    )
    # ---- 7. Page setup (A4 portrait, 2.54cm top/bottom, 3.18cm left/right) ----
    page_valid = True  # NOTE(review): set but never read below — dead flag.
    margin_top = margin_bottom = margin_left = margin_right = None
    page_info = _parse_page_margins_from_html(raw_html)
    # Word exports usually place @page inside <style>; parsed from raw_html.
    if page_info and "shorthand" in page_info:
        # e.g. "margin: 2.54cm 3.18cm"
        parts = page_info["shorthand"].split()
        if len(parts) >= 4:
            margin_top, margin_right, margin_bottom, margin_left = (
                parts[0],
                parts[1],
                parts[2],
                parts[3],
            )
        elif len(parts) == 2:
            margin_top = margin_bottom = parts[0]
            margin_left = margin_right = parts[1]
    elif page_info:
        margin_top = page_info.get("top")
        margin_bottom = page_info.get("bottom")
        margin_left = page_info.get("left")
        margin_right = page_info.get("right")
    bst = _el_style_dict(body) if body else {}
    mraw = bst.get("margin", "")
    if mraw and not margin_top:
        # Fall back to an inline margin shorthand on <body>.
        margins = mraw.split()
        if len(margins) >= 1:
            margin_top = margins[0]
        if len(margins) >= 2:
            margin_right = margins[1]
        if len(margins) >= 3:
            margin_bottom = margins[2]
        if len(margins) >= 4:
            margin_left = margins[3]
        else:
            margin_left = margin_right
    if not margin_top and body:
        margin_top = _get_inline_property(body, "margin-top")
        margin_bottom = _get_inline_property(body, "margin-bottom")
        margin_left = _get_inline_property(body, "margin-left")
        margin_right = _get_inline_property(body, "margin-right")
    if not any([margin_top, margin_bottom, margin_left, margin_right]) and not page_info:
        page_valid = False

    def m_ok(
        m: str | None,
        target: float,
    ) -> bool:
        # NOTE(review): both branches are identical — the "cm" test is
        # vestigial; non-cm values are compared numerically as-is.
        if not m:
            return False
        s = m.strip()
        if "cm" in s:
            return abs(_num(s) - target) < 0.01
        return abs(_num(s) - target) < 0.01

    top_ok = m_ok(margin_top, 2.54) or (
        (margin_top or "") in ("2.54cm", "1in")
    )
    bottom_ok = m_ok(margin_bottom, 2.54) or (
        (margin_bottom or "") in ("2.54cm", "1in")
    )
    left_ok = m_ok(margin_left, 3.18) or (margin_left or "").startswith("3.18")
    right_ok = m_ok(margin_right, 3.18) or (margin_right or "").startswith("3.18")
    html_tag = soup.find("html")
    w = _get_inline_property(html_tag, "width") if isinstance(html_tag, Tag) else ""  # type: ignore[arg-type]
    # A fixed, non-percentage width on <html> is treated as landscape.
    page_orientation = "横向" if w and w != "auto" and "%" not in w else "纵向"
    page_ok = bool(
        top_ok
        and bottom_ok
        and left_ok
        and right_ok
        and (page_orientation != "横向" or w in ("", "auto"))
    )
    if not margin_top:
        # No margin information at all: cannot verify, treat as failure.
        page_ok = False
    add_result(
        "页面设置",
        page_ok,
        "页面设置符合A4纵向/边距要求"
        if page_ok
        else "页面边距或纸张方向不符合要求",
    )
    return results

463
modules/exporter.py Normal file
View File

@ -0,0 +1,463 @@
"""
Word 文档导出模块
"""
import os
import re
import sqlite3
import logging
from datetime import datetime
from docx import Document
from docx.shared import Pt, Cm, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
import config
from utils.outline_numbering import format_heading_display
from utils.style_manager import get_preset, apply_preset_to_document
logger = logging.getLogger(__name__)
# Heading level -> (Word built-in style name, font size in pt, bold flag).
LEVEL_STYLES = {
    1: ('Heading 1', 16, True),
    2: ('Heading 2', 14, True),
    3: ('Heading 3', 13, False),
    4: ('Heading 4', 12, False),
}
def export_to_word(db_path: str, project_id: int, style_preset_name='standard') -> str:
    """Build the Word document for *project_id* and save it under data/exports/.

    Args:
        db_path: Path to the project SQLite database.
        project_id: Primary key in the ``projects`` table.
        style_preset_name: Style preset chosen on the home page.

    Returns:
        The generated file name (not the full path).

    Raises:
        ValueError: If the project does not exist.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Project name (used for the default title and the export file name).
        cur = conn.cursor()
        cur.execute("SELECT name FROM projects WHERE id=?", (project_id,))
        project = cur.fetchone()
        if not project:
            raise ValueError(f'项目 {project_id} 不存在')
        project_name = project[0]
        # First outline line (when present) becomes the title-page heading.
        cur.execute("SELECT outline FROM tender_data WHERE project_id=?", (project_id,))
        td = cur.fetchone()
        bid_title = project_name + '技术标书'
        if td and td[0]:
            first_line = td[0].strip().split('\n')[0].strip()
            if first_line:
                bid_title = first_line
        # All sections in document order.
        cur.execute('''
            SELECT section_number, section_title, level, is_leaf, content, intro_content
            FROM bid_sections
            WHERE project_id=?
            ORDER BY order_index
        ''', (project_id,))
        sections = cur.fetchall()
        preset = get_preset(style_preset_name)
        doc = _build_document(bid_title, sections, preset)
        # Persist under data/exports with a timestamped, sanitized file name.
        os.makedirs(config.EXPORT_DIR, exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # NOTE(review): '\u4e00-\u9fff' is a 3-char literal set, not a range;
        # CJK characters still pass because str.isalnum() accepts them — confirm intent.
        safe_name = ''.join(c for c in project_name if c.isalnum() or c in '._- \u4e00-\u9fff')
        filename = f'{safe_name}_{timestamp}.docx'
        filepath = os.path.join(config.EXPORT_DIR, filename)
        doc.save(filepath)
        logger.info(f'导出完成: {filepath} (使用样式预设: {style_preset_name})')
        return filename
    finally:
        conn.close()
DISCLAIMER_TEXT = """\
免责声明
本工具仅供学习交流免费使用所生成的技术方案不可直接用于投标请务必人工核对本工具不会通过任何平台进行销售请用户注意辨别真伪在您开始使用本AI标书制作服务之前请认真阅读并同意以下关键条款一旦您继续使用即表示您已充分理解并认可本提示的全部内容
服务定位
本工具为单机使用的AI标书辅助工具旨在帮助您生成标书的参考素材您需对最终自己编写的标书文件承担全部责任包括审核修改内容确保其符合相关法律法规及项目要求
准确性免责
本人不对AI生成内容的完全准确性与完整性作任何保证您有义务自行核实所有关键信息并自行承担因使用本工具所引发的一切后果
标书风险
本工具所生成的素材文件仅作参考若您使用包括引用修改或二次创作需自行承担由此可能导致的废标侵权等全部风险与责任本人不承担任何相关责任
责任限制
任何情形下本人均不对因使用本服务而造成的任何直接间接或衍生损失例如利润损失业务中断数据丢失等承担法律责任
其他事项
本人保留随时修改或终止本服务的权利本提示的解释及争议解决均适用中华人民共和国法律\
"""
def _add_disclaimer_page(doc: Document) -> None:
    """Append the disclaimer page to *doc*, ending with a page break."""
    # Centered page title.
    title_para = doc.add_paragraph()
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_run = title_para.add_run('免责声明')
    title_run.font.size = Pt(16)
    title_run.font.bold = True
    title_run.font.name = '黑体'
    # NOTE(review): the rPr element exists only because the font assignments
    # above created it — keep this east-Asian override after them.
    title_run._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')
    doc.add_paragraph()
    # Body lines; [2:] skips the "免责声明" title line rendered above.
    # NOTE(review): [2:] also drops the second line of DISCLAIMER_TEXT (the
    # intro paragraph), not a blank line as the old comment claimed — confirm intended.
    body_lines = DISCLAIMER_TEXT.split('\n')[2:]
    for line in body_lines:
        p = doc.add_paragraph()
        stripped = line.strip()
        # A sub-heading is any non-empty line without leading (full-width) indentation.
        is_section_title = bool(stripped) and not line.startswith(' ') and not line.startswith('\u3000')
        run = p.add_run(stripped if stripped else '')
        if is_section_title and stripped:
            run.font.bold = True
            run.font.size = Pt(11)
        else:
            run.font.size = Pt(10.5)
        run.font.name = 'Times New Roman'
        run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
        p.paragraph_format.space_after = Pt(4)
        _set_line_spacing_15(p)
    doc.add_page_break()
def _add_toc_tree_page(doc: Document, sections: list) -> None:
    """Render a static, tree-style table-of-contents page.

    Entries are indented by heading level. This is plain text rather than a
    Word TOC field, so the list does not auto-update inside Word.
    """
    heading_para = doc.add_paragraph()
    heading_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    heading_run = heading_para.add_run('目录')
    heading_run.font.size = Pt(16)
    heading_run.font.bold = True
    heading_run.font.name = '黑体'
    heading_run._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')
    doc.add_paragraph()
    for section_number, title, level, _, _, _ in sections:
        depth = min(int(level), 4)
        entry_text = format_heading_display(depth, str(section_number or ''), str(title or ''))
        entry_para = doc.add_paragraph()
        # 0.75 cm of indent per level below the top level.
        entry_para.paragraph_format.left_indent = Cm(0.75 * max(0, depth - 1))
        entry_para.paragraph_format.space_after = Pt(3)
        entry_run = entry_para.add_run(entry_text)
        entry_run.font.size = Pt(12)
        entry_run.font.name = '宋体'
        entry_run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
    doc.add_page_break()
def _build_document(bid_title: str, sections, preset=None) -> Document:
    """Assemble the full Word document: disclaimer, title page, TOC, chapters.

    Each row of *sections* is (section_number, title, level, is_leaf,
    content, intro). *preset* supplies margins, fonts, sizes and
    header/footer text; defaults to the 'standard' preset.
    """
    if preset is None:
        preset = get_preset('standard')
    doc = Document()
    # ── Page setup (A4; margins come from the preset) ─────────────────────
    section_obj = doc.sections[0]
    m = preset.get('margins_cm', {'top': 2.5, 'bottom': 2.5, 'left': 3.0, 'right': 2.5})
    section_obj.page_width = Cm(21)
    section_obj.page_height = Cm(29.7)
    section_obj.left_margin = Cm(m.get('left', 3.0))
    section_obj.right_margin = Cm(m.get('right', 2.5))
    section_obj.top_margin = Cm(m.get('top', 2.5))
    section_obj.bottom_margin = Cm(m.get('bottom', 2.5))
    # Header / footer text from preset (only rendered when configured)
    if preset.get('header_text'):
        header = section_obj.header
        if header.paragraphs:
            p = header.paragraphs[0]
            p.text = preset['header_text']
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    if preset.get('footer_text'):
        footer = section_obj.footer
        if footer.paragraphs:
            p = footer.paragraphs[0]
            p.text = preset['footer_text']
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # ── Disclaimer page (first page) ──────────────────────────────────────
    _add_disclaimer_page(doc)
    # ── Title page ────────────────────────────────────────────────────────
    title_para = doc.add_paragraph()
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_run = title_para.add_run(bid_title)
    title_run.font.size = Pt(22)
    title_run.font.bold = True
    title_run.font.color.rgb = RGBColor(0x1a, 0x56, 0xdb)
    title_run.font.name = preset.get('heading_font', '黑体')
    title_run._element.rPr.rFonts.set(qn('w:eastAsia'), preset.get('heading_font', '黑体'))
    doc.add_paragraph()
    date_para = doc.add_paragraph()
    date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    date_run = date_para.add_run(datetime.now().strftime('%Y年%m月'))
    date_run.font.size = Pt(14)
    date_run.font.name = '宋体'
    date_run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
    doc.add_page_break()
    # ── Static TOC page (after the title page, before the body) ───────────
    _add_toc_tree_page(doc, sections)
    # ── Chapter content ───────────────────────────────────────────────────
    for row in sections:
        section_number, title, level, is_leaf, content, intro = row
        level = min(int(level), 4)  # heading depth is capped at 4
        # Heading with full outline number, styled from the preset
        heading_text = format_heading_display(level, str(section_number or ''), str(title or ''))
        heading = doc.add_heading(level=level)
        heading.clear()
        run = heading.add_run(heading_text)
        _, font_size, bold = LEVEL_STYLES.get(level, ('Heading 4', 12, False))
        run.font.size = Pt(preset.get(f'heading{level}_size_pt', font_size))
        run.font.bold = bold
        run.font.name = preset.get('heading_font', '黑体' if level <= 2 else '宋体')
        run._element.rPr.rFonts.set(qn('w:eastAsia'), preset.get('heading_font', '黑体' if level <= 2 else '宋体'))
        # Chapter introduction (non-leaf nodes)
        if intro and intro.strip():
            _add_body_paragraphs(doc, intro, preset)
        # Body content (leaf nodes)
        if content and content.strip():
            _add_body_paragraphs(doc, content, preset)
    return doc
def _set_line_spacing_15(paragraph):
    """Force 1.5-line spacing on *paragraph* (WD_LINE_SPACING.MULTIPLE × 1.5)."""
    p_props = paragraph._element.get_or_add_pPr()
    spacing_el = p_props.find(qn('w:spacing'))
    if spacing_el is None:
        spacing_el = OxmlElement('w:spacing')
        p_props.append(spacing_el)
    # Single spacing is 240 twips, so 1.5× = 360 twips.
    spacing_el.set(qn('w:line'), '360')
    spacing_el.set(qn('w:lineRule'), 'auto')
# ── 图/表标记解析 ─────────────────────────────────────────────────────────
_BLOCK_PATTERN = re.compile(
r'\[FIGURE:([^\]]+)\](.*?)\[/FIGURE\]'
r'|\[TABLE:([^\]]+)\](.*?)\[/TABLE\]',
re.DOTALL
)
def _split_content_blocks(text: str) -> list:
"""
将章节正文拆分为有序内容块列表
{'type': 'text', 'content': '...'}
{'type': 'figure', 'title': '...', 'content': '...'}
{'type': 'table', 'title': '...', 'content': '...'}
"""
blocks = []
last = 0
for m in _BLOCK_PATTERN.finditer(text):
if m.start() > last:
blocks.append({'type': 'text', 'content': text[last:m.start()]})
if m.group(1) is not None:
blocks.append({'type': 'figure',
'title': m.group(1).strip(),
'content': m.group(2).strip()})
else:
blocks.append({'type': 'table',
'title': m.group(3).strip(),
'content': m.group(4).strip()})
last = m.end()
if last < len(text):
blocks.append({'type': 'text', 'content': text[last:]})
return blocks
def _set_para_shading(para, hex_fill: str):
    """Apply a background fill colour (hex RRGGBB, no '#') to a paragraph."""
    shading = OxmlElement('w:shd')
    for attr, value in (('w:val', 'clear'), ('w:color', 'auto'), ('w:fill', hex_fill)):
        shading.set(qn(attr), value)
    para._element.get_or_add_pPr().append(shading)
def _set_cell_bg(cell, hex_fill: str):
    """Apply a background fill colour (hex RRGGBB, no '#') to a table cell."""
    shading = OxmlElement('w:shd')
    for attr, value in (('w:val', 'clear'), ('w:color', 'auto'), ('w:fill', hex_fill)):
        shading.set(qn(attr), value)
    cell._tc.get_or_add_tcPr().append(shading)
def _set_cell_padding(cell, pt_value: float):
    """Set identical inner padding on all four sides of a table cell, in points."""
    # Word stores cell margins in twentieths of a point (dxa): 1pt = 20 twips.
    twips = str(int(pt_value * 20))
    margins = OxmlElement('w:tcMar')
    for side in ('top', 'left', 'bottom', 'right'):
        side_el = OxmlElement(f'w:{side}')
        side_el.set(qn('w:w'), twips)
        side_el.set(qn('w:type'), 'dxa')
        margins.append(side_el)
    cell._tc.get_or_add_tcPr().append(margins)
def _safe_set_eastasia(run, font_name: str):
    """Best-effort set of the east-Asian font on *run*.

    Reading run.font.size forces python-docx to materialise the rPr element
    first; any remaining failure is swallowed because a missing east-Asian
    font only degrades rendering, it does not corrupt the document.
    """
    _ = run.font.size  # touch the property so rPr exists
    try:
        run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
    except Exception:
        pass
def _add_block_caption(doc: Document, prefix: str, title: str):
    """Add a centred, bold caption line for a figure or table block."""
    caption = doc.add_paragraph()
    caption.alignment = WD_ALIGN_PARAGRAPH.CENTER
    caption.paragraph_format.space_before = Pt(8)
    caption.paragraph_format.space_after = Pt(3)
    caption_run = caption.add_run(f'{prefix}{title}')
    caption_run.font.bold = True
    caption_run.font.size = Pt(11)
    caption_run.font.name = 'Times New Roman'
    _safe_set_eastasia(caption_run, '黑体')
def _add_figure_block(doc: Document, title: str, content: str):
    """Render diagram text inside a bordered, shaded single-cell table.

    A one-cell 'Table Grid' table gives a full border plus background
    shading, which looks more polished than a shaded paragraph.
    """
    _add_block_caption(doc, '', title)
    box = doc.add_table(rows=1, cols=1)
    box.style = 'Table Grid'
    cell = box.cell(0, 0)
    _set_cell_bg(cell, 'EFF3FB')   # pale blue-grey background
    _set_cell_padding(cell, 5)     # 5pt inner padding
    for idx, line in enumerate(content.split('\n')):
        if idx == 0:
            para = cell.paragraphs[0]
            para.clear()
        else:
            para = cell.add_paragraph()
        para.paragraph_format.space_before = Pt(0)
        para.paragraph_format.space_after = Pt(1)
        # Empty lines become a single space so the row keeps its height.
        line_run = para.add_run(line if line else ' ')
        line_run.font.size = Pt(9.5)
        line_run.font.name = 'Courier New'   # monospace keeps ASCII art aligned
        _safe_set_eastasia(line_run, '宋体')
    # Trailing spacer paragraph after the figure box.
    spacer = doc.add_paragraph()
    spacer.paragraph_format.space_after = Pt(8)
def _add_word_table(doc: Document, title: str, content: str):
    """Parse a Markdown table and render it as a styled Word table.

    Falls back to plain-text rendering when no valid table rows are found.
    """
    # Parse markdown rows, dropping separator lines (|---|)
    raw_rows = []
    for line in content.strip().split('\n'):
        line = line.strip()
        if not line:
            continue
        if re.match(r'^\|[\s\-:| ]+\|$', line):
            continue  # separator row
        if line.startswith('|') and line.endswith('|'):
            cells = [c.strip() for c in line[1:-1].split('|')]
            raw_rows.append(cells)
    if not raw_rows:
        # No valid rows parsed — degrade to plain text
        _add_block_caption(doc, '', title)
        _add_plain_text(doc, content)
        return
    col_count = max(len(r) for r in raw_rows)
    # Pad short rows so every row has col_count cells
    rows = [r + [''] * (col_count - len(r)) for r in raw_rows]
    _add_block_caption(doc, '', title)
    table = doc.add_table(rows=len(rows), cols=col_count)
    table.style = 'Table Grid'
    for i, row_data in enumerate(rows):
        for j, cell_text in enumerate(row_data):
            cell = table.cell(i, j)
            para = cell.paragraphs[0]
            para.clear()
            # Header row centred and bold; data rows left-aligned
            para.alignment = WD_ALIGN_PARAGRAPH.CENTER if i == 0 else WD_ALIGN_PARAGRAPH.LEFT
            run = para.add_run(cell_text)
            run.font.size = Pt(10)
            run.font.bold = (i == 0)
            run.font.name = 'Times New Roman'
            _safe_set_eastasia(run, '宋体')
            if i == 0:
                _set_cell_bg(cell, 'D6E4F7')  # light-blue header background
    # Spacer paragraph after the table
    sp = doc.add_paragraph()
    sp.paragraph_format.space_after = Pt(6)
def _add_plain_text(doc: Document, text: str, preset=None):
    """Add body paragraphs (one per non-empty line) using preset font/size."""
    if preset is None:
        preset = get_preset('standard')
    body_size = preset.get('body_size_pt', 12)
    body_font = preset.get('body_font', 'Times New Roman')
    east_asia_font = preset.get('body_font', '宋体')
    for raw_line in text.split('\n'):
        content = raw_line.strip()
        if not content:
            continue
        para = doc.add_paragraph()
        para.paragraph_format.first_line_indent = Pt(24)
        para.paragraph_format.space_after = Pt(6)
        _set_line_spacing_15(para)
        body_run = para.add_run(content)
        body_run.font.size = Pt(body_size)
        body_run.font.name = body_font
        _safe_set_eastasia(body_run, east_asia_font)
def _add_body_paragraphs(doc: Document, text: str, preset=None):
    """Render body text, expanding [FIGURE:...] / [TABLE:...] markers.

    Figure/table rendering can be switched off per preset; a disabled block
    falls back to plain-text rendering of its inner content.
    """
    if preset is None:
        preset = get_preset('standard')
    figures_on = preset.get('figure_enabled', True)
    tables_on = preset.get('table_enabled', True)
    for block in _split_content_blocks(text):
        kind = block['type']
        if kind == 'figure' and figures_on:
            _add_figure_block(doc, block['title'], block['content'])
        elif kind == 'table' and tables_on:
            _add_word_table(doc, block['title'], block['content'])
        else:
            _add_plain_text(doc, block['content'], preset)

1212
modules/generator.py Normal file

File diff suppressed because it is too large Load Diff

288
modules/knowledge.py Normal file
View File

@ -0,0 +1,288 @@
"""
企业知识库模块无外部向量库依赖
存储后端SQLite与主数据库共用同一文件
- knowledge_vectors 文本块 + JSON 向量
- knowledge_files 文件元数据已在 app.py init_db 中建立
检索策略
Qwen / OpenAI provider Embedding API + 余弦相似度语义检索
DeepSeek / Ollama SQL LIKE 关键词检索降级
"""
import json
import math
import logging
import os
import sqlite3
import threading
from datetime import datetime
import config
from utils.file_utils import extract_text, split_text_chunks
logger = logging.getLogger(__name__)
# 正在后台入库的文件名集合(供前端轮询感知"处理中"状态)
_processing_files: set = set()
_processing_lock = threading.Lock()
# 每次 Embedding API 批量请求的块数(避免单次请求过大)
_EMBED_BATCH = 16
# ─── 数据库 ──────────────────────────────────────────────────────────────────
def _conn() -> sqlite3.Connection:
    """Open a connection to the shared application SQLite database."""
    return sqlite3.connect(config.DB_PATH)
def _init_tables(cur: sqlite3.Cursor) -> None:
    """Ensure the vector-chunk table and its indexes exist.

    knowledge_files is created by init_db in app.py, so only
    knowledge_vectors is handled here.
    """
    cur.execute('''
        CREATE TABLE IF NOT EXISTS knowledge_vectors (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            file_name TEXT NOT NULL,
            chunk_idx INTEGER NOT NULL,
            text TEXT NOT NULL,
            embedding TEXT,
            UNIQUE(file_name, chunk_idx)
        )
    ''')
    # Index to speed up per-file deletes and lookups
    cur.execute('CREATE INDEX IF NOT EXISTS idx_kv_file ON knowledge_vectors(file_name)')
    # NOTE(review): a b-tree index on `text` cannot accelerate the
    # LIKE '%word%' queries used by search() (leading wildcard) — confirm
    # whether this index is worth its storage cost.
    cur.execute('CREATE INDEX IF NOT EXISTS idx_kv_text ON knowledge_vectors(text)')  # helps FTS/LIKE
    cur.execute('PRAGMA optimize')  # SQLite auto-optimization
# ─── Embedding API ────────────────────────────────────────────────────────────
def _get_embeddings_batch(texts: list[str]) -> list[list[float] | None]:
"""
调用 ai_client.get_embeddings (复用全局 semaphore 和客户端逻辑)
不支持 Embedding provider 返回全 None 列表优化了并发控制
"""
if not texts:
return []
try:
# 使用统一 ai_client 接口确保全局LLM semaphore生效避免重复客户端创建
from utils import ai_client
embeddings = ai_client.get_embeddings(texts)
return embeddings
except Exception as e:
if "NotImplementedError" in str(type(e).__name__) or "不支持" in str(e):
logger.info('Embedding provider不支持降级到关键词检索')
return [None] * len(texts)
logger.warning(f'Embedding API 调用失败,将使用关键词检索降级: {e}')
return [None] * len(texts)
def _cosine(a: list[float], b: list[float]) -> float:
"""纯 Python 余弦相似度,无需 numpy"""
dot = sum(x * y for x, y in zip(a, b))
na = math.sqrt(sum(x * x for x in a))
nb = math.sqrt(sum(x * x for x in b))
return dot / (na * nb) if na and nb else 0.0
# ─── 公开接口 ─────────────────────────────────────────────────────────────────
def is_available() -> dict:
    """Report knowledge-base status (always available — no external deps).

    search_mode: 'vector' (semantic search) or 'keyword' (LIKE fallback).
    Also returns the chunk count and the names of files still being ingested.
    """
    with _processing_lock:
        processing = list(_processing_files)
    try:
        db = _conn()
        cur = db.cursor()
        _init_tables(cur)
        db.commit()
        cur.execute('SELECT COUNT(*) FROM knowledge_vectors')
        doc_count = cur.fetchone()[0]
        # Any stored embedding proves the embedding API has worked before
        cur.execute('SELECT 1 FROM knowledge_vectors WHERE embedding IS NOT NULL LIMIT 1')
        has_embedding = cur.fetchone() is not None
        db.close()
        provider = getattr(config, 'MODEL_PROVIDER', '')
        # Providers known to expose an embedding endpoint
        can_embed = provider in ('qwen', 'openai', 'kimi')
        mode = 'vector' if (has_embedding or can_embed) else 'keyword'
        return {
            'available': True,
            'doc_count': doc_count,
            'processing': processing,
            'search_mode': mode,
        }
    except Exception as e:
        # Degrade to keyword mode but stay "available" so the UI keeps working
        return {
            'available': True,
            'doc_count': 0,
            'processing': processing,
            'search_mode': 'keyword',
            'error': str(e),
        }
def add_file(file_path: str, db_path: str) -> dict:
    """Chunk a file, batch-embed the chunks, and write them into SQLite.

    Runs on a background thread; _processing_files lets the front-end poll
    ingestion progress. Re-adding a file replaces its previous chunks.
    Returns {'success': True, 'chunks': n} or {'success': False, 'error': ...}.
    """
    file_name = os.path.basename(file_path)
    with _processing_lock:
        _processing_files.add(file_name)
    try:
        text = extract_text(file_path)
        chunks = split_text_chunks(text, config.CHUNK_SIZE, config.CHUNK_OVERLAP)
        if not chunks:
            return {'success': False, 'error': '文件内容为空,无法入库'}
        # Batch embeddings (effective for Qwen/OpenAI providers; all None otherwise)
        embeddings: list[list[float] | None] = []
        for i in range(0, len(chunks), _EMBED_BATCH):
            batch = chunks[i:i + _EMBED_BATCH]
            embeddings.extend(_get_embeddings_batch(batch))
        db = _conn()
        try:
            cur = db.cursor()
            _init_tables(cur)
            # Drop any previous chunks for the same file name first
            cur.execute('DELETE FROM knowledge_vectors WHERE file_name=?', (file_name,))
            for idx, (chunk, emb) in enumerate(zip(chunks, embeddings)):
                emb_json = json.dumps(emb) if emb is not None else None
                cur.execute(
                    'INSERT INTO knowledge_vectors (file_name, chunk_idx, text, embedding) VALUES (?,?,?,?)',
                    (file_name, idx, chunk, emb_json),
                )
            cur.execute('''
                INSERT OR REPLACE INTO knowledge_files (file_name, file_path, chunk_count, added_at)
                VALUES (?, ?, ?, ?)
            ''', (file_name, file_path, len(chunks), datetime.now()))
            db.commit()
        finally:
            db.close()
        logger.info(f'知识库入库完成: {file_name}{len(chunks)}'
                    f'{"(含向量)" if any(e is not None for e in embeddings) else "(关键词模式)"}')
        return {'success': True, 'chunks': len(chunks)}
    except Exception as e:
        logger.exception('知识库添加文件失败')
        return {'success': False, 'error': str(e)}
    finally:
        with _processing_lock:
            _processing_files.discard(file_name)
def search(query: str, top_k: int = None) -> list[str]:
    """Retrieve the text chunks most relevant to *query*.

    Vector mode: embed the query and rank stored chunks by cosine similarity.
    Keyword mode (no embedding support): multi-word SQL LIKE matching.
    Returns an empty list on any failure or when the store is empty.
    """
    if top_k is None:
        top_k = config.TOP_K_KNOWLEDGE
    try:
        db = _conn()
        try:
            cur = db.cursor()
            _init_tables(cur)
            db.commit()
            cur.execute('SELECT COUNT(*) FROM knowledge_vectors')
            if cur.fetchone()[0] == 0:
                return []
            # ── Vector (semantic) retrieval ──────────────────────────────
            q_embs = _get_embeddings_batch([query])
            q_emb = q_embs[0] if q_embs else None
            if q_emb is not None:
                # Cap the scan at the 500 newest chunks to avoid a full
                # table scan on large knowledge bases
                cur.execute(
                    '''SELECT text, embedding FROM knowledge_vectors
                       WHERE embedding IS NOT NULL
                       ORDER BY id DESC LIMIT 500'''
                )
                rows = cur.fetchall()
                if rows:
                    scored: list[tuple[float, str]] = []
                    for text, emb_json in rows:
                        try:
                            emb = json.loads(emb_json)
                            scored.append((_cosine(q_emb, emb), text))
                        except Exception:
                            continue  # skip chunks with corrupt embeddings
                    scored.sort(reverse=True)
                    return [t for _, t in scored[:top_k]]
            # ── Keyword fallback (DeepSeek / Ollama: no embedding API) ───
            # Drop purely numeric/ordinal tokens (e.g. "1.2", "一、") that
            # would match unrelated passages
            import re as _re
            _num_pat = _re.compile(r'^[\d\.\-、一二三四五六七八九十]+$')
            words = [
                w.strip() for w in query.split()
                if len(w.strip()) > 1 and not _num_pat.match(w.strip())
            ][:6]
            if not words:
                cur.execute('SELECT text FROM knowledge_vectors LIMIT ?', (top_k,))
                return [r[0] for r in cur.fetchall()]
            conditions = ' OR '.join(['text LIKE ?' for _ in words])
            params = [f'%{w}%' for w in words] + [top_k]
            cur.execute(
                f'SELECT text FROM knowledge_vectors WHERE {conditions} LIMIT ?', params
            )
            return [r[0] for r in cur.fetchall()]
        finally:
            db.close()
    except Exception as e:
        logger.error(f'知识库检索失败: {e}')
        return []
def list_files(db_path: str) -> list[dict]:
    """Return metadata for every ingested knowledge-base file, newest first.

    Any database error (e.g. missing table) yields an empty list.
    """
    try:
        connection = sqlite3.connect(db_path)
        cursor = connection.cursor()
        cursor.execute(
            'SELECT file_name, chunk_count, added_at FROM knowledge_files ORDER BY added_at DESC'
        )
        records = cursor.fetchall()
        connection.close()
        return [
            {'name': name, 'chunks': count, 'added_at': added}
            for name, count, added in records
        ]
    except Exception:
        return []
def delete_file(file_name: str, db_path: str) -> dict:
    """Remove every trace of *file_name* from the knowledge base.

    Deletes both the vector chunks and the file-metadata row.
    Returns {'success': True} or {'success': False, 'error': ...}.
    """
    try:
        database = _conn()
        cursor = database.cursor()
        _init_tables(cursor)
        cursor.execute('DELETE FROM knowledge_vectors WHERE file_name=?', (file_name,))
        cursor.execute('DELETE FROM knowledge_files WHERE file_name=?', (file_name,))
        database.commit()
        database.close()
        return {'success': True}
    except Exception as e:
        logger.exception('知识库删除文件失败')
        return {'success': False, 'error': str(e)}

206
modules/parser.py Normal file
View File

@ -0,0 +1,206 @@
"""
招标文件解析模块
流程提取文本 生成摘要 提取评分要求 结构化JSON
"""
import json
import logging
import re
import sqlite3
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from functools import partial
from utils import ai_client, prompts as P
from utils.file_utils import extract_text, truncate_text
from utils.tender_kind_sections import (
get_tender_kind_classify_prompt,
parse_tender_kind_response,
)
logger = logging.getLogger(__name__)
def parse_boq_file(db_path: str, project_id: int, file_path: str, file_name: str) -> None:
    """Parse a bill-of-quantities (BOQ) file on a background thread.

    Pipeline: extract page texts → local structural analysis → AI summary
    → persist to tender_data. boq_status transitions: none → parsing →
    done / error.
    """
    from utils.bill_analysis import analyze_boq_pages, categories_to_prompt_appendix
    from utils.boq_parser import extract_boq_pages
    conn = sqlite3.connect(db_path)
    try:
        _set_boq_status(conn, project_id, 'parsing', '正在提取工程量清单文本...')
        page_texts = extract_boq_pages(file_path)
        boq_text = '\n'.join(page_texts).strip()
        if not boq_text:
            raise ValueError('未能从文件中提取到有效内容,请检查文件格式')
        _set_boq_status(conn, project_id, 'parsing', '正在本地解析清单结构...')
        analysis = analyze_boq_pages(page_texts)
        boq_analysis_json = json.dumps(analysis, ensure_ascii=False)
        structured = ''
        # A structured appendix only makes sense for machine-readable pages
        if not analysis.get('scanned') and not analysis.get('no_bill_pages'):
            structured = categories_to_prompt_appendix(analysis)
        _set_boq_status(conn, project_id, 'parsing', '正在生成工程量清单摘要...')
        summary_prompt = P.get_boq_summary_prompt(boq_text[:10000], structured)
        boq_summary = ai_client.chat(summary_prompt, temperature=0.2, max_tokens=2048)
        cur = conn.cursor()
        cur.execute('''
            UPDATE tender_data
            SET boq_file_name=?, boq_text=?, boq_summary=?, boq_analysis_json=?,
                boq_status='done', boq_error='', updated_at=?
            WHERE project_id=?
        ''', (file_name, boq_text[:12000], boq_summary, boq_analysis_json, datetime.now(), project_id))
        conn.commit()
        logger.info(f'项目 {project_id} 工程量清单解析完成')
    except Exception as e:
        logger.exception(f'工程量清单解析失败 project_id={project_id}')
        _set_boq_status(conn, project_id, 'error', str(e))
    finally:
        conn.close()
def _set_boq_status(conn, project_id, status, message=''):
cur = conn.cursor()
cur.execute('''
UPDATE tender_data SET boq_status=?, boq_error=?, updated_at=?
WHERE project_id=?
''', (status, message, datetime.now(), project_id))
conn.commit()
def parse_tender_file(db_path: str, project_id: int, file_path: str, file_name: str) -> None:
    """Parse the tender document on a background thread and persist results.

    After text extraction, three independent AI tasks (summary, rating
    requirements, tender-kind classification) run in parallel; the JSON
    structuring step depends on the rating output and runs afterwards.
    A global semaphore caps total concurrency at 20.
    status transitions: pending → parsing → done / error.
    """
    conn = sqlite3.connect(db_path)
    try:
        _set_status(conn, project_id, 'parsing', '正在提取文件文本...')
        # 1. Extract and truncate the raw text
        raw_text = extract_text(file_path)
        raw_text = truncate_text(raw_text, 60000)
        excerpt = (raw_text or '')[:15000]
        _set_status(conn, project_id, 'parsing', '并行生成摘要、评分要求和类型识别...')
        # Three independent AI calls, run concurrently to speed up parsing
        def _run_summary():
            prompt = P.get_project_summary_prompt(raw_text)
            return ai_client.chat(prompt, temperature=0.3, max_tokens=4096)
        def _run_rating():
            prompt = P.get_rating_requirements_prompt(raw_text)
            return ai_client.chat(prompt, temperature=0.2, max_tokens=4096)
        def _run_kind():
            prompt = get_tender_kind_classify_prompt(excerpt)
            raw = ai_client.chat(prompt, temperature=0.1, max_tokens=32)
            return parse_tender_kind_response(raw)
        # Small pool; the global LLM semaphore enforces the overall cap (20)
        with ThreadPoolExecutor(max_workers=3, thread_name_prefix='parse') as executor:
            future_summary = executor.submit(_run_summary)
            future_rating = executor.submit(_run_rating)
            future_kind = executor.submit(_run_kind)
            summary = future_summary.result()
            rating_md = future_rating.result()
            tender_kind = future_kind.result()
        logger.info(f'项目 {project_id} 招标文件类型识别为: {tender_kind}')
        _set_status(conn, project_id, 'parsing', '正在结构化评分数据...')
        # 2. JSON structuring depends on rating_md, so it runs sequentially
        rating_json_prompt = P.get_rating_json_prompt(rating_md)
        rating_json_raw = ai_client.chat(rating_json_prompt, temperature=0.1, max_tokens=2048)
        rating_json_str = _clean_json(rating_json_raw)
        # 3. Persist parse results
        _upsert_tender_data(conn, project_id, file_name, raw_text,
                            summary, rating_md, rating_json_str, tender_kind)
        # Deep integration: persist diagram/anon defaults to the projects table.
        # Future: extract diagram intent and anonymisation rules from raw_text via AI.
        cur = conn.cursor()
        cur.execute('''
            UPDATE projects SET
                enable_figure = COALESCE(enable_figure, 1),
                enable_table = COALESCE(enable_table, 1),
                anon_requirements = COALESCE(anon_requirements, '不得出现投标人身份信息')
            WHERE id = ?
        ''', (project_id,))
        conn.commit()
        _set_status(conn, project_id, 'done', '解析完成(已同步生成设置)')
        logger.info(f'项目 {project_id} 招标文件解析完成(并行加速完成,生成设置已打通)')
    except Exception as e:
        logger.exception(f'解析失败 project_id={project_id}')
        _set_status(conn, project_id, 'error', str(e))
    finally:
        conn.close()
# ─── 内部工具 ──────────────────────────────────────────────────────────────
def _set_status(conn, project_id, status, message=''):
cur = conn.cursor()
cur.execute('''
INSERT INTO tender_data (project_id, status, error_message)
VALUES (?, ?, ?)
ON CONFLICT(project_id) DO UPDATE SET status=?, error_message=?, updated_at=?
''', (project_id, status, message, status, message, datetime.now()))
conn.commit()
def _upsert_tender_data(conn, project_id, file_name, raw_text,
                        summary, rating_md, rating_json_str,
                        tender_kind: str = 'engineering'):
    """Insert or update the parsed tender record, marking it 'done'.

    The parameters are bound twice because the ON CONFLICT update clause
    repeats every column.
    """
    cur = conn.cursor()
    cur.execute('''
        INSERT INTO tender_data
            (project_id, file_name, raw_text, summary, rating_requirements, rating_json,
             tender_kind, status, error_message)
        VALUES (?, ?, ?, ?, ?, ?, ?, 'done', '')
        ON CONFLICT(project_id) DO UPDATE SET
            file_name=?, raw_text=?, summary=?, rating_requirements=?,
            rating_json=?, tender_kind=?, status='done', error_message='', updated_at=?
    ''', (
        project_id, file_name, raw_text, summary, rating_md, rating_json_str, tender_kind,
        file_name, raw_text, summary, rating_md, rating_json_str, tender_kind, datetime.now()
    ))
    conn.commit()
def _clean_json(raw: str) -> str:
"""尝试从 AI 返回中提取 JSON 字符串"""
# 去除 markdown 代码块
raw = re.sub(r'```(?:json)?\s*', '', raw)
raw = raw.replace('```', '').strip()
# 验证是否是有效 JSON
try:
json.loads(raw)
return raw
except json.JSONDecodeError:
# 尝试提取 { ... } 部分
m = re.search(r'\{[\s\S]*\}', raw)
if m:
candidate = m.group(0)
try:
json.loads(candidate)
return candidate
except Exception:
pass
return raw

View File

@ -0,0 +1,36 @@
- 角色:技术标书架构师
- 能力:
- 单章节深度解构能力
- 跨章节协同规划视野
- 评分权重动态分配策略
- 任务:根据招标文件概要、章节主题、评分要求,生成结构化的技术标书该章节的目录
- 输出要求:
- 采用四级嵌套编码体系X.X.X.X确保章节颗粒度可控
- 直接给出生成的章节大纲,禁止解释和引导词
- markdown格式输出
- 示例输出,以"服务进度保障措施"为例:
二、智慧物流系统全生命周期进度保障
 2.1 基于BIM的进度协同管理平台
  2.1.1 多级进度计划耦合模型
   2.1.1.1 WBS-Milestone映射矩阵
   2.1.1.2 Primavera P6进度基线
  2.1.2 资源约束进度优化算法
   2.1.2.1 基于CPM的缓冲区间动态分配
   2.1.2.2 资源平滑度R=0.92
- 招标文件概要:
{summary}
- 章节主题:
{chapter}
- 评分要求:
{score}

158
prompts/outlines.txt Normal file
View File

@ -0,0 +1,158 @@
- 角色:技术标书架构师
- 任务:生成适配技术评分标准的技术标书目录
- 输出要求:
采用四级嵌套编码体系X.X.X.X下实现按需分层
直接给出生成的目录,禁止解释和引导词
- 约束控制:
根据项目生成标书的名称如“XXXX项目技术标书”
总的章节数应该控制在8-10个
章节颗粒度与评分指标权重正相关
技术实施类章节必须达到四级深度,管理保障类章节允许三级结构
同级节点数量必须有波动区间:技术方案类(4-7)、实施保障类(2-4)、创新应用类(1-3)
目录的章节不能缺少包含以下关键词的内容:
- 对本项目的了解和分析
- 项目工作重难点分析
- 项目实施方案
- 服务进度保障措施
- 服务质量保障方案
- 合理化建议
- 服务承诺及处罚措施
目录不包含成本和预算内容,但要平衡项目预算、技术可行性以及技术的专业度
- 示例输出:
<example>
花岭新城BIM项目技术标书
一、总体实施方案
 1.1 项目理解与需求分析
  1.1.1 项目概述
    1.1.1.1 建设地点及规模
    1.1.1.2 工程地质勘察报告
    1.1.1.3 抗震设防烈度与防火等级
    1.1.1.4 建筑结构形式与建筑面积分布
  1.1.2 项目背景
    1.1.2.1 核心宗旨与目标
    1.1.2.2 地理位置与项目规模
  1.1.3 项目目标
    1.1.3.1 就业机会与基础设施提升
    1.1.3.2 乡村振兴与经济增长
  1.1.4 项目特点
    1.1.4.1 框筒结构抗震性能
    1.1.4.2 分阶段工程地质勘察
    1.1.4.3 功能区域多样化
二、建筑设计
 2.1 主要设计依据
    2.1.1 国家标准与规范
    2.1.2 行业标准与图集
 2.2 建筑结构设计
    2.2.1 结构形式
    2.2.2 结构材料
    2.2.3 结构布局
    2.2.4 结构经济指标
    2.2.5 结构细节设计
 2.3 建筑功能布局
    2.3.1 C1#楼(厂房)
      2.3.1.1 功能分区明确
      2.3.1.2 流线优化与安全性
    2.3.2 配电房
      2.3.2.1 设计目标与设备布置
      2.3.2.2 空间规划与电气主接线方案
    2.3.3 外廊及架空建筑
      2.3.3.1 功能区域与景观设计
      2.3.3.2 光照与通风优化
 2.4 建筑材料选用
 2.5 建筑外观设计
 2.6 建筑室内布局
    2.6.1 功能分区与设计要点
 2.7 建筑安全和消防设计
    2.7.1 建筑安全体系
    2.7.2 消防系统设计
 2.8 建筑节能设计
    2.8.1 节能措施与绿色建材
    2.8.2 雨水收集系统
三、结构设计
 3.1 结构形式
 3.2 结构材料
    3.2.1 混凝土与钢材选用
 3.3 结构布局
    3.3.1 结构柱网与通风疏散通道
 3.4 结构经济指标
    3.4.1 抗震设计要求与用材控制
 3.5 结构细节设计
    3.5.1 基础设计与钢结构细节
    3.5.2 混凝土结构与抗震设计
 3.6 结构分析与计算
四、给排水设计
 4.1 引言
 4.2 供水系统设计
    4.2.1 供水管道与消防水源
    4.2.2 节水设计与雨水收集
 4.3 排水系统设计
    4.3.1 排水管道与雨水管理
    4.3.2 污水处理与分流制度
 4.4 给排水设备选择
 4.5 细节设计
 4.6 监测与维护
五、暖通设计
 5.1 引言
 5.2 供暖系统设计
    5.2.1 供暖方式与设备选择
    5.2.2 温度控制系统
 5.3 通风系统设计
    5.3.1 通风方式与设备选择
    5.3.2 空气质量控制
 5.4 空调系统设计
    5.4.1 空调方式与设备选择
    5.4.2 温湿度控制系统
 5.5 热水系统设计
 5.6 细节设计与监测维护
六、BIM设计
 6.1 项目总图与单体建筑设计
 6.2 道路与排水设计
 6.3 电气系统设计
 6.4 绿化设计
 6.5 BIM协同设计与施工管理
 6.6 数据管理与培训支持
七、设计说明
 7.1 项目设计依据
 7.2 设计原则
 7.3 结构经济合理化
 7.4 建筑功能分区
 7.5 设计细节要求
八、合理化建议
 8.1 建筑专业合理化建议
 8.2 结构专业合理化建议
 8.3 给排水专业合理化建议
 8.4 暖通专业合理化建议
 8.5 BIM专业合理化建议
8.6 技术和工艺方面的建议
8.7 成本和预算方面的建议
8.8 时间和进度方面的建议
8.9 施工质量管理方面的建议
8.10 质量和安全方面的建议
8.11 环境和可持续性方面的建议
九、施工进度安排
 9.1 施工进度安排
 9.2 施工进度跟踪与管理
 9.3 施工质量管理
 9.4 施工现场管理
 9.5 施工结项与验收
十、本项目工作重点难点分析
 10.1 工程特点与设计工作难点
 10.2 重点与难点分析
 10.3 综合解决措施
</example>
- 招标文件内容:
{document_text}
"""

View File

@ -0,0 +1,155 @@
- 角色:技术标书架构师
- 任务:生成适配技术评分标准的技术标书目录
- 输出要求:
采用四级嵌套编码体系X.X.X.X下实现按需分层
直接给出生成的目录,禁止解释和引导词
- 约束控制:
根据项目生成标书的名称如“XXXX项目技术标书”
总的章节数应该控制在8-10个,不超过10个
目录的章节必须按照技术评分标准的项目生成,题目应包括技术评分项目中的关键词:
章节颗粒度与评分指标权重正相关
技术方案类章节必须达到四级深度,管理保障类章节允许三级结构
同级节点数量必须有波动区间:技术方案类(4-7)、实施保障类(2-4)、创新应用类(1-3)
目录禁止包含报价、团队、资质、文件等商务性质的章节
- 示例输出:
<example>
花岭新城BIM项目技术标书
一、总体实施方案
 1.1 项目理解与需求分析
  1.1.1 项目概述
    1.1.1.1 建设地点及规模
    1.1.1.2 工程地质勘察报告
    1.1.1.3 抗震设防烈度与防火等级
    1.1.1.4 建筑结构形式与建筑面积分布
  1.1.2 项目背景
    1.1.2.1 核心宗旨与目标
    1.1.2.2 地理位置与项目规模
  1.1.3 项目目标
    1.1.3.1 就业机会与基础设施提升
    1.1.3.2 乡村振兴与经济增长
  1.1.4 项目特点
    1.1.4.1 框筒结构抗震性能
    1.1.4.2 分阶段工程地质勘察
    1.1.4.3 功能区域多样化
二、建筑设计
 2.1 主要设计依据
    2.1.1 国家标准与规范
    2.1.2 行业标准与图集
 2.2 建筑结构设计
    2.2.1 结构形式
    2.2.2 结构材料
    2.2.3 结构布局
    2.2.4 结构经济指标
    2.2.5 结构细节设计
 2.3 建筑功能布局
    2.3.1 C1#楼(厂房)
      2.3.1.1 功能分区明确
      2.3.1.2 流线优化与安全性
    2.3.2 配电房
      2.3.2.1 设计目标与设备布置
      2.3.2.2 空间规划与电气主接线方案
    2.3.3 外廊及架空建筑
      2.3.3.1 功能区域与景观设计
      2.3.3.2 光照与通风优化
 2.4 建筑材料选用
 2.5 建筑外观设计
 2.6 建筑室内布局
    2.6.1 功能分区与设计要点
 2.7 建筑安全和消防设计
    2.7.1 建筑安全体系
    2.7.2 消防系统设计
 2.8 建筑节能设计
    2.8.1 节能措施与绿色建材
    2.8.2 雨水收集系统
三、结构设计
 3.1 结构形式
 3.2 结构材料
    3.2.1 混凝土与钢材选用
 3.3 结构布局
    3.3.1 结构柱网与通风疏散通道
 3.4 结构经济指标
    3.4.1 抗震设计要求与用材控制
 3.5 结构细节设计
    3.5.1 基础设计与钢结构细节
    3.5.2 混凝土结构与抗震设计
 3.6 结构分析与计算
四、给排水设计
 4.1 引言
 4.2 供水系统设计
    4.2.1 供水管道与消防水源
    4.2.2 节水设计与雨水收集
 4.3 排水系统设计
    4.3.1 排水管道与雨水管理
    4.3.2 污水处理与分流制度
 4.4 给排水设备选择
 4.5 细节设计
 4.6 监测与维护
五、暖通设计
 5.1 引言
 5.2 供暖系统设计
    5.2.1 供暖方式与设备选择
    5.2.2 温度控制系统
 5.3 通风系统设计
    5.3.1 通风方式与设备选择
    5.3.2 空气质量控制
 5.4 空调系统设计
    5.4.1 空调方式与设备选择
    5.4.2 温湿度控制系统
 5.5 热水系统设计
 5.6 细节设计与监测维护
六、BIM设计
 6.1 项目总图与单体建筑设计
 6.2 道路与排水设计
 6.3 电气系统设计
 6.4 绿化设计
 6.5 BIM协同设计与施工管理
 6.6 数据管理与培训支持
七、设计说明
 7.1 项目设计依据
 7.2 设计原则
 7.3 结构经济合理化
 7.4 建筑功能分区
 7.5 设计细节要求
八、合理化建议
 8.1 建筑专业合理化建议
 8.2 结构专业合理化建议
 8.3 给排水专业合理化建议
 8.4 暖通专业合理化建议
 8.5 BIM专业合理化建议
8.6 技术和工艺方面的建议
8.7 成本和预算方面的建议
8.8 时间和进度方面的建议
8.9 施工质量管理方面的建议
8.10 质量和安全方面的建议
8.11 环境和可持续性方面的建议
九、施工进度安排
 9.1 施工进度安排
 9.2 施工进度跟踪与管理
 9.3 施工质量管理
 9.4 施工现场管理
 9.5 施工结项与验收
十、本项目工作重点难点分析
 10.1 工程特点与设计工作难点
 10.2 重点与难点分析
 10.3 综合解决措施
</example>
- 招标文件摘要:
{summary}
- 技术评分标准:
{rating}
"""

View File

@ -0,0 +1,92 @@
- 角色:招标文件编写专家,精通招标文件结构化、摘要编写
- 任务:根据用户提供的项目招标文件内容,生成一份专业、清晰的结构化摘要
- 要求:
一、摘要框架
1. 项目概况
- 项目名称
- 建设地点
- 工程性质(新建/改建/扩建)
- 核心建设内容
- 关键工程量指标
- 特殊施工工艺(如顶管/盾构等)
- 项目概况
2. 技术要求体系
- 专业监测要求(分项列出核心监测指标)
- 技术标准规范
- 质量管控要点
- 特殊工艺标准
3. 交付物矩阵
- 阶段性成果清单(含时间节点)
- 最终交付文件要求
- 成果验收标准
- 备案审批流程
4. 商务条款摘要
- 合同期限
- 支付结构
- 报价约束条件
- 违约条款要点
- 知识产权约定
5. 资质要求矩阵
- 企业资质门槛
- 人员资格要求
- 设备配置标准
- 同类项目经验
6. 评标要素体系
- 技术评分维度
- 商务评分权重
- 否决性条款
- 实质性条款
- 围标识别机制
二、处理规范
1. 信息抽取规则:
- 采用三级信息提炼法(关键数据→技术参数→约束条件)
- 识别并标注法定强制性条款(★号条款)
- 提取特殊工艺参数(例如顶管直径、沉井尺寸等)
2. 结构化呈现要求:
- 使用Markdown分级标题系统
- 技术参数格式化处理
- 流程节点采用时间轴呈现
- 关键数据突出显示(例如预算金额、最高限价)
3. 专业术语处理:
- 保持行业术语准确性
- 工程计量单位标准化转换
- 法律条款原文引述
三、输出示例
1.确保包含但不仅限于:
- 项目背景的技术参数分解
- 监测要求的分类归纳
- 成果交付的阶段性要求
- 商务条款的要点提炼
四、质量保障
1. 完整性核查清单:
- 验证五证要求(资质/业绩/人员/设备/资金)
- 检查三大核心条款(技术/商务/法律)
- 确认关键日期节点(工期/交付期/质保期)
2. 风险提示机制:
- 标注异常约束条款
- 识别排他性要求
- 提示潜在履约风险点
请严格按照上述结构化框架处理输入的招标文件,生成专业、准确、易读的项目摘要报告。
输出内容需符合工程领域专业规范,重点数据需二次核验确保准确性。
严格按照招标文件的内容,确保输出内容的完整性。
直接给出摘要,禁止说明和引导词。
- 用户提供的招标文件内容如下:
{bid_document}

23
prompts/rating_json.txt Normal file
View File

@ -0,0 +1,23 @@
- 任务从工程项目招标文件中提取技术评分要求并以严格的JSON格式输出。
- 要求:
必须生成完整有效的JSON对象不使用JSON之外的文本说明
数值类型字段不添加单位符号
包含所有的评分项及其权重分配
特殊说明字段仅在存在否决条款(强制性条款)时出现
- 输出结构(必须严格遵守根字段名与数组名,便于后续章节字数与要点映射):
{
"items": [
{
"id": "唯一短标识,如 T01",
"name": "评分项名称(与招标文件表述一致或精简概括)",
"weight": 数值型权重或分值(如 10 表示 10 分或 10%,
"keywords": ["与本项相关的可选关键词1", "关键词2"]
}
],
"notes": "可选:否决条款、阶梯得分等特殊说明;无则写空字符串"
}
- 技术评分要求内容如下:
{tech_rating}

View File

@ -0,0 +1,46 @@
- 角色:招标文件信息提取专家,精通技术评分/技术评审要求的提取
- 任务:请严格按照以下步骤分析提供的招标文件内容,并完整提取所有技术评分标准:
- 步骤与要求:
1. **结构解析**
- 首先识别文件整体结构,仅提取“技术评分”/“技术评审”部分
- 标注评分大类的权重占比(如出现)
2. **要素提取**
对“技术评分”板块进行深度解析,要求:
- 提取评分的全部细节,不能省略
- 明确列出技术评分的标准,如有(如"ISO认证+3分"、"项目经验每年加1分"
3. **结果呈现样例**
参考以下示例输出markdown结构化格式
# 招标技术评分细则
## 技术评分(80分)
- 对本项目的了解和分析(12分)
→ 对本项目的理解与项目背景把握准确,对本项目特点、实 施目标和定位内容详尽,完全满足项目需要,科学、合理、 针对性强、合理可行的,得 12 分; 对本项目的理解与项 目背景有一定把握,对本项目特点、实施目标和定位有阐 述说明,基本可行的,得 8 分;对本项目的理解与项目 背景把握片面,对本项目特点、实施目标和定位理解有较 大偏差,可行性较差的,得 4 分;未提供不得分。
→ 合理可行指:( 1完全响应采购需求 2相关内容的表述具有针对性全面、具体。
→ 基本可行指:( 1响应采购需求有微小偏差 2相关 内容的表述有一定的层次性、针对性,但全面性不够。
→ 可行性较差指:( 1响应采购需求有较大偏差 2相 关内容的表述针对性弱、全面性方面欠缺较大。
- 项目工作重难点分析(12分)
→ 根据供应商针对本项目工作重难点分析与解决方案的科学性、合理性且满足项目实际情况进行评分,项目工作重 难点分析到位、有针对性、完全符合项目实际情况,对应 的解决方案合理可行的,得 12 分;
项目工作重难点内容 基本准确、针对性一般、基本符合项目实际,对应的解决 方案基本可行的,得 8 分;
项目工作重难点分析一般,对应的解决方案一般、可行性较差的,得 4 分;未提供 不得分。
→ 合理可行指:( 1完全响应采购需求 2相关内容的表述具有针对性全面、具体。
→ 基本可行指:( 1响应采购需求有微小偏差 2相关 内容的表述有一定的层次性、针对性,但全面性不够。
→ 可行性较差指:( 1响应采购需求有较大偏差 2相 关内容的表述针对性弱、全面性方面欠缺较大。
- 项目实施方案(12分)
(继续展开...
请严格按照上述结构化框架处理输入的招标文件,生成专业、准确的项目技术评分/评审要求。
严格按照招标文件的内容,确保输出内容的完整性。
直接输出评分/评审要求,禁止说明和引导词。
- 招标文件内容如下:
{bid_document}

View File

@ -0,0 +1,43 @@
- 角色:招标文件信息提取专家,精通技术评分/技术评审要求的提取
- 任务:请严格按照以下步骤分析提供的招标文件内容,并完整提取所有技术评分标准:
- 步骤与要求:
1. **结构解析**
- 首先识别文件整体结构,仅提取“技术评分”/“技术评审要求”部分
- 标注评分大类的权重占比(如出现)
2. **要素提取**
对“技术评分”板块进行深度解析,要求:
- 提取评分的全部细节,不能省略
- 明确列出量化指标,如有(如"ISO认证+3分"、"项目经验每年加1分"
- 区分强制性条款(必须满足项)与竞争性条款(择优评分项),如有
- 标注特殊要求(本地化服务、专利数量、团队资质等),如有
3. **异常识别**
- 标出表述模糊的评分项(如"酌情加分""优/良/差等级"
- 识别可能存在的矛盾条款
- 提示需要注意的隐藏评分点(如投标格式错误扣分项)
4. **结果呈现样例**
参考以下示例输出markdown结构化格式
# 招标技术评分细则
## 技术评分(50%)
- 系统架构设计(20%)
→ 要求:支持分布式部署(未满足直接废标)
→ 加分项:采用微服务架构+3分
(继续展开...
请严格按照上述结构化框架处理输入的招标文件,生成专业、准确的项目技术评分要求。
严格按照招标文件的内容,确保输出内容的完整性。
直接输出评分要求,禁止说明和引导词。
- 招标文件内容如下:
{bid_document}

45
prompts/scoring_rules.txt Normal file
View File

@ -0,0 +1,45 @@
"你是一名专业的招标文件分析师,请按照以下步骤处理用户提供的项目招标文件内容:
1. **结构识别**
- 仔细解析文件结构,定位'评分标准'、'评审办法'、'投标人须知'等关键章节
- 特别注意包含'分值'、'评分项'、'权重'等关键词的段落
2. **核心要素提取**
- 系统提取以下要素形成结构化表格:
│ 类别 │ 评分项名称 │ 分值权重 │ 具体要求 │ 否决条款 │
- 分类标准:
● 技术部分(方案设计、实施能力、技术创新等)
● 商务部分(资质证明、业绩案例、团队经验等)
● 价格部分(报价合理性、计价方式等)
● 其他专项(售后服务、本地化服务等)
3. **深度分析**
- 计算权重配比示例技术60% = 方案设计30% + 实施能力20% + 创新10%
- 识别否决性条款(如"▲"标记项或特定强制要求)
- 标注特殊评分规则:阶梯得分、区间赋分、横向比较等机制
4. **风险提示**
- 标出易被忽视的得分点如ISO认证、专利数量等
- 识别矛盾条款如总分值≠100%的情况)
- 提示资质门槛要求(注册资金、特定资质证书等)
5. **输出格式**
采用Markdown输出以下结构
```markdown
# 招标评分要点汇总
## 核心指标配比
- 总评分构成技术分__%+ 商务分__%+ 价格分__%
## 详细评分矩阵
| 类别 | 评分项 | 分值 | 具体要求 | 关键指标 |
|------|-------|-----|---------|---------|
| ... | ... | ... | ... | ... |
## 重点提示
⚠️ 否决条款:列出所有一票否决项
💡 得分要点突出3-5个高权重核心指标
⏱️ 时间节点:标注与评分相关的时限要求
```
请先确认理解任务要求,待用户提供招标文件内容后执行分析。"

47
prompts/section_detail.py Normal file
View File

@ -0,0 +1,47 @@
# Prompt templates for drafting the body text of technical-bid sections.
# NOTE(review): "PROMT" is a misspelling of "PROMPT" in both constant names;
# kept as-is because other modules import these identifiers.

# Template for generating the full body of a leaf subsection.
# str.format placeholders: {word_count_spec}, {summary}, {outline}, {title}.
GEN_LEAF_DETAIL_PROMT = """
最重要的要求字数
{word_count_spec}
- 角色资深投标文件撰写专家
- 任务根据招标文件概要标书目录子小节标题撰写该子小节的正文
行文规范
- 投标方自称统一用"我方"禁用"我们""本公司"
- 招标人统一称"招标方""建设单位"
- 禁止前导句"本章节对应……""本小节主要说明……""以下将从……方面说明"开头直接写实质内容
- 禁止AI套话综上所述首先其次再次我们深信高度重视全力以赴不断优化稳步推进通过以上措施
- 用具体数据/标准编号/人员配置替代空洞承诺
- 列举用(1)(2)(3)编号禁止"首先其次"连接禁止""作结尾
- 纯文本输出禁用markdown符号段落间空行分隔
- 直接输出正文不含标题和解释
输入信息
- 招标文件概要
{summary}
- 技术标书目录
{outline}
- 待撰写的子小节标题
{title}
再次强调篇幅是最核心的质量指标内容必须充分展开每个技术要点都要详细阐述实施方法技术参数人员安排或设备配置绝不可以概括性一笔带过
"""

# Template for a short chapter-opening paragraph (roughly 100-200 chars per
# the template text). str.format placeholders: {summary}, {outline}, {title}.
GEN_SECTION_INTRODUCTION_PROMT = """
- 角色资深投标文件撰写专家
- 任务为章节撰写简短开篇引言100200点明核心主题与招标要求的对应关系
- 使用"我方"自称禁止套话和前导解释句纯文本输出
- 若无需过渡可输出空白
- 招标文件概要
{summary}
- 技术标书目录
{outline}
- 章节标题
{title}
"""

View File

@ -0,0 +1,28 @@
【最重要的要求——字数】
{word_count_spec}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
- 角色:资深投标文件撰写专家
- 任务:根据招标文件概要、标书目录、子小节标题,撰写该子小节的正文
【行文规范】
- 投标方自称用"我方""我们""本公司"随机使用
- 招标人统一称"招标方"或"建设单位"
- 禁止前导句:"本章节对应……""本小节主要说明……""以下将从……方面说明"等——开头直接写实质内容
- 禁止AI套话综上所述、首先其次再次、我们深信、高度重视、全力以赴、不断优化、稳步推进、通过以上措施
- 用具体数据/标准编号/人员配置替代空洞承诺
- 列举用(1)(2)(3)编号,禁止"首先其次"连接;禁止"等"作结尾
- 纯文本输出禁用markdown符号段落间空行分隔
- 直接输出正文,不含标题和解释
【输入信息】
- 招标文件概要:
{summary}
- 技术标书目录:
{outline}
- 待撰写的子小节标题:
{subsection_title}
再次强调:篇幅是最核心的质量指标。内容必须充分展开,每个技术要点都要详细阐述实施方法、技术参数、人员安排或设备配置。绝不可以概括性一笔带过。

12
requirements.txt Normal file
View File

@ -0,0 +1,12 @@
Flask==3.0.3
flask-cors==4.0.1
PyPDF2==3.0.1
python-docx==1.1.2
openai==1.52.0
Werkzeug==3.0.4
requests==2.32.3
chardet==5.2.0
pypdf==4.3.1
pdfminer.six==20231228
beautifulsoup4==4.12.3
lxml==5.3.0

39
start.bat Normal file
View File

@ -0,0 +1,39 @@
@echo off
title BidPartner - AI Bid Assistant
echo.
echo ============================================
echo BidPartner - AI Bid Writing Tool
echo ============================================
echo.
cd /d "%~dp0"
python --version >nul 2>&1
if %errorlevel% neq 0 (
echo [ERROR] Python not found. Please install Python 3.9+
pause
exit /b 1
)
if not exist "%~dp0.deps_installed" (
echo Installing dependencies...
pip install -r requirements.txt
if %errorlevel% neq 0 (
echo [ERROR] Failed to install dependencies.
pause
exit /b 1
)
echo.> "%~dp0.deps_installed"
echo Dependencies installed successfully.
)
echo Starting server...
echo Open browser: http://localhost:5000
echo Press Ctrl+C to stop
echo.
start "" "http://localhost:5000"
python app.py
pause

89
static/style.css Normal file
View File

@ -0,0 +1,89 @@
/* 标伙伴 · 自定义样式 */
/* 滚动条美化 */
::-webkit-scrollbar {
width: 6px;
height: 6px;
}
::-webkit-scrollbar-track {
background: #f1f5f9;
border-radius: 3px;
}
::-webkit-scrollbar-thumb {
background: #cbd5e1;
border-radius: 3px;
}
::-webkit-scrollbar-thumb:hover {
background: #94a3b8;
}
/* 章节树左侧栏 */
.sidebar-fixed::-webkit-scrollbar {
width: 4px;
}
/* 正文内容排版 */
.prose-content {
font-family: 'SimSun', '宋体', 'Times New Roman', serif;
line-height: 1.9;
color: #374151;
}
/* 动画 */
@keyframes fadeIn {
from { opacity: 0; transform: translateY(8px); }
to { opacity: 1; transform: translateY(0); }
}
.fade-in {
animation: fadeIn 0.25s ease-out;
}
/* 表格样式(评分要求展示) */
.markdown-table table {
width: 100%;
border-collapse: collapse;
font-size: 13px;
}
.markdown-table th {
background: #f8fafc;
font-weight: 600;
color: #475569;
padding: 8px 12px;
border: 1px solid #e2e8f0;
text-align: left;
}
.markdown-table td {
padding: 7px 12px;
border: 1px solid #e2e8f0;
color: #334155;
}
.markdown-table tr:nth-child(even) td {
background: #f8fafc;
}
/* 步骤指示器 */
.step-active {
background: #2563eb;
color: #fff;
box-shadow: 0 2px 8px rgba(37,99,235,.35);
}
/* 文件上传拖拽高亮 */
.drop-active {
border-color: #3b82f6 !important;
background: #eff6ff !important;
}
/* 章节缩进指示线 */
.section-indent-line {
border-left: 2px solid #e2e8f0;
margin-left: 8px;
padding-left: 8px;
}
/* 打印样式 */
@media print {
header, nav, aside, button { display: none !important; }
main { padding: 0 !important; }
.bg-white { box-shadow: none !important; border: none !important; }
}

1226
templates/index.html Normal file

File diff suppressed because it is too large Load Diff

2308
templates/project.html Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,13 @@
{
"overall": false,
"details": [
{"rule": "身份信息隐藏", "passed": true, "message": "未发现投标人身份信息"},
{"rule": "标题格式", "passed": false, "message": "部分标题字号/字体/颜色/下划线不符合要求"},
{"rule": "正文格式", "passed": false, "message": "部分正文段落格式不符合要求"},
{"rule": "目录要求", "passed": true, "message": "目录符合无页码、无页眉页脚要求"},
{"rule": "图表规范", "passed": false, "message": "正文中发现2个图表或附件内图表文字格式错误"},
{"rule": "颜色与装饰", "passed": true, "message": "无彩色文字、无下划线、无着重号"},
{"rule": "页面设置", "passed": false, "message": "页面边距或纸张方向不符合要求"}
],
"violations": []
}

View File

@ -0,0 +1,95 @@
"""附件类章节识别与单图/单表类型选择。"""
import unittest
from utils import attachment_section as att
class TestIsAttachment(unittest.TestCase):
def test_positive(self):
r = att.load_attachment_rules()
self.assertTrue(att.is_attachment_only_section('附件一:施工平面布置', r))
self.assertTrue(att.is_attachment_only_section('附图 组织机构', r))
self.assertTrue(att.is_attachment_only_section('附表 人员一览', r))
def test_negative(self):
r = att.load_attachment_rules()
self.assertFalse(att.is_attachment_only_section('施工组织设计', r))
self.assertFalse(att.is_attachment_only_section('', r))
class TestPickKind(unittest.TestCase):
def test_only_figure_switch(self):
r = att.DEFAULT_ATTACHMENT_RULES
self.assertEqual(
att.pick_single_figure_or_table('附件一xxx', True, False, r),
'figure',
)
def test_only_table_switch(self):
r = att.DEFAULT_ATTACHMENT_RULES
self.assertEqual(
att.pick_single_figure_or_table('附件一xxx', False, True, r),
'table',
)
def test_both_off(self):
self.assertIsNone(
att.pick_single_figure_or_table('附件一', False, False, None),
)
def test_table_hint(self):
r = att.DEFAULT_ATTACHMENT_RULES
k = att.pick_single_figure_or_table('附件三 工程量一览表', True, True, r)
self.assertEqual(k, 'table')
def test_figure_hint(self):
r = att.DEFAULT_ATTACHMENT_RULES
k = att.pick_single_figure_or_table('附图 施工平面示意图', True, True, r)
self.assertEqual(k, 'figure')
def test_default_ambiguous(self):
r = dict(att.DEFAULT_ATTACHMENT_RULES)
r['default_kind_when_ambiguous'] = 'table'
k = att.pick_single_figure_or_table('附件五 其他资料', True, True, r)
self.assertEqual(k, 'table')
class TestAttachmentBodyMode(unittest.TestCase):
def test_default_stack_charts_only(self):
r = att.DEFAULT_ATTACHMENT_RULES
self.assertEqual(att.attachment_leaf_body_mode(r), 'stack_charts_only')
self.assertTrue(att.use_attachment_stack_charts_body(r))
self.assertFalse(att.use_attachment_single_chart_only_body(r))
self.assertFalse(att.use_attachment_full_body(r))
def test_full_mode(self):
r = dict(att.DEFAULT_ATTACHMENT_RULES)
r['attachment_leaf_body_mode'] = 'full'
self.assertEqual(att.attachment_leaf_body_mode(r), 'full')
self.assertTrue(att.use_attachment_full_body(r))
self.assertFalse(att.use_attachment_stack_charts_body(r))
def test_single_chart_only(self):
r = dict(att.DEFAULT_ATTACHMENT_RULES)
r['attachment_leaf_body_mode'] = 'single_chart_only'
self.assertTrue(att.use_attachment_single_chart_only_body(r))
self.assertTrue(att.use_attachment_stack_charts_body(r))
class TestExpandOutlineSkip(unittest.TestCase):
def test_should_skip_attachment(self):
self.assertTrue(att.should_skip_expand_subchapters('附件一:平面图'))
self.assertTrue(att.should_skip_expand_subchapters('附图 示意'))
def test_should_skip_normal_chapter(self):
self.assertFalse(att.should_skip_expand_subchapters('施工组织设计'))
self.assertFalse(att.should_skip_expand_subchapters('质量管理体系与措施'))
def test_parse_attachment_label(self):
self.assertEqual(att.parse_attachment_label('附件一:平面图'), '')
self.assertEqual(att.parse_attachment_label('附件2 承诺书'), '2')
self.assertEqual(att.parse_attachment_label('附图 总平面'), '附图')
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,52 @@
"""工程量清单本地分析单元测试。"""
import unittest
from utils.bill_analysis import (
analyze_boq_pages,
filter_bill_pages,
parse_bill_text,
)
class TestParseBillText(unittest.TestCase):
def test_code_name_unit_qty(self):
text = '010101001001 挖土方 m3 100.5 土壤类别:三类土'
r = parse_bill_text(text)
self.assertIn('categories', r)
self.assertTrue(r['categories'])
cat = r['categories'][0]
self.assertEqual(cat['name'], '未分类')
self.assertEqual(len(cat['items']), 1)
it = cat['items'][0]
self.assertEqual(it['code'], '010101001001')
self.assertIn('挖土', it['name'])
self.assertEqual(it['unit'], 'm3')
self.assertEqual(it['quantity'], '100.5')
def test_hierarchical_line_prefix(self):
text = '1.1 010101001001 基础开挖 m3 50'
r = parse_bill_text(text)
it = r['categories'][0]['items'][0]
self.assertEqual(it['code'], '010101001001')
class TestFilterBillPages(unittest.TestCase):
def test_two_pages_gap_fill(self):
p0 = '目录 前言'
p1 = '分部分项工程量清单\n项目编码 项目名称 工程量\n010101001001 项 m3 1'
p2 = '续表无表头\n010101002001 土 m2 2'
p3 = '规费 税金 社会保险费 住房公积金 其他说明'
pages, meta = filter_bill_pages([p0, p1, p2, p3])
self.assertEqual(meta['total_pages'], 4)
self.assertGreaterEqual(len(pages), 2)
merged = '\n'.join(pages)
self.assertIn('010101001001', merged)
self.assertIn('010101002001', merged)
def test_analyze_scanned_empty(self):
r = analyze_boq_pages(['', ' ', ''])
self.assertTrue(r.get('scanned'))
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,63 @@
"""技术暗标 HTML 格式检查:结构校验与极简用例(标准库 unittest"""
import json
import os
import sys
import unittest
# 保证可 `python tests/test_*.py` 从项目根导入 `modules`
_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _ROOT not in sys.path:
sys.path.insert(0, _ROOT)
from modules.dark_bid_format_check import check_technical_bid # noqa: E402
def _sample_schema_path():
return os.path.join(os.path.dirname(__file__), "fixtures", "dark_bid_report_sample.json")
class TestDarkBidFormatCheck(unittest.TestCase):
def test_sample_fixture_keys(self):
with open(_sample_schema_path(), encoding="utf-8") as f:
sample = json.load(f)
self.assertIn("overall", sample)
self.assertIn("details", sample)
self.assertIn("violations", sample)
for d in sample["details"]:
self.assertTrue({"rule", "passed", "message"}.issubset(d.keys()))
def test_check_returns_structure(self):
html = """<!DOCTYPE html><html><head><style>
@page { margin: 2.54cm 3.18cm 2.54cm 3.18cm; size: A4; }
</style></head><body style="margin:2.54cm 3.18cm">
<div class="toc">第一章 概述</div>
<h2 style="font-size:16pt;font-family:SimHei;font-weight:bold;color:#000">标题</h2>
<p style="font-size:14pt;font-family:SimSun;line-height:26pt;text-indent:2em;color:#000">
正文内容示例</p>
</body></html>"""
r = check_technical_bid(html)
self.assertIsInstance(r["overall"], bool)
self.assertEqual(len(r["details"]), 7)
rules = [x["rule"] for x in r["details"]]
self.assertIn("身份信息隐藏", rules)
self.assertIn("标题格式", rules)
def test_empty_html(self):
r = check_technical_bid("")
self.assertFalse(r["overall"])
def test_identity_fail_on_company(self):
html = (
"<html><body><p style='font-size:14pt;font-family:SimSun;"
"line-height:26pt;text-indent:2em;color:#000'>我公司参与投标</p>"
"<div class='toc'>x</div>"
"<style>@page{margin:2.54cm 3.18cm 2.54cm 3.18cm}</style>"
"</body></html>"
)
r = check_technical_bid(html)
id_rule = next(x for x in r["details"] if x["rule"] == "身份信息隐藏")
self.assertFalse(id_rule["passed"])
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,121 @@
"""图表意图栈与特征计分。"""
import unittest
from utils import diagram_intent as di
def _base_rules() -> dict:
return {
'schema_version': 1,
'threshold_figure': 1.0,
'threshold_table': 1.0,
'title_weight': 1.0,
'context_weight': 0.6,
'outline_context_lines': {'before': 2, 'after': 2},
'stack_order_when_both': 'score_desc',
'figure_keywords': [
{'text': '进度', 'weight': 1.2},
{'text': '横道', 'weight': 1.5},
],
'table_keywords': [
{'text': '一览表', 'weight': 1.5},
{'text': '人员', 'weight': 1.0},
],
}
class TestScoreFigureTable(unittest.TestCase):
def test_figure_higher_on_progress(self):
r = _base_rules()
f, t = di.score_figure_table('施工进度与横道计划', '', r)
self.assertGreater(f, t)
def test_table_higher_on_roster(self):
r = _base_rules()
f, t = di.score_figure_table('主要管理人员配置一览表', '', r)
self.assertGreater(t, f)
class TestBuildStack(unittest.TestCase):
def test_gate_figure_off(self):
r = _base_rules()
st = di.build_stack(5.0, 5.0, r, enable_figure=False, enable_table=True)
self.assertEqual(len(st), 1)
self.assertEqual(st[0].kind, 'table')
def test_score_desc_order(self):
r = dict(_base_rules())
r['stack_order_when_both'] = 'score_desc'
st = di.build_stack(3.0, 1.0, r, True, True)
self.assertEqual(len(st), 2)
self.assertEqual(st[0].kind, 'figure')
self.assertGreater(st[0].score, st[1].score)
def test_figure_first(self):
r = dict(_base_rules())
r['stack_order_when_both'] = 'figure_first'
st = di.build_stack(2.0, 5.0, r, True, True)
self.assertEqual(st[0].kind, 'figure')
self.assertEqual(st[1].kind, 'table')
def test_below_threshold_empty(self):
r = dict(_base_rules())
r['threshold_figure'] = 10.0
r['threshold_table'] = 10.0
st = di.build_stack(1.0, 1.0, r, True, True)
self.assertEqual(st, [])
class TestOutlineWindow(unittest.TestCase):
def test_finds_title_line(self):
outline = '一、总则\n二、进度\n 2.1 横道计划\n三、尾'
w = di.extract_outline_window(outline, '2.1 横道计划', 1, 1)
self.assertIn('横道', w)
def test_fallback_prefix(self):
w = di.extract_outline_window('abc' * 400, '不存在的标题', 2, 2)
self.assertTrue(len(w) > 0)
class TestAgentRender(unittest.TestCase):
def test_render_non_empty_when_match(self):
r = dict(_base_rules())
r['threshold_figure'] = 0.5
r['threshold_table'] = 0.5
agent = di.DiagramIntentAgent(r)
s = agent.render_for_section(
'施工进度横道图编制说明',
'大纲\n进度\n横道',
True,
True,
)
self.assertIn('图示生成规范', s)
self.assertIn('本节图表生成优先级', s)
def test_render_empty_when_scores_low(self):
r = dict(_base_rules())
r['threshold_figure'] = 100.0
r['threshold_table'] = 100.0
agent = di.DiagramIntentAgent(r)
s = agent.render_for_section('无关标题', '无关', True, True)
self.assertEqual(s, '')
class TestStackHelpers(unittest.TestCase):
def test_stack_compact_labels(self):
st = [
di.DiagramIntent('figure', 1.0, 't'),
di.DiagramIntent('table', 1.0, 't'),
]
lab = di.stack_compact_labels(st)
self.assertEqual(len(lab), 2)
self.assertIn('[FIGURE]', lab[0])
def test_make_fallback_stack(self):
st = di.make_fallback_stack('figure')
self.assertEqual(len(st), 1)
self.assertEqual(st[0].kind, 'figure')
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,44 @@
"""目录号格式化与大纲带号写回。"""
import os
import sys
import unittest
_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _ROOT not in sys.path:
sys.path.insert(0, _ROOT)
from modules.generator import _parse_outline, _sections_to_outline_text # noqa: E402
from utils.outline_numbering import format_heading_display, int_to_chinese_numeral # noqa: E402
class TestOutlineNumbering(unittest.TestCase):
    """Chinese numeral conversion, heading display, and numbered-outline round-trip."""

    def test_int_to_chinese(self):
        # Fixed: the expected values for 1 and 10 were empty strings (the CJK
        # characters were lost in transit); the correct numerals are 一 and 十,
        # consistent with the 11 -> 十一 and 23 -> 二十三 cases below.
        self.assertEqual(int_to_chinese_numeral(1), "一")
        self.assertEqual(int_to_chinese_numeral(10), "十")
        self.assertEqual(int_to_chinese_numeral(11), "十一")
        self.assertEqual(int_to_chinese_numeral(23), "二十三")

    def test_format_heading(self):
        # Level 1 headings use Chinese ordinal + 、; deeper levels keep dotted
        # arabic numbering.
        self.assertEqual(format_heading_display(1, "3", "总体"), "三、总体")
        self.assertEqual(format_heading_display(2, "1.2", "子节"), "1.2 子节")

    def test_sections_to_outline_text_has_numbers(self):
        sections = [
            {"level": 1, "title": "第一章", "number": "1"},
            {"level": 2, "title": "小节", "number": "1.1"},
        ]
        text = _sections_to_outline_text("某项目技术标书", sections)
        self.assertIn("某项目技术标书", text)
        self.assertIn("一、第一章", text)
        self.assertIn("1.1 小节", text)

    def test_parse_roundtrip_numbered_outline(self):
        raw = "标书标题\n一、第一章\n1.1 节A\n"
        _, sections, normalized = _parse_outline(raw)
        self.assertGreaterEqual(len(sections), 2)
        self.assertIn("一、第一章", normalized)
        self.assertIn("1.1 节A", normalized)
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,24 @@
"""大纲解析1.1 类编号不得被误拆成一级 1 与 title '.1 标题'"""
import unittest
from modules.generator import _parse_outline
class TestParseOutline(unittest.TestCase):
    """Regression: a '1.1'-style number must stay one section, not split into level-1 '1' plus a '.1 ...' title."""

    def test_11_stays_single_section(self):
        outline = "某某项目标书标题\n1.1 沟槽开挖与支护\n1.2 排降水\n"
        _, sections, _ = _parse_outline(outline)
        self.assertEqual(len(sections), 2, [sec.get('number') for sec in sections])
        for sec in sections:
            if sec.get('level') != 1:
                continue
            leading = (sec.get('title') or '').lstrip()
            self.assertFalse(
                leading.startswith('.'),
                '不得出现一级章节 title 以 .1 开头(误将 1.1 拆成 1 与 .1 标题)',
            )
        all_titles = ' '.join(sec['title'] for sec in sections)
        self.assertIn('沟槽', all_titles)
        self.assertIn('排降', all_titles)
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,86 @@
"""目标页数与一级篇章区间。"""
import random
import unittest
from utils import volume_chapters as vc
class TestVolumeChapters(unittest.TestCase):
def test_top_level_default_pages_zero(self):
lo, hi = vc.top_level_chapter_range_from_pages(0)
self.assertEqual((lo, hi), (8, 10))
def test_ranges_match_effective_volume_bands(self):
self.assertEqual(vc.top_level_chapter_range_from_pages(100), (6, 8))
self.assertEqual(vc.top_level_chapter_range_from_pages(125), (6, 8))
self.assertEqual(vc.top_level_chapter_range_from_pages(150), (8, 10))
self.assertEqual(vc.top_level_chapter_range_from_pages(200), (10, 12))
self.assertEqual(vc.top_level_chapter_range_from_pages(300), (12, 16))
def test_hint_default_no_pages(self):
h = vc.outline_chapter_count_hint(0, 'standard')
self.assertIn('8-10', h)
self.assertIn('不超过10', h)
def test_hint_with_pages(self):
h = vc.outline_chapter_count_hint(150, 'standard', 700)
self.assertIn('约 810', h)
self.assertIn('150', h)
self.assertIn('105000', h) # 150×700 总字目标
self.assertIn('过细', h)
def test_subchapter_base_anchor_points(self):
self.assertAlmostEqual(vc.subchapter_total_base_from_pages(100), 78.0, places=5)
self.assertAlmostEqual(vc.subchapter_total_base_from_pages(300), 212.0, places=5)
self.assertEqual(vc.SUBCHAPTER_PAGES_SLOPE, 0.67)
self.assertEqual(vc.SUBCHAPTER_PAGES_INTERCEPT, 11.0)
def test_subchapter_jitter_bounds_78_anchor(self):
"""100 页基线 78 章±10% 严格为 [70, 86]。"""
self.assertEqual(vc.subchapter_jitter_bounds(78.0), (70, 86))
def test_subchapter_jitter_bounds_300_pages(self):
self.assertEqual(vc.subchapter_jitter_bounds(212.0), (191, 233))
def test_allocate_subchapters_to_mains(self):
self.assertEqual(vc.allocate_subchapters_to_mains(10, 3), [4, 3, 3])
self.assertEqual(vc.allocate_subchapters_to_mains(0, 3), [0, 0, 0])
self.assertEqual(vc.allocate_subchapters_to_mains(5, 2), [3, 2])
self.assertEqual(vc.allocate_subchapters_to_mains(7, 0), [])
# n < k 时多出的主章 quota 为 0
a = vc.allocate_subchapters_to_mains(70, 100)
self.assertEqual(len(a), 100)
self.assertEqual(sum(a), 70)
self.assertEqual(a.count(1), 70)
self.assertEqual(a.count(0), 30)
def test_subchapter_effective_respects_k_floor_and_jitter(self):
# round(78 * u) for u in [0.9, 1.1] stays in [70, 86] for 78.0 base
for seed in range(800):
n = vc.subchapter_total_effective(100, 1, random.Random(seed))
self.assertGreaterEqual(n, 70)
self.assertLessEqual(n, 86)
# 主章数很大时,总条数仍须在 [70, 86](不得被 max(n,k) 抬到数百)
for seed in range(20):
nk = vc.subchapter_total_effective(100, 500, random.Random(seed))
self.assertGreaterEqual(nk, 70, msg=f'seed={seed}')
self.assertLessEqual(nk, 86, msg=f'seed={seed}')
def test_subchapter_effective_zero_pages(self):
self.assertEqual(vc.subchapter_total_effective(0, 5), 0)
self.assertEqual(vc.subchapter_total_effective(100, 0), 0)
def test_resolve_expand_target_pages(self):
self.assertEqual(vc.resolve_expand_target_pages(None, True, 100, 200), 0)
self.assertEqual(vc.resolve_expand_target_pages(200, False, 100, 50), 200)
self.assertEqual(vc.resolve_expand_target_pages(0, False, 80, 0), 80)
self.assertEqual(vc.resolve_expand_target_pages(0, False, 0, 50), 50)
self.assertEqual(
vc.resolve_expand_target_pages(0, False, 0, 0),
vc.EXPAND_OUTLINE_DEFAULT_TARGET_PAGES,
)
self.assertEqual(vc.EXPAND_OUTLINE_DEFAULT_TARGET_PAGES, 100)
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,170 @@
"""字数分配与 rating_json 解析单元测试。"""
import json
import unittest
import config as cfg
from utils import word_allocation as wa
class TestParseRatingJson(unittest.TestCase):
def test_canonical_items(self):
raw = json.dumps(
{
'items': [
{'id': 'T1', 'name': '施工方案', 'weight': 30, 'keywords': ['工艺']},
{'id': 'T2', 'name': '质量保证', 'weight': 10, 'keywords': []},
],
'notes': '',
},
ensure_ascii=False,
)
items = wa.parse_rating_json(raw)
self.assertEqual(len(items), 2)
names = {x['name'] for x in items}
self.assertIn('施工方案', names)
self.assertIn('质量保证', names)
wmap = {x['name']: x['weight'] for x in items}
self.assertEqual(wmap['施工方案'], 30.0)
def test_malformed_returns_empty(self):
self.assertEqual(wa.parse_rating_json('not json'), [])
self.assertEqual(wa.parse_rating_json(''), [])
class TestComputeLeafAllocations(unittest.TestCase):
def test_none_when_no_rating_and_not_target_pages_budget(self):
leaves = [{'id': 1, 'section_title': '一、总体方案'}]
rules = dict(wa.DEFAULT_RULES)
rules['budget_mode'] = 'anchor_mean'
self.assertIsNone(
wa.compute_leaf_allocations('standard', leaves, '', rules)
)
def test_uniform_when_no_rating_but_target_pages(self):
"""无技术评分时仍按目标页均分 B=页×每页字,全稿不随节数 N 线性爆量。"""
leaves = [
{'id': 1, 'section_title': 'A'},
{'id': 2, 'section_title': 'B'},
]
rules = dict(wa.DEFAULT_RULES)
rules['budget_mode'] = 'target_pages'
old_tp = cfg.TARGET_PAGES
old_pce = cfg.PAGE_CHAR_ESTIMATE
try:
cfg.TARGET_PAGES = 100
cfg.PAGE_CHAR_ESTIMATE = 700
out = wa.compute_leaf_allocations('standard', leaves, '', rules)
finally:
cfg.TARGET_PAGES = old_tp
cfg.PAGE_CHAR_ESTIMATE = old_pce
self.assertIsNotNone(out)
s = out[1]['target_chars'] + out[2]['target_chars']
self.assertEqual(s, 100 * 700)
self.assertEqual(out[1]['target_chars'], out[2]['target_chars'])
def test_monotonicity_high_weight_match(self):
rating = json.dumps(
{
'items': [
{'name': '施工组织设计', 'weight': 50, 'keywords': ['进度']},
{'name': '页眉页脚规范', 'weight': 2, 'keywords': []},
]
},
ensure_ascii=False,
)
leaves = [
{'id': 10, 'section_title': '3.1 施工组织设计与进度计划'},
{'id': 11, 'section_title': '9.9 页眉格式说明'},
]
rules = dict(wa.DEFAULT_RULES)
rules['alpha'] = 0.95
out = wa.compute_leaf_allocations('standard', leaves, rating, rules)
self.assertIsNotNone(out)
t_high = out[10]['target_chars']
t_low = out[11]['target_chars']
self.assertGreaterEqual(t_high, t_low, '强匹配高分项的章节应不低于弱匹配章节')
self.assertIn('施工组织设计', out[10]['word_count_spec'])
def test_budget_anchor_mean(self):
rating = json.dumps(
{'items': [{'name': '技术部分', 'weight': 100}]},
ensure_ascii=False,
)
leaves = [
{'id': 1, 'section_title': 'A'},
{'id': 2, 'section_title': 'B'},
{'id': 3, 'section_title': 'C'},
]
rules = dict(wa.DEFAULT_RULES)
rules['budget_mode'] = 'anchor_mean'
rules['alpha'] = 0.0
old_tp = getattr(cfg, 'TARGET_PAGES', 0)
setattr(cfg, 'TARGET_PAGES', 0)
try:
out = wa.compute_leaf_allocations('standard', leaves, rating, rules)
finally:
setattr(cfg, 'TARGET_PAGES', old_tp)
self.assertIsNotNone(out)
base, core, _, _ = wa.VOLUME_PRESETS['standard']
expect = int(round(len(leaves) * (base + core) / 2.0))
s = sum(out[i]['target_chars'] for i in (1, 2, 3))
self.assertEqual(s, expect)
def test_budget_target_pages(self):
rating = json.dumps(
{'items': [{'name': '技术部分', 'weight': 100}]},
ensure_ascii=False,
)
leaves = [
{'id': 1, 'section_title': 'A'},
{'id': 2, 'section_title': 'B'},
]
rules = dict(wa.DEFAULT_RULES)
rules['budget_mode'] = 'target_pages'
rules['alpha'] = 0.0
old_tp = cfg.TARGET_PAGES
old_pce = cfg.PAGE_CHAR_ESTIMATE
try:
cfg.TARGET_PAGES = 100
cfg.PAGE_CHAR_ESTIMATE = 700
out = wa.compute_leaf_allocations('standard', leaves, rating, rules)
finally:
cfg.TARGET_PAGES = old_tp
cfg.PAGE_CHAR_ESTIMATE = old_pce
self.assertIsNotNone(out)
expect = 100 * 700
s = sum(out[i]['target_chars'] for i in (1, 2))
self.assertEqual(s, expect)
def test_budget_target_pages_falls_back_when_pages_zero(self):
rating = json.dumps(
{'items': [{'name': '技术部分', 'weight': 100}]},
ensure_ascii=False,
)
leaves = [
{'id': 1, 'section_title': 'A'},
{'id': 2, 'section_title': 'B'},
]
rules = dict(wa.DEFAULT_RULES)
rules['budget_mode'] = 'target_pages'
rules['alpha'] = 0.0
old_tp = cfg.TARGET_PAGES
try:
cfg.TARGET_PAGES = 0
out = wa.compute_leaf_allocations('standard', leaves, rating, rules)
finally:
cfg.TARGET_PAGES = old_tp
self.assertIsNotNone(out)
base, core, _, _ = wa.VOLUME_PRESETS['standard']
expect = int(round(len(leaves) * (base + core) / 2.0))
s = sum(out[i]['target_chars'] for i in (1, 2))
self.assertEqual(s, expect)
def test_continuation_threshold(self):
self.assertEqual(wa.continuation_threshold(2000), 1300)
self.assertEqual(wa.continuation_threshold(100), 200)
if __name__ == '__main__':
unittest.main()

1
utils/__init__.py Normal file
View File

@ -0,0 +1 @@

259
utils/ai_client.py Normal file
View File

@ -0,0 +1,259 @@
"""
AI API 调用封装支持 OpenAI阿里云通义千问DeepSeekOllama均兼容 OpenAI SDK
"""
import re
import time
import logging
from openai import OpenAI
import config
from contextlib import contextmanager # for type hints if needed
logger = logging.getLogger(__name__)
# Human-readable provider labels, used in error messages and the settings UI.
PROVIDER_NAMES = {
    'qwen': '通义千问 (Qwen)',
    'deepseek': 'DeepSeek',
    'openai': 'OpenAI',
    'ollama': 'Ollama 本地',
    'doubao': '豆包 (Doubao)',
    'kimi': 'Kimi (Moonshot)',
}
# Sign-up / console URLs surfaced alongside API-key configuration errors.
PROVIDER_LINKS = {
    'qwen': 'https://dashscope.aliyun.com/',
    'deepseek': 'https://platform.deepseek.com/',
    'openai': 'https://platform.openai.com/',
    'ollama': 'https://ollama.com/',
    'doubao': 'https://console.volcengine.com/ark/',
    'kimi': 'https://platform.moonshot.cn/',
}
def _check_api_key():
    """Fail fast with a friendly message when the active provider has no usable API key.

    Called before every LLM request so a missing or placeholder key aborts
    immediately instead of triggering pointless retries.
    """
    provider = config.MODEL_PROVIDER
    if provider == 'ollama':
        # Local Ollama needs no API key.
        return
    key_by_provider = {
        'qwen': config.QWEN_API_KEY,
        'deepseek': config.DEEPSEEK_API_KEY,
        'doubao': config.DOUBAO_API_KEY,
        'kimi': config.KIMI_API_KEY,
    }
    key = key_by_provider.get(provider, config.OPENAI_API_KEY)
    if key and not key.startswith('sk-your'):
        return
    name = PROVIDER_NAMES.get(provider, provider)
    link = PROVIDER_LINKS.get(provider, '')
    raise RuntimeError(
        f'尚未配置 {name} 的 API Key。'
        f'请点击右上角设置按钮,选择"{name}"并填入有效的 API Key。'
        f'申请地址:{link}'
    )
def _get_client() -> OpenAI:
    """Build an OpenAI-compatible client for the currently selected provider."""
    provider = config.MODEL_PROVIDER
    if provider == 'ollama':
        # Ollama's OpenAI-compatible endpoint ignores the key but requires one.
        return OpenAI(api_key='ollama', base_url=config.OLLAMA_BASE_URL)
    credentials = {
        'qwen': (config.QWEN_API_KEY, config.QWEN_BASE_URL),
        'deepseek': (config.DEEPSEEK_API_KEY, config.DEEPSEEK_BASE_URL),
        'doubao': (config.DOUBAO_API_KEY, config.DOUBAO_BASE_URL),
        'kimi': (config.KIMI_API_KEY, config.KIMI_BASE_URL),
    }
    api_key, base_url = credentials.get(
        provider, (config.OPENAI_API_KEY, config.OPENAI_BASE_URL)
    )
    return OpenAI(api_key=api_key, base_url=base_url)
def _get_model() -> str:
    """Return the chat model name configured for the active provider."""
    attr_by_provider = {
        'qwen': 'QWEN_MODEL',
        'deepseek': 'DEEPSEEK_MODEL',
        'ollama': 'OLLAMA_MODEL',
        'doubao': 'DOUBAO_MODEL',
        'kimi': 'KIMI_MODEL',
    }
    attr = attr_by_provider.get(config.MODEL_PROVIDER, 'OPENAI_MODEL')
    return getattr(config, attr)
def _clean_response(text: str) -> str:
"""
过滤推理模型DeepSeek R1 / QwQ 输出的 <think>...</think> 思考过程标签
只保留最终正文内容避免思考链污染标书正文
"""
# 去除 <think>...</think> 块(含跨行内容)
text = re.sub(r'<think>[\s\S]*?</think>', '', text, flags=re.IGNORECASE)
return text.strip()
def _is_auth_error(e: Exception) -> bool:
"""判断是否为认证错误401 / invalid_api_key无需重试"""
# 优先用 openai 原生异常类型判断
try:
from openai import AuthenticationError, PermissionDeniedError
if isinstance(e, (AuthenticationError, PermissionDeniedError)):
return True
except ImportError:
pass
# 兜底:字符串匹配
err_str = str(e).lower()
return ('401' in err_str or 'invalid_api_key' in err_str
or 'incorrect api key' in err_str or 'authentication' in err_str)
# OpenAI o 系列推理模型:不支持 temperaturemax_tokens 需用 max_completion_tokens
_OPENAI_REASONING_MODELS = {'o1', 'o1-mini', 'o1-pro', 'o3', 'o3-mini', 'o3-pro', 'o4-mini'}
def _build_chat_kwargs(
model: str,
messages: list,
temperature: float,
max_tokens: int,
request_timeout: float | None = None,
) -> dict:
"""
根据模型类型构建 chat.completions.create 的参数字典
OpenAI o 系列推理模型不接受 temperature且使用 max_completion_tokens 替代 max_tokens
"""
base_model = model.split(':')[0] # 去掉 ollama tag 后缀
is_reasoning = base_model in _OPENAI_REASONING_MODELS
to = request_timeout if request_timeout is not None else config.REQUEST_TIMEOUT
kwargs = {
'model': model,
'messages': messages,
'timeout': to,
}
if is_reasoning:
kwargs['max_completion_tokens'] = max_tokens
else:
kwargs['temperature'] = temperature
kwargs['max_tokens'] = max_tokens
return kwargs
def chat(
    prompt: str,
    system: str = '你是一位专业的投标文件撰写专家。',
    temperature: float = 0.7,
    max_tokens: int = 8192,
    retries: int | None = None,
    request_timeout: float | None = None,
) -> str:
    """Send a single-turn chat request and return the cleaned response text.

    Behavior:
      - validates the provider API key up front (no pointless retries);
      - auth errors raise immediately; other errors retry with exponential
        backoff (1s, 2s, 4s, ...);
      - OpenAI o-series parameter quirks handled via `_build_chat_kwargs`;
      - every request runs inside `config.llm_call()` (global concurrency cap);
      - `<think>...</think>` reasoning blocks are stripped from the reply.

    Args:
        prompt: User message content.
        system: System prompt; defaults to a bid-writing persona.
        temperature: Sampling temperature (dropped for o-series models).
        max_tokens: Completion token budget.
        retries: Attempt count override; None -> config.MAX_RETRIES.
        request_timeout: Per-request timeout override in seconds.

    Returns:
        The assistant reply, stripped of reasoning tags and whitespace.

    Raises:
        RuntimeError: missing/invalid API key, or all retries exhausted.
    """
    _check_api_key()
    max_retries = retries if retries is not None else config.MAX_RETRIES
    client = _get_client()
    model = _get_model()
    provider = config.MODEL_PROVIDER
    name = PROVIDER_NAMES.get(provider, provider)
    messages = [
        {'role': 'system', 'content': system},
        {'role': 'user', 'content': prompt},
    ]
    for attempt in range(max_retries):
        try:
            with config.llm_call():  # global concurrency guard (cap ~20)
                kwargs = _build_chat_kwargs(
                    model, messages, temperature, max_tokens, request_timeout=request_timeout
                )
                resp = client.chat.completions.create(**kwargs)
            return _clean_response(resp.choices[0].message.content.strip())
        except Exception as e:
            if _is_auth_error(e):
                # Bad credentials cannot be fixed by retrying; abort with guidance.
                raise RuntimeError(
                    f'{name} API Key 无效或已过期,请在设置中重新配置。'
                    f'申请地址:{PROVIDER_LINKS.get(provider, "")}'
                ) from e
            wait = 2 ** attempt  # exponential backoff: 1, 2, 4, ... seconds
            logger.warning(f'AI 请求失败 (第{attempt+1}次){wait}s 后重试: {e}')
            if attempt < max_retries - 1:
                time.sleep(wait)
            else:
                raise RuntimeError(f'AI 接口调用失败(已重试 {max_retries} 次): {e}') from e
    return ''  # unreachable when max_retries >= 1; keeps the return type total
def chat_with_history(system: str, messages: list,
                      temperature: float = 0.7, max_tokens: int = 4096) -> str:
    """Multi-turn chat call with full history context (conversational section drafting).

    Retries with exponential backoff like `chat`; auth errors abort at once.
    Each attempt runs under the global LLM semaphore (`config.llm_call()`),
    and `<think>...</think>` blocks are stripped from the reply.

    Args:
        system: System prompt prepended to the history.
        messages: [{'role': 'user'|'assistant', 'content': str}, ...]
        temperature: Sampling temperature (dropped for o-series models).
        max_tokens: Completion token limit.

    Raises:
        RuntimeError: invalid API key, or all config.MAX_RETRIES attempts failed.
    """
    _check_api_key()
    client = _get_client()
    model = _get_model()
    provider = config.MODEL_PROVIDER
    name = PROVIDER_NAMES.get(provider, provider)
    full_messages = [{'role': 'system', 'content': system}] + messages
    for attempt in range(config.MAX_RETRIES):
        try:
            with config.llm_call():  # global concurrency guard
                kwargs = _build_chat_kwargs(model, full_messages, temperature, max_tokens)
                resp = client.chat.completions.create(**kwargs)
            return _clean_response(resp.choices[0].message.content.strip())
        except Exception as e:
            if _is_auth_error(e):
                raise RuntimeError(
                    f'{name} API Key 无效或已过期,请在设置中重新配置。'
                    f'申请地址:{PROVIDER_LINKS.get(provider, "")}'
                ) from e
            wait = 2 ** attempt  # exponential backoff
            logger.warning(f'对话 AI 请求失败 (第{attempt+1}次){wait}s 后重试: {e}')
            if attempt < config.MAX_RETRIES - 1:
                time.sleep(wait)
            else:
                raise RuntimeError(f'AI 接口调用失败(已重试 {config.MAX_RETRIES} 次): {e}') from e
    return ''  # unreachable when MAX_RETRIES >= 1
def get_embeddings(texts: list[str]) -> list[list[float]]:
    """Return embedding vectors for `texts` via the active provider.

    Supported providers: Qwen, OpenAI, Kimi. DeepSeek / Ollama / Doubao expose
    no embedding endpoint here, so callers get a NotImplementedError and are
    expected to degrade to keyword retrieval. The request counts against the
    global LLM semaphore.

    Raises:
        NotImplementedError: provider without an Embedding API.
    """
    provider = config.MODEL_PROVIDER
    if provider in ('deepseek', 'ollama', 'doubao'):
        raise NotImplementedError(
            f'{PROVIDER_NAMES.get(provider)} 暂不支持 Embedding API知识库将使用关键词检索降级'
        )
    client = _get_client()
    # Pick the provider-specific embedding model name.
    if provider == 'qwen':
        model = config.QWEN_EMBEDDING_MODEL
    elif provider == 'kimi':
        model = config.KIMI_EMBEDDING_MODEL
    else:
        model = config.OPENAI_EMBEDDING_MODEL
    with config.llm_call():  # embeddings also respect the concurrency cap
        resp = client.embeddings.create(model=model, input=texts)
    return [item.embedding for item in resp.data]

186
utils/attachment_section.py Normal file
View File

@ -0,0 +1,186 @@
"""
附件类章节识别标题匹配expand_outline 跳过以及正文模式完整正文 vs 仅单图单表
"""
from __future__ import annotations
import json
import logging
import os
import re
from typing import Any, Dict, List, Optional
import config
logger = logging.getLogger(__name__)
# Built-in fallback rules; the user file attachment_section_rules.json may
# override any key (keys starting with '_' there are treated as comments).
DEFAULT_ATTACHMENT_RULES: Dict[str, Any] = {
    'schema_version': 1,
    # Title patterns that mark a section as attachment-only (附件N / 附图 / 附表 ...).
    # Fixed: the 附图 / 附表 / 附件( patterns had lost their leading 附 (extraction
    # garbling), which made them match ANY title containing 图/表, e.g.
    # '施工平面布置图' would wrongly be treated as an attachment section.
    'title_regex': [
        r'附件\s*[一二三四五六七八九十0-9A-Za-z、:.]',
        r'附\s*图',
        r'附\s*表',
        r'附\s*件\s*\(',
        r'^\s*[\d一二三四五六七八九十\..、]+\s*附件',
    ],
    # Keywords hinting the single attachment renders as a table ...
    'table_hint_keywords': [
        '附表', '一览表', '清单表', '统计表', '明细表',
    ],
    # ... or as a figure.
    'figure_hint_keywords': [
        '附图', '示意图', '平面图', '流程图', '布置图', '组织图', '横道',
    ],
    # Kind chosen when both figure and table hints apply (or neither does).
    'default_kind_when_ambiguous': 'table',
    # Body mode for attachment leaves:
    #   'stack_charts_only' (default): emit only the [FIGURE]/[TABLE] intent stack
    #   'full': generate long prose; 'single_chart_only': only the top-of-stack block
    'attachment_leaf_body_mode': 'stack_charts_only',
}
def attachment_rules_path() -> str:
    """Return the path of attachment_section_rules.json inside config.DATA_DIR."""
    rules_name = 'attachment_section_rules.json'
    return os.path.join(config.DATA_DIR, rules_name)
def load_attachment_rules(path: Optional[str] = None) -> Dict[str, Any]:
    """Overlay the rules JSON file (if any) on the built-in defaults.

    Keys starting with '_' in the file are treated as comments and ignored;
    a missing or unreadable file yields the defaults unchanged.
    """
    rules_file = path or attachment_rules_path()
    merged = dict(DEFAULT_ATTACHMENT_RULES)
    if not os.path.isfile(rules_file):
        return merged
    try:
        with open(rules_file, encoding='utf-8') as fh:
            loaded = json.load(fh)
        if isinstance(loaded, dict):
            merged.update(
                (key, value) for key, value in loaded.items()
                if not key.startswith('_')
            )
    except Exception as exc:
        logger.warning('加载 attachment_section_rules.json 失败,使用内置默认: %s', exc)
    return merged
def attachment_leaf_body_mode(rules: Optional[Dict[str, Any]] = None) -> str:
    """Resolve the body strategy for attachment leaf sections.

    Returns one of 'stack_charts_only' (default), 'full' (long technical
    body), or 'single_chart_only' (only the top-of-stack figure/table);
    unrecognized values fall back to 'stack_charts_only'.
    """
    effective = rules or get_attachment_rules_cached()
    raw_mode = effective.get('attachment_leaf_body_mode') or 'stack_charts_only'
    mode = raw_mode.strip().lower()
    recognized = ('single_chart_only', 'stack_charts_only', 'full')
    return mode if mode in recognized else 'stack_charts_only'
def use_attachment_stack_charts_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """True when attachment sections emit only [FIGURE]/[TABLE] blocks, no prose.

    Covers 'stack_charts_only' and its single-top variant 'single_chart_only'.
    """
    return attachment_leaf_body_mode(rules) in ('stack_charts_only', 'single_chart_only')


def use_attachment_single_chart_only_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """True when an attachment section renders exactly one chart block."""
    return attachment_leaf_body_mode(rules) == 'single_chart_only'


def use_attachment_full_body(rules: Optional[Dict[str, Any]] = None) -> bool:
    """True when attachment sections get a full technical body."""
    return attachment_leaf_body_mode(rules) == 'full'
def is_attachment_only_section(section_title: str, rules: Optional[Dict[str, Any]] = None) -> bool:
    """Whether a title denotes an attachment-style section (附图/附表/附件N …).

    Used by expand_outline to skip such sections and by the body-mode branch.
    Invalid regexes in the rule list are logged and skipped.
    """
    title = (section_title or '').strip()
    if not title:
        return False
    effective = rules or load_attachment_rules()
    for pattern in list(effective.get('title_regex') or []):
        try:
            hit = re.search(pattern, title)
        except re.error:
            logger.warning('无效 attachment title_regex已跳过: %s', pattern[:80])
            continue
        if hit:
            return True
    return False
def pick_single_figure_or_table(
    section_title: str,
    enable_figure: bool,
    enable_table: bool,
    rules: Optional[Dict[str, Any]] = None,
) -> Optional[str]:
    """
    Decide the single chart kind for an attachment section.

    Returns 'figure' | 'table' | None: None when both switches are off; when
    only one switch is on, that kind wins; otherwise title keyword hints
    decide, with 'default_kind_when_ambiguous' as the tie-breaker.
    """
    if not enable_figure and not enable_table:
        return None
    if enable_figure and not enable_table:
        return 'figure'
    if enable_table and not enable_figure:
        return 'table'
    r = rules or load_attachment_rules()
    t = (section_title or '')
    tbl_kw = list(r.get('table_hint_keywords') or [])
    fig_kw = list(r.get('figure_hint_keywords') or [])
    # Table hints are checked first: a bare 表 is easy to misjudge, so the
    # configured compound keywords (附表/一览表/…) take precedence.
    for kw in tbl_kw:
        if kw and kw in t:
            return 'table'
    for kw in fig_kw:
        if kw and kw in t:
            return 'figure'
    # NOTE(review): the empty-string literals below look like CJK characters
    # (likely 表/图) dropped in transit — as written, '' in t is always True
    # and '' not in t is always False, so neither branch can ever fire.
    # Confirm against the original source file.
    if '' in t and '' not in t:
        return 'table'
    if '' in t and '' not in t:
        return 'figure'
    # Fall back to the configured default kind.
    default = (r.get('default_kind_when_ambiguous') or 'table').strip().lower()
    if default == 'figure':
        return 'figure'
    return 'table'
# Process-wide lazy cache of the loaded attachment rules.
_cached_rules: Optional[Dict[str, Any]] = None


def get_attachment_rules_cached() -> Dict[str, Any]:
    """Return the attachment rules, loading them from disk at most once."""
    global _cached_rules
    if _cached_rules is None:
        _cached_rules = load_attachment_rules()
    return _cached_rules


def should_skip_expand_subchapters(title: str) -> bool:
    """During AI outline expansion, attachment-style top chapters get no
    generated sub-chapters; same predicate as is_attachment_only_section,
    evaluated with the cached rules."""
    return is_attachment_only_section(title, get_attachment_rules_cached())
# Extract the "attachment N" style label from a title (used for logging).
_ATTACHMENT_LABEL_RE = re.compile(
    r'附件\s*[:]?\s*([一二三四五六七八九十百0-9A-Za-z]+)',
)
# Figure/table marker with an optional trailing numeral.
# NOTE(review): the pattern starts with r'\s*[图表]' — a leading character
# (probably 附) may have been lost in transit; confirm against the original.
_ATTACHMENT_FIG_TBL_RE = re.compile(r'\s*[图表]\s*([一二三四五六七八九十百0-9]*)')
def parse_attachment_label(title: str) -> Optional[str]:
    """
    Extract an attachment label (e.g. the N of 附件N) from a title, for logging.

    Falls back to the generic labels '附图'/'附表' when only a figure/table
    marker is present; returns None when nothing attachment-like is found.
    """
    t = (title or '').strip()
    if not t:
        return None
    m = _ATTACHMENT_LABEL_RE.search(t)
    if m:
        # 附件<label>: return the captured label (None when it is empty).
        return m.group(1).strip() or None
    m2 = _ATTACHMENT_FIG_TBL_RE.search(t)
    if m2:
        rest = (m2.group(1) or '').strip()
        if rest:
            return rest
        matched = m2.group(0)
        # NOTE(review): '' is contained in every string, so this branch always
        # returns '附图' when the numeral is missing; the literal likely read
        # '图' originally — confirm against the original source.
        if '' in matched:
            return '附图'
        return '附表'
    # NOTE(review): these two patterns (r'\s*图' / r'\s*表') appear to have
    # lost a leading character (probably 附); as written they match any
    # 图/表 with optional leading whitespace — confirm against the original.
    if re.search(r'\s*图', t):
        return '附图'
    if re.search(r'\s*表', t):
        return '附表'
    return None

577
utils/bill_analysis.py Normal file
View File

@ -0,0 +1,577 @@
"""
工程量清单本地分析 bill-worker.js Phase 2/3 移植
Phase 2按页关键字筛选清单页Phase 3正则解析分部与清单项
"""
from __future__ import annotations
import logging
import re
from typing import Any
logger = logging.getLogger(__name__)
# Per-page keyword sets used by the bill-page filter (Phase 2).
BILL_KW = ['项目编码', '项目名称', '工程量', '计量单位', '综合单价', '清单编码']
SEC_KW = ['分部分项', '分类分项', '措施项目', '其他项目', '工程量清单计价']
FEE_PAGE_KW = [
    '规费', '税金', '社会保险费', '住房公积金', '养老保险',
    '工伤保险', '失业保险', '医疗保险', '教育费附加', '城市维护建设税',
]
# Line-shape detectors used when merging physical lines into logical rows.
ITEM_START = re.compile(r'^\d+(\.\d+)+\s')  # hierarchical numbering, e.g. "1.2.3 "
CODE_INLINE = re.compile(r'(?:^|\s)(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s')
CODE_START_RE = re.compile(r'^(\d{9,12}|B\d{5,6})\s')
SEQ_CODE_RE = re.compile(r'^\d{1,4}\s+(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s')
PAGE_MARK = re.compile(r'^--\s*\d+\s+of\s+\d+\s*--')  # injected page separator
HEADER_RE = re.compile(r'^序号\s+(项目编码|项目名称)')
HEADER_KW = re.compile(
    r'^(项目编码|项目名称|清单编码|计量单位|综合单价|工程量|合\s*价|金额|序号)\s'
)
# NOTE(review): several literals below are empty strings — CJK characters
# appear to have been dropped in transit (category markers such as 一/二/…,
# unit tokens such as 个/项/座/…). Every '' entry should be checked against
# the original source file.
CATEGORY_MARKERS = [
    '', '', '', '', '', '', '', '', '', '',
    '(一)', '(二)', '(三)', '(四)', '(五)',
]
# List code: 9-12 inline digits or a B-code, rejecting alphabetic prefixes
# such as "GB" (negative lookbehind).
CODE_RE = re.compile(r'(?<![A-Za-z])(\d{9,12}|(?<![A-Za-z])B\d{5,6})')
UNIT_TOKENS = [
    '', '', 'm3', 'm2', 'km', 'hm2', '', '', 't', 'kg',
    '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '延m', '', '', 'm',
]
UNIT_SET = frozenset(UNIT_TOKENS)
_unit_escaped = [re.escape(u) for u in UNIT_TOKENS]
# A unit token must be preceded by start/space and followed by space/digit/end.
UNIT_RE = re.compile(r'(?:^|\s)(' + '|'.join(_unit_escaped) + r')(?=\s|\d|$)')
# Subtotal / total rows to drop entirely.
# NOTE(review): the first alternative r'\s*计' may have lost a leading 合 —
# as written it matches any 计 after optional whitespace; confirm.
SKIP_RE = re.compile(r'\s*计|小\s*计|本页小计|总\s*计|价税合计')
# Dash-separated code fragments that _fold_dash_codes may join.
_DASH_CODE = re.compile(
    r'(\d{2,4})[-](\d{2,4})[-](\d{2,4})(?:[-](\d{2,4}))?'
)
# Exact fee-item names filtered out of parsed categories.
_EXACT_FEE_ITEM = frozenset([
    '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
    '总承包服务费', '企业管理费', '甲供材料保管费', '价税合计',
])
# Substring keywords marking fee/measure items.
_FEE_KW = [
    '安全文明', '文明施工费', '环境保护费', '临时设施费',
    '夜间施工增加费', '夜间施工费',
    '冬雨季施工增加费', '冬雨季施工费',
    '二次搬运费', '大型机械设备进出场', '大型机械进出场',
    '施工排水降水', '排水降水费',
    '已完工程及设备保护', '已完工程保护费',
    '工程排污费', '社会保障费', '住房公积金',
    '工伤保险', '劳动保险', '意外伤害保险', '建筑工程保险',
    '城市维护建设税', '城市建设维护税',
    '教育费附加', '地方教育附加',
    '材料暂估', '专业工程暂估',
    '超高施工增加费', '安全防护费',
    '措施项目费', '其他项目费', '不可竞争费',
]
# Substring keywords that qualify a short line as a division (分部) heading.
_CAT_KW = [
    '土建', '建筑', '结构', '装饰', '装修', '安装', '给排水', '暖通', '空调', '通风',
    '电气', '强电', '弱电', '消防', '智能化', '幕墙', '门窗', '园林', '绿化', '景观',
    '市政', '道路', '桥梁', '管网', '基础', '地基', '桩基', '主体', '屋面', '防水',
    '保温', '钢结构', '排水', '给水', '照明', '动力', '防雷', '电梯', '人防', '室外',
    '附属', '分部', '工程', '措施', '清单', '土石方', '混凝土', '砌筑', '模板', '脚手架',
    '水利', '河道', '管道', '阀门', '设备', '仪表', '自动化', '通信', '网络',
    '拆除', '外墙', '内墙', '楼地面', '天棚', '吊顶', '栏杆', '屋顶', '涂料', '抹灰',
    '廊道', '阀门井', '蓄水池', '泵站', '供水', '引水', '水源', '渠道', '闸门',
    '围栏', '警示', '检修', '管线', '配电', '水池', '水塔', '取水', '净水',
]
# Exact fee-category headings (never real divisions).
_EXACT_FEE_CAT = frozenset([
    '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
    '总承包服务费', '企业管理费', '价税合计',
    '措施项目费', '其他项目费', '不可竞争费',
])
# Substring keywords marking fee-summary headings.
_FEE_CAT_KW = [
    '措施项目费', '其他项目费', '不可竞争费',
    '规费汇总', '税金汇总', '费率', '费用汇总', '费用合计',
    '暂列金额', '暂估价', '计日工', '总承包服务费',
    '安全文明施工费', '社会保障费', '住房公积金',
    '工伤保险', '教育费附加', '城市维护建设税',
]
# "<keyword>:" prefix that separates an item name from its spec text.
_SPEC_KW_RE = re.compile(
    r'(材质|规格|型号|品牌|颜色|尺寸|厚度|直径|管径|强度|等级|类别|类型|做法|要求|标准|内容|工作内容|土壤|含量|配合比|工艺|方式|形式|范围|部位|位置|高度|宽度|长度|深度|坡度|截面|跨度|运距|开挖|回填|混凝土|钢筋|压实)[:]'
)
def _fold_dash_codes(line: str) -> str:
    """Join dash-separated code fragments into one code when the joined
    digits form a plausible 9-12 digit list code; otherwise leave untouched."""
    def _join(match: re.Match) -> str:
        fragments = (match.group(1), match.group(2), match.group(3), match.group(4) or '')
        merged = ''.join(fragments)
        return merged if 9 <= len(merged) <= 12 else match.group(0)

    return _DASH_CODE.sub(_join, line)
def is_fee_item(name: str) -> bool:
    """Whether a bill line is a fee/levy item (规费、税金 …) rather than work.

    Whitespace is removed before matching: exact names first, then keywords.
    """
    if not name:
        return False
    compact = re.sub(r'\s+', '', name)
    if compact in _EXACT_FEE_ITEM:
        return True
    return any(keyword in compact for keyword in _FEE_KW)
def split_name_and_spec(raw_name: str) -> tuple[str, str]:
    """Split a raw item cell into (name, spec/description).

    Tried in order: an embedded numbered clause ("1、…" followed by a CJK
    char), a spec keyword followed by a colon, then a parenthesized number;
    when none applies, the whole cell is the name and the spec is empty.
    """
    if not raw_name:
        return '', ''
    numbered = re.search(r'\d+[.、.)\uFF09]\s*[\u4e00-\u9fff]', raw_name)
    if numbered and numbered.start() > 0:
        cut = numbered.start()
        return raw_name[:cut].strip(), raw_name[cut:].strip()
    keyword = _SPEC_KW_RE.search(raw_name)
    if keyword and keyword.start() > 0:
        cut = keyword.start()
        return raw_name[:cut].strip(), raw_name[cut:].strip()
    bracketed = re.search(r'[(]\d+[)]', raw_name)
    if bracketed and bracketed.start() > 0:
        cut = bracketed.start()
        return raw_name[:cut].strip(), raw_name[cut:].strip()
    return raw_name, ''
def is_cat_title(text: str) -> bool:
    """Whether the line looks like a work-division (分部) heading."""
    return any(keyword in text for keyword in _CAT_KW)


def is_fee_cat_title(text: str) -> bool:
    """Whether the line is a fee-summary heading (规费/税金 …), not a real
    division; whitespace is removed before matching."""
    if not text:
        return False
    compact = re.sub(r'\s+', '', text)
    if compact in _EXACT_FEE_CAT:
        return True
    return any(keyword in compact for keyword in _FEE_CAT_KW)
def _is_new_line_trigger(raw: str) -> bool:
    """Whether a physical line starts a new logical row of the bill table.

    Triggers: hierarchical numbering, a leading list code, a sequence number
    followed by a code, or a category marker plus a (half/full-width) space.
    """
    if ITEM_START.match(raw) or CODE_START_RE.match(raw) or SEQ_CODE_RE.match(raw):
        return True
    return any(
        raw.startswith(marker + ' ') or raw.startswith(marker + '\u3000')
        for marker in CATEGORY_MARKERS
    )
def parse_bill_text(text: str) -> dict[str, Any]:
    """Parse merged bill-page text into {'project_summary', 'categories'}.

    Pipeline: fold dash-split codes → merge physical lines into logical rows
    → detect items (by list code, or unit+quantity fallback) and division
    headings → drop fee items → merge duplicate items by name, summing their
    quantities and concatenating specs.

    NOTE(review): indentation of this function was reconstructed from a
    whitespace-mangled source; the nesting of the short-line heuristics below
    was inferred from the guards and should be checked against the original.
    """
    # Phase 1: normalize physical lines (tabs → spaces, fold dash codes).
    raw_lines = []
    for l in text.split('\n'):
        line = l.replace('\t', ' ').strip()
        line = _fold_dash_codes(line)
        raw_lines.append(line)
    # Phase 2: merge physical lines into logical rows.
    logic_lines: list[str] = []
    current_line = ''
    for raw in raw_lines:
        if not raw or PAGE_MARK.match(raw):
            continue
        if HEADER_RE.match(raw) or HEADER_KW.match(raw):
            continue  # table header rows
        if re.match(r'^(元)|^款章节号|^备注$|^第\d+页', raw):
            continue  # unit / remark / page-number noise
        if _is_new_line_trigger(raw):
            if current_line:
                logic_lines.append(current_line)
            current_line = raw
        elif CODE_INLINE.search(raw) and len(raw) > 15:
            # A list code mid-line also starts a new logical row.
            if current_line:
                logic_lines.append(current_line)
            current_line = raw
        else:
            if current_line and len(current_line) > 300:
                # Overly long accumulation: flush and restart.
                logic_lines.append(current_line)
                current_line = raw
            else:
                current_line = current_line + ' ' + raw if current_line else raw
    if current_line:
        logic_lines.append(current_line)
    logger.debug('合并后 %s 条逻辑行(原始 %s 行)', len(logic_lines), len(raw_lines))
    # Phase 3: walk logical rows, building categories and items.
    categories: list[dict[str, Any]] = []
    cur_cat: dict[str, Any] | None = None
    cur_item: dict[str, Any] | None = None
    for line in logic_lines:
        if SKIP_RE.search(line):
            continue  # subtotal / total rows
        # Leading sequence numbers: hierarchical ("1.1.1.1 ") or a short
        # sequence number followed by a 9+-digit code. Avoid eating rows that
        # START with the 9-12 digit list code itself (the JS original's
        # \d+(\.\d+)* pattern swallowed such codes).
        stripped = line.strip()
        m_hier = re.match(r'^\d+(?:\.\d+)+\s+', stripped)
        if m_hier:
            stripped = stripped[m_hier.end():].strip()
        elif re.match(r'^\d{1,4}\s+\d{9}', stripped):
            stripped = re.sub(r'^\d{1,4}\s+', '', stripped, count=1).strip()
        if not stripped:
            stripped = line.strip()
        if not stripped:
            continue
        cm = CODE_RE.search(stripped)
        if cm:
            # Row carrying a list code: flush the previous item and make sure
            # a category exists to attach to.
            if cur_item and cur_cat:
                cur_cat['items'].append(cur_item)
            if not cur_cat:
                cur_cat = {'name': '未分类', 'items': []}
                categories.append(cur_cat)
            code = cm.group(1)
            rest = stripped[cm.end():].strip()
            name, unit, quantity, spec = '', '', '', ''
            unit_match = UNIT_RE.search(rest)
            if unit_match:
                # Layout: name | unit | quantity | trailing columns.
                ui = rest.find(unit_match.group(0))
                raw_name = rest[:ui].strip()
                unit = unit_match.group(1)
                after_unit = rest[ui + len(unit_match.group(0)):].strip()
                qm = re.match(r'^([\d,.]+)', after_unit)
                if qm:
                    quantity = qm.group(1)
                    tail = after_unit[qm.end():].strip()
                    if tail:
                        # Skip trailing numeric columns (prices); keep the
                        # remaining text as spec.
                        tail_tokens = tail.split()
                        si = 0
                        while si < len(tail_tokens) and re.match(r'^[\d,.%\-]+$', tail_tokens[si]):
                            si += 1
                        spec_tail = ' '.join(tail_tokens[si:]).strip()
                        if spec_tail:
                            spec = spec_tail
                ns_name, ns_spec = split_name_and_spec(raw_name)
                name = ns_name
                if ns_spec:
                    spec = ns_spec + (';' + spec if spec else '')
            else:
                # No unit via regex: scan tokens from the right for a unit word.
                tokens = [t for t in rest.split() if t]
                found_unit_idx = -1
                for ti in range(len(tokens) - 1, 0, -1):
                    if tokens[ti] in UNIT_SET:
                        found_unit_idx = ti
                        break
                if found_unit_idx >= 1:
                    raw_name_str = ' '.join(tokens[:found_unit_idx])
                    ns_name, ns_spec = split_name_and_spec(raw_name_str)
                    name = ns_name
                    if ns_spec:
                        spec = ns_spec
                    unit = tokens[found_unit_idx]
                    after_tokens = tokens[found_unit_idx + 1:]
                    if after_tokens and re.match(r'^[\d,.]+$', after_tokens[0]):
                        quantity = after_tokens[0]
                        si = 1
                        while si < len(after_tokens) and re.match(r'^[\d,.%\-]+$', after_tokens[si]):
                            si += 1
                        spec_tail = ' '.join(after_tokens[si:]).strip()
                        if spec_tail:
                            spec = spec + ';' + spec_tail if spec else spec_tail
                else:
                    name = rest
            # Compact the name; strip a unit token glued onto its tail.
            # NOTE(review): UNIT_TOKENS contains empty-string entries (dropped
            # CJK chars) — ''.endswith('') is always True, so as written the
            # loop breaks at the first '' token without doing anything.
            name = re.sub(r'\s+', '', name).strip()
            for u in UNIT_TOKENS:
                if name.endswith(u) and len(name) > len(u):
                    unit = unit or u
                    name = name[: len(name) - len(u)]
                    break
            cur_item = {'code': code, 'name': name, 'unit': unit, 'quantity': quantity, 'spec': spec}
            continue
        # Code-less row with a unit and a quantity: fallback item.
        if len(stripped) > 4:
            uni_match = UNIT_RE.search(stripped)
            if uni_match:
                ui = stripped.find(uni_match.group(0))
                before_unit = stripped[:ui].strip()
                after_unit = stripped[ui + len(uni_match.group(0)):].strip()
                has_qty = bool(re.match(r'^[\d,.]+', after_unit))
                if (
                    2 <= len(before_unit) <= 50
                    and has_qty
                    and re.search(r'[\u4e00-\u9fff]', before_unit)
                ):
                    if cur_item and cur_cat:
                        cur_cat['items'].append(cur_item)
                    if not cur_cat:
                        cur_cat = {'name': '未分类', 'items': []}
                        categories.append(cur_cat)
                    unit_fb = uni_match.group(1)
                    qm = re.match(r'^([\d,.]+)', after_unit)
                    quantity_fb = qm.group(1) if qm else ''
                    ns_name, ns_spec = split_name_and_spec(before_unit)
                    name_fb = re.sub(r'\s+', '', ns_name).strip()
                    spec_fb = ns_spec or ''
                    cur_item = {'code': '', 'name': name_fb, 'unit': unit_fb, 'quantity': quantity_fb, 'spec': spec_fb}
                    continue
        # Short, code-less rows: spec continuation or a division heading.
        if 2 < len(stripped) < 60 and not CODE_RE.search(stripped):
            if UNIT_RE.search(stripped) and re.search(r'\d+\.?\d*\s*$', stripped):
                # Unit plus trailing number: append to the current item's spec.
                if cur_item:
                    cur_item['spec'] = (cur_item.get('spec') or '') + (
                        ';' + stripped if cur_item.get('spec') else stripped
                    )
                continue
            if is_cat_title(stripped) and not UNIT_RE.search(stripped) and not is_fee_cat_title(stripped):
                # Division heading by keyword: flush item, open a new category.
                if cur_item and cur_cat:
                    cur_cat['items'].append(cur_item)
                cur_item = None
                clean_title = re.sub(
                    r'\s+(座|个|项|处|m|km|段|条)\s+\d+[\d.]*\s*$', '', stripped
                ).strip()
                cur_cat = {'name': clean_title, 'items': []}
                categories.append(cur_cat)
                continue
            if re.match(r'^[一二三四五六七八九十]+\s', stripped) or re.match(
                r'^[一二三四五六七八九十\d]+', stripped
            ):
                # Numeral-led heading. NOTE(review): the second pattern matches
                # any row starting with a numeral — a trailing delimiter
                # (e.g. 、) may have been lost in transit; confirm.
                clean_title = re.sub(r'\s+(座|个|项|处)\s+\d+[\d.]*\s*$', '', stripped).strip()
                if is_fee_cat_title(clean_title):
                    continue
                if cur_item and cur_cat:
                    cur_cat['items'].append(cur_item)
                cur_item = None
                cur_cat = {'name': clean_title, 'items': []}
                categories.append(cur_cat)
                continue
        # Anything else: append to the current item's spec.
        if cur_item and len(stripped) > 1:
            cur_item['spec'] = (cur_item.get('spec') or '') + (
                ';' + stripped if cur_item.get('spec') else stripped
            )
    if cur_item and cur_cat:
        cur_cat['items'].append(cur_item)
    # Drop fee/levy items from every category.
    fee_filtered = 0
    for cat in categories:
        if cat.get('items'):
            before = len(cat['items'])
            cat['items'] = [it for it in cat['items'] if not is_fee_item(it.get('name', ''))]
            fee_filtered += before - len(cat['items'])
    if fee_filtered:
        logger.debug('费用项过滤: 移除 %s 项', fee_filtered)
    # Merge duplicate items (same compacted name) within each category:
    # quantities are summed when all numeric, specs deduplicated and trimmed.
    total_before_merge = 0
    total_after_merge = 0
    for cat in categories:
        items = cat.get('items') or []
        if not items:
            continue
        total_before_merge += len(items)
        name_map: dict[str, dict[str, Any]] = {}
        for item in items:
            key = re.sub(r'\s+', '', (item.get('name') or '')).strip()
            if not key:
                continue
            if key not in name_map:
                name_map[key] = {
                    'code': item.get('code') or '',
                    'name': item['name'],
                    'unit': item.get('unit') or '',
                    'quantity': item.get('quantity') or '',
                    'spec': item.get('spec') or '',
                    '_quantities': [item['quantity']] if item.get('quantity') else [],
                    '_specs': [item['spec']] if item.get('spec') else [],
                }
            else:
                m = name_map[key]
                # Fill missing code/unit from later duplicates.
                if not m['code'] and item.get('code'):
                    m['code'] = item['code']
                if not m['unit'] and item.get('unit'):
                    m['unit'] = item['unit']
                if item.get('quantity'):
                    m['_quantities'].append(item['quantity'])
                if item.get('spec') and item['spec'] not in m['_specs']:
                    m['_specs'].append(item['spec'])
        merged_items: list[dict[str, str]] = []
        for m in name_map.values():
            qlist = m['_quantities']
            if len(qlist) > 1:
                # Sum quantities when every value parses as a number;
                # otherwise keep the list joined as text.
                nums = []
                ok = True
                for q in qlist:
                    try:
                        nums.append(float(q.replace(',', '')))
                    except ValueError:
                        ok = False
                        break
                if ok:
                    s = sum(nums)
                    m['quantity'] = str(int(s)) if s % 1 == 0 else f'{s:.2f}'
                else:
                    m['quantity'] = '; '.join(qlist)
            elif len(qlist) == 1:
                m['quantity'] = qlist[0]
            if m['_specs']:
                # Trim each spec to 120 chars, total spec to 300 chars.
                trimmed = [s[:120] + '...' if len(s) > 120 else s for s in m['_specs']]
                m['spec'] = '; '.join(trimmed)
                if len(m['spec']) > 300:
                    m['spec'] = m['spec'][:300] + '...'
            for k in ('_quantities', '_specs'):
                m.pop(k, None)
            merged_items.append(
                {k: m[k] for k in ('code', 'name', 'unit', 'quantity', 'spec')}
            )
        cat['items'] = merged_items
        total_after_merge += len(merged_items)
    merged_count = total_before_merge - total_after_merge
    if merged_count > 0:
        # NOTE(review): a separator (likely →) between the two %s placeholders
        # appears to have been lost in transit.
        logger.debug('按名称合并: %s%s 项', total_before_merge, total_after_merge)
    # Keep only categories that still have items.
    valid = [c for c in categories if c.get('items')]
    total_items = sum(len(c['items']) for c in valid)
    logger.debug(
        '最终结果: %s 分部, %s 清单项', len(valid), total_items
    )
    return {
        'project_summary': {
            'remark': f'本地解析:{len(valid)} 个分部,{total_items} 个清单项(合并前 {total_before_merge} 项)',
        },
        'categories': valid,
    }
def filter_bill_pages(page_texts: list[str]) -> tuple[list[str], dict[str, Any]]:
    """
    Select pages belonging to the bill of quantities; returns (pages, meta).

    A page qualifies when it has >=2 header keywords, a section keyword, or a
    9-digit code; interior gaps between the first and last hit are filled,
    skipping near-empty pages and pure fee-summary pages.
    """
    total = len(page_texts)
    meta: dict[str, Any] = {'total_pages': total, 'scanned': False, 'no_bill_pages': False}
    if sum(len(p or '') for p in page_texts) < 50:
        # Almost no text at all: likely a scanned PDF.
        meta['scanned'] = True
        meta['reason'] = 'noText'
        return [], meta
    flags = [False] * total
    for idx, page in enumerate(page_texts):
        content = page or ''
        if not content.strip():
            continue
        header_hits = sum(1 for kw in BILL_KW if kw in content)
        section_hit = any(kw in content for kw in SEC_KW)
        code_hit = bool(re.search(r'\d{9}', content))
        if header_hits >= 2 or section_hit or code_hit:
            flags[idx] = True
    hits = [i for i, flagged in enumerate(flags) if flagged]
    first_hit = hits[0] if hits else -1
    last_hit = hits[-1] if hits else -1
    if first_hit >= 0 and last_hit > first_hit:
        # Fill interior gaps, except near-empty pages and pure fee pages.
        for idx in range(first_hit, last_hit + 1):
            if flags[idx]:
                continue
            content = page_texts[idx] or ''
            if not content.strip() or len(content.strip()) <= 30:
                continue
            fee_hits = sum(1 for kw in FEE_PAGE_KW if kw in content)
            if fee_hits >= 2 and not re.search(r'\d{9}', content):
                continue
            flags[idx] = True
    selected = [page_texts[i] for i in range(total) if flags[i]]
    if not selected:
        meta['no_bill_pages'] = True
    meta['bill_page_indices'] = [i for i in range(total) if flags[i]]
    meta['bill_pages'] = len(selected)
    return selected, meta
def analyze_boq_pages(page_texts: list[str]) -> dict[str, Any]:
    """
    Chain page filtering with parse_bill_text; the result carries a '_meta'
    section for persistence and the frontend.
    """
    total_pages = len(page_texts)
    if sum(len(p or '') for p in page_texts) < 50:
        # Scanned document / no extractable text at all.
        return {
            'scanned': True,
            'reason': 'noText',
            'totalPages': total_pages,
            'project_summary': {'remark': '文本过少,疑似扫描件或未提取到文字'},
            'categories': [],
            '_meta': {
                'method': 'python-local',
                'total_pages': total_pages,
                'bill_pages': 0,
            },
        }
    bill_texts, fmeta = filter_bill_pages(page_texts)
    if not bill_texts:
        empty_meta: dict[str, Any] = {
            'method': 'python-local',
            'total_pages': total_pages,
            'bill_pages': 0,
        }
        if 'no_bill_pages' in fmeta:
            empty_meta['no_bill_pages'] = fmeta['no_bill_pages']
        return {
            'scanned': False,
            'no_bill_pages': True,
            'totalPages': total_pages,
            'project_summary': {'remark': '未识别到清单相关页面'},
            'categories': [],
            '_meta': empty_meta,
        }
    parsed = parse_bill_text('\n'.join(bill_texts))
    return {
        'scanned': False,
        **parsed,
        '_meta': {
            'method': 'python-local',
            'total_pages': total_pages,
            'bill_pages': len(bill_texts),
            'bill_page_indices': fmeta.get('bill_page_indices', []),
        },
    }
def categories_to_prompt_appendix(
    analysis: dict[str, Any],
    max_chars: int = 3000,
    max_per_cat: int = 40,
) -> str:
    """Compress locally parsed categories into a short text appendix that is
    injected into the AI summarization prompt; truncated to max_chars."""
    rendered: list[str] = []
    for cat in analysis.get('categories') or []:
        cat_items = cat.get('items') or []
        rendered.append(f'{cat.get("name", "")}')
        for entry in cat_items[:max_per_cat]:
            code = entry.get('code') or '-'
            item_name = entry.get('name') or ''
            unit = entry.get('unit') or ''
            quantity = entry.get('quantity') or ''
            rendered.append(f' {code} {item_name} {unit} {quantity}'.strip())
        if len(cat_items) > max_per_cat:
            rendered.append(f' …共 {len(cat_items)} 条,此处省略其余')
    text = '\n'.join(rendered).strip()
    if len(text) > max_chars:
        return text[:max_chars] + '\n…(附录已截断)'
    return text

138
utils/boq_parser.py Normal file
View File

@ -0,0 +1,138 @@
"""
工程量清单解析模块 Excel / CSV / PDF / Word 文件中提取结构化文本
"""
import csv
import logging
import re
from pathlib import Path
logger = logging.getLogger(__name__)
# Cap on returned characters (output is fed to the AI for summarization).
MAX_BOQ_CHARS = 12000


def extract_boq_text(file_path: str) -> str:
    """
    Extract raw structured text from a bill-of-quantities file.

    Supported extensions: .xlsx / .xls / .csv / .pdf / .docx / .doc; the
    result is truncated to MAX_BOQ_CHARS. Raises ValueError for anything else.
    """
    suffix = Path(file_path).suffix.lower()
    if suffix in ('.xlsx', '.xls'):
        raw = _extract_excel(file_path)
    elif suffix == '.csv':
        raw = _extract_csv(file_path)
    elif suffix == '.pdf':
        from utils.file_utils import _extract_pdf
        raw = _extract_pdf(file_path)
    elif suffix == '.docx':
        from utils.file_utils import _extract_docx
        raw = _extract_docx(file_path)
    elif suffix == '.doc':
        from utils.file_utils import _extract_doc
        raw = _extract_doc(file_path)
    else:
        raise ValueError(f'不支持的文件格式 {suffix},请使用 xlsx/xls/csv/pdf/docx/doc')
    return raw[:MAX_BOQ_CHARS]
def extract_boq_pages(file_path: str) -> list[str]:
    """
    Return the bill text split into chunks: one element per page for PDF;
    a single whole-document element for Excel / CSV / Word.
    """
    if Path(file_path).suffix.lower() == '.pdf':
        from utils.file_utils import extract_pdf_pages
        return extract_pdf_pages(file_path)
    whole = extract_boq_text(file_path)
    return [whole] if whole else ['']
# ─── Excel ────────────────────────────────────────────────────────────────
def _extract_excel(file_path: str) -> str:
    """Workbook → text via openpyxl; falls back to xlrd when openpyxl is
    missing. Wraps any other failure in a RuntimeError with a Chinese hint."""
    try:
        import openpyxl
        workbook = openpyxl.load_workbook(file_path, data_only=True, read_only=True)
        blocks = []
        for sheet_name in workbook.sheetnames:
            rendered = _sheet_to_text(workbook[sheet_name], sheet_name)
            if rendered.strip():
                blocks.append(rendered)
        workbook.close()
        return '\n\n'.join(blocks)
    except ImportError:
        return _extract_xls_fallback(file_path)
    except Exception as e:
        raise RuntimeError(f'Excel 解析失败:{e}') from e
def _sheet_to_text(ws, sheet_name: str) -> str:
"""将一个 Sheet 转为管道分隔文本,自动过滤全空行和全空列。"""
raw_rows = []
for row in ws.iter_rows(values_only=True):
cells = ['' if v is None else str(v).strip() for v in row]
if any(cells):
raw_rows.append(cells)
if not raw_rows:
return ''
# 对齐列数
max_cols = max(len(r) for r in raw_rows)
raw_rows = [r + [''] * (max_cols - len(r)) for r in raw_rows]
# 找出有内容的列索引
active_cols = [j for j in range(max_cols)
if any(raw_rows[i][j] for i in range(len(raw_rows)))]
if not active_cols:
return ''
lines = [f'{sheet_name}']
for row in raw_rows:
line = ' | '.join(row[j] for j in active_cols)
if line.replace('|', '').strip():
lines.append(line)
return '\n'.join(lines)
def _extract_xls_fallback(file_path: str) -> str:
    """Legacy .xls via xlrd (requires xlrd<2); raises RuntimeError advising
    the user to re-save as .xlsx when extraction fails."""
    try:
        import xlrd  # type: ignore
        workbook = xlrd.open_workbook(file_path)
        blocks = []
        for sheet in workbook.sheets():
            rendered = [f'{sheet.name}']
            for row_idx in range(sheet.nrows):
                cells = [
                    str(sheet.cell_value(row_idx, col_idx)).strip()
                    for col_idx in range(sheet.ncols)
                ]
                joined = ' | '.join(c for c in cells if c)
                if joined:
                    rendered.append(joined)
            blocks.append('\n'.join(rendered))
        return '\n\n'.join(blocks)
    except Exception as e:
        raise RuntimeError(f'.xls 解析失败,请另存为 .xlsx 后重试:{e}') from e
# ─── CSV ─────────────────────────────────────────────────────────────────
def _extract_csv(file_path: str) -> str:
encodings = ['utf-8-sig', 'gbk', 'utf-8', 'gb18030', 'latin-1']
for enc in encodings:
try:
lines = []
with open(file_path, 'r', encoding=enc, newline='') as f:
for row in csv.reader(f):
line = ' | '.join(c.strip() for c in row if c.strip())
if line:
lines.append(line)
return '\n'.join(lines)
except (UnicodeDecodeError, UnicodeError):
continue
except Exception as e:
raise RuntimeError(f'CSV 解析失败:{e}') from e
raise RuntimeError('CSV 文件编码不支持,请另存为 UTF-8 格式后重试')

283
utils/diagram_intent.py Normal file
View File

@ -0,0 +1,283 @@
"""
章节级图/表意图字符特征 + 大纲上下文窗口计分栈式优先级驱动提示词附加段
"""
from __future__ import annotations
import json
import logging
import os
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Sequence, Tuple
import config
from utils import prompts as P
logger = logging.getLogger(__name__)
# Built-in fallback for diagram_intent_rules.json; user-file keys override.
DEFAULT_DIAGRAM_RULES: Dict[str, Any] = {
    'schema_version': 1,
    # Minimum combined score for an intent to enter the stack.
    'threshold_figure': 1.0,
    'threshold_table': 1.0,
    # Weights applied to title hits vs. outline-context hits.
    'title_weight': 1.0,
    'context_weight': 0.6,
    # Window of outline lines taken around the section title.
    'outline_context_lines': {'before': 4, 'after': 6},
    # 'score_desc' | 'figure_first' | 'table_first' when both intents qualify.
    'stack_order_when_both': 'score_desc',
    # Keyword entries: plain strings (weight 1.0) or dicts with
    # 'text'/'pattern' and optional 'weight'.
    'figure_keywords': [],
    'table_keywords': [],
}
def diagram_rules_path() -> str:
    """Location of diagram_intent_rules.json inside the app data directory."""
    rules_name = 'diagram_intent_rules.json'
    return os.path.join(config.DATA_DIR, rules_name)
def load_diagram_rules(path: Optional[str] = None) -> Dict[str, Any]:
    """Load the rules JSON over the defaults; a missing or broken file yields
    the built-in defaults.

    Underscore-prefixed keys are ignored; 'outline_context_lines' is merged
    key-by-key instead of being replaced wholesale.
    """
    rules_file = path or diagram_rules_path()
    merged = dict(DEFAULT_DIAGRAM_RULES)
    if not os.path.isfile(rules_file):
        return merged
    try:
        with open(rules_file, encoding='utf-8') as fh:
            loaded = json.load(fh)
        if isinstance(loaded, dict):
            for key, value in loaded.items():
                if key.startswith('_'):
                    continue
                if key == 'outline_context_lines' and isinstance(value, dict):
                    merged['outline_context_lines'] = {
                        **merged.get('outline_context_lines', {}),
                        **value,
                    }
                else:
                    merged[key] = value
    except Exception as exc:
        logger.warning('加载 diagram_intent_rules.json 失败,使用内置默认: %s', exc)
    return merged
def _normalize_keyword_entries(raw: Any) -> List[Tuple[str, float]]:
out: List[Tuple[str, float]] = []
if not isinstance(raw, list):
return out
for item in raw:
if isinstance(item, str) and item.strip():
out.append((item.strip(), 1.0))
elif isinstance(item, dict):
t = (item.get('text') or item.get('pattern') or '').strip()
if not t:
continue
w = float(item.get('weight', 1.0))
out.append((t, w))
return out
def _score_text(text: str, entries: Sequence[Tuple[str, float]]) -> float:
if not text or not entries:
return 0.0
s = 0.0
for kw, w in entries:
if kw in text:
s += w
return s
# Intent kind alias for readability.
DiagramKind = str  # 'figure' | 'table'


@dataclass(frozen=True)
class DiagramIntent:
    """One diagram intent for a section: what to emit, how strongly, and why."""

    kind: str     # 'figure' or 'table'
    score: float  # weighted keyword score that produced this intent
    sources: str  # provenance tag, e.g. 'title+context' or 'fallback'


# Top of the stack = index 0; earlier entries take priority.
DiagramStack = List[DiagramIntent]
def score_figure_table(
    title: str,
    context_snippet: str,
    rules: Dict[str, Any],
) -> Tuple[float, float]:
    """Return (figure_score, table_score): title and context are scored
    separately and combined with the configured weights."""
    figure_entries = _normalize_keyword_entries(rules.get('figure_keywords'))
    table_entries = _normalize_keyword_entries(rules.get('table_keywords'))
    title_w = float(rules.get('title_weight', 1.0))
    context_w = float(rules.get('context_weight', 0.6))
    title_text = title or ''
    context_text = context_snippet or ''

    def _weighted(entries):
        return (
            title_w * _score_text(title_text, entries)
            + context_w * _score_text(context_text, entries)
        )

    return _weighted(figure_entries), _weighted(table_entries)
def extract_outline_window(
    outline_text: str,
    section_title: str,
    before: int,
    after: int,
    fallback_chars: int = 1200,
) -> str:
    """
    Locate the section title inside the outline and return the nearby lines
    (`before` above, `after` below). When the title cannot be located, the
    first `fallback_chars` characters of the whole outline are returned.
    """
    text = outline_text or ''
    if not text or not section_title:
        return text[:fallback_chars]
    wanted = section_title.strip()
    if not wanted:
        return text[:fallback_chars]

    def _drop_serial(s: str) -> str:
        # Remove a leading "1."/"一、"-style numbering prefix.
        return re.sub(r'^\s*[\d一二三四五六七八九十]+[、.\s]+', '', s).strip()

    core = _drop_serial(wanted)
    rows = text.splitlines()
    hit = -1
    for pos, row in enumerate(rows):
        row_s = row.strip()
        # Whole-title containment, or de-numbered core containment.
        if wanted in row_s or (core and (core in _drop_serial(row_s) or core in row_s)):
            hit = pos
            break
    if hit < 0:
        return text[:fallback_chars]
    start = max(0, hit - max(0, before))
    stop = min(len(rows), hit + max(0, after) + 1)
    return '\n'.join(rows[start:stop])
def build_stack(
    fig_score: float,
    tbl_score: float,
    rules: Dict[str, Any],
    enable_figure: bool,
    enable_table: bool,
) -> DiagramStack:
    """Turn figure/table scores into an ordered intent stack (index 0 = top).

    An intent enters the stack only when its switch is on and its score meets
    the configured threshold; with two intents, the order follows
    'stack_order_when_both' ('figure_first' / 'table_first' / 'score_desc').
    """
    fig_threshold = float(rules.get('threshold_figure', 1.0))
    tbl_threshold = float(rules.get('threshold_table', 1.0))
    order_mode = (rules.get('stack_order_when_both') or 'score_desc').strip()
    stack: List[DiagramIntent] = []
    if enable_figure and fig_score >= fig_threshold:
        stack.append(DiagramIntent('figure', fig_score, 'title+context'))
    if enable_table and tbl_score >= tbl_threshold:
        stack.append(DiagramIntent('table', tbl_score, 'title+context'))
    if len(stack) <= 1:
        return stack
    first, second = stack
    if order_mode == 'figure_first':
        return [first, second] if first.kind == 'figure' else [second, first]
    if order_mode == 'table_first':
        return [first, second] if first.kind == 'table' else [second, first]
    # score_desc: higher score sits on top of the stack.
    return sorted(stack, key=lambda intent: -intent.score)
def stack_compact_labels(stack: DiagramStack) -> List[str]:
    """Compact Chinese labels per intent, aligned with stack_to_addon's
    wording; used by the attachment blocks-only prompt."""
    out: List[str] = []
    for intent in stack:
        if intent.kind == 'figure':
            out.append('图示([FIGURE] 块)')
        else:
            out.append('表格([TABLE] 块)')
    return out
def make_fallback_stack(kind: str) -> DiagramStack:
    """Single-intent placeholder stack for when scoring produced nothing but
    output is still required; unknown kinds default to 'table'."""
    normalized = (kind or '').strip().lower()
    if normalized not in ('figure', 'table'):
        normalized = 'table'
    return [DiagramIntent(normalized, 1.0, 'fallback')]
def stack_to_addon(stack: DiagramStack) -> str:
    """Render the priority preamble plus the figure/table spec texts in
    stack order; empty string for an empty stack."""
    if not stack:
        return ''
    labels = [
        '图示([FIGURE] 块)' if intent.kind == 'figure' else '表格([TABLE] 块)'
        for intent in stack
    ]
    pieces: List[str] = [P.diagram_priority_preamble(labels)]
    for intent in stack:
        addon = P.get_figure_addon() if intent.kind == 'figure' else P.get_table_addon()
        pieces.append(addon)
    return ''.join(pieces)
class DiagramIntentAgent:
    """Rule-driven planner: computes a per-section intent stack and renders
    the corresponding prompt addon."""

    def __init__(self, rules: Optional[Dict[str, Any]] = None) -> None:
        self.rules = rules or load_diagram_rules()

    @classmethod
    def load_default(cls) -> 'DiagramIntentAgent':
        """Build an agent from the rules file (or the built-in defaults)."""
        return cls(load_diagram_rules())

    def plan(
        self,
        section_title: str,
        outline_text: str,
        enable_figure: bool,
        enable_table: bool,
    ) -> DiagramStack:
        """Score the section against its outline window and build the stack."""
        rules = self.rules
        window_cfg = rules.get('outline_context_lines') or {}
        ctx = extract_outline_window(
            outline_text,
            section_title,
            int(window_cfg.get('before', 4)),
            int(window_cfg.get('after', 6)),
        )
        fig_score, tbl_score = score_figure_table(section_title, ctx, rules)
        return build_stack(fig_score, tbl_score, rules, enable_figure, enable_table)

    def render_for_section(
        self,
        section_title: str,
        outline_text: str,
        enable_figure: bool,
        enable_table: bool,
    ) -> str:
        """Addon text for one section; empty when both switches are off."""
        if not (enable_figure or enable_table):
            return ''
        stack = self.plan(section_title, outline_text, enable_figure, enable_table)
        return stack_to_addon(stack)
# Module-level default agent, lazily built for one-shot generator calls.
_default_agent: Optional[DiagramIntentAgent] = None


def get_diagram_agent() -> DiagramIntentAgent:
    """Return the shared agent, creating it from the rules file on first use."""
    global _default_agent
    if _default_agent is None:
        _default_agent = DiagramIntentAgent.load_default()
    return _default_agent


def invalidate_diagram_agent_cache() -> None:
    """Drop the cached agent so the next call reloads rules from disk."""
    global _default_agent
    _default_agent = None

213
utils/file_utils.py Normal file
View File

@ -0,0 +1,213 @@
"""
文件处理工具 PDF / Word 文件中提取纯文本
"""
import os
import logging
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
logger = logging.getLogger(__name__)
def extract_text(file_path: str) -> str:
    """
    Dispatch plain-text extraction by file extension.

    Supported: .pdf / .docx / .doc; anything else raises ValueError.
    """
    suffix = Path(file_path).suffix.lower()
    handlers = {
        '.pdf': _extract_pdf,
        '.docx': _extract_docx,
        '.doc': _extract_doc,
    }
    handler = handlers.get(suffix)
    if handler is None:
        raise ValueError(f'不支持的文件类型: {suffix}')
    return handler(file_path)
def _extract_pdf(file_path: str) -> str:
    """Extract PDF text, preferring pypdf; falls back to pdfminer when pypdf
    fails or yields no text. Raises RuntimeError when both fail."""
    try:
        from pypdf import PdfReader
        chunks = []
        for page in PdfReader(file_path).pages:
            page_text = page.extract_text()
            if page_text:
                chunks.append(page_text)
        joined = '\n'.join(chunks)
        if joined.strip():
            return joined
        # Empty result: silently fall through to pdfminer.
    except Exception as e:
        logger.warning(f'pypdf 提取失败: {e},尝试 pdfminer')
    try:
        from pdfminer.high_level import extract_text as pm_extract
        return pm_extract(file_path) or ''
    except Exception as e:
        logger.error(f'pdfminer 提取失败: {e}')
        raise RuntimeError(f'PDF 文本提取失败: {e}')
def extract_pdf_pages(file_path: str) -> list[str]:
    """
    Per-page PDF text extraction (used for bill-page filtering).

    PDFs with more than 5 pages are extracted through a small thread pool to
    speed up parsing; when no page yields any text, falls back to pdfminer
    returning the whole document as a single element.
    """
    try:
        from pypdf import PdfReader
        reader = PdfReader(file_path)

        def _page_text(page) -> str:
            return (page.extract_text() or '').strip()

        if len(reader.pages) <= 5:
            # Sequential extraction is faster for small files.
            pages = [_page_text(p) for p in reader.pages]
        else:
            with ThreadPoolExecutor(max_workers=4) as pool:
                pages = list(pool.map(_page_text, reader.pages))
        if any(pages):
            return pages
    except Exception as e:
        logger.warning(f'pypdf 按页提取失败: {e},尝试 pdfminer')
    try:
        from pdfminer.high_level import extract_text as pm_extract
        blob = (pm_extract(file_path) or '').strip()
        return [blob] if blob else ['']
    except Exception as e:
        logger.error(f'pdfminer 提取失败: {e}')
        raise RuntimeError(f'PDF 文本提取失败: {e}')
def _extract_docx(file_path: str) -> str:
    """.docx text via python-docx: non-empty paragraphs plus table cells
    joined per row. Wraps any failure in a RuntimeError."""
    try:
        from docx import Document
        doc = Document(file_path)
        pieces = [para.text for para in doc.paragraphs if para.text.strip()]
        for table in doc.tables:
            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                if cells:
                    pieces.append(' '.join(cells))
        return '\n'.join(pieces)
    except Exception as e:
        logger.error(f'.docx 提取失败: {e}')
        raise RuntimeError(f'Word 文本提取失败: {e}')
def _extract_doc(file_path: str) -> str:
    """Extract text from a legacy .doc file, trying strategies in priority order:

    1. win32com (Windows with Microsoft Word installed) — most accurate.
    2. LibreOffice headless CLI conversion (requires LibreOffice).
    3. python-docx compatibility read (some pseudo-.doc files are really XML).

    Raises:
        RuntimeError: when every strategy fails, asking the user to re-save
        the file as .docx and upload again.
    """
    abs_path = str(Path(file_path).resolve())
    # ── 方案1win32comWindows + Word──────────────────────────────────
    try:
        import win32com.client
        import pythoncom
        pythoncom.CoInitialize()
        word = None
        try:
            word = win32com.client.Dispatch('Word.Application')
            word.Visible = False
            doc = word.Documents.Open(abs_path, ReadOnly=True)
            text = doc.Range().Text
            doc.Close(False)
            logger.info(f'.doc 通过 win32com 提取成功: {file_path}')
            return text or ''
        finally:
            if word:
                try:
                    word.Quit()
                except Exception:
                    pass
            # Balance the CoInitialize above even on failure.
            pythoncom.CoUninitialize()
    except ImportError:
        logger.info('pywin32 未安装,跳过 win32com 方案')
    except Exception as e:
        logger.warning(f'win32com 提取 .doc 失败: {e}')
    # ── 方案2LibreOffice 命令行 ─────────────────────────────────────────
    try:
        import subprocess
        import tempfile
        # Fix: TemporaryDirectory removes the converted .txt and the directory
        # when done — mkdtemp() previously leaked one temp dir per call.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for soffice_cmd in ('soffice', 'libreoffice'):
                try:
                    result = subprocess.run(
                        [soffice_cmd, '--headless', '--convert-to', 'txt:Text',
                         '--outdir', tmp_dir, abs_path],
                        capture_output=True, text=True, timeout=60,
                    )
                    if result.returncode == 0:
                        txt_file = os.path.join(tmp_dir, Path(file_path).stem + '.txt')
                        if os.path.exists(txt_file):
                            with open(txt_file, 'r', encoding='utf-8', errors='ignore') as f:
                                content = f.read()
                            logger.info(f'.doc 通过 LibreOffice 提取成功: {file_path}')
                            return content
                except FileNotFoundError:
                    # This binary name is absent; try the next candidate.
                    continue
                except subprocess.TimeoutExpired:
                    logger.warning('LibreOffice 转换超时')
                    break
    except Exception as e:
        logger.warning(f'LibreOffice 提取 .doc 失败: {e}')
    # ── 方案3python-docx 兼容尝试(部分另存的 .doc 实为 XML 格式)──────
    try:
        result = _extract_docx(file_path)
        if result.strip():
            logger.info(f'.doc 通过 python-docx 兼容读取成功: {file_path}')
            return result
    except Exception as e:
        logger.warning(f'python-docx 兼容读取 .doc 失败: {e}')
    raise RuntimeError(
        '无法读取 .doc 格式文件。请在 Word 中打开该文件,'
        '选择「另存为」→「Word 文档 (.docx)」后重新上传。'
    )
def truncate_text(text: str, max_chars: int = 60000) -> str:
    """Clip overly long text so it stays within the AI token budget.

    Text at or under *max_chars* is returned unchanged; otherwise the first
    *max_chars* characters are kept and a truncation notice is appended.
    """
    if len(text) > max_chars:
        return text[:max_chars] + '\n\n...[文档内容已截断,仅展示前段]'
    return text
def split_text_chunks(text: str, chunk_size: int = 2000, overlap: int = 200) -> list[str]:
    """Split text into fixed-size overlapping chunks (for the knowledge base).

    Consecutive chunks share *overlap* characters; the final chunk may be
    shorter than *chunk_size*. An empty string yields an empty list.

    Raises:
        ValueError: if chunk_size <= 0, or overlap is negative or not smaller
        than chunk_size — previously such values made the start index advance
        by zero or backwards, looping forever and growing the list unboundedly.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size 必须为正数')
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError('overlap 必须满足 0 <= overlap < chunk_size')
    chunks: list[str] = []
    step = chunk_size - overlap  # guaranteed > 0 by the checks above
    start = 0
    while start < len(text):
        # Slicing clamps at the end of the string automatically.
        chunks.append(text[start:start + chunk_size])
        start += step
    return chunks
def allowed_file(filename: str) -> bool:
    """Return True when the filename has a supported extension (pdf/doc/docx)."""
    _, dot, ext = filename.rpartition('.')
    return bool(dot) and ext.lower() in {'pdf', 'doc', 'docx'}
def safe_filename(filename: str) -> str:
    """Sanitize a filename: any character outside word chars, CJK ideographs,
    dot, and dash is replaced with an underscore."""
    import re
    return re.sub(r'[^\w\u4e00-\u9fff.\-]', '_', filename)

Some files were not shown because too many files have changed in this diff Show More