2026-04-24 14:44:38 +08:00

673 lines
30 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

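/**
 * bill-worker.js — PDF bill-of-quantities parsing scheduler (runs as a Worker Thread)
 *
 * Architecture (v3 — SharedArrayBuffer, zero-copy):
 *   Phase 1 — parallel text extraction
 *     Write the PDF bytes into a SharedArrayBuffer (allocated once, read by every child thread),
 *     then start N page-workers, each responsible for a fixed chunk of 20 pages.
 *
 *   Phase 2 — bill-page selection + text parsing (pure regex, millisecond scale)
 *     Gather the text of all pages → select bill pages by keyword → merge wrapped lines → parse line by line.
 */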
'use strict';
const { parentPort } = require('worker_threads');
const { Worker } = require('worker_threads');
const path = require('path');
const PAGES_PER_CHUNK = 20;
/**
 * Worker entry point: handles `{ type: 'parse', buffer }` messages from the parent thread.
 *
 * Pipeline:
 *   1. Copy the incoming PDF bytes, open them once with pdfjs to read the page count.
 *   2. Publish the bytes in a SharedArrayBuffer and extract page text via `parallelExtract`.
 *   3. Select the pages that look like bill-of-quantities pages (keywords / 9-digit codes),
 *      fill gaps between the first and last such page, and hand the text to `parseBillText`.
 *
 * Always answers with exactly one `{ type: 'done', ok, ... }` message; messages of any other
 * type are ignored.
 */
parentPort.on('message', async (msg) => {
  if (!msg || msg.type !== 'parse') return;
  const t0 = Date.now();
  try {
    // Take a private copy right away so we own an independent buffer.
    const raw = msg.buffer;
    if (!raw || !raw.byteLength) {
      parentPort.postMessage({ type: 'done', ok: false, error: '收到空 PDF 数据' });
      return;
    }
    const buf = Buffer.alloc(raw.byteLength);
    Buffer.from(raw).copy(buf);

    // ── Read the total page count ──
    const pdfjsModule = await import('pdfjs-dist/build/pdf.mjs');
    const pdfjsLib = pdfjsModule.default || pdfjsModule;
    // pdfjs gets its own copy because it may detach the buffer it is given.
    const pdfData = new Uint8Array(buf);
    const loadingTask = pdfjsLib.getDocument({ data: pdfData, isEvalSupported: false });
    let totalPages;
    try {
      const pdf = await loadingTask.promise;
      totalPages = pdf.numPages;
    } finally {
      // The document is only needed for numPages; release it instead of leaking it.
      try {
        await loadingTask.destroy();
      } catch (destroyErr) {
        console.warn('[BillWorker] 释放 pdfjs 文档失败:', (destroyErr && destroyErr.message) || String(destroyErr));
      }
    }

    // ── Publish the PDF bytes in a SharedArrayBuffer (one allocation, read by all page-workers) ──
    const sab = new SharedArrayBuffer(buf.length);
    new Uint8Array(sab).set(buf);
    const workerCount = Math.ceil(totalPages / PAGES_PER_CHUNK);
    console.log(`[BillWorker] PDF ${totalPages} 页, ${workerCount} 路并行 (SharedArrayBuffer ${(buf.length/1024/1024).toFixed(1)}MB)`);

    // Phase 1: parallel text extraction
    const pageTexts = await parallelExtract(sab, buf.length, totalPages, workerCount);
    const t1 = Date.now();
    const extractedCount = pageTexts.filter((t) => t.length > 0).length;
    console.log(`[BillWorker] Phase1 完成: ${t1 - t0}ms, ${extractedCount}/${totalPages} 页有文本`);

    // Scanned-document check: almost no extractable text at all.
    const totalChars = pageTexts.reduce((s, t) => s + t.length, 0);
    if (totalChars < 50) {
      parentPort.postMessage({ type: 'done', ok: true, data: { scanned: true, reason: 'noText', totalPages } });
      return;
    }

    // Phase 2: select bill pages (lenient match + gap filling)
    const BILL_KW = ['项目编码', '项目名称', '工程量', '计量单位', '综合单价', '清单编码'];
    const SEC_KW = ['分部分项', '分类分项', '措施项目', '其他项目', '工程量清单计价'];
    // Round 1: a page qualifies with >= 2 header keywords, any section keyword, or a 9-digit run.
    const billFlags = pageTexts.map((t) => {
      if (!t.trim()) return false;
      const hHits = BILL_KW.filter((k) => t.includes(k)).length;
      return hHits >= 2 || SEC_KW.some((k) => t.includes(k)) || /\d{9}/.test(t);
    });
    // Counted before gap filling so the log below reports the two numbers separately.
    const detectedCount = billFlags.filter(Boolean).length;

    // Round 2: non-empty pages between two bill pages are continuation pages (no table header),
    // except pure fee/tax pages, which contain no construction items.
    const FEE_PAGE_KW = ['规费', '税金', '社会保险费', '住房公积金', '养老保险',
      '工伤保险', '失业保险', '医疗保险', '教育费附加', '城市维护建设税'];
    const firstBill = billFlags.indexOf(true);
    const lastBill = billFlags.lastIndexOf(true);
    if (firstBill >= 0 && lastBill > firstBill) {
      for (let i = firstBill; i <= lastBill; i++) {
        if (billFlags[i] || !pageTexts[i] || pageTexts[i].trim().length <= 30) continue;
        const t = pageTexts[i];
        const feeHits = FEE_PAGE_KW.filter((kw) => t.includes(kw)).length;
        // >= 2 fee keywords and no 9-digit code → pure fee page, keep it excluded.
        if (feeHits >= 2 && !/\d{9}/.test(t)) continue;
        billFlags[i] = true;
      }
    }
    const billTexts = pageTexts.filter((_, i) => billFlags[i]);
    if (!billTexts.length) {
      parentPort.postMessage({ type: 'done', ok: true, data: { scanned: false, noBillPages: true, totalPages } });
      return;
    }
    console.log(`[BillWorker] ${totalPages} 页 → ${billTexts.length} 页清单 (原始识别 ${detectedCount} / 补全后 ${billTexts.length})`);

    // Phase 3: text parsing
    const merged = billTexts.join('\n');
    const parsed = parseBillText(merged);
    const t2 = Date.now();
    console.log(`[BillWorker] Phase2+3: ${t2 - t1}ms, 总耗时: ${t2 - t0}ms`);
    parentPort.postMessage({
      type: 'done', ok: true,
      data: {
        scanned: false,
        ...parsed,
        _meta: {
          method: 'local-parallel',
          workers: workerCount,
          billPages: billTexts.length,
          totalPages,
          extractMs: t1 - t0,
          parseMs: t2 - t1,
          totalMs: t2 - t0,
        }
      }
    });
  } catch (err) {
    // Non-Error throwables have no .message; fall back to their string form.
    const message = (err && err.message) || String(err);
    console.error('[BillWorker] 错误:', message);
    parentPort.postMessage({ type: 'done', ok: false, error: message });
  }
});
// ================================================================
// Phase 1: parallel extraction across page-workers (PDF bytes shared via SharedArrayBuffer)
// ================================================================
/**
 * Start `workerCount` page-worker threads, each covering a chunk of PAGES_PER_CHUNK pages,
 * and collect the text they report.
 *
 * Each worker receives `{ sab, dataLength, startPage, endPage }` as workerData and is expected
 * to post `{ ok: true, results: [{ page, text }] }` or `{ ok: false, error }`.
 *
 * @param {SharedArrayBuffer} sab  PDF bytes shared with every worker.
 * @param {number} dataLength      Number of valid bytes in `sab`.
 * @param {number} totalPages      Page count of the document (pages are 1-based).
 * @param {number} workerCount     Number of workers to start.
 * @returns {Promise<string[]>} One entry per page (index = page - 1). Pages whose worker failed,
 *   or that were reported with a non-string text, stay ''. The promise never rejects because of
 *   a worker failure; failures are only logged.
 */
function parallelExtract(sab, dataLength, totalPages, workerCount) {
  return new Promise((resolve) => {
    const workerPath = path.join(__dirname, 'page-worker.js');
    const allPageTexts = new Array(totalPages).fill('');
    if (workerCount <= 0) {
      resolve(allPageTexts);
      return;
    }

    let settledCount = 0;
    let failedCount = 0;
    // Called exactly once per worker; resolves when the last worker has settled.
    const settle = (status) => {
      settledCount++;
      if (status === 'failed') failedCount++;
      if (settledCount < workerCount) return;
      if (failedCount > 0) {
        console.warn(`[BillWorker] ${failedCount}/${workerCount} 个worker失败可能导致部分页面无内容`);
      }
      resolve(allPageTexts);
    };

    // Copy worker results into the page array, ignoring malformed entries so that callers can
    // rely on every slot being a string and on the array length staying `totalPages`.
    const storeResults = (results) => {
      for (const r of results) {
        if (!r || !Number.isInteger(r.page) || r.page < 1 || r.page > totalPages) continue;
        allPageTexts[r.page - 1] = typeof r.text === 'string' ? r.text : '';
      }
    };

    for (let i = 0; i < workerCount; i++) {
      const startPage = i * PAGES_PER_CHUNK + 1;
      const endPage = Math.min((i + 1) * PAGES_PER_CHUNK, totalPages);
      // The SharedArrayBuffer is shared across threads, not transferred or copied.
      const w = new Worker(workerPath, {
        workerData: { sab, dataLength, startPage, endPage }
      });
      let workerDone = false;
      const markDone = (status) => {
        if (workerDone) return;
        workerDone = true;
        settle(status);
      };
      w.on('message', (msg) => {
        if (!msg) return;
        if (msg.ok && Array.isArray(msg.results)) {
          storeResults(msg.results);
          markDone('done');
        } else if (!msg.ok) {
          console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 失败: ${msg.error}`);
          markDone('failed');
        }
        // ok without a results array: wait for the exit event to settle this worker.
      });
      w.on('error', (err) => {
        console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 异常: ${err.message}`);
        markDone('failed');
      });
      w.on('exit', (code) => {
        // Safety net for workers that end without ever posting a message.
        if (workerDone) return;
        if (code !== 0) {
          console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 意外退出(code=${code})`);
          markDone('failed');
        } else {
          markDone('done');
        }
      });
    }
  });
}
// ================================================================
// Phase 3: bill text parsing (pure regex + string handling)
// ================================================================
/**
 * Parse the concatenated text of the selected bill pages into categories of bill items.
 *
 * Steps:
 *   1. Normalise raw lines and merge wrapped lines into "logic lines" (one per table row).
 *   2. Walk the logic lines: rows containing a bill code (9-12 digits or B + 5-6 digits) become
 *      items; code-less rows shaped like "name unit quantity" become code-less items; short
 *      keyword lines become category titles; everything else is appended to the current
 *      item's spec.
 *   3. Drop fee items (`isFeeItem`), merge items with the same name inside a category, and
 *      drop categories that end up empty.
 *
 * @param {string} text  Page texts joined with '\n'.
 * @returns {{ project_summary: { remark: string },
 *             categories: Array<{ name: string,
 *               items: Array<{ code: string, name: string, unit: string, quantity: string, spec: string }> }> }}
 */
function parseBillText(text) {
  const rawLines = text.split(/\n/).map((l) => {
    const line = l.replace(/\t/g, ' ').trim();
    // Normalise dashed codes such as "010-101-001-001" to "010101001001"; only applied when
    // the joined digits have the length of a bill code (9-12).
    return line.replace(/(\d{2,4})[-](\d{2,4})[-](\d{2,4})(?:[-](\d{2,4}))?/g,
      (m, a, b, c, d) => {
        const combined = a + b + c + (d || '');
        return (combined.length >= 9 && combined.length <= 12) ? combined : m;
      });
  });

  // ── Step 1: merge physical lines into logic lines ──
  // A new logic line starts at: a dotted sequence number (1.1, 1.1.1 ...), a bill code at the
  // start of the line, "sequence number + code", or a category marker followed by a space.
  // Header rows and page marks are skipped.
  const ITEM_START = /^\d+(\.\d+)+\s/;
  // Code anywhere in the line; the lookbehind keeps standard numbers such as GB/DB out.
  const CODE_INLINE = /(?:^|\s)(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s/;
  const CODE_START_RE = /^(\d{9,12}|B\d{5,6})\s/;
  const SEQ_CODE_RE = /^\d{1,4}\s+(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s/;
  const PAGE_MARK = /^--\s*\d+\s+of\s+\d+\s*--/;
  const HEADER_RE = /^序号\s+(项目编码|项目名称)/;
  const HEADER_KW = /^(项目编码|项目名称|清单编码|计量单位|综合单价|工程量|合\s*价|金额|序号)\s/;
  // NOTE(review): the parentheses around 元 form a capture group, so this skips lines that
  // start with 元, not lines that start with a literal "(元)" — confirm which was intended.
  const NOISE_RE = /^(元)|^款章节号|^备注$|^第\d+页/;
  const CATEGORY_MARKERS = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
    '(一)', '(二)', '(三)', '(四)', '(五)'];
  const isNewLineTrigger = (raw) =>
    ITEM_START.test(raw)
    || CODE_START_RE.test(raw)
    || SEQ_CODE_RE.test(raw)
    || CATEGORY_MARKERS.some((m) => raw.startsWith(m + ' ') || raw.startsWith(m + '\u3000'));

  const logicLines = [];
  let currentLine = '';
  for (const raw of rawLines) {
    if (!raw || PAGE_MARK.test(raw)) continue;
    if (HEADER_RE.test(raw) || HEADER_KW.test(raw)) continue;
    if (NOISE_RE.test(raw)) continue;
    // A long line with an inline code looks like a complete table row → also starts a new line.
    const startsNew = isNewLineTrigger(raw) || (CODE_INLINE.test(raw) && raw.length > 15);
    // Safety valve: cut an over-long merged line so one item cannot swallow a whole page.
    const overflow = Boolean(currentLine) && currentLine.length > 300;
    if (startsNew || overflow) {
      if (currentLine) logicLines.push(currentLine);
      currentLine = raw;
    } else {
      currentLine = currentLine ? currentLine + ' ' + raw : raw;
    }
  }
  if (currentLine) logicLines.push(currentLine);
  console.log(`[BillWorker] 合并后 ${logicLines.length} 条逻辑行(原始 ${rawLines.length} 行)`);
  // Print the first 5 logic lines for debugging.
  for (let i = 0; i < Math.min(5, logicLines.length); i++) {
    console.log(`[BillWorker] L${i}: ${logicLines[i].substring(0, 120)}`);
  }

  // ── Step 2: classify logic lines ──
  const categories = [];
  let curCat = null;
  let curItem = null;
  const CODE_RE = /(?<![A-Za-z])(\d{9,12}|(?<![A-Za-z])B\d{5,6})/;
  const UNIT_TOKENS = ['m³','m²','m3','m2','km','hm2','㎡','㎥','t','kg',
    '个','台','套','组','根','块','片','张','只','吨','项',
    '处','座','件','段','条','把','扇','口','圈','道','孔',
    '对','副','樘','方','延m','株','棵','m'];
  const UNIT_SET = new Set(UNIT_TOKENS);
  const unitEscaped = UNIT_TOKENS.map((u) => u.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
  const UNIT_RE = new RegExp(`(?:^|\\s)(${unitEscaped.join('|')})(?=\\s|\\d|$)`);
  const SKIP_RE = /合\s*计|小\s*计|本页小计|总\s*计|价税合计/;
  // Leading sequence number ("1.1.1.1.5 " or "5 "). The lookahead keeps a bill code that sits
  // at the very start of the line from being mistaken for a sequence number and stripped.
  const LEADING_SEQ_RE = /^(?!\d{9,12}\s)\d+(\.\d+)*\s+/;
  const ensureCategory = () => {
    if (!curCat) { curCat = { name: '未分类', items: [] }; categories.push(curCat); }
  };

  for (const line of logicLines) {
    if (SKIP_RE.test(line)) continue;
    let stripped = line.replace(LEADING_SEQ_RE, '').trim();
    if (!stripped) stripped = line.trim();
    if (!stripped) continue;

    const cm = stripped.match(CODE_RE);
    if (cm) {
      if (curItem && curCat) curCat.items.push(curItem);
      ensureCategory();
      const code = cm[1];
      const rest = stripped.substring(cm.index + cm[0].length).trim();
      let name = '';
      let unit = '';
      let quantity = '';
      let spec = '';
      const unitMatch = rest.match(UNIT_RE);
      if (unitMatch) {
        // Use the regex match position; searching for the matched text again could land on an
        // earlier look-alike (e.g. " mm" when the match was " m").
        const ui = unitMatch.index;
        const rawName = rest.substring(0, ui).trim();
        unit = unitMatch[1];
        const afterUnit = rest.substring(ui + unitMatch[0].length).trim();
        const qm = afterUnit.match(/^([\d,.]+)/);
        if (qm) {
          quantity = qm[1];
          // Text after the quantity: skip purely numeric columns (unit price, total, ...).
          const tail = afterUnit.substring(qm.index + qm[0].length).trim();
          if (tail) {
            const tailTokens = tail.split(/\s+/);
            let si = 0;
            while (si < tailTokens.length && /^[\d,.%\-]+$/.test(tailTokens[si])) si++;
            const specTail = tailTokens.slice(si).join(' ').trim();
            if (specTail) spec = specTail;
          }
        }
        // Separate the item name from an inline feature description.
        const ns = splitNameAndSpec(rawName);
        name = ns.name;
        if (ns.spec) spec = ns.spec + (spec ? ';' + spec : '');
      } else {
        // No unit found by regex: look for a standalone unit token, scanning from the right.
        const tokens = rest.split(/\s+/).filter((t) => t);
        let foundUnitIdx = -1;
        for (let ti = tokens.length - 1; ti >= 1; ti--) {
          if (UNIT_SET.has(tokens[ti])) { foundUnitIdx = ti; break; }
        }
        if (foundUnitIdx >= 1) {
          const ns = splitNameAndSpec(tokens.slice(0, foundUnitIdx).join(' '));
          name = ns.name;
          if (ns.spec) spec = ns.spec;
          unit = tokens[foundUnitIdx];
          const afterTokens = tokens.slice(foundUnitIdx + 1);
          if (afterTokens.length && /^[\d,.]+$/.test(afterTokens[0])) {
            quantity = afterTokens[0];
            let si = 1;
            while (si < afterTokens.length && /^[\d,.%\-]+$/.test(afterTokens[si])) si++;
            const specTail = afterTokens.slice(si).join(' ').trim();
            if (specTail) spec = spec ? spec + ';' + specTail : specTail;
          }
        } else {
          name = rest;
        }
      }
      name = name.replace(/\s+/g, '').trim();
      // Last resort for a unit glued to the end of the name. Only applied when no unit was
      // found above; otherwise names that merely end in a unit character would be truncated.
      if (!unit) {
        for (const u of UNIT_TOKENS) {
          if (name.endsWith(u) && name.length > u.length) {
            unit = u;
            name = name.substring(0, name.length - u.length);
            break;
          }
        }
      }
      curItem = { code, name, unit, quantity, spec };
      continue;
    }

    // ── Fallback: no code, but a "name unit quantity" shape → still treated as an item ──
    if (stripped.length > 4) {
      const uniMatch = stripped.match(UNIT_RE);
      if (uniMatch) {
        const ui = uniMatch.index;
        const beforeUnit = stripped.substring(0, ui).trim();
        const afterUnit = stripped.substring(ui + uniMatch[0].length).trim();
        const qm = afterUnit.match(/^([\d,.]+)/);
        // Name of 2-50 chars containing CJK text, followed by a quantity.
        if (beforeUnit.length >= 2 && beforeUnit.length <= 50 && qm
            && /[\u4e00-\u9fff]/.test(beforeUnit)) {
          if (curItem && curCat) curCat.items.push(curItem);
          ensureCategory();
          const ns = splitNameAndSpec(beforeUnit);
          curItem = {
            code: '',
            name: ns.name.replace(/\s+/g, '').trim(),
            unit: uniMatch[1],
            quantity: qm[1],
            spec: ns.spec || '',
          };
          continue;
        }
      }
    }

    // Category title: short, code-less text containing an engineering keyword. A line that has
    // a unit and ends in a number is item data, not a title, and goes to the current item's spec.
    if (stripped.length > 2 && stripped.length < 60 && !CODE_RE.test(stripped)) {
      if (UNIT_RE.test(stripped) && /\d+\.?\d*\s*$/.test(stripped)) {
        if (curItem) curItem.spec = curItem.spec ? curItem.spec + ';' + stripped : stripped;
        continue;
      }
      if (isCatTitle(stripped) && !UNIT_RE.test(stripped) && !isFeeCatTitle(stripped)) {
        if (curItem && curCat) { curCat.items.push(curItem); curItem = null; }
        const cleanTitle = stripped.replace(/\s+(座|个|项|处|m|km|段|条)\s+\d+[\d.]*\s*$/, '').trim();
        curCat = { name: cleanTitle, items: [] };
        categories.push(curCat);
        continue;
      }
    }
    // Lines starting with a Chinese numeral or a digit are treated as numbered titles.
    if (/^[一二三四五六七八九十]+\s/.test(stripped) || /^[一二三四五六七八九十\d]+/.test(stripped)) {
      const cleanTitle = stripped.replace(/\s+(座|个|项|处)\s+\d+[\d.]*\s*$/, '').trim();
      // Fee-type titles never open a category; the lines below them are handled as usual.
      if (isFeeCatTitle(cleanTitle)) continue;
      if (curItem && curCat) { curCat.items.push(curItem); curItem = null; }
      curCat = { name: cleanTitle, items: [] };
      categories.push(curCat);
      continue;
    }
    if (curItem && stripped.length > 1) {
      curItem.spec = curItem.spec ? curItem.spec + ';' + stripped : stripped;
    }
  }
  if (curItem && curCat) curCat.items.push(curItem);

  // ── Step 3a: drop fee items ──
  let feeFiltered = 0;
  for (const cat of categories) {
    if (!cat.items) continue;
    const before = cat.items.length;
    cat.items = cat.items.filter((it) => !isFeeItem(it.name));
    feeFiltered += before - cat.items.length;
  }
  if (feeFiltered > 0) console.log(`[BillWorker] 费用项过滤: 移除 ${feeFiltered}`);

  // ── Step 3b: merge items with the same name inside a category ──
  //   code / unit: first non-empty value
  //   quantity:    numeric sum when every value parses, otherwise joined with '; '
  //   spec:        distinct values joined with '; ' (120 chars each, 300 in total)
  // Items whose name is empty are dropped here.
  let totalBeforeMerge = 0;
  let totalAfterMerge = 0;
  for (const cat of categories) {
    if (!cat.items || !cat.items.length) continue;
    totalBeforeMerge += cat.items.length;
    const nameMap = new Map();
    for (const item of cat.items) {
      const key = (item.name || '').replace(/\s+/g, '').trim();
      if (!key) continue;
      const m = nameMap.get(key);
      if (!m) {
        nameMap.set(key, {
          code: item.code || '',
          name: item.name,
          unit: item.unit || '',
          quantities: item.quantity ? [item.quantity] : [],
          specs: item.spec ? [item.spec] : [],
        });
        continue;
      }
      if (!m.code && item.code) m.code = item.code;
      if (!m.unit && item.unit) m.unit = item.unit;
      if (item.quantity) m.quantities.push(item.quantity);
      if (item.spec && !m.specs.includes(item.spec)) m.specs.push(item.spec);
    }

    const merged = [];
    for (const m of nameMap.values()) {
      let quantity = '';
      if (m.quantities.length > 1) {
        const nums = m.quantities.map((q) => parseFloat(q.replace(/,/g, '')));
        if (nums.every((n) => !Number.isNaN(n))) {
          const sum = nums.reduce((a, b) => a + b, 0);
          quantity = sum % 1 === 0 ? String(sum) : sum.toFixed(2);
        } else {
          quantity = m.quantities.join('; ');
        }
      } else if (m.quantities.length === 1) {
        quantity = m.quantities[0];
      }
      let spec = '';
      if (m.specs.length > 0) {
        spec = m.specs.map((s) => (s.length > 120 ? s.substring(0, 120) + '...' : s)).join('; ');
        if (spec.length > 300) spec = spec.substring(0, 300) + '...';
      }
      merged.push({ code: m.code, name: m.name, unit: m.unit, quantity, spec });
    }
    cat.items = merged;
    totalAfterMerge += merged.length;
  }
  const mergedCount = totalBeforeMerge - totalAfterMerge;
  if (mergedCount > 0) {
    console.log(`[BillWorker] 按名称合并: ${totalBeforeMerge} → ${totalAfterMerge} 项(合并 ${mergedCount} 个重复项)`);
  }

  // ── Step 3c: keep non-empty categories and report ──
  const valid = categories.filter((c) => c.items && c.items.length > 0);
  const totalItems = valid.reduce((s, c) => s + c.items.length, 0);
  const withSpec = valid.reduce((s, c) => s + c.items.filter((it) => it.spec).length, 0);
  const withCode = valid.reduce((s, c) => s + c.items.filter((it) => it.code).length, 0);
  console.log(`[BillWorker] 最终结果: ${valid.length} 分部, ${totalItems} 清单项 (${withCode} 有编码, ${withSpec} 有spec)`);
  // Print the first 3 items for debugging.
  let debugCount = 0;
  for (const cat of valid) {
    for (const it of cat.items) {
      if (debugCount >= 3) break;
      console.log(`[BillWorker] 样例: [${it.code}] ${it.name} | ${it.unit} | qty=${it.quantity} | spec=${(it.spec||'').substring(0, 80)}`);
      debugCount++;
    }
  }
  return {
    project_summary: { remark: `本地解析:${valid.length} 个分部,${totalItems} 个清单项(合并前 ${totalBeforeMerge} 项)` },
    categories: valid,
  };
}
/**
 * Tell whether a bill item name denotes a fee (non-construction content that is filtered out
 * of the parsed result), e.g. safety/civilised-construction fees, statutory fees, taxes,
 * provisional sums.
 *
 * Whitespace inside the name is ignored. A name is a fee when it equals one of the exact
 * names or contains one of the fee keywords.
 *
 * @param {string} name  Item name; falsy values yield false.
 * @returns {boolean}
 */
function isFeeItem(name) {
  if (!name) return false;
  const compact = name.replace(/\s+/g, '');

  // 1. Whole-name matches.
  const exactNames = new Set([
    '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
    '总承包服务费', '企业管理费', '甲供材料保管费', '价税合计',
  ]);
  if (exactNames.has(compact)) return true;

  // 2. Substring matches: measure fees, statutory fees, insurance, administrative charges.
  const feeKeywords = [
    '安全文明', '文明施工费', '环境保护费', '临时设施费',
    '夜间施工增加费', '夜间施工费',
    '冬雨季施工增加费', '冬雨季施工费',
    '二次搬运费', '大型机械设备进出场', '大型机械进出场',
    '施工排水降水', '排水降水费',
    '已完工程及设备保护', '已完工程保护费',
    '工程排污费', '社会保障费', '住房公积金',
    '工伤保险', '劳动保险', '意外伤害保险', '建筑工程保险',
    '城市维护建设税', '城市建设维护税',
    '教育费附加', '地方教育附加',
    '材料暂估', '专业工程暂估',
    '超高施工增加费', '安全防护费',
    '措施项目费', '其他项目费', '不可竞争费',
  ];
  return feeKeywords.some((keyword) => compact.includes(keyword));
}
/**
 * Separate the item name from an inline feature description inside `rawName`.
 * Example: "土方开挖 1.土壤类别:普通土" → { name: "土方开挖", spec: "1.土壤类别:普通土" }
 *
 * The text is split at the first of these markers that occurs after position 0
 * (patterns are tried in this order):
 *   1. a numbered entry: digits + one of . 、 ) (ASCII or full-width) + a CJK character
 *   2. a feature keyword followed by a colon (ASCII or full-width U+FF1A)
 *   3. a parenthesised number such as "(1)" (ASCII or full-width parentheses)
 * Full-width punctuation is written as \u escapes so the patterns survive tools that
 * normalise or strip such characters.
 *
 * @param {string} rawName  Text between the bill code and the unit.
 * @returns {{ name: string, spec: string }} Both parts trimmed; `spec` is '' when no marker is
 *   found, and a falsy `rawName` yields two empty strings.
 */
function splitNameAndSpec(rawName) {
  if (!rawName) return { name: '', spec: '' };
  const SPLIT_PATTERNS = [
    // Pattern 1: "1.土壤类别", "2、强度等级", "3)材质"
    /\d+[.、)\uFF0E\uFF09]\s*[\u4e00-\u9fff]/,
    // Pattern 2: "材质:", "规格:" ...
    /(材质|规格|型号|品牌|颜色|尺寸|厚度|直径|管径|强度|等级|类别|类型|做法|要求|标准|内容|工作内容|土壤|含量|配合比|工艺|方式|形式|范围|部位|位置|高度|宽度|长度|深度|坡度|截面|跨度|运距|开挖|回填|混凝土|钢筋|压实)[:\uFF1A]/,
    // Pattern 3: "(1)"
    /[(\uFF08]\d+[)\uFF09]/,
  ];
  for (const pattern of SPLIT_PATTERNS) {
    const hit = rawName.match(pattern);
    // A marker at position 0 would leave an empty name, so it is not used as a split point.
    if (hit && hit.index > 0) {
      return {
        name: rawName.substring(0, hit.index).trim(),
        spec: rawName.substring(hit.index).trim(),
      };
    }
  }
  return { name: rawName, spec: '' };
}
/**
 * Tell whether a line looks like a category (work section) title, i.e. whether it contains
 * at least one of the engineering keywords below.
 *
 * @param {string} text  Candidate title line.
 * @returns {boolean}
 */
function isCatTitle(text) {
  const titleKeywords = [
    '土建','建筑','结构','装饰','装修','安装','给排水','暖通','空调','通风',
    '电气','强电','弱电','消防','智能化','幕墙','门窗','园林','绿化','景观',
    '市政','道路','桥梁','管网','基础','地基','桩基','主体','屋面','防水',
    '保温','钢结构','排水','给水','照明','动力','防雷','电梯','人防','室外',
    '附属','分部','工程','措施','清单','土石方','混凝土','砌筑','模板','脚手架',
    '水利','河道','管道','阀门','设备','仪表','自动化','通信','网络',
    '拆除','外墙','内墙','楼地面','天棚','吊顶','栏杆','屋顶','涂料','抹灰',
    '廊道','阀门井','蓄水池','泵站','供水','引水','水源','渠道','闸门',
    '围栏','警示','检修','管线','配电','水池','水塔','取水','净水',
  ];
  for (const keyword of titleKeywords) {
    if (text.includes(keyword)) return true;
  }
  return false;
}
/**
 * Tell whether a category title is a fee-type title (statutory fees, taxes, measure-item
 * fees, other-item fees, ...) for which no category should be created.
 *
 * Whitespace inside the title is ignored. The title is fee-type when it equals one of the
 * exact titles or contains one of the fee keywords.
 *
 * @param {string} text  Candidate title; falsy values yield false.
 * @returns {boolean}
 */
function isFeeCatTitle(text) {
  if (!text) return false;
  const compact = text.replace(/\s+/g, '');

  // Whole-title matches.
  const exactTitles = new Set([
    '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
    '总承包服务费', '企业管理费', '价税合计',
    '措施项目费', '其他项目费', '不可竞争费',
  ]);
  if (exactTitles.has(compact)) return true;

  // Substring matches.
  const feeTitleKeywords = [
    '措施项目费', '其他项目费', '不可竞争费',
    '规费汇总', '税金汇总', '费率', '费用汇总', '费用合计',
    '暂列金额', '暂估价', '计日工', '总承包服务费',
    '安全文明施工费', '社会保障费', '住房公积金',
    '工伤保险', '教育费附加', '城市维护建设税',
  ];
  return feeTitleKeywords.some((keyword) => compact.includes(keyword));
}