/**
 * bill-worker.js — PDF bill-of-quantities parsing dispatcher (Worker Thread)
 *
 * Architecture (v3 — SharedArrayBuffer, zero-copy):
 *   Phase 1 — parallel text extraction
 *     Write the PDF data into a SharedArrayBuffer (allocated once, read by
 *     every child thread).
 *     Start N page-workers, each responsible for a fixed chunk of 20 pages.
 *
 *   Phase 2 — bill page selection + text parsing (pure regex, milliseconds)
 *     Collect the text of all pages → keyword-filter the bill pages →
 *     merge wrapped lines → parse line by line.
 */

'use strict';

const { parentPort, Worker } = require('worker_threads');
const path = require('path');

// Number of PDF pages assigned to each page-worker.
const PAGES_PER_CHUNK = 20;

/**
 * Decides which pages belong to the bill of quantities.
 *
 * Pass 1 flags a non-blank page when it contains at least two table-header
 * keywords, any section keyword, or a run of nine digits (a bill code).
 * Pass 2 fills the gaps: an unflagged page lying between the first and last
 * flagged page is flagged too when it has more than 30 non-blank characters,
 * unless it mentions two or more fee/tax keywords (pure fee pages carry no
 * construction items).
 *
 * @param {string[]} pageTexts - extracted text, one entry per page
 * @returns {{ flags: boolean[], detected: number }} per-page flags after both
 *   passes, and the number of pages flagged by pass 1 alone
 */
function selectBillPages(pageTexts) {
  const HEADER_KEYWORDS = ['项目编码', '项目名称', '工程量', '计量单位', '综合单价', '清单编码'];
  const SECTION_KEYWORDS = ['分部分项', '分类分项', '措施项目', '其他项目', '工程量清单计价'];
  const FEE_PAGE_KEYWORDS = [
    '规费', '税金', '社会保险费', '住房公积金', '养老保险',
    '工伤保险', '失业保险', '医疗保险', '教育费附加', '城市维护建设税',
  ];
  const NINE_DIGITS = /\d{9}/;

  // Pass 1: pages that are certainly bill pages.
  const flags = pageTexts.map((pageText) => {
    if (!pageText.trim()) return false;
    const headerHits = HEADER_KEYWORDS.filter((k) => pageText.includes(k)).length;
    return headerHits >= 2
      || SECTION_KEYWORDS.some((k) => pageText.includes(k))
      || NINE_DIGITS.test(pageText);
  });
  // Counted before pass 2 mutates the flags, so the log can report both numbers.
  const detected = flags.filter(Boolean).length;

  // Pass 2: continuation pages (no header) between two bill pages.
  const first = flags.indexOf(true);
  const last = flags.lastIndexOf(true);
  for (let i = first + 1; first >= 0 && i < last; i++) {
    const pageText = pageTexts[i];
    if (flags[i] || !pageText || pageText.trim().length <= 30) continue;
    // An unflagged non-blank page cannot contain a nine-digit code (pass 1
    // would have flagged it), so two fee keywords are enough to exclude it.
    const feeHits = FEE_PAGE_KEYWORDS.filter((kw) => pageText.includes(kw)).length;
    if (feeHits >= 2) continue;
    flags[i] = true;
  }

  return { flags, detected };
}

/**
 * Handles a `{ type: 'parse', buffer }` request from the parent thread and
 * always answers with exactly one `{ type: 'done', ok, ... }` message:
 *   - ok: false, error            — empty input or any thrown error
 *   - ok: true, data.scanned      — fewer than 50 characters of text in total
 *   - ok: true, data.noBillPages  — text found but no page looks like a bill
 *   - ok: true, data.categories…  — parsed bill plus timing metadata in `_meta`
 * Messages of any other type are ignored.
 */
parentPort.on('message', async (msg) => {
  if (!msg || msg.type !== 'parse') return;
  const t0 = Date.now();
  try {
    // Take a private copy right away so this thread owns its own memory.
    const raw = msg.buffer;
    const buf = Buffer.alloc(raw ? raw.byteLength : 0);
    if (buf.length > 0) Buffer.from(raw).copy(buf);

    if (buf.length === 0) {
      parentPort.postMessage({ type: 'done', ok: false, error: '收到空 PDF 数据' });
      return;
    }

    // ── Total page count ──
    const pdfjsModule = await import('pdfjs-dist/build/pdf.mjs');
    const pdfjsLib = pdfjsModule.default || pdfjsModule;
    // pdfjs gets its own copy because it may detach the buffer it is given.
    const pdfData = new Uint8Array(buf);
    const loadingTask = pdfjsLib.getDocument({ data: pdfData, isEvalSupported: false });
    let totalPages;
    try {
      const pdf = await loadingTask.promise;
      totalPages = pdf.numPages;
    } finally {
      // Only the page count is needed here; release the document before the
      // page-workers each open their own. A cleanup failure must not mask the
      // real outcome, so it is only logged.
      try {
        await loadingTask.destroy();
      } catch (cleanupErr) {
        console.warn('[BillWorker] pdf cleanup failed:', cleanupErr && cleanupErr.message);
      }
    }

    // ── Copy the PDF into a SharedArrayBuffer (one allocation, shared by all workers) ──
    const sab = new SharedArrayBuffer(buf.length);
    new Uint8Array(sab).set(buf);

    const workerCount = Math.ceil(totalPages / PAGES_PER_CHUNK);
    console.log(`[BillWorker] PDF ${totalPages} 页, ${workerCount} 路并行 (SharedArrayBuffer ${(buf.length/1024/1024).toFixed(1)}MB)`);

    // Phase 1: parallel text extraction
    const pageTexts = await parallelExtract(sab, buf.length, totalPages, workerCount);
    const t1 = Date.now();

    const extractedCount = pageTexts.filter((t) => t.length > 0).length;
    console.log(`[BillWorker] Phase1 完成: ${t1 - t0}ms, ${extractedCount}/${totalPages} 页有文本`);

    // Scanned-document check: practically no text layer at all.
    const totalChars = pageTexts.reduce((sum, t) => sum + t.length, 0);
    if (totalChars < 50) {
      parentPort.postMessage({ type: 'done', ok: true, data: { scanned: true, reason: 'noText', totalPages } });
      return;
    }

    // Phase 2: pick the bill pages (lenient detection + gap filling)
    const { flags: billFlags, detected } = selectBillPages(pageTexts);
    const billTexts = pageTexts.filter((_, i) => billFlags[i]);

    if (!billTexts.length) {
      parentPort.postMessage({ type: 'done', ok: true, data: { scanned: false, noBillPages: true, totalPages } });
      return;
    }

    console.log(`[BillWorker] ${totalPages} 页 → ${billTexts.length} 页清单 (原始识别 ${detected} / 补全后 ${billTexts.length})`);

    // Phase 3: text parsing
    const merged = billTexts.join('\n');
    const parsed = parseBillText(merged);
    const t2 = Date.now();
    console.log(`[BillWorker] Phase2+3: ${t2 - t1}ms, 总耗时: ${t2 - t0}ms`);

    parentPort.postMessage({
      type: 'done', ok: true,
      data: {
        scanned: false,
        ...parsed,
        _meta: {
          method: 'local-parallel',
          workers: workerCount,
          billPages: billTexts.length,
          totalPages,
          extractMs: t1 - t0,
          parseMs: t2 - t1,
          totalMs: t2 - t0,
        }
      }
    });
  } catch (err) {
    console.error('[BillWorker] 错误:', err.message);
    parentPort.postMessage({ type: 'done', ok: false, error: err.message });
  }
});

// ================================================================
// Phase 1: parallel extraction with multiple Workers (SharedArrayBuffer, zero-copy)
// ================================================================

/**
 * Extracts the text of every page by fanning out to `workerCount`
 * page-workers, each given a contiguous range of PAGES_PER_CHUNK pages.
 *
 * The returned promise never rejects because of a worker: a worker that
 * reports `ok: false`, emits `error`, or exits with a non-zero code is
 * logged and counted as failed, and its pages stay as empty strings.
 *
 * @param {SharedArrayBuffer} sab - PDF bytes, passed to each worker as workerData
 * @param {number} dataLength - number of valid bytes in `sab`
 * @param {number} totalPages - page count of the document
 * @param {number} workerCount - number of page-workers to start
 * @returns {Promise<string[]>} array of length `totalPages`; index i holds the
 *   text of page i + 1, or '' when nothing was extracted for it
 */
function parallelExtract(sab, dataLength, totalPages, workerCount) {
  return new Promise((resolve) => {
    const allPageTexts = new Array(totalPages).fill('');

    // Nothing to start: answer immediately, before touching any worker setup.
    if (workerCount <= 0) {
      resolve(allPageTexts);
      return;
    }

    const workerPath = path.join(__dirname, 'page-worker.js');
    let remaining = workerCount;
    let failedCount = 0;

    // Called exactly once per worker; resolves when the last one is accounted for.
    const settleOne = (failed) => {
      if (failed) failedCount++;
      remaining--;
      if (remaining > 0) return;
      if (failedCount > 0) {
        console.warn(`[BillWorker] ${failedCount}/${workerCount} 个worker失败,可能导致部分页面无内容`);
      }
      resolve(allPageTexts);
    };

    for (let i = 0; i < workerCount; i++) {
      const startPage = i * PAGES_PER_CHUNK + 1;
      const endPage = Math.min((i + 1) * PAGES_PER_CHUNK, totalPages);

      // The SharedArrayBuffer is shared across threads rather than transferred,
      // so it stays usable here and in every other worker.
      const w = new Worker(workerPath, {
        workerData: { sab, dataLength, startPage, endPage }
      });

      let workerDone = false;
      const markDone = (failed) => {
        if (workerDone) return;
        workerDone = true;
        settleOne(failed);
      };

      w.on('message', (msg) => {
        // A malformed message is ignored; the exit handler below settles the worker.
        if (!msg || typeof msg !== 'object') return;
        if (msg.ok && Array.isArray(msg.results)) {
          for (const r of msg.results) {
            // Ignore entries whose page number falls outside the document so a
            // bad index cannot grow or corrupt the result array.
            const idx = r ? Number(r.page) - 1 : NaN;
            if (!Number.isInteger(idx) || idx < 0 || idx >= totalPages) continue;
            // Callers read .length / .trim() on every entry, so keep them strings.
            allPageTexts[idx] = typeof r.text === 'string' ? r.text : '';
          }
          markDone(false);
        } else if (!msg.ok) {
          console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 失败: ${msg.error}`);
          markDone(true);
        }
      });

      w.on('error', (err) => {
        console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 异常: ${err.message}`);
        markDone(true);
      });

      w.on('exit', (code) => {
        // Safety net for a worker that ended without ever reporting a result.
        if (workerDone) return;
        if (code !== 0) {
          console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 意外退出(code=${code})`);
          markDone(true);
        } else {
          markDone(false);
        }
      });
    }
  });
}

// ================================================================
// Phase 3: bill text parsing (pure regex + string handling, milliseconds)
// ================================================================

// Measurement units recognised in bill rows. Longer spellings come before the
// bare 'm' so that e.g. "m3" is not read as unit "m" followed by quantity "3".
const BILL_UNIT_TOKENS = ['m³','m²','m3','m2','km','hm2','㎡','㎥','t','kg',
  '个','台','套','组','根','块','片','张','只','吨','项',
  '处','座','件','段','条','把','扇','口','圈','道','孔',
  '对','副','樘','方','延m','株','棵','m'];
const BILL_UNIT_ESCAPED = BILL_UNIT_TOKENS.map((u) => u.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
// A unit must be preceded by start/whitespace and followed by whitespace, a digit or the end.
const BILL_UNIT_RE = new RegExp(`(?:^|\\s)(${BILL_UNIT_ESCAPED.join('|')})(?=\\s|\\d|$)`);

// Bill code anywhere in a row: 9-12 digits, or B + 5-6 digits; a preceding
// letter (GB/DB standard numbers) disqualifies the match.
const BILL_CODE_RE = /(?<![A-Za-z])(\d{9,12}|(?<![A-Za-z])B\d{5,6})/;
// Sub-total / total rows are never bill items.
const BILL_SKIP_RE = /合\s*计|小\s*计|本页小计|总\s*计|价税合计/;
// Leading sequence prefix ("1.1.1.1.5 " or "5 "). The lookahead keeps a bill
// code that sits at the very start of the row from being stripped as if it
// were a sequence number.
const BILL_LEADING_SEQ_RE = /^(?!\d{9,12}\s)\d+(\.\d+)*\s+/;
// Titles numbered with Chinese numerals ("一 …") or a parenthesised numeral
// ("(一)…", "(1)…", ASCII or fullwidth parentheses).
const BILL_NUMBERED_TITLE_RE = /^[一二三四五六七八九十]+\s|^[(\uFF08][一二三四五六七八九十\d]+[)\uFF09]/;

/**
 * Normalises one physical line: tabs become spaces, the line is trimmed, and
 * dash-separated codes such as "010-101-001-001" are joined into
 * "010101001001" when the joined length is 9-12 digits.
 */
function normalizeBillLine(line) {
  const flat = line.replace(/\t/g, ' ').trim();
  return flat.replace(/(\d{2,4})[-‐–](\d{2,4})[-‐–](\d{2,4})(?:[-‐–](\d{2,4}))?/g,
    (whole, a, b, c, d) => {
      const combined = a + b + c + (d || '');
      return (combined.length >= 9 && combined.length <= 12) ? combined : whole;
    });
}

/**
 * Merges physical lines into logical table rows.
 *
 * Page markers, table-header lines and a few boilerplate cells are dropped.
 * A new logical row starts when a line begins with a dotted sequence number,
 * a bill code, "sequence + code", or a Chinese-numeral heading, or when it
 * is longer than 15 characters and contains a code. Every other line is
 * appended to the current row, except that a row already longer than 300
 * characters is closed first so one row cannot swallow a whole page.
 */
function mergeBillLogicLines(rawLines) {
  const ITEM_START = /^\d+(\.\d+)+\s/;
  const CODE_INLINE = /(?:^|\s)(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s/;
  const CODE_START_RE = /^(\d{9,12}|B\d{5,6})\s/;
  const SEQ_CODE_RE = /^\d{1,4}\s+(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s/;
  const PAGE_MARK = /^--\s*\d+\s+of\s+\d+\s*--/;
  const HEADER_RE = /^序号\s+(项目编码|项目名称)/;
  const HEADER_KW = /^(项目编码|项目名称|清单编码|计量单位|综合单价|工程量|合\s*价|金额|序号)\s/;
  // Heading markers followed by an ASCII or ideographic space; kept in step
  // with BILL_NUMBERED_TITLE_RE so every numeral the parser treats as a title
  // also starts a new row here.
  const CATEGORY_START = /^(?:[一二三四五六七八九十]+|[(\uFF08][一二三四五六七八九十]+[)\uFF09])[ \u3000]/;
  // "(元)" unit cells (with or without parentheses) and other boilerplate.
  const NOISE_RE = /^[(\uFF08]?元|^款章节号|^备注$|^第\d+页/;

  const logicLines = [];
  let current = '';
  for (const raw of rawLines) {
    if (!raw || PAGE_MARK.test(raw)) continue;
    if (HEADER_RE.test(raw) || HEADER_KW.test(raw)) continue;
    if (NOISE_RE.test(raw)) continue;

    const startsRow = ITEM_START.test(raw)
      || CODE_START_RE.test(raw)
      || SEQ_CODE_RE.test(raw)
      || CATEGORY_START.test(raw)
      || (CODE_INLINE.test(raw) && raw.length > 15);

    if (startsRow || current.length > 300) {
      if (current) logicLines.push(current);
      current = raw;
    } else {
      current = current ? current + ' ' + raw : raw;
    }
  }
  if (current) logicLines.push(current);
  return logicLines;
}

/**
 * Builds an item from the text that follows a bill code:
 * "<name [inline spec]> <unit> <quantity> <prices…> <trailing spec>".
 * Without a recognisable unit the whole remainder becomes the name.
 */
function parseCodedBillRow(code, rest) {
  let name = '';
  let unit = '';
  let quantity = '';
  let spec = '';

  const unitMatch = rest.match(BILL_UNIT_RE);
  if (unitMatch) {
    // Use the match position itself: searching for the matched text again can
    // land on an earlier look-alike (" m" inside " mm…").
    const unitStart = unitMatch.index;
    const rawName = rest.substring(0, unitStart).trim();
    unit = unitMatch[1];
    const afterUnit = rest.substring(unitStart + unitMatch[0].length).trim();
    const qm = afterUnit.match(/^([\d,.]+)/);
    if (qm) {
      quantity = qm[1];
      // Skip the purely numeric columns (unit price, amount, …); whatever
      // text remains after them is trailing spec.
      const tail = afterUnit.substring(qm[0].length).trim();
      if (tail) {
        const tailTokens = tail.split(/\s+/);
        let si = 0;
        while (si < tailTokens.length && /^[\d,.%\-]+$/.test(tailTokens[si])) si++;
        spec = tailTokens.slice(si).join(' ').trim();
      }
    }
    const ns = splitNameAndSpec(rawName);
    name = ns.name;
    if (ns.spec) spec = ns.spec + (spec ? ';' + spec : '');
  } else {
    // BILL_UNIT_RE matches every stand-alone unit token, so there is no unit
    // left to find by tokenising here.
    name = rest;
  }

  name = name.replace(/\s+/g, '').trim();
  // Recover a unit glued to the end of the name ("…m3").
  // NOTE(review): this also trims names that legitimately end in a unit
  // character (e.g. "…方", "…道") even when a unit was already found — confirm
  // that is intended.
  for (const u of BILL_UNIT_TOKENS) {
    if (name.endsWith(u) && name.length > u.length) {
      unit = unit || u;
      name = name.substring(0, name.length - u.length);
      break;
    }
  }

  return { code, name, unit, quantity, spec };
}

/**
 * Fallback for rows without a bill code (e.g. measure items): accepts
 * "<name> <unit> <quantity>" when the name is 2-50 characters long and
 * contains a CJK character. Returns null when the row does not fit.
 */
function parseUncodedBillRow(stripped) {
  const unitMatch = stripped.match(BILL_UNIT_RE);
  if (!unitMatch) return null;
  const beforeUnit = stripped.substring(0, unitMatch.index).trim();
  const afterUnit = stripped.substring(unitMatch.index + unitMatch[0].length).trim();
  const qm = afterUnit.match(/^([\d,.]+)/);
  if (!qm) return null;
  if (beforeUnit.length < 2 || beforeUnit.length > 50) return null;
  if (!/[\u4e00-\u9fff]/.test(beforeUnit)) return null;

  const ns = splitNameAndSpec(beforeUnit);
  return {
    code: '',
    name: ns.name.replace(/\s+/g, '').trim(),
    unit: unitMatch[1],
    quantity: qm[1],
    spec: ns.spec || '',
  };
}

/**
 * Combines the quantities of merged items: none → '', one → unchanged,
 * several → their numeric sum when all parse as numbers, otherwise the raw
 * values joined with '; '. A fractional sum keeps as many decimals as the
 * most precise input (at least 2, at most 6).
 */
function combineBillQuantities(quantities) {
  if (quantities.length === 0) return '';
  if (quantities.length === 1) return quantities[0];

  const cleaned = quantities.map((q) => q.replace(/,/g, ''));
  const nums = cleaned.map((q) => parseFloat(q));
  if (!nums.every((n) => !Number.isNaN(n))) return quantities.join('; ');

  const sum = nums.reduce((a, b) => a + b, 0);
  if (sum % 1 === 0) return String(sum);
  const decimals = cleaned.reduce((max, q) => {
    const dot = q.indexOf('.');
    return Math.max(max, dot < 0 ? 0 : q.length - dot - 1);
  }, 2);
  return sum.toFixed(Math.min(decimals, 6));
}

/**
 * Joins distinct specs with '; ': each spec is cut at 120 characters and the
 * joined text at 300, with '...' appended wherever something was cut.
 */
function combineBillSpecs(specs) {
  if (specs.length === 0) return '';
  const joined = specs
    .map((s) => (s.length > 120 ? s.substring(0, 120) + '...' : s))
    .join('; ');
  return joined.length > 300 ? joined.substring(0, 300) + '...' : joined;
}

/**
 * Merges items of one category that share the same name (ignoring
 * whitespace). The first non-empty code and unit win, quantities and specs
 * are combined; items with an empty name are dropped.
 */
function mergeBillItemsByName(items) {
  const byName = new Map();
  for (const item of items) {
    const key = (item.name || '').replace(/\s+/g, '').trim();
    if (!key) continue;

    let entry = byName.get(key);
    if (!entry) {
      entry = { code: item.code || '', name: item.name, unit: item.unit || '', quantities: [], specs: [] };
      byName.set(key, entry);
    } else {
      if (!entry.code && item.code) entry.code = item.code;
      if (!entry.unit && item.unit) entry.unit = item.unit;
    }
    if (item.quantity) entry.quantities.push(item.quantity);
    if (item.spec && !entry.specs.includes(item.spec)) entry.specs.push(item.spec);
  }

  return Array.from(byName.values(), (entry) => ({
    code: entry.code,
    name: entry.name,
    unit: entry.unit,
    quantity: combineBillQuantities(entry.quantities),
    spec: combineBillSpecs(entry.specs),
  }));
}

/**
 * Parses the concatenated text of the bill pages into categories of items.
 *
 * Steps: normalise lines → merge wrapped lines into logical rows → classify
 * each row as item (with or without code), category title, or continuation
 * of the previous item's spec → drop fee items → merge same-named items
 * within each category → drop empty categories.
 *
 * @param {string} text - page texts joined with '\n'
 * @returns {{ project_summary: { remark: string },
 *             categories: Array<{ name: string,
 *               items: Array<{ code: string, name: string, unit: string,
 *                              quantity: string, spec: string }> }> }}
 */
function parseBillText(text) {
  const rawLines = (typeof text === 'string' ? text : '').split(/\n/).map(normalizeBillLine);
  const logicLines = mergeBillLogicLines(rawLines);

  console.log(`[BillWorker] 合并后 ${logicLines.length} 条逻辑行(原始 ${rawLines.length} 行)`);
  // Print the first five logical rows for debugging.
  for (let i = 0; i < Math.min(5, logicLines.length); i++) {
    console.log(`[BillWorker] L${i}: ${logicLines[i].substring(0, 120)}`);
  }

  const categories = [];
  let curCat = null;
  let curItem = null;

  const flushItem = () => {
    if (curItem && curCat) curCat.items.push(curItem);
    curItem = null;
  };
  const ensureCategory = () => {
    if (!curCat) {
      curCat = { name: '未分类', items: [] };
      categories.push(curCat);
    }
  };
  const openCategory = (name) => {
    flushItem();
    curCat = { name, items: [] };
    categories.push(curCat);
  };
  const appendSpec = (part) => {
    curItem.spec = curItem.spec ? curItem.spec + ';' + part : part;
  };

  for (const line of logicLines) {
    if (BILL_SKIP_RE.test(line)) continue;

    let stripped = line.replace(BILL_LEADING_SEQ_RE, '').trim();
    if (!stripped) stripped = line.trim();
    if (!stripped) continue;

    // 1) Row with a bill code.
    const cm = stripped.match(BILL_CODE_RE);
    if (cm) {
      flushItem();
      ensureCategory();
      curItem = parseCodedBillRow(cm[1], stripped.substring(cm.index + cm[0].length).trim());
      continue;
    }

    // 2) Row without a code but shaped like "name unit quantity".
    if (stripped.length > 4) {
      const uncoded = parseUncodedBillRow(stripped);
      if (uncoded) {
        flushItem();
        ensureCategory();
        curItem = uncoded;
        continue;
      }
    }

    // 3) Short row: either a unit-bearing continuation or a category title.
    if (stripped.length > 2 && stripped.length < 60) {
      const hasUnit = BILL_UNIT_RE.test(stripped);
      if (hasUnit && /\d+\.?\d*\s*$/.test(stripped)) {
        // A unit plus a trailing number means item data, not a title.
        if (curItem) appendSpec(stripped);
        continue;
      }
      if (!hasUnit && isCatTitle(stripped) && !isFeeCatTitle(stripped)) {
        openCategory(stripped);
        continue;
      }
    }

    // 4) Numbered heading; fee headings open no category, so the rows below
    //    them are handled as ordinary rows.
    if (BILL_NUMBERED_TITLE_RE.test(stripped)) {
      const cleanTitle = stripped.replace(/\s+(座|个|项|处)\s+\d+[\d.]*\s*$/, '').trim();
      if (isFeeCatTitle(cleanTitle)) continue;
      openCategory(cleanTitle);
      continue;
    }

    // 5) Anything else continues the current item's spec.
    if (curItem && stripped.length > 1) appendSpec(stripped);
  }
  flushItem();

  // Drop fee items: only construction items are kept.
  let feeFiltered = 0;
  for (const cat of categories) {
    const before = cat.items.length;
    cat.items = cat.items.filter((it) => !isFeeItem(it.name));
    feeFiltered += before - cat.items.length;
  }
  if (feeFiltered > 0) console.log(`[BillWorker] 费用项过滤: 移除 ${feeFiltered} 项`);

  // Merge same-named items inside each category.
  let totalBeforeMerge = 0;
  let totalAfterMerge = 0;
  for (const cat of categories) {
    if (!cat.items.length) continue;
    totalBeforeMerge += cat.items.length;
    cat.items = mergeBillItemsByName(cat.items);
    totalAfterMerge += cat.items.length;
  }

  const mergedCount = totalBeforeMerge - totalAfterMerge;
  if (mergedCount > 0) {
    console.log(`[BillWorker] 按名称合并: ${totalBeforeMerge} → ${totalAfterMerge} 项(合并 ${mergedCount} 个重复项)`);
  }

  const valid = categories.filter((c) => c.items && c.items.length > 0);
  const totalItems = valid.reduce((s, c) => s + c.items.length, 0);
  const withSpec = valid.reduce((s, c) => s + c.items.filter((it) => it.spec).length, 0);
  const withCode = valid.reduce((s, c) => s + c.items.filter((it) => it.code).length, 0);
  console.log(`[BillWorker] 最终结果: ${valid.length} 分部, ${totalItems} 清单项 (${withCode} 有编码, ${withSpec} 有spec)`);
  // Print the first three items for debugging.
  let debugCount = 0;
  for (const cat of valid) {
    for (const it of cat.items) {
      if (debugCount < 3) {
        console.log(`[BillWorker] 样例: [${it.code}] ${it.name} | ${it.unit} | qty=${it.quantity} | spec=${(it.spec||'').substring(0, 80)}`);
        debugCount++;
      }
    }
  }

  return {
    project_summary: { remark: `本地解析:${valid.length} 个分部,${totalItems} 个清单项(合并前 ${totalBeforeMerge} 项)` },
    categories: valid,
  };
}

// Names that are fee items only when they match exactly.
const FEE_ITEM_EXACT = new Set([
  '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
  '总承包服务费', '企业管理费', '甲供材料保管费', '价税合计',
]);

// Fragments that mark a fee item wherever they occur in the name
// (measure fees, statutory fees, insurance, administrative charges).
const FEE_ITEM_KEYWORDS = [
  '安全文明', '文明施工费', '环境保护费', '临时设施费',
  '夜间施工增加费', '夜间施工费',
  '冬雨季施工增加费', '冬雨季施工费',
  '二次搬运费', '大型机械设备进出场', '大型机械进出场',
  '施工排水降水', '排水降水费',
  '已完工程及设备保护', '已完工程保护费',
  '工程排污费', '社会保障费', '住房公积金',
  '工伤保险', '劳动保险', '意外伤害保险', '建筑工程保险',
  '城市维护建设税', '城市建设维护税',
  '教育费附加', '地方教育附加',
  '材料暂估', '专业工程暂估',
  '超高施工增加费', '安全防护费',
  '措施项目费', '其他项目费', '不可竞争费',
];

/**
 * Tells whether a bill item is a fee item (not construction work), such as
 * safety/civilised-construction fees, statutory fees, taxes or provisional
 * sums. Whitespace inside the name is ignored.
 *
 * @param {string} name - item name
 * @returns {boolean} true for fee items; false otherwise, including for
 *   empty or non-string input
 */
function isFeeItem(name) {
  if (typeof name !== 'string' || !name) return false;
  const compact = name.replace(/\s+/g, '');
  if (FEE_ITEM_EXACT.has(compact)) return true;
  return FEE_ITEM_KEYWORDS.some((kw) => compact.includes(kw));
}

// Numbered feature: optional opening parenthesis, digits, a separator
// (. 、 fullwidth . or a closing parenthesis) and then a CJK character.
// The optional opening parenthesis keeps "(1)…" from being split after the "(".
const SPEC_NUMBERED_RE = /[(\uFF08]?\d+[.\u3001\uFF0E)\uFF09]\s*[\u4e00-\u9fff]/;
// Feature keyword followed by an ASCII or fullwidth colon.
const SPEC_KEYWORD_RE = /(材质|规格|型号|品牌|颜色|尺寸|厚度|直径|管径|强度|等级|类别|类型|做法|要求|标准|内容|工作内容|土壤|含量|配合比|工艺|方式|形式|范围|部位|位置|高度|宽度|长度|深度|坡度|截面|跨度|运距|开挖|回填|混凝土|钢筋|压实)[:\uFF1A]/;
// Parenthesised number, ASCII or fullwidth.
const SPEC_PAREN_RE = /[(\uFF08]\d+[)\uFF09]/;

/**
 * Separates the item name from a feature description written inline after it.
 * Example: "土方开挖 1.土壤类别:普通土"
 *       → { name: "土方开挖", spec: "1.土壤类别:普通土" }
 *
 * The patterns are tried in order — numbered feature, keyword + colon,
 * parenthesised number — and the first one found after the start of the
 * string is the split point. A pattern found at position 0 does not split.
 *
 * @param {string} rawName - name cell, possibly followed by feature text
 * @returns {{ name: string, spec: string }} both trimmed at the split point;
 *   spec is '' when nothing was split off
 */
function splitNameAndSpec(rawName) {
  if (!rawName) return { name: '', spec: '' };

  for (const pattern of [SPEC_NUMBERED_RE, SPEC_KEYWORD_RE, SPEC_PAREN_RE]) {
    const hit = rawName.match(pattern);
    if (hit && hit.index > 0) {
      return {
        name: rawName.substring(0, hit.index).trim(),
        spec: rawName.substring(hit.index).trim(),
      };
    }
  }
  return { name: rawName, spec: '' };
}

// Trade / work-section words; a text containing any of them is treated as a
// possible category title.
const CATEGORY_TITLE_KEYWORDS = [
  '土建','建筑','结构','装饰','装修','安装','给排水','暖通','空调','通风',
  '电气','强电','弱电','消防','智能化','幕墙','门窗','园林','绿化','景观',
  '市政','道路','桥梁','管网','基础','地基','桩基','主体','屋面','防水',
  '保温','钢结构','排水','给水','照明','动力','防雷','电梯','人防','室外',
  '附属','分部','工程','措施','清单','土石方','混凝土','砌筑','模板','脚手架',
  '水利','河道','管道','阀门','设备','仪表','自动化','通信','网络',
  '拆除','外墙','内墙','楼地面','天棚','吊顶','栏杆','屋顶','涂料','抹灰',
  '廊道','阀门井','蓄水池','泵站','供水','引水','水源','渠道','闸门',
  '围栏','警示','检修','管线','配电','水池','水塔','取水','净水',
];

/**
 * Tells whether a text contains at least one category-title keyword.
 *
 * @param {string} text - candidate title
 * @returns {boolean} true when any keyword occurs in `text`; false otherwise,
 *   including for non-string input
 */
function isCatTitle(text) {
  if (typeof text !== 'string') return false;
  return CATEGORY_TITLE_KEYWORDS.some((keyword) => text.includes(keyword));
}

// Titles that are fee categories only when they match exactly. Every other
// exact title of interest is also covered by the substring list below.
const FEE_CATEGORY_EXACT = new Set([
  '规费', '税金', '利润', '增值税', '企业管理费', '价税合计',
]);

// Fragments that mark a fee category wherever they occur in the title.
const FEE_CATEGORY_KEYWORDS = [
  '措施项目费', '其他项目费', '不可竞争费',
  '规费汇总', '税金汇总', '费率', '费用汇总', '费用合计',
  '暂列金额', '暂估价', '计日工', '总承包服务费',
  '安全文明施工费', '社会保障费', '住房公积金',
  '工伤保险', '教育费附加', '城市维护建设税',
];

/**
 * Tells whether a section title names a fee category (statutory fees, taxes,
 * measure-item fees, other-item fees, …) for which no category should be
 * created. Whitespace inside the title is ignored.
 *
 * @param {string} text - section title
 * @returns {boolean} true for fee categories; false otherwise, including for
 *   empty or non-string input
 */
function isFeeCatTitle(text) {
  if (typeof text !== 'string' || !text) return false;
  const compact = text.replace(/\s+/g, '');
  if (FEE_CATEGORY_EXACT.has(compact)) return true;
  return FEE_CATEGORY_KEYWORDS.some((kw) => compact.includes(kw));
}