tech-bid-manage20260423A/bill-worker.js

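/**
 * bill-worker.js — PDF bill-of-quantities parsing dispatcher (runs as a Worker Thread).
 *
 * Architecture (v3 — PDF bytes shared through a SharedArrayBuffer):
 *   Phase 1 — parallel text extraction
 *     The PDF data is copied once into a SharedArrayBuffer that every child
 *     thread reads (one allocation, no per-worker copy).
 *     N page-workers are started, each responsible for a fixed chunk of 20 pages.
 *
 *   Phase 2 — bill page selection (keyword / regex based, milliseconds)
 *     Collect the text of all pages and keep the pages that look like bill pages.
 *
 *   Phase 3 — text parsing (pure regex, milliseconds)
 *     Merge wrapped lines into logical lines, then parse them line by line.
 */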
'use strict';
const { parentPort } = require('worker_threads');
const { Worker } = require('worker_threads');
const path = require('path');

/** Number of consecutive PDF pages handed to each page-worker. */
const PAGES_PER_CHUNK = 20;

/**
 * Entry point of this worker thread: handles `{ type: 'parse', buffer }` requests.
 *
 * Steps:
 *   1. Copy the incoming PDF bytes and read the page count with pdf.js.
 *   2. Phase 1 — extract the text of every page in parallel (see parallelExtract).
 *   3. Phase 2 — keep only the pages that look like bill pages (see selectBillPages).
 *   4. Phase 3 — parse the merged bill text (see parseBillText).
 *
 * Exactly one `{ type: 'done', ok, ... }` message is posted back per request:
 *   - `ok: false, error`                         on empty input or any thrown error
 *   - `ok: true, data: { scanned: true, ... }`   when almost no text was extracted
 *   - `ok: true, data: { noBillPages: true, ...}` when no page looks like a bill page
 *   - `ok: true, data: { ...parsed, _meta }`     on success
 * Messages whose `type` is not `'parse'` are ignored.
 */
parentPort.on('message', async (msg) => {
    if (!msg || msg.type !== 'parse') return;
    const t0 = Date.now();
    try {
        const raw = msg.buffer;
        // A missing or zero-length payload is reported explicitly instead of
        // surfacing as a TypeError further down.
        if (!raw || !raw.byteLength) {
            parentPort.postMessage({ type: 'done', ok: false, error: '收到空 PDF 数据' });
            return;
        }

        // Take a private copy right away so this handler owns its own bytes.
        const buf = Buffer.alloc(raw.byteLength);
        Buffer.from(raw).copy(buf);

        // ── Read the total page count ──
        const pdfjsModule = await import('pdfjs-dist/build/pdf.mjs');
        const pdfjsLib = pdfjsModule.default || pdfjsModule;
        // pdf.js gets its own copy because it may detach the buffer it is given.
        const pdfData = new Uint8Array(buf);
        const pdf = await pdfjsLib.getDocument({ data: pdfData, isEvalSupported: false }).promise;
        const totalPages = pdf.numPages;
        // Only the page count is needed here; release the document so repeated
        // requests do not accumulate parsed PDFs in this long-lived thread.
        try {
            await pdf.destroy();
        } catch (destroyErr) {
            console.warn('[BillWorker] 释放 PDF 文档失败:', destroyErr && destroyErr.message);
        }

        // ── Copy the PDF once into shared memory that all page-workers read ──
        const sab = new SharedArrayBuffer(buf.length);
        new Uint8Array(sab).set(buf);

        const workerCount = Math.ceil(totalPages / PAGES_PER_CHUNK);
        console.log(`[BillWorker] PDF ${totalPages} 页, ${workerCount} 路并行 (SharedArrayBuffer ${(buf.length/1024/1024).toFixed(1)}MB)`);

        // Phase 1: parallel text extraction
        const pageTexts = await parallelExtract(sab, buf.length, totalPages, workerCount);
        const t1 = Date.now();

        const extractedCount = pageTexts.filter(t => t.length > 0).length;
        console.log(`[BillWorker] Phase1 完成: ${t1 - t0}ms, ${extractedCount}/${totalPages} 页有文本`);

        // Scanned-document check: practically no extractable text at all.
        const totalChars = pageTexts.reduce((s, t) => s + t.length, 0);
        if (totalChars < 50) {
            parentPort.postMessage({ type: 'done', ok: true, data: { scanned: true, reason: 'noText', totalPages } });
            return;
        }

        // Phase 2: select bill pages (lenient detection + gap filling)
        const { billTexts, directHits } = selectBillPages(pageTexts);

        if (!billTexts.length) {
            parentPort.postMessage({ type: 'done', ok: true, data: { scanned: false, noBillPages: true, totalPages } });
            return;
        }

        // directHits is counted before gap filling, so the two numbers can differ.
        console.log(`[BillWorker] ${totalPages} 页 → ${billTexts.length} 页清单 (原始识别 ${directHits} / 补全后 ${billTexts.length})`);

        // Phase 3: text parsing
        const merged = billTexts.join('\n');
        const parsed = parseBillText(merged);
        const t2 = Date.now();
        console.log(`[BillWorker] Phase2+3: ${t2 - t1}ms, 总耗时: ${t2 - t0}ms`);

        parentPort.postMessage({
            type: 'done', ok: true,
            data: {
                scanned: false,
                ...parsed,
                _meta: {
                    method: 'local-parallel',
                    workers: workerCount,
                    billPages: billTexts.length,
                    totalPages,
                    extractMs: t1 - t0,
                    parseMs: t2 - t1,
                    totalMs: t2 - t0,
                }
            }
        });
    } catch (err) {
        // Tolerate non-Error throwables so the parent always gets a readable message.
        const message = err && err.message ? err.message : String(err);
        console.error('[BillWorker] 错误:', message);
        parentPort.postMessage({ type: 'done', ok: false, error: message });
    }
});

/**
 * Decide which pages are bill pages.
 *
 * Pass 1 flags a non-blank page when it contains at least two table-header
 * keywords, or any section keyword, or a run of 9 digits (a bill code).
 * Pass 2 fills gaps: an unflagged page lying between the first and the last
 * flagged page is also kept when it has more than 30 non-blank characters,
 * unless it mentions two or more fee/tax keywords (pure fee pages).
 *
 * @param {string[]} pageTexts - extracted text per page, index = page number - 1
 * @returns {{ billTexts: string[], directHits: number }} texts of the selected
 *   pages in page order, and the number of pages flagged by pass 1 alone
 */
function selectBillPages(pageTexts) {
    const BILL_KW = ['项目编码', '项目名称', '工程量', '计量单位', '综合单价', '清单编码'];
    const SEC_KW  = ['分部分项', '分类分项', '措施项目', '其他项目', '工程量清单计价'];
    const FEE_PAGE_KW = ['规费', '税金', '社会保险费', '住房公积金', '养老保险',
                         '工伤保险', '失业保险', '医疗保险', '教育费附加', '城市维护建设税'];
    const CODE_9_RE = /\d{9}/;

    // Pass 1: pages that are bill pages on their own evidence.
    const billFlags = pageTexts.map((t) => {
        if (!t || !t.trim()) return false;
        const headerHits = BILL_KW.filter(k => t.includes(k)).length;
        return headerHits >= 2 || SEC_KW.some(k => t.includes(k)) || CODE_9_RE.test(t);
    });
    const directHits = billFlags.filter(Boolean).length;

    // Pass 2: continuation pages carry no table header, so fill the gaps.
    const firstBill = billFlags.indexOf(true);
    const lastBill  = billFlags.lastIndexOf(true);
    if (firstBill >= 0 && lastBill > firstBill) {
        for (let i = firstBill; i <= lastBill; i++) {
            const t = pageTexts[i];
            if (billFlags[i] || !t || t.trim().length <= 30) continue;
            // An unflagged page never contains a 9-digit code (pass 1 would have
            // flagged it), so two or more fee keywords mark a pure fee page.
            const feeHits = FEE_PAGE_KW.filter(kw => t.includes(kw)).length;
            if (feeHits >= 2) continue;
            billFlags[i] = true;
        }
    }

    const billTexts = pageTexts.filter((_, i) => billFlags[i]);
    return { billTexts, directHits };
}

// ================================================================
// Phase 1: 多 Worker 并行提取（SharedArrayBuffer 零拷贝）
// ================================================================

/**
 * Phase 1: extract page texts with one `page-worker.js` thread per page chunk.
 *
 * Worker `i` receives `{ sab, dataLength, startPage, endPage }` as workerData,
 * covering pages `i * PAGES_PER_CHUNK + 1` through
 * `min((i + 1) * PAGES_PER_CHUNK, totalPages)`. A worker is expected to post
 * `{ ok: true, results: [{ page, text }, ...] }` or `{ ok: false, error }`.
 *
 * The promise resolves once every worker has either delivered results, reported
 * a failure, raised an error, or exited. Worker failures never reject; the
 * affected pages simply stay as empty strings and a warning is logged.
 *
 * @param {SharedArrayBuffer} sab - PDF bytes shared with all workers
 * @param {number} dataLength - number of valid bytes in `sab`
 * @param {number} totalPages - page count of the PDF
 * @param {number} workerCount - number of workers to start
 * @returns {Promise<string[]>} array of length `totalPages`, index = page - 1
 */
function parallelExtract(sab, dataLength, totalPages, workerCount) {
    return new Promise((resolve) => {
        const allPageTexts = new Array(totalPages).fill('');
        if (workerCount <= 0) {
            resolve(allPageTexts);
            return;
        }

        const workerPath = path.join(__dirname, 'page-worker.js');
        let pendingCount = workerCount;
        let failedCount = 0;

        // Called exactly once per worker; resolves when the last one settles.
        const settleOne = (failed) => {
            if (failed) failedCount++;
            pendingCount--;
            if (pendingCount > 0) return;
            if (failedCount > 0) {
                console.warn(`[BillWorker] ${failedCount}/${workerCount} 个worker失败，可能导致部分页面无内容`);
            }
            resolve(allPageTexts);
        };

        // Only accept entries that address an existing page and carry a string:
        // anything else would grow the array past totalPages or put non-strings
        // into it, which callers treat as strings (`t.trim()`, `t.length`).
        const storeResults = (results) => {
            for (const r of results) {
                if (!r) continue;
                const idx = r.page - 1;
                if (!Number.isInteger(idx) || idx < 0 || idx >= totalPages) continue;
                allPageTexts[idx] = typeof r.text === 'string' ? r.text : '';
            }
        };

        for (let i = 0; i < workerCount; i++) {
            const startPage = i * PAGES_PER_CHUNK + 1;
            const endPage = Math.min((i + 1) * PAGES_PER_CHUNK, totalPages);

            // The SharedArrayBuffer is shared across threads, not transferred.
            const w = new Worker(workerPath, {
                workerData: { sab, dataLength, startPage, endPage }
            });

            let settled = false;
            const markDone = (failed) => {
                if (settled) return;
                settled = true;
                settleOne(failed);
            };

            w.on('message', (msg) => {
                if (msg && msg.ok && Array.isArray(msg.results)) {
                    storeResults(msg.results);
                    markDone(false);
                } else if (!msg || !msg.ok) {
                    console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 失败: ${msg && msg.error}`);
                    markDone(true);
                }
            });

            w.on('error', (err) => {
                console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 异常: ${err.message}`);
                markDone(true);
            });

            w.on('exit', (code) => {
                // 'exit' fires after 'message'; reaching this point unsettled means
                // the worker ended without delivering results, so its pages are lost.
                if (settled) return;
                if (code !== 0) {
                    console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 意外退出(code=${code})`);
                } else {
                    console.warn(`[BillWorker] page-worker[${startPage}-${endPage}] 未返回结果即退出`);
                }
                markDone(true);
            });
        }
    });
}

// ================================================================
// Phase 3: 清单文本解析（纯正则 + 字符串处理，毫秒级）
// ================================================================

/**
 * Phase 3: parse the merged text of all bill pages into categories of bill items
 * (regex and string handling only).
 *
 * Pipeline:
 *   1. Normalize every raw line and merge wrapped lines into logical lines.
 *   2. Walk the logical lines; each one is a coded item, an un-coded
 *      "name unit quantity" item, a category title, or a continuation that is
 *      appended to the current item's spec.
 *   3. Drop fee items (isFeeItem), then merge items with the same name inside
 *      each category.
 *
 * @param {string} text - page texts joined with '\n'
 * @returns {{ project_summary: { remark: string },
 *             categories: Array<{ name: string,
 *               items: Array<{ code: string, name: string, unit: string,
 *                              quantity: string, spec: string }> }> }}
 *   only categories that still contain items are returned
 */
function parseBillText(text) {
    const rawLines = text.split(/\n/).map(normalizeBillLine);
    const logicLines = mergeLogicalLines(rawLines);

    console.log(`[BillWorker] 合并后 ${logicLines.length} 条逻辑行（原始 ${rawLines.length} 行）`);
    // Print the first 5 logical lines for debugging.
    for (let i = 0; i < Math.min(5, logicLines.length); i++) {
        console.log(`[BillWorker]   L${i}: ${logicLines[i].substring(0, 120)}`);
    }

    const categories = [];
    let curCat = null;
    let curItem = null;

    // Bill code anywhere in the line: 9-12 digits or a B code; a preceding
    // letter is rejected so standard numbers such as GB/DB are not taken as codes.
    const CODE_RE = /(?<![A-Za-z])(\d{9,12}|(?<![A-Za-z])B\d{5,6})/;
    // Leading sequence number ("1.1.1.1.5 " or "5 "). The lookahead keeps a
    // 9-12 digit bill code at the very start of the line from being stripped
    // as if it were a sequence number, which would lose the item's code.
    const SEQ_PREFIX_RE = /^(?!\d{9,12}\s)\d+(\.\d+)*\s+/;
    const UNIT_TOKENS = ['m³','m²','m3','m2','km','hm2','㎡','㎥','t','kg',
                         '个','台','套','组','根','块','片','张','只','吨','项',
                         '处','座','件','段','条','把','扇','口','圈','道','孔',
                         '对','副','樘','方','延m','株','棵','m'];
    const unitEscaped = UNIT_TOKENS.map(u => u.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
    const UNIT_RE = new RegExp(`(?:^|\\s)(${unitEscaped.join('|')})(?=\\s|\\d|$)`);
    const SKIP_RE = /合\s*计|小\s*计|本页小计|总\s*计|价税合计/;
    const NUMERIC_FIELD_RE = /^[\d,.%\-]+$/;

    // Flush the pending item and make sure there is a category to receive the next one.
    const beginItem = () => {
        if (curItem && curCat) curCat.items.push(curItem);
        if (!curCat) { curCat = { name: '未分类', items: [] }; categories.push(curCat); }
    };
    // Flush the pending item and open a new category.
    const beginCategory = (name) => {
        if (curItem && curCat) { curCat.items.push(curItem); curItem = null; }
        curCat = { name, items: [] };
        categories.push(curCat);
    };
    const appendSpec = (extra) => {
        curItem.spec = curItem.spec ? curItem.spec + ';' + extra : extra;
    };

    for (const line of logicLines) {
        if (SKIP_RE.test(line)) continue;

        let stripped = line.replace(SEQ_PREFIX_RE, '').trim();
        if (!stripped) stripped = line.trim();
        if (!stripped) continue;

        // ── Coded bill item ──
        const cm = stripped.match(CODE_RE);
        if (cm) {
            beginItem();

            const code = cm[1];
            const rest = stripped.substring(cm.index + cm[0].length).trim();
            let name = '';
            let unit = '';
            let quantity = '';
            let spec = '';

            const unitMatch = rest.match(UNIT_RE);
            if (unitMatch) {
                // Use the match position itself: searching for the matched text
                // again can hit an earlier occurrence that is not a unit
                // (e.g. " t" inside " tm级").
                const ui = unitMatch.index;
                const rawName = rest.substring(0, ui).trim();
                unit = unitMatch[1];
                const afterUnit = rest.substring(ui + unitMatch[0].length).trim();
                const qm = afterUnit.match(/^([\d,.]+)/);
                if (qm) {
                    quantity = qm[1];
                    // Text after the quantity: skip purely numeric fields
                    // (unit price, amount, ...) and keep what follows as spec.
                    const tail = afterUnit.substring(qm.index + qm[0].length).trim();
                    if (tail) {
                        const tailTokens = tail.split(/\s+/);
                        let si = 0;
                        while (si < tailTokens.length && NUMERIC_FIELD_RE.test(tailTokens[si])) si++;
                        const specTail = tailTokens.slice(si).join(' ').trim();
                        if (specTail) spec = specTail;
                    }
                }
                // Separate the item name from an inline feature description.
                const ns = splitNameAndSpec(rawName);
                name = ns.name;
                if (ns.spec) spec = ns.spec + (spec ? ';' + spec : '');
            } else {
                // No whitespace-delimited unit exists in `rest` (UNIT_RE would have
                // matched any such token), so the whole remainder is the name.
                name = rest;
            }

            name = name.replace(/\s+/g, '').trim();
            // A unit glued to the end of the name ("土方开挖m3") is split off only
            // when no unit was found yet; otherwise names that merely end in a
            // unit character ("挖基坑土方") would be truncated.
            if (!unit) {
                for (const u of UNIT_TOKENS) {
                    if (name.endsWith(u) && name.length > u.length) {
                        unit = u;
                        name = name.substring(0, name.length - u.length);
                        break;
                    }
                }
            }

            curItem = { code, name, unit, quantity, spec };
            continue;
        }

        // ── Fallback: no code but a "name unit quantity" shape → still an item ──
        // Common for measure items and supplementary items without a code.
        if (stripped.length > 4) {
            const uniMatch = stripped.match(UNIT_RE);
            if (uniMatch) {
                const ui = uniMatch.index;
                const beforeUnit = stripped.substring(0, ui).trim();
                const afterUnit  = stripped.substring(ui + uniMatch[0].length).trim();
                const qm = afterUnit.match(/^([\d,.]+)/);
                // Name of 2-50 characters containing Chinese, followed by a quantity.
                if (beforeUnit.length >= 2 && beforeUnit.length <= 50 && qm
                    && /[\u4e00-\u9fff]/.test(beforeUnit)) {
                    beginItem();
                    const ns = splitNameAndSpec(beforeUnit);
                    curItem = {
                        code: '',
                        name: ns.name.replace(/\s+/g, '').trim(),
                        unit: uniMatch[1],
                        quantity: qm[1],
                        spec: ns.spec || '',
                    };
                    continue;
                }
            }
        }

        // ── Category title: short text without a code that contains a works keyword ──
        // (`stripped` cannot contain a code here, the coded branch above handled that.)
        if (stripped.length > 2 && stripped.length < 60) {
            const hasUnit = UNIT_RE.test(stripped);
            // A unit plus a trailing number means item data, not a title.
            if (hasUnit && /\d+\.?\d*\s*$/.test(stripped)) {
                if (curItem) appendSpec(stripped);
                continue;
            }
            if (isCatTitle(stripped) && !hasUnit && !isFeeCatTitle(stripped)) {
                const cleanTitle = stripped.replace(/\s+(座|个|项|处|m|km|段|条)\s+\d+[\d.]*\s*$/, '').trim();
                beginCategory(cleanTitle);
                continue;
            }
        }

        // ── Heading introduced by a Chinese numeral ("一 ..." / "（一）...") ──
        if (/^[一二三四五六七八九十]+\s/.test(stripped) || /^（[一二三四五六七八九十\d]+）/.test(stripped)) {
            const cleanTitle = stripped.replace(/\s+(座|个|项|处)\s+\d+[\d.]*\s*$/, '').trim();
            // Fee headings do not open a category; the line is dropped.
            if (isFeeCatTitle(cleanTitle)) continue;
            beginCategory(cleanTitle);
            continue;
        }

        // ── Anything else continues the current item's spec ──
        if (curItem && stripped.length > 1) appendSpec(stripped);
    }

    if (curItem && curCat) curCat.items.push(curItem);

    // Drop fee items: only construction items are kept.
    let feeFiltered = 0;
    for (const cat of categories) {
        const before = cat.items.length;
        cat.items = cat.items.filter(it => !isFeeItem(it.name));
        feeFiltered += before - cat.items.length;
    }
    if (feeFiltered > 0) console.log(`[BillWorker] 费用项过滤: 移除 ${feeFiltered} 项`);

    // Merge items that share a name within one category.
    let totalBeforeMerge = 0;
    let totalAfterMerge = 0;
    for (const cat of categories) {
        if (!cat.items.length) continue;
        totalBeforeMerge += cat.items.length;
        cat.items = mergeItemsByName(cat.items);
        totalAfterMerge += cat.items.length;
    }

    const mergedCount = totalBeforeMerge - totalAfterMerge;
    if (mergedCount > 0) {
        console.log(`[BillWorker] 按名称合并: ${totalBeforeMerge} → ${totalAfterMerge} 项（合并 ${mergedCount} 个重复项）`);
    }

    const valid = categories.filter(c => c.items && c.items.length > 0);
    const allItems = valid.flatMap(c => c.items);
    const totalItems = allItems.length;
    const withSpec = allItems.filter(it => it.spec).length;
    const withCode = allItems.filter(it => it.code).length;
    console.log(`[BillWorker] 最终结果: ${valid.length} 分部, ${totalItems} 清单项 (${withCode} 有编码, ${withSpec} 有spec)`);
    // Print the first 3 items for debugging.
    for (const it of allItems.slice(0, 3)) {
        console.log(`[BillWorker]   样例: [${it.code}] ${it.name} | ${it.unit} | qty=${it.quantity} | spec=${(it.spec||'').substring(0, 80)}`);
    }

    return {
        project_summary: { remark: `本地解析：${valid.length} 个分部，${totalItems} 个清单项（合并前 ${totalBeforeMerge} 项）` },
        categories: valid,
    };
}

/**
 * Normalize one raw text line: tabs become spaces, the line is trimmed, and a
 * hyphenated code such as "010-101-001-001" is collapsed to "010101001001"
 * when the joined digits are 9-12 characters long.
 */
function normalizeBillLine(l) {
    const line = l.replace(/\t/g, ' ').trim();
    return line.replace(/(\d{2,4})[-‐–](\d{2,4})[-‐–](\d{2,4})(?:[-‐–](\d{2,4}))?/g,
        (m, a, b, c, d) => {
            const combined = a + b + c + (d || '');
            return (combined.length >= 9 && combined.length <= 12) ? combined : m;
        });
}

/**
 * Merge raw text lines into logical table rows.
 *
 * Blank lines, page markers, table-header lines and a few boilerplate lines
 * are skipped. A raw line starts a new logical row when it begins with a
 * dotted sequence number, a bill code, "sequence + code", or a Chinese section
 * marker, or when it contains an inline code and is longer than 15 characters.
 * Every other line is a continuation of the current row; a row that already
 * exceeds 300 characters is cut so a whole page cannot be swallowed.
 */
function mergeLogicalLines(rawLines) {
    const ITEM_START    = /^\d+(\.\d+)+\s/;              // dotted sequence number: 1.1, 1.1.1, ...
    const CODE_INLINE   = /(?:^|\s)(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s/; // code anywhere in the line
    const CODE_START_RE = /^(\d{9,12}|B\d{5,6})\s/;     // line starts with a code
    const SEQ_CODE_RE   = /^\d{1,4}\s+(\d{9,12}|(?<![A-Za-z])B\d{5,6})\s/; // "sequence code"
    const PAGE_MARK     = /^--\s*\d+\s+of\s+\d+\s*--/;
    const HEADER_RE     = /^序号\s+(项目编码|项目名称)/;
    const HEADER_KW     = /^(项目编码|项目名称|清单编码|计量单位|综合单价|工程量|合\s*价|金额|序号)\s/;
    const BOILERPLATE   = /^（元）|^款章节号|^备注$|^第\d+页/;
    const CATEGORY_MARKERS = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
                              '（一）', '（二）', '（三）', '（四）', '（五）'];

    const startsNewRow = (raw) =>
        ITEM_START.test(raw)
        || CODE_START_RE.test(raw)
        || SEQ_CODE_RE.test(raw)
        || CATEGORY_MARKERS.some(m => raw.startsWith(m + ' ') || raw.startsWith(m + '\u3000'))
        || (CODE_INLINE.test(raw) && raw.length > 15);

    const logicLines = [];
    let currentLine = '';
    for (const raw of rawLines) {
        if (!raw || PAGE_MARK.test(raw)) continue;
        if (HEADER_RE.test(raw) || HEADER_KW.test(raw)) continue;
        if (BOILERPLATE.test(raw)) continue;

        if (startsNewRow(raw) || (currentLine && currentLine.length > 300)) {
            if (currentLine) logicLines.push(currentLine);
            currentLine = raw;
        } else {
            currentLine = currentLine ? currentLine + ' ' + raw : raw;
        }
    }
    if (currentLine) logicLines.push(currentLine);
    return logicLines;
}

/**
 * Merge the items of one category that share a name (whitespace ignored).
 *
 *   - code / unit: first non-empty value
 *   - quantity:    numeric sum when every collected quantity parses as a
 *                  number, otherwise the quantities joined with "; "
 *   - spec:        distinct specs, each cut to 120 characters, joined with
 *                  "; " and cut to 300 characters overall
 * Items whose name is empty are dropped.
 */
function mergeItemsByName(items) {
    const byName = new Map();
    for (const item of items) {
        const key = (item.name || '').replace(/\s+/g, '').trim();
        if (!key) continue;

        let acc = byName.get(key);
        if (!acc) {
            acc = { code: item.code || '', name: item.name, unit: item.unit || '', quantities: [], specs: [] };
            byName.set(key, acc);
        } else {
            if (!acc.code && item.code) acc.code = item.code;
            if (!acc.unit && item.unit) acc.unit = item.unit;
        }
        if (item.quantity) acc.quantities.push(item.quantity);
        if (item.spec && !acc.specs.includes(item.spec)) acc.specs.push(item.spec);
    }

    const merged = [];
    for (const acc of byName.values()) {
        let quantity = '';
        if (acc.quantities.length === 1) {
            quantity = acc.quantities[0];
        } else if (acc.quantities.length > 1) {
            const nums = acc.quantities.map(q => parseFloat(q.replace(/,/g, '')));
            if (nums.every(n => !Number.isNaN(n))) {
                const sum = nums.reduce((a, b) => a + b, 0);
                quantity = sum % 1 === 0 ? String(sum) : sum.toFixed(2);
            } else {
                quantity = acc.quantities.join('; ');
            }
        }

        let spec = '';
        if (acc.specs.length > 0) {
            spec = acc.specs.map(s => s.length > 120 ? s.substring(0, 120) + '...' : s).join('; ');
            if (spec.length > 300) spec = spec.substring(0, 300) + '...';
        }

        merged.push({ code: acc.code, name: acc.name, unit: acc.unit, quantity, spec });
    }
    return merged;
}

/**
 * 判断清单项是否为"费用项"（非施工内容，不写入技术标）
 * 如：安全文明措施费、规费、税金、暂列金额等
 */
function isFeeItem(name) {
    // Returns true when the item name denotes a fee (measure fees, statutory
    // fees, taxes, provisional sums, ...) rather than construction work.
    // Whitespace inside the name is ignored; an empty name is never a fee.
    if (!name) return false;
    const compact = name.replace(/\s+/g, '');

    // 1. Names that must match as a whole.
    const exactNames = new Set([
        '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
        '总承包服务费', '企业管理费', '甲供材料保管费', '价税合计',
    ]);
    if (exactNames.has(compact)) return true;

    // 2. Fragments that mark a fee wherever they occur in the name
    //    (measure fees, statutory fees, insurance, administrative charges).
    const feeFragments = [
        '安全文明', '文明施工费', '环境保护费', '临时设施费',
        '夜间施工增加费', '夜间施工费',
        '冬雨季施工增加费', '冬雨季施工费',
        '二次搬运费', '大型机械设备进出场', '大型机械进出场',
        '施工排水降水', '排水降水费',
        '已完工程及设备保护', '已完工程保护费',
        '工程排污费', '社会保障费', '住房公积金',
        '工伤保险', '劳动保险', '意外伤害保险', '建筑工程保险',
        '城市维护建设税', '城市建设维护税',
        '教育费附加', '地方教育附加',
        '材料暂估', '专业工程暂估',
        '超高施工增加费', '安全防护费',
        '措施项目费', '其他项目费', '不可竞争费',
    ];
    return feeFragments.some((fragment) => compact.includes(fragment));
}

/**
 * 将 rawName 中的"项目名称"与内联"项目特征描述"分离
 * 例: "土方开挖 1.土壤类别：普通土" → { name: "土方开挖", spec: "1.土壤类别：普通土" }
 */
function splitNameAndSpec(rawName) {
    // Splits a raw name cell into the item name and an inline feature
    // description. The patterns below are tried in order; the first one that
    // matches somewhere after the first character marks where the spec begins.
    // Without such a match the whole text is the name and the spec is empty.
    if (!rawName) return { name: '', spec: '' };

    const specStartPatterns = [
        // 1. Numbered feature: digits + separator + Chinese character ("1.土壤类别", "2、强度等级")
        /\d+[.、．)\uFF09]\s*[\u4e00-\u9fff]/,
        // 2. Feature keyword followed by a colon ("材质：", "规格：")
        /(材质|规格|型号|品牌|颜色|尺寸|厚度|直径|管径|强度|等级|类别|类型|做法|要求|标准|内容|工作内容|土壤|含量|配合比|工艺|方式|形式|范围|部位|位置|高度|宽度|长度|深度|坡度|截面|跨度|运距|开挖|回填|混凝土|钢筋|压实)[：:]/,
        // 3. Parenthesised number ("（1）", "(1)")
        /[（(]\d+[）)]/,
    ];

    for (const pattern of specStartPatterns) {
        const hit = pattern.exec(rawName);
        if (hit === null || hit.index <= 0) continue;
        const cut = hit.index;
        return {
            name: rawName.slice(0, cut).trim(),
            spec: rawName.slice(cut).trim(),
        };
    }
    return { name: rawName, spec: '' };
}

function isCatTitle(text) {
    // Returns true when the text contains at least one of the works /
    // discipline keywords below, i.e. it may be a category (division) title.
    const titleKeywords = [
        '土建','建筑','结构','装饰','装修','安装','给排水','暖通','空调','通风',
        '电气','强电','弱电','消防','智能化','幕墙','门窗','园林','绿化','景观',
        '市政','道路','桥梁','管网','基础','地基','桩基','主体','屋面','防水',
        '保温','钢结构','排水','给水','照明','动力','防雷','电梯','人防','室外',
        '附属','分部','工程','措施','清单','土石方','混凝土','砌筑','模板','脚手架',
        '水利','河道','管道','阀门','设备','仪表','自动化','通信','网络',
        '拆除','外墙','内墙','楼地面','天棚','吊顶','栏杆','屋顶','涂料','抹灰',
        '廊道','阀门井','蓄水池','泵站','供水','引水','水源','渠道','闸门',
        '围栏','警示','检修','管线','配电','水池','水塔','取水','净水',
    ];
    for (const keyword of titleKeywords) {
        if (text.includes(keyword)) return true;
    }
    return false;
}

/**
 * 判断分部标题是否为"费用类"（不应创建分部分类）
 * 如：规费、税金、措施项目费、其他项目费 等非施工类分部
 */
function isFeeCatTitle(text) {
    // Returns true when a heading names a fee section (statutory fees, taxes,
    // measure / other item fees, ...) for which no category should be created.
    // Whitespace inside the heading is ignored; empty input is not a fee title.
    if (!text) return false;
    const compact = text.replace(/\s+/g, '');

    // Headings that count only when they match as a whole.
    const wholeTitles = new Set([
        '规费', '税金', '利润', '增值税', '暂列金额', '暂估价', '计日工',
        '总承包服务费', '企业管理费', '价税合计',
        '措施项目费', '其他项目费', '不可竞争费',
    ]);
    if (wholeTitles.has(compact)) return true;

    // Fragments that mark a fee heading wherever they occur.
    const feeFragments = [
        '措施项目费', '其他项目费', '不可竞争费',
        '规费汇总', '税金汇总', '费率', '费用汇总', '费用合计',
        '暂列金额', '暂估价', '计日工', '总承包服务费',
        '安全文明施工费', '社会保障费', '住房公积金',
        '工伤保险', '教育费附加', '城市维护建设税',
    ];
    return feeFragments.some((fragment) => compact.includes(fragment));
}