2026-04-20 16:21:06 +08:00

180 lines
7.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
招标文件解析模块
流程:提取文本 → 生成摘要 → 提取评分要求 → 结构化JSON
"""
import json
import logging
import re
import sqlite3
from datetime import datetime
from utils import ai_client, prompts as P
from utils.file_utils import extract_text, truncate_text
from utils.tender_kind_sections import (
get_tender_kind_classify_prompt,
parse_tender_kind_response,
)
logger = logging.getLogger(__name__)
def parse_boq_file(db_path: str, project_id: int, file_path: str, file_name: str) -> None:
    """Parse a bill-of-quantities (BOQ) file and persist the result.

    Meant to run in a background thread. Pipeline: extract page text →
    local structural analysis → AI summary → write to ``tender_data``.
    ``boq_status`` moves ``none → parsing → done / error``.

    Args:
        db_path: Path of the SQLite database to open for this run.
        project_id: Key of the ``tender_data`` row to update.
        file_path: Location of the BOQ file to read.
        file_name: Name stored in the ``boq_file_name`` column.

    Returns:
        None. Exceptions raised while parsing are caught, logged, and
        recorded on the row as ``boq_status='error'`` with the exception text.
    """
    from utils.bill_analysis import analyze_boq_pages, categories_to_prompt_appendix
    from utils.boq_parser import extract_boq_pages

    conn = sqlite3.connect(db_path)
    try:
        _set_boq_status(conn, project_id, 'parsing', '正在提取工程量清单文本...')
        page_texts = extract_boq_pages(file_path)
        boq_text = '\n'.join(page_texts).strip()
        if not boq_text:
            raise ValueError('未能从文件中提取到有效内容,请检查文件格式')

        _set_boq_status(conn, project_id, 'parsing', '正在本地解析清单结构...')
        analysis = analyze_boq_pages(page_texts)
        boq_analysis_json = json.dumps(analysis, ensure_ascii=False)
        # The structured appendix is only built when the analysis is not
        # flagged as 'scanned' or 'no_bill_pages'; otherwise the prompt gets
        # an empty appendix.
        structured = ''
        if not analysis.get('scanned') and not analysis.get('no_bill_pages'):
            structured = categories_to_prompt_appendix(analysis)

        _set_boq_status(conn, project_id, 'parsing', '正在生成工程量清单摘要...')
        # The prompt sees at most the first 10000 characters of the text.
        summary_prompt = P.get_boq_summary_prompt(boq_text[:10000], structured)
        boq_summary = ai_client.chat(summary_prompt, temperature=0.2, max_tokens=2048)

        # sqlite3's implicit datetime adapter is deprecated since Python 3.12;
        # bind the same 'YYYY-MM-DD HH:MM:SS[.ffffff]' text explicitly.
        updated_at = datetime.now().isoformat(' ')
        cur = conn.cursor()
        # NOTE(review): plain UPDATE — if no tender_data row exists for this
        # project the result is silently not stored; confirm callers create it.
        cur.execute('''
            UPDATE tender_data
            SET boq_file_name=?, boq_text=?, boq_summary=?, boq_analysis_json=?,
                boq_status='done', boq_error='', updated_at=?
            WHERE project_id=?
        ''', (file_name, boq_text[:12000], boq_summary, boq_analysis_json,
              updated_at, project_id))
        conn.commit()
        logger.info('项目 %s 工程量清单解析完成', project_id)
    except Exception as e:
        # Top-level boundary of the worker: log the traceback and surface the
        # failure to the UI through the status columns instead of raising.
        logger.exception('工程量清单解析失败 project_id=%s', project_id)
        _set_boq_status(conn, project_id, 'error', str(e))
    finally:
        conn.close()
def _set_boq_status(conn, project_id, status, message=''):
    """Record BOQ parsing state on the project's ``tender_data`` row.

    Args:
        conn: Open database connection; the change is committed before returning.
        project_id: Key of the row to update.
        status: New value for ``boq_status``.
        message: Text written to ``boq_error`` (callers pass progress text
            as well as error text).

    Note:
        This is a plain ``UPDATE``: when no row exists for ``project_id``
        nothing is written and no error is raised.
    """
    # sqlite3's implicit datetime adapter is deprecated since Python 3.12;
    # bind the same 'YYYY-MM-DD HH:MM:SS[.ffffff]' text explicitly.
    updated_at = datetime.now().isoformat(' ')
    cur = conn.cursor()
    cur.execute('''
        UPDATE tender_data SET boq_status=?, boq_error=?, updated_at=?
        WHERE project_id=?
    ''', (status, message, updated_at, project_id))
    conn.commit()
def parse_tender_file(db_path: str, project_id: int, file_path: str, file_name: str) -> None:
    """Parse a tender document and write the results to the database.

    Meant to run in a background thread. ``status`` moves
    ``pending → parsing → done / error``.

    Steps: extract the text, generate a summary, extract the technical
    scoring requirements as Markdown, convert them to JSON, classify the
    tender kind, then upsert everything into ``tender_data``.

    Args:
        db_path: Path of the SQLite database to open for this run.
        project_id: Key of the ``tender_data`` row to create or update.
        file_path: Location of the tender document to read.
        file_name: Name stored in the ``file_name`` column.

    Returns:
        None. Exceptions raised while parsing are caught, logged, and
        recorded on the row as ``status='error'`` with the exception text.
    """
    conn = sqlite3.connect(db_path)
    try:
        _set_status(conn, project_id, 'parsing', '正在提取文件文本...')
        # 1. Extract the raw text.
        raw_text = extract_text(file_path)
        # Same guard parse_boq_file applies: without it an empty extraction
        # would be sent to the AI and stored as a successful parse.
        if not (raw_text or '').strip():
            raise ValueError('未能从文件中提取到有效内容,请检查文件格式')
        raw_text = truncate_text(raw_text, 60000)

        _set_status(conn, project_id, 'parsing', '正在生成招标摘要...')
        # 2. Generate the structured summary.
        summary_prompt = P.get_project_summary_prompt(raw_text)
        summary = ai_client.chat(summary_prompt, temperature=0.3, max_tokens=4096)

        _set_status(conn, project_id, 'parsing', '正在提取技术评分要求...')
        # 3. Extract the technical scoring requirements (Markdown).
        rating_prompt = P.get_rating_requirements_prompt(raw_text)
        rating_md = ai_client.chat(rating_prompt, temperature=0.2, max_tokens=4096)

        _set_status(conn, project_id, 'parsing', '正在结构化评分数据...')
        # 4. Convert the scoring requirements to JSON.
        rating_json_prompt = P.get_rating_json_prompt(rating_md)
        rating_json_raw = ai_client.chat(rating_json_prompt, temperature=0.1, max_tokens=2048)
        rating_json_str = _clean_json(rating_json_raw)

        _set_status(conn, project_id, 'parsing', '正在识别招标文件类型(工程/服务/货物)...')
        # 5. Classify the tender kind from the first 15000 characters only.
        excerpt = (raw_text or '')[:15000]
        kind_prompt = get_tender_kind_classify_prompt(excerpt)
        kind_raw = ai_client.chat(kind_prompt, temperature=0.1, max_tokens=32)
        tender_kind = parse_tender_kind_response(kind_raw)
        logger.info('项目 %s 招标文件类型识别为: %s', project_id, tender_kind)

        # Persist everything, then publish the final status message.
        _upsert_tender_data(conn, project_id, file_name, raw_text,
                            summary, rating_md, rating_json_str, tender_kind)
        _set_status(conn, project_id, 'done', '解析完成')
        logger.info('项目 %s 招标文件解析完成', project_id)
    except Exception as e:
        # Top-level boundary of the worker: log the traceback and surface the
        # failure to the UI through the status columns instead of raising.
        logger.exception('解析失败 project_id=%s', project_id)
        _set_status(conn, project_id, 'error', str(e))
    finally:
        conn.close()
# ─── 内部工具 ──────────────────────────────────────────────────────────────
def _set_status(conn, project_id, status, message=''):
    """Create or update the parsing status of a project's ``tender_data`` row.

    Inserts a row holding ``status`` and ``error_message`` when none exists
    for ``project_id``; otherwise overwrites both columns and refreshes
    ``updated_at``. The change is committed before returning.

    Args:
        conn: Open database connection.
        project_id: Key of the row (the upsert conflict target).
        status: New value for ``status``.
        message: Text written to ``error_message`` (callers pass progress
            text as well as error text).
    """
    # sqlite3's implicit datetime adapter is deprecated since Python 3.12;
    # bind the same 'YYYY-MM-DD HH:MM:SS[.ffffff]' text explicitly.
    updated_at = datetime.now().isoformat(' ')
    cur = conn.cursor()
    # `excluded.` refers to the values of the attempted INSERT, so status and
    # message only need to be bound once.
    cur.execute('''
        INSERT INTO tender_data (project_id, status, error_message)
        VALUES (?, ?, ?)
        ON CONFLICT(project_id) DO UPDATE SET
            status=excluded.status, error_message=excluded.error_message, updated_at=?
    ''', (project_id, status, message, updated_at))
    conn.commit()
def _upsert_tender_data(conn, project_id, file_name, raw_text,
                        summary, rating_md, rating_json_str,
                        tender_kind: str = 'engineering'):
    """Store the parsed tender results and mark the row as done.

    Inserts a ``tender_data`` row for ``project_id`` or, when one already
    exists, overwrites its parsed fields. In both cases ``status`` becomes
    ``'done'`` and ``error_message`` is cleared; ``updated_at`` is refreshed
    only on the update path. The change is committed before returning.

    Args:
        conn: Open database connection.
        project_id: Key of the row (the upsert conflict target).
        file_name: Value for ``file_name``.
        raw_text: Value for ``raw_text``.
        summary: Value for ``summary``.
        rating_md: Value for ``rating_requirements``.
        rating_json_str: Value for ``rating_json``.
        tender_kind: Value for ``tender_kind``; defaults to ``'engineering'``.
    """
    # sqlite3's implicit datetime adapter is deprecated since Python 3.12;
    # bind the same 'YYYY-MM-DD HH:MM:SS[.ffffff]' text explicitly.
    updated_at = datetime.now().isoformat(' ')
    cur = conn.cursor()
    # `excluded.` refers to the values of the attempted INSERT, which avoids
    # binding every column twice and keeping two parameter lists in sync.
    cur.execute('''
        INSERT INTO tender_data
            (project_id, file_name, raw_text, summary, rating_requirements, rating_json,
             tender_kind, status, error_message)
        VALUES (?, ?, ?, ?, ?, ?, ?, 'done', '')
        ON CONFLICT(project_id) DO UPDATE SET
            file_name=excluded.file_name,
            raw_text=excluded.raw_text,
            summary=excluded.summary,
            rating_requirements=excluded.rating_requirements,
            rating_json=excluded.rating_json,
            tender_kind=excluded.tender_kind,
            status='done',
            error_message='',
            updated_at=?
    ''', (
        project_id, file_name, raw_text, summary, rating_md, rating_json_str, tender_kind,
        updated_at,
    ))
    conn.commit()
def _clean_json(raw: str) -> str:
    """Best-effort extraction of a JSON document from an AI reply.

    Markdown code fences are stripped first. If the remaining text parses as
    JSON it is returned unchanged. Otherwise the widest ``{...}`` span is
    tried, then the widest ``[...]`` span, and the first span that parses is
    returned. The object span is tried first so replies that were already
    recoverable keep yielding the same result.

    Args:
        raw: Reply text, possibly fenced or surrounded by prose.

    Returns:
        A string that parses as JSON when one can be recovered; otherwise
        the fence-stripped text, which is NOT guaranteed to be valid JSON.
    """
    # Remove opening fences (with optional "json" tag) and closing fences.
    text = re.sub(r'```(?:json)?\s*', '', raw).strip()

    candidates = [text]
    # Greedy spans: first opening bracket to last closing bracket.
    for pattern in (r'\{[\s\S]*\}', r'\[[\s\S]*\]'):
        match = re.search(pattern, text)
        if match:
            candidates.append(match.group(0))

    for candidate in candidates:
        try:
            json.loads(candidate)
        except json.JSONDecodeError:
            continue
        return candidate
    return text