You've already forked DataMate
feat(generation_service): add image URL extraction and random QA generation logic (#182)
This commit is contained in:
@@ -1,5 +1,7 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
import random
|
||||||
|
import re
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
from langchain_core.language_models import BaseChatModel
|
from langchain_core.language_models import BaseChatModel
|
||||||
@@ -36,6 +38,13 @@ def _filter_docs(split_docs, chunk_size):
|
|||||||
return filtered_docs
|
return filtered_docs
|
||||||
|
|
||||||
|
|
||||||
|
# Markdown inline image: ``![alt](target)``. The alt text may be empty or any
# run of characters other than ``]``; the target is captured lazily up to the
# first closing parenthesis, exactly as the original empty-alt pattern did.
_MD_IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\((.*?)\)")


def extract_img_urls(doc):
    """Extract image addresses from Markdown text.

    Scans ``doc`` for inline Markdown images of the form ``![alt](target)``
    (the alt text may be empty) and returns whatever is written between the
    parentheses for each one.

    Args:
        doc: Markdown text to scan. ``None`` or an empty string is accepted.

    Returns:
        A list of the captured targets in order of appearance; an empty list
        when ``doc`` is empty/``None`` or contains no images.
    """
    # Guard against None/empty chunks so callers do not have to pre-check;
    # re.findall would raise TypeError on None.
    if not doc:
        return []
    # Collect every captured target in document order.
    return _MD_IMAGE_PATTERN.findall(doc)
|
||||||
|
|
||||||
class GenerationService:
|
class GenerationService:
|
||||||
def __init__(self, db: AsyncSession):
|
def __init__(self, db: AsyncSession):
|
||||||
self.db = db
|
self.db = db
|
||||||
@@ -226,6 +235,15 @@ class GenerationService:
|
|||||||
|
|
||||||
已经进入后续流程的任务(例如其它协程正在生成答案)允许自然执行完。
|
已经进入后续流程的任务(例如其它协程正在生成答案)允许自然执行完。
|
||||||
"""
|
"""
|
||||||
|
# 随机决定是否对当前 chunk 进行 QA 生成
|
||||||
|
if random.random() > question_cfg.temperature:
|
||||||
|
logger.info(
|
||||||
|
f"Skip QA generation for chunk_index={chunk.chunk_index} in file_task={file_task.id} due to random decision."
|
||||||
|
)
|
||||||
|
# 更新文件任务的 processed_chunks 计数
|
||||||
|
await self._increment_processed_chunks(file_task.id, 1)
|
||||||
|
return False
|
||||||
|
|
||||||
# 如果没有全局上限配置,维持原有行为
|
# 如果没有全局上限配置,维持原有行为
|
||||||
if max_qa_pairs is not None and max_qa_pairs > 0:
|
if max_qa_pairs is not None and max_qa_pairs > 0:
|
||||||
from sqlalchemy import func
|
from sqlalchemy import func
|
||||||
@@ -411,6 +429,11 @@ class GenerationService:
|
|||||||
base_obj["instruction"] = question
|
base_obj["instruction"] = question
|
||||||
data_obj = base_obj
|
data_obj = base_obj
|
||||||
|
|
||||||
|
# 提取图片URL
|
||||||
|
img_urls = extract_img_urls(chunk_text)
|
||||||
|
if img_urls:
|
||||||
|
data_obj["img_urls"] = img_urls
|
||||||
|
|
||||||
record = SynthesisData(
|
record = SynthesisData(
|
||||||
id=str(uuid.uuid4()),
|
id=str(uuid.uuid4()),
|
||||||
data=data_obj,
|
data=data_obj,
|
||||||
|
|||||||
Reference in New Issue
Block a user