// Mock data module (TypeScript, 255 lines, 7.0 KiB) from the DataMate repository:
// document chunks, Q&A pairs, slice operators, vector databases and knowledge bases.
/**
 * Mock document chunks: 23 entries generated once at module load.
 *
 * `id` and `position` are 1-based and follow the generation index. Several
 * fields are produced with `Math.random()` and therefore differ between loads:
 * `tokens`, `embedding`, `similarity`, `vectorId`, `sliceOperator` and
 * `parentChunkId`.
 *
 * Note that `similarity` is a string (the result of `toFixed(3)`), not a number.
 */
export const mockChunks = Array.from({ length: 23 }, (_, i) => ({
  id: i + 1,
  content: `这是第 ${i + 1} 个文档分块的内容示例。在实际应用中,这里会显示从原始文档中提取和分割的具体文本内容。用户可以在这里查看和编辑分块的内容,确保知识库的质量和准确性。这个分块包含了重要的业务信息和技术细节,需要仔细维护以确保检索的准确性。`,
  position: i + 1,
  // Integer in [100, 299].
  tokens: Math.floor(Math.random() * 200) + 100,
  // 1536 random components, each in [-0.5, 0.5).
  embedding: Array.from({ length: 1536 }, () => Math.random() - 0.5),
  // Value in [0.7, 1.0) formatted with three decimals.
  similarity: (Math.random() * 0.3 + 0.7).toFixed(3),
  createdAt: "2024-01-22 10:35",
  updatedAt: "2024-01-22 10:35",
  // `slice(2, 11)` takes up to nine base-36 characters after the "0." prefix;
  // it replaces the deprecated `substr(2, 9)` with identical output.
  vectorId: `vec_${i + 1}_${Math.random().toString(36).slice(2, 11)}`,
  sliceOperator: ["semantic-split", "paragraph-split", "table-extract"][
    Math.floor(Math.random() * 3)
  ],
  // The first chunk has no parent; later chunks point at a random earlier id.
  parentChunkId: i > 0 ? Math.floor(Math.random() * i) + 1 : undefined,
  metadata: {
    source: "API文档.pdf",
    // Five chunks per page, three chunks per section.
    page: Math.floor(i / 5) + 1,
    section: `第${Math.floor(i / 3) + 1}章`,
  },
}));
/**
 * Mock question/answer pairs.
 *
 * Each entry carries a numeric `id`, a `question` and its `answer`.
 */
export const mockQAPairs = [
  {
    id: 1,
    question: "什么是API文档的主要用途?",
    answer:
      "API文档的主要用途是为开发者提供详细的接口说明,包括请求参数、响应格式和使用示例.",
  },
  {
    id: 2,
    question: "如何正确使用这个API?",
    answer:
      "使用API时需要先获取访问令牌,然后按照文档中的格式发送请求,注意处理错误响应.",
  },
];
/**
 * Catalogue of slice operators.
 *
 * Every entry has a string `id`, a display `name`, a `description`, a `type`
 * ("text", "semantic", "structure" or "custom"), an emoji `icon` and an
 * operator-specific `params` object. The ids listed here are the same strings
 * used by `mockChunks[].sliceOperator` and by `config.sliceOperators` in
 * `mockKnowledgeBases`.
 */
export const sliceOperators: SliceOperator[] = [
  {
    id: "paragraph-split",
    name: "段落分割",
    description: "按段落自然分割文档",
    type: "text",
    icon: "📄",
    params: { minLength: 50, maxLength: 1000 },
  },
  {
    id: "sentence-split",
    name: "句子分割",
    description: "按句子边界分割文档",
    type: "text",
    icon: "📝",
    params: { maxSentences: 5, overlap: 1 },
  },
  {
    id: "semantic-split",
    name: "语义分割",
    description: "基于语义相似度智能分割",
    type: "semantic",
    icon: "🧠",
    params: { threshold: 0.7, windowSize: 3 },
  },
  {
    id: "length-split",
    name: "长度分割",
    description: "按固定字符长度分割",
    type: "text",
    icon: "📏",
    params: { chunkSize: 512, overlap: 50 },
  },
  {
    id: "structure-split",
    name: "结构化分割",
    description: "按文档结构(标题、章节)分割",
    type: "structure",
    icon: "🏗️",
    params: { preserveHeaders: true, minSectionLength: 100 },
  },
  {
    id: "table-extract",
    name: "表格提取",
    description: "提取并单独处理表格内容",
    type: "structure",
    icon: "📊",
    params: { includeHeaders: true, mergeRows: false },
  },
  {
    id: "code-extract",
    name: "代码提取",
    description: "识别并提取代码块",
    type: "custom",
    icon: "💻",
    params: {
      languages: ["python", "javascript", "sql"],
      preserveIndentation: true,
    },
  },
  {
    id: "qa-extract",
    name: "问答提取",
    description: "自动识别问答格式内容",
    type: "semantic",
    icon: "❓",
    params: { confidenceThreshold: 0.8, generateAnswers: true },
  },
];
/**
 * Vector database options.
 *
 * Each entry has a string `id`, a display `name` and a short `description`.
 * The `vectorDatabase` field of the entries in `mockKnowledgeBases` uses ids
 * from this list ("pinecone", "weaviate").
 */
export const vectorDatabases = [
  {
    id: "pinecone",
    name: "Pinecone",
    description: "云端向量数据库,高性能检索",
  },
  {
    id: "weaviate",
    name: "Weaviate",
    description: "开源向量数据库,支持多模态",
  },
  {
    id: "qdrant",
    name: "Qdrant",
    description: "高性能向量搜索引擎",
  },
  {
    id: "chroma",
    name: "ChromaDB",
    description: "轻量级向量数据库",
  },
  {
    id: "milvus",
    name: "Milvus",
    description: "分布式向量数据库",
  },
  {
    id: "faiss",
    name: "FAISS",
    description: "Facebook AI 相似性搜索库",
  },
];
/**
 * Mock knowledge bases.
 *
 * Two entries are provided:
 *  - id 1: an "unstructured" knowledge base in status "ready" with two files
 *    and two vectorization history records (one "success", one "failed").
 *  - id 2: a "structured" knowledge base in status "vectorizing" with one
 *    file and an empty vectorization history.
 *
 * `vectorDatabase` holds an id from `vectorDatabases`, and
 * `config.sliceOperators` holds ids from `sliceOperators`.
 */
export const mockKnowledgeBases: KnowledgeBase[] = [
  {
    id: 1,
    name: "产品技术文档库",
    description:
      "包含所有产品相关的技术文档和API说明,支持多种格式文档的智能解析和向量化处理",
    type: "unstructured",
    status: "ready",
    fileCount: 45,
    chunkCount: 1250,
    vectorCount: 1250,
    size: "2.3 GB",
    progress: 100,
    createdAt: "2024-01-15",
    lastUpdated: "2024-01-22",
    vectorDatabase: "pinecone",
    config: {
      embeddingModel: "text-embedding-3-large",
      llmModel: "gpt-4o",
      chunkSize: 512,
      overlap: 50,
      sliceMethod: "semantic",
      enableQA: true,
      vectorDimension: 1536,
      sliceOperators: ["semantic-split", "paragraph-split", "table-extract"],
    },
    files: [
      {
        id: 1,
        name: "API文档.pdf",
        type: "pdf",
        size: "2.5 MB",
        status: "completed",
        chunkCount: 156,
        progress: 100,
        uploadedAt: "2024-01-15",
        source: "upload",
        vectorizationStatus: "completed",
      },
      {
        id: 2,
        name: "用户手册.docx",
        type: "docx",
        size: "1.8 MB",
        status: "disabled",
        chunkCount: 89,
        progress: 65,
        uploadedAt: "2024-01-22",
        // Only this file comes from a dataset, so only it carries `datasetId`.
        source: "dataset",
        datasetId: "dataset-1",
        vectorizationStatus: "failed",
      },
    ],
    vectorizationHistory: [
      {
        id: 1,
        timestamp: "2024-01-22 14:30:00",
        operation: "create",
        fileId: 1,
        fileName: "API文档.pdf",
        chunksProcessed: 156,
        vectorsGenerated: 156,
        status: "success",
        duration: "2m 15s",
        config: {
          embeddingModel: "text-embedding-3-large",
          chunkSize: 512,
          sliceMethod: "semantic",
        },
      },
      {
        id: 2,
        timestamp: "2024-01-22 15:45:00",
        operation: "update",
        fileId: 2,
        fileName: "用户手册.docx",
        chunksProcessed: 89,
        vectorsGenerated: 0,
        status: "failed",
        duration: "0m 45s",
        config: {
          embeddingModel: "text-embedding-3-large",
          chunkSize: 512,
          sliceMethod: "semantic",
        },
        // Present only on the failed record.
        error: "向量化服务连接超时",
      },
    ],
  },
  {
    id: 2,
    name: "FAQ结构化知识库",
    description: "客服常见问题的结构化问答对,支持快速检索和智能匹配",
    type: "structured",
    status: "vectorizing",
    fileCount: 12,
    chunkCount: 890,
    vectorCount: 750,
    size: "156 MB",
    progress: 75,
    createdAt: "2024-01-20",
    lastUpdated: "2024-01-23",
    vectorDatabase: "weaviate",
    // Unlike the first entry, this config has no `llmModel` key.
    config: {
      embeddingModel: "text-embedding-ada-002",
      chunkSize: 256,
      overlap: 0,
      sliceMethod: "paragraph",
      enableQA: false,
      vectorDimension: 1536,
      sliceOperators: ["qa-extract", "paragraph-split"],
    },
    files: [
      {
        id: 3,
        name: "FAQ模板.xlsx",
        type: "xlsx",
        size: "450 KB",
        status: "vectorizing",
        chunkCount: 234,
        progress: 75,
        uploadedAt: "2024-01-20",
        source: "upload",
        vectorizationStatus: "processing",
      },
    ],
    vectorizationHistory: [],
  },
];