// DataMate/frontend/src/mock/knowledgeBase.tsx

/**
 * Mock document chunks for the knowledge-base views.
 *
 * Produces 23 chunks with 1-based `id`/`position`. Fields backed by
 * `Math.random()` (`tokens`, `embedding`, `similarity`, `vectorId`,
 * `sliceOperator`, `parentChunkId`) differ on every module load, so callers
 * must not rely on their concrete values.
 */
export const mockChunks = Array.from({ length: 23 }, (_, i) => {
  // 1-based ordinal shared by id, position, the content text and the vector id.
  const ordinal = i + 1;

  return {
    id: ordinal,
    content: `这是第 ${ordinal} 个文档分块的内容示例。在实际应用中，这里会显示从原始文档中提取和分割的具体文本内容。用户可以在这里查看和编辑分块的内容，确保知识库的质量和准确性。这个分块包含了重要的业务信息和技术细节，需要仔细维护以确保检索的准确性。`,
    position: ordinal,
    // Integer in [100, 299].
    tokens: Math.floor(Math.random() * 200) + 100,
    // 1536 random components, each in [-0.5, 0.5).
    embedding: Array.from({ length: 1536 }, () => Math.random() - 0.5),
    // NOTE: `toFixed` returns a string ("0.700" .. "1.000"), not a number.
    similarity: (Math.random() * 0.3 + 0.7).toFixed(3),
    createdAt: "2024-01-22 10:35",
    updatedAt: "2024-01-22 10:35",
    // `slice(2, 11)` drops the leading "0." and keeps up to 9 base-36
    // characters; it replaces the deprecated `substr(2, 9)`.
    vectorId: `vec_${ordinal}_${Math.random().toString(36).slice(2, 11)}`,
    sliceOperator: ["semantic-split", "paragraph-split", "table-extract"][
      Math.floor(Math.random() * 3)
    ],
    // The first chunk has no parent; every later chunk points at the id of a
    // randomly chosen earlier chunk (1..i).
    parentChunkId: i > 0 ? Math.floor(Math.random() * i) + 1 : undefined,
    metadata: {
      source: "API文档.pdf",
      // Five consecutive chunks share a page, three share a section.
      page: Math.floor(i / 5) + 1,
      section: `第${Math.floor(i / 3) + 1}章`,
    },
  };
});

// Question/answer text in display order; ids are derived from the position.
const qaPairSource: ReadonlyArray<readonly [string, string]> = [
  [
    "什么是API文档的主要用途？",
    "API文档的主要用途是为开发者提供详细的接口说明，包括请求参数、响应格式和使用示例.",
  ],
  [
    "如何正确使用这个API？",
    "使用API时需要先获取访问令牌，然后按照文档中的格式发送请求，注意处理错误响应.",
  ],
];

/** Sample question/answer pairs with sequential 1-based ids. */
export const mockQAPairs = qaPairSource.map(([question, answer], index) => ({
  id: index + 1,
  question,
  answer,
}));

/**
 * Catalogue of slice operators available in the mock data.
 *
 * Every entry carries a string `id`, a display `name` and `description`,
 * a `type` (the values used here are "text", "semantic", "structure" and
 * "custom"), an emoji `icon`, and an operator-specific `params` object whose
 * keys differ from entry to entry.
 *
 * The `id` values are referenced elsewhere in this file: `mockChunks` picks
 * its `sliceOperator` from a subset of them, and each knowledge base lists
 * ids in `config.sliceOperators`.
 *
 * NOTE(review): `SliceOperator` is not declared or imported in the visible
 * part of this file — presumably it is an ambient/global type; confirm.
 */
export const sliceOperators: SliceOperator[] = [
  // --- Plain-text splitting -------------------------------------------------
  {
    id: "paragraph-split",
    name: "段落分割",
    description: "按段落自然分割文档",
    type: "text",
    icon: "📄",
    params: { minLength: 50, maxLength: 1000 },
  },
  {
    id: "sentence-split",
    name: "句子分割",
    description: "按句子边界分割文档",
    type: "text",
    icon: "📝",
    params: { maxSentences: 5, overlap: 1 },
  },
  // --- Semantic splitting ---------------------------------------------------
  {
    id: "semantic-split",
    name: "语义分割",
    description: "基于语义相似度智能分割",
    type: "semantic",
    icon: "🧠",
    params: { threshold: 0.7, windowSize: 3 },
  },
  // Another "text" operator; kept in its original position in the list.
  {
    id: "length-split",
    name: "长度分割",
    description: "按固定字符长度分割",
    type: "text",
    icon: "📏",
    params: { chunkSize: 512, overlap: 50 },
  },
  // --- Structure-aware operators --------------------------------------------
  {
    id: "structure-split",
    name: "结构化分割",
    description: "按文档结构（标题、章节）分割",
    type: "structure",
    icon: "🏗️",
    params: { preserveHeaders: true, minSectionLength: 100 },
  },
  {
    id: "table-extract",
    name: "表格提取",
    description: "提取并单独处理表格内容",
    type: "structure",
    icon: "📊",
    params: { includeHeaders: true, mergeRows: false },
  },
  // --- Custom / extraction operators ----------------------------------------
  {
    id: "code-extract",
    name: "代码提取",
    description: "识别并提取代码块",
    type: "custom",
    icon: "💻",
    params: {
      languages: ["python", "javascript", "sql"],
      preserveIndentation: true,
    },
  },
  {
    id: "qa-extract",
    name: "问答提取",
    description: "自动识别问答格式内容",
    type: "semantic",
    icon: "❓",
    params: { confidenceThreshold: 0.8, generateAnswers: true },
  },
];

// One row per backend: [id, display name, description], in display order.
const vectorDatabaseRows: ReadonlyArray<readonly [string, string, string]> = [
  ["pinecone", "Pinecone", "云端向量数据库，高性能检索"],
  ["weaviate", "Weaviate", "开源向量数据库，支持多模态"],
  ["qdrant", "Qdrant", "高性能向量搜索引擎"],
  ["chroma", "ChromaDB", "轻量级向量数据库"],
  ["milvus", "Milvus", "分布式向量数据库"],
  ["faiss", "FAISS", "Facebook AI 相似性搜索库"],
];

/** Selectable vector database backends (id, display name, description). */
export const vectorDatabases = vectorDatabaseRows.map(
  ([id, name, description]) => ({ id, name, description })
);

/**
 * Two sample knowledge bases:
 *
 * 1. An "unstructured" base in status "ready" (progress 100) with two files
 *    and two vectorization-history entries, one of which failed.
 * 2. A "structured" base in status "vectorizing" (progress 75) with a single
 *    file and an empty history.
 *
 * `vectorDatabase` uses ids that also appear in `vectorDatabases`, and
 * `config.sliceOperators` uses ids that also appear in `sliceOperators`.
 *
 * NOTE(review): `KnowledgeBase` is not declared or imported in the visible
 * part of this file — presumably it is an ambient/global type; confirm.
 */
export const mockKnowledgeBases: KnowledgeBase[] = [
  // Knowledge base 1: fully processed, unstructured documents.
  {
    id: 1,
    name: "产品技术文档库",
    description:
      "包含所有产品相关的技术文档和API说明，支持多种格式文档的智能解析和向量化处理",
    type: "unstructured",
    status: "ready",
    fileCount: 45,
    chunkCount: 1250,
    vectorCount: 1250,
    size: "2.3 GB",
    progress: 100,
    createdAt: "2024-01-15",
    lastUpdated: "2024-01-22",
    vectorDatabase: "pinecone",
    config: {
      embeddingModel: "text-embedding-3-large",
      llmModel: "gpt-4o",
      chunkSize: 512,
      overlap: 50,
      sliceMethod: "semantic",
      enableQA: true,
      vectorDimension: 1536,
      sliceOperators: ["semantic-split", "paragraph-split", "table-extract"],
    },
    files: [
      // Uploaded file that finished processing.
      {
        id: 1,
        name: "API文档.pdf",
        type: "pdf",
        size: "2.5 MB",
        status: "completed",
        chunkCount: 156,
        progress: 100,
        uploadedAt: "2024-01-15",
        source: "upload",
        vectorizationStatus: "completed",
      },
      // Dataset-sourced file: disabled at 65% progress with a failed
      // vectorization. Only this entry carries a `datasetId`.
      {
        id: 2,
        name: "用户手册.docx",
        type: "docx",
        size: "1.8 MB",
        status: "disabled",
        chunkCount: 89,
        progress: 65,
        uploadedAt: "2024-01-22",
        source: "dataset",
        datasetId: "dataset-1",
        vectorizationStatus: "failed",
      },
    ],
    vectorizationHistory: [
      // Successful run for file 1: every processed chunk produced a vector.
      {
        id: 1,
        timestamp: "2024-01-22 14:30:00",
        operation: "create",
        fileId: 1,
        fileName: "API文档.pdf",
        chunksProcessed: 156,
        vectorsGenerated: 156,
        status: "success",
        duration: "2m 15s",
        config: {
          embeddingModel: "text-embedding-3-large",
          chunkSize: 512,
          sliceMethod: "semantic",
        },
      },
      // Failed run for file 2: no vectors generated; `error` holds the
      // failure message and is only present on this entry.
      {
        id: 2,
        timestamp: "2024-01-22 15:45:00",
        operation: "update",
        fileId: 2,
        fileName: "用户手册.docx",
        chunksProcessed: 89,
        vectorsGenerated: 0,
        status: "failed",
        duration: "0m 45s",
        config: {
          embeddingModel: "text-embedding-3-large",
          chunkSize: 512,
          sliceMethod: "semantic",
        },
        error: "向量化服务连接超时",
      },
    ],
  },
  // Knowledge base 2: structured FAQ data, vectorization still in progress
  // (750 of 890 chunks have vectors).
  {
    id: 2,
    name: "FAQ结构化知识库",
    description: "客服常见问题的结构化问答对，支持快速检索和智能匹配",
    type: "structured",
    status: "vectorizing",
    fileCount: 12,
    chunkCount: 890,
    vectorCount: 750,
    size: "156 MB",
    progress: 75,
    createdAt: "2024-01-20",
    lastUpdated: "2024-01-23",
    vectorDatabase: "weaviate",
    // Unlike knowledge base 1, this config has no `llmModel` entry.
    config: {
      embeddingModel: "text-embedding-ada-002",
      chunkSize: 256,
      overlap: 0,
      sliceMethod: "paragraph",
      enableQA: false,
      vectorDimension: 1536,
      sliceOperators: ["qa-extract", "paragraph-split"],
    },
    files: [
      {
        id: 3,
        name: "FAQ模板.xlsx",
        type: "xlsx",
        size: "450 KB",
        status: "vectorizing",
        chunkCount: 234,
        progress: 75,
        uploadedAt: "2024-01-20",
        source: "upload",
        vectorizationStatus: "processing",
      },
    ],
    vectorizationHistory: [],
  },
];