|
|
|
@@ -1,5 +1,8 @@
|
|
|
|
|
|
|
|
import csv
|
|
|
|
|
|
|
|
import csv
|
|
|
|
import datetime
|
|
|
|
import datetime
|
|
|
|
import os
|
|
|
|
import os
|
|
|
|
|
|
|
|
from io import StringIO
|
|
|
|
from pathlib import Path
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
from fastapi import HTTPException
|
|
|
|
from fastapi import HTTPException
|
|
|
|
@@ -16,18 +19,41 @@ logger = get_logger(__name__)
|
|
|
|
# --- Source file types that can be handed to the extractor -------------------
PDF_FILE_TYPE = "pdf"
DOC_FILE_TYPE = "doc"
DOCX_FILE_TYPE = "docx"
XLS_FILE_TYPE = "xls"
XLSX_FILE_TYPE = "xlsx"

# --- File types / extensions of the derived artifacts -------------------------
CSV_FILE_TYPE = "csv"
TEXT_FILE_TYPE = "txt"
CSV_FILE_EXTENSION = ".csv"
TEXT_FILE_EXTENSION = ".txt"

# Ordered groupings used to build the lookup tables below. Document types are
# extracted to plain text, spreadsheet types are converted to CSV.
_DOCUMENT_SOURCE_TYPES = (PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE)
_EXCEL_SOURCE_TYPES = (XLS_FILE_TYPE, XLSX_FILE_TYPE)

EXCEL_FILE_TYPES = set(_EXCEL_SOURCE_TYPES)
SUPPORTED_FILE_TYPES = set(_DOCUMENT_SOURCE_TYPES) | EXCEL_FILE_TYPES

# Name of the parser recorded for each source type.
PARSER_BY_FILE_TYPE = {
    PDF_FILE_TYPE: "PyPDFLoader",
    DOC_FILE_TYPE: "Docx2txtLoader",
    DOCX_FILE_TYPE: "Docx2txtLoader",
    XLS_FILE_TYPE: "xlrd",
    XLSX_FILE_TYPE: "openpyxl",
}

# Extension used to synthesize a source file name when the record has none.
DEFAULT_EXTENSION_BY_TYPE = {source_type: f".{source_type}" for source_type in SUPPORTED_FILE_TYPES}

# Extension appended to the source name to form the derived file name.
DERIVED_EXTENSION_BY_TYPE = {
    **dict.fromkeys(_DOCUMENT_SOURCE_TYPES, TEXT_FILE_EXTENSION),
    **dict.fromkeys(_EXCEL_SOURCE_TYPES, CSV_FILE_EXTENSION),
}

# File type stored on the derived file record for each source type.
DERIVED_FILE_TYPE_BY_SOURCE = {
    **dict.fromkeys(_DOCUMENT_SOURCE_TYPES, TEXT_FILE_TYPE),
    **dict.fromkeys(_EXCEL_SOURCE_TYPES, CSV_FILE_TYPE),
}

# Metadata keys linking a derived file back to the file it was produced from.
DERIVED_METADATA_KEY = "derived_from_file_id"
DERIVED_METADATA_NAME_KEY = "derived_from_file_name"
|
|
|
|
@@ -45,9 +71,11 @@ class PdfTextExtractService:
|
|
|
|
self._validate_dataset_and_file(dataset, file_record)
|
|
|
|
self._validate_dataset_and_file(dataset, file_record)
|
|
|
|
|
|
|
|
|
|
|
|
file_type = str(getattr(file_record, "file_type", "") or "").lower()
|
|
|
|
file_type = str(getattr(file_record, "file_type", "") or "").lower()
|
|
|
|
|
|
|
|
derived_file_type = DERIVED_FILE_TYPE_BY_SOURCE.get(file_type, TEXT_FILE_TYPE)
|
|
|
|
|
|
|
|
assert derived_file_type
|
|
|
|
source_path = self._resolve_source_path(file_record)
|
|
|
|
source_path = self._resolve_source_path(file_record)
|
|
|
|
dataset_path = self._resolve_dataset_path(dataset)
|
|
|
|
dataset_path = self._resolve_dataset_path(dataset)
|
|
|
|
target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id)
|
|
|
|
target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id, file_type)
|
|
|
|
|
|
|
|
|
|
|
|
existing_record = await self._find_existing_text_record(dataset_id, target_path)
|
|
|
|
existing_record = await self._find_existing_text_record(dataset_id, target_path)
|
|
|
|
if existing_record:
|
|
|
|
if existing_record:
|
|
|
|
@@ -56,14 +84,18 @@ class PdfTextExtractService:
|
|
|
|
if target_path.exists():
|
|
|
|
if target_path.exists():
|
|
|
|
file_size = self._get_file_size(target_path)
|
|
|
|
file_size = self._get_file_size(target_path)
|
|
|
|
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "")
|
|
|
|
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "")
|
|
|
|
record = await self._create_text_file_record(dataset, file_record, target_path, file_size, parser_name)
|
|
|
|
record = await self._create_text_file_record(
|
|
|
|
|
|
|
|
dataset, file_record, target_path, file_size, parser_name, derived_file_type
|
|
|
|
|
|
|
|
)
|
|
|
|
return self._build_response(dataset_id, file_id, record)
|
|
|
|
return self._build_response(dataset_id, file_id, record)
|
|
|
|
|
|
|
|
|
|
|
|
text_content, parser_name = self._parse_document(source_path, file_type)
|
|
|
|
text_content, parser_name = self._parse_document(source_path, file_type)
|
|
|
|
assert isinstance(text_content, str)
|
|
|
|
assert isinstance(text_content, str)
|
|
|
|
self._write_text_file(target_path, text_content)
|
|
|
|
self._write_text_file(target_path, text_content)
|
|
|
|
file_size = self._get_file_size(target_path)
|
|
|
|
file_size = self._get_file_size(target_path)
|
|
|
|
record = await self._create_text_file_record(dataset, file_record, target_path, file_size, parser_name)
|
|
|
|
record = await self._create_text_file_record(
|
|
|
|
|
|
|
|
dataset, file_record, target_path, file_size, parser_name, derived_file_type
|
|
|
|
|
|
|
|
)
|
|
|
|
return self._build_response(dataset_id, file_id, record)
|
|
|
|
return self._build_response(dataset_id, file_id, record)
|
|
|
|
|
|
|
|
|
|
|
|
async def _get_dataset(self, dataset_id: str) -> Dataset:
|
|
|
|
async def _get_dataset(self, dataset_id: str) -> Dataset:
|
|
|
|
@@ -92,7 +124,7 @@ class PdfTextExtractService:
|
|
|
|
raise HTTPException(status_code=400, detail="仅支持文本类型数据集")
|
|
|
|
raise HTTPException(status_code=400, detail="仅支持文本类型数据集")
|
|
|
|
file_type = str(getattr(file_record, "file_type", "") or "").lower()
|
|
|
|
file_type = str(getattr(file_record, "file_type", "") or "").lower()
|
|
|
|
if file_type not in SUPPORTED_FILE_TYPES:
|
|
|
|
if file_type not in SUPPORTED_FILE_TYPES:
|
|
|
|
raise HTTPException(status_code=400, detail="仅支持 PDF/DOC/DOCX 文件解析")
|
|
|
|
raise HTTPException(status_code=400, detail="仅支持 PDF/DOC/DOCX/XLS/XLSX 文件解析")
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
@staticmethod
|
|
|
|
def _resolve_source_path(file_record: DatasetFiles) -> Path:
|
|
|
|
def _resolve_source_path(file_record: DatasetFiles) -> Path:
|
|
|
|
@@ -111,13 +143,13 @@ class PdfTextExtractService:
|
|
|
|
return dataset_path
|
|
|
|
return dataset_path
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
def _build_output_filename(file_record: DatasetFiles, file_id: str, file_type: str) -> str:
    """Return the file name of the artifact derived from ``file_record``.

    The derived extension registered for ``file_type`` (the text extension
    when the type has no entry) is appended to the source file's name, so
    ``report.xlsx`` becomes ``report.xlsx.csv``. If the record carries no
    usable name, ``<file_id><default extension>`` is used in its place.

    Args:
        file_record: Source file record; only its ``file_name`` is read.
        file_id: Identifier used to synthesize a name when none is stored.
        file_type: Lower-cased source file type, e.g. ``"pdf"`` or ``"xlsx"``.

    Returns:
        The derived file name (no directory component).
    """
    derived_suffix = DERIVED_EXTENSION_BY_TYPE.get(file_type, TEXT_FILE_EXTENSION)
    stored_name = str(getattr(file_record, "file_name", "") or "").strip()
    if stored_name:
        return f"{stored_name}{derived_suffix}"

    # No stored name: fall back to the file id plus the type's usual extension.
    fallback_suffix = DEFAULT_EXTENSION_BY_TYPE.get(file_type, f".{file_type}")
    return f"{file_id}{fallback_suffix}{derived_suffix}"
|
|
|
|
|
|
|
|
|
|
|
|
def _resolve_target_path(
|
|
|
|
def _resolve_target_path(
|
|
|
|
self,
|
|
|
|
self,
|
|
|
|
@@ -125,8 +157,9 @@ class PdfTextExtractService:
|
|
|
|
source_path: Path,
|
|
|
|
source_path: Path,
|
|
|
|
file_record: DatasetFiles,
|
|
|
|
file_record: DatasetFiles,
|
|
|
|
file_id: str,
|
|
|
|
file_id: str,
|
|
|
|
|
|
|
|
file_type: str,
|
|
|
|
) -> Path:
|
|
|
|
) -> Path:
|
|
|
|
output_name = self._build_output_filename(file_record, file_id)
|
|
|
|
output_name = self._build_output_filename(file_record, file_id, file_type)
|
|
|
|
if dataset_path in source_path.parents:
|
|
|
|
if dataset_path in source_path.parents:
|
|
|
|
target_dir = source_path.parent
|
|
|
|
target_dir = source_path.parent
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
@@ -151,6 +184,10 @@ class PdfTextExtractService:
|
|
|
|
if file_type == PDF_FILE_TYPE:
|
|
|
|
if file_type == PDF_FILE_TYPE:
|
|
|
|
loader = PyPDFLoader(str(source_path))
|
|
|
|
loader = PyPDFLoader(str(source_path))
|
|
|
|
parser_name = PARSER_BY_FILE_TYPE[PDF_FILE_TYPE]
|
|
|
|
parser_name = PARSER_BY_FILE_TYPE[PDF_FILE_TYPE]
|
|
|
|
|
|
|
|
elif file_type in EXCEL_FILE_TYPES:
|
|
|
|
|
|
|
|
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "excel")
|
|
|
|
|
|
|
|
csv_content = PdfTextExtractService._parse_excel_to_csv(source_path, file_type)
|
|
|
|
|
|
|
|
return csv_content, parser_name
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
loader = Docx2txtLoader(str(source_path))
|
|
|
|
loader = Docx2txtLoader(str(source_path))
|
|
|
|
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "Docx2txtLoader")
|
|
|
|
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "Docx2txtLoader")
|
|
|
|
@@ -158,6 +195,53 @@ class PdfTextExtractService:
|
|
|
|
contents = [doc.page_content for doc in docs if doc.page_content]
|
|
|
|
contents = [doc.page_content for doc in docs if doc.page_content]
|
|
|
|
return "\n\n".join(contents), parser_name
|
|
|
|
return "\n\n".join(contents), parser_name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
def _parse_excel_to_csv(source_path: Path, file_type: str) -> str:
    """Flatten every worksheet of an Excel workbook into one CSV string.

    Rows are written sheet by sheet in workbook order. When the workbook has
    more than one worksheet, the sheet name is prepended to each row as an
    extra first column so rows stay attributable to their sheet.

    Args:
        source_path: Location of the workbook on disk.
        file_type: ``XLSX_FILE_TYPE`` (read with openpyxl) or
            ``XLS_FILE_TYPE`` (read with xlrd).

    Returns:
        The CSV text produced by ``csv.writer`` with its default dialect.

    Raises:
        HTTPException: 400 for any other ``file_type``; 500 when the required
            reader library is not installed or the conversion fails.
    """
    # newline="" keeps the csv module's own line terminators untouched.
    output = StringIO(newline="")
    writer = csv.writer(output)
    try:
        if file_type == XLSX_FILE_TYPE:
            try:
                from openpyxl import load_workbook
            except ImportError as exc:
                raise HTTPException(status_code=500, detail="缺少 openpyxl 依赖") from exc
            workbook = load_workbook(filename=str(source_path), read_only=True, data_only=True)
            try:
                # Use `worksheets` rather than `sheetnames`: the latter also
                # lists chart sheets, which have no rows to iterate and would
                # make the whole conversion fail.
                worksheets = workbook.worksheets
                include_sheet_name = len(worksheets) > 1
                for sheet in worksheets:
                    for row in sheet.iter_rows(values_only=True):
                        row_values = list(row)
                        if include_sheet_name:
                            row_values.insert(0, sheet.title)
                        writer.writerow(row_values)
            finally:
                # Read-only workbooks keep the source file open until closed.
                workbook.close()
        elif file_type == XLS_FILE_TYPE:
            try:
                import xlrd
            except ImportError as exc:
                raise HTTPException(status_code=500, detail="缺少 xlrd 依赖") from exc
            workbook = xlrd.open_workbook(str(source_path))
            sheets = workbook.sheets()
            include_sheet_name = len(sheets) > 1
            for sheet in sheets:
                for row_index in range(sheet.nrows):
                    row_values = [sheet.name] if include_sheet_name else []
                    for cell in sheet.row(row_index):
                        value = cell.value
                        if cell.ctype == xlrd.XL_CELL_DATE:
                            # xlrd exposes dates as serial floats; convert them so
                            # the CSV holds a readable timestamp, as the XLSX
                            # branch does.
                            try:
                                value = xlrd.xldate_as_datetime(value, workbook.datemode)
                            except (ValueError, OverflowError):
                                # Unrepresentable serial: keep the raw number
                                # rather than failing the whole file.
                                value = cell.value
                        row_values.append(value)
                    writer.writerow(row_values)
        else:
            raise HTTPException(status_code=400, detail="不支持的 Excel 文件格式")
    except HTTPException:
        # Already a well-formed API error; do not rewrap it as a 500.
        raise
    except Exception as exc:
        # logger.exception records the traceback along with the message.
        logger.exception("Excel 转 CSV 失败: %s", exc)
        raise HTTPException(status_code=500, detail="Excel 转 CSV 失败") from exc
    return output.getvalue()
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
@staticmethod
|
|
|
|
def _write_text_file(target_path: Path, content: str) -> None:
|
|
|
|
def _write_text_file(target_path: Path, content: str) -> None:
|
|
|
|
with open(target_path, "w", encoding="utf-8") as handle:
|
|
|
|
with open(target_path, "w", encoding="utf-8") as handle:
|
|
|
|
@@ -177,8 +261,10 @@ class PdfTextExtractService:
|
|
|
|
target_path: Path,
|
|
|
|
target_path: Path,
|
|
|
|
file_size: int,
|
|
|
|
file_size: int,
|
|
|
|
parser_name: str,
|
|
|
|
parser_name: str,
|
|
|
|
|
|
|
|
derived_file_type: str,
|
|
|
|
) -> DatasetFiles:
|
|
|
|
) -> DatasetFiles:
|
|
|
|
assert parser_name
|
|
|
|
assert parser_name
|
|
|
|
|
|
|
|
assert derived_file_type
|
|
|
|
metadata = {
|
|
|
|
metadata = {
|
|
|
|
DERIVED_METADATA_KEY: str(getattr(source_file, "id", "")),
|
|
|
|
DERIVED_METADATA_KEY: str(getattr(source_file, "id", "")),
|
|
|
|
DERIVED_METADATA_NAME_KEY: str(getattr(source_file, "file_name", "")),
|
|
|
|
DERIVED_METADATA_NAME_KEY: str(getattr(source_file, "file_name", "")),
|
|
|
|
@@ -189,7 +275,7 @@ class PdfTextExtractService:
|
|
|
|
dataset_id=dataset.id, # type: ignore[arg-type]
|
|
|
|
dataset_id=dataset.id, # type: ignore[arg-type]
|
|
|
|
file_name=target_path.name,
|
|
|
|
file_name=target_path.name,
|
|
|
|
file_path=str(target_path),
|
|
|
|
file_path=str(target_path),
|
|
|
|
file_type=TEXT_FILE_TYPE,
|
|
|
|
file_type=derived_file_type,
|
|
|
|
file_size=file_size,
|
|
|
|
file_size=file_size,
|
|
|
|
dataset_filemetadata=metadata,
|
|
|
|
dataset_filemetadata=metadata,
|
|
|
|
last_access_time=datetime.datetime.now(datetime.UTC),
|
|
|
|
last_access_time=datetime.datetime.now(datetime.UTC),
|
|
|
|
|