init datamate

This commit is contained in:
Dallas98
2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='WordFormatter',
module_path="ops.formatter.word_formatter.process")

View File

@@ -0,0 +1,16 @@
name: 'Word文本抽取'
name_en: 'Word Text Extraction'
description: '抽取Word中的文本'
description_en: 'Extracts text from Word files.'
language: 'java'
vendor: 'huawei'
raw_id: 'WordFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,68 @@
# # -- encoding: utf-8 --
#
# Description:
# Create: 2024/1/30 15:24
# """
from loguru import logger
import os
import subprocess
import time
from typing import Dict, Any
from datamate.common.utils import check_valid_path
from datamate.core.base_op import Mapper
class WordFormatter(Mapper):
SEPERATOR = ' | '
def __init__(self, *args, **kwargs):
super(WordFormatter, self).__init__(*args, **kwargs)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
file_name = sample[self.filename_key]
file_path = sample[self.filepath_key]
file_type = sample[self.filetype_key]
txt_content = self.word2html(file_path, file_type)
sample[self.text_key] = txt_content
logger.info(f"fileName: {file_name}, method: WordFormatter costs {(time.time() - start):6f} s")
return sample
@staticmethod
def word2html(file_path, file_type):
check_valid_path(file_path)
file_dir = file_path.rsplit('/', 1)[0]
file_name = file_path.rsplit('/', 1)[1]
html_file_path = os.path.join(file_dir, f"{file_name}.txt")
current_file_path = os.path.dirname(os.path.abspath(__file__))
try:
process = subprocess.Popen(
['java', '-jar', f'{current_file_path}/../../../java_operator/WordFormatter-1.0.jar', file_path,
html_file_path, file_type], shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
stdout, stderr = process.communicate(timeout=24 * 60 * 60)
if process.returncode == 0:
logger.info(f"Convert {file_path} successfully to DOCX")
else:
logger.info(f"Convert {file_path} failed, error: {stderr.strip().decode('utf-8')}.")
raise RuntimeError()
except subprocess.CalledProcessError as e:
logger.error(f"Convert failed: {e}, return code: {e.returncode}")
except FileNotFoundError:
logger.error("LibreOffice command not found, please make sure it is available in PATH")
except Exception as e:
logger.error(f"An unexpected error occurred, convert failed: {e}", )
try:
with open(html_file_path, 'r', encoding='utf-8') as file:
txt_content = file.read()
os.remove(html_file_path)
logger.info("Tmp docx file removed")
except FileNotFoundError:
logger.error(f"Tmp file {html_file_path} does not exist")
except PermissionError:
logger.error(f"You are not allowed to delete tmp file {html_file_path}")
logger.info(f"Convert {html_file_path} to html success")
return txt_content