init datamate

Dallas98
2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='TextToWord',
module_path="ops.mapper.text_to_word.process")
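For reference, a registration like the one above usually just maps an operator name to a dotted import path that is resolved lazily when the operator is first needed. The sketch below only illustrates that general pattern; the Registry class and its build() method are illustrative assumptions, not datamate's actual OPERATORS implementation.

import importlib

class Registry:
    """Minimal name -> module-path registry (illustrative sketch, not datamate's API)."""

    def __init__(self):
        self._modules = {}

    def register_module(self, module_name: str, module_path: str) -> None:
        # record where the operator lives; nothing is imported yet
        self._modules[module_name] = module_path

    def build(self, module_name: str):
        # resolve the dotted path lazily and return the class named after the operator
        module = importlib.import_module(self._modules[module_name])
        return getattr(module, module_name)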

View File

@@ -0,0 +1,16 @@
name: '转换为Word'
name_en: 'Convert-to-Word'
description: '将抽取结果转换为docx的word文件。'
description_en: 'Converts extraction results to Word files in DOCX format.'
language: 'python'
vendor: 'huawei'
raw_id: 'TextToWord'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,328 @@
# -*- coding: utf-8 -*-
"""
Description: TextToWord mapper that converts extracted text (including inline HTML tables) into a DOCX byte stream.
Create: 2025/01/18
"""
import random
import string
import time
from io import BytesIO
from typing import Any, Dict, List, Tuple, Union
import bs4
import docx.table
import numpy as np
import pandas as pd
from docx import Document
from loguru import logger
from datamate.core.base_op import Mapper
class TextToWord(Mapper):
def __init__(self, *args, **kwargs):
# Randomly generate two strings longer than 10 characters; they mark cells that take part in merging
super().__init__(*args, **kwargs)
self.delete = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(15))
self.abundant = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(15)) + "VVV"
# dictionary recording cell-merge information
self.diagonal_merge = dict()
@staticmethod
def _to_clean_paragraphs(table: docx.table.Table, position: tuple) -> None:
"""删除单元格内多余的换行符"""
clearn_paragraphs = []
paragraphs = table.cell(position[0], position[1]).text
for paragraph in paragraphs:
clearn_paragraph = paragraph.replace('\n', " ")
clearn_paragraphs.append(clearn_paragraph)
table.cell(position[0], position[1]).text = clearn_paragraphs
@staticmethod
def _needs_merge(soup: bs4.BeautifulSoup) -> bool:
"""查看是否有合并单元格"""
all_td = soup.select('tr td')
for td in all_td:
if td.has_attr('colspan') or td.has_attr('rowspan'):  # a colspan/rowspan attribute means merged cells exist, return True immediately
return True
return False
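# Example: a single <td rowspan="2"> is enough for _needs_merge() to return True, e.g.
#   soup = bs4.BeautifulSoup('<table><tr><td rowspan="2">A</td><td>B</td></tr>'
#                            '<tr><td>C</td></tr></table>', 'html.parser')
#   TextToWord._needs_merge(soup)  # -> True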
@staticmethod
def _find_html_tds(html_tr: bs4.element.Tag) -> bs4.element.ResultSet:
"""判断bs4.element.Tag, 如果以<td>开头需要特殊处理"""
if str(html_tr).startswith("<td") and str(html_tr).endswith("</td>"):
html_tds = [html_tr]
else:
html_tds = html_tr.find_all('td')
return html_tds
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
"""将文本信息转换为docx文件流"""
start = time.time()
sample[self.data_key] = self._txt_to_docx(sample[self.text_key])  # convert the text into a DOCX byte stream
sample[self.text_key] = ""
sample["target_type"] = "docx"
logger.info(f"fileName: {sample[self.filename_key]}, method: TextToWord costs {time.time() - start:.6f} s")
return sample
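# Note: execute() reads sample[self.text_key], stores the DOCX byte stream in
# sample[self.data_key], clears the text and sets sample["target_type"] = "docx".
# The concrete key names (e.g. "text", "data", "fileName") are defined by the Mapper
# base class and are assumed here, since they are not shown in this commit.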
def read_html_with_merge_cell(self, html_table: bs4.BeautifulSoup) -> pd.DataFrame:
"""阅读html文档并根据合并单元格特性进行表内文字去重预处理,以及计算df的行数与列数"""
html_trs = html_table.find_all('tr')
# 计算html字段的行数
row_count = len(html_trs)
# 计算html字段的列数
col_count = 0
for row in html_trs:
# 处理html字段可能会生成td开头的html文段,需特殊处理
html_tds = self._find_html_tds(row)
cur_col = sum([int(html_td['colspan']) if html_td.has_attr('colspan') else 1 for html_td in html_tds])
col_count = max(col_count, cur_col)
return self.cell_preprocess(row_count, col_count, html_trs) # 返回对合并单元格特殊标识后的数据模板
def mark_span_cell_diagonal(self, parameters: List[int], row_count: int, df: pd.DataFrame) -> pd.DataFrame:
"""将同时具有rowspan和colspan的单元格优先存入字典中,并对需合并单元格进行特殊标注"""
row, rowspan, col, colspan = parameters
# 需要动态删减超过行数的rowspan
min_row = rowspan - 1
for i in range(row, rowspan):
if i >= row_count:
min_row = min(min_row, i - 1) # 最大rowspan数不能超过模板行数,否则不做后续处理
break
for j in range(col, colspan):
if i == row and j == col:
continue
df.iloc[i, j] = self.delete # 对需被合并单元格进行特殊标注
self.diagonal_merge[(row, col)] = (min_row, colspan - 1) # 将当前单元格以及对应的合并目标单元格存入字典
return df
def mark_span_cell_single(self, param: List[int], count: int, df: pd.DataFrame, is_rowspan: bool) -> pd.DataFrame:
"""将具有rowspan或colspan的单元格优先存入字典中,并对需合并单元格进行特殊标注"""
row, col, span = param
span_direction = row if is_rowspan else col # 判断span方向
min_span = span - 1 # 需要动态删减超过行数的rowspan或colspan
for i in range(span_direction + 1, span):
# 如果索引超过行数或列数,更新min_span, 结束循环处理
if i >= count:
min_span = min(min_span, i - 1)
break
# 根据rowspan或colspan指示判断索引位置
if is_rowspan:
df.iloc[i, col] = self.delete # 对需被合并单元格进行特殊标注
else:
df.iloc[row, i] = self.delete
# 根据rowspan或colspan指示判断索引位置
if is_rowspan:
self.diagonal_merge[(row, col)] = (min_span, col) # 将当前单元格以及对应的合并目标单元格存入字典
else:
self.diagonal_merge[(row, col)] = (row, min_span)
return df
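# Example: for a 1x3 table whose first cell carries colspan="2", _choose_span_method()
# ends up calling mark_span_cell_single([0, 0, 2], 3, df, False); that marks
# df.iloc[0, 1] with self.delete and records self.diagonal_merge[(0, 0)] = (0, 1),
# i.e. "merge cell (0, 0) through cell (0, 1)".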
def mark_abundant_cell_edge(self, param: List[int], df: pd.DataFrame) -> pd.DataFrame:
"""冗余单元格特殊标注"""
row, col, col_count = param
if col != col_count:
for mo in range(col, col_count):
df.iloc[row, mo] = self.abundant # 对列中冗余单元格进行特殊标识
return df
def cell_preprocess(self, row_count: int, col_count: int, html_trs: List[bs4.element.Tag]) -> pd.DataFrame:
"""判断单元格是否需要被合并,如果需要被合并,call self._mark_abundant_cell 特别标注此单元格"""
df = pd.DataFrame(np.zeros([row_count, col_count]), dtype=int)
# 根据网页中的表格,还原在dataframe中,有合并单元格现象的值填在第一个单元格中,其他的用特殊标识填充
for row in range(row_count):
# beautifulSoup 处理html表格时可能会生成td开头的html文段,需要特殊处理
html_tds = self._find_html_tds(html_trs[row])
# span记录td的索引,td的总数不一定等于col_count, 因为td可能包含colspan
span = 0
for col in range(col_count):
if span >= len(html_tds):
df = self.mark_abundant_cell_edge([row, col, col_count], df) # 标注潜在冗余单元格
break
# 如果框架为空或者字段为特殊标识,则不做后续处理
if pd.isnull(df.iloc[row, col]) or df.iloc[row, col] == (self.delete or self.abundant):
continue
html_td = html_tds[span] # 获取单个含有td信息的html tag形式字符串
# 根据html_td的属性,选择横竖向/横向/竖向的合并预处理
df = self._choose_span_method(html_td, [row, col, row_count, col_count], df)
# dataframe当前位置根据td文段赋值
df.iloc[row, col] = html_td.get_text(strip=True)
span += 1
return df
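# Worked example: for the HTML table
#   <tr><td rowspan="2">A</td><td>B</td></tr><tr><td>C</td></tr>
# read_html_with_merge_cell() computes row_count=2 and col_count=2, and cell_preprocess()
# returns the template [["A", "B"], [self.delete, "C"]] together with
# self.diagonal_merge == {(0, 0): (1, 0)}, so the marked cell is merged away later.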
def _choose_span_method(self, html_td: bs4.element.Tag, param: List[int], df: pd.DataFrame) -> pd.DataFrame:
"""根据信息td属性信息判断合并单元格方向"""
row, col, row_count, col_count = param
has_rowspan = html_td.has_attr('rowspan')
has_colspan = html_td.has_attr('colspan')
rowspan = int(html_td['rowspan']) if has_rowspan else 0
colspan = int(html_td['colspan']) if has_colspan else 0
# cell merged both horizontally and vertically
if has_colspan and has_rowspan:
df = self.mark_span_cell_diagonal([row, row + rowspan, col, col + colspan], row_count, df)  # mark the absorbed cells
# cell merged horizontally
elif has_colspan:
df = self.mark_span_cell_single([row, col, col + colspan], col_count, df, False)
# cell merged vertically
elif has_rowspan:
df = self.mark_span_cell_single([row, col, row + rowspan], row_count, df, True)
return df
def _eliminate_values(self, table: docx.table.Table, position: List[int]) -> None:
"""合并单元格前预处理,清除被合并单元格内的特殊标识"""
try:
merge_destiny_x, merge_destiny_y = self.diagonal_merge[position] # 获取此位置指向的目标合并单元格坐标
except KeyError as e:
logger.exception(f"Current dictionary is NOT supported: {e}")
for i in range(position[0], merge_destiny_x + 1):
for j in range(position[1], merge_destiny_y + 1):
if i == position[0] and j == position[1]: # 初始目标不做改动
continue
table.cell(i, j).text = "" # 清除特殊标识
def _merge_cell(self, table: docx.table.Table) -> None:
"""合并单元格"""
for position in self.diagonal_merge.keys():
merge_destiny_x, merge_destiny_y = self.diagonal_merge[position] # 获取初始单元格位置指向的目标合并单元格坐标
self._eliminate_values(table, position) # 删除特殊标识
# 如果合并形状不是矩形,则与前单元格合并有冲突
try:
table.cell(position[0], position[1]).merge(table.cell(merge_destiny_x, merge_destiny_y))
self._to_clean_paragraphs(table, position) # 去除多余的换行符
except docx.exceptions.InvalidSpanError as e:
logger.exception(f"Current table cell format is NOT supported: {e}")
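# Note: python-docx concatenates the content of the cells being merged into the resulting
# cell, which is why _eliminate_values() blanks the absorbed cells before _Cell.merge() runs.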
def _get_doc_table(self, dataframe: Union[pd.DataFrame, List], doc: Document, is_merge: bool) -> None:
"""
Convert a DataFrame into a table in the Word document.
Args:
dataframe : a single pd.DataFrame or a list of DataFrames
doc : the python-docx Document
is_merge : whether the DataFrame contains merged cells
"""
if isinstance(dataframe, List):  # dataframe may hold several tables
for data in dataframe:
self._get_doc_table(data, doc, is_merge)
return
rows_num, cols_num = dataframe.shape
table = doc.add_table(rows=rows_num, cols=cols_num, style="Table Grid")
for row in range(rows_num):
cells = table.rows[row].cells
for col in range(cols_num):
# is the cell np.nan?
is_np_nan = pd.isnull(dataframe.iloc[row, col])
# is the cell None?
is_none = dataframe.iloc[row, col] is None
# placeholder cells from np.zeros keep the float value 0.0
is_float_zero = dataframe.iloc[row, col] == 0.0
if not (is_np_nan or is_none or is_float_zero):
cells[col].text = str(dataframe.iloc[row, col])
else:
cells[col].text = ""
# merge only when merged cells exist; otherwise no further processing is needed
if is_merge:
self._merge_cell(table)  # handle cell merging
# handle the redundant cells of every row
self._merge_col_abundant_cell(rows_num, cols_num, table)
def _merge_col_abundant_cell(self, rows_num: int, cols_num: int, table: docx.table.Table) -> None:
"""对word表格每行的冗余单元格进行合并或者清除处理"""
for row in range(rows_num):
cells = table.rows[row].cells
# 双指针记录冗余单元格的起始与完结位置
start, finish = None, None
for col in range(cols_num - 1, -1, -1):
if cells[col].text == self.abundant and not start: # start 指针赋值,记录第一个冗余单元格位置
start = col
cells[col].text = ""
elif cells[col].text == self.abundant and start: # 冗余单元格消除特殊标识
cells[col].text = ""
elif cells[col].text != self.abundant and start: # finish指针后赋值,记录冗余单元格序列完结位置
finish = col
# 如果双指针有一端未被赋值,则不进行合并处理
if not finish or not start:
continue
# 如果合并形状不是矩形,则与前单元格合并有冲突
try:
cells[finish].merge(cells[start]) # 将冗余单元格合入相对最近的单元格
self._to_clean_paragraphs(table, (row, finish)) # 去除多余换行符
except docx.exceptions.InvalidSpanError as e:
logger.exception(f"Current table cell format is NOT supported: {e}")
# 双指针重置
start, finish = None, None
def _get_df_with_merge_info(self, html_table: bs4.BeautifulSoup, line: str) -> Tuple[Union[pd.DataFrame, List[pd.DataFrame]], bool]:
"""Choose the parsing method depending on whether the table contains merged cells."""
is_merge = True
if self._needs_merge(html_table):  # does html_table contain cells that need merging?
# build the table template containing merged cells with the custom algorithm
df = self.read_html_with_merge_cell(html_table)
else:
# no merged cells: build the DataFrame(s) with pd.read_html()
is_merge = False
df = pd.read_html(line)
return df, is_merge
def _txt_to_docx(self, text: str) -> bytes:
"""Convert a string into a DOCX byte stream."""
doc = Document()
for line in text.split("\n"):
paragraph = doc.add_paragraph()
try:
if line.startswith("<html>") and line.endswith("</html>"):  # a line wrapped in <html>...</html> is treated as a table
self.diagonal_merge = dict()  # reset the merge information for each HTML table
html_table = bs4.BeautifulSoup(line, 'html.parser')
df, is_merge = self._get_df_with_merge_info(html_table, line)  # build the DataFrame(s)
self._get_doc_table(df, doc, is_merge)  # build the Word table from the DataFrame(s)
else:  # a plain text line that contains no table
paragraph.add_run(line)
except docx.exceptions.InvalidSpanError as e:
logger.exception(f"Current table cell format is NOT supported: {e}")
except Exception as e:
logger.exception(f"Current table shape is NOT supported: {e}")
stream = BytesIO()
doc.save(stream)
return stream.getvalue()
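A minimal usage sketch for the operator as a whole. The sample key names "text", "data" and "fileName", and the argument-free constructor, are assumptions about the Mapper base class, which is not part of this commit:

op = TextToWord()  # assumes Mapper.__init__ requires no extra arguments
sample = {
    "text": 'Heading\n<html><table><tr><td rowspan="2">A</td><td>B</td></tr><tr><td>C</td></tr></table></html>',
    "fileName": "demo.txt",
}
result = op.execute(sample)  # writes the DOCX byte stream into the data field
with open("demo.docx", "wb") as f:
    f.write(result["data"])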