You've already forked DataMate
bugfix: update values.yaml to enhance ray-cluster configuration with security context, environment variables, and resource limits (#172)
* feature: unstructured支持简单pdf处理 (unstructured now supports simple PDF processing)
* feature: update values.yaml to enhance ray-cluster configuration with security context, environment variables, and resource limits
This commit is contained in:
@@ -146,10 +146,10 @@ class BaseOp:
|
||||
def read_file(self, sample):
|
||||
filepath = sample[self.filepath_key]
|
||||
filetype = sample[self.filetype_key]
|
||||
- if filetype in ["ppt", "pptx", "docx", "doc", "xlsx"]:
|
||||
+ if filetype in ["ppt", "pptx", "docx", "doc", "xlsx", "csv", "md", "pdf"]:
|
||||
elements = partition(filename=filepath)
|
||||
sample[self.text_key] = "\n\n".join([str(el) for el in elements])
|
||||
- elif filetype in ["txt", "md", "markdown", "xml", "html", "csv", "json", "jsonl"]:
|
||||
+ elif filetype in ["txt", "md", "markdown", "xml", "html", "json", "jsonl"]:
|
||||
with open(filepath, 'rb') as f:
|
||||
content = f.read()
|
||||
sample[self.text_key] = content.decode("utf-8-sig").replace("\r\n", "\n")
|
||||
|
||||
@@ -21,7 +21,7 @@ dependencies = [
|
||||
"loguru>=0.7.3",
|
||||
"opencv-python-headless>=4.12.0.88",
|
||||
"ray[data,default]==2.52.1",
|
||||
- "unstructured[csv,docx,pptx,xlsx]==0.18.15",
|
||||
+ "unstructured[csv,docx,pptx,xlsx,pdf,md]==0.18.15",
|
||||
"uvicorn[standard]>=0.38.0",
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user