bugfix: update values.yaml to enhance ray-cluster configuration with security context, environment variables, and resource limits (#172)

* feature: unstructured支持简单pdf处理 (support simple PDF processing via unstructured)

* feature: update values.yaml to enhance ray-cluster configuration with security context, environment variables, and resource limits
This commit is contained in:
hhhhsc701
2025-12-17 10:41:13 +08:00
committed by GitHub
parent 082aca1597
commit 62b91b6deb
5 changed files with 62 additions and 51 deletions

View File

@@ -58,25 +58,10 @@ head:
# in the headGroupSpec. See https://github.com/ray-project/kuberay/pull/1128 for more details.
serviceAccountName: ""
restartPolicy: ""
rayStartParams:
object-store-memory: '78643200'
rayStartParams: {}
# containerEnv specifies environment variables for the Ray container.
# It follows the standard K8s container env schema.
containerEnv:
- name: RAY_DEDUP_LOGS
value: "0"
- name: RAY_TQDM_PATCH_PRINT
value: "0"
- name: MYSQL_HOST
value: "datamate-database"
- name: MYSQL_PORT
value: "3306"
- name: MYSQL_USER
value: "root"
- name: MYSQL_PASSWORD
value: "password"
- name: MYSQL_DATABASE
value: "datamate"
containerEnv: []
# - name: EXAMPLE_ENV
# value: "1"
envFrom: []
@@ -93,14 +78,7 @@ head:
# It is usually best to set requests equal to limits.
# See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources
# for further guidance.
resources:
limits:
cpu: "2"
# To avoid out-of-memory issues, never allocate less than 2G memory for the Ray head.
memory: "8G"
requests:
cpu: "1"
memory: "2G"
resources: {}
annotations: {}
nodeSelector: {}
tolerations: []
@@ -156,21 +134,7 @@ worker:
initContainers: []
# containerEnv specifies environment variables for the Ray container.
# It follows the standard K8s container env schema.
containerEnv:
- name: RAY_DEDUP_LOGS
value: "0"
- name: RAY_TQDM_PATCH_PRINT
value: "0"
- name: MYSQL_HOST
value: "datamate-database"
- name: MYSQL_PORT
value: "3306"
- name: MYSQL_USER
value: "root"
- name: MYSQL_PASSWORD
value: "password"
- name: MYSQL_DATABASE
value: "datamate"
containerEnv: []
# - name: EXAMPLE_ENV
# value: "1"
envFrom: []
@@ -187,13 +151,7 @@ worker:
# It is usually best to set requests equal to limits.
# See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources
# for further guidance.
resources:
limits:
cpu: "4"
memory: "8G"
requests:
cpu: "1"
memory: "1G"
resources: {}
annotations: {}
nodeSelector: {}
tolerations: []

View File

@@ -77,6 +77,10 @@ database:
subPath: database
backend:
securityContext:
capabilities:
add:
- SYS_ADMIN
env:
- name: DB_PASSWORD
value: *dbPass
@@ -170,6 +174,31 @@ runtime:
ray-cluster:
enabled: true
head:
rayStartParams:
object-store-memory: '78643200'
num-cpus: '0'
containerEnv:
- name: RAY_DEDUP_LOGS
value: "0"
- name: RAY_TQDM_PATCH_PRINT
value: "0"
- name: MYSQL_HOST
value: "datamate-database"
- name: MYSQL_PORT
value: "3306"
- name: MYSQL_USER
value: "root"
- name: MYSQL_PASSWORD
value: *dbPass
- name: MYSQL_DATABASE
value: "datamate"
resources:
limits:
cpu: "2"
memory: "8G"
requests:
cpu: "1"
memory: "2G"
volumes:
- *datasetVolume
- *flowVolume
@@ -196,6 +225,28 @@ ray-cluster:
- containerPort: 8081
volumeMounts: *runtimeVolumeMounts
worker:
containerEnv:
- name: RAY_DEDUP_LOGS
value: "0"
- name: RAY_TQDM_PATCH_PRINT
value: "0"
- name: MYSQL_HOST
value: "datamate-database"
- name: MYSQL_PORT
value: "3306"
- name: MYSQL_USER
value: "root"
- name: MYSQL_PASSWORD
value: *dbPass
- name: MYSQL_DATABASE
value: "datamate"
resources:
limits:
cpu: "8"
memory: "64G"
requests:
cpu: "1"
memory: "2G"
volumes:
- *datasetVolume
- *flowVolume

View File

@@ -146,10 +146,10 @@ class BaseOp:
def read_file(self, sample):
filepath = sample[self.filepath_key]
filetype = sample[self.filetype_key]
if filetype in ["ppt", "pptx", "docx", "doc", "xlsx"]:
if filetype in ["ppt", "pptx", "docx", "doc", "xlsx", "csv", "md", "pdf"]:
elements = partition(filename=filepath)
sample[self.text_key] = "\n\n".join([str(el) for el in elements])
elif filetype in ["txt", "md", "markdown", "xml", "html", "csv", "json", "jsonl"]:
elif filetype in ["txt", "md", "markdown", "xml", "html", "json", "jsonl"]:
with open(filepath, 'rb') as f:
content = f.read()
sample[self.text_key] = content.decode("utf-8-sig").replace("\r\n", "\n")

View File

@@ -21,7 +21,7 @@ dependencies = [
"loguru>=0.7.3",
"opencv-python-headless>=4.12.0.88",
"ray[data,default]==2.52.1",
"unstructured[csv,docx,pptx,xlsx]==0.18.15",
"unstructured[csv,docx,pptx,xlsx,pdf,md]==0.18.15",
"uvicorn[standard]>=0.38.0",
]

View File

@@ -21,7 +21,9 @@ WORKDIR /opt/runtime
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e . --system \
&& uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
&& UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
&& uv pip uninstall torch torchvision triton --system \
&& uv pip list | grep -E '^nvidia-' | awk '{print $1}' | xargs -r uv pip uninstall --system \
&& python -m spacy download zh_core_web_sm
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \