diff --git a/deployment/helm/datamate/charts/ray-cluster/values.yaml b/deployment/helm/datamate/charts/ray-cluster/values.yaml index 9e7c650..0a57504 100644 --- a/deployment/helm/datamate/charts/ray-cluster/values.yaml +++ b/deployment/helm/datamate/charts/ray-cluster/values.yaml @@ -58,25 +58,10 @@ head: # in the headGroupSpec. See https://github.com/ray-project/kuberay/pull/1128 for more details. serviceAccountName: "" restartPolicy: "" - rayStartParams: - object-store-memory: '78643200' + rayStartParams: {} # containerEnv specifies environment variables for the Ray container, # Follows standard K8s container env schema. - containerEnv: - - name: RAY_DEDUP_LOGS - value: "0" - - name: RAY_TQDM_PATCH_PRINT - value: "0" - - name: MYSQL_HOST - value: "datamate-database" - - name: MYSQL_PORT - value: "3306" - - name: MYSQL_USER - value: "root" - - name: MYSQL_PASSWORD - value: "password" - - name: MYSQL_DATABASE - value: "datamate" + containerEnv: [] # - name: EXAMPLE_ENV # value: "1" envFrom: [] @@ -93,14 +78,7 @@ head: # It is usually best to set requests equal to limits. # See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources # for further guidance. - resources: - limits: - cpu: "2" - # To avoid out-of-memory issues, never allocate less than 2G memory for the Ray head. - memory: "8G" - requests: - cpu: "1" - memory: "2G" + resources: {} annotations: {} nodeSelector: {} tolerations: [] @@ -156,21 +134,7 @@ worker: initContainers: [] # containerEnv specifies environment variables for the Ray container, # Follows standard K8s container env schema. - containerEnv: - - name: RAY_DEDUP_LOGS - value: "0" - - name: RAY_TQDM_PATCH_PRINT - value: "0" - - name: MYSQL_HOST - value: "datamate-database" - - name: MYSQL_PORT - value: "3306" - - name: MYSQL_USER - value: "root" - - name: MYSQL_PASSWORD - value: "password" - - name: MYSQL_DATABASE - value: "datamate" + containerEnv: [] # - name: EXAMPLE_ENV # value: "1" envFrom: [] @@ -187,13 +151,7 @@ worker: # It is usually best to set requests equal to limits. # See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources # for further guidance. - resources: - limits: - cpu: "4" - memory: "8G" - requests: - cpu: "1" - memory: "1G" + resources: {} annotations: {} nodeSelector: {} tolerations: [] diff --git a/deployment/helm/datamate/values.yaml b/deployment/helm/datamate/values.yaml index c0fcfd6..7f913aa 100644 --- a/deployment/helm/datamate/values.yaml +++ b/deployment/helm/datamate/values.yaml @@ -77,6 +77,10 @@ database: subPath: database backend: + securityContext: + capabilities: + add: + - SYS_ADMIN env: - name: DB_PASSWORD value: *dbPass @@ -170,6 +174,31 @@ runtime: ray-cluster: enabled: true head: + rayStartParams: + object-store-memory: '78643200' + num-cpus: '0' + containerEnv: + - name: RAY_DEDUP_LOGS + value: "0" + - name: RAY_TQDM_PATCH_PRINT + value: "0" + - name: MYSQL_HOST + value: "datamate-database" + - name: MYSQL_PORT + value: "3306" + - name: MYSQL_USER + value: "root" + - name: MYSQL_PASSWORD + value: *dbPass + - name: MYSQL_DATABASE + value: "datamate" + resources: + limits: + cpu: "2" + memory: "8G" + requests: + cpu: "1" + memory: "2G" volumes: - *datasetVolume - *flowVolume @@ -196,6 +225,28 @@ ray-cluster: - containerPort: 8081 volumeMounts: *runtimeVolumeMounts worker: + containerEnv: + - name: RAY_DEDUP_LOGS + value: "0" + - name: RAY_TQDM_PATCH_PRINT + value: "0" + - name: MYSQL_HOST + value: "datamate-database" + - name: MYSQL_PORT + value: "3306" + - name: MYSQL_USER + value: "root" + - name: MYSQL_PASSWORD + value: *dbPass + - name: MYSQL_DATABASE + value: "datamate" + resources: + limits: + cpu: "8" + memory: "64G" + requests: + cpu: "1" + memory: "2G" volumes: - *datasetVolume - *flowVolume diff --git a/runtime/python-executor/datamate/core/base_op.py b/runtime/python-executor/datamate/core/base_op.py index 6a2c592..5b5e5c6 100644 --- a/runtime/python-executor/datamate/core/base_op.py +++ b/runtime/python-executor/datamate/core/base_op.py @@ -146,10 +146,10 @@ class BaseOp: def read_file(self, sample): filepath = sample[self.filepath_key] filetype = sample[self.filetype_key] - if filetype in ["ppt", "pptx", "docx", "doc", "xlsx"]: + if filetype in ["ppt", "pptx", "docx", "doc", "xlsx", "csv", "md", "pdf"]: elements = partition(filename=filepath) sample[self.text_key] = "\n\n".join([str(el) for el in elements]) - elif filetype in ["txt", "md", "markdown", "xml", "html", "csv", "json", "jsonl"]: + elif filetype in ["txt", "md", "markdown", "xml", "html", "json", "jsonl"]: with open(filepath, 'rb') as f: content = f.read() sample[self.text_key] = content.decode("utf-8-sig").replace("\r\n", "\n") diff --git a/runtime/python-executor/pyproject.toml b/runtime/python-executor/pyproject.toml index acffbd9..d84229d 100644 --- a/runtime/python-executor/pyproject.toml +++ b/runtime/python-executor/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "loguru>=0.7.3", "opencv-python-headless>=4.12.0.88", "ray[data,default]==2.52.1", - "unstructured[csv,docx,pptx,xlsx]==0.18.15", + "unstructured[csv,docx,pptx,xlsx,pdf,md]==0.18.15", "uvicorn[standard]>=0.38.0", ] diff --git a/scripts/images/runtime/Dockerfile b/scripts/images/runtime/Dockerfile index 653d016..1794d0c 100644 --- a/scripts/images/runtime/Dockerfile +++ b/scripts/images/runtime/Dockerfile @@ -21,7 +21,9 @@ WORKDIR /opt/runtime RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install -e . --system \ - && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \ + && UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \ + && uv pip uninstall torch torchvision triton --system \ + && uv pip list | grep -E '^nvidia-' | awk '{print $1}' | xargs -r uv pip uninstall --system \ && python -m spacy download zh_core_web_sm RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \