From f41478c4e4fdd5e2dfd5acb7440c043fdd5057de Mon Sep 17 00:00:00 2001
From: Avi Basnet
Date: Wed, 21 Jan 2026 23:08:05 +0000
Subject: [PATCH 1/6] working

---
 spark_on_ray/Dockerfile | 88 +++++++++++++++++++++++++++++++++++++----
 spark_on_ray/main.py    | 11 +++++-
 2 files changed, 91 insertions(+), 8 deletions(-)

diff --git a/spark_on_ray/Dockerfile b/spark_on_ray/Dockerfile
index 4aa3e32..15634e6 100644
--- a/spark_on_ray/Dockerfile
+++ b/spark_on_ray/Dockerfile
@@ -1,8 +1,8 @@
 # Anyscale Container-Compatible Dockerfile
-FROM anyscale/ray:2.50.0-slim-py312-cu128
+FROM anyscale/ray:2.53.0-slim-py312-cu128
 
 # Environment variables
-ENV ANYSCALE_DISABLE_OPTIMIZED_RAY=1
+# ENV ANYSCALE_DISABLE_OPTIMIZED_RAY=1
 ENV DEBIAN_FRONTEND=noninteractive
 ENV HOME=/home/ray
 ENV PATH=/home/ray/anaconda3/bin:$PATH
@@ -15,6 +15,13 @@ ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
 # PySpark uses py4j from site-packages - RayDP needs to find it
 ENV SPARK_HOME=/home/ray/anaconda3/lib/python3.12/site-packages/pyspark
 
+# Ray JAR path for Spark/RayDP JVM communication
+ENV RAY_JARS_DIR=/home/ray/anaconda3/lib/python3.12/site-packages/ray/jars
+ENV CLASSPATH=$RAY_JARS_DIR/*:$CLASSPATH
+
+# Spark config directory
+ENV SPARK_CONF_DIR=/home/ray/.spark
+
 # Add Hadoop and Spark to PATH
 ENV PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
 ENV LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH
@@ -99,25 +106,92 @@ RUN /home/ray/anaconda3/bin/pip install --no-cache-dir py4j
 # Install raydp
 RUN /home/ray/anaconda3/bin/pip install --no-cache-dir --pre raydp
 
+# Fix: Ensure Ray's Java JAR matches the installed Ray version
+# RayDP expects ray_dist.jar in /jars/ for JVM-Ray communication
+# The Anyscale image may have this JAR but we ensure it matches the Ray version
+# CRITICAL: Also copy to PySpark's jars/ directory so py4j JVM can find it
+RUN RAY_VERSION=$(/home/ray/anaconda3/bin/python -c "import ray; print(ray.__version__)") && \
+    echo "Ensuring ray_dist.jar for Ray version: ${RAY_VERSION}" && \
+    cd /tmp && \
+    /home/ray/anaconda3/bin/pip download "ray==${RAY_VERSION}" --no-deps -d . && \
+    mkdir -p /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars && \
+    unzip -o -j ray-*.whl "ray/jars/*" -d /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/ && \
+    rm -f ray-*.whl && \
+    echo "Installed ray_dist.jar:" && \
+    ls -la /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/ && \
+    echo "Copying ray_dist.jar to PySpark jars directory..." && \
+    cp /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/*.jar $SPARK_HOME/jars/ && \
+    ls -la $SPARK_HOME/jars/ray*.jar
+
+# CRITICAL FIX: Backup ray jars to a safe location that survives pre-start script
+# The Anyscale pre-start script replaces the ray package at container startup,
+# which removes ray_dist.jar. We backup the jars and patch pre-start to restore them.
+RUN sudo mkdir -p /opt/ray-jars-backup && \
+    sudo cp -r /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/* /opt/ray-jars-backup/ && \
+    echo "Backed up ray jars to /opt/ray-jars-backup:" && \
+    ls -la /opt/ray-jars-backup/
+
+# Create a restore script that can be called to restore ray jars
+# This script can be used manually or integrated into startup
+RUN echo '#!/bin/bash' | sudo tee /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
+    echo 'RAY_JARS_BACKUP="/opt/ray-jars-backup"' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
+    echo 'RAY_SITE_PKG="${ANYSCALE_RAY_SITE_PKG_DIR:-/home/ray/anaconda3/lib/python3.12/site-packages}"' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
+    echo 'RAY_JARS_DIR="${RAY_SITE_PKG}/ray/jars"' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
+    echo 'if [[ -d "${RAY_JARS_BACKUP}" ]] && [[ -f "${RAY_JARS_BACKUP}/ray_dist.jar" ]]; then' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
+    echo ' if [[ ! -f "${RAY_JARS_DIR}/ray_dist.jar" ]]; then' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
+    echo ' echo "Restoring ray_dist.jar for RayDP/Spark-on-Ray support..."' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
+    echo ' mkdir -p "${RAY_JARS_DIR}"' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
+    echo ' cp -r "${RAY_JARS_BACKUP}"/* "${RAY_JARS_DIR}/" 2>/dev/null || sudo cp -r "${RAY_JARS_BACKUP}"/* "${RAY_JARS_DIR}/"' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
+    echo ' echo "Restored ray jars to ${RAY_JARS_DIR}"' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
+    echo ' fi' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
+    echo 'fi' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
+    sudo chmod +x /opt/ray-jars-backup/restore-ray-jars.sh
+
+# Patch the Anyscale pre-start script to restore ray_dist.jar after Ray replacement
+# We append a restore block to the end of the existing script
+RUN if [ -f /opt/anyscale/ray-prestart ]; then \
+    echo "Patching /opt/anyscale/ray-prestart to preserve ray_dist.jar..." && \
+    sudo cp /opt/anyscale/ray-prestart /opt/anyscale/ray-prestart.original && \
+    echo '' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo '# RAYDP FIX: Restore ray_dist.jar after Ray package replacement' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo 'RAY_JARS_BACKUP="/opt/ray-jars-backup"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo 'RAY_JARS_DIR="${ANYSCALE_RAY_SITE_PKG_DIR}/ray/jars"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo 'if [[ -d "${RAY_JARS_BACKUP}" ]] && [[ -f "${RAY_JARS_BACKUP}/ray_dist.jar" ]]; then' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo ' echo "Restoring ray_dist.jar for RayDP/Spark-on-Ray support..."' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo ' "${SUDO[@]}" mkdir -p "${RAY_JARS_DIR}"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo ' "${SUDO[@]}" cp -r "${RAY_JARS_BACKUP}"/* "${RAY_JARS_DIR}/"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo ' echo "Restored ray jars to ${RAY_JARS_DIR}"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo 'fi' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo "Pre-start script patched successfully."; \
+    else \
+    echo "NOTE: /opt/anyscale/ray-prestart not found at build time"; \
+    fi
+
 # Install additional Python packages for Spark/Hadoop integration
 RUN /home/ray/anaconda3/bin/pip install --no-cache-dir emoji pyarrow pandas numpy findspark
 
 # Configure bash environment (minimal - for Anyscale workspace compatibility only)
+# Also add auto-restore for ray jars as a fallback mechanism
 RUN echo 'PROMPT_COMMAND="history -a"' >> /home/ray/.bashrc && \
-    echo '[ -e ~/.workspacerc ] && source ~/.workspacerc' >> /home/ray/.bashrc
+    echo '[ -e ~/.workspacerc ] && source ~/.workspacerc' >> /home/ray/.bashrc && \
+    echo '# Auto-restore ray_dist.jar if missing (for RayDP support)' >> /home/ray/.bashrc && \
+    echo '[ -x /opt/ray-jars-backup/restore-ray-jars.sh ] && /opt/ray-jars-backup/restore-ray-jars.sh 2>/dev/null' >> /home/ray/.bashrc
 
-# Create Spark configuration for S3 access
+# Create Spark configuration for S3 access and Ray JAR classpath
 RUN mkdir -p /home/ray/.spark && \
     echo 'spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem' > /home/ray/.spark/spark-defaults.conf && \
     echo 'spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.InstanceProfileCredentialsProvider,com.amazonaws.auth.DefaultAWSCredentialsProviderChain' >> /home/ray/.spark/spark-defaults.conf && \
-    echo 'spark.jars.packages=org.apache.hadoop:hadoop-aws:3.3.6,com.amazonaws:aws-java-sdk-bundle:1.12.367' >> /home/ray/.spark/spark-defaults.conf
+    echo 'spark.jars.packages=org.apache.hadoop:hadoop-aws:3.3.6,com.amazonaws:aws-java-sdk-bundle:1.12.367' >> /home/ray/.spark/spark-defaults.conf && \
+    echo "spark.driver.extraClassPath=/home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/*" >> /home/ray/.spark/spark-defaults.conf && \
+    echo "spark.executor.extraClassPath=/home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/*" >> /home/ray/.spark/spark-defaults.conf
 
-# Verify Python packages only (Hadoop/Java verification can cause container startup issues)
+# Verify Python packages and Ray Java JAR
 RUN /home/ray/anaconda3/bin/python -c "import ray; print(f'Ray version: {ray.__version__}')" && \
     /home/ray/anaconda3/bin/python -c "import py4j; print(f'py4j version: {py4j.__version__}')" && \
     /home/ray/anaconda3/bin/python -c "import pyspark; print(f'PySpark version: {pyspark.__version__}')" && \
     /home/ray/anaconda3/bin/python -c "import raydp; print('raydp installed successfully')" && \
-    /home/ray/anaconda3/bin/python -c "import pyarrow; print('PyArrow installed successfully')"
+    /home/ray/anaconda3/bin/python -c "import pyarrow; print('PyArrow installed successfully')" && \
+    /home/ray/anaconda3/bin/python -c "import ray, os; jar_path = os.path.join(os.path.dirname(ray.__file__), 'jars', 'ray_dist.jar'); assert os.path.exists(jar_path), f'ray_dist.jar not found at {jar_path}'; print(f'ray_dist.jar found: {jar_path}')"
 
 # Set working directory
 WORKDIR /home/ray
diff --git a/spark_on_ray/main.py b/spark_on_ray/main.py
index 76494b8..d10665e 100644
--- a/spark_on_ray/main.py
+++ b/spark_on_ray/main.py
@@ -17,11 +17,20 @@
 num_executors = 128
 executor_memory = "3GB"
 
+# Get the Ray JAR path for Spark classpath
+ray_jars_dir = os.path.join(os.path.dirname(ray.__file__), "jars")
+ray_dist_jar = os.path.join(ray_jars_dir, "ray_dist.jar")
+
 spark = raydp.init_spark(
     app_name="RayDP Example",
     num_executors=num_executors,
     executor_cores=1,
-    executor_memory=executor_memory
+    executor_memory=executor_memory,
+    configs={
+        "spark.jars": ray_dist_jar,
+        "spark.driver.extraClassPath": ray_jars_dir + "/*",
+        "spark.executor.extraClassPath": ray_jars_dir + "/*",
+    }
 )
 
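
Note: [PATCH 1/6] relies on raydp.init_spark's configs parameter, which accepts a dict of
Spark properties, to put the Ray wheel's ray_dist.jar on both the driver and executor
classpaths. A minimal standalone sketch of that wiring (the app name and sizes below are
illustrative, not taken from main.py):

import os

import ray
import raydp

ray.init()

# ray_dist.jar ships inside the Ray wheel under ray/jars/.
ray_jars_dir = os.path.join(os.path.dirname(ray.__file__), "jars")
ray_dist_jar = os.path.join(ray_jars_dir, "ray_dist.jar")
assert os.path.exists(ray_dist_jar), f"missing {ray_dist_jar}"

spark = raydp.init_spark(
    app_name="classpath-check",      # illustrative values, not from main.py
    num_executors=1,
    executor_cores=1,
    executor_memory="1GB",
    configs={
        "spark.jars": ray_dist_jar,  # ship the jar to executors
        "spark.driver.extraClassPath": ray_jars_dir + "/*",
        "spark.executor.extraClassPath": ray_jars_dir + "/*",
    },
)
print(spark.range(10).count())       # smoke test: prints 10
raydp.stop_spark()
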
From 6c474fb1681f0c9a81539507908ceaf3cb0e82dd Mon Sep 17 00:00:00 2001
From: Avi Basnet
Date: Wed, 21 Jan 2026 23:52:31 +0000
Subject: [PATCH 2/6] unnecessary code

---
 spark_on_ray/main.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/spark_on_ray/main.py b/spark_on_ray/main.py
index d10665e..76494b8 100644
--- a/spark_on_ray/main.py
+++ b/spark_on_ray/main.py
@@ -17,20 +17,11 @@
 num_executors = 128
 executor_memory = "3GB"
 
-# Get the Ray JAR path for Spark classpath
-ray_jars_dir = os.path.join(os.path.dirname(ray.__file__), "jars")
-ray_dist_jar = os.path.join(ray_jars_dir, "ray_dist.jar")
-
 spark = raydp.init_spark(
     app_name="RayDP Example",
     num_executors=num_executors,
     executor_cores=1,
-    executor_memory=executor_memory,
-    configs={
-        "spark.jars": ray_dist_jar,
-        "spark.driver.extraClassPath": ray_jars_dir + "/*",
-        "spark.executor.extraClassPath": ray_jars_dir + "/*",
-    }
+    executor_memory=executor_memory
 )
 
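
Note: [PATCH 2/6] drops the explicit classpath plumbing again. The premise of the rest of
the series is that RayDP locates ray_dist.jar inside the installed Ray package on its own,
so the init_spark call can stay minimal as long as the jar is actually on disk. A hedged
pre-flight check one could run before init_spark (the helper name is made up for
illustration, not part of the repo):

import os

import ray


def ray_dist_jar_present() -> bool:
    """True if the Ray wheel's JVM bridge jar is on disk."""
    jar = os.path.join(os.path.dirname(ray.__file__), "jars", "ray_dist.jar")
    return os.path.exists(jar)


if not ray_dist_jar_present():
    raise RuntimeError("ray_dist.jar missing; RayDP's JVM side cannot start")
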
From 3b39baf2d07dc2c8ed55f41964d8fa1de9c49702 Mon Sep 17 00:00:00 2001
From: Avi Basnet
Date: Thu, 22 Jan 2026 00:09:28 +0000
Subject: [PATCH 3/6] clean

---
 spark_on_ray/Dockerfile | 67 ++++++++---------------------------------
 1 file changed, 12 insertions(+), 55 deletions(-)

diff --git a/spark_on_ray/Dockerfile b/spark_on_ray/Dockerfile
index 15634e6..59035f6 100644
--- a/spark_on_ray/Dockerfile
+++ b/spark_on_ray/Dockerfile
@@ -106,76 +106,33 @@ RUN /home/ray/anaconda3/bin/pip install --no-cache-dir py4j
 # Install raydp
 RUN /home/ray/anaconda3/bin/pip install --no-cache-dir --pre raydp
 
-# Fix: Ensure Ray's Java JAR matches the installed Ray version
-# RayDP expects ray_dist.jar in /jars/ for JVM-Ray communication
-# The Anyscale image may have this JAR but we ensure it matches the Ray version
-# CRITICAL: Also copy to PySpark's jars/ directory so py4j JVM can find it
+# Extract ray_dist.jar from Ray wheel and backup for restoration after pre-start
+# The Anyscale pre-start script replaces Ray at container startup, removing the JAR.
 RUN RAY_VERSION=$(/home/ray/anaconda3/bin/python -c "import ray; print(ray.__version__)") && \
-    echo "Ensuring ray_dist.jar for Ray version: ${RAY_VERSION}" && \
     cd /tmp && \
     /home/ray/anaconda3/bin/pip download "ray==${RAY_VERSION}" --no-deps -d . && \
     mkdir -p /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars && \
     unzip -o -j ray-*.whl "ray/jars/*" -d /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/ && \
     rm -f ray-*.whl && \
-    echo "Installed ray_dist.jar:" && \
-    ls -la /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/ && \
-    echo "Copying ray_dist.jar to PySpark jars directory..." && \
-    cp /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/*.jar $SPARK_HOME/jars/ && \
-    ls -la $SPARK_HOME/jars/ray*.jar
-
-# CRITICAL FIX: Backup ray jars to a safe location that survives pre-start script
-# The Anyscale pre-start script replaces the ray package at container startup,
-# which removes ray_dist.jar. We backup the jars and patch pre-start to restore them.
-RUN sudo mkdir -p /opt/ray-jars-backup && \
-    sudo cp -r /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/* /opt/ray-jars-backup/ && \
-    echo "Backed up ray jars to /opt/ray-jars-backup:" && \
-    ls -la /opt/ray-jars-backup/
-
-# Create a restore script that can be called to restore ray jars
-# This script can be used manually or integrated into startup
-RUN echo '#!/bin/bash' | sudo tee /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
-    echo 'RAY_JARS_BACKUP="/opt/ray-jars-backup"' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
-    echo 'RAY_SITE_PKG="${ANYSCALE_RAY_SITE_PKG_DIR:-/home/ray/anaconda3/lib/python3.12/site-packages}"' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
-    echo 'RAY_JARS_DIR="${RAY_SITE_PKG}/ray/jars"' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
-    echo 'if [[ -d "${RAY_JARS_BACKUP}" ]] && [[ -f "${RAY_JARS_BACKUP}/ray_dist.jar" ]]; then' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
-    echo ' if [[ ! -f "${RAY_JARS_DIR}/ray_dist.jar" ]]; then' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
-    echo ' echo "Restoring ray_dist.jar for RayDP/Spark-on-Ray support..."' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
-    echo ' mkdir -p "${RAY_JARS_DIR}"' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
-    echo ' cp -r "${RAY_JARS_BACKUP}"/* "${RAY_JARS_DIR}/" 2>/dev/null || sudo cp -r "${RAY_JARS_BACKUP}"/* "${RAY_JARS_DIR}/"' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
-    echo ' echo "Restored ray jars to ${RAY_JARS_DIR}"' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
-    echo ' fi' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
-    echo 'fi' | sudo tee -a /opt/ray-jars-backup/restore-ray-jars.sh > /dev/null && \
-    sudo chmod +x /opt/ray-jars-backup/restore-ray-jars.sh
-
-# Patch the Anyscale pre-start script to restore ray_dist.jar after Ray replacement
-# We append a restore block to the end of the existing script
+    sudo mkdir -p /opt/ray-jars-backup && \
+    sudo cp /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/ray_dist.jar /opt/ray-jars-backup/
+
+# Patch pre-start script to restore ray_dist.jar after Ray replacement
 RUN if [ -f /opt/anyscale/ray-prestart ]; then \
-    echo "Patching /opt/anyscale/ray-prestart to preserve ray_dist.jar..." && \
-    sudo cp /opt/anyscale/ray-prestart /opt/anyscale/ray-prestart.original && \
     echo '' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo '# RAYDP FIX: Restore ray_dist.jar after Ray package replacement' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo 'RAY_JARS_BACKUP="/opt/ray-jars-backup"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo 'RAY_JARS_DIR="${ANYSCALE_RAY_SITE_PKG_DIR}/ray/jars"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo 'if [[ -d "${RAY_JARS_BACKUP}" ]] && [[ -f "${RAY_JARS_BACKUP}/ray_dist.jar" ]]; then' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo ' echo "Restoring ray_dist.jar for RayDP/Spark-on-Ray support..."' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo ' "${SUDO[@]}" mkdir -p "${RAY_JARS_DIR}"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo ' "${SUDO[@]}" cp -r "${RAY_JARS_BACKUP}"/* "${RAY_JARS_DIR}/"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo ' echo "Restored ray jars to ${RAY_JARS_DIR}"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo 'fi' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo "Pre-start script patched successfully."; \
-    else \
-    echo "NOTE: /opt/anyscale/ray-prestart not found at build time"; \
+    echo '# RAYDP FIX: Restore ray_dist.jar' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo 'if [[ -f /opt/ray-jars-backup/ray_dist.jar ]]; then' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo ' "${SUDO[@]}" mkdir -p "${ANYSCALE_RAY_SITE_PKG_DIR}/ray/jars"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo ' "${SUDO[@]}" cp /opt/ray-jars-backup/ray_dist.jar "${ANYSCALE_RAY_SITE_PKG_DIR}/ray/jars/"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
+    echo 'fi' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null; \
     fi
 
 # Install additional Python packages for Spark/Hadoop integration
 RUN /home/ray/anaconda3/bin/pip install --no-cache-dir emoji pyarrow pandas numpy findspark
 
 # Configure bash environment (minimal - for Anyscale workspace compatibility only)
-# Also add auto-restore for ray jars as a fallback mechanism
 RUN echo 'PROMPT_COMMAND="history -a"' >> /home/ray/.bashrc && \
-    echo '[ -e ~/.workspacerc ] && source ~/.workspacerc' >> /home/ray/.bashrc && \
-    echo '# Auto-restore ray_dist.jar if missing (for RayDP support)' >> /home/ray/.bashrc && \
-    echo '[ -x /opt/ray-jars-backup/restore-ray-jars.sh ] && /opt/ray-jars-backup/restore-ray-jars.sh 2>/dev/null' >> /home/ray/.bashrc
+    echo '[ -e ~/.workspacerc ] && source ~/.workspacerc' >> /home/ray/.bashrc
 
 # Create Spark configuration for S3 access and Ray JAR classpath
 RUN mkdir -p /home/ray/.spark && \
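
Note: [PATCH 3/6] narrows the workaround to a single artifact: back up just ray_dist.jar
at build time and append one restore block to the Anyscale pre-start script. In Python
terms, the appended shell block does roughly the following (paths are the ones hard-coded
in the Dockerfile; the function itself is a sketch, not shipped code):

import os
import shutil

BACKUP = "/opt/ray-jars-backup/ray_dist.jar"


def restore_ray_dist_jar(site_pkg_dir: str) -> None:
    """Copy the backed-up jar into <site-packages>/ray/jars/ if present."""
    if os.path.exists(BACKUP):
        jars_dir = os.path.join(site_pkg_dir, "ray", "jars")
        os.makedirs(jars_dir, exist_ok=True)
        shutil.copy2(BACKUP, jars_dir)


restore_ray_dist_jar("/home/ray/anaconda3/lib/python3.12/site-packages")
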
From 82f8b75c2d9127f70fb0e38730a93e490fbfcf34 Mon Sep 17 00:00:00 2001
From: Avi Basnet
Date: Mon, 26 Jan 2026 09:02:24 +0000
Subject: [PATCH 4/6] clean

---
 spark_on_ray/Dockerfile | 39 +++++++++------------------------------
 1 file changed, 9 insertions(+), 30 deletions(-)

diff --git a/spark_on_ray/Dockerfile b/spark_on_ray/Dockerfile
index 59035f6..1aa86a4 100644
--- a/spark_on_ray/Dockerfile
+++ b/spark_on_ray/Dockerfile
@@ -15,13 +15,6 @@ ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
 # PySpark uses py4j from site-packages - RayDP needs to find it
 ENV SPARK_HOME=/home/ray/anaconda3/lib/python3.12/site-packages/pyspark
 
-# Ray JAR path for Spark/RayDP JVM communication
-ENV RAY_JARS_DIR=/home/ray/anaconda3/lib/python3.12/site-packages/ray/jars
-ENV CLASSPATH=$RAY_JARS_DIR/*:$CLASSPATH
-
-# Spark config directory
-ENV SPARK_CONF_DIR=/home/ray/.spark
-
 # Add Hadoop and Spark to PATH
 ENV PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
 ENV LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH
@@ -106,26 +99,16 @@ RUN /home/ray/anaconda3/bin/pip install --no-cache-dir py4j
 # Install raydp
 RUN /home/ray/anaconda3/bin/pip install --no-cache-dir --pre raydp
 
-# Extract ray_dist.jar from Ray wheel and backup for restoration after pre-start
+# Download ray_dist.jar from PyPI and backup for restoration after pre-start
 # The Anyscale pre-start script replaces Ray at container startup, removing the JAR.
-RUN RAY_VERSION=$(/home/ray/anaconda3/bin/python -c "import ray; print(ray.__version__)") && \
-    cd /tmp && \
-    /home/ray/anaconda3/bin/pip download "ray==${RAY_VERSION}" --no-deps -d . && \
-    mkdir -p /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars && \
-    unzip -o -j ray-*.whl "ray/jars/*" -d /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/ && \
-    rm -f ray-*.whl && \
+RUN cd /tmp && \
+    /home/ray/anaconda3/bin/pip download "ray==2.53.0" --no-deps -d . && \
     sudo mkdir -p /opt/ray-jars-backup && \
-    sudo cp /home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/ray_dist.jar /opt/ray-jars-backup/
+    sudo unzip -o -j ray-*.whl "ray/jars/ray_dist.jar" -d /opt/ray-jars-backup/ && \
+    rm -f ray-*.whl
 
 # Patch pre-start script to restore ray_dist.jar after Ray replacement
-RUN if [ -f /opt/anyscale/ray-prestart ]; then \
-    echo '' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo '# RAYDP FIX: Restore ray_dist.jar' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo 'if [[ -f /opt/ray-jars-backup/ray_dist.jar ]]; then' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo ' "${SUDO[@]}" mkdir -p "${ANYSCALE_RAY_SITE_PKG_DIR}/ray/jars"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo ' "${SUDO[@]}" cp /opt/ray-jars-backup/ray_dist.jar "${ANYSCALE_RAY_SITE_PKG_DIR}/ray/jars/"' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null && \
-    echo 'fi' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null; \
-    fi
+RUN [ -f /opt/anyscale/ray-prestart ] && printf '\n# RAYDP FIX: Restore ray_dist.jar\nif [[ -f /opt/ray-jars-backup/ray_dist.jar ]]; then\n "${SUDO[@]}" mkdir -p "${ANYSCALE_RAY_SITE_PKG_DIR}/ray/jars"\n "${SUDO[@]}" cp /opt/ray-jars-backup/ray_dist.jar "${ANYSCALE_RAY_SITE_PKG_DIR}/ray/jars/"\nfi\n' | sudo tee -a /opt/anyscale/ray-prestart > /dev/null || true
 
 # Install additional Python packages for Spark/Hadoop integration
 RUN /home/ray/anaconda3/bin/pip install --no-cache-dir emoji pyarrow pandas numpy findspark
@@ -138,17 +121,13 @@
 RUN mkdir -p /home/ray/.spark && \
     echo 'spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem' > /home/ray/.spark/spark-defaults.conf && \
     echo 'spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.InstanceProfileCredentialsProvider,com.amazonaws.auth.DefaultAWSCredentialsProviderChain' >> /home/ray/.spark/spark-defaults.conf && \
-    echo 'spark.jars.packages=org.apache.hadoop:hadoop-aws:3.3.6,com.amazonaws:aws-java-sdk-bundle:1.12.367' >> /home/ray/.spark/spark-defaults.conf && \
-    echo "spark.driver.extraClassPath=/home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/*" >> /home/ray/.spark/spark-defaults.conf && \
-    echo "spark.executor.extraClassPath=/home/ray/anaconda3/lib/python3.12/site-packages/ray/jars/*" >> /home/ray/.spark/spark-defaults.conf
+    echo 'spark.jars.packages=org.apache.hadoop:hadoop-aws:3.3.6,com.amazonaws:aws-java-sdk-bundle:1.12.367' >> /home/ray/.spark/spark-defaults.conf
 
-# Verify Python packages and Ray Java JAR
+# Verify Python packages and ray_dist.jar backup
 RUN /home/ray/anaconda3/bin/python -c "import ray; print(f'Ray version: {ray.__version__}')" && \
-    /home/ray/anaconda3/bin/python -c "import py4j; print(f'py4j version: {py4j.__version__}')" && \
     /home/ray/anaconda3/bin/python -c "import pyspark; print(f'PySpark version: {pyspark.__version__}')" && \
     /home/ray/anaconda3/bin/python -c "import raydp; print('raydp installed successfully')" && \
-    /home/ray/anaconda3/bin/python -c "import pyarrow; print('PyArrow installed successfully')" && \
-    /home/ray/anaconda3/bin/python -c "import ray, os; jar_path = os.path.join(os.path.dirname(ray.__file__), 'jars', 'ray_dist.jar'); assert os.path.exists(jar_path), f'ray_dist.jar not found at {jar_path}'; print(f'ray_dist.jar found: {jar_path}')"
+    ls -la /opt/ray-jars-backup/ray_dist.jar
 
 # Set working directory
 WORKDIR /home/ray
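
Note: [PATCH 4/6] stops touching site-packages at build time entirely. It pins the wheel
("ray==2.53.0", matching the base image tag) and pulls only ray_dist.jar out of it,
straight into the backup directory. A wheel is just a zip archive, so the `pip download`
plus `unzip` pair is equivalent to this standard-library sketch (the glob pattern and
paths mirror the Dockerfile; this script is illustrative, not part of the repo):

import glob
import zipfile

# pip download "ray==2.53.0" --no-deps -d /tmp leaves a single wheel in /tmp.
wheel = glob.glob("/tmp/ray-2.53.0-*.whl")[0]

with zipfile.ZipFile(wheel) as whl:
    data = whl.read("ray/jars/ray_dist.jar")  # wheels are plain zip files

with open("/opt/ray-jars-backup/ray_dist.jar", "wb") as dst:
    dst.write(data)
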
From ffdd92db7f42533c9000bbb4014fb9bd1137a689 Mon Sep 17 00:00:00 2001
From: Avi Basnet
Date: Mon, 26 Jan 2026 09:04:49 +0000
Subject: [PATCH 5/6] clean

---
 spark_on_ray/Dockerfile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/spark_on_ray/Dockerfile b/spark_on_ray/Dockerfile
index 1aa86a4..ab58f2f 100644
--- a/spark_on_ray/Dockerfile
+++ b/spark_on_ray/Dockerfile
@@ -117,17 +117,18 @@ RUN /home/ray/anaconda3/bin/pip install --no-cache-dir emoji pyarrow pandas nump
 RUN echo 'PROMPT_COMMAND="history -a"' >> /home/ray/.bashrc && \
     echo '[ -e ~/.workspacerc ] && source ~/.workspacerc' >> /home/ray/.bashrc
 
-# Create Spark configuration for S3 access and Ray JAR classpath
+# Create Spark configuration for S3 access
 RUN mkdir -p /home/ray/.spark && \
     echo 'spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem' > /home/ray/.spark/spark-defaults.conf && \
     echo 'spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.InstanceProfileCredentialsProvider,com.amazonaws.auth.DefaultAWSCredentialsProviderChain' >> /home/ray/.spark/spark-defaults.conf && \
     echo 'spark.jars.packages=org.apache.hadoop:hadoop-aws:3.3.6,com.amazonaws:aws-java-sdk-bundle:1.12.367' >> /home/ray/.spark/spark-defaults.conf
 
-# Verify Python packages and ray_dist.jar backup
+# Verify Python packages only (Hadoop/Java verification can cause container startup issues)
 RUN /home/ray/anaconda3/bin/python -c "import ray; print(f'Ray version: {ray.__version__}')" && \
+    /home/ray/anaconda3/bin/python -c "import py4j; print(f'py4j version: {py4j.__version__}')" && \
     /home/ray/anaconda3/bin/python -c "import pyspark; print(f'PySpark version: {pyspark.__version__}')" && \
     /home/ray/anaconda3/bin/python -c "import raydp; print('raydp installed successfully')" && \
-    ls -la /opt/ray-jars-backup/ray_dist.jar
+    /home/ray/anaconda3/bin/python -c "import pyarrow; print('PyArrow installed successfully')"
 
 # Set working directory
 WORKDIR /home/ray
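
Note: [PATCH 5/6] returns the build-time check to pure-Python imports; the jar is now only
validated implicitly at startup by the restore block. The Dockerfile's one-liners amount
to this smoke-test script (an equivalent rendering, not a file from the repo):

import py4j
import pyarrow
import pyspark
import ray
import raydp  # import success is the whole test

print(f"Ray {ray.__version__}, PySpark {pyspark.__version__}, py4j {py4j.__version__}")
print(f"PyArrow {pyarrow.__version__}, raydp installed successfully")
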
From 27eb7a46664177fb834e1f348bd8ba969d05a56f Mon Sep 17 00:00:00 2001
From: Robert Nishihara
Date: Mon, 26 Jan 2026 13:41:57 -0800
Subject: [PATCH 6/6] Update Dockerfile

---
 spark_on_ray/Dockerfile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spark_on_ray/Dockerfile b/spark_on_ray/Dockerfile
index ab58f2f..3c3da54 100644
--- a/spark_on_ray/Dockerfile
+++ b/spark_on_ray/Dockerfile
@@ -2,7 +2,6 @@
 FROM anyscale/ray:2.53.0-slim-py312-cu128
 
 # Environment variables
-# ENV ANYSCALE_DISABLE_OPTIMIZED_RAY=1
 ENV DEBIAN_FRONTEND=noninteractive
 ENV HOME=/home/ray
 ENV PATH=/home/ray/anaconda3/bin:$PATH
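
Note: the net effect of the series on spark_on_ray/main.py, reconstructed from the hunks
above. The diffs only show the middle of the file, so the import and ray.init lines here
are assumptions for completeness:

import ray
import raydp

ray.init()  # assumed; the init call is not shown in the diffs

num_executors = 128
executor_memory = "3GB"

spark = raydp.init_spark(
    app_name="RayDP Example",
    num_executors=num_executors,
    executor_cores=1,
    executor_memory=executor_memory
)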