Skip to content

Commit 2e4c9ce

Browse files
committed
Correct test failures on 2.0-debian10
gpu/install_gpu_driver.sh:
* Do not use the fair scheduler for 2.0 clusters
* Comment out spark-defaults.conf config options as guidance for tuning

gpu/test_gpu.py:
* Specify more tuning parameters when running the Spark job
1 parent 10570e2 commit 2e4c9ce

2 files changed

Lines changed: 23 additions & 33 deletions

File tree

gpu/install_gpu_driver.sh

Lines changed: 12 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1445,25 +1445,6 @@ function configure_yarn_resources() {
14451445
'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
14461446

14471447
set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
1448-
1449-
# Older CapacityScheduler does not permit use of gpu resources ; switch to FairScheduler on 2.0 and below
1450-
if version_lt "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
1451-
fs_xml="$HADOOP_CONF_DIR/fair-scheduler.xml"
1452-
set_hadoop_property 'yarn-site.xml' \
1453-
'yarn.resourcemanager.scheduler.class' 'org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler'
1454-
set_hadoop_property 'yarn-site.xml' \
1455-
"yarn.scheduler.fair.user-as-default-queue" "false"
1456-
set_hadoop_property 'yarn-site.xml' \
1457-
"yarn.scheduler.fair.allocation.file" "${fs_xml}"
1458-
set_hadoop_property 'yarn-site.xml' \
1459-
'yarn.scheduler.fair.resource-calculator' 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
1460-
cat > "${fs_xml}" <<EOF
1461-
<!-- ${fs_xml} -->
1462-
<allocations>
1463-
<queueMaxAppsDefault>1</queueMaxAppsDefault>
1464-
</allocations>
1465-
EOF
1466-
fi
14671448
}
14681449

14691450
# This configuration should be applied only if GPU is attached to the node
@@ -1560,6 +1541,9 @@ EOF
15601541
local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
15611542
if version_lt "${SPARK_VERSION}" "3.0" ; then return ; fi
15621543

1544+
if ! grep spark.executor.resource.gpu.discoveryScript "${spark_defaults_conf}" ; then
1545+
echo "spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}" >> "${spark_defaults_conf}"
1546+
fi
15631547
local executor_cores
15641548
executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
15651549
local executor_memory
@@ -1575,16 +1559,17 @@ EOF
15751559
# query explain output won't show GPU operator, if the user has doubts
15761560
# they can uncomment the line before seeing the GPU plan explain;
15771561
# having AQE enabled gives user the best performance.
1578-
spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
1562+
#spark.sql.autoBroadcastJoinThreshold=10m
1563+
#spark.sql.files.maxPartitionBytes=512m
15791564
spark.executor.resource.gpu.amount=${gpu_count}
1580-
spark.executor.cores=${executor_cores}
1581-
spark.executor.memory=${executor_memory_gb}G
1582-
spark.dynamicAllocation.enabled=false
1565+
#spark.executor.cores=${executor_cores}
1566+
#spark.executor.memory=${executor_memory_gb}G
1567+
#spark.dynamicAllocation.enabled=false
15831568
# please update this config according to your application
1584-
spark.task.resource.gpu.amount=${gpu_amount}
1585-
spark.task.cpus=2
1586-
spark.yarn.unmanagedAM.enabled=false
1587-
spark.plugins=com.nvidia.spark.SQLPlugin
1569+
#spark.task.resource.gpu.amount=${gpu_amount}
1570+
#spark.task.cpus=2
1571+
#spark.yarn.unmanagedAM.enabled=false
1572+
#spark.plugins=com.nvidia.spark.SQLPlugin
15881573
###### END : RAPIDS properties for Spark ${SPARK_VERSION} ######
15891574
EOF
15901575
}

gpu/test_gpu.py

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -126,12 +126,17 @@ def verify_instance_spark(self):
126126
"spark",
127127
"--jars=file:///usr/lib/spark/examples/jars/spark-examples.jar " \
128128
+ "--class=org.apache.spark.examples.ml.JavaIndexToStringExample " \
129-
+ "--properties=" \
130-
+ "spark.executor.resource.gpu.amount=1," \
131-
+ "spark.executor.cores=6," \
132-
+ "spark.executor.memory=4G," \
133-
+ "spark.task.resource.gpu.amount=0.333," \
134-
+ "spark.task.cpus=2," \
129+
+ "--properties="\
130+
+ "spark.executor.resource.gpu.amount=1,"\
131+
+ "spark.executor.cores=6,"\
132+
+ "spark.executor.memory=4G,"\
133+
+ "spark.plugins=com.nvidia.spark.SQLPlugin,"\
134+
+ "spark.executor.resource.gpu.discoveryScript=${get_gpu_resources_script},"\
135+
+ "spark.dynamicAllocation.enabled=false,"\
136+
+ "spark.sql.autoBroadcastJoinThreshold=10m,"\
137+
+ "spark.sql.files.maxPartitionBytes=512m,"\
138+
+ "spark.task.resource.gpu.amount=0.333,"\
139+
+ "spark.task.cpus=2,"\
135140
+ "spark.yarn.unmanagedAM.enabled=false"
136141
)
137142

0 commit comments

Comments (0)