diff --git a/.dockerignore b/.dockerignore
index 6d4c3ac8c81..6cf974f33c4 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,6 +1,10 @@
-target
-.idea
-.git
-tools/docker/Dockerfile.dev
-pipeline.yaml
-.dockerignore
+# Allowlist: only send files referenced by COPY instructions in Dockerfiles.
+# CI Dockerfile: environment.yml, project/, build.sbt, sonatype.sbt
+# Demo Dockerfile: tools/docker/demo/init_notebook.py, docs/
+*
+!environment.yml
+!project/
+!build.sbt
+!sonatype.sbt
+!tools/docker/demo/init_notebook.py
+!docs/
diff --git a/.gitignore b/.gitignore
index 195890ccf3c..6bd8781ca93 100644
--- a/.gitignore
+++ b/.gitignore
@@ -95,3 +95,4 @@ condaenv.*.requirements.txt
 # Dev-loop artifacts
 .dev-loop/
 .dev-loop-artifacts/
+pipeline.yaml.bak
diff --git a/build.sbt b/build.sbt
index 72d00806c69..d8fbee4f1b3 100644
--- a/build.sbt
+++ b/build.sbt
@@ -104,7 +104,12 @@ getDatasetsTask := {
   val f = new File(d, datasetName)
   if (!d.exists()) d.mkdirs()
   if (!f.exists()) {
-    FileUtils.copyURLToFile(datasetUrl, f)
+    val cached = new File(sys.env.getOrElse("DATASET_CACHE", "/opt/datasets"), datasetName)
+    if (cached.exists()) {
+      java.nio.file.Files.copy(cached.toPath, f.toPath)
+    } else {
+      FileUtils.copyURLToFile(datasetUrl, f)
+    }
     UnzipUtils.unzip(f, d)
   }
 }
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/core/test/base/TestBase.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/core/test/base/TestBase.scala
index a90f1ea81eb..03b18b84236 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/core/test/base/TestBase.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/core/test/base/TestBase.scala
@@ -22,7 +22,7 @@ import org.scalatest.time.{Seconds, Span}
 
 import java.io.File
 import java.nio.file.{Files, Path}
-import scala.concurrent._
+import scala.concurrent.blocking
 import scala.reflect.ClassTag
 
 trait SparkSessionManagement {
@@ -147,7 +147,24 @@ object TestBase extends SparkSessionManagement {
 
 }
 
-abstract class TestBase extends AnyFunSuite with BeforeAndAfterEachTestData with BeforeAndAfterAll {
+abstract class TestBase extends AnyFunSuite with BeforeAndAfterEachTestData with BeforeAndAfterAll with TimeLimits {
+
+  // Global per-test timeout (10 minutes). Override in subclass if needed.
+  val testTimeoutInSeconds: Int = 10 * 60
+
+  /** Registers a test whose body fails once it runs longer than `testTimeoutInSeconds`.
+    *
+    * A `ThreadSignaler` is put in implicit scope because ScalaTest's default signaler
+    * (`DoNotInterrupt`) lets a hung body run forever and only reports the timeout afterwards.
+    */
+  override def test(testName: String, testTags: Tag*)(testFun: => Any)(implicit pos: Position): Unit = {
+    super.test(testName, testTags: _*) {
+      implicit val signaler: org.scalatest.concurrent.Signaler = org.scalatest.concurrent.ThreadSignaler
+      failAfter(Span(testTimeoutInSeconds, Seconds)) {
+        testFun
+      }
+    }
+  }
 
   lazy val sparkProvider: SparkSessionManagement = TestBase
 
diff --git a/pipeline.yaml b/pipeline.yaml
index e9af40f5430..7038478a7c9 100644
--- a/pipeline.yaml
+++ b/pipeline.yaml
@@ -1,5 +1,11 @@
 resources:
-- repo: self
+  containers:
+  - container: ci
+    image: mmlsparkmcr.azurecr.io/synapseml/ci:ci-latest
+    endpoint: 'SynapseML MCR'
+  repositories:
+  - repository: self
+    type: self
 
 trigger:
   branches:
@@ -95,12 +101,83 @@ variables:
   runCoverage: $[or(eq(variables['Build.Reason'], 'PullRequest'), eq(variables['Build.SourceBranch'], 'refs/heads/master'), startsWith(variables['Build.SourceBranch'], 'refs/tags/'))]
 
 jobs:
+- job: BuildCIImage
+  displayName: 'Ensure CI Image'
+  cancelTimeoutInMinutes: 0
+  timeoutInMinutes: 60
+  pool:
+    vmImage: $(UBUNTU_VERSION)
+  steps:
+    - bash: |
+        set -e
+        echo "=== Disk space BEFORE cleanup ==="
+        df -h / | grep -E 'Filesystem|/$'
+        sudo rm -rf /usr/local/lib/android || true
+        sudo rm -rf /usr/lib/google-cloud-sdk || true
+        sudo rm -rf /usr/share/dotnet || true
+        sudo rm -rf /opt/ghc || true
+        sudo rm -rf /opt/hostedtoolcache || true
+        sudo rm -rf /usr/local/share/boost || true
+        sudo rm -rf /usr/share/swift || true
+        sudo rm -rf /usr/local/.ghcup || true
+        sudo rm -rf /usr/share/miniconda || true
+        sudo rm -rf /usr/local/share/chromium || true
+        sudo docker image prune -af || true
+        echo "=== Disk space AFTER cleanup ==="
+        df -h / | grep -E 'Filesystem|/$'
+      displayName: 'Free disk space'
+    - task: Docker@2
+      displayName: 'Login to ACR'
+      inputs:
+        command: login
+        containerRegistry: 'SynapseML MCR'
+    - bash: |
+        set -e
+        # Hash each dependency file individually (with filename) to avoid boundary-shift collisions
+        HASH=$(sha256sum environment.yml project/plugins.sbt project/build.properties build.sbt sonatype.sbt tools/docker/ci/Dockerfile | sha256sum | cut -c1-12)
+        TAG="ci-${HASH}"
+        LATEST_TAG="ci-latest"
+        REGISTRY="mmlsparkmcr.azurecr.io"
+        REPO="synapseml/ci"
+        echo "Content hash: $TAG"
+
+        # Retry manifest inspect to tolerate transient ACR failures
+        IMAGE_EXISTS=false
+        for i in 1 2 3; do
+          if docker manifest inspect "${REGISTRY}/${REPO}:${TAG}" > /dev/null 2>&1; then
+            IMAGE_EXISTS=true
+            break
+          fi
+          [ $i -lt 3 ] && echo "Manifest inspect attempt $i failed, retrying..." && sleep 5
+        done
+
+        if [ "$IMAGE_EXISTS" = "true" ]; then
+          echo "Image ${TAG} exists. Re-tagging server-side as ${LATEST_TAG}."
+          docker buildx imagetools create \
+            --tag "${REGISTRY}/${REPO}:${LATEST_TAG}" \
+            "${REGISTRY}/${REPO}:${TAG}"
+        else
+          echo "Image ${TAG} not found. Building..."
+          docker pull "${REGISTRY}/${REPO}:${LATEST_TAG}" || true
+          docker build \
+            --cache-from "${REGISTRY}/${REPO}:${LATEST_TAG}" \
+            -t "${REGISTRY}/${REPO}:${TAG}" \
+            -t "${REGISTRY}/${REPO}:${LATEST_TAG}" \
+            -f tools/docker/ci/Dockerfile .
+          docker push "${REGISTRY}/${REPO}:${TAG}"
+          docker push "${REGISTRY}/${REPO}:${LATEST_TAG}"
+        fi
+      displayName: 'Check/Build CI Image'
+
 - job: Style
+  dependsOn: BuildCIImage
   cancelTimeoutInMinutes: 0
-  condition: and(eq(variables.runTests, 'True'), eq('${{ parameters.testStyle }}', true))
+  condition: and(succeeded(), eq(variables.runTests, 'True'), eq('${{ parameters.testStyle }}', true))
   pool:
     vmImage: $(UBUNTU_VERSION)
+  container: ci
   steps:
+    - template: templates/free_disk.yml
     - task: AzureCLI@2
       displayName: 'Scala Style Check'
       inputs:
@@ -108,7 +185,6 @@ jobs:
         scriptLocation: inlineScript
         scriptType: bash
         inlineScript: 'sbt scalastyle test:scalastyle'
-    - template: templates/conda.yml
     - bash: |
         set -e
         source activate synapseml
@@ -116,14 +192,14 @@ jobs:
       displayName: 'Python Style Check'
 - ${{ if eq(parameters.publishArtifacts, true) }}:
   - job: Publish
-    condition: eq('${{ parameters.publishArtifacts }}', true)
+    dependsOn: BuildCIImage
+    condition: and(succeeded(), eq('${{ parameters.publishArtifacts }}', true))
     cancelTimeoutInMinutes: 0
     pool:
       vmImage: $(UBUNTU_VERSION)
+    container: ci
     steps:
-      #- template: templates/ivy_cache.yml
-      - template: templates/update_cli.yml
-      - template: templates/conda.yml
+      - template: templates/free_disk.yml
       - template: templates/kv.yml
       - task: MavenAuthenticate@0
         name: mavenAuthPublicPackages
@@ -139,7 +215,6 @@ jobs:
           scriptType: bash
           inlineScript: |
             set -e
-            sudo apt-get install graphviz doxygen -y
             source activate synapseml
             sbt packagePython uploadNotebooks
             sbt -DskipCodegen=true publishBlob publishDocs publishR publishPython
@@ -167,11 +242,13 @@ jobs:
 
 - job: DatabricksE2E
   displayName: 'Databricks E2E'
-  condition: eq('${{ parameters.testDatabricksE2E }}', true)
+  dependsOn: BuildCIImage
+  condition: and(succeeded(), eq('${{ parameters.testDatabricksE2E }}', true))
   timeoutInMinutes: 120
   cancelTimeoutInMinutes: 0
   pool:
     vmImage: $(UBUNTU_VERSION)
+  container: ci
   strategy:
     matrix:
       databricks-cpu-1:
@@ -195,9 +272,7 @@ jobs:
# databricks-rapids: # TEST-CLASS: "com.microsoft.azure.synapse.ml.nbtest.DatabricksRapidsTests" steps: - #- template: templates/ivy_cache.yml - - template: templates/update_cli.yml - - template: templates/conda.yml + - template: templates/free_disk.yml - template: templates/kv.yml - template: templates/publish.yml - task: AzureCLI@2 @@ -218,17 +293,19 @@ jobs: failTaskOnFailedTests: true condition: and(eq(variables.runTests, 'True'), succeededOrFailed()) +# FabricE2E runs in the CI container. The Fabric tests are Scala/JVM tests +# that call Fabric REST APIs — no special Python packages beyond the base env. - job: FabricE2E displayName: 'Fabric E2E' - condition: eq('${{ parameters.testFabricE2E }}', true) + dependsOn: BuildCIImage + condition: and(succeeded(), eq('${{ parameters.testFabricE2E }}', true)) timeoutInMinutes: 120 cancelTimeoutInMinutes: 0 pool: vmImage: $(UBUNTU_VERSION) + container: ci steps: - #- template: templates/ivy_cache.yml - - template: templates/update_cli.yml - - template: templates/conda.yml + - template: templates/free_disk.yml - template: templates/kv.yml - template: templates/fabric_kv.yml - template: templates/publish.yml @@ -426,11 +503,13 @@ jobs: condition: and(eq(variables.isMaster, true), startsWith(variables['tag'], 'v')) - job: PythonTests + dependsOn: BuildCIImage timeoutInMinutes: 120 cancelTimeoutInMinutes: 0 - condition: and(eq(variables.runTests, 'True'), eq('${{ parameters.testPython }}', true)) + condition: and(succeeded(), eq(variables.runTests, 'True'), eq('${{ parameters.testPython }}', true)) pool: vmImage: $(UBUNTU_VERSION) + container: ci strategy: matrix: core: @@ -452,9 +531,7 @@ jobs: cognitive: PACKAGE: "cognitive" steps: - #- template: templates/ivy_cache.yml - - template: templates/update_cli.yml - - template: templates/conda.yml + - template: templates/free_disk.yml - template: templates/kv.yml - task: AzureCLI@2 displayName: 'Install and package deps' @@ -466,11 +543,9 @@ jobs: inlineScript: | source 
activate synapseml if [ "$(runCoverage)" = "True" ]; then COV_CMD="coverage"; else COV_CMD=""; fi - sbt $COV_CMD getDatasets installPipPackage - sbt publishM2 + sbt $COV_CMD getDatasets "project core" installPipPackage publishM2 "project $(PACKAGE)" installPipPackage publishM2 - task: AzureCLI@2 displayName: 'Test Python Code' - retryCountOnTaskFailure: 1 timeoutInMinutes: 40 inputs: azureSubscription: 'SynapseML Build' @@ -479,7 +554,7 @@ jobs: inlineScript: | set -e source activate synapseml - export SBT_OPTS="-Xms2G -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled -Xss5M -Duser.timezone=GMT" + export SBT_OPTS="-Xmx4G -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled -Xss5M -Duser.timezone=GMT" echo "##vso[task.setvariable variable=SBT_OPTS]$SBT_OPTS" echo "SBT_OPTS=$SBT_OPTS" IGNORE_TEST_PATH_FLAG="" @@ -495,7 +570,6 @@ jobs: echo "IGNORE_TEST_PATH_FLAG=$IGNORE_TEST_PATH_FLAG" echo "TEST_SUB_PATH_FLAG=$TEST_SUB_PATH_FLAG" (sbt $IGNORE_TEST_PATH_FLAG $TEST_SUB_PATH_FLAG "project $(PACKAGE)" coverage testPython) || \ - (sbt $IGNORE_TEST_PATH_FLAG $TEST_SUB_PATH_FLAG "project $(PACKAGE)" coverage testPython) || \ (sbt $IGNORE_TEST_PATH_FLAG $TEST_SUB_PATH_FLAG "project $(PACKAGE)" coverage testPython) - task: PublishTestResults@2 displayName: 'Publish Test Results' @@ -515,11 +589,13 @@ jobs: - ${{ if or(eq(variables['Build.Reason'], 'PullRequest'), eq(variables['Build.SourceBranch'], 'refs/heads/master'), startsWith(variables['Build.SourceBranch'], 'refs/tags/')) }}: - template: templates/codecov.yml - job: RTests + dependsOn: BuildCIImage timeoutInMinutes: 60 cancelTimeoutInMinutes: 0 - condition: and(eq(variables.runTests, 'True'), eq('${{ parameters.testR }}', true)) + condition: and(succeeded(), eq(variables.runTests, 'True'), eq('${{ parameters.testR }}', true)) pool: vmImage: $(UBUNTU_VERSION) + container: ci strategy: matrix: core: @@ -535,9 +611,8 @@ jobs: cognitive: PACKAGE: "cognitive" steps: + - template: templates/free_disk.yml #- template: 
templates/ivy_cache_2.yml - - template: templates/update_cli.yml - - template: templates/conda.yml - template: templates/kv.yml - task: AzureCLI@2 displayName: 'Prepare for tests' @@ -549,18 +624,15 @@ jobs: scriptType: bash inlineScript: | set -e - export SBT_OPTS="-Xms2G -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled -Xss5M -Duser.timezone=GMT" + export SBT_OPTS="-Xmx4G -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled -Xss5M -Duser.timezone=GMT" source activate synapseml - (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) - sbt codegen - sbt publishM2 + timeout 30m sbt setup codegen publishM2 SPARK_VERSION=3.5.0 HADOOP_VERSION=3 - # wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz - wget https://mmlspark.blob.core.windows.net/installers/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz + wget -q https://mmlspark.blob.core.windows.net/installers/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz - task: AzureCLI@2 displayName: 'Test R Code' - retryCountOnTaskFailure: 3 + retryCountOnTaskFailure: 1 timeoutInMinutes: 20 inputs: azureSubscription: 'SynapseML Build' @@ -568,7 +640,7 @@ jobs: scriptType: bash inlineScript: | set -e - export SBT_OPTS="-Xms2G -Xmx4G -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=4G -Xss5M -Duser.timezone=GMT" + export SBT_OPTS="-Xmx4G -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled -Xss5M -Duser.timezone=GMT" source activate synapseml timeout 20m sbt -DskipCodegen=true "project $(PACKAGE)" coverage testR - task: PublishTestResults@2 @@ -589,23 +661,16 @@ jobs: - ${{ if or(eq(variables['Build.Reason'], 'PullRequest'), eq(variables['Build.SourceBranch'], 'refs/heads/master'), startsWith(variables['Build.SourceBranch'], 'refs/tags/')) }}: - template: templates/codecov.yml -- job: BuildAndCacheCondaEnv - cancelTimeoutInMinutes: 0 - condition: 
eq(variables.runTests, 'True') - pool: - vmImage: $(UBUNTU_VERSION) - steps: - - template: templates/conda.yml - - job: WebsiteSamplesTests + dependsOn: BuildCIImage cancelTimeoutInMinutes: 0 - condition: and(eq(variables.runTests, 'True'), eq('${{ parameters.testWebsiteSamples }}', true)) + condition: and(succeeded(), eq(variables.runTests, 'True'), eq('${{ parameters.testWebsiteSamples }}', true)) pool: vmImage: $(UBUNTU_VERSION) + container: ci steps: + - template: templates/free_disk.yml #- template: templates/ivy_cache.yml - - template: templates/update_cli.yml - - template: templates/conda.yml - template: templates/kv.yml - template: templates/publish.yml - task: AzureCLI@2 @@ -616,9 +681,10 @@ jobs: scriptLocation: inlineScript scriptType: bash inlineScript: | - (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) + export SBT_OPTS="-Xmx4G -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled -Xss2M -Duser.timezone=GMT" if [ "$(runCoverage)" = "True" ]; then COV_CMD="coverage"; else COV_CMD=""; fi - (sbt $COV_CMD testWebsiteDocs) + (timeout 30m sbt setup) || (echo "retrying" && timeout 30m sbt setup) + sbt $COV_CMD testWebsiteDocs - task: PublishTestResults@2 displayName: 'Publish Test Results' inputs: @@ -638,124 +704,166 @@ jobs: - template: templates/codecov.yml - job: UnitTests + dependsOn: BuildCIImage cancelTimeoutInMinutes: 1 - timeoutInMinutes: 80 - condition: and(eq(variables.runTests, 'True'), eq('${{ parameters.testUnit }}', true)) + timeoutInMinutes: 50 + condition: and(succeeded(), eq(variables.runTests, 'True'), eq('${{ parameters.testUnit }}', true)) pool: vmImage: $(UBUNTU_VERSION) + container: ci strategy: matrix: automl: PACKAGE: "automl" + PROJECT: "core" causal: PACKAGE: "causal" + PROJECT: "core" onnx: PACKAGE: "onnx" + PROJECT: "deepLearning" geospatial: PACKAGE: "services.geospatial" + PROJECT: "cognitive" anomaly: PACKAGE: "services.anomaly" + PROJECT: "cognitive" FLAKY: 
"true" face: PACKAGE: "services.face" + PROJECT: "cognitive" FLAKY: "true" form: PACKAGE: "services.form" + PROJECT: "cognitive" FLAKY: "true" language: PACKAGE: "services.language" + PROJECT: "cognitive" FLAKY: "true" openai: PACKAGE: "services.openai" + PROJECT: "cognitive" FLAKY: "true" aifoundry: PACKAGE: "services.aifoundry" + PROJECT: "cognitive" FLAKY: "true" search1: PACKAGE: "services.search.split1" + PROJECT: "cognitive" FFMPEG: "true" FLAKY: "true" search2: PACKAGE: "services.search.split2" + PROJECT: "cognitive" FFMPEG: "true" FLAKY: "true" speech1: PACKAGE: "services.speech" TEST_CLASSES: "com.microsoft.azure.synapse.ml.services.speech.SpeechToTextSDKSuite" + PROJECT: "cognitive" FFMPEG: "true" FLAKY: "true" speech2: PACKAGE: "services.speech" TEST_CLASSES: "com.microsoft.azure.synapse.ml.services.speech.ConversationTranscriptionSuite com.microsoft.azure.synapse.ml.services.speech.SpeechToTextSuite com.microsoft.azure.synapse.ml.services.speech.TextToSpeechSuite com.microsoft.azure.synapse.ml.services.speech.SpeakerEmotionInferenceSuite" + PROJECT: "cognitive" FFMPEG: "true" FLAKY: "true" text: PACKAGE: "services.text" + PROJECT: "cognitive" FLAKY: "true" translate: PACKAGE: "services.translate" + PROJECT: "cognitive" FLAKY: "true" vision: PACKAGE: "services.vision" + PROJECT: "cognitive" FLAKY: "true" core: PACKAGE: "core" + PROJECT: "core" explainers1: PACKAGE: "explainers.split1" + PROJECT: "core" explainers2: PACKAGE: "explainers.split2" + PROJECT: "deepLearning" explainers3: PACKAGE: "explainers.split3" + PROJECT: "deepLearning" exploratory: PACKAGE: "exploratory" + PROJECT: "core" featurize: PACKAGE: "featurize" + PROJECT: "core" image: PACKAGE: "image" + PROJECT: "core" io1: PACKAGE: "io.split1" + PROJECT: "core" FLAKY: "true" io2: PACKAGE: "io.split2" + PROJECT: "core" FLAKY: "true" isolationforest: PACKAGE: "isolationforest" + PROJECT: "core" flaky: PACKAGE: "flaky" #TODO fix flaky test so isolation is not needed + PROJECT: "core" FLAKY: 
"true" lightgbm1: PACKAGE: "lightgbm.split1" #TODO speed up LGBM Tests and remove split + PROJECT: "lightgbm" FLAKY: "true" lightgbm2: PACKAGE: "lightgbm.split2" + PROJECT: "lightgbm" FLAKY: "true" lightgbm3: PACKAGE: "lightgbm.split3" + PROJECT: "lightgbm" FLAKY: "true" lightgbm4: PACKAGE: "lightgbm.split4" + PROJECT: "lightgbm" FLAKY: "true" lightgbm5: PACKAGE: "lightgbm.split5" + PROJECT: "lightgbm" FLAKY: "true" lightgbm6: PACKAGE: "lightgbm.split6" + PROJECT: "lightgbm" FLAKY: "true" opencv: PACKAGE: "opencv" + PROJECT: "opencv" recommendation: PACKAGE: "recommendation" + PROJECT: "core" stages: PACKAGE: "stages" + PROJECT: "core" nn: PACKAGE: "nn" + PROJECT: "core" train: PACKAGE: "train" + PROJECT: "core" vw: PACKAGE: "vw" + PROJECT: "vw" steps: + - template: templates/free_disk.yml #- template: templates/ivy_cache.yml - - template: templates/update_cli.yml - task: AzureCLI@2 displayName: 'Setup repo' retryCountOnTaskFailure: 1 @@ -764,14 +872,10 @@ jobs: scriptLocation: inlineScript scriptType: bash inlineScript: | - (timeout 30s pip install requests) || (echo "retrying" && timeout 30s pip install requests) - (${FFMPEG:-false} && sudo apt-get update && \ - sudo apt-get install ffmpeg libgstreamer1.0-0 \ - gstreamer1.0-plugins-base gstreamer1.0-plugins-good gstreamer1.0-plugins-ugly -y) - (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) + export SBT_OPTS="-Xmx4G -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled -Xss2M -Duser.timezone=GMT" + sbt getDatasets "project $(PROJECT)" Test/compile - task: AzureCLI@2 displayName: 'Unit Test' - retryCountOnTaskFailure: 1 timeoutInMinutes: 90 inputs: azureSubscription: 'SynapseML Build' @@ -779,12 +883,13 @@ jobs: scriptType: bash inlineScript: | ulimit -c unlimited - export SBT_OPTS="-XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled -Xss2M -Duser.timezone=GMT" + echo "Available CPUs: $(nproc)" + export SBT_OPTS="-Xmx4G -XX:+UseConcMarkSweepGC 
-XX:+CMSClassUnloadingEnabled -Xss2M -Duser.timezone=GMT -Dscala.concurrent.context.numThreads=8 -Dscala.concurrent.context.maxThreads=8" # Only run coverage on PRs, master, or tag builds if [ "$(runCoverage)" = "True" ]; then COV_CMD="coverage"; else COV_CMD=""; fi TEST_SPEC="${TEST_CLASSES:-com.microsoft.azure.synapse.ml.$(PACKAGE).**}" - (timeout 30m sbt $COV_CMD "testOnly $TEST_SPEC") || - (${FLAKY:-false} && timeout 30m sbt $COV_CMD "testOnly $TEST_SPEC") + (timeout 30m sbt $COV_CMD "project $(PROJECT)" "testOnly $TEST_SPEC") || + (${FLAKY:-false} && timeout 30m sbt $COV_CMD "project $(PROJECT)" "testOnly $TEST_SPEC") - task: PublishTestResults@2 displayName: 'Publish Test Results' inputs: diff --git a/project/CodegenPlugin.scala b/project/CodegenPlugin.scala index 2395ec49266..a2484325665 100644 --- a/project/CodegenPlugin.scala +++ b/project/CodegenPlugin.scala @@ -268,7 +268,6 @@ object CodegenPlugin extends AutoPlugin { installPipPackage := { val packagePythonResult: Unit = packagePython.value val publishLocalResult: Unit = (publishLocal dependsOn packagePython).value - val rootPublishLocalResult: Unit = (LocalRootProject / Compile / publishLocal).value val packageDir = join(codegenDir.value, "package", "python") val wheel = findBuiltPythonWheel(packageDir, name.value) runCmd( diff --git a/templates/free_disk.yml b/templates/free_disk.yml new file mode 100644 index 00000000000..436d6061e4d --- /dev/null +++ b/templates/free_disk.yml @@ -0,0 +1,6 @@ +steps: + - script: | + sudo rm -rf /usr/local/lib/android /usr/lib/google-cloud-sdk /usr/share/dotnet /opt/ghc || true + df -h / + displayName: 'Free disk space' + target: host diff --git a/tools/docker/ci/Dockerfile b/tools/docker/ci/Dockerfile new file mode 100644 index 00000000000..b038f3dd5b1 --- /dev/null +++ b/tools/docker/ci/Dockerfile @@ -0,0 +1,111 @@ +# SynapseML CI Container Image +# Pre-bakes all build dependencies so CI jobs start with a warm environment. 
+# Rebuilt automatically by BuildCIImage when dependency files change.
+
+FROM ubuntu:22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+
+# Temurin JDK 8 + system packages in a single layer to avoid redundant apt-get update.
+# JDK 8 required — JDK 11 has different CMYK JPEG handling in ImageIO.
+# Audio libs (libasound2, libpulse0) needed by Azure Speech SDK.
+# libssl1.1 needed by Azure Speech SDK (Ubuntu 22.04 ships OpenSSL 3.0 but SDK requires 1.x).
+RUN apt-get update && apt-get install -y --no-install-recommends curl wget git ca-certificates gnupg2 \
+    && wget -qO- https://packages.adoptium.net/artifactory/api/gpg/key/public | gpg --dearmor -o /usr/share/keyrings/adoptium.gpg \
+    && echo "deb [signed-by=/usr/share/keyrings/adoptium.gpg] https://packages.adoptium.net/artifactory/deb $(. /etc/os-release && echo $VERSION_CODENAME) main" \
+       > /etc/apt/sources.list.d/adoptium.list \
+    && apt-get update && apt-get install -y --no-install-recommends \
+       temurin-8-jdk \
+       openmpi-bin libopenmpi-dev \
+       ffmpeg libgstreamer1.0-0 \
+       gstreamer1.0-plugins-base gstreamer1.0-plugins-good gstreamer1.0-plugins-ugly \
+       libasound2 libpulse0 \
+       graphviz doxygen \
+       build-essential cmake \
+       libssl-dev libffi-dev \
+       sudo \
+    && rm -rf /var/lib/apt/lists/* \
+    && wget -q https://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2.24_amd64.deb -O /tmp/libssl1.1.deb \
+    && echo "7cf39d70a639017d1dd7c8d36daa2258063608688e449fddf40ffdd46f992a78 /tmp/libssl1.1.deb" | sha256sum -c - \
+    && dpkg -i /tmp/libssl1.1.deb \
+    && rm /tmp/libssl1.1.deb
+
+ENV JAVA_HOME=/usr/lib/jvm/temurin-8-jdk-amd64
+
+# Miniconda — pinned version for reproducible builds
+# conda 24.x does not require TOS acceptance (that was added in conda 25.x)
+RUN wget -q https://repo.anaconda.com/miniconda/Miniconda3-py311_24.11.1-0-Linux-x86_64.sh -O /tmp/miniconda.sh \
+    && bash /tmp/miniconda.sh -b -p /opt/conda \
+    && rm /tmp/miniconda.sh
+ENV PATH=/opt/conda/bin:$PATH
+ENV CONDA_CACHE_DIR=/opt/conda/envs
+
+# Azure CLI (installed into base conda python, not the synapseml env)
+RUN pip install --no-cache-dir azure-cli==2.60.0
+
+# Spark (pre-downloaded for R tests)
+ENV SPARK_VERSION=3.5.0
+ENV HADOOP_VERSION=3
+RUN wget -q "https://mmlspark.blob.core.windows.net/installers/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -O /tmp/spark.tgz \
+    && tar -xzf /tmp/spark.tgz -C /opt \
+    && rm /tmp/spark.tgz
+ENV SPARK_HOME=/opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}
+ENV PATH=${SPARK_HOME}/bin:$PATH
+
+# Node.js 16 (for website deployment)
+RUN curl -fsSL https://deb.nodesource.com/setup_16.x | bash - \
+    && apt-get install -y nodejs \
+    && npm install -g yarn \
+    && rm -rf /var/lib/apt/lists/*
+
+# SBT — set COURSIER_CACHE to shared location accessible by any UID (ADO uses UID 1001)
+ENV SBT_VERSION=1.10.11
+ENV COURSIER_CACHE=/opt/.cache/coursier
+RUN wget -q "https://github.com/sbt/sbt/releases/download/v${SBT_VERSION}/sbt-${SBT_VERSION}.tgz" -O /tmp/sbt.tgz \
+    && tar -xzf /tmp/sbt.tgz -C /opt \
+    && rm /tmp/sbt.tgz \
+    && mkdir -p $COURSIER_CACHE
+ENV PATH=/opt/sbt/bin:$PATH
+
+# --- Cache boundary: layers below invalidate when dependency files change ---
+
+# Conda environment from environment.yml
+# Use PIP_NO_CACHE_DIR to save disk; CI is CPU-only so replace CUDA torch with CPU variant
+# Direct wheel URLs avoid --extra-index-url which violates CFS policy; only the CUDA uninstall is best-effort
+COPY environment.yml /tmp/environment.yml
+RUN PIP_NO_CACHE_DIR=1 conda env create -f /tmp/environment.yml \
+    && conda clean --all -y \
+    && rm /tmp/environment.yml \
+    && /opt/conda/envs/synapseml/bin/pip install --no-cache-dir --no-deps \
+       "https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl" \
+       "https://download.pytorch.org/whl/cpu/torchvision-0.16.0%2Bcpu-cp311-cp311-linux_x86_64.whl" \
+    && (/opt/conda/envs/synapseml/bin/pip uninstall -y triton \
+       nvidia-cublas-cu12 nvidia-cuda-cupti-cu12 nvidia-cuda-nvrtc-cu12 \
+       nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12 nvidia-cufft-cu12 \
+       nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12 \
+       nvidia-nccl-cu12 nvidia-nvjitlink-cu12 nvidia-nvtx-cu12 2>/dev/null || true) \
+    && chmod -R 755 /opt/conda/envs \
+    && chmod -R 777 /opt/conda/envs/synapseml/lib/R/library
+
+# Pre-fetch SBT plugins, all project dependency JARs, and compiler-bridge.
+# Copy the full project/ dir and build files so SBT can resolve the complete
+# dependency graph. This eliminates ~286 JAR downloads + compiler-bridge
+# compilation (~2-3 min) from every test job. Warm-up is best-effort; the cache stays writable for the ADO UID.
+COPY project/ /tmp/sbt-warmup/project/
+COPY build.sbt /tmp/sbt-warmup/build.sbt
+COPY sonatype.sbt /tmp/sbt-warmup/sonatype.sbt
+RUN cd /tmp/sbt-warmup \
+    && (sbt --batch -Dsbt.supershell=false "update; Test/update" || true) \
+    && rm -rf /tmp/sbt-warmup /tmp/.sbt \
+    && chmod -R a+rwX "$COURSIER_CACHE"
+
+# Pre-download test datasets (static tarball, ~50MB) to avoid downloading in every job
+ENV DATASET_CACHE=/opt/datasets
+RUN mkdir -p $DATASET_CACHE \
+    && wget -q "https://mmlspark.blob.core.windows.net/installers/datasets-2023-04-03.tgz" \
+       -O "$DATASET_CACHE/datasets-2023-04-03.tgz"
+
+# No ENTRYPOINT — ADO agent needs to control the process
+CMD ["bash"]