diff --git a/Tests/acc_nested_internal_procedure.F90 b/Tests/acc_nested_internal_procedure.F90 new file mode 100644 index 0000000..399bae5 --- /dev/null +++ b/Tests/acc_nested_internal_procedure.F90 @@ -0,0 +1,195 @@ +! acc_nested_internal_procedure.F90 +! +! Feature under test (OpenACC 3.4; clarified procedure-boundary analysis): +! - Clarified analysis of implicit data attributes and parallelism across +! boundaries of procedures that can appear within other procedures, +! including Fortran internal procedures (nested subprograms). +! +! Notes +! - T1 (internal procedure called from parallel loop): Enters an OpenACC +! data region, then executes a parallel loop that calls a contained +! subroutine which references host-associated variables (outer-scope +! arrays/scalars). The contained subroutine is marked device-callable +! via '!$acc routine seq'. Exercises implicit data analysis across an +! internal-procedure boundary. +! - T2 (acc loop inside internal procedure): Enters an OpenACC 'parallel' +! region and calls a contained subroutine that contains an '!$acc loop' +! directive. Exercises loop/parallelism association and “orphaned loop” +! handling across an internal-procedure boundary. + +#ifndef T1 +!T1:runtime,data,implicit-data,procedure-boundary,fortran-internal-procedure,construct-independent,V:3.4- + LOGICAL FUNCTION test1() + USE OPENACC + IMPLICIT NONE + INCLUDE "acc_testsuite.Fh" + + INTEGER :: i + INTEGER :: errors + REAL(8), DIMENSION(LOOPCOUNT) :: a, b, c + REAL(8), DIMENSION(LOOPCOUNT) :: c_ref + REAL(8) :: alpha + + errors = 0 + + SEEDDIM(1) = 1 +# ifdef SEED + SEEDDIM(1) = SEED +# endif + CALL RANDOM_SEED(PUT=SEEDDIM) + CALL RANDOM_NUMBER(a) + CALL RANDOM_NUMBER(b) + c = 0.0D0 + + alpha = 3.0D0 + + ! Reference result on host + DO i = 1, LOOPCOUNT + c_ref(i) = a(i) + b(i) + alpha + END DO + + !$acc data copyin(a(1:LOOPCOUNT), b(1:LOOPCOUNT)) copy(c(1:LOOPCOUNT)) + ! Call an INTERNAL procedure from inside a parallel loop. + ! This stresses that data/attribute analysis remains correct across + ! the internal-procedure boundary, including host-associated scalars. + !$acc parallel loop default(present) firstprivate(alpha) + DO i = 1, LOOPCOUNT + CALL do_point(i) + END DO + !$acc end parallel loop + !$acc end data + + DO i = 1, LOOPCOUNT + IF (ABS(c(i) - c_ref(i)) .GT. PRECISION) THEN + errors = errors + 1 + END IF + END DO + + IF (errors .EQ. 0) THEN + test1 = .FALSE. + ELSE + test1 = .TRUE. + END IF + + RETURN + + CONTAINS + + SUBROUTINE do_point(idx) + IMPLICIT NONE + INTEGER, INTENT(IN) :: idx + ! Mark internal procedure as device-callable. + !$acc routine seq + c(idx) = a(idx) + b(idx) + alpha + END SUBROUTINE do_point + + END FUNCTION +#endif + +#ifndef T2 +!T2:runtime,data,parallelism,procedure-boundary,fortran-internal-procedure,construct-independent,V:3.4- + LOGICAL FUNCTION test2() + USE OPENACC + IMPLICIT NONE + INCLUDE "acc_testsuite.Fh" + + INTEGER :: i + INTEGER :: errors + REAL(8), DIMENSION(LOOPCOUNT) :: a, b, c + REAL(8), DIMENSION(LOOPCOUNT) :: c_ref + REAL(8) :: beta + + errors = 0 + + SEEDDIM(1) = 1 +# ifdef SEED + SEEDDIM(1) = SEED +# endif + CALL RANDOM_SEED(PUT=SEEDDIM) + CALL RANDOM_NUMBER(a) + CALL RANDOM_NUMBER(b) + c = 0.0D0 + + beta = 1.5D0 + + ! Reference result on host + DO i = 1, LOOPCOUNT + c_ref(i) = beta * a(i) - b(i) + END DO + + !$acc data copyin(a(1:LOOPCOUNT), b(1:LOOPCOUNT)) copy(c(1:LOOPCOUNT)) + ! Enter a parallel region, then run an internal procedure that + ! contains the ACC LOOP directive (parallelism/loop context crossing + ! an internal-procedure boundary). + !$acc parallel default(present) firstprivate(beta) + CALL do_loop() + !$acc end parallel + !$acc end data + + DO i = 1, LOOPCOUNT + IF (ABS(c(i) - c_ref(i)) .GT. PRECISION) THEN + errors = errors + 1 + END IF + END DO + + IF (errors .EQ. 0) THEN + test2 = .FALSE. + ELSE + test2 = .TRUE. + END IF + + RETURN + + CONTAINS + + SUBROUTINE do_loop() + IMPLICIT NONE + INTEGER :: j + !$acc routine seq + !$acc loop + DO j = 1, LOOPCOUNT + c(j) = beta * a(j) - b(j) + END DO + END SUBROUTINE do_loop + + END FUNCTION +#endif + + PROGRAM main + IMPLICIT NONE + INTEGER :: failcode, testrun + LOGICAL :: failed + INCLUDE "acc_testsuite.Fh" + +#ifndef T1 + LOGICAL :: test1 +#endif +#ifndef T2 + LOGICAL :: test2 +#endif + + failcode = 0 + failed = .FALSE. + +#ifndef T1 + DO testrun = 1, NUM_TEST_CALLS + failed = failed .OR. test1() + END DO + IF (failed) THEN + failcode = failcode + 2 ** 0 + failed = .FALSE. + END IF +#endif + +#ifndef T2 + DO testrun = 1, NUM_TEST_CALLS + failed = failed .OR. test2() + END DO + IF (failed) THEN + failcode = failcode + 2 ** 1 + failed = .FALSE. + END IF +#endif + + CALL EXIT(failcode) + END PROGRAM diff --git a/Tests/acc_nested_internal_procedure.cpp b/Tests/acc_nested_internal_procedure.cpp new file mode 100644 index 0000000..6ed66c2 --- /dev/null +++ b/Tests/acc_nested_internal_procedure.cpp @@ -0,0 +1,201 @@ +// acc_nested_procedure_boundaries.cpp +// +// Feature under test (OpenACC 3.4; clarified procedure-boundary analysis): +// - Clarified analysis of implicit data attributes and parallelism across +// boundaries of procedures that can appear within other procedures +// (e.g., C++ lambdas, C++ class member functions). +// +// Notes: +// - T1 (C++ lambda in parallel loop): Calls a lambda from within an +// OpenACC 'parallel loop' region while relying on the surrounding data +// region + default(present) and a captured scalar. Exercises implicit +// data attribute analysis for variables referenced through a lambda. +// - T2 (C++ member function in parallel loop): Calls a class member +// function from within an OpenACC 'parallel loop' region using a +// firstprivate object containing pointer members and a scalar member. +// Exercises procedure-boundary analysis for class/struct objects and +// pointer members used on the device. +// - T3 (acc loop inside a lambda): Places an OpenACC 'loop' directive +// inside a lambda body and invokes it within an OpenACC 'parallel' +// region. Exercises loop/parallelism assoc + +#include "acc_testsuite.h" + +#include +#include + +#ifndef T1 +//T1:runtime,data,implicit-data,procedure-boundary,cxx-lambda,construct-independent,V:3.4- +int test1() { + int err = 0; + srand(SEED); + + real_t *a = (real_t*)malloc(n * sizeof(real_t)); + real_t *b = (real_t*)malloc(n * sizeof(real_t)); + real_t *c = (real_t*)malloc(n * sizeof(real_t)); + if (!a || !b || !c) return 1; + + for (int i = 0; i < n; ++i) { + a[i] = rand() / (real_t)(RAND_MAX / 10); + b[i] = rand() / (real_t)(RAND_MAX / 10); + c[i] = 0; + } + + // Capture-by-value scalar should behave like firstprivate in device code. + const real_t scale = (real_t)3.0; + + #pragma acc data copyin(a[0:n], b[0:n]) copy(c[0:n]) + { + // Lambda defined inside the region; called from within parallel loop. + auto op = [=](int i) { + // Uses a, b, c from the surrounding scope; compiler must analyze + // those references correctly across the lambda boundary. + c[i] = a[i] + b[i] + scale; + }; + + #pragma acc parallel loop default(present) + for (int i = 0; i < n; ++i) { + op(i); + } + } + + for (int i = 0; i < n; ++i) { + real_t expect = a[i] + b[i] + scale; + if (fabs(c[i] - expect) > PRECISION) err++; + } + + free(a); + free(b); + free(c); + return err; +} +#endif + +#ifndef T2 +//T2:runtime,data,implicit-data,procedure-boundary,cxx-member-function,construct-independent,V:3.4- +int test2() { + int err = 0; + srand(SEED); + + real_t *a = (real_t*)malloc(n * sizeof(real_t)); + real_t *b = (real_t*)malloc(n * sizeof(real_t)); + real_t *c = (real_t*)malloc(n * sizeof(real_t)); + if (!a || !b || !c) return 1; + + for (int i = 0; i < n; ++i) { + a[i] = rand() / (real_t)(RAND_MAX / 10); + b[i] = rand() / (real_t)(RAND_MAX / 10); + c[i] = 0; + } + + struct Worker { + real_t *a; + real_t *b; + real_t *c; + real_t bias; // scalar member should transfer with firstprivate-like behavior + + // Member function called on device; references through member pointers. + void work(int i) const { + c[i] = a[i] * bias + b[i]; + } + }; + + Worker w; + w.a = a; w.b = b; w.c = c; + w.bias = (real_t)2.0; + + #pragma acc data copyin(a[0:n], b[0:n]) copy(c[0:n]) + { + // w is used inside device region; it should be handled as firstprivate + // and its pointer members should refer to present device data. + #pragma acc parallel loop default(present) firstprivate(w) + for (int i = 0; i < n; ++i) { + w.work(i); + } + } + + for (int i = 0; i < n; ++i) { + real_t expect = a[i] * w.bias + b[i]; + if (fabs(c[i] - expect) > PRECISION) err++; + } + + free(a); + free(b); + free(c); + return err; +} +#endif + +#ifndef T3 +//T3:runtime,data,parallelism,procedure-boundary,cxx-lambda-loop,construct-independent,V:3.4- +int test3() { + int err = 0; + srand(SEED); + + real_t *a = (real_t*)malloc(n * sizeof(real_t)); + real_t *b = (real_t*)malloc(n * sizeof(real_t)); + real_t *c = (real_t*)malloc(n * sizeof(real_t)); + if (!a || !b || !c) return 1; + + for (int i = 0; i < n; ++i) { + a[i] = rand() / (real_t)(RAND_MAX / 10); + b[i] = rand() / (real_t)(RAND_MAX / 10); + c[i] = 0; + } + + const real_t alpha = (real_t)1.5; + + #pragma acc data copyin(a[0:n], b[0:n]) copy(c[0:n]) + { + // We place the OpenACC loop directive inside a lambda body. + // This stresses that loop/parallel analysis remains valid across + // the lambda boundary (i.e., directives in nested procedures). + auto do_loop = [=]() { + #pragma acc loop + for (int i = 0; i < n; ++i) { + c[i] = alpha * a[i] - b[i]; + } + }; + + #pragma acc parallel default(present) + { + do_loop(); + } + } + + for (int i = 0; i < n; ++i) { + real_t expect = alpha * a[i] - b[i]; + if (fabs(c[i] - expect) > PRECISION) err++; + } + + free(a); + free(b); + free(c); + return err; +} +#endif + +int main() { + int failcode = 0; + int failed; + +#ifndef T1 + failed = 0; + for (int i = 0; i < NUM_TEST_CALLS; ++i) failed += test1(); + if (failed != 0) failcode += (1 << 0); +#endif + +#ifndef T2 + failed = 0; + for (int i = 0; i < NUM_TEST_CALLS; ++i) failed += test2(); + if (failed != 0) failcode += (1 << 1); +#endif + +#ifndef T3 + failed = 0; + for (int i = 0; i < NUM_TEST_CALLS; ++i) failed += test3(); + if (failed != 0) failcode += (1 << 2); +#endif + + return failcode; +}