Skip to content

Speed up mul! for Jacobi matrices (X, Y) #190

@MikaelSlevinsky

Description

@MikaelSlevinsky

The following code compares the mul! speed of banded-block-banded matrices to the diagonal. For ChebyshevT, the diagonal mul! is mathematically only 3x for X and 9x for Y given the tridagonal-block-tridiagonal structure, but there appears to be a large overhead to it all. Moreover, mul! with views may spend a lot of time with the garbage collector and can be slower even though they require less data. I also don't see why mul! with X is not 3x faster than that with Y. Finally, mul! with an adjoint view is basically unusable (O(n^4) complexity?).

julia> using ClassicalOrthogonalPolynomials, MultivariateOrthogonalPolynomials, LinearAlgebra

julia>= RectPolynomial(ChebyshevT(), ChebyshevT());

julia> Jxt = jacobimatrix(Val(1), T²);

julia> Jyt = jacobimatrix(Val(2), T²);

julia> function jacobi_mul!(y, Z, x)
           n = blocksize(Z, 1)
           ZB1 = Z[Block(1), Block(1)]
           k = size(ZB1, 1)
           vx = view(x, 1:k)
           vy = view(y, 1:k)
           mul!(vy, ZB1, vx)
           ZB2 = Z[Block(1), Block(2)]
           vx = view(x, k+1:(k+size(ZB2, 2)))
           mul!(vy, ZB2, vx, 1, 1)
           Kstart = 1
           Kstop = k
           for j in 2:n-1
               ZBjm1 = Z[Block(j), Block(j-1)]
               vx = view(x, Kstart:Kstop)
               ZBj = Z[Block(j), Block(j)]
               Kstart = Kstop + 1
               Kstop = Kstop + size(ZBj, 2)
               vy = view(y, Kstart:Kstop)
               mul!(vy, ZBjm1, vx)
               vx = view(x, Kstart:Kstop)
               mul!(vy, ZBj, vx, 1, 1)
               ZBjp1 = Z[Block(j), Block(j+1)]
               vx = view(x, (Kstop+1):(Kstop+size(ZBjp1, 2)))
               mul!(vy, ZBjp1, vx, 1, 1)
           end
           ZBnm1 = Z[Block(n), Block(n-1)]
           vx = view(x, Kstart:Kstop)
           ZBn = Z[Block(n), Block(n)]
           Kstart = Kstop + 1
           Kstop = Kstop + size(ZBn, 2)
           vy = view(y, Kstart:Kstop)
           mul!(vy, ZBnm1, vx)
           vx = view(x, Kstart:Kstop)
           mul!(vy, ZBn, vx, 1, 1)
           return y
       end
jacobi_mul! (generic function with 1 method)

julia> function jacobi_t_mul!(y, Z, x)
           n = blocksize(Z, 1)
           ZB1 = Z[Block(1), Block(1)]'
           k = size(ZB1, 1)
           vx = view(x, 1:k)
           vy = view(y, 1:k)
           mul!(vy, ZB1, vx)
           ZB2 = Z[Block(2), Block(1)]'
           vx = view(x, k+1:(k+size(ZB2, 2)))
           mul!(vy, ZB2, vx, 1, 1)
           Kstart = 1
           Kstop = k
           for j in 2:n-1
               ZBjm1 = Z[Block(j-1), Block(j)]'
               vx = view(x, Kstart:Kstop)
               ZBj = Z[Block(j), Block(j)]'
               Kstart = Kstop + 1
               Kstop = Kstop + size(ZBj, 2)
               vy = view(y, Kstart:Kstop)
               mul!(vy, ZBjm1, vx)
               vx = view(x, Kstart:Kstop)
               mul!(vy, ZBj, vx, 1, 1)
               ZBjp1 = Z[Block(j+1), Block(j)]'
               vx = view(x, (Kstop+1):(Kstop+size(ZBjp1, 2)))
               mul!(vy, ZBjp1, vx, 1, 1)
           end
           ZBnm1 = Z[Block(n-1), Block(n)]'
           vx = view(x, Kstart:Kstop)
           ZBn = Z[Block(n), Block(n)]'
           Kstart = Kstop + 1
           Kstop = Kstop + size(ZBn, 2)
           vy = view(y, Kstart:Kstop)
           mul!(vy, ZBnm1, vx)
           vx = view(x, Kstart:Kstop)
           mul!(vy, ZBn, vx, 1, 1)
           return y
       end
jacobi_t_mul! (generic function with 1 method)

julia> for n in 50:50:1000
           println("n = $n")
           X = Jxt[Block.(1:n), Block.(1:n)]
           Y = Jyt[Block.(1:n), Block.(1:n)]
           DX = Diagonal(X)
           x = Vector(X[:, 1])
           y = zero(x)
           @time mul!(y, X, x)
           @time mul!(y, Y, x)
           @time mul!(y, DX, x)
           @time jacobi_mul!(y, X, x)
           @time jacobi_mul!(y, Y, x)

           VX = view(X, Block.(n÷2:n), Block.(n÷2:n))
           x = Vector(VX[:, 1])
           y = zero(x)
           @time mul!(y, VX, x)
           @time jacobi_mul!(y, VX, x)
           @time jacobi_t_mul!(y, VX, x)
           if n < 200
               @time mul!(y, VX', x)
           end
       end
n = 50
  0.000740 seconds (149 allocations: 9.312 KiB)
  0.000273 seconds (149 allocations: 9.312 KiB)
  0.000008 seconds
  0.000189 seconds (148 allocations: 38.781 KiB)
  0.000369 seconds (148 allocations: 100.453 KiB)
  0.000096 seconds (2.84 k allocations: 178.906 KiB)
  0.000131 seconds (152 allocations: 32.094 KiB)
  0.000126 seconds (152 allocations: 32.094 KiB)
  0.201908 seconds (1.91 M allocations: 494.370 MiB, 31.93% gc time)
n = 100
  0.001709 seconds (299 allocations: 18.688 KiB)
  0.000756 seconds (299 allocations: 18.688 KiB)
  0.000007 seconds
  0.003657 seconds (298 allocations: 139.031 KiB)
  0.000647 seconds (298 allocations: 385.594 KiB)
  0.000317 seconds (11.29 k allocations: 708.781 KiB)
  0.001723 seconds (302 allocations: 110.172 KiB)
  0.000451 seconds (302 allocations: 110.172 KiB)
  3.623088 seconds (29.29 M allocations: 13.525 GiB, 35.57% gc time)
n = 150
  0.002565 seconds (449 allocations: 28.062 KiB)
  0.001368 seconds (449 allocations: 28.062 KiB)
  0.000012 seconds
  0.005007 seconds (448 allocations: 299.688 KiB)
  0.001135 seconds (448 allocations: 848.469 KiB)
  0.000696 seconds (25.36 k allocations: 1.553 MiB)
  0.000674 seconds (452 allocations: 233.438 KiB)
  0.000796 seconds (452 allocations: 233.438 KiB)
 22.467557 seconds (146.26 M allocations: 91.525 GiB, 38.30% gc time)
n = 200
  0.003845 seconds (599 allocations: 37.438 KiB)
  0.002454 seconds (599 allocations: 37.438 KiB)
  0.000019 seconds
  0.001953 seconds (598 allocations: 522.750 KiB)
  0.003532 seconds (598 allocations: 1.452 MiB)
  0.001222 seconds (45.06 k allocations: 2.757 MiB)
  0.001062 seconds (602 allocations: 403.406 KiB)
  0.000965 seconds (602 allocations: 403.406 KiB)
n = 250
  0.003841 seconds (749 allocations: 46.812 KiB)
  0.005091 seconds (749 allocations: 46.812 KiB)
  0.000040 seconds
  0.002799 seconds (748 allocations: 808.344 KiB)
  0.003726 seconds (748 allocations: 2.248 MiB)
  0.002525 seconds (70.39 k allocations: 4.304 MiB)
  0.002658 seconds (752 allocations: 620.469 KiB)
  0.001539 seconds (752 allocations: 620.469 KiB)
n = 300
  0.004683 seconds (899 allocations: 56.188 KiB)
  0.004331 seconds (899 allocations: 56.188 KiB)
  0.000057 seconds
  0.004545 seconds (898 allocations: 1.127 MiB)
  0.004530 seconds (898 allocations: 3.215 MiB)
  0.002720 seconds (101.34 k allocations: 6.195 MiB)
  0.003189 seconds (902 allocations: 883.484 KiB)
  0.001960 seconds (902 allocations: 883.484 KiB)
n = 350
  0.015783 seconds (1.05 k allocations: 65.562 KiB)
  0.016362 seconds (1.05 k allocations: 65.562 KiB)
  0.000076 seconds
  0.008280 seconds (1.05 k allocations: 1.521 MiB)
  0.009735 seconds (1.05 k allocations: 4.353 MiB)
  0.003792 seconds (137.91 k allocations: 8.428 MiB)
  0.002665 seconds (1.05 k allocations: 1.160 MiB)
  0.002354 seconds (1.05 k allocations: 1.160 MiB)
n = 400
  0.010599 seconds (1.20 k allocations: 74.938 KiB)
  0.007430 seconds (1.20 k allocations: 74.938 KiB)
  0.000078 seconds
  0.006021 seconds (1.20 k allocations: 1.973 MiB)
  0.026957 seconds (1.20 k allocations: 5.664 MiB, 70.43% gc time)
  0.004860 seconds (180.11 k allocations: 11.006 MiB)
  0.003017 seconds (1.20 k allocations: 1.500 MiB)
  0.003652 seconds (1.20 k allocations: 1.500 MiB)
n = 450
  0.009276 seconds (1.35 k allocations: 84.312 KiB)
  0.009503 seconds (1.35 k allocations: 84.312 KiB)
  0.000115 seconds
  0.006383 seconds (1.35 k allocations: 2.482 MiB)
  0.008038 seconds (1.35 k allocations: 7.146 MiB)
  0.006218 seconds (227.94 k allocations: 13.927 MiB)
  0.005178 seconds (1.35 k allocations: 1.881 MiB)
  0.004215 seconds (1.35 k allocations: 1.881 MiB)
n = 500
  0.012857 seconds (1.50 k allocations: 93.688 KiB)
  0.011674 seconds (1.50 k allocations: 93.688 KiB)
  0.000136 seconds
  0.008604 seconds (1.50 k allocations: 3.048 MiB)
  0.010344 seconds (1.50 k allocations: 8.799 MiB)
  0.007599 seconds (281.39 k allocations: 17.191 MiB)
  0.005681 seconds (1.50 k allocations: 2.306 MiB)
  0.004769 seconds (1.50 k allocations: 2.306 MiB)
n = 550
  0.014980 seconds (1.65 k allocations: 103.062 KiB)
  0.017946 seconds (1.65 k allocations: 103.062 KiB)
  0.000174 seconds
  0.010327 seconds (1.65 k allocations: 3.671 MiB)
  0.014607 seconds (1.65 k allocations: 10.625 MiB)
  0.009169 seconds (340.46 k allocations: 20.798 MiB)
  0.006178 seconds (1.65 k allocations: 2.773 MiB)
  0.009445 seconds (1.65 k allocations: 2.773 MiB)
n = 600
  0.016862 seconds (1.80 k allocations: 112.438 KiB)
  0.018070 seconds (1.80 k allocations: 112.438 KiB)
  0.000194 seconds
  0.012115 seconds (1.80 k allocations: 4.351 MiB)
  0.017187 seconds (1.80 k allocations: 12.622 MiB)
  0.011016 seconds (405.16 k allocations: 24.748 MiB)
  0.009576 seconds (1.80 k allocations: 3.282 MiB)
  0.007794 seconds (1.80 k allocations: 3.282 MiB)
n = 650
  0.019549 seconds (1.95 k allocations: 121.812 KiB)
  0.020473 seconds (1.95 k allocations: 121.812 KiB)
  0.000235 seconds
  0.013520 seconds (1.95 k allocations: 5.089 MiB)
  0.019936 seconds (1.95 k allocations: 14.790 MiB)
  0.012865 seconds (475.49 k allocations: 29.042 MiB)
  0.009241 seconds (1.95 k allocations: 3.835 MiB)
  0.008665 seconds (1.95 k allocations: 3.835 MiB)
n = 700
  0.026187 seconds (2.10 k allocations: 131.188 KiB)
  0.023385 seconds (2.10 k allocations: 131.188 KiB)
  0.000276 seconds
  0.018192 seconds (2.10 k allocations: 5.884 MiB)
  0.021707 seconds (2.15 k allocations: 17.127 MiB)
  0.032264 seconds (551.44 k allocations: 33.679 MiB, 53.71% gc time)
  0.010982 seconds (2.10 k allocations: 4.430 MiB)
  0.009810 seconds (2.10 k allocations: 4.430 MiB)
n = 750
  0.026024 seconds (2.25 k allocations: 140.562 KiB)
  0.026325 seconds (2.25 k allocations: 140.562 KiB)
  0.000324 seconds
  0.075207 seconds (2.25 k allocations: 6.736 MiB, 73.08% gc time)
  0.024082 seconds (2.45 k allocations: 19.627 MiB)
  0.017025 seconds (633.01 k allocations: 38.659 MiB)
  0.012634 seconds (2.25 k allocations: 5.069 MiB)
  0.009810 seconds (2.25 k allocations: 5.069 MiB)
n = 800
  0.033951 seconds (2.40 k allocations: 149.938 KiB)
  0.033742 seconds (2.40 k allocations: 149.938 KiB)
  0.000368 seconds
  0.027009 seconds (2.40 k allocations: 7.646 MiB)
  0.026405 seconds (2.75 k allocations: 22.299 MiB)
  0.020540 seconds (720.21 k allocations: 43.984 MiB)
  0.035980 seconds (2.40 k allocations: 5.750 MiB, 62.47% gc time)
  0.012604 seconds (2.40 k allocations: 5.750 MiB)
n = 850
  0.028899 seconds (2.55 k allocations: 159.312 KiB)
  0.031767 seconds (2.55 k allocations: 159.312 KiB)
  0.000406 seconds
  0.039557 seconds (2.55 k allocations: 8.613 MiB, 45.35% gc time)
  0.027419 seconds (3.05 k allocations: 25.143 MiB)
  0.021903 seconds (813.04 k allocations: 49.651 MiB)
  0.031004 seconds (2.55 k allocations: 6.474 MiB, 51.47% gc time)
  0.035000 seconds (2.55 k allocations: 6.474 MiB)
n = 900
  0.044860 seconds (2.70 k allocations: 168.688 KiB)
  0.046641 seconds (2.70 k allocations: 168.688 KiB)
  0.000514 seconds
  0.026460 seconds (2.70 k allocations: 9.636 MiB)
  0.039504 seconds (3.35 k allocations: 28.159 MiB)
  0.026057 seconds (911.49 k allocations: 55.661 MiB)
  0.015254 seconds (2.70 k allocations: 7.241 MiB)
  0.016091 seconds (2.70 k allocations: 7.241 MiB)
n = 950
  0.036002 seconds (2.85 k allocations: 178.062 KiB)
  0.034280 seconds (2.85 k allocations: 178.062 KiB)
  0.000477 seconds
  0.026395 seconds (2.85 k allocations: 10.717 MiB)
  0.057024 seconds (3.65 k allocations: 31.346 MiB, 38.59% gc time)
  0.027497 seconds (1.02 M allocations: 62.015 MiB)
  0.021707 seconds (2.85 k allocations: 8.051 MiB, 19.26% gc time)
  0.016856 seconds (2.85 k allocations: 8.051 MiB)
n = 1000
  0.063660 seconds (3.00 k allocations: 187.438 KiB, 31.73% gc time)
  0.038649 seconds (3.00 k allocations: 187.438 KiB)
  0.000567 seconds
  0.028659 seconds (3.00 k allocations: 11.856 MiB)
  0.040442 seconds (3.95 k allocations: 34.705 MiB)
  0.081095 seconds (1.13 M allocations: 68.712 MiB, 59.31% gc time)
  0.019885 seconds (3.00 k allocations: 8.904 MiB)
  0.023380 seconds (3.00 k allocations: 8.904 MiB)

julia> 

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions