diff --git a/README.md b/README.md
index ba4934b..986a071 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,29 @@ julia> fft(rand(Double64, 2))
 2-element Vector{Complex{Double64}}:
  0.4026739024263829 + 0.0im
  0.3969515892883767 + 0.0im
- ```
+```
+
+## Usage for low-precision FFTs
+
+```julia
+julia> using GenericFFT, BFloat16s
+
+julia> fs = 1000.0        
+julia> t  = 0:1/fs:1-1/fs 
+julia> f1, f2 = 50.0, 120.0
+
+julia> T = Float16
+julia> # see: https://www.mathworks.com/help/matlab/ref/fft.html
+julia> x = T.(0.7*sin.(2π * f1 * t) .+ 0.5 * sin.(2π * f2 * t)) .+ T(0.8) 
+julia> X = fft(x)
+julia> println("Max round-trip error: ", maximum(abs.(x - real(ifft(X)))))
+
+julia> T = BFloat16
+julia> x = T.(0.7*sin.(2π * f1 * t) .+ 0.5 * sin.(2π * f2 * t)) .+ T(0.8) 
+julia> X = fft(x)
+julia> println("Max round-trip error: ", maximum(abs.(x - real(ifft(X)))))
+```
+
 
 ## History
 
diff --git a/src/fft.jl b/src/fft.jl
index 26910bc..e39e799 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -1,12 +1,9 @@
-const AbstractFloats = Union{AbstractFloat,Complex{T} where T<:AbstractFloat}
-
 # We use these type definitions for clarity
 const RealFloats = T where T<:AbstractFloat
 const ComplexFloats = Complex{T} where T<:AbstractFloat
-
+const AbstractFloats = Union{RealFloats, ComplexFloats}
 
 # The following implements Bluestein's algorithm, following http://www.dsprelated.com/dspbooks/mdft/Bluestein_s_FFT_Algorithm.html
-# To add more types, add them in the union of the function's signature.
 
 function generic_fft!(x::AbstractVector{Complex{T}}) where {T<:AbstractFloat}
     if ispow2(length(x))
@@ -23,7 +20,7 @@ function generic_fft!(x::AbstractVector{Complex{T}}, region::Integer) where {T<:
 end
 
 function _generic_fft_first_dim!(x, Ipost)
-    Threads.@threads for I in Ipost
+    for I in Ipost
         generic_fft!(@view x[:, I])
     end
     x
@@ -81,18 +78,24 @@ function generic_fft!(x)
 end
 
 
-generic_fft(x, region) = generic_fft!(copy(x), region)
+# generic_fft(x, region) = generic_fft!(copy(complex(x)), region)
+# generic_fft(x) = generic_fft!(copy(complex(x)))
+
+copycomplex(A::AbstractArray{<:Complex}) = copy(A)
+copycomplex(A::AbstractArray{<:Real}) = complex(A)
+generic_fft(x, region) = generic_fft!(copycomplex(x), region)
+generic_fft(x) = generic_fft!(copycomplex(x))
 
-generic_fft(x) = generic_fft!(copy(x))
 
 function generic_fft(x::AbstractVector{T}) where T<:AbstractFloats
     n = length(x)
     ispow2(n) && return generic_fft_pow2(x)
-    ks = range(zero(real(T)),stop=n-one(real(T)),length=n)
-    Wks = @. cispi(-T(ks^2/n))
+    S = promote_type(real(T), Float64)
+    ks = range(zero(S), stop=S(n)-one(S), length=n)
+    Wks = Complex{real(T)}.(cispi.(-ks.^2 ./ S(n)))    # always Complex
     Wksrev = @view Wks[reverse(eachindex(Wks))]
-    xq, wq = x.*Wks, conj!([cispi(-T(n)); Wksrev; @view Wks[2:end]])
-    return Wks.* @view _conv!(xq,wq)[n+1:2n]
+    xq, wq = complex(x).*Wks, conj!([Complex{real(T)}(cispi(-S(n))); Wksrev; @view Wks[2:end]])
+    return Wks .* @view _conv!(xq,wq)[n+1:2n]
 end
 
 generic_bfft(x::AbstractArray{T, N}, region) where {T <: AbstractFloats, N} = conj!(generic_fft(conj(x), region))
@@ -105,27 +108,78 @@ generic_ifft(x::AbstractArray{T, N}, region) where {T<:AbstractFloats, N} = ldiv
 generic_ifft!(x::AbstractArray{T, N}, region) where {T<:AbstractFloats, N} = ldiv!(T(_regionscale(x, region)), conj!(generic_fft!(conj!(x), region)))
 
 generic_rfft(v::AbstractVector{T}, region) where T<:AbstractFloats = generic_fft(v, region)[1:div(length(v),2)+1]
+
+function generic_rfft(x::AbstractArray{T, N}, region) where {T<:AbstractFloats, N}
+    d = first(region)
+    if length(region) > 1
+        return generic_fft(generic_rfft(x, d), region[2:end])
+    end
+
+    nout = size(x, d) ÷ 2 + 1
+    sz = collect(size(x))
+    sz[d] = nout
+    out = similar(x, Complex{real(T)}, tuple(sz...))
+
+    # CartesianIndices enables iterating over slices in arbitrary dimensions
+    Rpre = CartesianIndices(size(x)[1:d-1])
+    Rpost = CartesianIndices(size(x)[d+1:end])
+
+    for Ipost in Rpost
+        for Ipre in Rpre
+            out[Ipre, :, Ipost] .= generic_rfft(view(x, Ipre, :, Ipost), 1)
+        end
+    end
+    return out
+end
+
 function generic_irfft(v::AbstractVector{T}, n::Integer, region) where T<:ComplexFloats
     @assert length(v) == n>>1 + 1
     r = Vector{T}(undef, n)
     r[1:length(v)]=v
     r[length(v)+1:n]=reverse(conj(v[2:end])[1:n-length(v)])
-    real(generic_ifft(r, region))
+    return real(generic_ifft(r, region))
+end
+
+function generic_irfft(x::AbstractArray{T, N}, n::Integer, region) where {T<:ComplexFloats, N}
+    d = first(region)
+    if length(region) > 1
+        return generic_irfft(generic_ifft(x, region[2:end]), n, d)
+    end
+
+    sz = collect(size(x))
+    sz[d] = n
+    out = similar(x, real(T), tuple(sz...))
+
+    Rpre = CartesianIndices(size(x)[1:d-1])
+    Rpost = CartesianIndices(size(x)[d+1:end])
+
+    for Ipost in Rpost
+        for Ipre in Rpre
+            out[Ipre, :, Ipost] .= generic_irfft(view(x, Ipre, :, Ipost), n, 1)
+        end
+    end
+    return out
+end
+
+function generic_brfft(v::AbstractArray, n::Integer, region)
+    scale = n * _regionscale(v, region isa Integer ? () : region[2:end])
+    return generic_irfft(v, n, region) * scale
 end
-generic_brfft(v::AbstractArray, n::Integer, region) = generic_irfft(v, n, region)*n
 
 function _conv!(u::AbstractVector{T}, v::AbstractVector{T}) where T<:AbstractFloats
-    nu = length(u)
-    nv = length(v)
-    n = nu + nv - 1
+    nu, nv = length(u), length(v)
+    n  = nu + nv - 1
     np2 = nextpow(2, n)
     append!(u, zeros(T, np2-nu))
     append!(v, zeros(T, np2-nv))
-    y = generic_ifft_pow2(generic_fft_pow2(u).*generic_fft_pow2(v))
-    #TODO This would not handle Dual/ComplexDual numbers correctly
-    y = T<:Real ? real(y[1:n]) : y[1:n]
+    S = promote_type(real(T), Float64)
+    uf = Complex{S}.(u)
+    vf = Complex{S}.(v)
+    y = generic_ifft_pow2(generic_fft_pow2(uf) .* generic_fft_pow2(vf))
+    y = T <: Real ? T.(real(y[1:n])) : T.(y[1:n])
 end
 
+
 # This is a Cooley-Tukey FFT algorithm inspired by many widely available algorithms including:
 # c_radix2.c in the GNU Scientific Library and four1 in the Numerical Recipes in C.
 # However, the trigonometric recurrence is improved for greater efficiency.
@@ -262,7 +316,7 @@ for P in (:DummyFFTPlan, :DummyiFFTPlan, :DummybFFTPlan, :DummyDCTPlan, :DummyiD
     @eval begin
         mutable struct $P{T,inplace,G} <: DummyPlan{T}
             region::G # region (iterable) of dims that are transformed
-            pinv::DummyPlan{T}
+            pinv::Plan
             $P{T,inplace,G}(region::G) where {T<:AbstractFloats, inplace, G} = new(region)
         end
     end
@@ -271,8 +325,8 @@ for P in (:DummyrFFTPlan, :DummyirFFTPlan, :DummybrFFTPlan)
     @eval begin
         mutable struct $P{T,inplace,G} <: DummyPlan{T}
             n::Integer
-            region::G # region (iterable) of dims that are transformed
-            pinv::DummyPlan{T}
+            region::G
+            pinv::Plan
             $P{T,inplace,G}(n::Integer, region::G) where {T<:AbstractFloats, inplace, G} = new(n, region)
         end
     end
@@ -287,8 +341,8 @@ for (Plan,iPlan) in ((:DummyFFTPlan,:DummyiFFTPlan),
 end
 
 # Specific for rfft, irfft and brfft:
-plan_inv(p::DummyirFFTPlan{T,inplace,G}) where {T,inplace,G} = DummyrFFTPlan{T,inplace,G}(p.n, p.region)
-plan_inv(p::DummyrFFTPlan{T,inplace,G}) where {T,inplace,G} = DummyirFFTPlan{T,inplace,G}(p.n, p.region)
+plan_inv(p::DummyirFFTPlan{T,inplace,G}) where {T,inplace,G} = DummyrFFTPlan{real(T),inplace,G}(p.n, p.region)
+plan_inv(p::DummyrFFTPlan{T,inplace,G}) where {T,inplace,G} = DummyirFFTPlan{Complex{T},inplace,G}(p.n, p.region)
 
 
 
@@ -331,6 +385,14 @@ end
 
 plan_fft(x::StridedArray{T}, region) where {T <: ComplexFloats} = DummyFFTPlan{T,false,typeof(region)}(region)
 plan_fft!(x::StridedArray{T}, region) where {T <: ComplexFloats} = DummyFFTPlan{T,true,typeof(region)}(region)
+plan_fft(x::StridedArray{T}, region; kws...) where {T <: RealFloats} =
+    T <: FFTW.fftwReal ? invoke(plan_fft, Tuple{AbstractArray{<:Real}, Any}, x, region; kws...) : DummyFFTPlan{Complex{T},false,typeof(region)}(region)
+plan_fft!(x::StridedArray{T}, region; kws...) where {T <: RealFloats} =
+    T <: FFTW.fftwReal ? invoke(plan_fft!, Tuple{AbstractArray, Any}, x, region; kws...) : DummyFFTPlan{Complex{T},true,typeof(region)}(region)
+
+# intercept fft(x) before AbstractFFTs gets a chance for any non-FFTW float type.
+fft(x::StridedArray{T}) where {T<:AbstractFloats} = generic_fft(x)
+fft(x::StridedArray{T}, region) where {T<:AbstractFloats} = generic_fft(x, region)
 
 plan_bfft(x::StridedArray{T}, region) where {T <: ComplexFloats} = DummybFFTPlan{T,false,typeof(region)}(region)
 plan_bfft!(x::StridedArray{T}, region) where {T <: ComplexFloats} = DummybFFTPlan{T,true,typeof(region)}(region)
@@ -345,11 +407,11 @@ plan_dct!(x::StridedArray{T}, region) where {T <: AbstractFloats} = DummyDCTPlan
 plan_idct(x::StridedArray{T}, region) where {T <: AbstractFloats} = DummyiDCTPlan{T,false,typeof(region)}(region)
 plan_idct!(x::StridedArray{T}, region) where {T <: AbstractFloats} = DummyiDCTPlan{T,true,typeof(region)}(region)
 
-plan_rfft(x::StridedArray{T}, region) where {T <: RealFloats} = DummyrFFTPlan{T,false,typeof(region)}(length(x), region)
+plan_rfft(x::StridedArray{T}, region) where {T <: RealFloats} = DummyrFFTPlan{T,false,typeof(region)}(size(x, first(region)), region)
 plan_brfft(x::StridedArray{T}, n::Integer, region) where {T <: ComplexFloats} = DummybrFFTPlan{T,false,typeof(region)}(n, region)
 
-# A plan for irfft is created in terms of a plan for brfft.
-# plan_irfft(x::StridedArray{T}, n::Integer, region) where {T <: ComplexFloats} = DummyirFFTPlan{Complex{real(T)},false,typeof(region)}(n, region)
+# Explicitly define plan_irfft to ensure correct scaling
+plan_irfft(x::StridedArray{T}, n::Integer, region) where {T <: ComplexFloats} = DummyirFFTPlan{T,false,typeof(region)}(n, region)
 
 # These don't exist for now:
 # plan_rfft!(x::StridedArray{T}) where {T <: RealFloats} = DummyrFFTPlan{Complex{real(T)},true}()
diff --git a/test/Project.toml b/test/Project.toml
index d19789a..b59fafd 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,6 +1,7 @@
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
diff --git a/test/fft_tests.jl b/test/fft_tests.jl
index 266451e..156db22 100644
--- a/test/fft_tests.jl
+++ b/test/fft_tests.jl
@@ -1,4 +1,4 @@
-using DoubleFloats, FFTW, GenericFFT, LinearAlgebra
+using BFloat16s, DoubleFloats, LinearAlgebra
 import GenericFFT: generic_fft, generic_fft!
 
 function test_basic_functionality()
@@ -44,8 +44,8 @@ function test_fft_dct(T)
     @test norm(dct(idct(c))-c,Inf) < 1000eps(T)
 
     @test_throws AssertionError irfft(c, 197)
-    @test norm(irfft(c, 198) - irfft(map(ComplexF64, c), 198), Inf) < 10eps(Float64)
-    @test norm(irfft(c, 199) - irfft(map(ComplexF64, c), 199), Inf) < 10eps(Float64)
+    @test norm(irfft(c, 198) - irfft(map(ComplexF64, c), 198), Inf) < 100eps(Float64)
+    @test norm(irfft(c, 199) - irfft(map(ComplexF64, c), 199), Inf) < 100eps(Float64)
     @test_throws AssertionError irfft(c, 200)
 end
 
@@ -202,3 +202,156 @@ end
     @allocations generic_fft!(A2)  # compile
     @test N+150 > @allocations generic_fft!(A2)  # a few allocations is OK
 end
+
+@testset "Batched rfft/irfft" begin
+    for T in (Float64, BigFloat)
+        X = randn(T, 10, 6)
+        
+        Y1 = rfft(X, 1) # Dimension 1
+        @test size(Y1) == (10÷2+1, 6)
+        for j in 1:6
+            @test Y1[:, j] ≈ rfft(X[:, j])
+        end
+        @test irfft(Y1, 10, 1) ≈ X
+        
+        Y2 = rfft(X, 2)  # Dimension 2
+        @test size(Y2) == (10, 6÷2+1)
+        for i in 1:10
+            @test Y2[i, :] ≈ rfft(X[i, :])
+        end
+        @test irfft(Y2, 6, 2) ≈ X
+
+        Y12 = rfft(X, (1, 2)) # 2D RFFT
+        @test size(Y12) == (10÷2+1, 6)
+        @test Y12 ≈ fft(rfft(X, 1), 2)
+        @test irfft(Y12, 10, (1, 2)) ≈ X
+
+        p1 = plan_rfft(X, 1) # Plans
+        @test p1 * X ≈ rfft(X, 1)
+        @test inv(p1) * (p1 * X) ≈ X
+
+        p2 = plan_rfft(X, 2)
+        @test p2 * X ≈ rfft(X, 2)
+        @test inv(p2) * (p2 * X) ≈ X
+    end
+
+
+    for n in (7, 11) # Test a few odd lengths
+        X = randn(BigFloat, n, 4)
+        Y = rfft(X, 1)
+        @test size(Y) == (n÷2+1, 4)
+        @test irfft(Y, n, 1) ≈ X
+    end
+
+    data = randn(BigFloat, 10, 10)
+    v = view(data, 1:8, 1:6)
+    @test rfft(v, 1) ≈ rfft(collect(v), 1)
+    @test irfft(rfft(v, 1), 8, 1) ≈ v
+
+    X3 = randn(BigFloat, 4, 10, 4) # Test 3D Batched
+    
+    Y3 = rfft(X3, 2) # Transform along dimension 2
+    @test size(Y3) == (4, 10÷2+1, 4)
+    for i in 1:4, k in 1:4
+        @test Y3[i, :, k] ≈ rfft(X3[i, :, k])
+    end
+    @test irfft(Y3, 10, 2) ≈ X3
+
+    X4 = randn(BigFloat, 3, 3, 3, 3) # Test 4D
+    Y4 = rfft(X4, (1, 2, 3)) # RFFT over first 3 dimensions
+    @test size(Y4) == (3÷2+1, 3, 3, 3)
+    @test irfft(Y4, 3, (1, 2, 3)) ≈ X4
+    @test irfft(rfft(X4, (1, 2, 3, 4)), 3, (1, 2, 3, 4)) ≈ X4
+
+    X_single = randn(BigFloat, 10, 1)
+    @test rfft(X_single, 1) ≈ rfft(vec(X_single))
+    @test irfft(rfft(X_single, 1), 10, 1) ≈ X_single
+
+    X_br = randn(BigFloat, 10, 6)
+    Y_br = rfft(X_br, (1, 2))
+    # brfft should be irfft * (10 * 6)
+    @test brfft(Y_br, 10, (1, 2)) ≈ irfft(Y_br, 10, (1, 2)) * 60
+end
+
+@testset "Real-input generic_fft coverage" begin
+    X = randn(BigFloat, 8, 8)
+    @test GenericFFT.generic_fft(X, 1) ≈ fft(complex(X), 1)
+    @test GenericFFT.generic_fft(X, (1, 2)) ≈ fft(complex(X))
+end
+
+@testset "Real-input dispatch — fft(x) routes through GenericFFT" begin
+    for T in (Float16, BFloat16, BigFloat)
+        x = T.(randn(64))
+        @test plan_fft(x, 1:1) isa GenericFFT.DummyPlan
+        @test plan_fft(x, 1) isa GenericFFT.DummyPlan
+    end
+end
+
+@testset "No stack overflow for real input — non-power-of-2" begin
+    for T in (Float16, BFloat16, BigFloat)
+        x = T.(randn(100))
+        @test_nowarn fft(x)
+        @test_nowarn fft(x, 1)
+    end
+end
+
+@testset "Chirp index arithmetic in Float64 — no k² precision loss" begin
+    for T in (Float16, BFloat16, BigFloat)
+        n = 1000
+        S = promote_type(real(T), Float64)
+        ks = range(zero(S), stop=S(n)-one(S), length=n)
+        Wks = Complex{real(T)}.(cispi.(-ks.^2 ./ S(n)))
+        for k in [100, 500, 999]
+            ref = cispi(-Float64(k-1)^2 / n)
+            err = abs(ref - Complex{Float64}(Wks[k]))
+            @test err < 1e-2
+        end
+    end
+end
+
+@testset "Non-power-of-2 round-trip — no overflow or precision collapse" begin
+    for T in (Float16, BFloat16, BigFloat)
+        for n in (100, 500, 1000)
+            x   = T.(randn(n))
+            X   = fft(x)
+            xr  = real(ifft(X))
+            err = maximum(abs.(Float64.(x) .- Float64.(xr)))
+            @test !isnan(err)
+            tol = 200 * Float64(eps(real(T)(1))) * log2(n)
+            @test err < tol
+        end
+    end
+end
+
+@testset "BigFloat precision preserved — ks range stays in BigFloat" begin
+    setprecision(256) do
+        n   = 1000
+        x   = randn(BigFloat, n)
+        X   = fft(x)
+        xr  = real(ifft(X))
+        err = maximum(abs.(x .- xr))
+        @test err < 1e-60
+    end
+end
+
+@testset "Wks complex — no InexactError for real T" begin
+    for T in (Float16, BFloat16, BigFloat)
+        n = 100
+        S = promote_type(real(T), Float64)
+        ks = range(zero(S), stop=S(n)-one(S), length=n)
+        @test_nowarn Complex{real(T)}.(cispi.(-ks.^2 ./ S(n)))
+    end
+end
+
+@testset "Dominant frequency bins correct after all fixes" begin
+    for T in (Float16, BFloat16, BigFloat)
+        fs = 1000.0
+        t  = (0:999) ./ fs
+        x  = T.(sin.(2π * 50 * t) .+ 0.5 * sin.(2π * 200 * t))
+        X  = fft(x)
+        mags = abs.(Complex{Float64}.(X))
+        top4 = sortperm(mags, rev=true)[1:4]
+        @test 51  ∈ top4
+        @test 201 ∈ top4
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 54ee4c2..c25f376 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,10 +1,9 @@
 using AbstractFFTs, GenericFFT, Test
-
 using Aqua
+
 @testset "Project quality" begin
     Aqua.test_all(GenericFFT, piracies=(; broken=true))
 end
-
 @test AbstractFFTs.fftfloat(zero(Float16)) isa Float32
 @test AbstractFFTs.fftfloat(zero(Float32)) isa Float32
 @test AbstractFFTs.fftfloat(zero(Float64)) isa Float64
@@ -14,6 +13,8 @@ end
 @test AbstractFFTs.fftfloat(zero(Complex{Float64})) isa Complex{Float64}
 @test AbstractFFTs.fftfloat(zero(Complex{BigFloat})) isa Complex{BigFloat}
 
+using FFTW
+import AbstractFFTs: fft, ifft, rfft, irfft, brfft
 include("fft_tests.jl")
 include("toeplitz_tests.jl")
-include("interlace.jl")
+include("interlace.jl")
\ No newline at end of file