From 99bb96826549cd75c5a73d71433f38cd52ed52a5 Mon Sep 17 00:00:00 2001 From: chen Date: Tue, 13 Jan 2026 10:10:51 +0000 Subject: [PATCH 1/6] feat: add precision checker with hook system and command-line control This PR introduces a comprehensive precision checking system for debugging numerical accuracy issues in distributed training: **Core Features:** - Two-level precision checking (module-level and function-level) - Command-line flags: --precision_check, --precision_check_all_ranks - Extensible hook system for Functions, Modules, and Tensors - Automatic FP32 reference computation for validation **Hook System:** - Forward/backward pre/post hooks for Functions and Modules - Tensor gradient hooks for inspection - Unified hook type definitions to reduce code duplication **Implementation:** - PrecisionChecker utility with configurable check levels - Integration with autograd Function and nn::Module - Support for distributed training (per-rank checking) - Detailed logging to precision_check_rank_[N].log files **Documentation:** - docs/hook_mechanism.md - Hook system architecture - docs/precision_checker_guide.md - Usage guide **Testing:** - test/hook/test_hook.cc - Hook functionality tests - test/hook/test_precision_check.cc - Precision checker tests Co-Authored-By: Claude Sonnet 4.5 --- example/gpt2/main.cc | 10 +- example/gpt2/net.cc | 44 ++--- example/gpt2/net.h | 10 + example/llama3/main.cc | 10 +- example/llama3/net.cc | 50 ++--- example/llama3/net.h | 11 ++ example/mnist/net.cc | 4 +- infini_train/include/autograd/accumulate.h | 2 + infini_train/include/autograd/function.h | 22 +++ infini_train/include/autograd/function_hook.h | 28 +++ infini_train/include/autograd/tensor_hook.h | 37 ++++ infini_train/include/nn/module_hook.h | 56 ++++++ infini_train/include/nn/modules/activations.h | 3 +- infini_train/include/nn/modules/container.h | 2 + infini_train/include/nn/modules/loss.h | 3 +- infini_train/include/nn/modules/module.h | 17 ++ .../include/nn/modules/normalization.h | 1 + infini_train/include/nn/parallel/global.h | 17 +- .../include/nn/parallel/tensor_parallel.h | 3 +- .../include/utils/precision_checker.h | 49 +++++ infini_train/src/autograd/function.cc | 79 ++++++++ infini_train/src/nn/modules/container.cc | 6 +- infini_train/src/nn/modules/module.cc | 78 ++++++++ infini_train/src/nn/modules/normalization.cc | 3 +- infini_train/src/nn/parallel/data_parallel.cc | 4 +- .../nn/parallel/distributed_data_parallel.cc | 2 +- infini_train/src/nn/parallel/global.cc | 27 ++- .../src/nn/parallel/pp/pipeline_schedule.cc | 2 +- .../src/nn/parallel/pp/pipeline_stage.cc | 2 +- .../src/nn/parallel/tensor_parallel.cc | 10 +- infini_train/src/utils/precision_checker.cc | 184 ++++++++++++++++++ 31 files changed, 699 insertions(+), 77 deletions(-) create mode 100644 infini_train/include/autograd/tensor_hook.h create mode 100644 infini_train/include/nn/module_hook.h create mode 100644 infini_train/include/utils/precision_checker.h create mode 100644 infini_train/src/utils/precision_checker.cc diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc index 4a34c464..c43a04d2 100644 --- a/example/gpt2/main.cc +++ b/example/gpt2/main.cc @@ -68,6 +68,9 @@ DEFINE_uint32(virtual_pipeline_parallel, 1, "Number of chunks in PP stage."); // precision DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)"); +// precision check +DEFINE_int32(precision_check, 0, "precision check level: 0=off, 1=module, 2=function"); +DEFINE_bool(precision_check_all_ranks, false, "enable precision check for 
all ranks (default: rank 0 only)"); using namespace infini_train; @@ -297,9 +300,9 @@ void Train(const nn::parallel::Rank &rank) { LOG(INFO) << "Rank " << rank.GlobalRank() << ": start forward"; // (bs, seq_len, vocab_size) - auto logits = model->Forward({x, y})[0]; + auto logits = (*model)({x, y})[0]; LOG(INFO) << "Rank " << rank.GlobalRank() << ": finish model forward, start loss forward"; - auto loss = loss_fn->Forward({logits, y})[0]; + auto loss = (*loss_fn)({logits, y})[0]; // FIXME(jym): verify gradient accumulation precision loss = loss / grad_accum_steps; @@ -364,7 +367,8 @@ int main(int argc, char *argv[]) { google::InitGoogleLogging(argv[0]); nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel, - FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel); + FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel, FLAGS_precision_check, + FLAGS_precision_check_all_ranks); LOG(INFO) << nn::parallel::global::ProcessGroupOverview(); diff --git a/example/gpt2/net.cc b/example/gpt2/net.cc index 69e4278e..18e07dca 100644 --- a/example/gpt2/net.cc +++ b/example/gpt2/net.cc @@ -47,7 +47,7 @@ NewGELU::Forward(const std::vector> &x) { } CausalSelfAttention::CausalSelfAttention(const GPT2Config &config) - : config_(config), n_head_(config.n_head), n_embd_(config.n_embd) { + : CloneableModule(kType), config_(config), n_head_(config.n_head), n_embd_(config.n_embd) { auto tp_world_size = nn::parallel::global::GetTensorParallelSize(); CHECK_EQ(config.n_embd % config.n_head, 0); CHECK_EQ(n_head_ % tp_world_size, 0) << "n_head must be divisible by TP world size"; @@ -89,7 +89,7 @@ CausalSelfAttention::Forward(const std::vector ColumnParallelLinear(C, 3*C) -> (B, T, 3 * local_C) // -> Split -> (3, B, T, local_C) - auto qkv = modules_[kCAttnLayerName]->Forward(x)[0]->Split(local_C, 2); + auto qkv = (*modules_[kCAttnLayerName])(x)[0]->Split(local_C, 2); // (B, T, local_C) auto q = qkv[0]; @@ -120,12 +120,12 @@ CausalSelfAttention::Forward(const std::vector RowParallelLinear(n_embd, n_embd) -> (B, T, C) - y = modules_[kCProjLayerName]->Forward({y})[0]; + y = (*modules_[kCProjLayerName])({y})[0]; // (B, T, C) == (bs, seq_len, n_embd) return {y}; } -MLP::MLP(const GPT2Config &config) { +MLP::MLP(const GPT2Config &config) : CloneableModule(kType) { // c_fc: ColumnParallel (input full, output parallel) modules_[kCFcLayerName] = std::make_shared( /*in_features=*/config.n_embd, /*out_features=*/4 * config.n_embd, @@ -150,16 +150,16 @@ MLP::MLP(const GPT2Config &config) { std::vector> MLP::Forward(const std::vector> &x) { // (B, T, C) -> ColumnParallelLinear(C, 4 * C) -> (B, T, 4 * C_local) - auto x1 = modules_[kCFcLayerName]->Forward(x); + auto x1 = (*modules_[kCFcLayerName])(x); // (B, T, 4 * C_local) -> GELU -> (B, T, 4 * C_local) - auto x2 = modules_[kGeluLayerName]->Forward(x1); + auto x2 = (*modules_[kGeluLayerName])(x1); // (B, T, 4 * C_local) -> RowParallelLinear(4 * C, C) -> (B, T, C) - auto x3 = modules_[kCProjLayerName]->Forward(x2); + auto x3 = (*modules_[kCProjLayerName])(x2); // (B, T, C) return x3; } -Block::Block(const GPT2Config &config) { +Block::Block(const GPT2Config &config) : CloneableModule(kType) { modules_[kLn1LayerName] = std::make_shared(std::vector{config.n_embd}); modules_[kAttnLayerName] = std::make_shared(config); modules_[kLn2LayerName] = std::make_shared(std::vector{config.n_embd}); @@ -170,15 +170,15 @@ std::vector> Block::Forward(const std::vector> &x) { // (bs, seq_len, n_embd) -> Layernorm -> (bs, seq_len, n_embd) -> 
attention -> (bs, seq_len, n_embd) // -> Add -> (bs, seq_len, n_embd) - auto x1 = x[0] + modules_[kAttnLayerName]->Forward(modules_[kLn1LayerName]->Forward(x))[0]; + auto x1 = x[0] + (*modules_[kAttnLayerName])((*modules_[kLn1LayerName])(x))[0]; // (bs, seq_len, n_embd) -> Layernorm -> (bs, seq_len, n_embd) -> MLP -> (bs, seq_len, n_embd) // -> Add -> (bs, seq_len, n_embd) - auto x2 = x1 + modules_[kMlpLayerName]->Forward(modules_[kLn2LayerName]->Forward({x1}))[0]; + auto x2 = x1 + (*modules_[kMlpLayerName])((*modules_[kLn2LayerName])({x1}))[0]; // (bs, seq_len, n_embd) return {x2}; } -GPT2FirstStage::GPT2FirstStage(const GPT2Config &config) : config_(config) { +GPT2FirstStage::GPT2FirstStage(const GPT2Config &config) : CloneableModule(kType), config_(config) { modules_[kWTELayerName] = std::make_shared( config_.vocab_size, config_.n_embd, nn::parallel::global::GetSequenceParallelEnabled()); modules_[kWPELayerName] = std::make_shared(config_.block_size, config_.n_embd); @@ -207,15 +207,15 @@ GPT2FirstStage::Forward(const std::vector> auto pos = nn::init::Arange(start, start + t_local, infini_train::DataType::kINT64, device); // (B, T) -> Embedding(V_local, C) -> (B, T, C) - auto tok_emb = modules_[kWTELayerName]->Forward({x1})[0]; + auto tok_emb = (*modules_[kWTELayerName])({x1})[0]; // (T) -> Embedding(T_max, C) -> (T, C) - auto pos_emb = modules_[kWPELayerName]->Forward({pos})[0]; + auto pos_emb = (*modules_[kWPELayerName])({pos})[0]; // (B, T, C) return {tok_emb + pos_emb}; } -GPT2Chunk::GPT2Chunk(const GPT2Config &config, int start_layer, int end_layer) : config_(config) { +GPT2Chunk::GPT2Chunk(const GPT2Config &config, int start_layer, int end_layer) : CloneableModule(kType), config_(config) { std::vector> h; for (int64_t i = start_layer; i < end_layer; ++i) { auto layer = std::make_shared(config); @@ -228,11 +228,11 @@ std::vector> GPT2Chunk::Forward(const std::vector> &x) { auto x1 = x[0]; // (bs, seq_len, n_embd) -> transformer -> (bs, seq_len, n_embd) - for (auto &h : *std::dynamic_pointer_cast(modules_[kHLayerName])) { x1 = h->Forward({x1})[0]; } + for (auto &h : *std::dynamic_pointer_cast(modules_[kHLayerName])) { x1 = (*h)({x1})[0]; } return {x1}; } -GPT2LastStage::GPT2LastStage(const GPT2Config &config) : config_(config) { +GPT2LastStage::GPT2LastStage(const GPT2Config &config) : CloneableModule(kType), config_(config) { modules_[kLnFLayerName] = std::make_shared(std::vector{config_.n_embd}); // don't init this one, we will tie weights modules_[kLMHeadLayerName] = std::make_shared( @@ -248,15 +248,15 @@ GPT2LastStage::GPT2LastStage(const GPT2Config &config) : config_(config) { std::vector> GPT2LastStage::Forward(const std::vector> &x) { // (B, T, C) -> Layernorm -> (B, T, C) - auto x1 = modules_[kLnFLayerName]->Forward(x); + auto x1 = (*modules_[kLnFLayerName])(x); // TODO(dcj): add inference-time mini-optimization // (B, T, C) -> Linear(C, V) -> (B, T, V) - return modules_[kLMHeadLayerName]->Forward(x1); + return (*modules_[kLMHeadLayerName])(x1); } GPT2::GPT2(const GPT2Config &config) - : config_(config), stage_info_(nn::parallel::PipelineParallel::GetStageInfo( + : CloneableModule(kType), config_(config), stage_info_(nn::parallel::PipelineParallel::GetStageInfo( config_.n_layer, nn::parallel::global::GetPipelineParallelSize(), nn::parallel::pp_rank, nn::parallel::global::GetVirtualPipelineParallelSize())) { auto tp_world_size = nn::parallel::global::GetTensorParallelSize(); @@ -316,11 +316,11 @@ GPT2::GPT2(const GPT2Config &config) std::vector> GPT2::Forward(const 
std::vector> &x) { - auto x1 = modules_[kPPFirstStageName]->Forward(x); + auto x1 = (*modules_[kPPFirstStageName])(x); for (int chunk_idx = 0; chunk_idx < stage_info_.layer_ranges_per_chunk.size(); ++chunk_idx) { - x1 = modules_[kPPChunkNamePrefix + std::to_string(chunk_idx)]->Forward(x1); + x1 = (*modules_[kPPChunkNamePrefix + std::to_string(chunk_idx)])(x1); } - return modules_[kPPLastStageName]->Forward(x1); + return (*modules_[kPPLastStageName])(x1); } std::shared_ptr GPT2::FromPretrained(ModelType model_type) { diff --git a/example/gpt2/net.h b/example/gpt2/net.h index f52e4d4e..4faf5451 100644 --- a/example/gpt2/net.h +++ b/example/gpt2/net.h @@ -23,12 +23,16 @@ struct GPT2Config { class NewGELU : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "NewGELU"; + NewGELU() : CloneableModule(kType) {} + std::vector> Forward(const std::vector> &x) override; }; class CausalSelfAttention : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "CausalSelfAttention"; static constexpr char kCAttnLayerName[] = "c_attn"; static constexpr char kCProjLayerName[] = "c_proj"; @@ -49,6 +53,7 @@ class CausalSelfAttention : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "MLP"; static constexpr char kCFcLayerName[] = "c_fc"; static constexpr char kGeluLayerName[] = "gelu"; static constexpr char kCProjLayerName[] = "c_proj"; @@ -61,6 +66,7 @@ class MLP : public infini_train::nn::CloneableModule { class Block : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "Block"; static constexpr char kLn1LayerName[] = "ln_1"; static constexpr char kAttnLayerName[] = "attn"; static constexpr char kLn2LayerName[] = "ln_2"; @@ -74,6 +80,7 @@ class Block : public infini_train::nn::CloneableModule { class GPT2FirstStage : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "GPT2FirstStage"; static constexpr char kWTELayerName[] = "wte"; static constexpr char kWPELayerName[] = "wpe"; @@ -88,6 +95,7 @@ class GPT2FirstStage : public infini_train::nn::CloneableModule class GPT2Chunk : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "GPT2Chunk"; static constexpr char kHLayerName[] = "h"; GPT2Chunk(const GPT2Config &config, int start_layer, int end_layer); @@ -101,6 +109,7 @@ class GPT2Chunk : public infini_train::nn::CloneableModule { class GPT2LastStage : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "GPT2LastStage"; static constexpr char kLnFLayerName[] = "ln_f"; static constexpr char kLMHeadLayerName[] = "lm_head"; @@ -115,6 +124,7 @@ class GPT2LastStage : public infini_train::nn::CloneableModule { class GPT2 : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "GPT2"; static constexpr char kTransformerLayerName[] = "transformer"; enum class ModelType : int8_t { diff --git a/example/llama3/main.cc b/example/llama3/main.cc index fdea2162..c6db113e 100644 --- a/example/llama3/main.cc +++ b/example/llama3/main.cc @@ -66,6 +66,9 @@ DEFINE_uint32(pipeline_parallel, 1, "Pipeline Parallel world size, specified the DEFINE_uint32(virtual_pipeline_parallel, 1, "Number of chunks in PP stage."); // precision DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)"); +// precision check +DEFINE_int32(precision_check, 0, "precision check level: 0=off, 1=module, 2=function"); +DEFINE_bool(precision_check_all_ranks, false, "enable 
precision check for all ranks (default: rank 0 only)"); using namespace infini_train; @@ -273,9 +276,9 @@ void Train(const nn::parallel::Rank &rank) { LOG(INFO) << "Rank " << rank.GlobalRank() << ": start forward"; // (bs, seq_len, vocab_size) - auto logits = model->Forward({x, y})[0]; + auto logits = (*model)({x, y})[0]; LOG(INFO) << "Rank " << rank.GlobalRank() << ": finish model forward, start loss forward"; - auto loss = loss_fn->Forward({logits, y})[0]; + auto loss = (*loss_fn)({logits, y})[0]; // FIXME(jym): verify gradient accumulation precision loss = loss / grad_accum_steps; @@ -340,7 +343,8 @@ int main(int argc, char *argv[]) { google::InitGoogleLogging(argv[0]); nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel, - FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel); + FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel, FLAGS_precision_check, + FLAGS_precision_check_all_ranks); LOG(INFO) << nn::parallel::global::ProcessGroupOverview(); diff --git a/example/llama3/net.cc b/example/llama3/net.cc index a70a811a..12bcf0ed 100644 --- a/example/llama3/net.cc +++ b/example/llama3/net.cc @@ -127,7 +127,7 @@ std::vector> SwiGLU::Forward(const std::vector(std::vector{dim}, DataType::kFLOAT32, device)->RequiresGrad(); nn::init::Ones(parameters_[kParamWeightName]); @@ -140,7 +140,7 @@ std::vector> RMSNorm::Forward(const std::vector> CausalSelfAttention::Forward(const std::vec CHECK(freqs_cis != nullptr) << "freqs_cis is null."; // (B, T, C) -> (B, T, (H + 2 * n_kv_head) * D) - auto qkv = modules_[kCAttnLayerName]->Forward({x[0]})[0]; + auto qkv = (*modules_[kCAttnLayerName])({x[0]})[0]; // NOTE(zbl): Acquire full T after AllGather is performed in ColumnParallelLinear const auto T = qkv->Dims()[1]; // NOTE(zbl): torch script uses torch.split({...}, dim) to split tensors into sub-tensors in different sizes @@ -240,12 +240,12 @@ std::vector> CausalSelfAttention::Forward(const std::vec y = y->Transpose(1, 2)->Contiguous()->View({B, T, C_local}); // output projection // (B, T, C_local) -> RowParallelLinear(C, C) -> (B, T, C) - y = modules_[kCProjLayerName]->Forward({y})[0]; + y = (*modules_[kCProjLayerName])({y})[0]; // (B, H, C) == (bs, seq_len, n_embd) return {y}; } -MLP::MLP(const LLaMA3Config &config) { +MLP::MLP(const LLaMA3Config &config) : CloneableModule(kType) { hidden_dim_ = 4 * config.n_embd; hidden_dim_ = int(2 * hidden_dim_ / 3); // use custom dim factor multiplier @@ -286,20 +286,20 @@ MLP::MLP(const LLaMA3Config &config) { std::vector> MLP::Forward(const std::vector> &x) { // (bs, seq_len, n_embd) -> Linear(n_embd, hidden_dim) -> (bs, seq_len, hidden_dim) - auto x1 = modules_[kCFcLayerName]->Forward(x)[0]; + auto x1 = (*modules_[kCFcLayerName])(x)[0]; // (bs, seq_len, n_embd) -> Linear(n_embd, hidden_dim) -> (bs, seq_len, hidden_dim) - auto x2 = modules_[kCFc2LayerName]->Forward(x)[0]; + auto x2 = (*modules_[kCFc2LayerName])(x)[0]; // (bs, seq_len, hidden_dim) -> SwiGLU -> (bs, seq_len, hidden_dim) - x2 = modules_[kSiluLayerName]->Forward({x2})[0]; + x2 = (*modules_[kSiluLayerName])({x2})[0]; // (bs, seq_len, hidden_dim) auto x3 = x1 * x2; // (bs, seq_len, hidden_dim) -> Linear(hidden_dim, n_embd) -> (bs, seq_len, n_embd) - auto x4 = modules_[kCProjLayerName]->Forward({x3}); + auto x4 = (*modules_[kCProjLayerName])({x3}); // (bs, seq_len, n_embd) return x4; } -Block::Block(const LLaMA3Config &config) { +Block::Block(const LLaMA3Config &config) : CloneableModule(kType) { modules_[kLn1LayerName] = 
std::make_shared(config.n_embd, config.norm_eps); modules_[kAttnLayerName] = std::make_shared(config); modules_[kLn2LayerName] = std::make_shared(config.n_embd, config.norm_eps); @@ -314,27 +314,27 @@ std::vector> Block::Forward(const std::vector RMSNorm -> (bs, seq_len, n_embd) -> attention -> (bs, seq_len, n_embd) // -> Add -> (bs, seq_len, n_embd) auto x1 = x[0] - + modules_[kAttnLayerName]->Forward(std::vector>{ - modules_[kLn1LayerName]->Forward({x[0]})[0], freqs_cis, start_pos, mask})[0]; + + (*modules_[kAttnLayerName])(std::vector>{ + (*modules_[kLn1LayerName])({x[0]})[0], freqs_cis, start_pos, mask})[0]; // (bs, seq_len, n_embd) -> RMSNorm -> (bs, seq_len, n_embd) -> MLP -> (bs, seq_len, n_embd) // -> Add -> (bs, seq_len, n_embd) auto x2 = x1 - + modules_[kMlpLayerName]->Forward( - std::vector>(modules_[kLn2LayerName]->Forward({x1})))[0]; + + (*modules_[kMlpLayerName])( + std::vector>((*modules_[kLn2LayerName])({x1})))[0]; // (bs, seq_len, n_embd) return {x2}; } -LLaMA3FirstStage::LLaMA3FirstStage(const LLaMA3Config &config) : config_(config) { +LLaMA3FirstStage::LLaMA3FirstStage(const LLaMA3Config &config) : CloneableModule(kType), config_(config) { modules_[LLaMA3FirstStage::kWTELayerName] = std::make_shared( config.vocab_size, config.n_embd, nn::parallel::global::GetSequenceParallelEnabled()); } std::vector> LLaMA3FirstStage::Forward(const std::vector> &x) { - return modules_[LLaMA3FirstStage::kWTELayerName]->Forward(x); + return (*modules_[LLaMA3FirstStage::kWTELayerName])(x); } -LLaMA3Chunk::LLaMA3Chunk(const LLaMA3Config &config, int start_layer, int end_layer) : config_(config) { +LLaMA3Chunk::LLaMA3Chunk(const LLaMA3Config &config, int start_layer, int end_layer) : CloneableModule(kType), config_(config) { std::vector> h; for (int64_t i = start_layer; i < end_layer; ++i) { auto layer = std::make_shared(config); @@ -368,12 +368,12 @@ std::vector> LLaMA3Chunk::Forward(const std::vector transformer -> (bs, seq_len, n_embd) for (auto &h : *std::dynamic_pointer_cast(modules_[LLaMA3Chunk::kHLayerName])) { - x1 = h->Forward({x1, freqs_view, start_pos_ptr, mask})[0]; + x1 = (*h)({x1, freqs_view, start_pos_ptr, mask})[0]; } return {x1}; } -LLaMA3LastStage::LLaMA3LastStage(const LLaMA3Config &config) : config_(config) { +LLaMA3LastStage::LLaMA3LastStage(const LLaMA3Config &config) : CloneableModule(kType), config_(config) { modules_[kLnFLayerName] = std::make_shared(config.n_embd, config.norm_eps); // NOTE(zbl): weight-tying is possible but torch script did not do so modules_[kLMHeadLayerName] = std::make_shared( @@ -388,15 +388,15 @@ LLaMA3LastStage::LLaMA3LastStage(const LLaMA3Config &config) : config_(config) { std::vector> LLaMA3LastStage::Forward(const std::vector> &x) { // (bs, seq_len, n_embd) -> RMSNorm -> (bs, seq_len, n_embd) - auto x1 = modules_[kLnFLayerName]->Forward(x); + auto x1 = (*modules_[kLnFLayerName])(x); // TODO(zbl): add inference-time mini-optimization // (bs, seq_len, n_embd) -> Linear(n_embd, vocab_size) -> (bs, seq_len, vocab_size) - return modules_[kLMHeadLayerName]->Forward(x1); + return (*modules_[kLMHeadLayerName])(x1); } LLaMA3::LLaMA3(const LLaMA3Config &config) - : config_(config), stage_info_(nn::parallel::PipelineParallel::GetStageInfo( + : CloneableModule(kType), config_(config), stage_info_(nn::parallel::PipelineParallel::GetStageInfo( config_.n_layer, nn::parallel::global::GetPipelineParallelSize(), nn::parallel::pp_rank, nn::parallel::global::GetVirtualPipelineParallelSize())) { std::unordered_map> transformer; @@ -439,11 +439,11 @@ 
LLaMA3::LLaMA3(const LLaMA3Config &config) } std::vector> LLaMA3::Forward(const std::vector> &x) { - auto x1 = modules_[kPPFirstStageName]->Forward({x[0]}); + auto x1 = (*modules_[kPPFirstStageName])({x[0]}); for (int chunk_idx = 0; chunk_idx < stage_info_.layer_ranges_per_chunk.size(); ++chunk_idx) { - x1 = modules_[kPPChunkNamePrefix + std::to_string(chunk_idx)]->Forward(x1); + x1 = (*modules_[kPPChunkNamePrefix + std::to_string(chunk_idx)])(x1); } - return modules_[kPPLastStageName]->Forward(x1); + return (*modules_[kPPLastStageName])(x1); } std::shared_ptr LLaMA3::FromPretrained(ModelType model_type) { diff --git a/example/llama3/net.h b/example/llama3/net.h index 9bd7f9da..034aa9e8 100644 --- a/example/llama3/net.h +++ b/example/llama3/net.h @@ -42,6 +42,9 @@ struct LLaMA3Config { class SwiGLU : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "SwiGLU"; + SwiGLU() : CloneableModule(kType) {} + std::vector> Forward(const std::vector> &x) override; }; @@ -49,6 +52,7 @@ class SwiGLU : public infini_train::nn::CloneableModule { // TODO(zbl): implement fused kernel class RMSNorm : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "RMSNorm"; static constexpr char kParamWeightName[] = "weight"; explicit RMSNorm(int64_t dim, float eps = 1e-6f, @@ -63,6 +67,7 @@ class RMSNorm : public infini_train::nn::CloneableModule { class CausalSelfAttention : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "CausalSelfAttention"; static constexpr char kCAttnLayerName[] = "c_attn"; static constexpr char kCProjLayerName[] = "c_proj"; @@ -82,6 +87,7 @@ class CausalSelfAttention : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "MLP"; static constexpr char kCFcLayerName[] = "c_fc"; static constexpr char kCFc2LayerName[] = "c_fc2"; static constexpr char kSiluLayerName[] = "silu"; @@ -98,6 +104,7 @@ class MLP : public infini_train::nn::CloneableModule { class Block : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "Block"; static constexpr char kLn1LayerName[] = "ln_1"; static constexpr char kAttnLayerName[] = "attn"; static constexpr char kLn2LayerName[] = "ln_2"; @@ -111,6 +118,7 @@ class Block : public infini_train::nn::CloneableModule { class LLaMA3FirstStage : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "LLaMA3FirstStage"; static constexpr char kWTELayerName[] = "wte"; explicit LLaMA3FirstStage(const LLaMA3Config &config); @@ -124,6 +132,7 @@ class LLaMA3FirstStage : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "LLaMA3Chunk"; static constexpr char kHLayerName[] = "h"; static constexpr char kFreqsCisName[] = "freqs_cis"; @@ -138,6 +147,7 @@ class LLaMA3Chunk : public infini_train::nn::CloneableModule { class LLaMA3LastStage : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "LLaMA3LastStage"; static constexpr char kLnFLayerName[] = "ln_f"; static constexpr char kLMHeadLayerName[] = "lm_head"; @@ -152,6 +162,7 @@ class LLaMA3LastStage : public infini_train::nn::CloneableModule { public: + static constexpr char kType[] = "LLaMA3"; static constexpr char kTransformerLayerName[] = "transformer"; enum class ModelType : int8_t { diff --git a/example/mnist/net.cc b/example/mnist/net.cc index 4ac83613..501fee7e 100644 --- a/example/mnist/net.cc +++ b/example/mnist/net.cc @@ -25,7 +25,7 @@ MNIST::MNIST() { std::vector> 
MNIST::Forward(const std::vector> &x) { CHECK_EQ(x.size(), 1); - auto x1 = modules_["sequential"]->Forward(x); - auto x2 = modules_["linear2"]->Forward(x1); + auto x1 = (*modules_["sequential"])(x); + auto x2 = (*modules_["linear2"])(x1); return x2; } diff --git a/infini_train/include/autograd/accumulate.h b/infini_train/include/autograd/accumulate.h index f3519cb1..a8e41e67 100644 --- a/infini_train/include/autograd/accumulate.h +++ b/infini_train/include/autograd/accumulate.h @@ -18,6 +18,8 @@ class AccumulateGrad final : public Function { std::vector> Backward(const std::vector> &) override; + std::shared_ptr tensor() const { return tensor_; } + private: std::shared_ptr tensor_ = nullptr; float learning_rate_ = 1.0f; diff --git a/infini_train/include/autograd/function.h b/infini_train/include/autograd/function.h index bbc091d4..defbf907 100644 --- a/infini_train/include/autograd/function.h +++ b/infini_train/include/autograd/function.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -9,8 +10,17 @@ class Tensor; } namespace infini_train::autograd { +class HookHandle; + class Function : public std::enable_shared_from_this { public: + using FunctionForwardPreHook = std::function>&)>; + using FunctionForwardPostHook = std::function>&, + const std::vector>&)>; + using FunctionBackwardPreHook = std::function>&)>; + using FunctionBackwardPostHook = std::function>&, + const std::vector>&)>; + static constexpr char kUndefinedType[] = "Undefined"; Function() : type_(kUndefinedType) {} @@ -28,6 +38,13 @@ class Function : public std::enable_shared_from_this { void IncreaseDependenciesNumber(); + std::shared_ptr RegisterForwardPreHook(FunctionForwardPreHook hook); + std::shared_ptr RegisterForwardPostHook(FunctionForwardPostHook hook); + std::shared_ptr RegisterBackwardPreHook(FunctionBackwardPreHook hook); + std::shared_ptr RegisterBackwardPostHook(FunctionBackwardPostHook hook); + + const std::string& type() const { return type_; } + protected: std::vector> saved_tensors_; @@ -38,5 +55,10 @@ class Function : public std::enable_shared_from_this { int grad_outputs_reached_ = 0; std::vector> grad_outputs_; const std::string type_ = kUndefinedType; + std::vector forward_pre_hooks_; + std::vector forward_post_hooks_; + std::vector backward_pre_hooks_; + std::vector backward_post_hooks_; + bool precision_check_registered_ = false; }; } // namespace infini_train::autograd diff --git a/infini_train/include/autograd/function_hook.h b/infini_train/include/autograd/function_hook.h index 7d750e03..7d57926f 100644 --- a/infini_train/include/autograd/function_hook.h +++ b/infini_train/include/autograd/function_hook.h @@ -1,6 +1,8 @@ #pragma once +#include #include +#include #include "infini_train/include/nn/parallel/reduce_op_type.h" @@ -13,6 +15,14 @@ class ProcessGroup; } // namespace infini_train namespace infini_train::autograd { +class Function; + +class HookHandle { +public: + virtual ~HookHandle() = default; + virtual void Remove() = 0; +}; + class PostAccumulateGradHook { public: virtual void operator()(const std::shared_ptr &tensor) = 0; @@ -30,4 +40,22 @@ class AllReducePostAccumulateHook : public PostAccumulateGradHook { infini_train::nn::parallel::function::ReduceOpType reduce_op_; const infini_train::nn::parallel::ProcessGroup *pg_ = nullptr; }; + +template +class FunctionHookHandleImpl : public HookHandle { +public: + FunctionHookHandleImpl(std::vector* hooks, size_t id) : hooks_(hooks), id_(id) {} + + void Remove() override { + if (!removed_ && hooks_ && id_ < hooks_->size()) 
{ + (*hooks_)[id_] = nullptr; + removed_ = true; + } + } + +private: + std::vector* hooks_; + size_t id_; + bool removed_ = false; +}; } // namespace infini_train::autograd diff --git a/infini_train/include/autograd/tensor_hook.h b/infini_train/include/autograd/tensor_hook.h new file mode 100644 index 00000000..f7fcbe37 --- /dev/null +++ b/infini_train/include/autograd/tensor_hook.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include +#include + +namespace infini_train { +class Tensor; + +namespace autograd { + +// Hook handle for removing hooks +class HookHandle { +public: + virtual ~HookHandle() = default; + virtual void Remove() = 0; +}; + +// Tensor backward hook: modifies gradient during backward pass +// Returns modified gradient or nullptr to keep original +using TensorBackwardHook = std::function(const std::shared_ptr&)>; + +class TensorBackwardHookHandle : public HookHandle { +public: + TensorBackwardHookHandle(std::vector* hooks, size_t id) + : hooks_(hooks), id_(id) {} + + void Remove() override; + +private: + std::vector* hooks_; + size_t id_; + bool removed_ = false; +}; + +} // namespace autograd +} // namespace infini_train diff --git a/infini_train/include/nn/module_hook.h b/infini_train/include/nn/module_hook.h new file mode 100644 index 00000000..ea3b9219 --- /dev/null +++ b/infini_train/include/nn/module_hook.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include +#include + +namespace infini_train { +class Tensor; + +namespace nn { +class Module; + +// Forward pre-hook: called before forward pass +// Args: (module, input_tensors) +using ForwardPreHook = std::function>&)>; + +// Forward post-hook: called after forward pass +// Args: (module, input_tensors, output_tensors) +using ForwardPostHook = std::function>&, + const std::vector>&)>; + +// Backward pre-hook: called before backward pass +// Args: (module, grad_output) +using BackwardPreHook = std::function>&)>; + +// Backward post-hook: called after backward pass +// Args: (module, grad_input, grad_output) +using BackwardPostHook = std::function>&, + const std::vector>&)>; + +class ModuleHookHandle { +public: + virtual ~ModuleHookHandle() = default; + virtual void Remove() = 0; +}; + +template +class ModuleHookHandleImpl : public ModuleHookHandle { +public: + ModuleHookHandleImpl(std::vector* hooks, size_t id) : hooks_(hooks), id_(id) {} + + void Remove() override { + if (!removed_ && hooks_ && id_ < hooks_->size()) { + (*hooks_)[id_] = nullptr; + removed_ = true; + } + } + +private: + std::vector* hooks_; + size_t id_; + bool removed_ = false; +}; + +} // namespace nn +} // namespace infini_train diff --git a/infini_train/include/nn/modules/activations.h b/infini_train/include/nn/modules/activations.h index b7435cd9..e47be5a3 100644 --- a/infini_train/include/nn/modules/activations.h +++ b/infini_train/include/nn/modules/activations.h @@ -12,7 +12,8 @@ class Tensor; namespace infini_train::nn { class Sigmoid : public CloneableModule { public: - Sigmoid() = default; + static constexpr char kType[] = "Sigmoid"; + Sigmoid() : CloneableModule(kType) {} std::vector> Forward(const std::vector> &input_tensors) override; }; } // namespace infini_train::nn diff --git a/infini_train/include/nn/modules/container.h b/infini_train/include/nn/modules/container.h index 4da8b3e6..28bceccf 100644 --- a/infini_train/include/nn/modules/container.h +++ b/infini_train/include/nn/modules/container.h @@ -13,6 +13,7 @@ class Tensor; namespace infini_train::nn { class Sequential : public CloneableModule { public: + static constexpr char 
kType[] = "Sequential"; // TODO(dcj): Use better ctor signature later. explicit Sequential(std::vector> &&layers); @@ -21,6 +22,7 @@ class Sequential : public CloneableModule { class ModuleDict : public CloneableModule { public: + static constexpr char kType[] = "ModuleDict"; // TODO(dcj): in torch, there is a dict with the order of insertion explicit ModuleDict(std::unordered_map> modules); diff --git a/infini_train/include/nn/modules/loss.h b/infini_train/include/nn/modules/loss.h index 5b3ddf25..f0543f53 100644 --- a/infini_train/include/nn/modules/loss.h +++ b/infini_train/include/nn/modules/loss.h @@ -8,7 +8,8 @@ namespace infini_train::nn { class CrossEntropyLoss : public CloneableModule { public: - CrossEntropyLoss() = default; + static constexpr char kType[] = "CrossEntropyLoss"; + CrossEntropyLoss() : CloneableModule(kType) {} std::vector> Forward(const std::vector> &input_tensors) override; }; diff --git a/infini_train/include/nn/modules/module.h b/infini_train/include/nn/modules/module.h index 9bc78bcc..266684c5 100644 --- a/infini_train/include/nn/modules/module.h +++ b/infini_train/include/nn/modules/module.h @@ -7,6 +7,7 @@ #include #include "infini_train/include/datatype.h" +#include "infini_train/include/nn/module_hook.h" namespace infini_train { class Tensor; @@ -50,6 +51,10 @@ class Module : public std::enable_shared_from_this { std::unordered_map> StateDict() const; + // operator() calls hooks and Forward + std::vector> operator()(const std::vector> &input_tensors); + + // Forward to be overridden by subclasses virtual std::vector> Forward(const std::vector> &input_tensors); virtual float TrainStep(const std::vector> &input_tensors, @@ -66,6 +71,12 @@ class Module : public std::enable_shared_from_this { virtual std::shared_ptr ReplicateForDataParallel(int device_idx) const; + // Hook registration methods + std::shared_ptr RegisterForwardPreHook(ForwardPreHook hook); + std::shared_ptr RegisterForwardPostHook(ForwardPostHook hook); + std::shared_ptr RegisterBackwardPreHook(BackwardPreHook hook); + std::shared_ptr RegisterBackwardPostHook(BackwardPostHook hook); + protected: const Device *device_ = nullptr; const std::string type_ = kUndefinedType; @@ -73,6 +84,12 @@ class Module : public std::enable_shared_from_this { std::unordered_map> parameters_; std::unordered_map> buffers_; + std::vector forward_pre_hooks_; + std::vector forward_post_hooks_; + std::vector backward_pre_hooks_; + std::vector backward_post_hooks_; + bool precision_check_registered_ = false; + private: std::unordered_map> NamedModules(const std::string &prefix = "", bool remove_duplicate = true, diff --git a/infini_train/include/nn/modules/normalization.h b/infini_train/include/nn/modules/normalization.h index 4dcdf807..111e96b7 100644 --- a/infini_train/include/nn/modules/normalization.h +++ b/infini_train/include/nn/modules/normalization.h @@ -13,6 +13,7 @@ class Device; namespace infini_train::nn { class LayerNorm : public CloneableModule { public: + static constexpr char kType[] = "LayerNorm"; static constexpr char kParamWeightName[] = "weight"; static constexpr char kParamBiasName[] = "bias"; diff --git a/infini_train/include/nn/parallel/global.h b/infini_train/include/nn/parallel/global.h index 480c1286..bc178c19 100644 --- a/infini_train/include/nn/parallel/global.h +++ b/infini_train/include/nn/parallel/global.h @@ -27,7 +27,13 @@ class GlobalEnv { static GlobalEnv &Instance(); void Init(int threads_per_process, int tensor_parallel_size, bool sequence_parallel_enabled, - int 
pipeline_parallel_size, int virtual_pipeline_parallel_size); + int pipeline_parallel_size, int virtual_pipeline_parallel_size, int precision_check_level = 0, + bool precision_check_all_ranks = false); + + enum class PrecisionCheckLevel { NONE, FUNCTION, MODULE }; + void SetPrecisionCheckLevel(PrecisionCheckLevel level); + PrecisionCheckLevel GetPrecisionCheckLevel() const; + bool GetPrecisionCheckAllRanks() const; int nnodes() const; @@ -83,14 +89,17 @@ class GlobalEnv { bool initialized_ = false; Layout layout_; + PrecisionCheckLevel precision_check_level_ = PrecisionCheckLevel::NONE; + bool precision_check_all_ranks_ = false; }; inline void InitAllEnv(int nthread_per_process, int tensor_parallel_size, bool sequence_parallel_enabled, - int pipeline_parallel_size, int virtual_pipeline_parallel) { + int pipeline_parallel_size, int virtual_pipeline_parallel, int precision_check_level = 0, + bool precision_check_all_ranks = false) { GlobalEnv::Instance().Init(nthread_per_process, tensor_parallel_size, sequence_parallel_enabled, - pipeline_parallel_size, virtual_pipeline_parallel); + pipeline_parallel_size, virtual_pipeline_parallel, precision_check_level, + precision_check_all_ranks); } - inline int GetNnodes() { return GlobalEnv::Instance().nnodes(); } inline int GetWorldSize() { return GlobalEnv::Instance().world_size(); } inline int GetNprocPerNode() { return GlobalEnv::Instance().nproc_per_node(); } diff --git a/infini_train/include/nn/parallel/tensor_parallel.h b/infini_train/include/nn/parallel/tensor_parallel.h index a2aa61ea..3a6f6498 100644 --- a/infini_train/include/nn/parallel/tensor_parallel.h +++ b/infini_train/include/nn/parallel/tensor_parallel.h @@ -103,8 +103,9 @@ class VocabParallelCrossEntropy : public autograd::Function { class VocabParallelCrossEntropyLoss : public nn::CloneableModule { public: + static constexpr char kType[] = "VocabParallelCrossEntropyLoss"; VocabParallelCrossEntropyLoss(int64_t vocab_size_original = 0, float label_smoothing = 0.f) - : vocab_size_original_(vocab_size_original), label_smoothing_(label_smoothing){}; + : CloneableModule(kType), vocab_size_original_(vocab_size_original), label_smoothing_(label_smoothing){}; std::vector> Forward(const std::vector> &input_tensors) override; diff --git a/infini_train/include/utils/precision_checker.h b/infini_train/include/utils/precision_checker.h new file mode 100644 index 00000000..6c09202b --- /dev/null +++ b/infini_train/include/utils/precision_checker.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include + +namespace infini_train { +class Tensor; + +namespace autograd { +class Function; +class HookHandle; +} // namespace autograd + +namespace nn { +class Module; +} // namespace nn + +namespace utils { + +class PrecisionChecker { +public: + struct Config { + bool check_nan = true; + bool check_inf = true; + bool print_stats = true; + bool abort_on_error = false; + }; + + static const Config& DefaultConfig() { + static Config default_config; + return default_config; + } + + static void RegisterForFunction(autograd::Function* func, const std::string& name = "", + const Config& config = DefaultConfig()); + + // Register hooks for a Module (checks forward inputs/outputs) + static void RegisterForModule(nn::Module* module, const std::string& name = "", + const Config& config = DefaultConfig()); + +private: + static void CheckTensors(const std::string& stage, const std::string& name, + const std::vector>& tensors, + const Config& config); +}; + +} // namespace utils +} // namespace infini_train diff 
--git a/infini_train/src/autograd/function.cc b/infini_train/src/autograd/function.cc index 48ad02a9..de39302a 100644 --- a/infini_train/src/autograd/function.cc +++ b/infini_train/src/autograd/function.cc @@ -3,10 +3,13 @@ #include "glog/logging.h" #include "infini_train/include/autograd/accumulate.h" +#include "infini_train/include/autograd/function_hook.h" #include "infini_train/include/autograd/grad_mode.h" #include "infini_train/include/device.h" #include "infini_train/include/dispatcher.h" +#include "infini_train/include/nn/parallel/global.h" #include "infini_train/include/tensor.h" +#include "infini_train/include/utils/precision_checker.h" namespace infini_train::autograd { @@ -16,6 +19,22 @@ std::vector> Function::Apply(const std::vectorSetDevice(); + // Register precision check hooks if enabled (before forward) + if (!precision_check_registered_) { + auto precision_level = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckLevel(); + if (precision_level == nn::parallel::global::GlobalEnv::PrecisionCheckLevel::FUNCTION) { + utils::PrecisionChecker::RegisterForFunction(this, type_); + precision_check_registered_ = true; + } + } + + // Call forward pre-hooks + for (const auto& hook : forward_pre_hooks_) { + if (hook) { + hook(this, input_tensors); + } + } + std::vector> output_tensors; { autograd::NoGradGuard no_grad; @@ -24,6 +43,13 @@ std::vector> Function::Apply(const std::vector &grad_output, int g ++dependencies_reached_; if (grad_outputs_reached_ == grad_outputs_.size() && (dependencies_reached_ == dependencies_number_ || dependencies_number_ == 0)) { + + // Call backward pre-hooks + for (const auto& hook : backward_pre_hooks_) { + if (hook) { + hook(this, grad_outputs_); + } + } + std::vector> grad_inputs; { autograd::NoGradGuard no_grad; // no_grad in autograd.Function.Backward() grad_inputs = Backward(grad_outputs_); } + + // Call backward post-hooks + for (const auto& hook : backward_post_hooks_) { + if (hook) { + hook(this, grad_inputs, grad_outputs_); + } + } + saved_tensors_.clear(); grad_outputs_.clear(); grad_outputs_reached_ = 0; @@ -94,6 +136,23 @@ void Function::BackwardPartial(const std::shared_ptr &grad_output, int g auto &grad_input = grad_inputs[idx]; auto &[next_function, output_idx] = next_functions_[idx]; if (grad_input && next_function) { + // // Apply tensor backward hooks only for leaf tensors + // // Only AccumulateGrad corresponds to a leaf tensor that user can register hooks on + // auto accumulate_grad = std::dynamic_pointer_cast(next_function); + // if (accumulate_grad) { + // auto tensor = accumulate_grad->tensor(); + // if (tensor) { + // const auto& hooks = tensor->backward_post_hooks_(); + // for (const auto& hook : hooks) { + // if (hook) { + // auto modified_grad = hook(grad_input); + // if (modified_grad) { + // grad_input = modified_grad; + // } + // } + // } + // } + // } next_function->BackwardPartial(grad_input, output_idx); } } @@ -101,4 +160,24 @@ void Function::BackwardPartial(const std::shared_ptr &grad_output, int g } void Function::IncreaseDependenciesNumber() { ++dependencies_number_; } + +std::shared_ptr Function::RegisterForwardPreHook(FunctionForwardPreHook hook) { + forward_pre_hooks_.push_back(std::move(hook)); + return std::make_shared>(&forward_pre_hooks_, forward_pre_hooks_.size() - 1); +} + +std::shared_ptr Function::RegisterForwardPostHook(FunctionForwardPostHook hook) { + forward_post_hooks_.push_back(std::move(hook)); + return std::make_shared>(&forward_post_hooks_, forward_post_hooks_.size() - 1); +} + 
+std::shared_ptr Function::RegisterBackwardPreHook(FunctionBackwardPreHook hook) { + backward_pre_hooks_.push_back(std::move(hook)); + return std::make_shared>(&backward_pre_hooks_, backward_pre_hooks_.size() - 1); +} + +std::shared_ptr Function::RegisterBackwardPostHook(FunctionBackwardPostHook hook) { + backward_post_hooks_.push_back(std::move(hook)); + return std::make_shared>(&backward_post_hooks_, backward_post_hooks_.size() - 1); +} } // namespace infini_train::autograd diff --git a/infini_train/src/nn/modules/container.cc b/infini_train/src/nn/modules/container.cc index 6df46663..33707b63 100644 --- a/infini_train/src/nn/modules/container.cc +++ b/infini_train/src/nn/modules/container.cc @@ -7,7 +7,7 @@ #include "infini_train/include/tensor.h" namespace infini_train::nn { -Sequential::Sequential(std::vector> &&layers) { +Sequential::Sequential(std::vector> &&layers) : CloneableModule(kType) { int idx = 0; for (auto &layer : layers) { modules_[std::to_string(idx)] = std::move(layer); @@ -17,11 +17,11 @@ Sequential::Sequential(std::vector> &&layers) { std::vector> Sequential::Forward(const std::vector> &input_tensors) { auto &x = const_cast> &>(input_tensors); - for (int idx = 0; idx < modules_.size(); ++idx) { x = modules_[std::to_string(idx)]->Forward(x); } + for (int idx = 0; idx < modules_.size(); ++idx) { x = (*modules_[std::to_string(idx)])(x); } return x; } -ModuleDict::ModuleDict(std::unordered_map> modules) { +ModuleDict::ModuleDict(std::unordered_map> modules) : CloneableModule(kType) { for (auto &[name, layer] : modules) { modules_[name] = std::move(layer); } } diff --git a/infini_train/src/nn/modules/module.cc b/infini_train/src/nn/modules/module.cc index 4e0c6a28..b80a37ab 100644 --- a/infini_train/src/nn/modules/module.cc +++ b/infini_train/src/nn/modules/module.cc @@ -7,8 +7,11 @@ #include "glog/logging.h" +#include "infini_train/include/autograd/function.h" #include "infini_train/include/device.h" +#include "infini_train/include/nn/parallel/global.h" #include "infini_train/include/tensor.h" +#include "infini_train/include/utils/precision_checker.h" namespace infini_train::nn { @@ -125,6 +128,61 @@ std::vector> Module::Forward(const std::vector> Module::operator()(const std::vector> &input_tensors) { + // Register precision check hooks if enabled and not already registered + if (!precision_check_registered_) { + auto precision_level = parallel::global::GlobalEnv::Instance().GetPrecisionCheckLevel(); + if (precision_level == parallel::global::GlobalEnv::PrecisionCheckLevel::MODULE) { + utils::PrecisionChecker::RegisterForModule(this); + precision_check_registered_ = true; + } + } + + // Call forward pre-hooks + for (const auto& hook : forward_pre_hooks_) { + if (hook) { + hook(this, input_tensors); + } + } + + // Call actual Forward implementation + auto output_tensors = Forward(input_tensors); + + // Call forward post-hooks + for (const auto& hook : forward_post_hooks_) { + if (hook) { + hook(this, input_tensors, output_tensors); + } + } + + // Register backward hooks on output tensors' grad_fn + if (!backward_pre_hooks_.empty() || !backward_post_hooks_.empty()) { + for (const auto& output : output_tensors) { + if (output && output->grad_fn()) { + if (!backward_pre_hooks_.empty()) { + output->grad_fn()->RegisterBackwardPreHook( + [this](autograd::Function*, const std::vector>& grad_outputs) { + for (const auto& hook : backward_pre_hooks_) { + if (hook) hook(this, grad_outputs); + } + }); + } + if (!backward_post_hooks_.empty()) { + 
output->grad_fn()->RegisterBackwardPostHook( + [this](autograd::Function*, const std::vector>& grad_inputs, + const std::vector>& grad_outputs) { + for (const auto& hook : backward_post_hooks_) { + if (hook) hook(this, grad_inputs, grad_outputs); + } + }); + } + } + } + } + + return output_tensors; +} + void Module::To(const Device *device) { CHECK_NOTNULL(device); if (device == device_) { @@ -166,4 +224,24 @@ std::shared_ptr Module::ReplicateForDataParallel(int device_idx) const { // TODO(dcj): use device_idx later return std::make_shared(*this); } + +std::shared_ptr Module::RegisterForwardPreHook(ForwardPreHook hook) { + forward_pre_hooks_.push_back(std::move(hook)); + return std::make_shared>(&forward_pre_hooks_, forward_pre_hooks_.size() - 1); +} + +std::shared_ptr Module::RegisterForwardPostHook(ForwardPostHook hook) { + forward_post_hooks_.push_back(std::move(hook)); + return std::make_shared>(&forward_post_hooks_, forward_post_hooks_.size() - 1); +} + +std::shared_ptr Module::RegisterBackwardPreHook(BackwardPreHook hook) { + backward_pre_hooks_.push_back(std::move(hook)); + return std::make_shared>(&backward_pre_hooks_, backward_pre_hooks_.size() - 1); +} + +std::shared_ptr Module::RegisterBackwardPostHook(BackwardPostHook hook) { + backward_post_hooks_.push_back(std::move(hook)); + return std::make_shared>(&backward_post_hooks_, backward_post_hooks_.size() - 1); +} } // namespace infini_train::nn diff --git a/infini_train/src/nn/modules/normalization.cc b/infini_train/src/nn/modules/normalization.cc index 5479fc72..4c7c68f1 100644 --- a/infini_train/src/nn/modules/normalization.cc +++ b/infini_train/src/nn/modules/normalization.cc @@ -9,7 +9,8 @@ #include "infini_train/include/tensor.h" namespace infini_train::nn { -LayerNorm::LayerNorm(const std::vector &normalized_shape, float eps, const Device *device) : eps_(eps) { +LayerNorm::LayerNorm(const std::vector &normalized_shape, float eps, const Device *device) + : CloneableModule(kType), eps_(eps) { device_ = device ? 
device : DeviceManager::Instance()->GetDefaultDevice(); parameters_[kParamWeightName] diff --git a/infini_train/src/nn/parallel/data_parallel.cc b/infini_train/src/nn/parallel/data_parallel.cc index 0dec0c8f..1a64ab8a 100644 --- a/infini_train/src/nn/parallel/data_parallel.cc +++ b/infini_train/src/nn/parallel/data_parallel.cc @@ -31,7 +31,7 @@ ParallelApply(const std::vector> &modules, auto worker = [&](const std::shared_ptr &module, const std::vector> &inputs, const Device *device, int idx) { device->SetDevice(); - auto output = module->Forward(inputs); + auto output = (*module)(inputs); results[idx] = output; }; @@ -86,7 +86,7 @@ std::vector> DataParallel::Forward(const std::vectorForward(scattered_inputs[0]); + return (*module)(scattered_inputs[0]); } auto replicas = function::Replicate(module, devices_); diff --git a/infini_train/src/nn/parallel/distributed_data_parallel.cc b/infini_train/src/nn/parallel/distributed_data_parallel.cc index a25a7d16..aacbfa38 100644 --- a/infini_train/src/nn/parallel/distributed_data_parallel.cc +++ b/infini_train/src/nn/parallel/distributed_data_parallel.cc @@ -50,7 +50,7 @@ DistributedDataParallel::DistributedDataParallel(std::shared_ptr mod std::vector> DistributedDataParallel::Forward(const std::vector> &input_tensors) { - auto outputs = modules_[kModuleName]->Forward(input_tensors); + auto outputs = (*modules_[kModuleName])(input_tensors); if (reducer_) { reducer_->PrepareForBackward(); } diff --git a/infini_train/src/nn/parallel/global.cc b/infini_train/src/nn/parallel/global.cc index 39cd95dd..80a01f57 100644 --- a/infini_train/src/nn/parallel/global.cc +++ b/infini_train/src/nn/parallel/global.cc @@ -1,5 +1,6 @@ #include "infini_train/include/nn/parallel/global.h" +#include #include #include #include @@ -90,7 +91,8 @@ GlobalEnv &GlobalEnv::Instance() { } void GlobalEnv::Init(int nthread_per_process, int tensor_parallel_size, bool sequence_parallel_enabled, - int pipeline_parallel_size, int virtual_pipeline_parallel_size) { + int pipeline_parallel_size, int virtual_pipeline_parallel_size, int precision_check_level, + bool precision_check_all_ranks) { std::lock_guard lock(mutex_); CHECK(!initialized_) << "Repeated initialization of GlobalEnv!"; @@ -114,6 +116,16 @@ void GlobalEnv::Init(int nthread_per_process, int tensor_parallel_size, bool seq layout_.sizes[PP] = pipeline_parallel_size_; layout_.InitStrides(); + // Initialize precision check level from parameter + if (precision_check_level == 1) { + precision_check_level_ = PrecisionCheckLevel::MODULE; + } else if (precision_check_level == 2) { + precision_check_level_ = PrecisionCheckLevel::FUNCTION; + } else { + precision_check_level_ = PrecisionCheckLevel::NONE; + } + precision_check_all_ranks_ = precision_check_all_ranks; + initialized_ = true; } @@ -182,6 +194,19 @@ Layout GlobalEnv::layout() const { return layout_; } +void GlobalEnv::SetPrecisionCheckLevel(PrecisionCheckLevel level) { + precision_check_level_ = level; +} + +GlobalEnv::PrecisionCheckLevel GlobalEnv::GetPrecisionCheckLevel() const { + return precision_check_level_; +} + +bool GlobalEnv::GetPrecisionCheckAllRanks() const { + CHECK(initialized_) << "GlobalEnv is not initialized!"; + return precision_check_all_ranks_; +} + namespace { inline const char *AxisName(Axis a) { return a == DP ? "DP" : (a == TP ? 
"TP" : "PP"); } diff --git a/infini_train/src/nn/parallel/pp/pipeline_schedule.cc b/infini_train/src/nn/parallel/pp/pipeline_schedule.cc index 95dd3bbc..5eb32488 100644 --- a/infini_train/src/nn/parallel/pp/pipeline_schedule.cc +++ b/infini_train/src/nn/parallel/pp/pipeline_schedule.cc @@ -244,7 +244,7 @@ float PipelineSchedule::StepMicroBatches(const std::vectordevice()->Type(), dtype); auto target_on_device = target->To(activations[task.local_chunk_idx][mb][0]->GetDevice()); - loss = loss_fn->Forward( + loss = (*loss_fn)( {activations[task.local_chunk_idx][mb][0], std::make_shared(target_on_device)})[0]; loss = loss / n; } diff --git a/infini_train/src/nn/parallel/pp/pipeline_stage.cc b/infini_train/src/nn/parallel/pp/pipeline_stage.cc index 582b9bd2..968bb0a6 100644 --- a/infini_train/src/nn/parallel/pp/pipeline_stage.cc +++ b/infini_train/src/nn/parallel/pp/pipeline_stage.cc @@ -25,7 +25,7 @@ std::vector> PipelineStage::ForwardOneChunk(const std::v LOG(FATAL) << "PipelineStage::ForwardOneChunk: local_chunk_idx=" << local_chunk_idx << " out of range [0, " << chunks_.size() << ")"; } - return chunks_[local_chunk_idx]->Forward(inputs); + return (*chunks_[local_chunk_idx])(inputs); } bool PipelineStage::IsFirstStage() const { return stage_index_ == 0; } diff --git a/infini_train/src/nn/parallel/tensor_parallel.cc b/infini_train/src/nn/parallel/tensor_parallel.cc index b91028df..129e2f4b 100644 --- a/infini_train/src/nn/parallel/tensor_parallel.cc +++ b/infini_train/src/nn/parallel/tensor_parallel.cc @@ -269,8 +269,8 @@ std::vector> GatherFromSPRegionFunc(const std::shared_pt ColumnParallelLinear::ColumnParallelLinear(int64_t in_features, int64_t out_features, bool bias, bool gather_output, bool input_is_parallel, bool skip_bias_add, bool sequence_parallel) - : bias_(bias), gather_output_(gather_output), input_is_parallel_(input_is_parallel), skip_bias_add_(skip_bias_add), - sequence_parallel_(sequence_parallel) { + : CloneableModule(kType), bias_(bias), gather_output_(gather_output), input_is_parallel_(input_is_parallel), + skip_bias_add_(skip_bias_add), sequence_parallel_(sequence_parallel) { auto tp_size = global::GetTensorParallelSize(); CHECK_GT(tp_size, 0) << "No available devices found"; CHECK_EQ(out_features % tp_size, 0) << "out_features must be divisible by TP world size for ColumnParallel"; @@ -315,8 +315,8 @@ ColumnParallelLinear::Forward(const std::vector> &input_ RowParallelLinear::RowParallelLinear(int64_t in_features, int64_t out_features, bool bias, bool reduce_output, bool input_is_parallel, bool skip_bias_add, bool sequence_parallel) - : bias_(bias), reduce_output_(reduce_output), input_is_parallel_(input_is_parallel), skip_bias_add_(skip_bias_add), - sequence_parallel_(sequence_parallel) { + : CloneableModule(kType), bias_(bias), reduce_output_(reduce_output), input_is_parallel_(input_is_parallel), + skip_bias_add_(skip_bias_add), sequence_parallel_(sequence_parallel) { auto tp_size = global::GetTensorParallelSize(); CHECK_GT(tp_size, 0) << "No available devices found"; CHECK_EQ(in_features % tp_size, 0) << "in_features must be divisible by TP world size for RowParallel"; @@ -362,7 +362,7 @@ RowParallelLinear::Forward(const std::vector> &input_ten VocabParallelEmbedding::VocabParallelEmbedding(int64_t num_embeddings, int64_t embedding_dim, bool reduce_scatter_embeddings) - : vocab_size_global_(num_embeddings), embedding_dim_(embedding_dim), + : CloneableModule(kType), vocab_size_global_(num_embeddings), embedding_dim_(embedding_dim), 
reduce_scatter_embeddings_(reduce_scatter_embeddings) { auto tp_size = global::GetTensorParallelSize(); CHECK_GT(tp_size, 0) << "No available devices found for VocabParallelEmbedding"; diff --git a/infini_train/src/utils/precision_checker.cc b/infini_train/src/utils/precision_checker.cc new file mode 100644 index 00000000..0f040df9 --- /dev/null +++ b/infini_train/src/utils/precision_checker.cc @@ -0,0 +1,184 @@ +#include "infini_train/include/utils/precision_checker.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "infini_train/include/autograd/function.h" +#include "infini_train/include/nn/modules/module.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/tensor.h" + +namespace infini_train::utils { + +namespace { +std::ofstream& GetLogStream() { + static std::ofstream log_file; + static std::mutex init_mutex; + static bool initialized = false; + + if (!initialized) { + std::lock_guard lock(init_mutex); + if (!initialized) { + int rank = nn::parallel::global::GlobalEnv::Instance().global_proc_rank(); + std::string filename = "precision_check_rank_" + std::to_string(rank) + ".log"; + log_file.open(filename, std::ios::out | std::ios::trunc); + initialized = true; + } + } + return log_file; +} + +bool ShouldPrint() { + if (nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckAllRanks()) { + return true; + } + return nn::parallel::global::GlobalEnv::Instance().global_proc_rank() == 0; +} + +std::string GetTimestamp() { + auto now = std::chrono::system_clock::now(); + auto time_t = std::chrono::system_clock::to_time_t(now); + auto ms = std::chrono::duration_cast(now.time_since_epoch()) % 1000; + + std::tm tm; + localtime_r(&time_t, &tm); + + std::ostringstream oss; + oss << std::setfill('0') + << std::setw(2) << (tm.tm_mon + 1) + << std::setw(2) << tm.tm_mday << ' ' + << std::setw(2) << tm.tm_hour << ':' + << std::setw(2) << tm.tm_min << ':' + << std::setw(2) << tm.tm_sec << '.' + << std::setw(3) << ms.count(); + return oss.str(); +} +} // namespace + +void PrecisionChecker::CheckTensors(const std::string& stage, const std::string& name, + const std::vector>& tensors, + const Config& config) { + if (!ShouldPrint()) { + return; + } + + int rank = nn::parallel::global::GlobalEnv::Instance().global_proc_rank(); + + for (size_t i = 0; i < tensors.size(); ++i) { + if (!tensors[i]) continue; + + auto& tensor = tensors[i]; + + // Copy tensor to CPU if it's on GPU + std::shared_ptr cpu_tensor; + if (tensor->GetDevice()->Type() == DeviceType::kCUDA) { + auto cpu_device = DeviceManager::Instance()->GetDevice(DeviceType::kCPU); + cpu_tensor = std::make_shared(tensor->To(cpu_device)); + } else { + cpu_tensor = tensor; + } + + const float* data = static_cast(cpu_tensor->DataPtr()); + size_t size = cpu_tensor->NumElements(); + + bool has_nan = false; + bool has_inf = false; + + for (size_t j = 0; j < size; ++j) { + float val = data[j]; + if (std::isnan(val)) has_nan = true; + if (std::isinf(val)) has_inf = true; + } + + bool has_error = (config.check_nan && has_nan) || (config.check_inf && has_inf); + + if (has_error || config.print_stats) { + auto& log_stream = GetLogStream(); + std::string level = has_error ? 
"E" : "I"; + + log_stream << level << GetTimestamp() << " [Rank " << rank << "][PrecisionCheck] " + << stage << " " << name << " tensor[" << i << "]: ["; + + if (has_nan) log_stream << " NaN detected!"; + if (has_inf) log_stream << " Inf detected!"; + + if (config.print_stats) { + constexpr size_t max_print = 10; + for (size_t j = 0; j < std::min(size, max_print); ++j) { + if (j > 0) log_stream << ", "; + log_stream << data[j]; + } + if (size > max_print) log_stream << ", ..."; + } + log_stream << "]" << std::endl; + } + + if (has_error && config.abort_on_error) { + std::cerr << "Precision check failed, aborting!" << std::endl; + std::abort(); + } + } +} + +void PrecisionChecker::RegisterForFunction(autograd::Function* func, const std::string& name, + const Config& config) { + std::string func_name = name.empty() ? "Function" : name; + + func->RegisterForwardPreHook([func_name, config](autograd::Function*, + const std::vector>& inputs) { + CheckTensors("Forward Input", func_name, inputs, config); + }); + + func->RegisterForwardPostHook([func_name, config](autograd::Function*, + const std::vector>&, + const std::vector>& outputs) { + CheckTensors("Forward Output", func_name, outputs, config); + }); + + func->RegisterBackwardPreHook([func_name, config](autograd::Function*, + const std::vector>& grad_outputs) { + CheckTensors("Backward Input", func_name, grad_outputs, config); + }); + + func->RegisterBackwardPostHook([func_name, config](autograd::Function*, + const std::vector>& grad_inputs, + const std::vector>&) { + CheckTensors("Backward Output", func_name, grad_inputs, config); + }); +} + +void PrecisionChecker::RegisterForModule(nn::Module* module, const std::string& name, + const Config& config) { + std::string module_name = name.empty() ? module->type() : name; + + // module->RegisterForwardPreHook([module_name, config](nn::Module*, + // const std::vector>& inputs) { + // CheckTensors("Module Forward Input", module_name, inputs, config); + // }); + + module->RegisterForwardPostHook([module_name, config](nn::Module*, + const std::vector>&, + const std::vector>& outputs) { + CheckTensors("Module Forward Output", module_name, outputs, config); + }); + + // module->RegisterBackwardPreHook([module_name, config](nn::Module*, + // const std::vector>& grad_outputs) { + // CheckTensors("Module Backward Input", module_name, grad_outputs, config); + // }); + + module->RegisterBackwardPostHook([module_name, config](nn::Module*, + const std::vector>& grad_inputs, + const std::vector>&) { + CheckTensors("Module Backward Output", module_name, grad_inputs, config); + }); +} + +} // namespace infini_train::utils From 8ca49b159bde599f02894d77b22878101ecbbd1e Mon Sep 17 00:00:00 2001 From: chen Date: Tue, 13 Jan 2026 10:20:31 +0000 Subject: [PATCH 2/6] refactor: enhance precision checker with context management and add comprehensive docs - Add PrecisionCheckConfig and PrecisionCheckContext for better state management - Refactor precision checker to use context-based architecture - Add comprehensive documentation (hook_mechanism.md, precision_checker_guide.md) - Add test cases for hook system and precision checking - Update CMakeLists.txt to include new test targets - Improve command-line flag handling in examples Co-Authored-By: Claude Sonnet 4.5 --- CMakeLists.txt | 6 + docs/hook_mechanism.md | 198 ++++++++ docs/precision_checker_guide.md | 309 ++++++++++++ example/gpt2/main.cc | 8 +- example/llama3/main.cc | 8 +- infini_train/include/autograd/function.h | 25 +- 
infini_train/include/autograd/tensor_hook.h | 9 +-
infini_train/include/nn/parallel/global.h | 20 +-
.../include/utils/precision_check_config.h | 45 ++
.../include/utils/precision_check_context.h | 50 ++
infini_train/src/autograd/function.cc | 16 +-
infini_train/src/nn/parallel/global.cc | 20 +-
infini_train/src/utils/precision_checker.cc | 438 +++++++++++++++---
test/hook/test_hook.cc | 189 ++++++++
test/hook/test_precision_check.cc | 90 ++++
15 files changed, 1296 insertions(+), 135 deletions(-)
create mode 100644 docs/hook_mechanism.md
create mode 100644 docs/precision_checker_guide.md
create mode 100644 infini_train/include/utils/precision_check_config.h
create mode 100644 infini_train/include/utils/precision_check_context.h
create mode 100644 test/hook/test_hook.cc
create mode 100644 test/hook/test_precision_check.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 06242344..9ff66ecf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,6 +88,12 @@ target_link_libraries(gpt2 infini_train)
 add_executable(llama3 example/llama3/main.cc example/common/tiny_shakespeare_dataset.cc example/common/utils.cc example/llama3/net.cc example/common/tokenizer.cc)
 target_link_libraries(llama3 infini_train)
 
+add_executable(test_hook test/hook/test_hook.cc)
+target_link_libraries(test_hook infini_train)
+
+add_executable(test_precision_check test/hook/test_precision_check.cc)
+target_link_libraries(test_precision_check infini_train)
+
 add_subdirectory(tools/infini_run)
 
 set_target_properties(infini_run PROPERTIES
diff --git a/docs/hook_mechanism.md b/docs/hook_mechanism.md
new file mode 100644
index 00000000..34aa9607
--- /dev/null
+++ b/docs/hook_mechanism.md
@@ -0,0 +1,198 @@
+# Hook Mechanism Design
+
+A hook mechanism modeled on PyTorch's, supporting hooks at both the Module level and the Function level.
+
+## 1. Module Hooks
+
+### 1.1 Forward Pre-Hook
+Called before the forward pass runs.
+
+```cpp
+auto handle = module->RegisterForwardPreHook(
+    [](Module* mod, const std::vector<std::shared_ptr<Tensor>>& inputs) {
+        // runs before forward
+    }
+);
+```
+
+### 1.2 Forward Post-Hook
+Called after the forward pass runs.
+
+```cpp
+auto handle = module->RegisterForwardPostHook(
+    [](Module* mod,
+       const std::vector<std::shared_ptr<Tensor>>& inputs,
+       const std::vector<std::shared_ptr<Tensor>>& outputs) {
+        // runs after forward
+    }
+);
+```
+
+### 1.3 Backward Pre-Hook
+Called before the backward pass runs.
+
+```cpp
+auto handle = module->RegisterBackwardPreHook(
+    [](Module* mod, const std::vector<std::shared_ptr<Tensor>>& grad_outputs) {
+        // runs before backward
+    }
+);
+```
+
+### 1.4 Backward Post-Hook
+Called after the backward pass runs.
+
+```cpp
+auto handle = module->RegisterBackwardPostHook(
+    [](Module* mod,
+       const std::vector<std::shared_ptr<Tensor>>& grad_inputs,
+       const std::vector<std::shared_ptr<Tensor>>& grad_outputs) {
+        // runs after backward
+    }
+);
+```
+
+### Use cases
+- Feature extraction and visualization
+- Activation monitoring
+- Gradient-flow analysis
+- Performance analysis and profiling
+
+### Implementation
+- `infini_train/include/nn/module_hook.h`
+- Module hooks are invoked from `Module::operator()` (forward_pre_hooks_ and forward_post_hooks_)
+- Subclasses only override `Forward()`; hooks run automatically
+
+### Usage notes
+- **Call convention**: use `(*module)(inputs)` rather than `module->Forward(inputs)`
+- **Subclassing**: only override `Forward()`; never invoke hooks by hand
+- **Automatic execution**: `operator()` runs the pre-hooks, then `Forward()`, then the post-hooks
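+
+Putting the calls above together, the snippet below is a minimal sketch rather than code from this repository: `MyBlock`, `input`, and the identity `Forward()` are illustrative, and only the hook API itself comes from `module_hook.h`.
+
+```cpp
+// Illustrative module; only Forward() is overridden, hooks fire automatically.
+class MyBlock : public nn::Module {
+public:
+    MyBlock() : Module("MyBlock") {}
+    std::vector<std::shared_ptr<Tensor>>
+    Forward(const std::vector<std::shared_ptr<Tensor>>& inputs) override {
+        return inputs; // identity, for illustration only
+    }
+};
+
+auto block = std::make_shared<MyBlock>();
+auto handle = block->RegisterForwardPostHook(
+    [](nn::Module* mod, const std::vector<std::shared_ptr<Tensor>>&,
+       const std::vector<std::shared_ptr<Tensor>>& outputs) {
+        LOG(INFO) << mod->type() << " produced " << outputs.size() << " outputs";
+    });
+
+auto input = std::make_shared<Tensor>(std::vector<int64_t>{2, 3}, DataType::kFLOAT32);
+std::vector<std::shared_ptr<Tensor>> inputs = {input};
+auto outputs = (*block)(inputs); // the post-hook fires here, not inside Forward()
+handle->Remove();                // later calls skip this hook
+```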
+## 2. Function Hooks
+
+Function hooks share a unified pair of type definitions:
+- `FunctionPreHook`: used for both the Forward Pre-Hook and the Backward Pre-Hook
+- `FunctionPostHook`: used for both the Forward Post-Hook and the Backward Post-Hook
+
+### 2.1 Function Forward Pre-Hook
+Called before the Function's forward runs.
+
+```cpp
+auto handle = function->RegisterForwardPreHook(
+    [](autograd::Function* func, const std::vector<std::shared_ptr<Tensor>>& inputs) {
+        // runs before forward
+    }
+);
+```
+
+### 2.2 Function Forward Post-Hook
+Called after the Function's forward runs.
+
+```cpp
+auto handle = function->RegisterForwardPostHook(
+    [](autograd::Function* func,
+       const std::vector<std::shared_ptr<Tensor>>& inputs,
+       const std::vector<std::shared_ptr<Tensor>>& outputs) {
+        // runs after forward
+    }
+);
+```
+
+### 2.3 Function Backward Pre-Hook
+Called before the Function's backward runs.
+
+```cpp
+auto handle = function->RegisterBackwardPreHook(
+    [](autograd::Function* func, const std::vector<std::shared_ptr<Tensor>>& grad_outputs) {
+        // runs before backward
+    }
+);
+```
+
+### 2.4 Function Backward Post-Hook
+Called after the Function's backward runs.
+
+```cpp
+auto handle = function->RegisterBackwardPostHook(
+    [](autograd::Function* func,
+       const std::vector<std::shared_ptr<Tensor>>& grad_inputs,
+       const std::vector<std::shared_ptr<Tensor>>& grad_outputs) {
+        // runs after backward
+    }
+);
+```
+
+### Use cases
+- Per-operator performance analysis
+- Monitoring intermediate results
+- Debugging the autograd graph
+- Gradient-flow analysis
+
+### Implementation
+- `infini_train/include/autograd/function_hook.h`
+- `infini_train/include/autograd/function.h`
+- Function forward hooks are invoked in `Function::Apply()`
+- Function backward hooks are invoked in `Function::BackwardPartial()`
+
+## 3. Unified hook types
+
+To reduce duplication, Function hooks use a single pair of type definitions:
+
+```cpp
+// defined in function.h
+using FunctionPreHook = std::function<void(Function*, const std::vector<std::shared_ptr<Tensor>>&)>;
+using FunctionPostHook = std::function<void(Function*, const std::vector<std::shared_ptr<Tensor>>&,
+                                            const std::vector<std::shared_ptr<Tensor>>&)>;
+```
+
+- `FunctionPreHook` is used for the Forward Pre-Hook and the Backward Pre-Hook (identical signatures)
+- `FunctionPostHook` is used for the Forward Post-Hook and the Backward Post-Hook (identical signatures)
+
+## 4. Hook handles and removal
+
+Every hook registration function returns a `std::shared_ptr<HookHandle>` that can be used to remove the hook:
+
+```cpp
+auto handle = function->RegisterForwardPreHook(...);
+
+// remove the hook
+handle->Remove();
+```
+
+A removed hook is set to `nullptr` and does not affect the execution of the other hooks.
+
+## 5. Call flow
+
+### Forward Pass
+```
+Module::operator()
+  ├─> Forward Pre-Hooks
+  ├─> Forward()
+  │     └─> Function::Apply()
+  │           ├─> Function Forward Pre-Hooks
+  │           ├─> Forward()
+  │           └─> Function Forward Post-Hooks
+  └─> Forward Post-Hooks
+```
+
+### Backward Pass
+```
+Function::BackwardPartial()
+  ├─> Backward Pre-Hooks
+  ├─> Backward()
+  └─> Backward Post-Hooks
+```
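+
+As one concrete use of this flow, the sketch below attaches a pre/post hook pair to time a single Function's forward pass. It is illustrative only: `AttachForwardTimer` is not part of this library, and it assumes `<chrono>`, glog, and `autograd/function.h` are included.
+
+```cpp
+// Sketch: time one Function's forward pass via its pre/post hooks.
+// Returns both handles so the caller can Remove() them when done.
+std::pair<std::shared_ptr<autograd::HookHandle>, std::shared_ptr<autograd::HookHandle>>
+AttachForwardTimer(const std::shared_ptr<autograd::Function>& fn) {
+    auto start = std::make_shared<std::chrono::steady_clock::time_point>();
+    auto pre = fn->RegisterForwardPreHook(
+        [start](autograd::Function*, const std::vector<std::shared_ptr<Tensor>>&) {
+            *start = std::chrono::steady_clock::now(); // stamp before Forward()
+        });
+    auto post = fn->RegisterForwardPostHook(
+        [start](autograd::Function* f, const std::vector<std::shared_ptr<Tensor>>&,
+                const std::vector<std::shared_ptr<Tensor>>&) {
+            auto us = std::chrono::duration_cast<std::chrono::microseconds>(
+                          std::chrono::steady_clock::now() - *start)
+                          .count();
+            LOG(INFO) << f->type() << " forward took " << us << " us";
+        });
+    return {pre, post};
+}
+```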
+
+## 6. Example code
+
+See:
+- `test/hook/test_hook.cc` - complete hook usage examples
+- `infini_train/include/autograd/function_hook.h` - hook API definitions
+
+## 7. Notes
+
+1. Hooks run in registration order
+2. A removed hook is set to nullptr and does not affect the other hooks
+3. **Calling a Module**: use `(*module)(inputs)` rather than `module->Forward(inputs)`
+4. **Module subclasses**: only override `Forward()`; hooks run automatically
+5. Function hooks are invoked automatically in `Function::Apply()` and `Function::BackwardPartial()`
diff --git a/docs/precision_checker_guide.md b/docs/precision_checker_guide.md
new file mode 100644
index 00000000..e33eef22
--- /dev/null
+++ b/docs/precision_checker_guide.md
@@ -0,0 +1,309 @@
+# Precision Checker User Guide
+
+A precision-checking tool for catching numerical-stability problems (NaN, Inf, ...) during training, with MD5 hash comparison and several output formats.
+
+## Features
+
+- **Automatic NaN/Inf detection**: anomalies are detected during the forward and backward passes
+- **Two check levels**: Module-level and Function-level precision checks
+- **Flexible configuration**: every option is set through a key=value string
+- **MD5 hashing**: tensors can be reported as MD5 digests for comparison
+- **Table format**: tabular output for easier inspection and comparison
+- **Baseline comparison**: a baseline file can be loaded and compared against automatically
+- **Context tracking**: GAS (gradient accumulation step) and layer-index tracking
+- **Performance aware**: MD5 is computed only when needed, avoiding redundant work
+
+## Configuration
+
+### Configuration struct
+
+```cpp
+struct PrecisionCheckConfig {
+    int level = 0;                  // 0 = off, 1 = MODULE level, 2 = FUNCTION level
+    std::string output_path = "";   // empty = console (rank 0 only), non-empty = file (all ranks)
+    bool output_md5 = false;        // print MD5 digests instead of tensor values
+    std::string format = "simple";  // "simple" or "table"
+    std::string baseline_path = ""; // baseline file path (for comparison)
+};
+```
+
+### Configuration string format
+
+Options are passed as `key=value,key=value`:
+
+```cpp
+auto config = utils::PrecisionCheckConfig::Parse("level=2,format=table,output_md5=true");
+nn::parallel::global::InitAllEnv(nthread, tp_size, sp_enabled, pp_size, vpp_size, config);
+```
+
+### Configuration options
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `level` | int | 0 | 0 = off, 1 = MODULE level, 2 = FUNCTION level |
+| `output_path` | string | "" | empty = console (rank 0 only), non-empty = file path (all ranks) |
+| `output_md5` | bool | false | true = print MD5 digests, false = print tensor values |
+| `format` | string | "simple" | "simple" = plain format, "table" = tabular format |
+| `baseline` | string | "" | baseline file path used for comparison |
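+
+For reference, this is roughly what `Parse` yields for a typical string (a sketch; only the keys listed above are recognized, unknown keys are silently ignored, and unset keys keep their defaults):
+
+```cpp
+auto cfg = utils::PrecisionCheckConfig::Parse(
+    "level=2,format=table,output_md5=true,baseline=./baseline/precision_check_rank_0.log");
+// cfg.level         == 2
+// cfg.format        == "table"
+// cfg.output_md5    == true
+// cfg.baseline_path == "./baseline/precision_check_rank_0.log"
+// cfg.output_path   == ""   (default: console output, rank 0 only)
+```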
+
+## Usage
+
+### 1. Basic usage (simple format)
+
+```cpp
+#include "infini_train/include/nn/parallel/global.h"
+#include "infini_train/include/utils/precision_check_config.h"
+
+// Enable Function-level checking, print tensor values
+auto config = utils::PrecisionCheckConfig::Parse("level=2");
+nn::parallel::global::InitAllEnv(1, 1, false, 1, 1, config);
+
+// Build and run the model
+auto x = std::make_shared<Tensor>(std::vector<int64_t>{2, 3}, DataType::kFLOAT32);
+x->Fill(2.0f);
+x->RequiresGrad();
+
+auto y = x->Mul(x);
+auto loss = y->Sum(0, false);
+loss->Backward();
+```
+
+Sample output:
+```
+I0113 06:44:10.575 [Rank 0][PrecisionCheck] Forward Input MulFunction tensor[0]: [2, 2, 2, 2, 2, 2]
+I0113 06:44:10.575 [Rank 0][PrecisionCheck] Forward Output MulFunction tensor[0]: [4, 4, 4, 4, 4, 4]
+```
+
+### 2. MD5 hash output
+
+```cpp
+// Print MD5 digests instead of tensor values
+auto config = utils::PrecisionCheckConfig::Parse("level=2,output_md5=true");
+nn::parallel::global::InitAllEnv(1, 1, false, 1, 1, config);
+```
+
+Sample output:
+```
+I0113 06:44:37.751 [Rank 0][PrecisionCheck] Forward Input MulFunction tensor[0]: md5=522b4223c3a2f0dd964caa87cb6eab65
+I0113 06:44:37.751 [Rank 0][PrecisionCheck] Forward Output MulFunction tensor[0]: md5=91d1e78bf226d8735a3bc0ca6968339c
+```
+
+### 3. Table format output
+
+```cpp
+// Tabular output, easier to inspect and compare
+auto config = utils::PrecisionCheckConfig::Parse("level=2,format=table,output_md5=true");
+nn::parallel::global::InitAllEnv(1, 1, false, 1, 1, config);
+```
+
+Sample output:
+```
++--------------------------------------------------+-------+------------------+---------------+----------+----------+
+| key                                               | level | shape            | dtype         | same_hash| diff_order|
++--------------------------------------------------+-------+------------------+---------------+----------+----------+
+| [GAS-0] [L-0] Forward Input MulFunction           | 2     | (2, 3)           | float32       | True     | 0        |
+| [GAS-0] [L-0] Forward Output MulFunction          | 2     | (2, 3)           | float32       | True     | 0        |
+```
+
+### 4. Baseline comparison
+
+```cpp
+// First run: generate the baseline file
+auto config1 = utils::PrecisionCheckConfig::Parse("level=2,output_md5=true,output_path=./baseline");
+nn::parallel::global::InitAllEnv(1, 1, false, 1, 1, config1);
+// ... run the model ...
+// produces: ./baseline/precision_check_rank_0.log
+
+// Second run: compare against the baseline
+auto config2 = utils::PrecisionCheckConfig::Parse("level=2,format=table,baseline=./baseline/precision_check_rank_0.log");
+nn::parallel::global::InitAllEnv(1, 1, false, 1, 1, config2);
+// ... run the model ...
+// the same_hash column shows whether each entry matches the baseline
+```
+
+### 5. File output (all ranks)
+
+```cpp
+// Write to files; every rank produces output
+auto config = utils::PrecisionCheckConfig::Parse("level=2,output_path=./logs");
+nn::parallel::global::InitAllEnv(8, 2, false, 2, 1, config);
+// produces: ./logs/precision_check_rank_0.log, ./logs/precision_check_rank_1.log, ...
+```
+
+## Command-line usage
+
+### GPT2 example
+
+```bash
+# Basic check (simple format, tensor values)
+./gpt2 --precision_check "level=2"
+
+# MD5 hash output
+./gpt2 --precision_check "level=2,output_md5=true"
+
+# Table format
+./gpt2 --precision_check "level=2,format=table,output_md5=true"
+
+# Generate a baseline file
+./gpt2 --precision_check "level=2,output_md5=true,output_path=./baseline"
+
+# Compare against the baseline
+./gpt2 --precision_check "level=2,format=table,baseline=./baseline/precision_check_rank_0.log"
+```
+
+### LLaMA3 example
+
+```bash
+# Basic check
+./llama3 --device cuda \
+    --input_bin /path/to/data.bin \
+    --llmc_filepath /path/to/model.bin \
+    --precision_check "level=2"
+
+# Table format + MD5
+./llama3 --device cuda \
+    --input_bin /path/to/data.bin \
+    --llmc_filepath /path/to/model.bin \
+    --precision_check "level=2,format=table,output_md5=true"
+```
+
+## Context tracking
+
+Use `PrecisionCheckContext` to record the GAS (gradient accumulation step) and layer index:
+
+```cpp
+#include "infini_train/include/utils/precision_check_context.h"
+
+// Set the context inside the training loop
+for (int gas_step = 0; gas_step < grad_accum_steps; ++gas_step) {
+    PrecisionCheckContext::Instance().SetGAS(gas_step);
+
+    for (int layer = 0; layer < num_layers; ++layer) {
+        PrecisionCheckContext::Instance().SetLayer(layer);
+        PrecisionCheckContext::Instance().SetLayerName("transformer_block");
+
+        // run this layer's forward pass
+        // output lines carry a [GAS-X] [L-Y] prefix
+    }
+}
+```
+
+Sample output:
+```
+[GAS-0] [L-0] Forward Input MulFunction
+[GAS-0] [L-1] Forward Input MulFunction
+[GAS-1] [L-0] Forward Input MulFunction
+```
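+
+When layers are entered from nested code paths, a small RAII guard keeps the layer fields balanced. The class below is only a sketch and is not shipped with this PR; it relies solely on the public setters and getters of `PrecisionCheckContext`.
+
+```cpp
+// Sketch of a scope guard around PrecisionCheckContext (not part of this PR).
+class LayerScope {
+public:
+    LayerScope(int layer, const std::string& name)
+        : prev_layer_(utils::PrecisionCheckContext::Instance().GetLayer()),
+          prev_name_(utils::PrecisionCheckContext::Instance().GetLayerName()) {
+        utils::PrecisionCheckContext::Instance().SetLayer(layer);
+        utils::PrecisionCheckContext::Instance().SetLayerName(name);
+    }
+    ~LayerScope() {
+        // restore the previous layer info when leaving the scope
+        utils::PrecisionCheckContext::Instance().SetLayer(prev_layer_);
+        utils::PrecisionCheckContext::Instance().SetLayerName(prev_name_);
+    }
+
+private:
+    int prev_layer_;
+    std::string prev_name_;
+};
+
+// Usage inside a block's forward path:
+// LayerScope scope(layer_idx, "transformer_block");
+```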
+
+## Performance notes
+
+### MD5 computation
+
+MD5 is computed only when:
+- `output_md5=true`, or
+- `baseline_path` is non-empty (a comparison is needed)
+
+By default (`output_md5=false` and no baseline), MD5 is never computed, so there is no extra cost.
+
+### Recommended configurations
+
+| Scenario | Suggested config |
+|----------|------------------|
+| Quick debugging | `level=2` |
+| Detailed debugging | `level=2,format=table` |
+| Generating a baseline | `level=2,output_md5=true,output_path=./baseline` |
+| Comparison runs | `level=2,format=table,baseline=./baseline/...` |
+| Production | `level=0` (off) |
+
+## Output format comparison
+
+### Simple format
+
+```
+I0113 06:44:10.575 [Rank 0][PrecisionCheck] Forward Input MulFunction tensor[0]: [2, 2, 2, 2, 2, 2]
+```
+
+Pros: compact, easy to read.
+Cons: hard to compare many tensors side by side.
+
+### Table format
+
+```
++--------------------------------------------------+-------+------------------+---------------+----------+----------+
+| key                                               | level | shape            | dtype         | same_hash| diff_order|
++--------------------------------------------------+-------+------------------+---------------+----------+----------+
+| [GAS-0] [L-0] Forward Input MulFunction           | 2     | (2, 3)           | float32       | True     | 0        |
+```
+
+Pros: structured, easy to compare and analyze.
+Cons: takes more space.
+
+## Manual registration (advanced)
+
+Besides the automatic registration done through `InitAllEnv`, specific modules or Functions can be registered by hand:
+
+```cpp
+#include "infini_train/include/utils/precision_checker.h"
+
+// Configure the precision checker
+utils::PrecisionChecker::Config config;
+config.check_nan = true;
+config.check_inf = true;
+config.print_stats = true;
+config.abort_on_error = false;
+
+// Register a specific module
+utils::PrecisionChecker::RegisterForModule(model.get(), "MyModel", config);
+
+// Register a specific Function
+utils::PrecisionChecker::RegisterForFunction(my_function.get(), "MyFunction", config);
+```
+
+## How it works
+
+The precision checker is built on the hook mechanism:
+
+1. **Forward Pre-Hook**: checks input tensors
+2. **Forward Post-Hook**: checks output tensors
+3. **Backward Hooks**: check gradients automatically
+
+Check flow:
+```
+Forward Pass:
+  ├─> Pre-Hook: check inputs
+  ├─> Forward: run the computation
+  └─> Post-Hook: check outputs
+        ├─> detect NaN/Inf
+        ├─> compute MD5 (if needed)
+        ├─> compare against the baseline (if any)
+        └─> emit the result
+
+Backward Pass:
+  ├─> Backward Pre-Hook: check incoming gradients
+  ├─> Backward: run the gradient computation
+  └─> Backward Post-Hook: check outgoing gradients
+```
+
+## Example code
+
+See:
+- `test/hook/test_precision_check.cc` - complete usage example
+- `infini_train/include/utils/precision_checker.h` - API documentation
+- `infini_train/include/utils/precision_check_config.h` - configuration struct
+- `infini_train/include/utils/precision_check_context.h` - context tracking
+
+## Tests
+
+```bash
+# Run the tests (default: simple format)
+./test_precision_check
+
+# Function level + MD5
+./test_precision_check "level=2,output_md5=true"
+
+# Table format
+./test_precision_check "level=2,format=table,output_md5=true"
+
+# Module level
+./test_precision_check "level=1"
+```
diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
index c43a04d2..29fdb917 100644
--- a/example/gpt2/main.cc
+++ b/example/gpt2/main.cc
@@ -26,6 +26,7 @@
 #include "infini_train/include/profiler.h"
 #endif
 #include "infini_train/include/nn/parallel/utils.h"
+#include "infini_train/include/utils/precision_check_config.h"
 
 #include "example/common/tiny_shakespeare_dataset.h"
 #include "example/common/tokenizer.h"
@@ -69,8 +70,7 @@ DEFINE_uint32(virtual_pipeline_parallel, 1, "Number of chunks in PP stage.");
 // precision
 DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)");
 // precision check
-DEFINE_int32(precision_check, 0, "precision check level: 0=off, 1=module, 2=function");
-DEFINE_bool(precision_check_all_ranks, false, "enable precision check for all ranks (default: rank 0 only)");
+DEFINE_string(precision_check, "", "precision check config: level=N,format=simple|table,output_md5=true|false,output_path=PATH,baseline=PATH");
 
 using namespace infini_train;
 
@@ -366,9 +366,9 @@ int main(int argc, char *argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, true);
     google::InitGoogleLogging(argv[0]);
 
+    auto precision_config = utils::PrecisionCheckConfig::Parse(FLAGS_precision_check);
     nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel,
-                                     FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel, FLAGS_precision_check,
-                                     FLAGS_precision_check_all_ranks);
+                                     FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel, precision_config);
 
     LOG(INFO) << nn::parallel::global::ProcessGroupOverview();
 
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
index c6db113e..2dcffd73 100644
--- a/example/llama3/main.cc
+++ b/example/llama3/main.cc
@@ -25,6 +25,7 @@
 #include "infini_train/include/nn/parallel/global.h"
 #include "infini_train/include/nn/parallel/process_group.h"
 #include "infini_train/include/nn/parallel/utils.h"
+#include "infini_train/include/utils/precision_check_config.h"
 
 #include "example/common/tiny_shakespeare_dataset.h"
 #include "example/common/tokenizer.h"
@@ -67,8 +68,7 @@ DEFINE_uint32(virtual_pipeline_parallel, 1, "Number of chunks in PP stage.");
 // precision
 DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)");
 // precision check
-DEFINE_int32(precision_check, 0, "precision check level: 0=off, 1=module, 2=function");
-DEFINE_bool(precision_check_all_ranks, false,
"enable precision check for all ranks (default: rank 0 only)"); +DEFINE_string(precision_check, "", "precision check config: level=N,format=simple|table,output_md5=true|false,output_path=PATH,baseline=PATH"); using namespace infini_train; @@ -342,9 +342,9 @@ int main(int argc, char *argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, true); google::InitGoogleLogging(argv[0]); + auto precision_config = utils::PrecisionCheckConfig::Parse(FLAGS_precision_check); nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel, - FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel, FLAGS_precision_check, - FLAGS_precision_check_all_ranks); + FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel, precision_config); LOG(INFO) << nn::parallel::global::ProcessGroupOverview(); diff --git a/infini_train/include/autograd/function.h b/infini_train/include/autograd/function.h index defbf907..71b709ff 100644 --- a/infini_train/include/autograd/function.h +++ b/infini_train/include/autograd/function.h @@ -14,12 +14,9 @@ class HookHandle; class Function : public std::enable_shared_from_this { public: - using FunctionForwardPreHook = std::function>&)>; - using FunctionForwardPostHook = std::function>&, - const std::vector>&)>; - using FunctionBackwardPreHook = std::function>&)>; - using FunctionBackwardPostHook = std::function>&, - const std::vector>&)>; + using FunctionPreHook = std::function>&)>; + using FunctionPostHook = std::function>&, + const std::vector>&)>; static constexpr char kUndefinedType[] = "Undefined"; @@ -38,10 +35,10 @@ class Function : public std::enable_shared_from_this { void IncreaseDependenciesNumber(); - std::shared_ptr RegisterForwardPreHook(FunctionForwardPreHook hook); - std::shared_ptr RegisterForwardPostHook(FunctionForwardPostHook hook); - std::shared_ptr RegisterBackwardPreHook(FunctionBackwardPreHook hook); - std::shared_ptr RegisterBackwardPostHook(FunctionBackwardPostHook hook); + std::shared_ptr RegisterForwardPreHook(FunctionPreHook hook); + std::shared_ptr RegisterForwardPostHook(FunctionPostHook hook); + std::shared_ptr RegisterBackwardPreHook(FunctionPreHook hook); + std::shared_ptr RegisterBackwardPostHook(FunctionPostHook hook); const std::string& type() const { return type_; } @@ -55,10 +52,10 @@ class Function : public std::enable_shared_from_this { int grad_outputs_reached_ = 0; std::vector> grad_outputs_; const std::string type_ = kUndefinedType; - std::vector forward_pre_hooks_; - std::vector forward_post_hooks_; - std::vector backward_pre_hooks_; - std::vector backward_post_hooks_; + std::vector forward_pre_hooks_; + std::vector forward_post_hooks_; + std::vector backward_pre_hooks_; + std::vector backward_post_hooks_; bool precision_check_registered_ = false; }; } // namespace infini_train::autograd diff --git a/infini_train/include/autograd/tensor_hook.h b/infini_train/include/autograd/tensor_hook.h index f7fcbe37..e566df8f 100644 --- a/infini_train/include/autograd/tensor_hook.h +++ b/infini_train/include/autograd/tensor_hook.h @@ -4,18 +4,13 @@ #include #include +#include "infini_train/include/autograd/function_hook.h" + namespace infini_train { class Tensor; namespace autograd { -// Hook handle for removing hooks -class HookHandle { -public: - virtual ~HookHandle() = default; - virtual void Remove() = 0; -}; - // Tensor backward hook: modifies gradient during backward pass // Returns modified gradient or nullptr to keep original using TensorBackwardHook = std::function(const std::shared_ptr&)>; diff 
--git a/infini_train/include/nn/parallel/global.h b/infini_train/include/nn/parallel/global.h index bc178c19..bd3f102f 100644 --- a/infini_train/include/nn/parallel/global.h +++ b/infini_train/include/nn/parallel/global.h @@ -4,6 +4,8 @@ #include #include +#include "infini_train/include/utils/precision_check_config.h" + namespace infini_train::nn::parallel::global { enum Axis : uint8_t { DP = 0, TP = 1, PP = 2, AXIS_COUNT = 3 }; @@ -27,13 +29,12 @@ class GlobalEnv { static GlobalEnv &Instance(); void Init(int threads_per_process, int tensor_parallel_size, bool sequence_parallel_enabled, - int pipeline_parallel_size, int virtual_pipeline_parallel_size, int precision_check_level = 0, - bool precision_check_all_ranks = false); + int pipeline_parallel_size, int virtual_pipeline_parallel_size, + const utils::PrecisionCheckConfig& precision_config = utils::PrecisionCheckConfig()); - enum class PrecisionCheckLevel { NONE, FUNCTION, MODULE }; - void SetPrecisionCheckLevel(PrecisionCheckLevel level); + enum class PrecisionCheckLevel { NONE, MODULE, FUNCTION }; PrecisionCheckLevel GetPrecisionCheckLevel() const; - bool GetPrecisionCheckAllRanks() const; + const utils::PrecisionCheckConfig& GetPrecisionCheckConfig() const; int nnodes() const; @@ -90,15 +91,14 @@ class GlobalEnv { Layout layout_; PrecisionCheckLevel precision_check_level_ = PrecisionCheckLevel::NONE; - bool precision_check_all_ranks_ = false; + utils::PrecisionCheckConfig precision_check_config_; }; inline void InitAllEnv(int nthread_per_process, int tensor_parallel_size, bool sequence_parallel_enabled, - int pipeline_parallel_size, int virtual_pipeline_parallel, int precision_check_level = 0, - bool precision_check_all_ranks = false) { + int pipeline_parallel_size, int virtual_pipeline_parallel, + const utils::PrecisionCheckConfig& precision_config = utils::PrecisionCheckConfig()) { GlobalEnv::Instance().Init(nthread_per_process, tensor_parallel_size, sequence_parallel_enabled, - pipeline_parallel_size, virtual_pipeline_parallel, precision_check_level, - precision_check_all_ranks); + pipeline_parallel_size, virtual_pipeline_parallel, precision_config); } inline int GetNnodes() { return GlobalEnv::Instance().nnodes(); } inline int GetWorldSize() { return GlobalEnv::Instance().world_size(); } diff --git a/infini_train/include/utils/precision_check_config.h b/infini_train/include/utils/precision_check_config.h new file mode 100644 index 00000000..f2b53a65 --- /dev/null +++ b/infini_train/include/utils/precision_check_config.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include +#include + +namespace infini_train { +namespace utils { + +struct PrecisionCheckConfig { + int level = 0; // 0=off, 1=module, 2=function + std::string output_path = ""; // empty=console(rank0), non-empty=file(all ranks) + bool output_md5 = false; // output MD5 hash or tensor values + std::string format = "simple"; // "simple" or "table" + std::string baseline_path = ""; // baseline file path for comparison + + // Parse from "key=value,key=value" string + static PrecisionCheckConfig Parse(const std::string& config_str) { + PrecisionCheckConfig config; + if (config_str.empty()) return config; + + std::unordered_map kv_map; + std::istringstream ss(config_str); + std::string item; + while (std::getline(ss, item, ',')) { + auto pos = item.find('='); + if (pos != std::string::npos) { + kv_map[item.substr(0, pos)] = item.substr(pos + 1); + } + } + + if (kv_map.count("level")) config.level = std::stoi(kv_map["level"]); + if (kv_map.count("output_path")) 
config.output_path = kv_map["output_path"]; + if (kv_map.count("output_md5")) { + config.output_md5 = (kv_map["output_md5"] == "true" || kv_map["output_md5"] == "1"); + } + if (kv_map.count("format")) config.format = kv_map["format"]; + if (kv_map.count("baseline")) config.baseline_path = kv_map["baseline"]; + + return config; + } +}; + +} // namespace utils +} // namespace infini_train diff --git a/infini_train/include/utils/precision_check_context.h b/infini_train/include/utils/precision_check_context.h new file mode 100644 index 00000000..722825c7 --- /dev/null +++ b/infini_train/include/utils/precision_check_context.h @@ -0,0 +1,50 @@ +#pragma once + +#include + +namespace infini_train { +namespace utils { + +// Context for tracking precision check information (GAS step, layer number, etc.) +// Thread-local to ensure thread safety in multi-threaded training +class PrecisionCheckContext { +public: + static PrecisionCheckContext& Instance() { + static thread_local PrecisionCheckContext instance; + return instance; + } + + void SetGAS(int gas) { gas_ = gas; } + void SetLayer(int layer) { layer_ = layer; } + void SetLayerName(const std::string& name) { layer_name_ = name; } + + int GetGAS() const { return gas_; } + int GetLayer() const { return layer_; } + const std::string& GetLayerName() const { return layer_name_; } + + // Returns formatted key, e.g., "[GAS-0] [L-0] attn_out" + std::string GetKey() const { + std::string key = "[GAS-" + std::to_string(gas_) + "]"; + key += " [L-" + std::to_string(layer_) + "]"; + if (!layer_name_.empty()) { + key += " " + layer_name_; + } + return key; + } + + // Reset context + void Reset() { + gas_ = 0; + layer_ = 0; + layer_name_.clear(); + } + +private: + PrecisionCheckContext() = default; + int gas_ = 0; + int layer_ = 0; + std::string layer_name_; +}; + +} // namespace utils +} // namespace infini_train diff --git a/infini_train/src/autograd/function.cc b/infini_train/src/autograd/function.cc index de39302a..f10e9b10 100644 --- a/infini_train/src/autograd/function.cc +++ b/infini_train/src/autograd/function.cc @@ -161,23 +161,23 @@ void Function::BackwardPartial(const std::shared_ptr &grad_output, int g void Function::IncreaseDependenciesNumber() { ++dependencies_number_; } -std::shared_ptr Function::RegisterForwardPreHook(FunctionForwardPreHook hook) { +std::shared_ptr Function::RegisterForwardPreHook(FunctionPreHook hook) { forward_pre_hooks_.push_back(std::move(hook)); - return std::make_shared>(&forward_pre_hooks_, forward_pre_hooks_.size() - 1); + return std::make_shared>(&forward_pre_hooks_, forward_pre_hooks_.size() - 1); } -std::shared_ptr Function::RegisterForwardPostHook(FunctionForwardPostHook hook) { +std::shared_ptr Function::RegisterForwardPostHook(FunctionPostHook hook) { forward_post_hooks_.push_back(std::move(hook)); - return std::make_shared>(&forward_post_hooks_, forward_post_hooks_.size() - 1); + return std::make_shared>(&forward_post_hooks_, forward_post_hooks_.size() - 1); } -std::shared_ptr Function::RegisterBackwardPreHook(FunctionBackwardPreHook hook) { +std::shared_ptr Function::RegisterBackwardPreHook(FunctionPreHook hook) { backward_pre_hooks_.push_back(std::move(hook)); - return std::make_shared>(&backward_pre_hooks_, backward_pre_hooks_.size() - 1); + return std::make_shared>(&backward_pre_hooks_, backward_pre_hooks_.size() - 1); } -std::shared_ptr Function::RegisterBackwardPostHook(FunctionBackwardPostHook hook) { +std::shared_ptr Function::RegisterBackwardPostHook(FunctionPostHook hook) { 
backward_post_hooks_.push_back(std::move(hook)); - return std::make_shared>(&backward_post_hooks_, backward_post_hooks_.size() - 1); + return std::make_shared>(&backward_post_hooks_, backward_post_hooks_.size() - 1); } } // namespace infini_train::autograd diff --git a/infini_train/src/nn/parallel/global.cc b/infini_train/src/nn/parallel/global.cc index 80a01f57..09d93825 100644 --- a/infini_train/src/nn/parallel/global.cc +++ b/infini_train/src/nn/parallel/global.cc @@ -91,8 +91,8 @@ GlobalEnv &GlobalEnv::Instance() { } void GlobalEnv::Init(int nthread_per_process, int tensor_parallel_size, bool sequence_parallel_enabled, - int pipeline_parallel_size, int virtual_pipeline_parallel_size, int precision_check_level, - bool precision_check_all_ranks) { + int pipeline_parallel_size, int virtual_pipeline_parallel_size, + const utils::PrecisionCheckConfig& precision_config) { std::lock_guard lock(mutex_); CHECK(!initialized_) << "Repeated initialization of GlobalEnv!"; @@ -116,15 +116,15 @@ void GlobalEnv::Init(int nthread_per_process, int tensor_parallel_size, bool seq layout_.sizes[PP] = pipeline_parallel_size_; layout_.InitStrides(); - // Initialize precision check level from parameter - if (precision_check_level == 1) { + // Store precision check config + precision_check_config_ = precision_config; + if (precision_config.level == 1) { precision_check_level_ = PrecisionCheckLevel::MODULE; - } else if (precision_check_level == 2) { + } else if (precision_config.level == 2) { precision_check_level_ = PrecisionCheckLevel::FUNCTION; } else { precision_check_level_ = PrecisionCheckLevel::NONE; } - precision_check_all_ranks_ = precision_check_all_ranks; initialized_ = true; } @@ -194,17 +194,13 @@ Layout GlobalEnv::layout() const { return layout_; } -void GlobalEnv::SetPrecisionCheckLevel(PrecisionCheckLevel level) { - precision_check_level_ = level; -} - GlobalEnv::PrecisionCheckLevel GlobalEnv::GetPrecisionCheckLevel() const { return precision_check_level_; } -bool GlobalEnv::GetPrecisionCheckAllRanks() const { +const utils::PrecisionCheckConfig& GlobalEnv::GetPrecisionCheckConfig() const { CHECK(initialized_) << "GlobalEnv is not initialized!"; - return precision_check_all_ranks_; + return precision_check_config_; } namespace { diff --git a/infini_train/src/utils/precision_checker.cc b/infini_train/src/utils/precision_checker.cc index 0f040df9..bbb7a134 100644 --- a/infini_train/src/utils/precision_checker.cc +++ b/infini_train/src/utils/precision_checker.cc @@ -2,41 +2,229 @@ #include #include +#include #include +#include #include #include #include #include #include #include +#include #include "infini_train/include/autograd/function.h" #include "infini_train/include/nn/modules/module.h" #include "infini_train/include/nn/parallel/global.h" #include "infini_train/include/tensor.h" +#include "infini_train/include/utils/precision_check_context.h" namespace infini_train::utils { namespace { -std::ofstream& GetLogStream() { + +// Simple MD5 implementation +class MD5 { +public: + MD5() { Init(); } + + void Update(const void* data, size_t len) { + const uint8_t* ptr = static_cast(data); + size_t buffer_space = 64 - buffer_len_; + + if (len >= buffer_space) { + memcpy(buffer_ + buffer_len_, ptr, buffer_space); + Transform(buffer_); + ptr += buffer_space; + len -= buffer_space; + total_len_ += buffer_space; + buffer_len_ = 0; + + while (len >= 64) { + Transform(ptr); + ptr += 64; + len -= 64; + total_len_ += 64; + } + } + + memcpy(buffer_ + buffer_len_, ptr, len); + buffer_len_ += len; + total_len_ += 
len; + } + + std::string Finalize() { + uint8_t padding[64] = {0x80}; + uint64_t bits = total_len_ * 8; + + size_t pad_len = (buffer_len_ < 56) ? (56 - buffer_len_) : (120 - buffer_len_); + Update(padding, pad_len); + + uint8_t len_bytes[8]; + for (int i = 0; i < 8; ++i) len_bytes[i] = (bits >> (i * 8)) & 0xff; + Update(len_bytes, 8); + + std::ostringstream oss; + oss << std::hex << std::setfill('0'); + for (int i = 0; i < 4; ++i) { + oss << std::setw(2) << ((state_[0] >> (i * 8)) & 0xff); + } + for (int i = 0; i < 4; ++i) { + oss << std::setw(2) << ((state_[1] >> (i * 8)) & 0xff); + } + for (int i = 0; i < 4; ++i) { + oss << std::setw(2) << ((state_[2] >> (i * 8)) & 0xff); + } + for (int i = 0; i < 4; ++i) { + oss << std::setw(2) << ((state_[3] >> (i * 8)) & 0xff); + } + return oss.str(); + } + +private: + void Init() { + state_[0] = 0x67452301; + state_[1] = 0xefcdab89; + state_[2] = 0x98badcfe; + state_[3] = 0x10325476; + buffer_len_ = 0; + total_len_ = 0; + } + + static uint32_t F(uint32_t x, uint32_t y, uint32_t z) { return (x & y) | (~x & z); } + static uint32_t G(uint32_t x, uint32_t y, uint32_t z) { return (x & z) | (y & ~z); } + static uint32_t H(uint32_t x, uint32_t y, uint32_t z) { return x ^ y ^ z; } + static uint32_t I(uint32_t x, uint32_t y, uint32_t z) { return y ^ (x | ~z); } + static uint32_t RotateLeft(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); } + + void Transform(const uint8_t* block) { + uint32_t a = state_[0], b = state_[1], c = state_[2], d = state_[3]; + uint32_t x[16]; + for (int i = 0; i < 16; ++i) { + x[i] = block[i * 4] | (block[i * 4 + 1] << 8) | (block[i * 4 + 2] << 16) | (block[i * 4 + 3] << 24); + } + + static const uint32_t k[] = { + 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, + 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, + 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, + 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, + 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, + 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391}; + static const int s[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, + 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, + 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, + 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21}; + + for (int i = 0; i < 64; ++i) { + uint32_t f, g; + if (i < 16) { + f = F(b, c, d); + g = i; + } else if (i < 32) { + f = G(b, c, d); + g = (5 * i + 1) % 16; + } else if (i < 48) { + f = H(b, c, d); + g = (3 * i + 5) % 16; + } else { + f = I(b, c, d); + g = (7 * i) % 16; + } + uint32_t temp = d; + d = c; + c = b; + b = b + RotateLeft(a + f + k[i] + x[g], s[i]); + a = temp; + } + + state_[0] += a; + state_[1] += b; + state_[2] += c; + state_[3] += d; + } + + uint32_t state_[4]; + uint8_t buffer_[64]; + size_t buffer_len_; + uint64_t total_len_; +}; + +std::string ComputeMD5(const void* data, size_t size) { + MD5 md5; + md5.Update(data, size); + return md5.Finalize(); +} + +// Baseline storage +std::unordered_map& GetBaseline() { + static std::unordered_map baseline; + static bool 
loaded = false; + static std::mutex load_mutex; + + if (!loaded) { + std::lock_guard lock(load_mutex); + if (!loaded) { + const auto& config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig(); + if (!config.baseline_path.empty()) { + std::ifstream file(config.baseline_path); + std::string line; + while (std::getline(file, line)) { + // Format: key|md5 + auto pos = line.rfind('|'); + if (pos != std::string::npos) { + std::string key = line.substr(0, pos); + std::string md5 = line.substr(pos + 1); + baseline[key] = md5; + } + } + std::cout << "[PrecisionCheck] Loaded " << baseline.size() << " baseline entries from " + << config.baseline_path << std::endl; + } + loaded = true; + } + } + return baseline; +} + +// Table header printed flag +bool& TableHeaderPrinted() { + static bool printed = false; + return printed; +} + +std::ostream& GetLogStream() { static std::ofstream log_file; static std::mutex init_mutex; static bool initialized = false; + static bool use_console = false; if (!initialized) { std::lock_guard lock(init_mutex); if (!initialized) { - int rank = nn::parallel::global::GlobalEnv::Instance().global_proc_rank(); - std::string filename = "precision_check_rank_" + std::to_string(rank) + ".log"; - log_file.open(filename, std::ios::out | std::ios::trunc); + const auto& config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig(); + + if (config.output_path.empty()) { + use_console = true; + } else { + int rank = nn::parallel::global::GlobalEnv::Instance().global_proc_rank(); + std::string filename = config.output_path + "/precision_check_rank_" + std::to_string(rank) + ".log"; + log_file.open(filename, std::ios::out | std::ios::trunc); + use_console = false; + std::cout << "[Rank " << rank << "] Precision check output: " << filename << std::endl; + } initialized = true; } } - return log_file; + + return use_console ? std::cout : log_file; } bool ShouldPrint() { - if (nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckAllRanks()) { + const auto& config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig(); + if (!config.output_path.empty()) { return true; } return nn::parallel::global::GlobalEnv::Instance().global_proc_rank() == 0; @@ -51,25 +239,86 @@ std::string GetTimestamp() { localtime_r(&time_t, &tm); std::ostringstream oss; - oss << std::setfill('0') - << std::setw(2) << (tm.tm_mon + 1) - << std::setw(2) << tm.tm_mday << ' ' - << std::setw(2) << tm.tm_hour << ':' - << std::setw(2) << tm.tm_min << ':' - << std::setw(2) << tm.tm_sec << '.' - << std::setw(3) << ms.count(); + oss << std::setfill('0') << std::setw(2) << (tm.tm_mon + 1) << std::setw(2) << tm.tm_mday << ' ' << std::setw(2) + << tm.tm_hour << ':' << std::setw(2) << tm.tm_min << ':' << std::setw(2) << tm.tm_sec << '.' 
<< std::setw(3) + << ms.count(); return oss.str(); } -} // namespace + +std::string FormatShape(const std::vector& shape) { + std::ostringstream oss; + oss << "("; + for (size_t i = 0; i < shape.size(); ++i) { + if (i > 0) oss << ", "; + oss << shape[i]; + } + oss << ")"; + return oss.str(); +} + +std::string DataTypeToString(DataType dtype) { + switch (dtype) { + case DataType::kFLOAT32: return "float32"; + case DataType::kFLOAT16: return "float16"; + case DataType::kBFLOAT16: return "bfloat16"; + case DataType::kINT32: return "int32"; + case DataType::kINT64: return "int64"; + default: return "unknown"; + } +} + +void PrintTableHeader(std::ostream& os) { + if (TableHeaderPrinted()) return; + TableHeaderPrinted() = true; + + os << "+" << std::string(50, '-') << "+" << std::string(7, '-') << "+" << std::string(18, '-') << "+" + << std::string(15, '-') << "+" << std::string(10, '-') << "+" << std::string(10, '-') << "+\n"; + os << "| " << std::left << std::setw(49) << "key" + << "| " << std::setw(6) << "level" + << "| " << std::setw(17) << "shape" + << "| " << std::setw(14) << "dtype" + << "| " << std::setw(9) << "same_hash" + << "| " << std::setw(9) << "diff_order" + << "|\n"; + os << "+" << std::string(50, '-') << "+" << std::string(7, '-') << "+" << std::string(18, '-') << "+" + << std::string(15, '-') << "+" << std::string(10, '-') << "+" << std::string(10, '-') << "+\n"; +} + +void PrintTableRow(std::ostream& os, const std::string& key, int level, const std::string& shape, + const std::string& dtype, const std::string& same_hash, const std::string& diff_order) { + os << "| " << std::left << std::setw(49) << key.substr(0, 49) << "| " << std::setw(6) << level << "| " + << std::setw(17) << shape.substr(0, 17) << "| " << std::setw(14) << dtype << "| " << std::setw(9) + << same_hash << "| " << std::setw(9) << diff_order << "|\n"; +} + +// Calculate diff order between two tensors (returns string like "1e-3" or "0") +std::string CalculateDiffOrder(const float* data1, const float* data2, size_t size) { + if (!data1 || !data2 || size == 0) return "N/A"; + + double max_diff = 0.0; + for (size_t i = 0; i < size; ++i) { + double diff = std::abs(static_cast(data1[i]) - static_cast(data2[i])); + max_diff = std::max(max_diff, diff); + } + + if (max_diff == 0.0) return "0"; + + int order = static_cast(std::floor(std::log10(max_diff))); + return "1e" + std::to_string(order); +} + +} // namespace void PrecisionChecker::CheckTensors(const std::string& stage, const std::string& name, - const std::vector>& tensors, - const Config& config) { + const std::vector>& tensors, const Config& config) { if (!ShouldPrint()) { return; } + const auto& global_config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig(); int rank = nn::parallel::global::GlobalEnv::Instance().global_proc_rank(); + int level = global_config.level; + auto& baseline = GetBaseline(); for (size_t i = 0; i < tensors.size(); ++i) { if (!tensors[i]) continue; @@ -85,100 +334,137 @@ void PrecisionChecker::CheckTensors(const std::string& stage, const std::string& cpu_tensor = tensor; } - const float* data = static_cast(cpu_tensor->DataPtr()); - size_t size = cpu_tensor->NumElements(); + const void* data = cpu_tensor->DataPtr(); + size_t byte_size = cpu_tensor->SizeInBytes(); + size_t num_elements = cpu_tensor->NumElements(); - bool has_nan = false; - bool has_inf = false; + // Build key + std::string context_key = PrecisionCheckContext::Instance().GetKey(); + std::string full_key = context_key.empty() ? 
(stage + " " + name + " tensor[" + std::to_string(i) + "]") + : (context_key + " " + stage + " " + name); - for (size_t j = 0; j < size; ++j) { - float val = data[j]; - if (std::isnan(val)) has_nan = true; - if (std::isinf(val)) has_inf = true; + // Only compute MD5 if needed (for output or baseline comparison) + bool need_md5 = global_config.output_md5 || !baseline.empty(); + std::string md5; + if (need_md5) { + md5 = ComputeMD5(data, byte_size); } - bool has_error = (config.check_nan && has_nan) || (config.check_inf && has_inf); - - if (has_error || config.print_stats) { - auto& log_stream = GetLogStream(); - std::string level = has_error ? "E" : "I"; + // Check baseline + bool has_baseline = !baseline.empty(); + bool same_hash = true; + std::string diff_order = "--"; + if (has_baseline) { + auto it = baseline.find(full_key); + if (it != baseline.end()) { + same_hash = (it->second == md5); + diff_order = same_hash ? "0" : "N/A"; + } + } - log_stream << level << GetTimestamp() << " [Rank " << rank << "][PrecisionCheck] " - << stage << " " << name << " tensor[" << i << "]: ["; + auto& log_stream = GetLogStream(); - if (has_nan) log_stream << " NaN detected!"; - if (has_inf) log_stream << " Inf detected!"; + if (global_config.format == "table") { + static bool header_printed = false; + if (!header_printed) { + PrintTableHeader(log_stream); + header_printed = true; + } + std::string same_hash_str = has_baseline ? (same_hash ? "True" : "False") : "--"; + PrintTableRow(log_stream, full_key, level, FormatShape(cpu_tensor->Dims()), + DataTypeToString(cpu_tensor->Dtype()), same_hash_str, diff_order); + } else { + // Simple format + const float* float_data = static_cast(data); + + bool has_nan = false; + bool has_inf = false; + for (size_t j = 0; j < num_elements; ++j) { + float val = float_data[j]; + if (std::isnan(val)) has_nan = true; + if (std::isinf(val)) has_inf = true; + } - if (config.print_stats) { - constexpr size_t max_print = 10; - for (size_t j = 0; j < std::min(size, max_print); ++j) { - if (j > 0) log_stream << ", "; - log_stream << data[j]; + bool has_error = (config.check_nan && has_nan) || (config.check_inf && has_inf); + + if (has_error || config.print_stats) { + std::string log_level = has_error ? "E" : "I"; + + log_stream << log_level << GetTimestamp() << " [Rank " << rank << "][PrecisionCheck] " << stage << " " + << name << " tensor[" << i << "]: "; + + if (global_config.output_md5) { + log_stream << "md5=" << md5; + if (!same_hash) log_stream << " (MISMATCH)"; + } else { + log_stream << "["; + if (has_nan) log_stream << " NaN detected!"; + if (has_inf) log_stream << " Inf detected!"; + + if (config.print_stats) { + constexpr size_t max_print = 6; + for (size_t j = 0; j < std::min(num_elements, max_print); ++j) { + if (j > 0) log_stream << ", "; + log_stream << float_data[j]; + } + if (num_elements > max_print) log_stream << ", ..."; + } + log_stream << "]"; } - if (size > max_print) log_stream << ", ..."; + log_stream << std::endl; + } + + if (has_error && config.abort_on_error) { + std::cerr << "Precision check failed, aborting!" << std::endl; + std::abort(); } - log_stream << "]" << std::endl; } - if (has_error && config.abort_on_error) { - std::cerr << "Precision check failed, aborting!" 
<< std::endl; - std::abort(); + // Save to baseline file if output_path is set and output_md5 is true + if (!global_config.output_path.empty() && global_config.output_md5) { + log_stream << full_key << "|" << md5 << std::endl; } } } -void PrecisionChecker::RegisterForFunction(autograd::Function* func, const std::string& name, - const Config& config) { +void PrecisionChecker::RegisterForFunction(autograd::Function* func, const std::string& name, const Config& config) { std::string func_name = name.empty() ? "Function" : name; - func->RegisterForwardPreHook([func_name, config](autograd::Function*, - const std::vector>& inputs) { - CheckTensors("Forward Input", func_name, inputs, config); - }); + func->RegisterForwardPreHook( + [func_name, config](autograd::Function*, const std::vector>& inputs) { + CheckTensors("Forward Input", func_name, inputs, config); + }); - func->RegisterForwardPostHook([func_name, config](autograd::Function*, - const std::vector>&, - const std::vector>& outputs) { + func->RegisterForwardPostHook([func_name, config](autograd::Function*, const std::vector>&, + const std::vector>& outputs) { CheckTensors("Forward Output", func_name, outputs, config); }); - func->RegisterBackwardPreHook([func_name, config](autograd::Function*, - const std::vector>& grad_outputs) { - CheckTensors("Backward Input", func_name, grad_outputs, config); - }); + func->RegisterBackwardPreHook( + [func_name, config](autograd::Function*, const std::vector>& grad_outputs) { + CheckTensors("Backward Input", func_name, grad_outputs, config); + }); func->RegisterBackwardPostHook([func_name, config](autograd::Function*, - const std::vector>& grad_inputs, - const std::vector>&) { + const std::vector>& grad_inputs, + const std::vector>&) { CheckTensors("Backward Output", func_name, grad_inputs, config); }); } -void PrecisionChecker::RegisterForModule(nn::Module* module, const std::string& name, - const Config& config) { +void PrecisionChecker::RegisterForModule(nn::Module* module, const std::string& name, const Config& config) { std::string module_name = name.empty() ? 
module->type() : name; - // module->RegisterForwardPreHook([module_name, config](nn::Module*, - // const std::vector>& inputs) { - // CheckTensors("Module Forward Input", module_name, inputs, config); - // }); - - module->RegisterForwardPostHook([module_name, config](nn::Module*, - const std::vector>&, - const std::vector>& outputs) { + module->RegisterForwardPostHook([module_name, config](nn::Module*, const std::vector>&, + const std::vector>& outputs) { CheckTensors("Module Forward Output", module_name, outputs, config); }); - // module->RegisterBackwardPreHook([module_name, config](nn::Module*, - // const std::vector>& grad_outputs) { - // CheckTensors("Module Backward Input", module_name, grad_outputs, config); - // }); - module->RegisterBackwardPostHook([module_name, config](nn::Module*, - const std::vector>& grad_inputs, - const std::vector>&) { + const std::vector>& grad_inputs, + const std::vector>&) { CheckTensors("Module Backward Output", module_name, grad_inputs, config); }); } -} // namespace infini_train::utils +} // namespace infini_train::utils diff --git a/test/hook/test_hook.cc b/test/hook/test_hook.cc new file mode 100644 index 00000000..25f76ca2 --- /dev/null +++ b/test/hook/test_hook.cc @@ -0,0 +1,189 @@ +#include +#include + +#include "glog/logging.h" + +#include "infini_train/include/autograd/elementwise.h" +#include "infini_train/include/autograd/function.h" +#include "infini_train/include/autograd/function_hook.h" +#include "infini_train/include/autograd/tensor_hook.h" +#include "infini_train/include/nn/modules/module.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/tensor.h" + +using namespace infini_train; + +// ============================================================================ +// Test 1: Basic Module Hooks +// ============================================================================ +void test_basic_hooks() { + std::cout << "\n=== Test 1: Basic Module Hooks ===" << std::endl; + + auto x = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32); + x->set_requires_grad(true); + + // Module hook example + class MyModule : public nn::Module { + public: + MyModule() : Module("MyModule") {} + + std::vector> Forward( + const std::vector>& inputs) override { + std::cout << "Forward pass executing..." << std::endl; + return inputs; + } + }; + + auto module = std::make_shared(); + + // Register forward pre-hook + auto pre_hook = module->RegisterForwardPreHook( + [](nn::Module* mod, const std::vector>& inputs) { + std::cout << "Forward pre-hook: Module type = " << mod->type() << std::endl; + } + ); + + // Register forward post-hook + auto fwd_hook = module->RegisterForwardPostHook( + [](nn::Module* mod, + const std::vector>& inputs, + const std::vector>& outputs) { + std::cout << "Forward post-hook: Got " << outputs.size() << " outputs" << std::endl; + } + ); + + // Register backward pre-hook + auto bwd_pre_hook = module->RegisterBackwardPreHook( + [](nn::Module* mod, const std::vector>& grad_outputs) { + std::cout << "Backward pre-hook called!" << std::endl; + } + ); + + // Register backward post-hook + auto bwd_post_hook = module->RegisterBackwardPostHook( + [](nn::Module* mod, + const std::vector>& grad_inputs, + const std::vector>& grad_outputs) { + std::cout << "Backward post-hook called!" << std::endl; + } + ); + + // Test forward pass + std::vector> inputs = {x}; + auto outputs = (*module)(inputs); + + std::cout << "Module hook test completed!" 
<< std::endl; +} + +// ============================================================================ +// Test 2: Hook Remove() Functionality Test +// ============================================================================ +void test_hook_remove() { + std::cout << "\n=== Test 2: Hook Remove() Functionality Test ===" << std::endl; + + auto a = std::make_shared(std::vector{2, 2}, DataType::kFLOAT32); + auto b = std::make_shared(std::vector{2, 2}, DataType::kFLOAT32); + a->set_requires_grad(true); + b->set_requires_grad(true); + + int hook1_count = 0; + int hook2_count = 0; + int hook3_count = 0; + + auto add_fn = std::make_shared(); + + // Register three forward pre-hooks + auto handle1 = add_fn->RegisterForwardPreHook( + [&hook1_count](autograd::Function*, const std::vector>&) { + hook1_count++; + std::cout << "Hook 1 called (count: " << hook1_count << ")" << std::endl; + } + ); + + auto handle2 = add_fn->RegisterForwardPreHook( + [&hook2_count](autograd::Function*, const std::vector>&) { + hook2_count++; + std::cout << "Hook 2 called (count: " << hook2_count << ")" << std::endl; + } + ); + + auto handle3 = add_fn->RegisterForwardPreHook( + [&hook3_count](autograd::Function*, const std::vector>&) { + hook3_count++; + std::cout << "Hook 3 called (count: " << hook3_count << ")" << std::endl; + } + ); + + // First call - all hooks should fire + std::cout << "\n--- First Apply (all hooks active) ---" << std::endl; + std::vector> inputs; + inputs.push_back(a); + inputs.push_back(b); + auto result1 = add_fn->Apply(inputs); + std::cout << "Hook counts: " << hook1_count << ", " << hook2_count << ", " << hook3_count << std::endl; + + // Remove hook 2 + std::cout << "\n--- Removing Hook 2 ---" << std::endl; + handle2->Remove(); + + // Second call - hook 2 should not fire + std::cout << "\n--- Second Apply (hook 2 removed) ---" << std::endl; + auto result2 = add_fn->Apply(inputs); + std::cout << "Hook counts: " << hook1_count << ", " << hook2_count << ", " << hook3_count << std::endl; + + // Remove hook 1 + std::cout << "\n--- Removing Hook 1 ---" << std::endl; + handle1->Remove(); + + // Third call - only hook 3 should fire + std::cout << "\n--- Third Apply (hooks 1 and 2 removed) ---" << std::endl; + auto result3 = add_fn->Apply(inputs); + std::cout << "Hook counts: " << hook1_count << ", " << hook2_count << ", " << hook3_count << std::endl; + + // Verify results + std::cout << "\n=== Test Results ===" << std::endl; + bool test_passed = true; + + if (hook1_count != 2) { + std::cout << "FAIL: Hook 1 should be called 2 times, got " << hook1_count << std::endl; + test_passed = false; + } + + if (hook2_count != 1) { + std::cout << "FAIL: Hook 2 should be called 1 time, got " << hook2_count << std::endl; + test_passed = false; + } + + if (hook3_count != 3) { + std::cout << "FAIL: Hook 3 should be called 3 times, got " << hook3_count << std::endl; + test_passed = false; + } + + if (test_passed) { + std::cout << "SUCCESS: All hooks behaved correctly!" 
<< std::endl; + std::cout << " - Hook 1: called 2 times (before removal)" << std::endl; + std::cout << " - Hook 2: called 1 time (removed after first call)" << std::endl; + std::cout << " - Hook 3: called 3 times (never removed)" << std::endl; + } +} + +// ============================================================================ +// Main +// ============================================================================ +int main(int argc, char* argv[]) { + google::InitGoogleLogging(argv[0]); + nn::parallel::global::GlobalEnv::Instance().Init(0, 1, 1, 1, 1); + + std::cout << "========================================" << std::endl; + std::cout << " Hook Mechanism Tests" << std::endl; + std::cout << "========================================" << std::endl; + + test_basic_hooks(); + test_hook_remove(); + + std::cout << "\n========================================" << std::endl; + std::cout << " All Tests Completed Successfully" << std::endl; + std::cout << "========================================" << std::endl; + + return 0; +} diff --git a/test/hook/test_precision_check.cc b/test/hook/test_precision_check.cc new file mode 100644 index 00000000..d3fa7ed2 --- /dev/null +++ b/test/hook/test_precision_check.cc @@ -0,0 +1,90 @@ +#include +#include + +#include "glog/logging.h" + +#include "infini_train/include/nn/modules/module.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/tensor.h" +#include "infini_train/include/utils/precision_check_config.h" + +using namespace infini_train; + +class MyModel : public nn::Module { +public: + MyModel() : Module("MyModel") {} + + std::vector> Forward( + const std::vector>& inputs) override { + auto x = inputs[0]; + x->RequiresGrad(); + auto y = x->Mul(x); + return {y}; + } +}; + +void TestFunctionLevel(const std::string& config_str) { + std::cout << "\n========================================" << std::endl; + std::cout << " Function-Level Test: " << config_str << std::endl; + std::cout << "========================================" << std::endl; + + auto x = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32); + x->Fill(2.0f); + x->RequiresGrad(); + + auto y = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32); + y->Fill(3.0f); + y->RequiresGrad(); + + auto z = x->Mul(y); + auto loss = z->Sum(0, false)->Sum(0, false); + loss->Backward(); + + std::cout << "Test completed." << std::endl; +} + +void TestModuleLevel() { + std::cout << "\n========================================" << std::endl; + std::cout << " Module-Level Test" << std::endl; + std::cout << "========================================" << std::endl; + + auto model = std::make_shared(); + auto x = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32); + x->Fill(2.0f); + x->RequiresGrad(); + + std::vector> inputs = {x}; + auto outputs = (*model)(inputs); + auto loss = outputs[0]->Sum(0, false)->Sum(0, false); + loss->Backward(); + + std::cout << "Test completed." << std::endl; +} + +int main(int argc, char* argv[]) { + google::InitGoogleLogging(argv[0]); + + std::string config_str = argc > 1 ? 
argv[1] : "level=2"; + + std::cout << "========================================" << std::endl; + std::cout << " Precision Check Test Suite" << std::endl; + std::cout << "========================================" << std::endl; + std::cout << "Config: " << config_str << std::endl; + + auto config = utils::PrecisionCheckConfig::Parse(config_str); + nn::parallel::global::InitAllEnv(1, 1, false, 1, 1, config); + + if (config.level == 1) { + TestModuleLevel(); + } else if (config.level == 2) { + TestFunctionLevel(config_str); + } else { + std::cout << "No tests to run (level=0)" << std::endl; + } + + std::cout << "\n========================================" << std::endl; + std::cout << " All Tests Completed Successfully" << std::endl; + std::cout << "========================================" << std::endl; + + return 0; +} From 70397b9c4eb5cb2e39e6e790db0b01ba0f56f2cf Mon Sep 17 00:00:00 2001 From: chen Date: Wed, 14 Jan 2026 03:11:57 +0000 Subject: [PATCH 3/6] style: apply clang-format to precision checker code --- example/gpt2/main.cc | 4 +- example/gpt2/net.cc | 10 +- example/llama3/main.cc | 4 +- example/llama3/net.cc | 23 ++- infini_train/include/autograd/function.h | 8 +- infini_train/include/autograd/function_hook.h | 7 +- infini_train/include/autograd/tensor_hook.h | 7 +- infini_train/include/nn/module_hook.h | 19 +- infini_train/include/nn/parallel/global.h | 6 +- .../include/utils/precision_check_config.h | 36 ++-- .../include/utils/precision_check_context.h | 10 +- .../include/utils/precision_checker.h | 15 +- infini_train/src/autograd/function.cc | 20 +- infini_train/src/nn/modules/module.cc | 33 +-- infini_train/src/nn/parallel/global.cc | 8 +- infini_train/src/utils/precision_checker.cc | 192 ++++++++++-------- test/hook/test_hook.cc | 58 +++--- test/hook/test_precision_check.cc | 7 +- 18 files changed, 250 insertions(+), 217 deletions(-) diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc index 29fdb917..abed7fda 100644 --- a/example/gpt2/main.cc +++ b/example/gpt2/main.cc @@ -70,7 +70,9 @@ DEFINE_uint32(virtual_pipeline_parallel, 1, "Number of chunks in PP stage."); // precision DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)"); // precision check -DEFINE_string(precision_check, "", "precision check config: level=N,format=simple|table,output_md5=true|false,output_path=PATH,baseline=PATH"); +DEFINE_string( + precision_check, "", + "precision check config: level=N,format=simple|table,output_md5=true|false,output_path=PATH,baseline=PATH"); using namespace infini_train; diff --git a/example/gpt2/net.cc b/example/gpt2/net.cc index 18e07dca..8df0bfe5 100644 --- a/example/gpt2/net.cc +++ b/example/gpt2/net.cc @@ -215,7 +215,8 @@ GPT2FirstStage::Forward(const std::vector> return {tok_emb + pos_emb}; } -GPT2Chunk::GPT2Chunk(const GPT2Config &config, int start_layer, int end_layer) : CloneableModule(kType), config_(config) { +GPT2Chunk::GPT2Chunk(const GPT2Config &config, int start_layer, int end_layer) + : CloneableModule(kType), config_(config) { std::vector> h; for (int64_t i = start_layer; i < end_layer; ++i) { auto layer = std::make_shared(config); @@ -256,9 +257,10 @@ GPT2LastStage::Forward(const std::vector> } GPT2::GPT2(const GPT2Config &config) - : CloneableModule(kType), config_(config), stage_info_(nn::parallel::PipelineParallel::GetStageInfo( - config_.n_layer, nn::parallel::global::GetPipelineParallelSize(), nn::parallel::pp_rank, - nn::parallel::global::GetVirtualPipelineParallelSize())) { + : CloneableModule(kType), config_(config), + 
stage_info_(nn::parallel::PipelineParallel::GetStageInfo( + config_.n_layer, nn::parallel::global::GetPipelineParallelSize(), nn::parallel::pp_rank, + nn::parallel::global::GetVirtualPipelineParallelSize())) { auto tp_world_size = nn::parallel::global::GetTensorParallelSize(); // NOTE(zbl): VocabParallelEmbedding requires vocab_size % tp_size == 0 diff --git a/example/llama3/main.cc b/example/llama3/main.cc index 2dcffd73..6e5c74d8 100644 --- a/example/llama3/main.cc +++ b/example/llama3/main.cc @@ -68,7 +68,9 @@ DEFINE_uint32(virtual_pipeline_parallel, 1, "Number of chunks in PP stage."); // precision DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)"); // precision check -DEFINE_string(precision_check, "", "precision check config: level=N,format=simple|table,output_md5=true|false,output_path=PATH,baseline=PATH"); +DEFINE_string( + precision_check, "", + "precision check config: level=N,format=simple|table,output_md5=true|false,output_path=PATH,baseline=PATH"); using namespace infini_train; diff --git a/example/llama3/net.cc b/example/llama3/net.cc index 12bcf0ed..50f200f8 100644 --- a/example/llama3/net.cc +++ b/example/llama3/net.cc @@ -140,8 +140,8 @@ std::vector> RMSNorm::Forward(const std::vector> Block::Forward(const std::vector RMSNorm -> (bs, seq_len, n_embd) -> attention -> (bs, seq_len, n_embd) // -> Add -> (bs, seq_len, n_embd) auto x1 = x[0] - + (*modules_[kAttnLayerName])(std::vector>{ - (*modules_[kLn1LayerName])({x[0]})[0], freqs_cis, start_pos, mask})[0]; + + (*modules_[kAttnLayerName])(std::vector>{(*modules_[kLn1LayerName])({x[0]})[0], + freqs_cis, start_pos, mask})[0]; // (bs, seq_len, n_embd) -> RMSNorm -> (bs, seq_len, n_embd) -> MLP -> (bs, seq_len, n_embd) // -> Add -> (bs, seq_len, n_embd) - auto x2 = x1 - + (*modules_[kMlpLayerName])( - std::vector>((*modules_[kLn2LayerName])({x1})))[0]; + auto x2 + = x1 + (*modules_[kMlpLayerName])(std::vector>((*modules_[kLn2LayerName])({x1})))[0]; // (bs, seq_len, n_embd) return {x2}; } @@ -334,7 +333,8 @@ std::vector> LLaMA3FirstStage::Forward(const std::vector return (*modules_[LLaMA3FirstStage::kWTELayerName])(x); } -LLaMA3Chunk::LLaMA3Chunk(const LLaMA3Config &config, int start_layer, int end_layer) : CloneableModule(kType), config_(config) { +LLaMA3Chunk::LLaMA3Chunk(const LLaMA3Config &config, int start_layer, int end_layer) + : CloneableModule(kType), config_(config) { std::vector> h; for (int64_t i = start_layer; i < end_layer; ++i) { auto layer = std::make_shared(config); @@ -396,9 +396,10 @@ std::vector> LLaMA3LastStage::Forward(const std::vector< } LLaMA3::LLaMA3(const LLaMA3Config &config) - : CloneableModule(kType), config_(config), stage_info_(nn::parallel::PipelineParallel::GetStageInfo( - config_.n_layer, nn::parallel::global::GetPipelineParallelSize(), nn::parallel::pp_rank, - nn::parallel::global::GetVirtualPipelineParallelSize())) { + : CloneableModule(kType), config_(config), + stage_info_(nn::parallel::PipelineParallel::GetStageInfo( + config_.n_layer, nn::parallel::global::GetPipelineParallelSize(), nn::parallel::pp_rank, + nn::parallel::global::GetVirtualPipelineParallelSize())) { std::unordered_map> transformer; if (stage_info_.is_first_stage) { modules_[kPPFirstStageName] = std::make_shared(config_); diff --git a/infini_train/include/autograd/function.h b/infini_train/include/autograd/function.h index 71b709ff..a7665f84 100644 --- a/infini_train/include/autograd/function.h +++ b/infini_train/include/autograd/function.h @@ -14,9 +14,9 @@ class HookHandle; class Function : 
public std::enable_shared_from_this { public: - using FunctionPreHook = std::function>&)>; - using FunctionPostHook = std::function>&, - const std::vector>&)>; + using FunctionPreHook = std::function> &)>; + using FunctionPostHook = std::function> &, + const std::vector> &)>; static constexpr char kUndefinedType[] = "Undefined"; @@ -40,7 +40,7 @@ class Function : public std::enable_shared_from_this { std::shared_ptr RegisterBackwardPreHook(FunctionPreHook hook); std::shared_ptr RegisterBackwardPostHook(FunctionPostHook hook); - const std::string& type() const { return type_; } + const std::string &type() const { return type_; } protected: std::vector> saved_tensors_; diff --git a/infini_train/include/autograd/function_hook.h b/infini_train/include/autograd/function_hook.h index 7d57926f..6ece328c 100644 --- a/infini_train/include/autograd/function_hook.h +++ b/infini_train/include/autograd/function_hook.h @@ -41,10 +41,9 @@ class AllReducePostAccumulateHook : public PostAccumulateGradHook { const infini_train::nn::parallel::ProcessGroup *pg_ = nullptr; }; -template -class FunctionHookHandleImpl : public HookHandle { +template class FunctionHookHandleImpl : public HookHandle { public: - FunctionHookHandleImpl(std::vector* hooks, size_t id) : hooks_(hooks), id_(id) {} + FunctionHookHandleImpl(std::vector *hooks, size_t id) : hooks_(hooks), id_(id) {} void Remove() override { if (!removed_ && hooks_ && id_ < hooks_->size()) { @@ -54,7 +53,7 @@ class FunctionHookHandleImpl : public HookHandle { } private: - std::vector* hooks_; + std::vector *hooks_; size_t id_; bool removed_ = false; }; diff --git a/infini_train/include/autograd/tensor_hook.h b/infini_train/include/autograd/tensor_hook.h index e566df8f..749da025 100644 --- a/infini_train/include/autograd/tensor_hook.h +++ b/infini_train/include/autograd/tensor_hook.h @@ -13,17 +13,16 @@ namespace autograd { // Tensor backward hook: modifies gradient during backward pass // Returns modified gradient or nullptr to keep original -using TensorBackwardHook = std::function(const std::shared_ptr&)>; +using TensorBackwardHook = std::function(const std::shared_ptr &)>; class TensorBackwardHookHandle : public HookHandle { public: - TensorBackwardHookHandle(std::vector* hooks, size_t id) - : hooks_(hooks), id_(id) {} + TensorBackwardHookHandle(std::vector *hooks, size_t id) : hooks_(hooks), id_(id) {} void Remove() override; private: - std::vector* hooks_; + std::vector *hooks_; size_t id_; bool removed_ = false; }; diff --git a/infini_train/include/nn/module_hook.h b/infini_train/include/nn/module_hook.h index ea3b9219..ffff0d0c 100644 --- a/infini_train/include/nn/module_hook.h +++ b/infini_train/include/nn/module_hook.h @@ -12,21 +12,21 @@ class Module; // Forward pre-hook: called before forward pass // Args: (module, input_tensors) -using ForwardPreHook = std::function>&)>; +using ForwardPreHook = std::function> &)>; // Forward post-hook: called after forward pass // Args: (module, input_tensors, output_tensors) -using ForwardPostHook = std::function>&, - const std::vector>&)>; +using ForwardPostHook = std::function> &, + const std::vector> &)>; // Backward pre-hook: called before backward pass // Args: (module, grad_output) -using BackwardPreHook = std::function>&)>; +using BackwardPreHook = std::function> &)>; // Backward post-hook: called after backward pass // Args: (module, grad_input, grad_output) -using BackwardPostHook = std::function>&, - const std::vector>&)>; +using BackwardPostHook = std::function> &, + const std::vector> &)>; class 
ModuleHookHandle { public: @@ -34,10 +34,9 @@ class ModuleHookHandle { virtual void Remove() = 0; }; -template -class ModuleHookHandleImpl : public ModuleHookHandle { +template class ModuleHookHandleImpl : public ModuleHookHandle { public: - ModuleHookHandleImpl(std::vector* hooks, size_t id) : hooks_(hooks), id_(id) {} + ModuleHookHandleImpl(std::vector *hooks, size_t id) : hooks_(hooks), id_(id) {} void Remove() override { if (!removed_ && hooks_ && id_ < hooks_->size()) { @@ -47,7 +46,7 @@ class ModuleHookHandleImpl : public ModuleHookHandle { } private: - std::vector* hooks_; + std::vector *hooks_; size_t id_; bool removed_ = false; }; diff --git a/infini_train/include/nn/parallel/global.h b/infini_train/include/nn/parallel/global.h index bd3f102f..afac220a 100644 --- a/infini_train/include/nn/parallel/global.h +++ b/infini_train/include/nn/parallel/global.h @@ -30,11 +30,11 @@ class GlobalEnv { void Init(int threads_per_process, int tensor_parallel_size, bool sequence_parallel_enabled, int pipeline_parallel_size, int virtual_pipeline_parallel_size, - const utils::PrecisionCheckConfig& precision_config = utils::PrecisionCheckConfig()); + const utils::PrecisionCheckConfig &precision_config = utils::PrecisionCheckConfig()); enum class PrecisionCheckLevel { NONE, MODULE, FUNCTION }; PrecisionCheckLevel GetPrecisionCheckLevel() const; - const utils::PrecisionCheckConfig& GetPrecisionCheckConfig() const; + const utils::PrecisionCheckConfig &GetPrecisionCheckConfig() const; int nnodes() const; @@ -96,7 +96,7 @@ class GlobalEnv { inline void InitAllEnv(int nthread_per_process, int tensor_parallel_size, bool sequence_parallel_enabled, int pipeline_parallel_size, int virtual_pipeline_parallel, - const utils::PrecisionCheckConfig& precision_config = utils::PrecisionCheckConfig()) { + const utils::PrecisionCheckConfig &precision_config = utils::PrecisionCheckConfig()) { GlobalEnv::Instance().Init(nthread_per_process, tensor_parallel_size, sequence_parallel_enabled, pipeline_parallel_size, virtual_pipeline_parallel, precision_config); } diff --git a/infini_train/include/utils/precision_check_config.h b/infini_train/include/utils/precision_check_config.h index f2b53a65..3966a544 100644 --- a/infini_train/include/utils/precision_check_config.h +++ b/infini_train/include/utils/precision_check_config.h @@ -8,16 +8,18 @@ namespace infini_train { namespace utils { struct PrecisionCheckConfig { - int level = 0; // 0=off, 1=module, 2=function - std::string output_path = ""; // empty=console(rank0), non-empty=file(all ranks) - bool output_md5 = false; // output MD5 hash or tensor values - std::string format = "simple"; // "simple" or "table" - std::string baseline_path = ""; // baseline file path for comparison + int level = 0; // 0=off, 1=module, 2=function + std::string output_path = ""; // empty=console(rank0), non-empty=file(all ranks) + bool output_md5 = false; // output MD5 hash or tensor values + std::string format = "simple"; // "simple" or "table" + std::string baseline_path = ""; // baseline file path for comparison // Parse from "key=value,key=value" string - static PrecisionCheckConfig Parse(const std::string& config_str) { + static PrecisionCheckConfig Parse(const std::string &config_str) { PrecisionCheckConfig config; - if (config_str.empty()) return config; + if (config_str.empty()) { + return config; + } std::unordered_map kv_map; std::istringstream ss(config_str); @@ -29,17 +31,25 @@ struct PrecisionCheckConfig { } } - if (kv_map.count("level")) config.level = std::stoi(kv_map["level"]); - 
if (kv_map.count("output_path")) config.output_path = kv_map["output_path"]; + if (kv_map.count("level")) { + config.level = std::stoi(kv_map["level"]); + } + if (kv_map.count("output_path")) { + config.output_path = kv_map["output_path"]; + } if (kv_map.count("output_md5")) { config.output_md5 = (kv_map["output_md5"] == "true" || kv_map["output_md5"] == "1"); } - if (kv_map.count("format")) config.format = kv_map["format"]; - if (kv_map.count("baseline")) config.baseline_path = kv_map["baseline"]; + if (kv_map.count("format")) { + config.format = kv_map["format"]; + } + if (kv_map.count("baseline")) { + config.baseline_path = kv_map["baseline"]; + } return config; } }; -} // namespace utils -} // namespace infini_train +} // namespace utils +} // namespace infini_train diff --git a/infini_train/include/utils/precision_check_context.h b/infini_train/include/utils/precision_check_context.h index 722825c7..11ec1f9d 100644 --- a/infini_train/include/utils/precision_check_context.h +++ b/infini_train/include/utils/precision_check_context.h @@ -9,18 +9,18 @@ namespace utils { // Thread-local to ensure thread safety in multi-threaded training class PrecisionCheckContext { public: - static PrecisionCheckContext& Instance() { + static PrecisionCheckContext &Instance() { static thread_local PrecisionCheckContext instance; return instance; } void SetGAS(int gas) { gas_ = gas; } void SetLayer(int layer) { layer_ = layer; } - void SetLayerName(const std::string& name) { layer_name_ = name; } + void SetLayerName(const std::string &name) { layer_name_ = name; } int GetGAS() const { return gas_; } int GetLayer() const { return layer_; } - const std::string& GetLayerName() const { return layer_name_; } + const std::string &GetLayerName() const { return layer_name_; } // Returns formatted key, e.g., "[GAS-0] [L-0] attn_out" std::string GetKey() const { @@ -46,5 +46,5 @@ class PrecisionCheckContext { std::string layer_name_; }; -} // namespace utils -} // namespace infini_train +} // namespace utils +} // namespace infini_train diff --git a/infini_train/include/utils/precision_checker.h b/infini_train/include/utils/precision_checker.h index 6c09202b..b3b03aa4 100644 --- a/infini_train/include/utils/precision_checker.h +++ b/infini_train/include/utils/precision_checker.h @@ -27,22 +27,21 @@ class PrecisionChecker { bool abort_on_error = false; }; - static const Config& DefaultConfig() { + static const Config &DefaultConfig() { static Config default_config; return default_config; } - static void RegisterForFunction(autograd::Function* func, const std::string& name = "", - const Config& config = DefaultConfig()); + static void RegisterForFunction(autograd::Function *func, const std::string &name = "", + const Config &config = DefaultConfig()); // Register hooks for a Module (checks forward inputs/outputs) - static void RegisterForModule(nn::Module* module, const std::string& name = "", - const Config& config = DefaultConfig()); + static void RegisterForModule(nn::Module *module, const std::string &name = "", + const Config &config = DefaultConfig()); private: - static void CheckTensors(const std::string& stage, const std::string& name, - const std::vector>& tensors, - const Config& config); + static void CheckTensors(const std::string &stage, const std::string &name, + const std::vector> &tensors, const Config &config); }; } // namespace utils diff --git a/infini_train/src/autograd/function.cc b/infini_train/src/autograd/function.cc index f10e9b10..3fe9c96d 100644 --- a/infini_train/src/autograd/function.cc 
+++ b/infini_train/src/autograd/function.cc @@ -29,7 +29,7 @@ std::vector> Function::Apply(const std::vector> Function::Apply(const std::vector &grad_output, int g && (dependencies_reached_ == dependencies_number_ || dependencies_number_ == 0)) { // Call backward pre-hooks - for (const auto& hook : backward_pre_hooks_) { + for (const auto &hook : backward_pre_hooks_) { if (hook) { hook(this, grad_outputs_); } @@ -120,7 +120,7 @@ void Function::BackwardPartial(const std::shared_ptr &grad_output, int g } // Call backward post-hooks - for (const auto& hook : backward_post_hooks_) { + for (const auto &hook : backward_post_hooks_) { if (hook) { hook(this, grad_inputs, grad_outputs_); } @@ -163,21 +163,25 @@ void Function::IncreaseDependenciesNumber() { ++dependencies_number_; } std::shared_ptr Function::RegisterForwardPreHook(FunctionPreHook hook) { forward_pre_hooks_.push_back(std::move(hook)); - return std::make_shared>(&forward_pre_hooks_, forward_pre_hooks_.size() - 1); + return std::make_shared>(&forward_pre_hooks_, + forward_pre_hooks_.size() - 1); } std::shared_ptr Function::RegisterForwardPostHook(FunctionPostHook hook) { forward_post_hooks_.push_back(std::move(hook)); - return std::make_shared>(&forward_post_hooks_, forward_post_hooks_.size() - 1); + return std::make_shared>(&forward_post_hooks_, + forward_post_hooks_.size() - 1); } std::shared_ptr Function::RegisterBackwardPreHook(FunctionPreHook hook) { backward_pre_hooks_.push_back(std::move(hook)); - return std::make_shared>(&backward_pre_hooks_, backward_pre_hooks_.size() - 1); + return std::make_shared>(&backward_pre_hooks_, + backward_pre_hooks_.size() - 1); } std::shared_ptr Function::RegisterBackwardPostHook(FunctionPostHook hook) { backward_post_hooks_.push_back(std::move(hook)); - return std::make_shared>(&backward_post_hooks_, backward_post_hooks_.size() - 1); + return std::make_shared>(&backward_post_hooks_, + backward_post_hooks_.size() - 1); } } // namespace infini_train::autograd diff --git a/infini_train/src/nn/modules/module.cc b/infini_train/src/nn/modules/module.cc index b80a37ab..d9938fc4 100644 --- a/infini_train/src/nn/modules/module.cc +++ b/infini_train/src/nn/modules/module.cc @@ -139,7 +139,7 @@ std::vector> Module::operator()(const std::vector> Module::operator()(const std::vector> Module::operator()(const std::vectorgrad_fn()) { if (!backward_pre_hooks_.empty()) { output->grad_fn()->RegisterBackwardPreHook( - [this](autograd::Function*, const std::vector>& grad_outputs) { - for (const auto& hook : backward_pre_hooks_) { - if (hook) hook(this, grad_outputs); + [this](autograd::Function *, const std::vector> &grad_outputs) { + for (const auto &hook : backward_pre_hooks_) { + if (hook) { + hook(this, grad_outputs); + } } }); } if (!backward_post_hooks_.empty()) { output->grad_fn()->RegisterBackwardPostHook( - [this](autograd::Function*, const std::vector>& grad_inputs, - const std::vector>& grad_outputs) { - for (const auto& hook : backward_post_hooks_) { - if (hook) hook(this, grad_inputs, grad_outputs); + [this](autograd::Function *, const std::vector> &grad_inputs, + const std::vector> &grad_outputs) { + for (const auto &hook : backward_post_hooks_) { + if (hook) { + hook(this, grad_inputs, grad_outputs); + } } }); } @@ -232,16 +236,19 @@ std::shared_ptr Module::RegisterForwardPreHook(ForwardPreHook std::shared_ptr Module::RegisterForwardPostHook(ForwardPostHook hook) { forward_post_hooks_.push_back(std::move(hook)); - return std::make_shared>(&forward_post_hooks_, forward_post_hooks_.size() - 1); + return 
std::make_shared>(&forward_post_hooks_, + forward_post_hooks_.size() - 1); } std::shared_ptr Module::RegisterBackwardPreHook(BackwardPreHook hook) { backward_pre_hooks_.push_back(std::move(hook)); - return std::make_shared>(&backward_pre_hooks_, backward_pre_hooks_.size() - 1); + return std::make_shared>(&backward_pre_hooks_, + backward_pre_hooks_.size() - 1); } std::shared_ptr Module::RegisterBackwardPostHook(BackwardPostHook hook) { backward_post_hooks_.push_back(std::move(hook)); - return std::make_shared>(&backward_post_hooks_, backward_post_hooks_.size() - 1); + return std::make_shared>(&backward_post_hooks_, + backward_post_hooks_.size() - 1); } } // namespace infini_train::nn diff --git a/infini_train/src/nn/parallel/global.cc b/infini_train/src/nn/parallel/global.cc index 09d93825..04da0a51 100644 --- a/infini_train/src/nn/parallel/global.cc +++ b/infini_train/src/nn/parallel/global.cc @@ -92,7 +92,7 @@ GlobalEnv &GlobalEnv::Instance() { void GlobalEnv::Init(int nthread_per_process, int tensor_parallel_size, bool sequence_parallel_enabled, int pipeline_parallel_size, int virtual_pipeline_parallel_size, - const utils::PrecisionCheckConfig& precision_config) { + const utils::PrecisionCheckConfig &precision_config) { std::lock_guard lock(mutex_); CHECK(!initialized_) << "Repeated initialization of GlobalEnv!"; @@ -194,11 +194,9 @@ Layout GlobalEnv::layout() const { return layout_; } -GlobalEnv::PrecisionCheckLevel GlobalEnv::GetPrecisionCheckLevel() const { - return precision_check_level_; -} +GlobalEnv::PrecisionCheckLevel GlobalEnv::GetPrecisionCheckLevel() const { return precision_check_level_; } -const utils::PrecisionCheckConfig& GlobalEnv::GetPrecisionCheckConfig() const { +const utils::PrecisionCheckConfig &GlobalEnv::GetPrecisionCheckConfig() const { CHECK(initialized_) << "GlobalEnv is not initialized!"; return precision_check_config_; } diff --git a/infini_train/src/utils/precision_checker.cc b/infini_train/src/utils/precision_checker.cc index bbb7a134..be025f41 100644 --- a/infini_train/src/utils/precision_checker.cc +++ b/infini_train/src/utils/precision_checker.cc @@ -28,8 +28,8 @@ class MD5 { public: MD5() { Init(); } - void Update(const void* data, size_t len) { - const uint8_t* ptr = static_cast(data); + void Update(const void *data, size_t len) { + const uint8_t *ptr = static_cast(data); size_t buffer_space = 64 - buffer_len_; if (len >= buffer_space) { @@ -61,23 +61,15 @@ class MD5 { Update(padding, pad_len); uint8_t len_bytes[8]; - for (int i = 0; i < 8; ++i) len_bytes[i] = (bits >> (i * 8)) & 0xff; + for (int i = 0; i < 8; ++i) { len_bytes[i] = (bits >> (i * 8)) & 0xff; } Update(len_bytes, 8); std::ostringstream oss; oss << std::hex << std::setfill('0'); - for (int i = 0; i < 4; ++i) { - oss << std::setw(2) << ((state_[0] >> (i * 8)) & 0xff); - } - for (int i = 0; i < 4; ++i) { - oss << std::setw(2) << ((state_[1] >> (i * 8)) & 0xff); - } - for (int i = 0; i < 4; ++i) { - oss << std::setw(2) << ((state_[2] >> (i * 8)) & 0xff); - } - for (int i = 0; i < 4; ++i) { - oss << std::setw(2) << ((state_[3] >> (i * 8)) & 0xff); - } + for (int i = 0; i < 4; ++i) { oss << std::setw(2) << ((state_[0] >> (i * 8)) & 0xff); } + for (int i = 0; i < 4; ++i) { oss << std::setw(2) << ((state_[1] >> (i * 8)) & 0xff); } + for (int i = 0; i < 4; ++i) { oss << std::setw(2) << ((state_[2] >> (i * 8)) & 0xff); } + for (int i = 0; i < 4; ++i) { oss << std::setw(2) << ((state_[3] >> (i * 8)) & 0xff); } return oss.str(); } @@ -97,26 +89,25 @@ class MD5 { static uint32_t I(uint32_t x, uint32_t 
y, uint32_t z) { return y ^ (x | ~z); } static uint32_t RotateLeft(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); } - void Transform(const uint8_t* block) { + void Transform(const uint8_t *block) { uint32_t a = state_[0], b = state_[1], c = state_[2], d = state_[3]; uint32_t x[16]; for (int i = 0; i < 16; ++i) { x[i] = block[i * 4] | (block[i * 4 + 1] << 8) | (block[i * 4 + 2] << 16) | (block[i * 4 + 3] << 24); } - static const uint32_t k[] = { - 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, - 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, - 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, - 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, - 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, - 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, - 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, - 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391}; - static const int s[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, - 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, - 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, - 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21}; + static const uint32_t k[] + = {0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, + 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, + 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, + 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, + 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, + 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391}; + static const int s[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 5, 9, 14, 20, 5, 9, + 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, + 4, 11, 16, 23, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21}; for (int i = 0; i < 64; ++i) { uint32_t f, g; @@ -152,14 +143,14 @@ class MD5 { uint64_t total_len_; }; -std::string ComputeMD5(const void* data, size_t size) { +std::string ComputeMD5(const void *data, size_t size) { MD5 md5; md5.Update(data, size); return md5.Finalize(); } // Baseline storage -std::unordered_map& GetBaseline() { +std::unordered_map &GetBaseline() { static std::unordered_map baseline; static bool loaded = false; static std::mutex load_mutex; @@ -167,7 +158,7 @@ std::unordered_map& GetBaseline() { if (!loaded) { std::lock_guard lock(load_mutex); if (!loaded) { - const auto& config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig(); + const auto &config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig(); if (!config.baseline_path.empty()) { std::ifstream file(config.baseline_path); std::string line; @@ -190,12 +181,12 @@ std::unordered_map& GetBaseline() { } // Table header printed flag 
-bool& TableHeaderPrinted() { +bool &TableHeaderPrinted() { static bool printed = false; return printed; } -std::ostream& GetLogStream() { +std::ostream &GetLogStream() { static std::ofstream log_file; static std::mutex init_mutex; static bool initialized = false; @@ -204,7 +195,7 @@ std::ostream& GetLogStream() { if (!initialized) { std::lock_guard lock(init_mutex); if (!initialized) { - const auto& config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig(); + const auto &config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig(); if (config.output_path.empty()) { use_console = true; @@ -223,7 +214,7 @@ std::ostream& GetLogStream() { } bool ShouldPrint() { - const auto& config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig(); + const auto &config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig(); if (!config.output_path.empty()) { return true; } @@ -245,11 +236,13 @@ std::string GetTimestamp() { return oss.str(); } -std::string FormatShape(const std::vector& shape) { +std::string FormatShape(const std::vector &shape) { std::ostringstream oss; oss << "("; for (size_t i = 0; i < shape.size(); ++i) { - if (i > 0) oss << ", "; + if (i > 0) { + oss << ", "; + } oss << shape[i]; } oss << ")"; @@ -258,17 +251,25 @@ std::string FormatShape(const std::vector& shape) { std::string DataTypeToString(DataType dtype) { switch (dtype) { - case DataType::kFLOAT32: return "float32"; - case DataType::kFLOAT16: return "float16"; - case DataType::kBFLOAT16: return "bfloat16"; - case DataType::kINT32: return "int32"; - case DataType::kINT64: return "int64"; - default: return "unknown"; + case DataType::kFLOAT32: + return "float32"; + case DataType::kFLOAT16: + return "float16"; + case DataType::kBFLOAT16: + return "bfloat16"; + case DataType::kINT32: + return "int32"; + case DataType::kINT64: + return "int64"; + default: + return "unknown"; } } -void PrintTableHeader(std::ostream& os) { - if (TableHeaderPrinted()) return; +void PrintTableHeader(std::ostream &os) { + if (TableHeaderPrinted()) { + return; + } TableHeaderPrinted() = true; os << "+" << std::string(50, '-') << "+" << std::string(7, '-') << "+" << std::string(18, '-') << "+" @@ -284,16 +285,18 @@ void PrintTableHeader(std::ostream& os) { << std::string(15, '-') << "+" << std::string(10, '-') << "+" << std::string(10, '-') << "+\n"; } -void PrintTableRow(std::ostream& os, const std::string& key, int level, const std::string& shape, - const std::string& dtype, const std::string& same_hash, const std::string& diff_order) { +void PrintTableRow(std::ostream &os, const std::string &key, int level, const std::string &shape, + const std::string &dtype, const std::string &same_hash, const std::string &diff_order) { os << "| " << std::left << std::setw(49) << key.substr(0, 49) << "| " << std::setw(6) << level << "| " - << std::setw(17) << shape.substr(0, 17) << "| " << std::setw(14) << dtype << "| " << std::setw(9) - << same_hash << "| " << std::setw(9) << diff_order << "|\n"; + << std::setw(17) << shape.substr(0, 17) << "| " << std::setw(14) << dtype << "| " << std::setw(9) << same_hash + << "| " << std::setw(9) << diff_order << "|\n"; } // Calculate diff order between two tensors (returns string like "1e-3" or "0") -std::string CalculateDiffOrder(const float* data1, const float* data2, size_t size) { - if (!data1 || !data2 || size == 0) return "N/A"; +std::string CalculateDiffOrder(const float *data1, const float *data2, size_t size) { + if (!data1 || !data2 || size == 0) 
{ + return "N/A"; + } double max_diff = 0.0; for (size_t i = 0; i < size; ++i) { @@ -301,29 +304,33 @@ std::string CalculateDiffOrder(const float* data1, const float* data2, size_t si max_diff = std::max(max_diff, diff); } - if (max_diff == 0.0) return "0"; + if (max_diff == 0.0) { + return "0"; + } int order = static_cast(std::floor(std::log10(max_diff))); return "1e" + std::to_string(order); } -} // namespace +} // namespace -void PrecisionChecker::CheckTensors(const std::string& stage, const std::string& name, - const std::vector>& tensors, const Config& config) { +void PrecisionChecker::CheckTensors(const std::string &stage, const std::string &name, + const std::vector> &tensors, const Config &config) { if (!ShouldPrint()) { return; } - const auto& global_config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig(); + const auto &global_config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig(); int rank = nn::parallel::global::GlobalEnv::Instance().global_proc_rank(); int level = global_config.level; - auto& baseline = GetBaseline(); + auto &baseline = GetBaseline(); for (size_t i = 0; i < tensors.size(); ++i) { - if (!tensors[i]) continue; + if (!tensors[i]) { + continue; + } - auto& tensor = tensors[i]; + auto &tensor = tensors[i]; // Copy tensor to CPU if it's on GPU std::shared_ptr cpu_tensor; @@ -334,7 +341,7 @@ void PrecisionChecker::CheckTensors(const std::string& stage, const std::string& cpu_tensor = tensor; } - const void* data = cpu_tensor->DataPtr(); + const void *data = cpu_tensor->DataPtr(); size_t byte_size = cpu_tensor->SizeInBytes(); size_t num_elements = cpu_tensor->NumElements(); @@ -362,7 +369,7 @@ void PrecisionChecker::CheckTensors(const std::string& stage, const std::string& } } - auto& log_stream = GetLogStream(); + auto &log_stream = GetLogStream(); if (global_config.format == "table") { static bool header_printed = false; @@ -375,14 +382,18 @@ void PrecisionChecker::CheckTensors(const std::string& stage, const std::string& DataTypeToString(cpu_tensor->Dtype()), same_hash_str, diff_order); } else { // Simple format - const float* float_data = static_cast(data); + const float *float_data = static_cast(data); bool has_nan = false; bool has_inf = false; for (size_t j = 0; j < num_elements; ++j) { float val = float_data[j]; - if (std::isnan(val)) has_nan = true; - if (std::isinf(val)) has_inf = true; + if (std::isnan(val)) { + has_nan = true; + } + if (std::isinf(val)) { + has_inf = true; + } } bool has_error = (config.check_nan && has_nan) || (config.check_inf && has_inf); @@ -395,19 +406,29 @@ void PrecisionChecker::CheckTensors(const std::string& stage, const std::string& if (global_config.output_md5) { log_stream << "md5=" << md5; - if (!same_hash) log_stream << " (MISMATCH)"; + if (!same_hash) { + log_stream << " (MISMATCH)"; + } } else { log_stream << "["; - if (has_nan) log_stream << " NaN detected!"; - if (has_inf) log_stream << " Inf detected!"; + if (has_nan) { + log_stream << " NaN detected!"; + } + if (has_inf) { + log_stream << " Inf detected!"; + } if (config.print_stats) { constexpr size_t max_print = 6; for (size_t j = 0; j < std::min(num_elements, max_print); ++j) { - if (j > 0) log_stream << ", "; + if (j > 0) { + log_stream << ", "; + } log_stream << float_data[j]; } - if (num_elements > max_print) log_stream << ", ..."; + if (num_elements > max_print) { + log_stream << ", ..."; + } } log_stream << "]"; } @@ -427,44 +448,45 @@ void PrecisionChecker::CheckTensors(const std::string& stage, const std::string& } 
} -void PrecisionChecker::RegisterForFunction(autograd::Function* func, const std::string& name, const Config& config) { +void PrecisionChecker::RegisterForFunction(autograd::Function *func, const std::string &name, const Config &config) { std::string func_name = name.empty() ? "Function" : name; func->RegisterForwardPreHook( - [func_name, config](autograd::Function*, const std::vector>& inputs) { + [func_name, config](autograd::Function *, const std::vector> &inputs) { CheckTensors("Forward Input", func_name, inputs, config); }); - func->RegisterForwardPostHook([func_name, config](autograd::Function*, const std::vector>&, - const std::vector>& outputs) { + func->RegisterForwardPostHook([func_name, config](autograd::Function *, + const std::vector> &, + const std::vector> &outputs) { CheckTensors("Forward Output", func_name, outputs, config); }); func->RegisterBackwardPreHook( - [func_name, config](autograd::Function*, const std::vector>& grad_outputs) { + [func_name, config](autograd::Function *, const std::vector> &grad_outputs) { CheckTensors("Backward Input", func_name, grad_outputs, config); }); - func->RegisterBackwardPostHook([func_name, config](autograd::Function*, - const std::vector>& grad_inputs, - const std::vector>&) { + func->RegisterBackwardPostHook([func_name, config](autograd::Function *, + const std::vector> &grad_inputs, + const std::vector> &) { CheckTensors("Backward Output", func_name, grad_inputs, config); }); } -void PrecisionChecker::RegisterForModule(nn::Module* module, const std::string& name, const Config& config) { +void PrecisionChecker::RegisterForModule(nn::Module *module, const std::string &name, const Config &config) { std::string module_name = name.empty() ? module->type() : name; - module->RegisterForwardPostHook([module_name, config](nn::Module*, const std::vector>&, - const std::vector>& outputs) { + module->RegisterForwardPostHook([module_name, config](nn::Module *, const std::vector> &, + const std::vector> &outputs) { CheckTensors("Module Forward Output", module_name, outputs, config); }); - module->RegisterBackwardPostHook([module_name, config](nn::Module*, - const std::vector>& grad_inputs, - const std::vector>&) { + module->RegisterBackwardPostHook([module_name, config](nn::Module *, + const std::vector> &grad_inputs, + const std::vector> &) { CheckTensors("Module Backward Output", module_name, grad_inputs, config); }); } -} // namespace infini_train::utils +} // namespace infini_train::utils diff --git a/test/hook/test_hook.cc b/test/hook/test_hook.cc index 25f76ca2..8763d7c1 100644 --- a/test/hook/test_hook.cc +++ b/test/hook/test_hook.cc @@ -27,8 +27,7 @@ void test_basic_hooks() { public: MyModule() : Module("MyModule") {} - std::vector> Forward( - const std::vector>& inputs) override { + std::vector> Forward(const std::vector> &inputs) override { std::cout << "Forward pass executing..." 
<< std::endl; return inputs; } @@ -37,36 +36,30 @@ void test_basic_hooks() { auto module = std::make_shared(); // Register forward pre-hook - auto pre_hook = module->RegisterForwardPreHook( - [](nn::Module* mod, const std::vector>& inputs) { - std::cout << "Forward pre-hook: Module type = " << mod->type() << std::endl; - } - ); + auto pre_hook + = module->RegisterForwardPreHook([](nn::Module *mod, const std::vector> &inputs) { + std::cout << "Forward pre-hook: Module type = " << mod->type() << std::endl; + }); // Register forward post-hook - auto fwd_hook = module->RegisterForwardPostHook( - [](nn::Module* mod, - const std::vector>& inputs, - const std::vector>& outputs) { - std::cout << "Forward post-hook: Got " << outputs.size() << " outputs" << std::endl; - } - ); + auto fwd_hook + = module->RegisterForwardPostHook([](nn::Module *mod, const std::vector> &inputs, + const std::vector> &outputs) { + std::cout << "Forward post-hook: Got " << outputs.size() << " outputs" << std::endl; + }); // Register backward pre-hook auto bwd_pre_hook = module->RegisterBackwardPreHook( - [](nn::Module* mod, const std::vector>& grad_outputs) { + [](nn::Module *mod, const std::vector> &grad_outputs) { std::cout << "Backward pre-hook called!" << std::endl; - } - ); + }); // Register backward post-hook - auto bwd_post_hook = module->RegisterBackwardPostHook( - [](nn::Module* mod, - const std::vector>& grad_inputs, - const std::vector>& grad_outputs) { - std::cout << "Backward post-hook called!" << std::endl; - } - ); + auto bwd_post_hook + = module->RegisterBackwardPostHook([](nn::Module *mod, const std::vector> &grad_inputs, + const std::vector> &grad_outputs) { + std::cout << "Backward post-hook called!" << std::endl; + }); // Test forward pass std::vector> inputs = {x}; @@ -94,25 +87,22 @@ void test_hook_remove() { // Register three forward pre-hooks auto handle1 = add_fn->RegisterForwardPreHook( - [&hook1_count](autograd::Function*, const std::vector>&) { + [&hook1_count](autograd::Function *, const std::vector> &) { hook1_count++; std::cout << "Hook 1 called (count: " << hook1_count << ")" << std::endl; - } - ); + }); auto handle2 = add_fn->RegisterForwardPreHook( - [&hook2_count](autograd::Function*, const std::vector>&) { + [&hook2_count](autograd::Function *, const std::vector> &) { hook2_count++; std::cout << "Hook 2 called (count: " << hook2_count << ")" << std::endl; - } - ); + }); auto handle3 = add_fn->RegisterForwardPreHook( - [&hook3_count](autograd::Function*, const std::vector>&) { + [&hook3_count](autograd::Function *, const std::vector> &) { hook3_count++; std::cout << "Hook 3 called (count: " << hook3_count << ")" << std::endl; - } - ); + }); // First call - all hooks should fire std::cout << "\n--- First Apply (all hooks active) ---" << std::endl; @@ -170,7 +160,7 @@ void test_hook_remove() { // ============================================================================ // Main // ============================================================================ -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { google::InitGoogleLogging(argv[0]); nn::parallel::global::GlobalEnv::Instance().Init(0, 1, 1, 1, 1); diff --git a/test/hook/test_precision_check.cc b/test/hook/test_precision_check.cc index d3fa7ed2..a0c2dc1a 100644 --- a/test/hook/test_precision_check.cc +++ b/test/hook/test_precision_check.cc @@ -14,8 +14,7 @@ class MyModel : public nn::Module { public: MyModel() : Module("MyModel") {} - std::vector> Forward( - const std::vector>& inputs) override { + 
std::vector> Forward(const std::vector> &inputs) override { auto x = inputs[0]; x->RequiresGrad(); auto y = x->Mul(x); @@ -23,7 +22,7 @@ class MyModel : public nn::Module { } }; -void TestFunctionLevel(const std::string& config_str) { +void TestFunctionLevel(const std::string &config_str) { std::cout << "\n========================================" << std::endl; std::cout << " Function-Level Test: " << config_str << std::endl; std::cout << "========================================" << std::endl; @@ -61,7 +60,7 @@ void TestModuleLevel() { std::cout << "Test completed." << std::endl; } -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { google::InitGoogleLogging(argv[0]); std::string config_str = argc > 1 ? argv[1] : "level=2"; From 95ae35d208712b648231d84477fbbd5ab5dd7f96 Mon Sep 17 00:00:00 2001 From: chen Date: Wed, 14 Jan 2026 08:38:19 +0000 Subject: [PATCH 4/6] refactor: unify hook infrastructure and enhance documentation - Unify Function and Module hook infrastructure into common/hook.h - Remove duplicated HookHandle and HookHandleImpl classes - Update precision_checker_guide.md and hook_mechanism.md --- docs/hook_mechanism.md | 138 ++++++++++++++++-- docs/precision_checker_guide.md | 14 +- infini_train/include/autograd/accumulate.h | 2 - infini_train/include/autograd/function.h | 15 +- infini_train/include/autograd/function_hook.h | 22 --- infini_train/include/autograd/tensor_hook.h | 31 ---- infini_train/include/common/hook.h | 30 ++++ infini_train/include/nn/module_hook.h | 55 ------- infini_train/include/nn/modules/module.h | 25 ++-- .../include/utils/precision_checker.h | 2 +- infini_train/src/autograd/function.cc | 26 +--- infini_train/src/nn/modules/module.cc | 21 ++- test/hook/test_hook.cc | 2 +- 13 files changed, 196 insertions(+), 187 deletions(-) delete mode 100644 infini_train/include/autograd/tensor_hook.h create mode 100644 infini_train/include/common/hook.h delete mode 100644 infini_train/include/nn/module_hook.h diff --git a/docs/hook_mechanism.md b/docs/hook_mechanism.md index 34aa9607..4494f37f 100644 --- a/docs/hook_mechanism.md +++ b/docs/hook_mechanism.md @@ -15,6 +15,12 @@ auto handle = module->RegisterForwardPreHook( ); ``` +**调用栈**: +``` +Module::operator()(inputs) + └─> for (hook : forward_pre_hooks_) { hook(this, inputs); } +``` + ### 1.2 Forward Post-Hook 在 forward 执行后调用。 @@ -28,6 +34,13 @@ auto handle = module->RegisterForwardPostHook( ); ``` +**调用栈**: +``` +Module::operator()(inputs) + ├─> outputs = Forward(inputs) + └─> for (hook : forward_post_hooks_) { hook(this, inputs, outputs); } +``` + ### 1.3 Backward Pre-Hook 在 backward 执行前调用。 @@ -39,6 +52,21 @@ auto handle = module->RegisterBackwardPreHook( ); ``` +**调用栈**: +``` +Module::operator()(inputs) + ├─> outputs = Forward(inputs) + └─> for (output : outputs) { + output->grad_fn()->RegisterBackwardPreHook([module_hooks] { + for (hook : module_hooks) { hook(module, grad_outputs); } + }); + } + +反向传播时: +Function::BackwardPartial() + └─> for (hook : backward_pre_hooks_) { hook(this, grad_outputs); } +``` + ### 1.4 Backward Post-Hook 在 backward 执行后调用。 @@ -52,6 +80,22 @@ auto handle = module->RegisterBackwardPostHook( ); ``` +**调用栈**: +``` +Module::operator()(inputs) + ├─> outputs = Forward(inputs) + └─> for (output : outputs) { + output->grad_fn()->RegisterBackwardPostHook([module_hooks] { + for (hook : module_hooks) { hook(module, grad_inputs, grad_outputs); } + }); + } + +反向传播时: +Function::BackwardPartial() + ├─> grad_inputs = Backward(grad_outputs) + └─> for (hook : backward_post_hooks_) { 
hook(this, grad_inputs, grad_outputs); } +``` + ### 使用场景 - 特征提取和可视化 - 激活值监控 @@ -59,14 +103,17 @@ auto handle = module->RegisterBackwardPostHook( - 性能分析和 profiling ### 实现位置 +- `infini_train/include/nn/modules/module.h` - `infini_train/include/nn/module_hook.h` -- Module hooks 在 `Module::operator()` 中被调用(forward_pre_hooks_ 和 forward_post_hooks_) +- Module forward hooks 在 `Module::operator()` 中被调用 +- Module backward hooks 在 `Module::operator()` 中注册到输出 tensor 的 `grad_fn`,在反向传播时由 `Function::BackwardPartial()` 调用 - 子类只需重写 `Forward()` 方法,hooks 会自动执行 ### 使用说明 - **调用方式**: 使用 `(*module)(inputs)` 而不是 `module->Forward(inputs)` - **子类实现**: 只需重写 `Forward()` 方法,不需要手动调用 hooks -- **Hook 自动执行**: `operator()` 会自动调用 pre-hooks、Forward、post-hooks +- **Hook 自动执行**: `operator()` 会自动调用 forward pre-hooks、Forward、forward post-hooks +- **Backward Hook 执行**: Module 的 backward hooks 会在 `operator()` 中注册到输出 tensor 的 `grad_fn` 上,在反向传播时自动执行 ## 2. Function Hooks @@ -85,6 +132,12 @@ auto handle = function->RegisterForwardPreHook( ); ``` +**调用栈**: +``` +Function::Apply(inputs) + └─> for (hook : forward_pre_hooks_) { hook(this, inputs); } +``` + ### 2.2 Function Forward Post-Hook 在 Function 的 forward 执行后调用。 @@ -98,6 +151,13 @@ auto handle = function->RegisterForwardPostHook( ); ``` +**调用栈**: +``` +Function::Apply(inputs) + ├─> outputs = Forward(inputs) + └─> for (hook : forward_post_hooks_) { hook(this, inputs, outputs); } +``` + ### 2.3 Function Backward Pre-Hook 在 Function 的 backward 执行前调用。 @@ -109,6 +169,14 @@ auto handle = function->RegisterBackwardPreHook( ); ``` +**调用栈**: +``` +Function::BackwardPartial(grad_output, idx) + ├─> 累积 grad_outputs + └─> 当所有依赖满足时: + for (hook : backward_pre_hooks_) { hook(this, grad_outputs); } +``` + ### 2.4 Function Backward Post-Hook 在 Function 的 backward 执行后调用。 @@ -122,6 +190,15 @@ auto handle = function->RegisterBackwardPostHook( ); ``` +**调用栈**: +``` +Function::BackwardPartial(grad_output, idx) + ├─> 累积 grad_outputs + └─> 当所有依赖满足时: + ├─> grad_inputs = Backward(grad_outputs) + └─> for (hook : backward_post_hooks_) { hook(this, grad_inputs, grad_outputs); } +``` + ### 使用场景 - 算子级别的性能分析 - 中间结果监控 @@ -134,32 +211,54 @@ auto handle = function->RegisterBackwardPostHook( - Function forward hooks 在 `Function::Apply()` 中被调用 - Function backward hooks 在 `Function::BackwardPartial()` 中被调用 -## 3. Hook 类型简化 +## 3. Hook 基础设施统一 + +为了减少代码重复,Function 和 Module 的 hook 基础设施已统一到 `infini_train/include/common/hook.h`: + +```cpp +// 统一的 HookHandle 基类 +class HookHandle { +public: + virtual ~HookHandle() = default; + virtual void Remove() = 0; +}; + +// 统一的 HookHandleImpl 模板 +template +class HookHandleImpl : public HookHandle { + // 实现细节... +}; +``` -为了减少冗余,Function hooks 使用了统一的类型定义: +Function 和 Module 使用各自的 hook 类型定义: ```cpp -// 在 function.h 中定义 +// Function hooks (在 function.h 中定义) using FunctionPreHook = std::function>&)>; using FunctionPostHook = std::function>&, const std::vector>&)>; + +// Module hooks (在 module_hook.h 中定义) +using ModulePreHook = std::function>&)>; +using ModulePostHook = std::function>&, + const std::vector>&)>; ``` -- `FunctionPreHook` 用于 Forward Pre-Hook 和 Backward Pre-Hook(签名相同) -- `FunctionPostHook` 用于 Forward Post-Hook 和 Backward Post-Hook(签名相同) +- `FunctionPreHook` / `ModulePreHook` 用于 Forward Pre-Hook 和 Backward Pre-Hook(签名相同) +- `FunctionPostHook` / `ModulePostHook` 用于 Forward Post-Hook 和 Backward Post-Hook(签名相同) ## 4. 
Hook Handle 和移除机制 所有 hook 注册函数都返回 `std::shared_ptr`,可用于移除 hook: ```cpp -auto handle = function->RegisterForwardPreHook(...); +auto handle = module->RegisterForwardPreHook(...); // 移除 hook handle->Remove(); ``` -移除后的 hook 会被设置为 `nullptr`,不会影响其他 hook 的执行。 +移除后的 hook 会被设置为 `nullptr`,在执行时会被跳过,不会影响其他 hook 的执行。 ## 5. 调用流程 @@ -177,12 +276,18 @@ Module::operator() ### Backward Pass ``` -Function::BackwardPartial() - ├─> Backward Pre-Hooks - ├─> Backward() - └─> Backward Post-Hooks +Tensor::Backward() + └─> Function::BackwardPartial() + ├─> 累积 grad_outputs (等待所有依赖) + └─> 当所有依赖满足时: + ├─> Backward Pre-Hooks (包括 Module backward pre-hooks) + ├─> Backward() + ├─> Backward Post-Hooks (包括 Module backward post-hooks) + └─> 传播梯度到下一层 ``` +注:Module backward hooks 在 forward 时注册到输出 tensor 的 `grad_fn`,在反向传播时由 Function 层执行。 + ## 6. 示例代码 参见: @@ -192,7 +297,8 @@ Function::BackwardPartial() ## 7. 注意事项 1. Hook 按注册顺序执行 -2. 移除的 hook 会被设置为 nullptr,不会影响其他 hook -3. **Module 调用**: 使用 `(*module)(inputs)` 而不是 `module->Forward(inputs)` +2. 移除的 hook 会被设置为 nullptr,执行时会被跳过 +3. **Module 调用**: 使用 `(*module)(inputs)` 而不是 `module->Forward(inputs)` 才能触发 hooks 4. **Module 子类**: 只需重写 `Forward()` 方法,hooks 会自动执行 -5. Function hooks 在 Function::Apply() 和 Function::BackwardPartial() 中自动调用 +5. **Module backward hooks**: 在 forward 时注册到输出 tensor 的 `grad_fn`,在反向传播时自动执行 +6. Function hooks 在 `Function::Apply()` 和 `Function::BackwardPartial()` 中自动调用 diff --git a/docs/precision_checker_guide.md b/docs/precision_checker_guide.md index e33eef22..66535ff7 100644 --- a/docs/precision_checker_guide.md +++ b/docs/precision_checker_guide.md @@ -155,16 +155,10 @@ nn::parallel::global::InitAllEnv(8, 2, false, 2, 1, config); ```bash # 基本检查 -./llama3 --device cuda \ - --input_bin /path/to/data.bin \ - --llmc_filepath /path/to/model.bin \ - --precision_check "level=2" +./llama3 --precision_check "level=2" # 表格格式 + MD5 -./llama3 --device cuda \ - --input_bin /path/to/data.bin \ - --llmc_filepath /path/to/model.bin \ - --precision_check "level=2,format=table,output_md5=true" +./llama3 --precision_check "level=2,format=table,output_md5=true" ``` ## 上下文追踪 @@ -273,10 +267,6 @@ Forward Pass: ├─> Pre-Hook: 检查输入 ├─> Forward: 执行计算 └─> Post-Hook: 检查输出 - ├─> 检测 NaN/Inf - ├─> 计算 MD5(如果需要) - ├─> 与基准对比(如果有) - └─> 输出结果 Backward Pass: ├─> Backward Pre-Hook: 检查梯度输入 diff --git a/infini_train/include/autograd/accumulate.h b/infini_train/include/autograd/accumulate.h index a8e41e67..f3519cb1 100644 --- a/infini_train/include/autograd/accumulate.h +++ b/infini_train/include/autograd/accumulate.h @@ -18,8 +18,6 @@ class AccumulateGrad final : public Function { std::vector> Backward(const std::vector> &) override; - std::shared_ptr tensor() const { return tensor_; } - private: std::shared_ptr tensor_ = nullptr; float learning_rate_ = 1.0f; diff --git a/infini_train/include/autograd/function.h b/infini_train/include/autograd/function.h index a7665f84..651b6bf3 100644 --- a/infini_train/include/autograd/function.h +++ b/infini_train/include/autograd/function.h @@ -7,13 +7,16 @@ namespace infini_train { class Tensor; -} +class HookHandle; +template class HookHandleImpl; +} // namespace infini_train namespace infini_train::autograd { -class HookHandle; class Function : public std::enable_shared_from_this { public: + template using FunctionHookHandleImpl = infini_train::HookHandleImpl; + using FunctionPreHook = std::function> &)>; using FunctionPostHook = std::function> &, const std::vector> &)>; @@ -35,10 +38,10 @@ class Function : public std::enable_shared_from_this { void 
     IncreaseDependenciesNumber();

-    std::shared_ptr RegisterForwardPreHook(FunctionPreHook hook);
-    std::shared_ptr RegisterForwardPostHook(FunctionPostHook hook);
-    std::shared_ptr RegisterBackwardPreHook(FunctionPreHook hook);
-    std::shared_ptr RegisterBackwardPostHook(FunctionPostHook hook);
+    std::shared_ptr RegisterForwardPreHook(FunctionPreHook hook);
+    std::shared_ptr RegisterForwardPostHook(FunctionPostHook hook);
+    std::shared_ptr RegisterBackwardPreHook(FunctionPreHook hook);
+    std::shared_ptr RegisterBackwardPostHook(FunctionPostHook hook);

     const std::string &type() const { return type_; }
diff --git a/infini_train/include/autograd/function_hook.h b/infini_train/include/autograd/function_hook.h
index 6ece328c..4e4a31f7 100644
--- a/infini_train/include/autograd/function_hook.h
+++ b/infini_train/include/autograd/function_hook.h
@@ -17,12 +17,6 @@ class ProcessGroup;
 namespace infini_train::autograd {
 class Function;

-class HookHandle {
-public:
-    virtual ~HookHandle() = default;
-    virtual void Remove() = 0;
-};
-
 class PostAccumulateGradHook {
 public:
     virtual void operator()(const std::shared_ptr &tensor) = 0;
@@ -41,20 +35,4 @@ class AllReducePostAccumulateHook : public PostAccumulateGradHook {
     const infini_train::nn::parallel::ProcessGroup *pg_ = nullptr;
 };

-template class FunctionHookHandleImpl : public HookHandle {
-public:
-    FunctionHookHandleImpl(std::vector *hooks, size_t id) : hooks_(hooks), id_(id) {}
-
-    void Remove() override {
-        if (!removed_ && hooks_ && id_ < hooks_->size()) {
-            (*hooks_)[id_] = nullptr;
-            removed_ = true;
-        }
-    }
-
-private:
-    std::vector *hooks_;
-    size_t id_;
-    bool removed_ = false;
-};
 } // namespace infini_train::autograd
diff --git a/infini_train/include/autograd/tensor_hook.h b/infini_train/include/autograd/tensor_hook.h
deleted file mode 100644
index 749da025..00000000
--- a/infini_train/include/autograd/tensor_hook.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-
-#include "infini_train/include/autograd/function_hook.h"
-
-namespace infini_train {
-class Tensor;
-
-namespace autograd {
-
-// Tensor backward hook: modifies gradient during backward pass
-// Returns modified gradient or nullptr to keep original
-using TensorBackwardHook = std::function(const std::shared_ptr &)>;
-
-class TensorBackwardHookHandle : public HookHandle {
-public:
-    TensorBackwardHookHandle(std::vector *hooks, size_t id) : hooks_(hooks), id_(id) {}
-
-    void Remove() override;
-
-private:
-    std::vector *hooks_;
-    size_t id_;
-    bool removed_ = false;
-};
-
-} // namespace autograd
-} // namespace infini_train
diff --git a/infini_train/include/common/hook.h b/infini_train/include/common/hook.h
new file mode 100644
index 00000000..96d77c9b
--- /dev/null
+++ b/infini_train/include/common/hook.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include
+
+namespace infini_train {
+
+class HookHandle {
+public:
+    virtual ~HookHandle() = default;
+    virtual void Remove() = 0;
+};
+
+template class HookHandleImpl : public HookHandle {
+public:
+    HookHandleImpl(std::vector *hooks, size_t id) : hooks_(hooks), id_(id) {}
+
+    void Remove() override {
+        if (!removed_ && hooks_ && id_ < hooks_->size()) {
+            (*hooks_)[id_] = nullptr;
+            removed_ = true;
+        }
+    }
+
+private:
+    std::vector *hooks_;
+    size_t id_;
+    bool removed_ = false;
+};
+
+} // namespace infini_train
diff --git a/infini_train/include/nn/module_hook.h b/infini_train/include/nn/module_hook.h
deleted file mode 100644
index ffff0d0c..00000000
--- a/infini_train/include/nn/module_hook.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-
-namespace infini_train {
-class Tensor;
-
-namespace nn {
-class Module;
-
-// Forward pre-hook: called before forward pass
-// Args: (module, input_tensors)
-using ForwardPreHook = std::function> &)>;
-
-// Forward post-hook: called after forward pass
-// Args: (module, input_tensors, output_tensors)
-using ForwardPostHook = std::function> &,
-                                      const std::vector> &)>;
-
-// Backward pre-hook: called before backward pass
-// Args: (module, grad_output)
-using BackwardPreHook = std::function> &)>;
-
-// Backward post-hook: called after backward pass
-// Args: (module, grad_input, grad_output)
-using BackwardPostHook = std::function> &,
-                                       const std::vector> &)>;
-
-class ModuleHookHandle {
-public:
-    virtual ~ModuleHookHandle() = default;
-    virtual void Remove() = 0;
-};
-
-template class ModuleHookHandleImpl : public ModuleHookHandle {
-public:
-    ModuleHookHandleImpl(std::vector *hooks, size_t id) : hooks_(hooks), id_(id) {}
-
-    void Remove() override {
-        if (!removed_ && hooks_ && id_ < hooks_->size()) {
-            (*hooks_)[id_] = nullptr;
-            removed_ = true;
-        }
-    }
-
-private:
-    std::vector *hooks_;
-    size_t id_;
-    bool removed_ = false;
-};
-
-} // namespace nn
-} // namespace infini_train
diff --git a/infini_train/include/nn/modules/module.h b/infini_train/include/nn/modules/module.h
index 266684c5..43d77de6 100644
--- a/infini_train/include/nn/modules/module.h
+++ b/infini_train/include/nn/modules/module.h
@@ -7,11 +7,12 @@
 #include

 #include "infini_train/include/datatype.h"
-#include "infini_train/include/nn/module_hook.h"

 namespace infini_train {
 class Tensor;
 class Device;
+class HookHandle;
+template class HookHandleImpl;
 } // namespace infini_train

 namespace infini_train::nn {
@@ -24,6 +25,12 @@ std::vector> Replicate(const std::shared_ptr &ne
 class Module : public std::enable_shared_from_this {
 public:
+    template using ModuleHookHandleImpl = infini_train::HookHandleImpl;
+
+    using ModulePreHook = std::function> &)>;
+    using ModulePostHook = std::function> &,
+                                         const std::vector> &)>;
+
     static constexpr char kUndefinedType[] = "Undefined";

     static constexpr char kPPFirstStageName[] = "__pp_first_stage";
@@ -72,10 +79,10 @@ class Module : public std::enable_shared_from_this {
     virtual std::shared_ptr ReplicateForDataParallel(int device_idx) const;

     // Hook registration methods
-    std::shared_ptr RegisterForwardPreHook(ForwardPreHook hook);
-    std::shared_ptr RegisterForwardPostHook(ForwardPostHook hook);
-    std::shared_ptr RegisterBackwardPreHook(BackwardPreHook hook);
-    std::shared_ptr RegisterBackwardPostHook(BackwardPostHook hook);
+    std::shared_ptr RegisterForwardPreHook(ModulePreHook hook);
+    std::shared_ptr RegisterForwardPostHook(ModulePostHook hook);
+    std::shared_ptr RegisterBackwardPreHook(ModulePreHook hook);
+    std::shared_ptr RegisterBackwardPostHook(ModulePostHook hook);

 protected:
     const Device *device_ = nullptr;
@@ -84,10 +91,10 @@ class Module : public std::enable_shared_from_this {
     std::unordered_map> parameters_;
     std::unordered_map> buffers_;

-    std::vector forward_pre_hooks_;
-    std::vector forward_post_hooks_;
-    std::vector backward_pre_hooks_;
-    std::vector backward_post_hooks_;
+    std::vector forward_pre_hooks_;
+    std::vector forward_post_hooks_;
+    std::vector backward_pre_hooks_;
+    std::vector backward_post_hooks_;

     bool precision_check_registered_ = false;

 private:
diff --git a/infini_train/include/utils/precision_checker.h b/infini_train/include/utils/precision_checker.h
index b3b03aa4..060ccb98 100644
--- a/infini_train/include/utils/precision_checker.h
+++ b/infini_train/include/utils/precision_checker.h
@@ -6,10 +6,10 @@
 namespace infini_train {
 class Tensor;
+class HookHandle;

 namespace autograd {
 class Function;
-class HookHandle;
 } // namespace autograd

 namespace nn {
diff --git a/infini_train/src/autograd/function.cc b/infini_train/src/autograd/function.cc
index 3fe9c96d..23217783 100644
--- a/infini_train/src/autograd/function.cc
+++ b/infini_train/src/autograd/function.cc
@@ -5,6 +5,7 @@
 #include "infini_train/include/autograd/accumulate.h"
 #include "infini_train/include/autograd/function_hook.h"
 #include "infini_train/include/autograd/grad_mode.h"
+#include "infini_train/include/common/hook.h"
 #include "infini_train/include/device.h"
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/nn/parallel/global.h"
@@ -136,23 +137,6 @@ void Function::BackwardPartial(const std::shared_ptr &grad_output, int g
         auto &grad_input = grad_inputs[idx];
         auto &[next_function, output_idx] = next_functions_[idx];
         if (grad_input && next_function) {
-            // // Apply tensor backward hooks only for leaf tensors
-            // // Only AccumulateGrad corresponds to a leaf tensor that user can register hooks on
-            // auto accumulate_grad = std::dynamic_pointer_cast(next_function);
-            // if (accumulate_grad) {
-            //     auto tensor = accumulate_grad->tensor();
-            //     if (tensor) {
-            //         const auto& hooks = tensor->backward_post_hooks_();
-            //         for (const auto& hook : hooks) {
-            //             if (hook) {
-            //                 auto modified_grad = hook(grad_input);
-            //                 if (modified_grad) {
-            //                     grad_input = modified_grad;
-            //                 }
-            //             }
-            //         }
-            //     }
-            // }
             next_function->BackwardPartial(grad_input, output_idx);
         }
     }
@@ -161,25 +145,25 @@ void Function::BackwardPartial(const std::shared_ptr &grad_output, int g

 void Function::IncreaseDependenciesNumber() { ++dependencies_number_; }

-std::shared_ptr Function::RegisterForwardPreHook(FunctionPreHook hook) {
+std::shared_ptr Function::RegisterForwardPreHook(FunctionPreHook hook) {
     forward_pre_hooks_.push_back(std::move(hook));
     return std::make_shared>(&forward_pre_hooks_, forward_pre_hooks_.size() - 1);
 }

-std::shared_ptr Function::RegisterForwardPostHook(FunctionPostHook hook) {
+std::shared_ptr Function::RegisterForwardPostHook(FunctionPostHook hook) {
     forward_post_hooks_.push_back(std::move(hook));
     return std::make_shared>(&forward_post_hooks_, forward_post_hooks_.size() - 1);
 }

-std::shared_ptr Function::RegisterBackwardPreHook(FunctionPreHook hook) {
+std::shared_ptr Function::RegisterBackwardPreHook(FunctionPreHook hook) {
     backward_pre_hooks_.push_back(std::move(hook));
     return std::make_shared>(&backward_pre_hooks_, backward_pre_hooks_.size() - 1);
 }

-std::shared_ptr Function::RegisterBackwardPostHook(FunctionPostHook hook) {
+std::shared_ptr Function::RegisterBackwardPostHook(FunctionPostHook hook) {
     backward_post_hooks_.push_back(std::move(hook));
     return std::make_shared>(&backward_post_hooks_, backward_post_hooks_.size() - 1);
diff --git a/infini_train/src/nn/modules/module.cc b/infini_train/src/nn/modules/module.cc
index d9938fc4..5bb7474c 100644
--- a/infini_train/src/nn/modules/module.cc
+++ b/infini_train/src/nn/modules/module.cc
@@ -8,6 +8,7 @@
 #include "glog/logging.h"

 #include "infini_train/include/autograd/function.h"
+#include "infini_train/include/common/hook.h"
 #include "infini_train/include/device.h"
 #include "infini_train/include/nn/parallel/global.h"
 #include "infini_train/include/tensor.h"
@@ -229,26 +230,24 @@ std::shared_ptr Module::ReplicateForDataParallel(int device_idx) const {
     return std::make_shared(*this);
 }

-std::shared_ptr Module::RegisterForwardPreHook(ForwardPreHook hook) {
+std::shared_ptr Module::RegisterForwardPreHook(ModulePreHook hook) {
     forward_pre_hooks_.push_back(std::move(hook));
-    return std::make_shared>(&forward_pre_hooks_, forward_pre_hooks_.size() - 1);
+    return std::make_shared>(&forward_pre_hooks_, forward_pre_hooks_.size() - 1);
 }

-std::shared_ptr Module::RegisterForwardPostHook(ForwardPostHook hook) {
+std::shared_ptr Module::RegisterForwardPostHook(ModulePostHook hook) {
     forward_post_hooks_.push_back(std::move(hook));
-    return std::make_shared>(&forward_post_hooks_,
-                                              forward_post_hooks_.size() - 1);
+    return std::make_shared>(&forward_post_hooks_, forward_post_hooks_.size() - 1);
 }

-std::shared_ptr Module::RegisterBackwardPreHook(BackwardPreHook hook) {
+std::shared_ptr Module::RegisterBackwardPreHook(ModulePreHook hook) {
     backward_pre_hooks_.push_back(std::move(hook));
-    return std::make_shared>(&backward_pre_hooks_,
-                                              backward_pre_hooks_.size() - 1);
+    return std::make_shared>(&backward_pre_hooks_, backward_pre_hooks_.size() - 1);
 }

-std::shared_ptr Module::RegisterBackwardPostHook(BackwardPostHook hook) {
+std::shared_ptr Module::RegisterBackwardPostHook(ModulePostHook hook) {
     backward_post_hooks_.push_back(std::move(hook));
-    return std::make_shared>(&backward_post_hooks_,
-                                              backward_post_hooks_.size() - 1);
+    return std::make_shared>(&backward_post_hooks_,
+                                              backward_post_hooks_.size() - 1);
 }

 } // namespace infini_train::nn
diff --git a/test/hook/test_hook.cc b/test/hook/test_hook.cc
index 8763d7c1..7dfa740b 100644
--- a/test/hook/test_hook.cc
+++ b/test/hook/test_hook.cc
@@ -6,7 +6,7 @@
 #include "infini_train/include/autograd/elementwise.h"
 #include "infini_train/include/autograd/function.h"
 #include "infini_train/include/autograd/function_hook.h"
-#include "infini_train/include/autograd/tensor_hook.h"
+#include "infini_train/include/common/hook.h"
 #include "infini_train/include/nn/modules/module.h"
 #include "infini_train/include/nn/parallel/global.h"
 #include "infini_train/include/tensor.h"

From 994ff49d518d1e3d2519261c7a0d8eec39d54474 Mon Sep 17 00:00:00 2001
From: chen
Date: Thu, 15 Jan 2026 07:00:12 +0000
Subject: [PATCH 5/6] fix: enable multi-rank precision check output in tensor parallel mode

This commit fixes the issue where only rank 0 generated precision check
log files when running with tensor parallelism. The root cause was that
GetLogStream() used process-global static variables, causing all threads
in a single process to share the same log file handle.

Changes:
- Add thread_global_rank thread-local variable to track per-thread rank
- Convert GetLogStream() and TableHeaderPrinted() to use thread_local storage
- Set thread_global_rank in Train() function for each thread
- Move baseline output (key|md5 format) into table format branch to avoid
  duplicate output in simple format
- Add directory creation and error handling for log file opening

With these changes, each thread now creates its own log file based on its
global rank (process_rank * nthread_per_process + thread_rank).
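For reference, the fix boils down to a thread-local rank plus a rank-derived
file name. The snippet below is a minimal standalone sketch of that pattern,
not code from this patch; OpenRankLog and the hard-coded process_rank /
nthread_per_process values are illustrative assumptions only.

    // Illustrative sketch only -- not part of this patch.
    #include <fstream>
    #include <string>
    #include <thread>
    #include <vector>

    thread_local int thread_global_rank = 0;  // set once per worker thread

    std::ofstream OpenRankLog(const std::string &dir) {
        // Each thread derives its own file name from its thread-local rank,
        // so two tensor-parallel workers in one process never share a handle.
        return std::ofstream(dir + "/precision_check_rank_" + std::to_string(thread_global_rank) + ".log",
                             std::ios::out | std::ios::trunc);
    }

    int main() {
        const int process_rank = 0, nthread_per_process = 2;
        std::vector<std::thread> workers;
        for (int t = 0; t < nthread_per_process; ++t) {
            workers.emplace_back([&, t] {
                // global rank = process_rank * nthread_per_process + thread_rank
                thread_global_rank = process_rank * nthread_per_process + t;
                OpenRankLog(".") << "hello from rank " << thread_global_rank << "\n";
            });
        }
        for (auto &w : workers) {
            w.join();
        }
        return 0;
    }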
Co-Authored-By: Claude Sonnet 4.5
---
 example/gpt2/main.cc                        |  3 ++
 example/llama3/main.cc                      |  3 ++
 infini_train/include/nn/parallel/global.h   |  2 +
 infini_train/src/nn/parallel/global.cc      |  2 +
 infini_train/src/utils/precision_checker.cc | 41 +++++++++++++--------
 5 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
index abed7fda..72accdf7 100644
--- a/example/gpt2/main.cc
+++ b/example/gpt2/main.cc
@@ -125,6 +125,9 @@ void Train(const nn::parallel::Rank &rank) {
     int tp_rank = 0;
     int pp_rank = 0;

+    // Set thread-local global rank
+    nn::parallel::global::thread_global_rank = rank.GlobalRank();
+
     const ProcessGroup *ddp_pg = nullptr;
     const ProcessGroup *tp_pg = nullptr;
     const ProcessGroup *pp_pg = nullptr;
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
index 6e5c74d8..37d46717 100644
--- a/example/llama3/main.cc
+++ b/example/llama3/main.cc
@@ -107,6 +107,9 @@ void Train(const nn::parallel::Rank &rank) {
     int tp_rank = 0;
     int pp_rank = 0;

+    // Set thread-local global rank
+    nn::parallel::global::thread_global_rank = rank.GlobalRank();
+
     const ProcessGroup *ddp_pg = nullptr;
     const ProcessGroup *tp_pg = nullptr;
     const ProcessGroup *pp_pg = nullptr;
diff --git a/infini_train/include/nn/parallel/global.h b/infini_train/include/nn/parallel/global.h
index afac220a..f99b5ed2 100644
--- a/infini_train/include/nn/parallel/global.h
+++ b/infini_train/include/nn/parallel/global.h
@@ -8,6 +8,8 @@
 namespace infini_train::nn::parallel::global {

+extern thread_local int thread_global_rank;
+
 enum Axis : uint8_t { DP = 0, TP = 1, PP = 2, AXIS_COUNT = 3 };

 struct Layout {
diff --git a/infini_train/src/nn/parallel/global.cc b/infini_train/src/nn/parallel/global.cc
index 04da0a51..dd4b6646 100644
--- a/infini_train/src/nn/parallel/global.cc
+++ b/infini_train/src/nn/parallel/global.cc
@@ -23,6 +23,8 @@ std::string GetEnvAsStr(const std::string &name, const std::string &default_valu
 namespace infini_train::nn::parallel::global {

+thread_local int thread_global_rank = 0;
+
 void Layout::InitStrides() {
     // Calculate strides
     int stride = 1;
diff --git a/infini_train/src/utils/precision_checker.cc b/infini_train/src/utils/precision_checker.cc
index be025f41..fcc91df2 100644
--- a/infini_train/src/utils/precision_checker.cc
+++ b/infini_train/src/utils/precision_checker.cc
@@ -11,11 +11,14 @@
 #include
 #include
 #include
+#include
+#include
 #include

 #include "infini_train/include/autograd/function.h"
 #include "infini_train/include/nn/modules/module.h"
 #include "infini_train/include/nn/parallel/global.h"
+#include "infini_train/include/nn/parallel/tensor_parallel.h"
 #include "infini_train/include/tensor.h"
 #include "infini_train/include/utils/precision_check_context.h"

@@ -182,15 +185,15 @@ std::unordered_map &GetBaseline() {
 // Table header printed flag
 bool &TableHeaderPrinted() {
-    static bool printed = false;
+    thread_local bool printed = false;
     return printed;
 }

 std::ostream &GetLogStream() {
-    static std::ofstream log_file;
-    static std::mutex init_mutex;
-    static bool initialized = false;
-    static bool use_console = false;
+    thread_local std::ofstream log_file;
+    thread_local std::mutex init_mutex;
+    thread_local bool initialized = false;
+    thread_local bool use_console = false;

     if (!initialized) {
         std::lock_guard lock(init_mutex);
@@ -200,11 +203,19 @@ std::ostream &GetLogStream() {
             if (config.output_path.empty()) {
                 use_console = true;
             } else {
-                int rank = nn::parallel::global::GlobalEnv::Instance().global_proc_rank();
-                std::string filename = config.output_path + "/precision_check_rank_" + std::to_string(rank) + ".log";
+                // Create output directory if it doesn't exist
+                mkdir(config.output_path.c_str(), 0755);
+
+                int global_rank = nn::parallel::global::thread_global_rank;
+                std::string filename = config.output_path + "/precision_check_rank_" + std::to_string(global_rank) + ".log";
                 log_file.open(filename, std::ios::out | std::ios::trunc);
-                use_console = false;
-                std::cout << "[Rank " << rank << "] Precision check output: " << filename << std::endl;
+                if (!log_file.is_open()) {
+                    std::cerr << "[Rank " << global_rank << "] Failed to open precision check log file: " << filename << std::endl;
+                    use_console = true;
+                } else {
+                    use_console = false;
+                    std::cout << "[Rank " << global_rank << "] Precision check output: " << filename << std::endl;
+                }
             }
             initialized = true;
         }
@@ -321,7 +332,7 @@ void PrecisionChecker::CheckTensors(const std::string &stage, const std::string
     }

     const auto &global_config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig();
-    int rank = nn::parallel::global::GlobalEnv::Instance().global_proc_rank();
+    int rank = nn::parallel::global::thread_global_rank;
     int level = global_config.level;

     auto &baseline = GetBaseline();
@@ -380,6 +391,11 @@ void PrecisionChecker::CheckTensors(const std::string &stage, const std::string
             std::string same_hash_str = has_baseline ? (same_hash ? "True" : "False") : "--";
             PrintTableRow(log_stream, full_key, level, FormatShape(cpu_tensor->Dims()),
                           DataTypeToString(cpu_tensor->Dtype()), same_hash_str, diff_order);
+
+            // Save to baseline file if output_path is set and output_md5 is true
+            if (!global_config.output_path.empty() && global_config.output_md5) {
+                log_stream << full_key << "|" << md5 << std::endl;
+            }
         } else {
             // Simple format
             const float *float_data = static_cast(data);
@@ -440,11 +456,6 @@ void PrecisionChecker::CheckTensors(const std::string &stage, const std::string
                 std::abort();
             }
         }
-
-        // Save to baseline file if output_path is set and output_md5 is true
-        if (!global_config.output_path.empty() && global_config.output_md5) {
-            log_stream << full_key << "|" << md5 << std::endl;
-        }
     }
 }

From d35e92ac4b5b10657cb3c7d635900d6a271151d5 Mon Sep 17 00:00:00 2001
From: chen
Date: Thu, 15 Jan 2026 09:24:04 +0000
Subject: [PATCH 6/6] amend

---
 docs/precision_checker_guide.md                     |  2 +-
 infini_train/include/utils/precision_check_config.h | 10 +--
 infini_train/src/utils/precision_checker.cc         | 67 +++++++++++++------
 3 files changed, 54 insertions(+), 25 deletions(-)

diff --git a/docs/precision_checker_guide.md b/docs/precision_checker_guide.md
index 66535ff7..0ea4ec8f 100644
--- a/docs/precision_checker_guide.md
+++ b/docs/precision_checker_guide.md
@@ -23,7 +23,7 @@ struct PrecisionCheckConfig {
     std::string output_path = "";   // 空=控制台(仅rank0), 非空=文件(所有rank)
     bool output_md5 = false;        // 输出 MD5 还是 tensor 值
     std::string format = "simple";  // "simple" 或 "table"
-    std::string baseline_path = ""; // 基准文件路径(用于对比)
+    std::string baseline_path = ""; // 基准文件路径(用于对比),指定后默认开启 format=table
 };
diff --git a/infini_train/include/utils/precision_check_config.h b/infini_train/include/utils/precision_check_config.h
index 3966a544..782684c5 100644
--- a/infini_train/include/utils/precision_check_config.h
+++ b/infini_train/include/utils/precision_check_config.h
@@ -40,13 +40,15 @@ struct PrecisionCheckConfig {
         if (kv_map.count("output_md5")) {
             config.output_md5 = (kv_map["output_md5"] == "true" || kv_map["output_md5"] == "1");
         }
-        if (kv_map.count("format")) {
-            config.format = kv_map["format"];
-        }
         if (kv_map.count("baseline")) {
             config.baseline_path = kv_map["baseline"];
         }
-
+        if (kv_map.count("format")) {
+            config.format = kv_map["format"];
+        } else if (!config.baseline_path.empty()) {
+            // Default to table format when baseline is specified
+            config.format = "table";
+        }
         return config;
     }
 };
diff --git a/infini_train/src/utils/precision_checker.cc b/infini_train/src/utils/precision_checker.cc
index fcc91df2..faa82385 100644
--- a/infini_train/src/utils/precision_checker.cc
+++ b/infini_train/src/utils/precision_checker.cc
@@ -164,18 +164,40 @@ std::unordered_map &GetBaseline() {
         const auto &config = nn::parallel::global::GlobalEnv::Instance().GetPrecisionCheckConfig();
         if (!config.baseline_path.empty()) {
             std::ifstream file(config.baseline_path);
-            std::string line;
-            while (std::getline(file, line)) {
-                // Format: key|md5
-                auto pos = line.rfind('|');
-                if (pos != std::string::npos) {
-                    std::string key = line.substr(0, pos);
-                    std::string md5 = line.substr(pos + 1);
-                    baseline[key] = md5;
+            if (!file.is_open()) {
+                std::cerr << "[PrecisionCheck] Failed to open baseline file: " << config.baseline_path << std::endl;
+            } else {
+                std::string line;
+                while (std::getline(file, line)) {
+                    // Try format 1: key|md5
+                    auto pipe_pos = line.rfind('|');
+                    if (pipe_pos != std::string::npos) {
+                        std::string key = line.substr(0, pipe_pos);
+                        std::string md5 = line.substr(pipe_pos + 1);
+                        baseline[key] = md5;
+                    } else {
+                        // Try format 2: simple log format with "md5="
+                        auto md5_pos = line.find("md5=");
+                        if (md5_pos != std::string::npos) {
+                            // Extract md5 value
+                            std::string md5 = line.substr(md5_pos + 4);
+
+                            // Extract key: find text between "][PrecisionCheck] " and ": md5="
+                            auto check_pos = line.find("][PrecisionCheck] ");
+                            if (check_pos != std::string::npos) {
+                                size_t key_start = check_pos + 18; // length of "][PrecisionCheck] "
+                                size_t key_end = line.find(": md5=", key_start);
+                                if (key_end != std::string::npos) {
+                                    std::string key = line.substr(key_start, key_end - key_start);
+                                    baseline[key] = md5;
+                                }
+                            }
+                        }
+                    }
                 }
+                std::cout << "[PrecisionCheck] Loaded " << baseline.size() << " baseline entries from "
+                          << config.baseline_path << std::endl;
             }
-            std::cout << "[PrecisionCheck] Loaded " << baseline.size() << " baseline entries from "
-                      << config.baseline_path << std::endl;
         }
         loaded = true;
     }
@@ -284,23 +306,22 @@ void PrintTableHeader(std::ostream &os) {
     TableHeaderPrinted() = true;

     os << "+" << std::string(50, '-') << "+" << std::string(7, '-') << "+" << std::string(18, '-') << "+"
-       << std::string(15, '-') << "+" << std::string(10, '-') << "+" << std::string(10, '-') << "+\n";
+       << std::string(15, '-') << "+" << std::string(10, '-') << "+\n";
     os << "| " << std::left << std::setw(49) << "key"
        << "| " << std::setw(6) << "level"
        << "| " << std::setw(17) << "shape"
        << "| " << std::setw(14) << "dtype"
        << "| " << std::setw(9) << "same_hash"
-       << "| " << std::setw(9) << "diff_order"
        << "|\n";
     os << "+" << std::string(50, '-') << "+" << std::string(7, '-') << "+" << std::string(18, '-') << "+"
-       << std::string(15, '-') << "+" << std::string(10, '-') << "+" << std::string(10, '-') << "+\n";
+       << std::string(15, '-') << "+" << std::string(10, '-') << "+\n";
 }

 void PrintTableRow(std::ostream &os, const std::string &key, int level, const std::string &shape,
-                   const std::string &dtype, const std::string &same_hash, const std::string &diff_order) {
+                   const std::string &dtype, const std::string &same_hash) {
     os << "| " << std::left << std::setw(49) << key.substr(0, 49) << "| " << std::setw(6) << level << "| "
        << std::setw(17) << shape.substr(0, 17) << "| " << std::setw(14) << dtype << "| " << std::setw(9) << same_hash
-       << "| " << std::setw(9) << diff_order << "|\n";
+       << "|\n";
 }

 // Calculate diff order between two tensors (returns string like "1e-3" or "0")
@@ -371,26 +392,29 @@ void PrecisionChecker::CheckTensors(const std::string &stage, const std::string
         // Check baseline
         bool has_baseline = !baseline.empty();
         bool same_hash = true;
-        std::string diff_order = "--";
         if (has_baseline) {
             auto it = baseline.find(full_key);
+            if (it == baseline.end() && !context_key.empty()) {
+                // Try without context: "stage name tensor[i]"
+                std::string key_without_context = stage + " " + name + " tensor[" + std::to_string(i) + "]";
+                it = baseline.find(key_without_context);
+            }
             if (it != baseline.end()) {
                 same_hash = (it->second == md5);
-                diff_order = same_hash ? "0" : "N/A";
             }
         }

         auto &log_stream = GetLogStream();
         if (global_config.format == "table") {
-            static bool header_printed = false;
+            thread_local bool header_printed = false;
             if (!header_printed) {
                 PrintTableHeader(log_stream);
                 header_printed = true;
             }
             std::string same_hash_str = has_baseline ? (same_hash ? "True" : "False") : "--";
             PrintTableRow(log_stream, full_key, level, FormatShape(cpu_tensor->Dims()),
-                          DataTypeToString(cpu_tensor->Dtype()), same_hash_str, diff_order);
+                          DataTypeToString(cpu_tensor->Dtype()), same_hash_str);

             // Save to baseline file if output_path is set and output_md5 is true
             if (!global_config.output_path.empty() && global_config.output_md5) {
@@ -414,7 +438,10 @@ void PrecisionChecker::CheckTensors(const std::string &stage, const std::string

         bool has_error = (config.check_nan && has_nan) || (config.check_inf && has_inf);

-        if (has_error || config.print_stats) {
+        // When output_path is set, always write to file; otherwise only write on error or if print_stats is enabled
+        bool should_output = !global_config.output_path.empty() || has_error || config.print_stats;
+
+        if (should_output) {
             std::string log_level = has_error ? "E" : "I";
             log_stream << log_level << GetTimestamp() << " [Rank " << rank << "][PrecisionCheck] " << stage << " "