@@ -263,6 +263,7 @@ class ModelType(Enum):
     DotsOCR            = ModelTypeTagChatImageIn + 0x0000020
     Mistral3           = ModelTypeTagChatImageIn + 0x0000030
     StepVL             = ModelTypeTagChatImageIn + 0x0000040
+    GLM_OCR            = ModelTypeTagChatImageIn + 0x0000050
 
     Qwen2Audio         = ModelTypeTagChatAudioIn + 0x0000001
     Qwen3ForcedAligner = ModelTypeTagChatAudioIn + 0x0000002
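
Note: GLM_OCR takes the next free slot in the ModelTypeTagChatImageIn range, following the existing 0x10 spacing of the image-input entries (0x20, 0x30, 0x40, 0x50).
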
@@ -3972,6 +3973,7 @@ def get_weight_names(config):
 
 class GLM4VConverter(BaseConverter):
     MODEL_TYPE = ModelType.GLM4V
+    ASSERT_HEAD_DIM = True
 
     @classmethod
     def state_dict_pp(cls, config, state_dict):
@@ -3987,7 +3989,7 @@ def state_dict_pp(cls, config, state_dict):
                 r[name.replace('gate_up_proj.weight', 'up_proj.weight')] = part(tensor, 1, 2).contiguous()
             elif ('.k_proj.' in name) or ('.q_proj.' in name):
                 rope_dim = GLM4VConverter.rope_dim
-                head_dim = GLM4VConverter.txt_config.hidden_size // GLM4VConverter.txt_config.num_attention_heads
+                head_dim = GLM4VConverter.txt_config.head_dim
                 r[name] = permute_pair_rope_nope(tensor, tensor.shape[0] // head_dim, rope_dim)
             else:
                 r[name] = tensor
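
Note: head_dim for the q/k permutation is now read from the text config instead of being derived, since an explicit head_dim need not equal hidden_size // num_attention_heads. A minimal illustration of why the old derivation breaks, using hypothetical numbers not taken from any real GLM config:

    # Hypothetical values, for illustration only.
    hidden_size, num_attention_heads, head_dim = 1536, 12, 96
    q_rows = num_attention_heads * head_dim             # rows of a q_proj weight: 1152
    assert q_rows // head_dim == num_attention_heads    # 12 heads, as state_dict_pp infers
    assert hidden_size // num_attention_heads == 128    # the old derivation: 128 != 96
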
@@ -4020,11 +4022,16 @@ def state_dict_pp(cls, config, state_dict):
     def dump_config(f, config, ggml_type):
         GLM4VConverter.txt_config = AttributeDict(config.text_config)
         txt_config = GLM4VConverter.txt_config
-        assert txt_config.attention_bias
+
         if isinstance(txt_config.eos_token_id, list):
             txt_config.eos_token_id = txt_config.eos_token_id[0]
 
-        head_dim = txt_config.hidden_size // txt_config.num_attention_heads
+        if 'head_dim' not in txt_config:
+            txt_config.head_dim = txt_config.hidden_size // txt_config.num_attention_heads
+        head_dim = txt_config.head_dim
+
+        if GLM4VConverter.ASSERT_HEAD_DIM:
+            assert head_dim == txt_config.hidden_size // txt_config.num_attention_heads
 
         rope_dim = int(txt_config.rope_parameters["partial_rotary_factor"] * head_dim)
         GLM4VConverter.rope_dim = rope_dim
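
Note: the unconditional assert on txt_config.attention_bias is dropped, head_dim now falls back to hidden_size // num_attention_heads only when the config does not declare one, and the equality assert is gated by ASSERT_HEAD_DIM so a subclass can opt out. GLMOCRConverter below does exactly that, presumably because GLM-OCR text configs declare an explicit head_dim that differs from the derived value.
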
@@ -4076,6 +4083,68 @@ def get_weight_names(config):
         weights += GLM4VConverter.get_vit_weight_names(config.vision_config['depth'])
         return weights
 
+class GLMOCRConverter(BaseConverter):
+    MODEL_TYPE = ModelType.GLM_OCR
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        return GLM4VConverter.state_dict_pp(config, state_dict)
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        print("WARNING: MTP not supported!")
+        GLM4VConverter.ASSERT_HEAD_DIM = False
+
+        GLM4VConverter.dump_config(f, config, ggml_type)
+
+        config_values = [
+            GLM4VConverter.txt_config.head_dim
+        ]
+        f.write(struct.pack("<i", *config_values))
+
+    @staticmethod
+    def get_vit_weight_names(num_layer):
+        weight_names = ["visual.downsample.weight",
+                        "visual.downsample.bias",
+                        "visual.merger.gate_proj.weight",
+                        "visual.merger.up_proj.weight",
+                        "visual.merger.down_proj.weight",
+                        "visual.merger.proj.weight",
+                        "visual.merger.post_projection_norm.weight",
+                        "visual.merger.post_projection_norm.bias",
+                        "visual.patch_embed.proj.0.weight",
+                        "visual.patch_embed.proj.bias",
+                        "visual.patch_embed.proj.1.weight",
+                        "visual.post_layernorm.weight"]
+        for i in range(num_layer):
+            weight_names += [
+                f"visual.layers.{i}.norm1.weight",
+                f"visual.layers.{i}.norm2.weight",
+                f"visual.layers.{i}.attn.q_proj.weight",
+                f"visual.layers.{i}.attn.k_proj.weight",
+                f"visual.layers.{i}.attn.v_proj.weight",
+                f"visual.layers.{i}.attn.o_proj.weight",
+                f"visual.layers.{i}.attn.q_norm.weight",
+                f"visual.layers.{i}.attn.k_norm.weight",
+                f"visual.layers.{i}.mlp.gate_proj.weight",
+                f"visual.layers.{i}.mlp.up_proj.weight",
+                f"visual.layers.{i}.mlp.down_proj.weight",
+                f"visual.layers.{i}.attn.q_proj.bias",
+                f"visual.layers.{i}.attn.k_proj.bias",
+                f"visual.layers.{i}.attn.v_proj.bias",
+                f"visual.layers.{i}.attn.o_proj.bias",
+                f"visual.layers.{i}.mlp.gate_proj.bias",
+                f"visual.layers.{i}.mlp.up_proj.bias",
+                f"visual.layers.{i}.mlp.down_proj.bias",
+            ]
+        return weight_names
+
+    @staticmethod
+    def get_weight_names(config):
+        weights = GLM4Converter.get_weight_names(GLM4VConverter.txt_config)
+        weights += GLMOCRConverter.get_vit_weight_names(config.vision_config['depth'])
+        return weights
+
 class Phi2Converter(BaseConverter):
     MODEL_TYPE = ModelType.Phi2
 
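
Note: GLMOCRConverter reuses GLM4V's tensor post-processing and config dump, then appends one extra little-endian int32 (head_dim) after the GLM4V config block. A minimal sketch of a matching reader for that field; read_glm_ocr_extra is a hypothetical helper, not part of this codebase:

    import struct

    def read_glm_ocr_extra(f):
        # The single extra int32 written after GLM4V's config block: head_dim.
        (head_dim,) = struct.unpack("<i", f.read(4))
        return head_dim
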
@@ -9694,6 +9763,8 @@ def main():
         DotsOCRConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch.endswith('Glm4vForConditionalGeneration'):
         GLM4VConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch.endswith('GlmOcrForConditionalGeneration'):
+        GLMOCRConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'MegrezMoeForCausalLM':
         MegrezMoEConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'OuroForCausalLM':
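
Note: dispatch keys on the architectures entry of the checkpoint's config.json; arch.endswith('GlmOcrForConditionalGeneration') is used so prefixed variants of the class name also match, mirroring the Glm4vForConditionalGeneration branch above.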