I am trying to fine-tune the chatglm-6b-int4-qe model with ds_train_finetune, but it fails with: RuntimeError: expected scalar type Half but found Float
The traceback is as follows:
```
05/11/2023 13:43:01 - WARNING - transformers_modules.chatglm-6b-int4-qe.modeling_chatglm - `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Traceback (most recent call last):
  File "/home/ecs-user/chatglm/ChatGLM-6B/ptuning/main.py", line 431, in <module>
    main()
  File "/home/ecs-user/chatglm/ChatGLM-6B/ptuning/main.py", line 370, in main
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
  File "/home/ecs-user/chatglm/ChatGLM-6B/ptuning/trainer.py", line 1635, in train
    return inner_training_loop(
  File "/home/ecs-user/chatglm/ChatGLM-6B/ptuning/trainer.py", line 1904, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/ecs-user/chatglm/ChatGLM-6B/ptuning/trainer.py", line 2647, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/ecs-user/chatglm/ChatGLM-6B/ptuning/trainer.py", line 2679, in compute_loss
    outputs = model(**inputs)
  File "/home/ecs-user/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ecs-user/.local/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
  File "/home/ecs-user/.local/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1724, in forward
    loss = self.module(*inputs, **kwargs)
  File "/home/ecs-user/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ecs-user/.cache/huggingface/modules/transformers_modules/chatglm-6b-int4-qe/modeling_chatglm.py", line 1158, in forward
    transformer_outputs = self.transformer(
  File "/home/ecs-user/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ecs-user/.cache/huggingface/modules/transformers_modules/chatglm-6b-int4-qe/modeling_chatglm.py", line 960, in forward
    layer_ret = torch.utils.checkpoint.checkpoint(
  File "/home/ecs-user/.local/lib/python3.8/site-packages/torch/utils/checkpoint.py", line 249, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "/home/ecs-user/.local/lib/python3.8/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/home/ecs-user/.local/lib/python3.8/site-packages/torch/utils/checkpoint.py", line 107, in forward
    outputs = run_function(*args)
  File "/home/ecs-user/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ecs-user/.cache/huggingface/modules/transformers_modules/chatglm-6b-int4-qe/modeling_chatglm.py", line 609, in forward
    attention_input = self.input_layernorm(hidden_states)
  File "/home/ecs-user/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ecs-user/.local/lib/python3.8/site-packages/torch/nn/modules/normalization.py", line 190, in forward
    return F.layer_norm(
  File "/home/ecs-user/.local/lib/python3.8/site-packages/torch/nn/functional.py", line 2515, in layer_norm
    return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
RuntimeError: expected scalar type Half but found Float
  0%|          | 0/3000 [00:02<?, ?it/s]
[2023-05-11 13:43:05,765] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 31673
[2023-05-11 13:43:05,766] [ERROR] [launch.py:434:sigkill_handler] ['/usr/bin/python3', '-u', 'main.py', '--local_rank=0', '--deepspeed', 'deepspeed.json', '--do_train', '--train_file', 'AdvertiseGen/train.json', '--test_file', 'AdvertiseGen/dev.json', '--prompt_column', 'content', '--response_column', 'summary', '--overwrite_cache', '--model_name_or_path', 'train_models/chatglm-6b-int4-qe', '--output_dir', './output/adgen-chatglm-6b-ft-128-2e-2', '--overwrite_output_dir', '--max_source_length', '64', '--max_target_length', '64', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '16', '--predict_with_generate', '--max_steps', '3000', '--logging_steps', '10', '--save_steps', '1000', '--learning_rate', '2e-2'] exits with return code = 1
```
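The failure is a dtype mismatch inside `F.layer_norm`: under the DeepSpeed fp16 path the tensors reaching `input_layernorm` are a mix of Half and Float. Below is a minimal diagnostic sketch for checking which parameters of the quantized checkpoint are loaded in fp32; the model path mirrors the repro command, and the final `.half()` cast is only an assumed workaround I have not verified, not something from the original script:

```python
# Sketch: inspect the parameter dtypes of the int4-qe checkpoint.
from collections import Counter

from transformers import AutoModel

# Same path as in the repro command below; adjust to your local layout.
model = AutoModel.from_pretrained(
    "train_models/chatglm-6b-int4-qe", trust_remote_code=True
)

# A mix of torch.float32 and torch.float16 here is consistent with
# "expected scalar type Half but found Float" under DeepSpeed fp16.
print(Counter(p.dtype for p in model.parameters()))

# Assumed (untested) workaround: cast the remaining fp32 weights to half
# before handing the model to the DeepSpeed engine.
model = model.half()
```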
### Expected Behavior
_No response_
### Steps To Reproduce
Run ds_train_finetune.sh:
```sh
LR=2e-2
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
CHECKPOINT=adgen-chatglm-6b-pt-128-2e-2
STEP=3000

deepspeed --num_gpus=1 --master_port $MASTER_PORT main.py \
    --deepspeed deepspeed.json \
    --do_train \
    --train_file AdvertiseGen/train.json \
    --test_file AdvertiseGen/dev.json \
    --prompt_column content \
    --response_column summary \
    --overwrite_cache \
    --model_name_or_path train_models/chatglm-6b-int4-qe \
    --output_dir ./output/adgen-chatglm-6b-ft-$LR \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 64 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --predict_with_generate \
    --max_steps 3000 \
    --logging_steps 10 \
    --save_steps 1000 \
    --learning_rate $LR
```
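The contents of `deepspeed.json` are not reproduced above. For context, an fp16-enabled DeepSpeed config of the kind the ptuning example typically ships with looks roughly like the following; this is an illustrative assumption, not the exact file from this run:

```json
{
  "train_micro_batch_size_per_gpu": "auto",
  "zero_allow_untested_optimizer": true,
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "initial_scale_power": 16,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "zero_optimization": {
    "stage": 2,
    "contiguous_gradients": true,
    "overlap_comm": false,
    "reduce_scatter": true
  }
}
```

With `fp16` enabled, the DeepSpeed engine runs the forward pass in Half, which is where the mismatch with fp32 weights surfaces.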
### Environment
```markdown
- OS: Ubuntu 20.04
- Python: 3.8.10
- Transformers: 4.27.1
- PyTorch: 2.0.0+cu117
- CUDA Support: True
```