[THUDM/ChatGLM-6B][BUG/Help] p-tuning v2 fails with: RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.

The GPU is a single A100. Running the p-tuning v2 script fails with the following error:

Traceback (most recent call last):
  File "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-shangchao/chenkunlong/main.py", line 431, in <module>
    main()
  File "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-shangchao/chenkunlong/main.py", line 370, in main
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
  File "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-shangchao/chenkunlong/trainer.py", line 1635, in train
    return inner_training_loop(
  File "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-shangchao/chenkunlong/trainer.py", line 1722, in _inner_training_loop
    model = self._wrap_model(self.model_wrapped)
  File "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-shangchao/chenkunlong/trainer.py", line 1547, in _wrap_model
    model = nn.parallel.DistributedDataParallel(
  File "/home/hadoop-shangchao/.conda/envs/python3.9/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 530, in __init__
    self.process_group = _get_default_group()
  File "/home/hadoop-shangchao/.conda/envs/python3.9/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 410, in _get_default_group
    raise RuntimeError(
RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.

  1. Prepare the data
  2. Run bash train.sh
  3. The following output is displayed:
  4. 05/09/2023 21:00:51 - WARNING - main - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False 05/09/2023 21:00:51 - INFO - main - Training/evaluation parameters Seq2SeqTrainingArguments( _n_gpu=1, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, bf16=False, bf16_full_eval=False, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=0, dataloader_pin_memory=True, ddp_backend=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=None, disable_tqdm=False, do_eval=False, do_predict=False, do_train=True, eval_accumulation_steps=None, eval_delay=0, eval_steps=None, evaluation_strategy=no, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[], fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, generation_config=None, generation_max_length=None, generation_num_beams=None, gradient_accumulation_steps=16, gradient_checkpointing=False, greater_is_better=None, group_by_length=False, half_precision_backend=auto, hub_model_id=None, hub_private_repo=False, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_inputs_for_metrics=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.02, length_column_name=length, load_best_model_at_end=False, local_rank=0, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=output/adgen-chatglm-6b-pt-128-2e-2/runs/May09_21-00-51_set-zw04-kubernetes-pc03.mt, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=10, logging_strategy=steps, lr_scheduler_type=linear, max_grad_norm=1.0, max_steps=3000, metric_for_best_model=None, mp_parameters=, no_cuda=False, num_train_epochs=3.0, optim=adamw_hf, optim_args=None, output_dir=output/adgen-chatglm-6b-pt-128-2e-2, overwrite_output_dir=True, past_index=-1, per_device_eval_batch_size=1, per_device_train_batch_size=1, predict_with_generate=True, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, ray_scope=last, remove_unused_columns=True, report_to=['tensorboard', 'wandb'], resume_from_checkpoint=None, run_name=output/adgen-chatglm-6b-pt-128-2e-2, save_on_each_node=False, save_safetensors=False, save_steps=1000, save_strategy=steps, save_total_limit=None, seed=42, sharded_ddp=[], skip_memory_metrics=True, sortish_sampler=False, tf32=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_ipex=False, use_legacy_prediction_loop=False, use_mps_device=False, warmup_ratio=0.0, warmup_steps=0, weight_decay=0.0, xpu_backend=None, ) 05/09/2023 21:01:01 - WARNING - datasets.builder - Using custom data configuration default-d7099ee4edaac92e 05/09/2023 21:01:01 - WARNING - datasets.builder - Reusing dataset json (/home/hadoop-shangchao/.cache/huggingface/datasets/json/default-d7099ee4edaac92e/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5) 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 969.33it/s] [INFO|configuration_utils.py:667] 2023-05-09 21:01:01,360 >> loading configuration file chatglm/config.json [INFO|configuration_utils.py:667] 2023-05-09 21:01:01,367 
>> loading configuration file chatglm/config.json [INFO|configuration_utils.py:725] 2023-05-09 21:01:01,367 >> Model config ChatGLMConfig { "_name_or_path": "chatglm", "architectures": [ "ChatGLMModel" ], "auto_map": { "AutoConfig": "configuration_chatglm.ChatGLMConfig", "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration" }, "bos_token_id": 130004, "eos_token_id": 130005, "gmask_token_id": 130001, "hidden_size": 4096, "inner_hidden_size": 16384, "layernorm_epsilon": 1e-05, "mask_token_id": 130000, "max_sequence_length": 2048, "model_type": "chatglm", "num_attention_heads": 32, "num_layers": 28, "pad_token_id": 3, "position_encoding_2d": true, "pre_seq_len": null, "prefix_projection": false, "quantization_bit": 0, "torch_dtype": "float16", "transformers_version": "4.29.0.dev0", "use_cache": true, "vocab_size": 130528 }

[INFO|tokenization_utils_base.py:1808] 2023-05-09 21:01:01,373 >> loading file ice_text.model [INFO|tokenization_utils_base.py:1808] 2023-05-09 21:01:01,373 >> loading file added_tokens.json [INFO|tokenization_utils_base.py:1808] 2023-05-09 21:01:01,373 >> loading file special_tokens_map.json [INFO|tokenization_utils_base.py:1808] 2023-05-09 21:01:01,373 >> loading file tokenizer_config.json [INFO|modeling_utils.py:2513] 2023-05-09 21:01:01,827 >> loading weights file chatglm/pytorch_model.bin.index.json [INFO|configuration_utils.py:577] 2023-05-09 21:01:01,828 >> Generate config GenerationConfig { "_from_model_config": true, "bos_token_id": 130004, "eos_token_id": 130005, "pad_token_id": 3, "transformers_version": "4.29.0.dev0" }

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [01:45<00:00, 13.13s/it] [INFO|modeling_utils.py:3185] 2023-05-09 21:02:47,763 >> All model checkpoint weights were used when initializing ChatGLMForConditionalGeneration.

[WARNING|modeling_utils.py:3187] 2023-05-09 21:02:47,763 >> Some weights of ChatGLMForConditionalGeneration were not initialized from the model checkpoint at chatglm and are newly initialized: ['transformer.prefix_encoder.embedding.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. [INFO|modeling_utils.py:2821] 2023-05-09 21:02:49,292 >> Generation config file not found, using a generation config created from the model config. 05/09/2023 21:02:49 - WARNING - datasets.fingerprint - Parameter 'function'=<function main..preprocess_function_train at 0x7fb7321dbd30> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed. Running tokenizer on train dataset: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:22<00:00, 1.74ba/s] input_ids ... inputs ... label_ids ... labels ... /home/hadoop-shangchao/.conda/envs/python3.9/lib/python3.9/site-packages/transformers/optimization.py:407: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set no_deprecation_warning=True to disable this warning warnings.warn( Traceback (most recent call last): File "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-shangchao/chenkunlong/main.py", line 431, in main() File "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-shangchao/chenkunlong/main.py", line 370, in main train_result = trainer.train(resume_from_checkpoint=checkpoint) File "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-shangchao/chenkunlong/trainer.py", line 1635, in train return inner_training_loop( File "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-shangchao/chenkunlong/trainer.py", line 1722, in _inner_training_loop model = self._wrap_model(self.model_wrapped) File "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-shangchao/chenkunlong/trainer.py", line 1547, in _wrap_model model = nn.parallel.DistributedDataParallel( File "/home/hadoop-shangchao/.conda/envs/python3.9/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 530, in init self.process_group = _get_default_group() File "/home/hadoop-shangchao/.conda/envs/python3.9/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 410, in _get_default_group raise RuntimeError( RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.

Environment
- OS: Ubuntu 
- Python: 3.9
- Transformers: 4.29.0.dev0
- PyTorch: 1.10.1+cu111
- CUDA Support: True

Answers

(8 votes)

Add local_rank = -1.

(6 votes)

> Add local_rank = -1.

Add it where? Putting it in the arguments doesn't work:

PRE_SEQ_LEN=512
local_rank=-1
LR=2e-2

CUDA_VISIBLE_DEVICES=0 python3 main.py \
    --do_train \
    --train_file /workspace/data/llm/questionLLM/flip_instinwild_ch.json \
    --validation_file /workspace/data/llm/questionLLM/valid_flip_instinwild_ch.json \
    --prompt_column instruction \
    --response_column output \
    --overwrite_cache \
    --model_name_or_path /workspace/para/chatglm-6b \
    --output_dir output/adgen-chatglm-6b-pt-$PRE_SEQ_LEN-$LR \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 64 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --predict_with_generate \
    --max_steps 3000 \
    --logging_steps 10 \
    --save_steps 1000 \
    --learning_rate $LR \
    --pre_seq_len $PRE_SEQ_LEN \
    --quantization_bit 8 \
    --local_rank -1

(3 votes)

Put it somewhere a bit before the args, e.g. on the second line of train.sh.

(2 votes)

Adding local_rank=-1 on the second line of train.sh still does not solve the problem.

(9 votes)

> Adding local_rank=-1 on the second line of train.sh still does not solve the problem.

Same here. My final solution was to add the following in main.py:

def main():

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # These two lines are the fix: force non-distributed (single-process) training.
    # (parse_json_file does not accept a local_rank argument, so set it here instead.)
    training_args.local_rank = -1
    print(training_args.local_rank)

    # Setup logging
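
Why this works, roughly: judging from the traceback and the logging line quoted further down, the Trainer only wraps the model in DistributedDataParallel when it believes the run is distributed, which it infers from local_rank != -1, and DDP requires torch.distributed.init_process_group() to have been called first. A simplified sketch of that branch (an approximation, not the actual transformers source):

import torch.nn as nn

def wrap_model_sketch(model, local_rank):
    # Approximation of the decision in Trainer._wrap_model: DDP is only used
    # when local_rank != -1, i.e. when training looks distributed.
    if local_rank != -1:
        # Requires an initialized default process group; without a prior call to
        # torch.distributed.init_process_group() this raises the error above.
        model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
    # With local_rank == -1 the model stays unwrapped and trains on the single GPU.
    return model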

(6 votes)

Let me try it; that should work!

(2 votes)

Confirmed: adding training_args.local_rank = -1 in main.py fixes it, thanks! @StarWorkXc

This is the snippet in main.py that logs the startup summary (note that it treats local_rank != -1 as distributed training):

# Log on each process the small summary:
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
    + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)

The code in main.py has a bug. @Cklwanfifa
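
A minimal standalone sketch that triggers the same RuntimeError, for anyone who wants to see the failure in isolation: constructing DistributedDataParallel without first calling torch.distributed.init_process_group() is enough.

import torch.nn as nn

# No default process group has been initialized, so this raises:
# RuntimeError: Default process group has not been initialized,
# please make sure to call init_process_group.
model = nn.Linear(4, 4)
ddp = nn.parallel.DistributedDataParallel(model)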

(0 votes)

What is the main cause of this?

(4 votes)

Quite strange: it ran fine yesterday, and today this bug showed up.

(5 votes)

Is there a way to fix this from the command line alone, without modifying the original code?

(2 votes)

> Is there a way to fix this from the command line alone, without modifying the original code?

Not possible right now; please modify main.py until the author's PR lands.

(6 votes)

> Same here. My final solution was to add the following in main.py: [the training_args.local_rank = -1 fix quoted above]

Thanks a lot, it works now.

(9 votes)

Another workaround, for models that contain SyncBatchNorm layers: convert them back to plain BatchNorm2d when running on a single GPU, so no process group is needed.

import torch
from torch.nn import SyncBatchNorm, BatchNorm2d

def simple_model_convert(model):
    # Recursively replace every SyncBatchNorm with a plain BatchNorm2d.
    # Note: this creates fresh BatchNorm2d layers; affine weights and
    # running statistics are not copied over.
    for child_name, child in model.named_children():
        if isinstance(child, SyncBatchNorm):
            setattr(model, child_name, BatchNorm2d(child.num_features))
        else:
            simple_model_convert(child)
    return model

# convert SyncBatchNorm to BatchNorm2d in a single-GPU environment
distributed = torch.distributed.is_initialized()
if not distributed:
    model = simple_model_convert(model)
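
A hypothetical usage sketch of the helper above (torchvision's resnet18 is only a stand-in model that happens to contain batch norm layers):

import torch
import torchvision  # assumption: torchvision is installed

# Pretend the model was prepared for distributed training with SyncBatchNorm,
# then convert it back for a single-GPU run with no process group.
net = torchvision.models.resnet18()
net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)

if not torch.distributed.is_initialized():
    # simple_model_convert replaces SyncBatchNorm with fresh BatchNorm2d layers
    # (affine weights and running statistics are not copied).
    net = simple_model_convert(net)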

(4 votes)

> Same here. My final solution was to add the following in main.py: [the training_args.local_rank = -1 fix quoted above]
>
> Thanks a lot, it works now.

I solved it the same way.