import os
from typing import Dict, Optional, Tuple, Union

from torch.nn import Module
from transformers import AutoModel, AutoTokenizer
def fix_configure_device_map(gpu0_layers: int = 8,
                             num_trans_layers: int = 28) -> Dict[str, int]:
    """Build a static two-GPU device map for a ChatGLM-style model.

    bugfix: on Linux, torch.embedding was called with ``weight`` and ``input``
    on different devices, raising a RuntimeError:
      - on Windows, model.device ends up as transformer.word_embeddings.device
      - on Linux, model.device ends up as lm_head.device
      - chat / stream_chat place input_ids on model.device, so if
        transformer.word_embeddings.device differs from model.device the call
        fails with a RuntimeError.
    Therefore transformer.word_embeddings, transformer.final_layernorm and
    lm_head are all pinned to the first GPU.

    Args:
        gpu0_layers: number of leading transformer layers placed on GPU 0.
            Default 8 reflects the original setup (GPU0 ~6GB, GPU1 ~12GB).
        num_trans_layers: total number of transformer layers in the model.

    Returns:
        Mapping from module name to GPU index, suitable for
        ``accelerate.dispatch_model``.
    """
    # Embeddings, final layernorm and the LM head all live on GPU 0 so that
    # model.device matches the embedding device on both Windows and Linux.
    # NOTE(review): the original also mapped the keys 'transformer' and
    # 'lm_head.device'; 'lm_head.device' is not a module name, and the bare
    # 'transformer' prefix conflicts with the per-layer assignments below
    # under accelerate's prefix matching — both were dropped.
    device_map = {
        'transformer.word_embeddings': 0,
        'transformer.final_layernorm': 0,
        'lm_head': 0,
    }
    # First `gpu0_layers` transformer layers go to GPU 0, the rest to GPU 1.
    for i in range(num_trans_layers):
        device_map[f'transformer.layers.{i}'] = 0 if i < gpu0_layers else 1
    return device_map
def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike],
                       num_gpus: int = 2,
                       device_map: Optional[Dict[str, int]] = None,
                       **kwargs) -> Module:
    """Load a ChatGLM checkpoint, optionally sharded across several GPUs.

    Args:
        checkpoint_path: model name or local path passed to ``from_pretrained``.
        num_gpus: number of GPUs to spread the model over; fewer than 2 loads
            the whole half-precision model onto the default CUDA device.
        device_map: explicit module-name -> GPU-index mapping; when None and
            ``num_gpus >= 2``, the fixed two-GPU map from
            ``fix_configure_device_map()`` is used.
        **kwargs: forwarded verbatim to ``AutoModel.from_pretrained``.

    Returns:
        The fp16 model, placed on one GPU or dispatched across several.
    """
    if num_gpus < 2 and device_map is None:
        # Single-GPU path: plain fp16 model on the default CUDA device.
        model = AutoModel.from_pretrained(
            checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
    else:
        # Imported lazily so accelerate is only required for multi-GPU use.
        from accelerate import dispatch_model
        model = AutoModel.from_pretrained(
            checkpoint_path, trust_remote_code=True, **kwargs).half()
        if device_map is None:
            # bugfix: the original called the undefined name
            # auto_configure_device_map; the helper defined in this file is
            # fix_configure_device_map (fixed two-GPU split, no arguments).
            device_map = fix_configure_device_map()
        model = dispatch_model(model, device_map=device_map)
    return model
if __name__ == "__main__":
    # Demo: load the model across two GPUs and run one chat turn.
    # NOTE(review): `ckpt` was never defined in the original script — set it
    # to the checkpoint you actually use.  TODO confirm the intended path.
    ckpt = "THUDM/chatglm-6b"
    tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
    # bugfix: the original call was missing its closing parenthesis.
    model = load_model_on_gpus(ckpt, num_gpus=2,
                               device_map=fix_configure_device_map())
    context = """你好~帮我生成一份法国旅行计划 """
    history = []
    # Renamed from `input`, which shadowed the builtin of the same name.
    query = context
    response, history = model.chat(tokenizer, query, history,
                                   do_sample=True, top_p=0.5, temperature=1)
    print(response)