
Large Language Models - ChatGLM-Tuning


https://github.com/mymusise/ChatGLM-Tuning

Data

Prepare the data in the Stanford Alpaca format.

This step reshapes the Alpaca data, effectively converting each record into an Instruction + Answer pair.


import argparse
import json

from tqdm import tqdm


def format_example(example: dict) -> dict:
    context = f"Instruction: {example['instruction']}\n"
    if example.get("input"):
        context += f"Input: {example['input']}\n"
    context += "Answer: "
    target = example["output"]
    return {"context": context, "target": target}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, default="data/alpaca_data.json")
    parser.add_argument("--save_path", type=str, default="data/alpaca_data.jsonl")
    args = parser.parse_args()
    with open(args.data_path) as f:
        examples = json.load(f)
    with open(args.save_path, 'w') as f:
        for example in tqdm(examples, desc="formatting.."):
            f.write(json.dumps(format_example(example)) + '\n')
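For intuition, this is what format_example produces for a single record (the record below is only an illustrative Alpaca-style example, not taken from the dataset):

# Hypothetical Alpaca-style record, used only to illustrate the output format.
example = {
    "instruction": "Give three tips for staying healthy.",
    "input": "",
    "output": "1. Eat a balanced diet. 2. Exercise regularly. 3. Get enough sleep.",
}
print(format_example(example))
# {'context': 'Instruction: Give three tips for staying healthy.\nAnswer: ',
#  'target': '1. Eat a balanced diet. 2. Exercise regularly. 3. Get enough sleep.'}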

The tokenization step then converts each record into input_ids plus seq_len (the length of the prompt part).

import argparse
import json

import datasets
import transformers
from tqdm import tqdm


def preprocess(tokenizer, config, example, max_seq_length):
    prompt = example["context"]
    target = example["target"]
    prompt_ids = tokenizer.encode(prompt, max_length=max_seq_length, truncation=True)
    target_ids = tokenizer.encode(
        target, max_length=max_seq_length, truncation=True, add_special_tokens=False)
    input_ids = prompt_ids + target_ids + [config.eos_token_id]
    return {"input_ids": input_ids, "seq_len": len(prompt_ids)}


def read_jsonl(path, max_seq_length, skip_overlength=False):
    model_name = "THUDM/chatglm-6b"
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_name, trust_remote_code=True)
    config = transformers.AutoConfig.from_pretrained(
        model_name, trust_remote_code=True, device_map='auto')
    with open(path, "r") as f:
        for line in tqdm(f.readlines()):
            example = json.loads(line)
            feature = preprocess(tokenizer, config, example, max_seq_length)
            if skip_overlength and len(feature["input_ids"]) > max_seq_length:
                continue
            feature["input_ids"] = feature["input_ids"][:max_seq_length]
            yield feature


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--jsonl_path", type=str, default="data/alpaca_data.jsonl")
    parser.add_argument("--save_path", type=str, default="data/alpaca")
    parser.add_argument("--max_seq_length", type=int, default=384)
    parser.add_argument("--skip_overlength", type=bool, default=False)
    args = parser.parse_args()
    dataset = datasets.Dataset.from_generator(
        lambda: read_jsonl(args.jsonl_path, args.max_seq_length, args.skip_overlength))
    dataset.save_to_disk(args.save_path)
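After the script runs, the tokenized dataset can be loaded back from disk for a quick sanity check. A minimal sketch, assuming the default save_path above:

import datasets

# Load the dataset written by save_to_disk and inspect one example.
dataset = datasets.load_from_disk("data/alpaca")
print(dataset.features)                 # input_ids and seq_len
sample = dataset[0]
print(len(sample["input_ids"]), sample["seq_len"])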

Finetune

  • finetune dataset
  • finetune model
  • train

Data loader

from dataclasses import dataclass, field

import torch
from torch import nn
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)


@dataclass
class FinetuneArguments:
    dataset_path: str = field(default="data/alpaca")
    model_path: str = field(default="output")
    lora_rank: int = field(default=8)


class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)


def data_collator(features: list) -> dict:
    len_ids = [len(feature["input_ids"]) for feature in features]
    longest = max(len_ids)
    input_ids = []
    labels_list = []
    for ids_l, feature in sorted(zip(len_ids, features), key=lambda x: -x[0]):
        ids = feature["input_ids"]
        seq_len = feature["seq_len"]
        labels = (
            [-100] * (seq_len - 1) + ids[(seq_len - 1):] + [-100] * (longest - ids_l)
        )
        ids = ids + [tokenizer.pad_token_id] * (longest - ids_l)
        _ids = torch.LongTensor(ids)
        labels_list.append(torch.LongTensor(labels))
        input_ids.append(_ids)
    input_ids = torch.stack(input_ids)
    labels = torch.stack(labels_list)
    return {
        "input_ids": input_ids,
        "labels": labels,
    }
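The key detail in data_collator is the label masking: prompt positions and padding are set to -100 so they are ignored by the loss, and only the answer span is trained on. A toy illustration with made-up token ids:

# Toy illustration of the masking in data_collator (made-up token ids).
ids = [10, 11, 12, 20, 21, 22, 23]   # 3 prompt tokens + 4 target tokens (incl. eos)
seq_len = 3                          # length of the prompt part
longest = 9                          # longest sequence in this batch
labels = [-100] * (seq_len - 1) + ids[(seq_len - 1):] + [-100] * (longest - len(ids))
# labels == [-100, -100, 12, 20, 21, 22, 23, -100, -100]
# prompt positions (except the last one) and padding are masked out of the loss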

Model

from transformers import AutoModel

# init model
model = AutoModel.from_pretrained(
    "THUDM/chatglm-6b", load_in_8bit=True, trust_remote_code=True, device_map="auto")
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
model.is_parallelizable = True
model.model_parallel = True
model.lm_head = CastOutputToFloat(model.lm_head)
model.config.use_cache = (
    False  # silence the warnings. Please re-enable for inference!
)
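After the 8-bit model is prepared, a LoRA adapter is attached with peft so that only the low-rank matrices are trained. A minimal sketch: r corresponds to FinetuneArguments.lora_rank, while lora_alpha and lora_dropout below are illustrative values, not taken from the repo:

from peft import LoraConfig, TaskType, get_peft_model

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,               # FinetuneArguments.lora_rank
    lora_alpha=32,     # illustrative value
    lora_dropout=0.1,  # illustrative value
)
model = get_peft_model(model, peft_config)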

Trainer

import os

from transformers import Trainer


class ModifiedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        return model(
            input_ids=inputs["input_ids"],
            labels=inputs["labels"],
        ).loss

    def save_model(self, output_dir=None, _internal_call=False):
        from transformers.trainer import TRAINING_ARGS_NAME

        os.makedirs(output_dir, exist_ok=True)
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
        saved_params = {
            k: v.to("cpu") for k, v in self.model.named_parameters() if v.requires_grad
        }
        torch.save(saved_params, os.path.join(output_dir, "adapter_model.bin"))
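The pieces above are tied together by a small training entry point. The following is a sketch, assuming HfArgumentParser supplies both FinetuneArguments and the standard TrainingArguments, and that model and data_collator are the objects defined above; it is not the verbatim repo code:

import datasets
from transformers import HfArgumentParser, TrainingArguments


def main():
    finetune_args, training_args = HfArgumentParser(
        (FinetuneArguments, TrainingArguments)
    ).parse_args_into_dataclasses()

    # dataset produced by the tokenization step; model is the LoRA-wrapped ChatGLM
    dataset = datasets.load_from_disk(finetune_args.dataset_path)

    trainer = ModifiedTrainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
        data_collator=data_collator,
    )
    trainer.train()
    trainer.save_model(training_args.output_dir)


if __name__ == "__main__":
    main()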

Inference

  • inference
import json

import torch
from peft import PeftModel
from transformers import AutoModel, AutoTokenizer

from cover_alpaca2jsonl import format_example

model = AutoModel.from_pretrained(
    "THUDM/chatglm-6b", trust_remote_code=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

# load the LoRA adapter saved by ModifiedTrainer.save_model
model = PeftModel.from_pretrained(model, "./output/")

instructions = json.load(open("data/alpaca_data.json"))
answers = []

with torch.no_grad():
    for idx, item in enumerate(instructions[:3]):
        feature = format_example(item)
        input_text = feature['context']
        ids = tokenizer.encode(input_text)
        input_ids = torch.LongTensor([ids])
        out = model.generate(
            input_ids=input_ids,
            max_length=150,
            do_sample=False,
            temperature=0,
        )
        out_text = tokenizer.decode(out[0])
        answer = out_text.replace(input_text, "").replace("\nEND", "").strip()
        item['infer_answer'] = answer
        print(out_text)
        print(f"### {idx+1}.Answer:\n", item.get('output'), '\n\n')
        answers.append({'index': idx, **item})

RLHF

Reward model

PPO

LoRA