
Large Language Models: DocumentSearch Explained

Continuing my study of large language models, I am still focusing on the application side; only later will I dig into the models themselves and the underlying theory.

DocumentSearch

https://github.com/yuanzhoulvpi2017/DocumentSearch

This repository is a concise demonstration of how to write LangChain-style code yourself. The overall approach is (a minimal sketch of these steps follows the list):

  • A sentence-embedding model (sentence transformers) converts the text into vectors
  • search_top_info selects the top-n most similar results
  • The hits are assembled into a prompt and sent to the LLM, which produces the final answer
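To make the three steps concrete before diving into the full class, here is a minimal sketch using the off-the-shelf sentence-transformers library for brevity (the repo actually hand-rolls its own SentenceVector class, shown further down); the corpus and the choice of library are my own illustration, not the repo's code:

# Minimal retrieve-then-read sketch. Assumes sentence-transformers is installed
# and `corpus` is a list of text chunks.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("hfl/chinese-roberta-wwm-ext")  # plain BERT; pooling is added automatically
corpus = ["chunk one ...", "chunk two ...", "chunk three ..."]

# Step 1: embed the corpus and the question.
corpus_vec = model.encode(corpus, normalize_embeddings=True)
question = "大学生创业有什么补贴"
q_vec = model.encode([question], normalize_embeddings=True)

# Step 2: cosine similarity (dot product of normalized vectors), keep top-n.
scores = (corpus_vec @ q_vec.T).flatten()
top_n = np.argsort(-scores)[:2]
context = "\n".join(corpus[i] for i in top_n)

# Step 3: stuff the hits into a prompt and ask the LLM.
prompt = f"基于以下已知信息回答问题。\n问题:{question}\n已知内容:{context}"
# response = llm_chat(prompt)  # e.g. ChatGLM's model.chat(tokenizer, prompt)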

The core is a KnowLedge class; to use it, you call knowledge.search_result():

# Dependencies; cal_detail_in_dir / transfile / clean_text_data /
# chunk_text4TransOutput / numpy_cos_sim are helpers from the repo.
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer


class KnowLedge:
    def __init__(self,
                 global_dir: str = None,
                 gen_model_name_or_path: str = "THUDM/chatglm-6b",
                 sen_embedding_model_name_or_path: str = "hfl/chinese-roberta-wwm-ext",
                 batch_top_k: int = 5) -> None:
        self.batch_top_k = batch_top_k

        # Collect all .pdf / .docx files under the directory, extract and
        # clean their text, drop empty files, then split the text into chunks.
        all_file_list = cal_detail_in_dir(global_dir)
        all_file_list = [Path(i) for i in all_file_list]
        all_file_list = [i for i in all_file_list if i.suffix in ['.pdf', '.docx']]
        all_trans_data = [transfile(i) for i in tqdm(all_file_list)]
        all_trans_data = [clean_text_data(i) for i in all_trans_data]
        all_trans_data = [i for i in all_trans_data if i.text_data.shape[0] > 0]
        all_trans_data = [chunk_text4TransOutput(i) for i in all_trans_data]

        # Embed every chunk once, up front.
        self.sv = SentenceVector(model_name_or_path=sen_embedding_model_name_or_path)
        all_vector = [self.sv.encode_fun_plus(i.text_data['chunk_text'].tolist())
                      for i in all_trans_data]
        self.all_trans_data = all_trans_data
        self.all_vector = all_vector

        # The generator LLM lives on GPU 1 (the embedder defaults to GPU 0).
        self.gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name_or_path, trust_remote_code=True)
        self.gen_model = AutoModel.from_pretrained(gen_model_name_or_path, trust_remote_code=True).half().cuda(1)

    def search_top_info(self, index: int, question_vector: np.ndarray) -> pd.DataFrame:
        # Cosine similarity between the question and every chunk of one file,
        # then keep the batch_top_k best-scoring chunks (or all, if fewer).
        similar_score = numpy_cos_sim(self.all_vector[index], question_vector).flatten()
        if similar_score.shape[0] < self.batch_top_k:
            res = (self.all_trans_data[index].text_data
                   .reset_index(drop=True)
                   .pipe(lambda x: x.assign(**{'score': similar_score}))
                   .pipe(lambda x: x.assign(**{'file_name': self.all_trans_data[index].file_name,
                                               'file_path': self.all_trans_data[index].file_type})))
        else:
            top_k_location = np.argpartition(similar_score, kth=-self.batch_top_k)[-self.batch_top_k:]
            res = (self.all_trans_data[index].text_data
                   .reset_index(drop=True)
                   .iloc[top_k_location]
                   .pipe(lambda x: x.assign(**{'score': similar_score[top_k_location]}))
                   .pipe(lambda x: x.assign(**{'file_name': self.all_trans_data[index].file_name,
                                               'file_path': self.all_trans_data[index].file_type})))
        return res

    def search_result(self, question_str: str) -> Tuple[str, pd.DataFrame]:
        # e.g. question_str = "大学生创业有什么补贴"
        question_vector = self.sv.encode_fun([question_str])

        # Gather the per-file top hits, rank them globally, de-duplicate,
        # and keep the 30 best chunks as context.
        search_table_info = pd.concat(
            [self.search_top_info(index, question_vector) for index in range(len(self.all_vector))]
        ).pipe(lambda x: x.sort_values(by=['score'], ascending=False))
        search_table = search_table_info.drop_duplicates(['chunk_text']).head(30)
        search_text_list = search_table['chunk_text'].tolist()

        # Prompt: answer concisely from the given context only; if the answer
        # is not in the context, say so rather than making something up.
        prompt_template = """基于以下已知信息,简洁和专业的来回答用户的问题。如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。

问题:
{question}

已知内容:
{context}"""
        text2chatglm = prompt_template.format_map({'question': question_str,
                                                   'context': '\n'.join(search_text_list)})
        response, history = self.gen_model.chat(self.gen_tokenizer, text2chatglm, history=[])
        torch.cuda.empty_cache()
        return response, search_table
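Note that cal_detail_in_dir, transfile, clean_text_data, chunk_text4TransOutput, and numpy_cos_sim are helpers defined elsewhere in the repo and not shown in this post. Judging from how it is called above, numpy_cos_sim computes pairwise cosine similarity; a minimal sketch consistent with that usage (not the repo's exact code) would be:

import numpy as np

def numpy_cos_sim(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # Pairwise cosine similarity between rows of a (n, d) and rows of b (m, d),
    # returning an (n, m) score matrix; eps avoids division by zero.
    eps = 1e-8
    a_norm = a / np.maximum(np.linalg.norm(a, axis=1, keepdims=True), eps)
    b_norm = b / np.maximum(np.linalg.norm(b, axis=1, keepdims=True), eps)
    return a_norm @ b_norm.T

Using the class is then two lines (the directory path here is a placeholder):

knowledge = KnowLedge(global_dir="./my_docs")  # a folder of .pdf / .docx files
response, search_table = knowledge.search_result("大学生创业有什么补贴")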

The class that converts text sentences into vectors:

# Dependencies; cleanquestion is a text-normalization helper from the repo.
from typing import List

import numpy as np
import torch as t
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer


class SentenceVector:
    def __init__(self,
                 model_name_or_path: str = None,
                 device: str = "cuda:0") -> None:
        self.model_name_or_path = model_name_or_path
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        self.model = AutoModel.from_pretrained(self.model_name_or_path)
        self.model.to(self.device)

    def encode_fun(self, texts: List[str]) -> np.ndarray:
        texts = [cleanquestion(i) for i in texts]
        inputs = self.tokenizer.batch_encode_plus(
            texts, padding=True, truncation=True, return_tensors="pt", max_length=64)
        inputs = inputs.to(device=self.device)
        with t.no_grad():
            # Use the [CLS] token's last hidden state as the sentence vector.
            embeddings = self.model(**inputs)
            embeddings = embeddings.last_hidden_state[:, 0]
            embeddings = embeddings.to('cpu').numpy()
        return embeddings

    def encode_fun_plus(self, texts: List[str], batch_size: int = 100) -> np.ndarray:
        # Encode in batches of batch_size and stack the results.
        embeddings = np.concatenate(
            [self.encode_fun(texts[i:(i + batch_size)])
             for i in tqdm(range(0, len(texts), batch_size))])
        return embeddings
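A quick sanity check (the sentences are placeholders, and the repo's cleanquestion helper must be importable): each input is truncated to 64 tokens and represented by its [CLS] vector, so for a base-size model such as chinese-roberta-wwm-ext the output is one 768-dimensional row per sentence:

sv = SentenceVector(model_name_or_path="hfl/chinese-roberta-wwm-ext")
vecs = sv.encode_fun(["集成电路企业有什么补贴", "大学生创业扶持政策"])
print(vecs.shape)  # (2, 768)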

The LLM itself can be an open-source model deployed locally:

self.gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name_or_path, trust_remote_code=True)
self.gen_model = AutoModel.from_pretrained(gen_model_name_or_path, trust_remote_code=True).half().cuda(1)

Sentence embedding model: hfl/chinese-roberta-wwm-ext
LLM: THUDM/chatglm-6b
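ChatGLM-6B exposes a chat() method through its trust_remote_code model class; the model card's quickstart looks like this (here on the default GPU rather than the repo's cuda(1)):

from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
model = model.eval()
response, history = model.chat(tokenizer, "你好", history=[])
print(response)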

Web UI

A web UI has also become a must-have for the currently hot wave of AIGC applications.

The author implements it with Streamlit.
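The post does not reproduce the author's UI code, but a minimal Streamlit wrapper around the KnowLedge class might look like this sketch (the file name, labels, and cached loader are my own):

# app.py -- run with: streamlit run app.py
import streamlit as st

@st.cache_resource  # build the index and load both models only once
def load_knowledge():
    return KnowLedge(global_dir="./my_docs")  # placeholder path

st.title("DocumentSearch")
knowledge = load_knowledge()

question = st.text_input("输入问题")
if st.button("搜索") and question:
    response, search_table = knowledge.search_result(question)
    st.write(response)          # the LLM's answer
    st.dataframe(search_table)  # retrieved chunks with similarity scores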