> 文章列表 > 大语言模型-中文Langchain

大语言模型-中文Langchain

大语言模型-中文Langchain

中文Langchain

使用chatGLM6b + langchain实现本地化知识库检索与智能答案生成

https://github.com/yanqiangmiffy/Chinese-LangChain

配置

class LangChainCFG:
    """Static configuration shared by the knowledge-base QA services."""

    # Local model directory or Hugging Face Hub repository of the chat LLM.
    llm_model_name = 'chatglm-6b'
    # Local model directory or Hugging Face Hub repository of the retrieval (embedding) model.
    embedding_model_name = 'text2vec-large-chinese'
    # Directory the FAISS index is saved to and loaded from.
    vector_store_path = '.'
    # Directory scanned for the ``.txt`` knowledge documents.
    docs_path = './docs'
    # Optional mapping of knowledge-base name -> saved vector-store path.
    # LangChainApplication reads this attribute; None makes it build the
    # index from ``docs_path`` instead of loading a saved one.
    kg_vector_stores = None

向量近邻检索

文本向量化后存入向量数据库;langchain 包对 FAISS 做了哪些封装和改动,目前还不清楚。

class SourceService(object):
    """Build, persist, load and extend the FAISS vector store of the local knowledge base."""

    def __init__(self, config):
        """Create the embedding model and remember the configured paths.

        :param config: object exposing ``embedding_model_name``, ``docs_path``
            and ``vector_store_path`` attributes.
        """
        self.vector_store = None
        self.config = config
        self.embeddings = HuggingFaceEmbeddings(model_name=self.config.embedding_model_name)
        self.docs_path = self.config.docs_path
        self.vector_store_path = self.config.vector_store_path

    def init_source_vector(self):
        """Initialise the local knowledge-base vectors.

        Loads every ``.txt`` file listed in ``docs_path``, indexes the loaded
        documents with FAISS and saves the index to ``vector_store_path``.

        :return: None
        """
        docs = []
        for file_name in os.listdir(self.docs_path):
            if not file_name.endswith('.txt'):
                continue
            print(file_name)
            loader = UnstructuredFileLoader(os.path.join(self.docs_path, file_name), mode="elements")
            # Keep the loaded documents in their own expression so the loop
            # variable is never rebound to something that is not a file name.
            docs.extend(loader.load())
        self.vector_store = FAISS.from_documents(docs, self.embeddings)
        self.vector_store.save_local(self.vector_store_path)

    def add_document(self, document_path):
        """Index one more file and save the updated store.

        If no store has been built or loaded yet, a new one is created from
        this document instead of failing on the missing store.

        :param document_path: path of the file to load and index.
        """
        new_docs = UnstructuredFileLoader(document_path, mode="elements").load()
        if self.vector_store is None:
            self.vector_store = FAISS.from_documents(new_docs, self.embeddings)
        else:
            self.vector_store.add_documents(new_docs)
        self.vector_store.save_local(self.vector_store_path)

    def load_vector_store(self, path=None):
        """Load a saved FAISS index and return it.

        :param path: directory of the saved index; ``None`` falls back to the
            configured ``vector_store_path``.
        :return: the loaded vector store (also kept on ``self.vector_store``).
        """
        store_path = self.vector_store_path if path is None else path
        self.vector_store = FAISS.load_local(store_path, self.embeddings)
        return self.vector_store

    def search_web(self, query):
        """Return the concatenated ``body`` text of the ``ddg`` results for *query*.

        :param query: search string passed to ``ddg``.
        :return: all result bodies joined without a separator, or ``''`` when
            the search yields nothing.
        """
        # Point the shared duckduckgo_search session at the proxy on localhost:7890.
        SESSION.proxies = {
            "http": "socks5h://localhost:7890",
            "https": "socks5h://localhost:7890",
        }
        results = ddg(query)
        if not results:
            return ''
        return ''.join(result['body'] for result in results)

chatGLM

调用chatGLM

from typing import List, Optional

from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from transformers import AutoModel, AutoTokenizer


class ChatGLMService(LLM):
    """LangChain ``LLM`` subclass that answers prompts through a ChatGLM model's ``chat`` method."""

    # Passed to ``model.chat`` as ``max_length``.
    max_token: int = 10000
    # Sampling parameters forwarded to ``model.chat``.
    temperature: float = 0.1
    top_p: float = 0.9
    # Conversation history handed to ``model.chat`` and extended after every call.
    history: List = []
    # Both stay None until ``load_model`` is called.
    tokenizer: object = None
    model: object = None

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        """Identifier of this LLM implementation."""
        return "ChatGLM"

    def _call(self,
              prompt: str,
              stop: Optional[List[str]] = None) -> str:
        """Generate a reply for *prompt* and append it to ``history``.

        :param prompt: full prompt text sent to the model.
        :param stop: optional stop sequences applied to the reply through
            ``enforce_stop_tokens``.
        :return: the (possibly truncated) reply text.
        """
        response, _ = self.model.chat(
            self.tokenizer,
            prompt,
            history=self.history,
            max_length=self.max_token,
            temperature=self.temperature,
            # top_p is configurable on the service, so it must reach the model too.
            top_p=self.top_p,
        )
        if stop is not None:
            response = enforce_stop_tokens(response, stop)
        # NOTE(review): the query slot of the history entry is None rather than
        # the prompt -- presumably to keep the long retrieval prompt out of the
        # history; confirm this is what the model's chat format expects.
        self.history = self.history + [[None, response]]
        return response

    def load_model(self,
                   model_name_or_path: str = "THUDM/chatglm-6b"):
        """Load tokenizer and model, move the model to the GPU in half precision.

        :param model_name_or_path: value passed to ``from_pretrained`` for both
            the tokenizer and the model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
        self.model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True).half().cuda()
        self.model = self.model.eval()

langchain

原理相同:先把检索到的结果包装成 prompt,再调用 LLM 生成答案。

class LangChainApplication(object):
    """Combine the ChatGLM service and the vector store into a retrieval-QA pipeline."""

    # Runtime prompt text, kept verbatim. Both prompt variants start with the
    # instruction and end with the context/question slots.
    # NOTE(review): the prompt has no line breaks between its sections --
    # confirm whether separators were intended.
    _INSTRUCTION = (
        '基于以下已知信息,简洁和专业的来回答用户的问题。'
        '如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",'
        '不允许在答案中添加编造成分,答案请使用中文。'
    )
    _WEB_PREFIX = '已知网络检索内容:'
    _QA_SLOTS = '已知内容:{context}问题:{question}'

    def __init__(self, config):
        """Load the LLM and prepare the vector store.

        :param config: configuration object; ``llm_model_name`` is required,
            ``kg_vector_stores`` is optional (missing is treated like None).
        """
        self.config = config
        self.llm_service = ChatGLMService()
        self.llm_service.load_model(model_name_or_path=self.config.llm_model_name)
        self.source_service = SourceService(config)
        # A config without the attribute behaves like one that sets it to None.
        kg_vector_stores = getattr(self.config, 'kg_vector_stores', None)
        if kg_vector_stores is None:
            print("init a source vector store")
            self.source_service.init_source_vector()
        else:
            print("load zh_wikipedia source vector store ")
            try:
                self.source_service.load_vector_store(kg_vector_stores['初始化知识库'])
            except Exception as e:
                # Best effort: fall back to rebuilding the index, but say why.
                print(f"failed to load vector store ({e!r}), rebuilding it from docs")
                self.source_service.init_source_vector()

    @staticmethod
    def _build_prompt_template(web_content):
        """Return the prompt template text, embedding *web_content* when given.

        Braces inside *web_content* are doubled so the template engine treats
        them as literal characters, not as template variables.
        """
        if not web_content:
            return LangChainApplication._INSTRUCTION + LangChainApplication._QA_SLOTS
        escaped = web_content.replace('{', '{{').replace('}', '}}')
        return (LangChainApplication._INSTRUCTION
                + LangChainApplication._WEB_PREFIX
                + escaped
                + LangChainApplication._QA_SLOTS)

    def get_knowledge_based_answer(self, query,
                                   history_len=5,
                                   temperature=0.1,
                                   top_p=0.9,
                                   top_k=4,
                                   web_content='',
                                   chat_history=None):
        """Answer *query* from the knowledge base, optionally with web content.

        :param query: user question.
        :param history_len: number of trailing ``chat_history`` entries handed
            to the LLM; values <= 0 clear the history.
        :param temperature: sampling temperature set on the LLM service.
        :param top_p: ``top_p`` value set on the LLM service.
        :param top_k: ``k`` passed to the retriever's search arguments.
        :param web_content: extra web-search text embedded in the prompt.
        :param chat_history: previous conversation turns; None means empty.
        :return: the value returned by the RetrievalQA chain call, with
            ``return_source_documents`` enabled.
        """
        chat_history = [] if chat_history is None else chat_history
        prompt = PromptTemplate(template=self._build_prompt_template(web_content),
                                input_variables=["context", "question"])
        self.llm_service.history = chat_history[-history_len:] if history_len > 0 else []
        self.llm_service.temperature = temperature
        self.llm_service.top_p = top_p

        knowledge_chain = RetrievalQA.from_llm(
            llm=self.llm_service,
            retriever=self.source_service.vector_store.as_retriever(
                search_kwargs={"k": top_k}),
            prompt=prompt)
        # Each retrieved document is inserted into the prompt as its bare page content.
        knowledge_chain.combine_documents_chain.document_prompt = PromptTemplate(
            input_variables=["page_content"], template="{page_content}")
        knowledge_chain.return_source_documents = True

        result = knowledge_chain({"query": query})
        return result


# Usage example:
# if __name__ == '__main__':
#     config = LangChainCFG()
#     application = LangChainApplication(config)
#     result = application.get_knowledge_based_answer('马保国是谁')
#     print(result)
#     application.source_service.add_document('/home/searchgpt/yq/Knowledge-ChatGLM/docs/added/马保国.txt')
#     result = application.get_knowledge_based_answer('马保国是谁')
#     print(result)

web search

from duckduckgo_search import ddg
from duckduckgo_search.utils import SESSION

# Point both schemes of the shared duckduckgo_search session at the proxy on localhost:7890.
PROXY_URL = "socks5h://localhost:7890"
SESSION.proxies = {
    "http": PROXY_URL,
    "https": PROXY_URL,
}

r = ddg("马保国")
# Print only the first two results.
print(r[:2])
"""
[{'title': '马保国 - 维基百科,自由的百科全书', 'href': 'https://zh.wikipedia.org/wiki/%E9%A9%AC%E4%BF%9D%E5%9B%BD', 'body': '马保国(1951年 — ) ,男,籍贯 山东 临沂,出生及长大于河南,中国大陆太极拳师,自称"浑元形意太极门掌门人" 。 马保国因2017年约战mma格斗家徐晓冬首次出现
大众视野中。 2020年5月,马保国在对阵民间武术爱好者王庆民的比赛中,30秒内被连续高速击倒三次,此事件成为了持续多日的社交 ...'}, {'title': '馬保國的主页 - 抖音', 'href': 'https://www.douyin.com/user/MS4wLjABAAAAW0E1ziOvxgUh3VVv5FE6xmoo3w5WtZalfphYZKj4mCg', 'body': '6.3万. #马马国教扛打功 最近有几个人模芳我动作,很危险啊,不可以的,朋友们不要受伤了。. 5.3万. #马保国直播带货榜第一 朋友们周末愉快,本周六早上湿点,我本人在此号进行第一次带货直播,活到老,学到老,越活越年轻。. 7.0万. #马保国击破红牛罐 昨天 ...'}]
"""

webui

界面采用 Hugging Face 的 gradio 搭建

huggingface space

  • https://huggingface.co/spaces/launch
  • https://huggingface.co/docs/hub/spaces
  • 可以使用github actions 同步:https://github.com/marketplace/actions/sync-with-hugging-face-hub

名医百科知识库