前言
有时间再写吧
评估指标
llama-Index 内置了评估工具,今天教大家如何使用
llama-Index 有以下评估指标:
-
Answer Relevancy
-
Context Relevancy
-
Relevancy
-
Faithfulness
-
Correctness
感兴趣可以去 llama_index.core.evaluation 文件查看
当然llama-Index 还提供了测试数据的生成功能,可以帮助我们轻松地生成评估所需的测试数据,包括评估的问题、参考答案等,这样我们就可以快速地进行评估工作,而不需要花费大量的时间去准备测试数据
生成测试数据
from llama_index.core.llama_dataset.rag import LabelledRagDataset
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.evaluation import AnswerRelevancyEvaluator
from llama_index.core.node_parser import SentenceSplitter
import os
from typing import Any
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from datetime import datetime
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.evaluation import ContextRelevancyEvaluator


class GLMCustomLLM(CustomLLM):
    """llama-index CustomLLM wrapper around a locally loaded GLM chat model."""

    context_window: int = 128000  # model context window size
    num_output: int = 18000  # maximum number of generated tokens
    model_name: str = "glm-4-9b-chat"
    tokenizer: object = None  # HF tokenizer, set in __init__
    model: object = None  # HF causal LM, set in __init__
    dummy_response: str = "My response"

    def __init__(self, pretrained_model_name_or_path):
        """Load the tokenizer and model from a local path onto the GPU."""
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True
        ).eval()
        # Cast to float32 — presumably for numerical stability on this setup.
        self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Generate a full completion for ``prompt`` in one shot."""
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # GPU path
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        """Generate a completion and stream it back.

        NOTE(review): iterating ``response`` yields single characters, not model
        tokens — kept exactly as in the original implementation.
        """
        print("流式完成函数")
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # GPU path
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        for token in response:
            yield CompletionResponse(text=token, delta=token)


llm = GLMCustomLLM(pretrained_model_name_or_path='/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat')
embed_model_path = '/home/kelvin/nlp/model/Embedding/BAAI/bge-m3'

# Folder containing the source documents to build test questions from.
documents = SimpleDirectoryReader("/home/kelvin/nlp/GLM-4/basic_demo/input").load_data()
dataset_generator = RagDatasetGenerator.from_documents(
    documents,
    llm=llm,
    num_questions_per_chunk=1,  # one evaluation question per document chunk
)

# Generate the evaluation questions and print a numbered preview.
dataset = dataset_generator.generate_questions_from_nodes()
examples = dataset.examples
for i, example in enumerate(examples):
    print(f"{i + 1}. {example.query}")

# Path where the generated test dataset is cached as JSON.
dataset_json = "/home/kelvin/nlp/GLM-4/basic_demo/test-dataset.json"
if not os.path.exists(dataset_json):
    # First run: generate the full dataset (questions + reference answers) and save it.
    dataset = dataset_generator.generate_dataset_from_nodes()
    examples = dataset.examples
    dataset.save_json(dataset_json)
else:
    # Subsequent runs: reuse the cached dataset instead of regenerating it.
    dataset = LabelledRagDataset.from_json(dataset_json)
    examples = dataset.examples
Answer Relevancy 与 Context Relevancy
Answer Relevancy 是评估 Answer 和 Question 的相关性,这个指标可以帮助我们评估生成的答案是否和问题相关
Context Relevancy 是评估 Context 和 Question 的相关性,这个指标可以帮助我们评估检索到的文档上下文和问题的相关性
Answer Relevancy:将问题和答案传递给AnswerRelevancyEvaluator评估器,通过evaluate方法来评估问题和答案的相关性
Context Relevancy:将问题和检索到的文档上下文传递给ContextRelevancyEvaluator评估器,通过evaluate方法来评估问题和上下文的相关性
评估结果的score范围是 0~1,得分越高表示答案和问题的相关性越高,得分为 1 表示完全相关
评估结果中还有feedback属性,用来解释评估结果,这个属性可以帮助我们了解评估结果的产生原因
示例代码:
from llama_index.core.llama_dataset.rag import LabelledRagDataset
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.evaluation import AnswerRelevancyEvaluator
from llama_index.core.node_parser import SentenceSplitter
import os
from typing import Any
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from datetime import datetime
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.evaluation import ContextRelevancyEvaluator


class GLMCustomLLM(CustomLLM):
    """llama-index CustomLLM wrapper around a locally loaded GLM chat model."""

    context_window: int = 128000  # model context window size
    num_output: int = 18000  # maximum number of generated tokens
    model_name: str = "glm-4-9b-chat"
    tokenizer: object = None  # HF tokenizer, set in __init__
    model: object = None  # HF causal LM, set in __init__
    dummy_response: str = "My response"

    def __init__(self, pretrained_model_name_or_path):
        """Load the tokenizer and model from a local path onto the GPU."""
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True
        ).eval()
        # Cast to float32 — presumably for numerical stability on this setup.
        self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Generate a full completion for ``prompt`` in one shot."""
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # GPU path
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        """Generate a completion and stream it back.

        NOTE(review): iterating ``response`` yields single characters, not model
        tokens — kept exactly as in the original implementation.
        """
        print("流式完成函数")
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # GPU path
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        for token in response:
            yield CompletionResponse(text=token, delta=token)


llm = GLMCustomLLM(pretrained_model_name_or_path='/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat')
embed_model_path = '/home/kelvin/nlp/model/Embedding/BAAI/bge-m3'

documents = SimpleDirectoryReader("/home/kelvin/nlp/GLM-4/basic_demo/input").load_data()

# Load the previously generated test dataset and take its first question.
dataset_json = "/home/kelvin/nlp/GLM-4/basic_demo/test-dataset.json"
dataset = LabelledRagDataset.from_json(dataset_json)
examples = dataset.examples
question = examples[0].query

# Build a vector index over sentence-split nodes and query it.
node_parser = SentenceSplitter()
nodes = node_parser.get_nodes_from_documents(documents)
Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(model_name=embed_model_path, device='cuda')
print(f'检索中...')
vector_index = VectorStoreIndex(nodes)
engine = vector_index.as_query_engine()
response = engine.query(question)
answer = str(response)

print(f"question={question}")
print(f'************')
print(f"Answer: {answer}")

# Answer relevancy: how relevant the generated answer is to the question (score 0~1).
evaluator = AnswerRelevancyEvaluator(llm)
result = evaluator.evaluate(query=question, response=answer)
print(f"score: {result.score}")
print(f"feedback: {result.feedback}")
print(f'*****----------******')

# Context relevancy: how relevant the retrieved contexts are to the question (score 0~1).
contexts = [n.get_content() for n in response.source_nodes]
evaluator = ContextRelevancyEvaluator(llm)
result = evaluator.evaluate(query=question, contexts=contexts)
print(f"ContextRelevancy_score: {result.score}")
print(f"ContextRelevancy_feedback: {result.feedback}")
print(f"ContextRelevancy_query: {result.query}")  # fixed: original had a stray '"' inside the format string
欢迎大家点赞或收藏~
大家的点赞或收藏可以鼓励作者加快更新哟~