Schemas

BaseSample

Bases: BaseModel

Base class for evaluation samples.

to_dict

to_dict() -> Dict

Get the dictionary representation of the sample without attributes that are None.

Source code in src/ragas/dataset_schema.py

def to_dict(self) -> t.Dict:
    """
    Serialize the sample into a plain dictionary.

    Fields whose value is None are left out of the result.
    """
    serialized = self.model_dump(exclude_none=True)
    return serialized

get_features

get_features() -> List[str]

Get the features of the sample that are not None.

Source code in src/ragas/dataset_schema.py

def get_features(self) -> t.List[str]:
    """
    List the names of the fields that are currently set (i.e. not None).

    The names are the keys of ``to_dict()``, in the order it returns them.
    """
    return [field_name for field_name in self.to_dict()]

SingleTurnSample

Bases: BaseSample

Represents evaluation samples for single-turn interactions.

Attributes:

Name	Type	Description
`user_input`	`Optional[str]`	The input query from the user.
`retrieved_contexts`	`Optional[List[str]]`	List of contexts retrieved for the query.
`reference_contexts`	`Optional[List[str]]`	List of reference contexts for the query.
`response`	`Optional[str]`	The generated response for the query.
`multi_responses`	`Optional[List[str]]`	List of multiple responses generated for the query.
`reference`	`Optional[str]`	The reference answer for the query.
`rubric`	`Optional[Dict[str, str]]`	Evaluation rubric for the sample.

MultiTurnSample

Bases: BaseSample

Represents evaluation samples for multi-turn interactions.

Attributes:

Name	Type	Description
`user_input`	`List[Union[HumanMessage, AIMessage, ToolMessage]]`	A list of messages representing the conversation turns.
`reference`	`(Optional[str], optional)`	The reference answer or expected outcome for the conversation.
`reference_tool_calls`	`(Optional[List[ToolCall]], optional)`	A list of expected tool calls for the conversation.
`rubrics`	`(Optional[Dict[str, str]], optional)`	Evaluation rubrics for the conversation.
`reference_topics`	`(Optional[List[str]], optional)`	A list of reference topics for the conversation.

validate_user_input `classmethod`

validate_user_input(messages: List[Union[HumanMessage, AIMessage, ToolMessage]]) -> List[Union[HumanMessage, AIMessage, ToolMessage]]

Validates the user input messages.

Source code in src/ragas/dataset_schema.py

@field_validator("user_input")
@classmethod
def validate_user_input(
    cls,
    messages: t.List[t.Union[HumanMessage, AIMessage, ToolMessage]],
) -> t.List[t.Union[HumanMessage, AIMessage, ToolMessage]]:
    """
    Validates the user input messages.

    Two rules are enforced:

    1. Every element is a HumanMessage, AIMessage, or ToolMessage.
    2. Every ToolMessage directly follows an AIMessage whose ``tool_calls``
       is not None.

    Parameters
    ----------
    messages : list of HumanMessage, AIMessage or ToolMessage
        The conversation turns to validate.

    Returns
    -------
    list of HumanMessage, AIMessage or ToolMessage
        The same ``messages`` list, unchanged.

    Raises
    ------
    ValueError
        If either rule above is violated.
    """
    # all() is required: a bare generator expression is always truthy, so
    # negating it directly would never trigger the error below.
    if not all(
        isinstance(m, (HumanMessage, AIMessage, ToolMessage)) for m in messages
    ):
        raise ValueError(
            "All inputs must be instances of HumanMessage, AIMessage, or ToolMessage."
        )

    # Track the previous turn so each ToolMessage can be checked against it.
    prev_message = None
    for m in messages:
        if isinstance(m, ToolMessage):
            if not isinstance(prev_message, AIMessage):
                raise ValueError(
                    "ToolMessage instances must be preceded by an AIMessage instance."
                )
            if prev_message.tool_calls is None:
                raise ValueError(
                    f"ToolMessage instances must be preceded by an AIMessage instance with tool_calls. Got {prev_message}"
                )
        prev_message = m

    return messages

to_messages

to_messages()

Converts the user input messages to a list of dictionaries.

Source code in src/ragas/dataset_schema.py

def to_messages(self):
    """Return one dictionary per conversation message, produced by each message's ``model_dump()``."""
    dumped = []
    for message in self.user_input:
        dumped.append(message.model_dump())
    return dumped

pretty_repr

pretty_repr()

Returns a pretty string representation of the conversation.

Source code in src/ragas/dataset_schema.py

def pretty_repr(self):
    """Render the conversation as text by joining each message's own ``pretty_repr()`` with newlines."""
    return "\n".join(message.pretty_repr() for message in self.user_input)

RagasDataset

Bases: BaseModel, Generic[Sample]

validate_samples

validate_samples(samples: List[BaseSample]) -> List[BaseSample]

Validates that all samples are of the same type.

Source code in src/ragas/dataset_schema.py

@field_validator("samples")
def validate_samples(cls, samples: t.List[BaseSample]) -> t.List[BaseSample]:
    """
    Check that the samples are homogeneous.

    The type of the first sample is the reference; every sample must be an
    instance of it. An empty list is accepted as-is. The input list is
    returned unchanged, and a ValueError is raised on a mismatch.
    """
    if len(samples) > 0:
        expected_type = type(samples[0])
        for candidate in samples:
            if not isinstance(candidate, expected_type):
                raise ValueError("All samples must be of the same type")

    return samples

get_sample_type

get_sample_type() -> Type[Sample]

Returns the type of the samples in the dataset.

Source code in src/ragas/dataset_schema.py

def get_sample_type(self) -> t.Type[Sample]:
    """Return the class of the first sample, which stands for the sample type of the whole dataset."""
    first_sample = self.samples[0]
    return type(first_sample)

to_hf_dataset

to_hf_dataset() -> Dataset

Converts the dataset to a Hugging Face Dataset.

Source code in src/ragas/dataset_schema.py

def to_hf_dataset(self) -> HFDataset:
    """
    Build a Hugging Face ``Dataset`` from the list produced by ``self._to_list()``.

    ``datasets`` is imported lazily; an ImportError with an installation hint
    is raised when it is unavailable.
    """
    try:
        from datasets import Dataset as HFDataset
    except ImportError:
        message = "datasets is not installed. Please install it to use this function."
        raise ImportError(message)

    records = self._to_list()
    return HFDataset.from_list(records)

from_hf_dataset `classmethod`

from_hf_dataset(dataset: Dataset)

Creates an EvaluationDataset from a Hugging Face Dataset.

Source code in src/ragas/dataset_schema.py

@classmethod
def from_hf_dataset(cls, dataset: HFDataset):
    """Build a dataset from a Hugging Face Dataset by passing its ``to_list()`` output to ``from_list``."""
    rows = dataset.to_list()
    return cls.from_list(rows)

to_pandas

to_pandas() -> DataFrame

Converts the dataset to a pandas DataFrame.

Source code in src/ragas/dataset_schema.py

def to_pandas(self) -> PandasDataframe:
    """
    Build a pandas DataFrame from the list produced by ``self._to_list()``.

    pandas is imported lazily; an ImportError with an installation hint is
    raised when it is unavailable.
    """
    try:
        import pandas as pd
    except ImportError:
        message = "pandas is not installed. Please install it to use this function."
        raise ImportError(message)

    return pd.DataFrame(self._to_list())

features

features()

Returns the features of the samples.

Source code in src/ragas/dataset_schema.py

def features(self):
    """Return the feature names reported by the first sample's ``get_features()``."""
    first_sample = self.samples[0]
    return first_sample.get_features()

from_list `classmethod`

from_list(mapping: List[Dict])

Creates an EvaluationDataset from a list of dictionaries.

Source code in src/ragas/dataset_schema.py

@classmethod
def from_list(cls, mapping: t.List[t.Dict]):
    """
    Creates an EvaluationDataset from a list of dictionaries.

    Each dictionary holds the fields of one sample. When every dictionary
    has a ``user_input`` key whose value is a list, the dictionaries are
    loaded as ``MultiTurnSample``; otherwise they are loaded as
    ``SingleTurnSample``. An empty list satisfies the multi-turn condition
    vacuously and produces a dataset without samples.

    Parameters
    ----------
    mapping : list of dict
        One dictionary of sample fields per sample.

    Returns
    -------
    An instance of ``cls`` built with ``samples=<list of samples>``.
    """
    # Check each item's own ``user_input`` rather than re-testing the first
    # item's value on every iteration.
    is_multi_turn = all(
        "user_input" in item and isinstance(item["user_input"], list)
        for item in mapping
    )
    sample_cls = MultiTurnSample if is_multi_turn else SingleTurnSample
    samples = [sample_cls(**item) for item in mapping]
    return cls(samples=samples)

from_dict `classmethod`

from_dict(mapping: Dict)

Creates an EvaluationDataset from a dictionary.

Source code in src/ragas/dataset_schema.py

@classmethod
def from_dict(cls, mapping: t.Dict):
    """
    Creates an EvaluationDataset from a dictionary.

    NOTE(review): although ``mapping`` is annotated as a dict, the body
    indexes ``mapping[0]``, iterates over ``mapping`` and unpacks every
    element with ``**``. In other words it handles ``mapping`` as a sequence
    of per-sample dictionaries, the same way ``from_list`` does. Confirm the
    intended input shape before relying on this method with a real dict.
    """
    samples = []
    # Multi-turn branch: taken when every element contains "user_input" and
    # the first element's "user_input" is a list.
    if all(
        "user_input" in item and isinstance(mapping[0]["user_input"], list)
        for item in mapping
    ):
        samples.extend(MultiTurnSample(**sample) for sample in mapping)
    else:
        # Otherwise every element is loaded as a single-turn sample.
        samples.extend(SingleTurnSample(**sample) for sample in mapping)
    return cls(samples=samples)

from_csv `classmethod`

from_csv(path: str)

Creates an EvaluationDataset from a CSV file.

Source code in src/ragas/dataset_schema.py

@classmethod
def from_csv(cls, path: str):
    """Creates an EvaluationDataset from a CSV file."""
    import csv

    with open(path, "r", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        data = [row for row in reader]
    return cls.from_list(data)

to_csv

to_csv(path: str)

Converts the dataset to a CSV file.

Source code in src/ragas/dataset_schema.py

def to_csv(self, path: str):
    """
    Converts the dataset to a CSV file.

    The rows come from ``self._to_list()``. Nothing is written (and no file
    is created) when that list is empty.

    The header starts with ``self.features()`` (the first sample's features)
    and is extended, in first-seen order, with any additional key found in
    later rows. Without this extension ``csv.DictWriter`` raises ValueError
    for a row that carries a key missing from the header. Keys absent from
    a given row are written as empty cells.

    Parameters
    ----------
    path : str
        Destination path of the CSV file.
    """
    import csv

    data = self._to_list()
    if not data:
        return

    # A dict preserves insertion order, so updating it with each row's keys
    # keeps the original header first and appends only unseen keys.
    ordered_fields = dict.fromkeys(self.features())
    for row in data:
        ordered_fields.update(dict.fromkeys(row))
    fieldnames = list(ordered_fields)

    with open(path, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

to_jsonl

to_jsonl(path: str)

Converts the dataset to a JSONL file.

Source code in src/ragas/dataset_schema.py

def to_jsonl(self, path: str):
    """
    Write the dataset to ``path`` in JSON Lines format.

    Each sample's ``to_dict()`` is serialized with ``json.dumps`` (non-ASCII
    characters are kept as-is) and written on its own line.
    """
    with open(path, "w") as jsonlfile:
        jsonlfile.writelines(
            json.dumps(sample.to_dict(), ensure_ascii=False) + "\n"
            for sample in self.samples
        )

from_jsonl `classmethod`

from_jsonl(path: str)

Creates an EvaluationDataset from a JSONL file.

Source code in src/ragas/dataset_schema.py

@classmethod
def from_jsonl(cls, path: str):
    """Creates an EvaluationDataset from a JSONL file."""
    with open(path, "r") as jsonlfile:
        data = [json.loads(line) for line in jsonlfile]
    return cls.from_list(data)

EvaluationDataset

Bases: RagasDataset[SingleTurnSampleOrMultiTurnSample]

Represents a dataset of evaluation samples.

Attributes:

Name	Type	Description
`samples`	`List[BaseSample]`	A list of evaluation samples.

Methods:

Name	Description
`validate_samples`	Validates that all samples are of the same type.
`get_sample_type`	Returns the type of the samples in the dataset.
`to_hf_dataset`	Converts the dataset to a Hugging Face Dataset.
`to_pandas`	Converts the dataset to a pandas DataFrame.
`features`	Returns the features of the samples.
`from_list`	Creates an EvaluationDataset from a list of dictionaries.
`from_dict`	Creates an EvaluationDataset from a dictionary.
`from_csv`	Creates an EvaluationDataset from a CSV file.
`to_csv`	Converts the dataset to a CSV file.
`to_jsonl`	Converts the dataset to a JSONL file.
`from_jsonl`	Creates an EvaluationDataset from a JSONL file.

EvaluationResult `dataclass`

EvaluationResult(scores: List[Dict[str, Any]], dataset: Optional[EvaluationDataset] = None, binary_columns: List[str] = list(), cost_cb: Optional[CostCallbackHandler] = None)

A class to store and process the results of the evaluation.

Attributes:

Name	Type	Description
`scores`	`List[Dict[str, Any]]`	The scores of the evaluation, one dictionary per evaluated sample.
`dataset`	`(EvaluationDataset, optional)`	The original dataset used for the evaluation. Default is None.
`binary_columns`	`list of str, optional`	List of columns that are binary metrics. Default is an empty list.
`cost_cb`	`(CostCallbackHandler, optional)`	The callback handler for cost computation. Default is None.

to_pandas

to_pandas(batch_size: int | None = None, batched: bool = False)

Convert the result to a pandas DataFrame.

Parameters:

Name	Type	Description	Default
`batch_size`	`int`	The batch size for conversion. Default is None.	`None`
`batched`	`bool`	Whether to convert in batches. Default is False.	`False`

Returns:

Type	Description
`DataFrame`	The result as a pandas DataFrame.

Raises:

Type	Description
`ValueError`	If the dataset is not provided.

Source code in src/ragas/dataset_schema.py

def to_pandas(self, batch_size: int | None = None, batched: bool = False):
    """
    Return the evaluated samples and their scores side by side as a DataFrame.

    The dataset columns come first, followed by the score columns.

    Parameters
    ----------
    batch_size : int, optional
        The batch size for conversion. Default is None. Not referenced by
        the body of this method.
    batched : bool, optional
        Whether to convert in batches. Default is False. Not referenced by
        the body of this method.

    Returns
    -------
    pandas.DataFrame
        The column-wise concatenation of the dataset and the scores.

    Raises
    ------
    ImportError
        If pandas is not installed.
    ValueError
        If the dataset is not provided.
    """
    try:
        import pandas as pd
    except ImportError:
        message = "pandas is not installed. Please install it to use this function."
        raise ImportError(message)

    if self.dataset is None:
        raise ValueError("dataset is not provided for the results class")
    # Scores and samples are expected to line up one-to-one.
    assert len(self.scores) == len(self.dataset)

    score_frame = pd.DataFrame(self.scores)
    sample_frame = self.dataset.to_pandas()
    return pd.concat([sample_frame, score_frame], axis=1)

total_tokens

total_tokens() -> Union[List[TokenUsage], TokenUsage]

Compute the total tokens used in the evaluation.

Returns:

Type	Description
`list of TokenUsage or TokenUsage`	The total tokens used.

Raises:

Type	Description
`ValueError`	If the cost callback handler is not provided.

Source code in src/ragas/dataset_schema.py

def total_tokens(self) -> t.Union[t.List[TokenUsage], TokenUsage]:
    """
    Report the token usage recorded during the evaluation.

    The value is whatever the cost callback handler's ``total_tokens()``
    returns.

    Returns
    -------
    list of TokenUsage or TokenUsage
        The total tokens used.

    Raises
    ------
    ValueError
        If the cost callback handler is not provided.
    """
    handler = self.cost_cb
    if handler is not None:
        return handler.total_tokens()
    raise ValueError(
        "The evaluate() run was not configured for computing cost. Please provide a token_usage_parser function to evaluate() to compute cost."
    )

total_cost

total_cost(cost_per_input_token: Optional[float] = None, cost_per_output_token: Optional[float] = None, per_model_costs: Dict[str, Tuple[float, float]] = {}) -> float

Compute the total cost of the evaluation.

Parameters:

Name	Type	Description	Default
`cost_per_input_token`	`float`	The cost per input token. Default is None.	`None`
`cost_per_output_token`	`float`	The cost per output token. Default is None.	`None`
`per_model_costs`	`dict of str to tuple of float`	The per model costs. Default is an empty dictionary.	`{}`

Returns:

Type	Description
`float`	The total cost of the evaluation.

Raises:

Type	Description
`ValueError`	If the cost callback handler is not provided.

Source code in src/ragas/dataset_schema.py

def total_cost(
    self,
    cost_per_input_token: t.Optional[float] = None,
    cost_per_output_token: t.Optional[float] = None,
    per_model_costs: t.Dict[str, t.Tuple[float, float]] = {},
) -> float:
    """
    Report the cost of the evaluation.

    The three arguments are forwarded positionally, unchanged, to the cost
    callback handler's ``total_cost``.

    Parameters
    ----------
    cost_per_input_token : float, optional
        The cost per input token. Default is None.
    cost_per_output_token : float, optional
        The cost per output token. Default is None.
    per_model_costs : dict of str to tuple of float, optional
        The per model costs. Default is an empty dictionary.

    Returns
    -------
    float
        The total cost of the evaluation.

    Raises
    ------
    ValueError
        If the cost callback handler is not provided.
    """
    handler = self.cost_cb
    if handler is not None:
        return handler.total_cost(
            cost_per_input_token, cost_per_output_token, per_model_costs
        )
    raise ValueError(
        "The evaluate() run was not configured for computing cost. Please provide a token_usage_parser function to evaluate() to compute cost."
    )

Message

Bases: BaseModel

Represents a generic message.

Attributes:

Name	Type	Description
`content`	`str`	The content of the message.
`metadata`	`(Optional[Dict[str, Any]], optional)`	Additional metadata associated with the message.

ToolCall

Bases: BaseModel

Represents a tool call with a name and arguments.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the tool being called.	required
`args`	`Dict[str, Union[str, int, float]]`	A dictionary of arguments for the tool call, where keys are argument names and values can be strings, integers, or floats.	required

HumanMessage

Bases: Message

Represents a message from a human user.

Attributes:

Name	Type	Description
`type`	`Literal[human]`	The type of the message, always set to "human".

Methods:

Name	Description
`pretty_repr`	Returns a formatted string representation of the human message.

pretty_repr

pretty_repr()

Returns a formatted string representation of the human message.

Source code in src/ragas/messages.py

def pretty_repr(self):
    """Render the message as its content prefixed with ``Human: ``."""
    return "Human: {}".format(self.content)

ToolMessage

Bases: Message

Represents a message from a tool.

Attributes:

Name	Type	Description
`type`	`Literal[tool]`	The type of the message, always set to "tool".

Methods:

Name	Description
`pretty_repr`	Returns a formatted string representation of the tool message.

pretty_repr

pretty_repr()

Returns a formatted string representation of the tool message.

Source code in src/ragas/messages.py

def pretty_repr(self):
    """Render the message as its content prefixed with ``ToolOutput: ``."""
    return "ToolOutput: {}".format(self.content)

AIMessage

Bases: Message

Represents a message from an AI.

Attributes:

Name	Type	Description
`type`	`Literal[ai]`	The type of the message, always set to "ai".
`tool_calls`	`Optional[List[ToolCall]]`	A list of tool calls made by the AI, if any.
`metadata`	`Optional[Dict[str, Any]]`	Additional metadata associated with the AI message.

Methods:

Name	Description
`to_dict`	Returns a dictionary representation of the AI message.
`pretty_repr`	Returns a formatted string representation of the AI message.

to_dict

to_dict(**kwargs)

Returns a dictionary representation of the AI message.

Source code in src/ragas/messages.py

def to_dict(self, **kwargs):
    """
    Returns a dictionary representation of the AI message.

    The result has two keys, ``"content"`` and ``"type"``. When the message
    has no tool calls, ``"content"`` is the message text. Otherwise it is a
    dictionary with the text under ``"text"`` and the serialized tool calls
    under ``"tool_calls"``.

    Parameters
    ----------
    **kwargs
        Accepted but not used by this method.

    Returns
    -------
    dict
        ``{"content": ..., "type": ...}`` as described above.
    """
    if self.tool_calls is None:
        content = self.content
    else:
        content = {
            "text": self.content,
            # model_dump() replaces the deprecated pydantic ``.dict()`` call.
            "tool_calls": [tc.model_dump() for tc in self.tool_calls],
        }
    return {"content": content, "type": self.type}

pretty_repr

pretty_repr()

Returns a formatted string representation of the AI message.

Source code in src/ragas/messages.py

def pretty_repr(self):
    """
    Render the AI message as text.

    A non-empty content produces an ``AI: ...`` line. When ``tool_calls`` is
    not None, a ``Tools:`` line follows, then one indented ``name: args``
    line per tool call.
    """
    rendered = [f"AI: {self.content}"] if self.content != "" else []
    if self.tool_calls is not None:
        rendered.append("Tools:")
        rendered.extend(f"  {tc.name}: {tc.args}" for tc in self.tool_calls)

    return "\n".join(rendered)

ragas.evaluation.EvaluationResult `dataclass`

EvaluationResult(scores: List[Dict[str, Any]], dataset: Optional[EvaluationDataset] = None, binary_columns: List[str] = list(), cost_cb: Optional[CostCallbackHandler] = None)

A class to store and process the results of the evaluation.

Attributes:

Name	Type	Description
`scores`	`List[Dict[str, Any]]`	The scores of the evaluation, one dictionary per evaluated sample.
`dataset`	`(EvaluationDataset, optional)`	The original dataset used for the evaluation. Default is None.
`binary_columns`	`list of str, optional`	List of columns that are binary metrics. Default is an empty list.
`cost_cb`	`(CostCallbackHandler, optional)`	The callback handler for cost computation. Default is None.

to_pandas

to_pandas(batch_size: int | None = None, batched: bool = False)

Convert the result to a pandas DataFrame.

Parameters:

Name	Type	Description	Default
`batch_size`	`int`	The batch size for conversion. Default is None.	`None`
`batched`	`bool`	Whether to convert in batches. Default is False.	`False`

Returns:

Type	Description
`DataFrame`	The result as a pandas DataFrame.

Raises:

Type	Description
`ValueError`	If the dataset is not provided.

Source code in src/ragas/dataset_schema.py

def to_pandas(self, batch_size: int | None = None, batched: bool = False):
    """
    Return the evaluated samples and their scores side by side as a DataFrame.

    The dataset columns come first, followed by the score columns.

    Parameters
    ----------
    batch_size : int, optional
        The batch size for conversion. Default is None. Not referenced by
        the body of this method.
    batched : bool, optional
        Whether to convert in batches. Default is False. Not referenced by
        the body of this method.

    Returns
    -------
    pandas.DataFrame
        The column-wise concatenation of the dataset and the scores.

    Raises
    ------
    ImportError
        If pandas is not installed.
    ValueError
        If the dataset is not provided.
    """
    try:
        import pandas as pd
    except ImportError:
        message = "pandas is not installed. Please install it to use this function."
        raise ImportError(message)

    if self.dataset is None:
        raise ValueError("dataset is not provided for the results class")
    # Scores and samples are expected to line up one-to-one.
    assert len(self.scores) == len(self.dataset)

    score_frame = pd.DataFrame(self.scores)
    sample_frame = self.dataset.to_pandas()
    return pd.concat([sample_frame, score_frame], axis=1)

total_tokens

total_tokens() -> Union[List[TokenUsage], TokenUsage]

Compute the total tokens used in the evaluation.

Returns:

Type	Description
`list of TokenUsage or TokenUsage`	The total tokens used.

Raises:

Type	Description
`ValueError`	If the cost callback handler is not provided.

Source code in src/ragas/dataset_schema.py

def total_tokens(self) -> t.Union[t.List[TokenUsage], TokenUsage]:
    """
    Report the token usage recorded during the evaluation.

    The value is whatever the cost callback handler's ``total_tokens()``
    returns.

    Returns
    -------
    list of TokenUsage or TokenUsage
        The total tokens used.

    Raises
    ------
    ValueError
        If the cost callback handler is not provided.
    """
    handler = self.cost_cb
    if handler is not None:
        return handler.total_tokens()
    raise ValueError(
        "The evaluate() run was not configured for computing cost. Please provide a token_usage_parser function to evaluate() to compute cost."
    )

total_cost

total_cost(cost_per_input_token: Optional[float] = None, cost_per_output_token: Optional[float] = None, per_model_costs: Dict[str, Tuple[float, float]] = {}) -> float

Compute the total cost of the evaluation.

Parameters:

Name	Type	Description	Default
`cost_per_input_token`	`float`	The cost per input token. Default is None.	`None`
`cost_per_output_token`	`float`	The cost per output token. Default is None.	`None`
`per_model_costs`	`dict of str to tuple of float`	The per model costs. Default is an empty dictionary.	`{}`

Returns:

Type	Description
`float`	The total cost of the evaluation.

Raises:

Type	Description
`ValueError`	If the cost callback handler is not provided.

Source code in src/ragas/dataset_schema.py

def total_cost(
    self,
    cost_per_input_token: t.Optional[float] = None,
    cost_per_output_token: t.Optional[float] = None,
    per_model_costs: t.Dict[str, t.Tuple[float, float]] = {},
) -> float:
    """
    Report the cost of the evaluation.

    The three arguments are forwarded positionally, unchanged, to the cost
    callback handler's ``total_cost``.

    Parameters
    ----------
    cost_per_input_token : float, optional
        The cost per input token. Default is None.
    cost_per_output_token : float, optional
        The cost per output token. Default is None.
    per_model_costs : dict of str to tuple of float, optional
        The per model costs. Default is an empty dictionary.

    Returns
    -------
    float
        The total cost of the evaluation.

    Raises
    ------
    ValueError
        If the cost callback handler is not provided.
    """
    handler = self.cost_cb
    if handler is not None:
        return handler.total_cost(
            cost_per_input_token, cost_per_output_token, per_model_costs
        )
    raise ValueError(
        "The evaluate() run was not configured for computing cost. Please provide a token_usage_parser function to evaluate() to compute cost."
    )

Schemas

BaseSample

to_dict

get_features

SingleTurnSample

MultiTurnSample

validate_user_input classmethod

to_messages

pretty_repr

RagasDataset

validate_samples

get_sample_type

to_hf_dataset

from_hf_dataset classmethod

to_pandas

features

from_list classmethod

from_dict classmethod

from_csv classmethod

to_csv

to_jsonl

from_jsonl classmethod

EvaluationDataset

EvaluationResult dataclass

to_pandas

total_tokens

total_cost

Message

ToolCall

HumanMessage

pretty_repr

ToolMessage

pretty_repr

AIMessage

to_dict

pretty_repr

ragas.evaluation.EvaluationResult dataclass

to_pandas

total_tokens

total_cost

validate_user_input `classmethod`

from_hf_dataset `classmethod`

from_list `classmethod`

from_dict `classmethod`

from_csv `classmethod`

from_jsonl `classmethod`

EvaluationResult `dataclass`

ragas.evaluation.EvaluationResult `dataclass`