Integrations

ragas.integrations.langchain

EvaluatorChain

EvaluatorChain(metric: Metric, **kwargs: Any)

Bases: Chain, RunEvaluator

Wrapper around ragas Metrics to use them with langsmith.

Source code in src/ragas/integrations/langchain.py

def __init__(self, metric: Metric, **kwargs: t.Any):
    kwargs["metric"] = metric
    super().__init__(**kwargs)
    if "run_config" in kwargs:
        run_config = kwargs["run_config"]
    else:
        run_config = RunConfig()
    if isinstance(self.metric, MetricWithLLM):
        llm = get_or_init(kwargs, "llm", ChatOpenAI)
        t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm)
    if isinstance(self.metric, MetricWithEmbeddings):
        embeddings = get_or_init(kwargs, "embeddings", OpenAIEmbeddings)
        t.cast(MetricWithEmbeddings, self.metric).embeddings = (
            LangchainEmbeddingsWrapper(embeddings)
        )
    self.metric.init(run_config)

    assert isinstance(
        self.metric, SingleTurnMetric
    ), "Metric must be SingleTurnMetric"

evaluate_run

evaluate_run(run: Run, example: Optional[Example] = None) -> EvaluationResult

Evaluate a langsmith run

Source code in src/ragas/integrations/langchain.py

def evaluate_run(
    self, run: Run, example: t.Optional[Example] = None
) -> EvaluationResult:
    """
    Evaluate a langsmith run
    """
    self._validate_langsmith_eval(run, example)

    # this is just to suppress the type checker error
    # actual check and error message is in the _validate_langsmith_eval
    assert run.outputs is not None
    assert example is not None
    assert example.inputs is not None
    assert example.outputs is not None

    chain_eval = run.outputs
    chain_eval["question"] = example.inputs["question"]
    if "ground_truth" in get_required_columns_v1(self.metric):
        if example.outputs is None or "ground_truth" not in example.outputs:
            raise ValueError("expected `ground_truth` in example outputs.")
        chain_eval["ground_truth"] = example.outputs["ground_truth"]
    eval_output = self.invoke(chain_eval, include_run_info=True)

    evaluation_result = EvaluationResult(
        key=self.metric.name, score=eval_output[self.metric.name]
    )
    if RUN_KEY in eval_output:
        evaluation_result.evaluator_info[RUN_KEY] = eval_output[RUN_KEY]
    return evaluation_result

ragas.integrations.langsmith

upload_dataset

upload_dataset(dataset: Testset, dataset_name: str, dataset_desc: str = '') -> Dataset

Uploads a new dataset to LangSmith, converting it from a TestDataset object to a pandas DataFrame before upload. If a dataset with the specified name already exists, the function raises an error.

Parameters:

Name	Type	Description	Default
`dataset`	`TestDataset`	The dataset to be uploaded.	required
`dataset_name`	`str`	The name for the new dataset in LangSmith.	required
`dataset_desc`	`str`	A description for the new dataset. The default is an empty string.	`''`

Returns:

Type	Description
`Dataset`	The dataset object as stored in LangSmith after upload.

Raises:

Type	Description
`ValueError`	If a dataset with the specified name already exists in LangSmith.

Notes

The function attempts to read a dataset by the given name to check its existence. If not found, it proceeds to upload the dataset after converting it to a pandas DataFrame. This involves specifying input and output keys for the dataset being uploaded.

Source code in src/ragas/integrations/langsmith.py

def upload_dataset(
    dataset: Testset, dataset_name: str, dataset_desc: str = ""
) -> LangsmithDataset:
    """
    Uploads a new dataset to LangSmith, converting it from a TestDataset object to a
    pandas DataFrame before upload. If a dataset with the specified name already
    exists, the function raises an error.

    Parameters
    ----------
    dataset : TestDataset
        The dataset to be uploaded.
    dataset_name : str
        The name for the new dataset in LangSmith.
    dataset_desc : str, optional
        A description for the new dataset. The default is an empty string.

    Returns
    -------
    LangsmithDataset
        The dataset object as stored in LangSmith after upload.

    Raises
    ------
    ValueError
        If a dataset with the specified name already exists in LangSmith.

    Notes
    -----
    The function attempts to read a dataset by the given name to check its existence.
    If not found, it proceeds to upload the dataset after converting it to a pandas
    DataFrame. This involves specifying input and output keys for the dataset being
    uploaded.
    """
    client = Client()
    try:
        # check if dataset exists
        langsmith_dataset: LangsmithDataset = client.read_dataset(
            dataset_name=dataset_name
        )
        raise ValueError(
            f"Dataset {dataset_name} already exists in langsmith. [{langsmith_dataset}]"
        )
    except LangSmithNotFoundError:
        # if not create a new one with the generated query examples
        langsmith_dataset: LangsmithDataset = client.upload_dataframe(
            df=dataset.to_pandas(),
            name=dataset_name,
            input_keys=["question"],
            output_keys=["ground_truth"],
            description=dataset_desc,
        )

        print(
            f"Created a new dataset '{langsmith_dataset.name}'. Dataset is accessible at {langsmith_dataset.url}"
        )
        return langsmith_dataset

evaluate

evaluate(dataset_name: str, llm_or_chain_factory: Any, experiment_name: Optional[str] = None, metrics: Optional[list] = None, verbose: bool = False) -> Dict[str, Any]

Evaluates a language model or a chain factory on a specified dataset using LangSmith, with the option to customize metrics and verbosity.

Parameters:

Name	Type	Description	Default
`dataset_name`	`str`	The name of the dataset to use for evaluation. This dataset must exist in LangSmith.	required
`llm_or_chain_factory`	`Any`	The language model or chain factory to be evaluated. This parameter is flexible and can accept a variety of objects depending on the implementation.	required
`experiment_name`	`Optional[str]`	The name of the experiment. This can be used to categorize or identify the evaluation run within LangSmith. The default is None.	`None`
`metrics`	`Optional[list]`	A list of custom metrics (functions or evaluators) to be used for the evaluation. If None, a default set of metrics (answer relevancy, context precision, context recall, and faithfulness) are used. The default is None.	`None`
`verbose`	`bool`	If True, detailed progress and results will be printed during the evaluation process. The default is False.	`False`

Returns:

Type	Description
`Dict[str, Any]`	A dictionary containing the results of the evaluation.

Raises:

Type	Description
`ValueError`	If the specified dataset does not exist in LangSmith.

See Also

Client.read_dataset : Method to read an existing dataset. Client.run_on_dataset : Method to run the evaluation on the specified dataset.

Examples:

>>> results = evaluate(
...     dataset_name="MyDataset",
...     llm_or_chain_factory=my_llm,
...     experiment_name="experiment_1_with_vanila_rag",
...     verbose=True
... )
>>> print(results)
{'evaluation_result': ...}

Notes

The function initializes a client to interact with LangSmith, validates the existence of the specified dataset, prepares evaluation metrics, and runs the evaluation, returning the results. Custom evaluation metrics can be specified, or a default set will be used if none are provided.

Source code in src/ragas/integrations/langsmith.py

def evaluate(
    dataset_name: str,
    llm_or_chain_factory: t.Any,
    experiment_name: t.Optional[str] = None,
    metrics: t.Optional[list] = None,
    verbose: bool = False,
) -> t.Dict[str, t.Any]:
    """
    Evaluates a language model or a chain factory on a specified dataset using
    LangSmith, with the option to customize metrics and verbosity.

    Parameters
    ----------
    dataset_name : str
        The name of the dataset to use for evaluation. This dataset must exist in
        LangSmith.
    llm_or_chain_factory : Any
        The language model or chain factory to be evaluated. This parameter is
        flexible and can accept a variety of objects depending on the implementation.
    experiment_name : Optional[str], optional
        The name of the experiment. This can be used to categorize or identify the
        evaluation run within LangSmith. The default is None.
    metrics : Optional[list], optional
        A list of custom metrics (functions or evaluators) to be used for the
        evaluation. If None, a default set of metrics (answer relevancy, context
        precision, context recall, and faithfulness) are used.
        The default is None.
    verbose : bool, optional
        If True, detailed progress and results will be printed during the evaluation
        process.
        The default is False.

    Returns
    -------
    Dict[str, Any]
        A dictionary containing the results of the evaluation.

    Raises
    ------
    ValueError
        If the specified dataset does not exist in LangSmith.

    See Also
    --------
    Client.read_dataset : Method to read an existing dataset.
    Client.run_on_dataset : Method to run the evaluation on the specified dataset.

    Examples
    --------
    >>> results = evaluate(
    ...     dataset_name="MyDataset",
    ...     llm_or_chain_factory=my_llm,
    ...     experiment_name="experiment_1_with_vanila_rag",
    ...     verbose=True
    ... )
    >>> print(results)
    {'evaluation_result': ...}

    Notes
    -----
    The function initializes a client to interact with LangSmith, validates the existence
    of the specified dataset, prepares evaluation metrics, and runs the evaluation,
    returning the results. Custom evaluation metrics can be specified, or a default set
    will be used if none are provided.
    """
    # init client and validate dataset
    client = Client()
    try:
        _ = client.read_dataset(dataset_name=dataset_name)
    except LangSmithNotFoundError:
        raise ValueError(
            f"Dataset {dataset_name} not found in langsmith, make sure it exists in langsmith"
        )

    # make config
    if metrics is None:
        from ragas.metrics import (
            answer_relevancy,
            context_precision,
            context_recall,
            faithfulness,
        )

        metrics = [answer_relevancy, context_precision, faithfulness, context_recall]

    metrics = [EvaluatorChain(m) for m in metrics]
    eval_config = RunEvalConfig(
        custom_evaluators=metrics,
    )

    # run evaluation with langsmith
    run = client.run_on_dataset(
        dataset_name=dataset_name,
        llm_or_chain_factory=llm_or_chain_factory,
        evaluation=eval_config,
        verbose=verbose,
        # Any experiment metadata can be specified here
        project_name=experiment_name,
    )

    return run

ragas.integrations.llama_index

ragas.integrations.opik

OpikTracer

Bases: OpikTracer

Callback for Opik that can be used to log traces and evaluation scores to the Opik platform.

Attributes:

Name	Type	Description
`tags`	`list[string]`	The tags to set on each trace.
`metadata`	`dict`	Additional metadata to log for each trace.