# Response Quality
Measure LLM response quality across multiple dimensions for comprehensive evaluation.
## Problem
A single quality score doesn't capture all aspects of response quality:
- Is the answer accurate and correct?
- Is it helpful for the user's actual need?
- Is it clear and well-structured?
- Is it concise or unnecessarily verbose?
- Is the tone appropriate?
## Solution

```python
from brokle import Brokle
from brokle.evaluation import LLMJudge, evaluate
from dataclasses import dataclass
from typing import Optional
brokle = Brokle()
@dataclass
class QualityScores:
accuracy: float
helpfulness: float
clarity: float
conciseness: float
tone: float
overall: float
details: dict
# Define dimension-specific judges
judges = {
"accuracy": LLMJudge(
name="accuracy",
prompt="""Rate the factual accuracy of this response.
Question: {input}
Response: {output}
Reference (if available): {reference}
Consider:
- Are facts correct?
- Are any claims verifiable?
- Are there any errors or misconceptions?
Rate 0-1 where 1 is completely accurate.""",
model="gpt-4o"
),
"helpfulness": LLMJudge(
name="helpfulness",
prompt="""Rate how helpful this response is for the user's need.
User Query: {input}
Response: {output}
Consider:
- Does it address the actual question?
- Does it provide actionable information?
- Would the user get what they need?
Rate 0-1 where 1 is extremely helpful.""",
model="gpt-4o-mini"
),
"clarity": LLMJudge(
name="clarity",
prompt="""Rate the clarity and structure of this response.
Response: {output}
Consider:
- Is it well-organized?
- Is the language clear and unambiguous?
- Is it easy to follow?
Rate 0-1 where 1 is crystal clear.""",
model="gpt-4o-mini"
),
"conciseness": LLMJudge(
name="conciseness",
prompt="""Rate the conciseness of this response.
Query: {input}
Response: {output}
Consider:
- Is it appropriately detailed for the question?
- Is there unnecessary repetition?
- Could it be shorter without losing value?
Rate 0-1 where 1 is optimally concise (not too short, not too long).""",
model="gpt-4o-mini"
),
"tone": LLMJudge(
name="tone",
prompt="""Rate the appropriateness of the tone.
Context: {context}
Response: {output}
Consider:
- Is the tone professional and appropriate?
- Does it match the expected context?
- Is it respectful and neutral?
Rate 0-1 where 1 is perfectly appropriate tone.""",
model="gpt-4o-mini"
)
}
def evaluate_quality(
query: str,
response: str,
    reference: Optional[str] = None,
    context: str = "general assistant",
    weights: Optional[dict] = None,
) -> QualityScores:
"""Evaluate response quality across multiple dimensions."""
# Default weights
if weights is None:
weights = {
"accuracy": 0.3,
"helpfulness": 0.3,
"clarity": 0.2,
"conciseness": 0.1,
"tone": 0.1
}
with brokle.start_as_current_span(
name="quality_evaluation",
metadata={"dimensions": list(weights.keys())}
) as span:
scores = {}
details = {}
# Evaluate each dimension
for dimension, judge in judges.items():
result = judge.evaluate(
input=query,
output=response,
reference=reference or "Not provided",
context=context
)
scores[dimension] = result.score
details[dimension] = result.comment
# Record individual scores
span.score(
name=dimension,
value=result.score,
comment=result.comment[:200] if result.comment else None
)
# Calculate weighted overall score
overall = sum(
scores[dim] * weights[dim]
for dim in weights
)
span.score(name="overall_quality", value=overall)
span.update(
output={
"overall": overall,
"dimensions": scores
},
metadata={"overall_quality": overall}
)
return QualityScores(
accuracy=scores["accuracy"],
helpfulness=scores["helpfulness"],
clarity=scores["clarity"],
conciseness=scores["conciseness"],
tone=scores["tone"],
overall=overall,
details=details
)
# Usage
query = "How do I reset my password?"
response = """
To reset your password, follow these steps:
1. Go to the login page
2. Click "Forgot Password"
3. Enter your email address
4. Check your inbox for a reset link
5. Click the link and create a new password
Make sure your new password is at least 8 characters with a mix of letters and numbers.
"""
quality = evaluate_quality(query, response)
print(f"Overall Quality: {quality.overall:.0%}")
print(f" Accuracy: {quality.accuracy:.0%}")
print(f" Helpfulness: {quality.helpfulness:.0%}")
print(f" Clarity: {quality.clarity:.0%}")
print(f" Conciseness: {quality.conciseness:.0%}")
print(f" Tone: {quality.tone:.0%}")
brokle.flush()
```

```typescript
import { Brokle, LLMJudge } from 'brokle';
const brokle = new Brokle();
const judges = {
accuracy: new LLMJudge({
name: 'accuracy',
prompt: `Rate the factual accuracy. Question: {input} Response: {output}
Rate 0-1 where 1 is completely accurate.`,
model: 'gpt-4o',
}),
helpfulness: new LLMJudge({
name: 'helpfulness',
prompt: `Rate how helpful this is. Query: {input} Response: {output}
Rate 0-1 where 1 is extremely helpful.`,
model: 'gpt-4o-mini',
}),
clarity: new LLMJudge({
name: 'clarity',
prompt: `Rate the clarity. Response: {output}
Rate 0-1 where 1 is crystal clear.`,
model: 'gpt-4o-mini',
}),
};
async function evaluateQuality(query: string, response: string, weights: Record<string, number> | null = null) {
weights = weights || { accuracy: 0.4, helpfulness: 0.4, clarity: 0.2 };
return brokle.withSpan({ name: 'quality_evaluation' }, async (span) => {
const scores = {};
for (const [dimension, judge] of Object.entries(judges)) {
const result = await judge.evaluate({ input: query, output: response });
scores[dimension] = result.score;
span.score({ name: dimension, value: result.score });
}
const overall = Object.entries(weights).reduce(
(sum, [dim, weight]) => sum + (scores[dim] || 0) * weight,
0
);
span.score({ name: 'overall_quality', value: overall });
return { ...scores, overall };
});
}
```

## Quality Dimensions
| Dimension | What It Measures | Weight (Default) |
|---|---|---|
| Accuracy | Factual correctness | 30% |
| Helpfulness | Addresses user need | 30% |
| Clarity | Clear and well-organized | 20% |
| Conciseness | Appropriate length | 10% |
| Tone | Professional and appropriate | 10% |
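The default weights can be overridden per call. For example, a fact-heavy assistant might weight accuracy more aggressively; the values below are illustrative, not recommendations:

```python
# Illustrative weights for an accuracy-critical use case. Cover the dimensions
# you want counted and keep the sum at 1.0 so the overall score stays in [0, 1].
factual_weights = {
    "accuracy": 0.5,
    "helpfulness": 0.25,
    "clarity": 0.15,
    "conciseness": 0.05,
    "tone": 0.05,
}

quality = evaluate_quality(query, response, weights=factual_weights)
print(f"Accuracy-weighted overall: {quality.overall:.0%}")
```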
## Variations
### Domain-Specific Quality

```python
def evaluate_customer_support_quality(
query: str,
response: str,
ticket_type: str
) -> dict:
"""Quality evaluation tailored for customer support."""
support_judges = {
"resolution_potential": LLMJudge(
name="resolution",
prompt="""Rate if this response would likely resolve the customer's issue.
Customer Query: {input}
Agent Response: {output}
Ticket Type: {ticket_type}
Rate 0-1 where 1 means the issue would likely be resolved.""",
model="gpt-4o"
),
"empathy": LLMJudge(
name="empathy",
prompt="""Rate the empathy shown in this response.
Customer Query: {input}
Agent Response: {output}
Consider:
- Acknowledgment of the issue
- Understanding of customer frustration
- Supportive language
Rate 0-1.""",
model="gpt-4o-mini"
),
"action_clarity": LLMJudge(
name="action_clarity",
prompt="""Rate how clear the next steps are.
Response: {output}
Consider:
- Are next steps clearly stated?
- Does the customer know what to do?
- Are expectations set?
Rate 0-1.""",
model="gpt-4o-mini"
)
}
with brokle.start_as_current_span(
name="support_quality",
metadata={"ticket_type": ticket_type}
) as span:
scores = {}
for name, judge in support_judges.items():
result = judge.evaluate(
input=query,
output=response,
ticket_type=ticket_type
)
scores[name] = result.score
span.score(name=name, value=result.score)
        return scores
```

### Comparative Quality

```python
def compare_responses(
query: str,
response_a: str,
response_b: str
) -> dict:
"""Compare quality of two responses side-by-side."""
comparison_judge = LLMJudge(
name="comparison",
prompt="""Compare these two responses to the same query.
Query: {input}
Response A:
{response_a}
Response B:
{response_b}
For each dimension, indicate which is better (A, B, or TIE):
1. Accuracy
2. Helpfulness
3. Clarity
4. Conciseness
5. Overall
Provide brief explanation for each.""",
model="gpt-4o"
)
result = comparison_judge.evaluate(
input=query,
response_a=response_a,
response_b=response_b
)
return {
"comparison": result.comment,
"raw_score": result.score # Can encode preference
    }
```

### Quality Trend Analysis

```python
def analyze_quality_trend(
trace_ids: list[str],
dimension: str = "overall_quality"
) -> dict:
"""Analyze quality trends over time."""
scores = []
for trace_id in trace_ids:
trace = brokle.traces.get(trace_id)
score = next(
(s for s in trace.scores if s.name == dimension),
None
)
if score:
scores.append({
"timestamp": trace.timestamp,
"score": score.value
})
# Calculate statistics
if not scores:
return {"error": "No scores found"}
values = [s["score"] for s in scores]
return {
"count": len(values),
"mean": sum(values) / len(values),
"min": min(values),
"max": max(values),
"trend": "improving" if values[-1] > values[0] else "declining",
"scores": scores
    }
```

### Batch Quality Evaluation

```python
def batch_evaluate_quality(
samples: list[dict], # [{"query": ..., "response": ...}, ...]
sample_rate: float = 0.1
) -> dict:
"""Evaluate quality on a sample of responses."""
import random
# Sample responses
sample_size = max(1, int(len(samples) * sample_rate))
sampled = random.sample(samples, sample_size)
with brokle.start_as_current_span(
name="batch_quality_evaluation",
metadata={
"total_samples": len(samples),
"evaluated_samples": sample_size,
"sample_rate": sample_rate
}
) as span:
all_scores = []
for sample in sampled:
quality = evaluate_quality(
query=sample["query"],
response=sample["response"]
)
all_scores.append(quality)
# Aggregate
avg_overall = sum(q.overall for q in all_scores) / len(all_scores)
avg_accuracy = sum(q.accuracy for q in all_scores) / len(all_scores)
avg_helpfulness = sum(q.helpfulness for q in all_scores) / len(all_scores)
summary = {
"samples_evaluated": len(all_scores),
"avg_overall": avg_overall,
"avg_accuracy": avg_accuracy,
"avg_helpfulness": avg_helpfulness,
"quality_distribution": {
"excellent": sum(1 for q in all_scores if q.overall >= 0.9),
"good": sum(1 for q in all_scores if 0.7 <= q.overall < 0.9),
"fair": sum(1 for q in all_scores if 0.5 <= q.overall < 0.7),
"poor": sum(1 for q in all_scores if q.overall < 0.5)
}
}
span.update(output=summary)
span.score(name="batch_avg_quality", value=avg_overall)
        return summary
```

Use cheaper models (gpt-4o-mini) for subjective dimensions like tone, and stronger models (gpt-4o) for accuracy and fact-checking.
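As a usage sketch, point the batch evaluator at a list of logged query/response pairs; the `samples` data below is made up for illustration:

```python
# Hypothetical logged interactions; in practice, pull these from your traces or logs.
samples = [
    {"query": "How do I reset my password?", "response": "Click 'Forgot Password' on the login page."},
    {"query": "Which plans do you offer?", "response": "We offer Free, Pro, and Enterprise plans."},
]

summary = batch_evaluate_quality(samples, sample_rate=0.5)
print(f"Average quality: {summary['avg_overall']:.0%}")
print(f"Poor responses: {summary['quality_distribution']['poor']}")
```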
## Best Practices
- Weight by Importance: Adjust weights based on your use case
- Use Reference Answers: When available, pass a reference answer so the accuracy judge has ground truth to compare against (see the sketch below)
- Sample for Scale: Evaluate a sample rather than all responses at scale
- Track Over Time: Monitor quality trends to catch regressions early
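For the reference-answer practice above, a minimal sketch using the `evaluate_quality` helper from the Solution section (the golden answer here is hypothetical):

```python
# Hypothetical golden answer, e.g. from a QA dataset or knowledge base.
golden_answer = "Use the 'Forgot Password' link on the login page to request a reset email."

quality = evaluate_quality(
    query="How do I reset my password?",
    response=response,  # the generated response under evaluation
    reference=golden_answer,
)
print(f"Accuracy vs. reference: {quality.accuracy:.0%}")
```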
## Related
- Built-in Evaluators - Pre-built quality evaluators
- LLM as Judge - Creating custom judges
- Datasets - Batch evaluation with datasets