Skip to content

Llama CPP Backend

check_token_metric_compatibility(sampler: Any, token_metric: str) -> None

Check that the llama.cpp engine can support the given token metric with the given configuration.

Parameters:

Name Type Description Default
sampler Any

The sampler object containing sampling parameters and the LLM engine.

required
token_metric str

The token metric to check compatibility for.

required

Raises:

Type Description
ValueError

If the configuration doesn't support the requested token metric.

Source code in pita/inference/llama_cpp_backend.py
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
def check_token_metric_compatibility(sampler: Any, token_metric: str) -> None:
    """
    Check that the llama.cpp engine can support the given token metric with the given configuration.

    As a side effect, enables the sampling-param flag the metric needs
    (`enable_normalization_constants` or `enable_entropy`) on
    ``sampler.sampling_params``.

    Args:
        sampler: The sampler object containing sampling parameters and the LLM engine.
        token_metric: The token metric to check compatibility for.

    Raises:
        ValueError: If the configuration doesn't support the requested token metric,
            or if the token metric is unknown.
    """
    # Each supported metric maps to the sampling-param flag it requires and a
    # human-readable description used in the confirmation message.
    metric_flags = {
        "logprobs": ("enable_normalization_constants", "normalization constants"),
        "power_distribution": ("enable_normalization_constants", "normalization constants"),
        "entropy": ("enable_entropy", "entropy calculation"),
        "likelihood_confidence": ("enable_normalization_constants", "normalization constants"),
    }

    if token_metric not in metric_flags:
        raise ValueError(f"Unknown token metric: {token_metric}")

    # Every supported metric needs per-token logits from the engine.
    if sampler.sampling_params.logits_per_token is None or sampler.sampling_params.logits_per_token < 1:
        raise ValueError(
            f"logits_per_token must be set to at least 1 to use '{token_metric}' token metric with llama.cpp backend."
        )

    flag, description = metric_flags[token_metric]
    setattr(sampler.sampling_params, flag, True)
    print(f"Enabled {description} in sampling params for {token_metric} metric.")

create_LLM_object(model_name: str, model_type: str | None = None, dtype: str = 'auto', gpu_memory_utilization: float = 0.85, max_model_len: int = 2048, max_logprobs: int | None = None, logits_processor: bool = False, **kwargs: Any) -> Llama

Create the LLM object given the model name and engine parameters.

Parameters:

Name Type Description Default
model_name str

The name of the model to load (Hugging Face repo ID for GGUF models).

required
model_type str

The type of model. Inferred from model_name if not provided. Currently only 'gguf' is supported.

None
dtype str

The data type/quantization to use. Defaults to "auto" (f16).

'auto'
gpu_memory_utilization float

The fraction of GPU memory to use. Defaults to 0.85.

0.85
max_model_len int

The maximum context length. Defaults to 2048.

2048
max_logprobs int

Unused for llama.cpp, kept for API compatibility.

None
logits_processor bool

Whether logits processing is enabled. When True, scores are available via llm.scores. Defaults to False.

False
**kwargs Any

Additional keyword arguments passed to the Llama constructor.

{}

Returns:

Name Type Description
Llama Llama

The initialized llama.cpp Llama object.

Raises:

Type Description
ValueError

If model_type is 'safetensors' (not currently supported) or an unrecognized model type.

Source code in pita/inference/llama_cpp_backend.py
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
def create_LLM_object(
        model_name: str,
        model_type: str | None = None,
        dtype: str = "auto",
        gpu_memory_utilization: float = 0.85,
        max_model_len: int = 2048,
        max_logprobs: int | None = None,
        logits_processor: bool = False,
        **kwargs: Any
    ) -> Llama:
    """
    Create the LLM object given the model name and engine parameters.

    Args:
        model_name (str): The name of the model to load (Hugging Face repo ID for GGUF models).
        model_type (str, optional): The type of model. Inferred from model_name if not provided.
            Currently only 'gguf' is supported.
        dtype (str, optional): The data type/quantization to use. Defaults to "auto" (f16).
        gpu_memory_utilization (float, optional): The fraction of GPU memory to use. Defaults to 0.85.
        max_model_len (int, optional): The maximum context length. Defaults to 2048.
        max_logprobs (int, optional): Unused for llama.cpp, kept for API compatibility.
        logits_processor (bool, optional): Whether logits processing is enabled.
            When True, scores are available via llm.scores. Defaults to False.
        **kwargs: Additional keyword arguments passed to the Llama constructor.

    Returns:
        Llama: The initialized llama.cpp Llama object.

    Raises:
        ValueError: If model_type is 'safetensors' (not supported) or unsupported,
            or if the loaded model exceeds the gpu_memory_utilization threshold.
    """
    # Infer model_type from model_name if not provided. llama.cpp only loads
    # GGUF weights, so we default to 'gguf' either way; the warning is emitted
    # only when the model name gives no hint that it is a GGUF repo.
    if model_type is None:
        if 'gguf' not in model_name.lower():
            print("Warning: model_type not specified, defaulting to 'gguf' for llama.cpp backend.")
        model_type = "gguf"

    if model_type == "gguf":
        # Select the GGUF file inside the Hugging Face repo by quantization/dtype.
        kwargs['filename'] = "*f16*" if dtype == "auto" else f"*{dtype}*"
    elif model_type == "safetensors":
        raise ValueError("safetensors model type is not currently supported in llama.cpp backend. Please use gguf model type.")
    else:
        raise ValueError(f"{model_type} is an unsupported model type. Supported types are 'gguf'.")

    # Offload every layer to the GPU by default when the caller allows GPU memory use
    # and has not pinned n_gpu_layers explicitly.
    if gpu_memory_utilization > 0 and 'n_gpu_layers' not in kwargs:
        kwargs['n_gpu_layers'] = -1  # Use as many GPU layers as possible

    # Snapshot VRAM before loading so the model's own footprint can be measured.
    total_vram_mb = get_total_vram()
    vram_before = get_gpu_vram_usage_mb() or 0

    # With a logits processor attached, llama.cpp must retain scores for every
    # token so they can be read back via llm.scores.
    logits_all = logits_processor

    # Initialize LLaMA.cpp locally
    llm = Llama.from_pretrained(
        repo_id=model_name,
        n_ctx=max_model_len,
        logits_all=logits_all,
        **kwargs
    )

    # Measure the VRAM consumed by the model load.
    vram_after = get_gpu_vram_usage_mb() or 0
    vram_mb = vram_after - vram_before

    if vram_mb < 1:
        print("Warning: Could not extract VRAM usage from llama.cpp logs. Model may be loaded into CPU RAM. Proceeding without VRAM check.")
    else:
        try:
            total_vram_int = int(total_vram_mb)
            vram_mb_int = int(vram_mb)
        except (ValueError, TypeError):
            print(f"Warning: Could not extract total VRAM value ('{total_vram_mb}'). Skipping VRAM utilization check.")
        else:
            if total_vram_int <= 0:
                # Guard against division by zero when total VRAM is reported as 0.
                print(f"Warning: Could not extract total VRAM value ('{total_vram_mb}'). Skipping VRAM utilization check.")
            elif vram_mb_int / total_vram_int > gpu_memory_utilization:
                raise ValueError(
                    "VRAM usage exceeds the specified GPU memory utilization threshold.\n"
                    "Options to Reduce VRAM:\n"
                    "1. Reduce the context size (n_ctx parameter)\n"
                    "2. Turn off GPU KV-caching with kwarg: offload_kqv = True\n"
                    "3. Load only 'N' layers to the GPU kwarg: n_gpu_layers = N\n"
                )
            else:
                print(f"VRAM Usage for Model Load: {vram_mb_int} MiB / {total_vram_int} MiB ({(vram_mb_int/total_vram_int)*100:.2f} %)")

    print("--- Model Initialization Complete. ---")

    # Return created LLM object
    return llm

sample(self, context: str | list[str], **kwargs: Any) -> Output

Generate text from the given context using the llama.cpp backend.

Parameters:

Name Type Description Default
context str | list[str]

The input context string to generate from.

required
**kwargs Any

Additional keyword arguments passed to the underlying llama.cpp generation function.

{}

Returns:

Name Type Description
Output Output

An Output object containing: - tokens: The generated token IDs. - top_k_logits: The top_k logits (if logits_per_token is set). - top_k_logprobs: The top_k logprobs (if logprobs_per_token is set). - unprocessed_log_normalization_constant: The log normalization constants for each token. - temp_processed_log_normalization_constant: The temperature-scaled log normalization constants. - entropy: The entropy for each token.

Source code in pita/inference/llama_cpp_backend.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def sample(
        self,
        context: str | list[str],
        **kwargs: Any
    ) -> Output:
    """
    Generate text from the given context using the llama.cpp backend.

    Contexts are processed sequentially (llama.cpp has no true batching here);
    the LLM's KV state and the logits processor are reset before each one.

    Args:
        context (str | list[str]): The input context string to generate from.
        **kwargs: Additional keyword arguments passed to the underlying llama.cpp generation function.

    Returns:
        Output: An Output object containing:
            - tokens: The generated token IDs.
            - top_k_logits: The top_k logits (if logits_per_token is set).
            - top_k_logprobs: The top_k logprobs (if logprobs_per_token is set).
            - unprocessed_log_normalization_constant: The log normalization constants for each token.
            - temp_processed_log_normalization_constant: The temperature-scaled log normalization constants.
            - entropy: The entropy for each token.

        For a single context, a single Output is returned; for a list of
        contexts, each Output field holds one list per context.
    """
    # Determine if we need normalization constants or entropy.
    # getattr defaults to False so older sampling_params objects without
    # these attributes still work.
    calculate_normalization = getattr(self.sampling_params, 'enable_normalization_constants', False)
    calculate_entropy = getattr(self.sampling_params, 'enable_entropy', False)

    # Create a fresh logits processor for this sample call
    logits_processor_list, logits_processor = create_logits_processor_list(
        temperature=self.sampling_params.temperature,
        calculate_normalization=calculate_normalization,
        calculate_entropy=calculate_entropy
    )

    # Check if context is a list of strings or a single string
    if isinstance(context, list):
        context_list_len = len(context)
    else:
        context_list_len = 1
        context = [context]  # Normalize to list for uniform handling

    # For batch processing, we'd need to handle multiple contexts
    # Currently llama.cpp doesn't support true batching, so we process sequentially
    all_outputs = []

    for context_input in context:
        # Reset the LLM state for a fresh start with each context
        self.llm.reset()
        logits_processor.reset()

        # Use generate() to extract the token_ids instead of create_completion
        if isinstance(context_input, str):
            prompt_tokens = self.llm.tokenize(context_input.encode('utf-8'))
        else:
            # NOTE(review): non-str entries are assumed to already be token-id
            # sequences — confirm against callers.
            prompt_tokens = context_input

        tokens = []
        top_k_logits = []
        logits_per_token = self.sampling_params.logits_per_token or 0

        # Prime the model with the prompt so generate() continues from it.
        if len(prompt_tokens) > 0:
            self.llm.eval(prompt_tokens)

        # Generation loop. reset=False is required so generate() does not wipe
        # the prompt state we just evaluated above.
        generator = self.llm.generate(
            [],
            top_k=self.sampling_params.top_k,
            top_p=self.sampling_params.top_p,
            min_p=self.sampling_params.min_p,
            temp=self.sampling_params.temperature,
            repeat_penalty=self.sampling_params.repetition_penalty,
            frequency_penalty=self.sampling_params.frequency_penalty,
            presence_penalty=self.sampling_params.presence_penalty,
            logits_processor=logits_processor_list,
            reset=False,
            **kwargs
        )

        for token in generator:
            # For each token, self.llm.scores[self.llm.n_tokens - 1] contains the logits
            # that were used to sample it.
            current_logits = self.llm.scores[self.llm.n_tokens - 1, :]

            if logits_per_token > 0:
                # Extract logits for the current step.
                # We always place the chosen token's logit first, then fill with the
                # highest remaining logits until we reach logits_per_token elements
                # or run out of logits.

                # Use argpartition to find top logits efficiently (O(N) instead of O(N log N))
                if len(current_logits) > logits_per_token:
                    # Get indices of top logits_per_token elements
                    # We might need logits_per_token elements to fill the list if the chosen token isn't in top K
                    top_indices = np.argpartition(current_logits, -logits_per_token)[-logits_per_token:]
                    # Sort only these top elements
                    sorted_short_indices = np.argsort(current_logits[top_indices])[::-1]
                    sorted_indices = top_indices[sorted_short_indices]
                else:
                    sorted_indices = np.argsort(current_logits)[::-1]

                # Ensure the chosen token logit is first as requested
                step_logits = [float(current_logits[token])]

                # Fill the remaining slots with the top logits, skipping the
                # chosen token so it is not duplicated.
                for idx in sorted_indices:
                    if idx == token:
                        continue
                    if len(step_logits) >= logits_per_token:
                        break
                    step_logits.append(float(current_logits[idx]))
                top_k_logits.append(step_logits)

            tokens.append(int(token))

            # Check stopping criteria: max length, EOS, then custom stop tokens.
            if len(tokens) >= self.sampling_params.max_tokens:
                break
            if token == self.llm.token_eos():
                break
            if self.sampling_params.stop_token_ids and token in self.sampling_params.stop_token_ids:
                break

        # Find the token count from the token_ids
        token_count = len(tokens)

        # We only trim data from the logits processor as it is the only source that is guaranteed to have the wrong length
        unprocessed_log_normalization_constant = logits_processor.log_norm_constants[:token_count]
        temp_processed_log_normalization_constant = logits_processor.log_norm_constants_temp_scaled[:token_count]
        entropy = logits_processor.entropy[:token_count]

        # Use the temp_processed_log_normalization_constant to calculate the logprobs
        top_k_logprobs = []
        logprobs_per_token = self.sampling_params.logprobs_per_token or 0
        if logprobs_per_token > 0 and top_k_logits:
            for i in range(token_count):
                logits_row = np.array(top_k_logits[i])
                temp_norm = temp_processed_log_normalization_constant[i]
                # logprob = (logit / temp) - logsumexp(logits / temp)
                # NOTE(review): divides by temperature — assumes temperature != 0
                # when logprobs are requested; confirm upstream validation.
                row_logprobs = (logits_row / self.sampling_params.temperature) - temp_norm
                # Slice to the requested logprobs amount
                top_k_logprobs.append(row_logprobs[:logprobs_per_token].tolist())
        else:
            # NOTE(review): [[]] * n shares one empty-list object across all
            # slots — harmless here only because the lists are never mutated.
            top_k_logprobs = [[]] * token_count

        if not top_k_logits:
            top_k_logits = [[]] * token_count

        output = Output(
            tokens=tokens,
            top_k_logits=top_k_logits,
            top_k_logprobs=top_k_logprobs,
            unprocessed_log_normalization_constant=unprocessed_log_normalization_constant,
            temp_processed_log_normalization_constant=temp_processed_log_normalization_constant,
            entropy=entropy
        )
        all_outputs.append(output)

    # If only one context was provided, return single Output
    if context_list_len == 1:
        return all_outputs[0]

    # For multiple contexts, combine into a single Output with lists of lists
    # This matches the vLLM batch behavior
    combined = Output(
        tokens=[o.tokens for o in all_outputs],
        top_k_logits=[o.top_k_logits for o in all_outputs],
        top_k_logprobs=[o.top_k_logprobs for o in all_outputs],
        unprocessed_log_normalization_constant=[o.unprocessed_log_normalization_constant for o in all_outputs],
        temp_processed_log_normalization_constant=[o.temp_processed_log_normalization_constant for o in all_outputs],
        entropy=[o.entropy for o in all_outputs]
    )
    return combined