Skip to content

LLM Backend

AutoregressiveSampler

Stores parameters concerning the LLM, autoregressive sampling, and power sampling.

Attributes:

Name Type Description
engine str

The engine used for sampling.

model str

The LLM Model name.

llm object

LLM object from engine used for inference/sampling.

tokenizer object

Tokenizer to use for encoding/decoding (HuggingFace AutoTokenizer).

sample_fn object

Standard Sampling Function to use for sampling from the autoregressive model without test time scaling.

sampling_params object

Parameters to use for standard sampling.

chain_sampling object

Chain Sampling Object used for chain-level test-time scaling (e.g., Best-of-N, SMC).

token_sampling object

Token Sampling Object used for token-level test-time scaling (e.g., Metropolis-Hastings sampling).

chain_sample_fn object

The chain sampling function to use for chain level test time scaling.

token_sample_fn object

The token sampling function to use for token level test time scaling.

Source code in pita/inference/LLM_backend.py
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
class AutoregressiveSampler:
    """Holds the loaded LLM, its tokenizer, and the sampling configuration.

    Attributes:
        engine (str): The engine used for sampling.
        model (str): The LLM model name.
        llm (object): LLM object created by the engine backend and used for inference/sampling.
        tokenizer (object): Tokenizer used for encoding/decoding (HuggingFace AutoTokenizer).
        sample_fn (object): The backend's standard sampling function (no test-time scaling).
        sampling_params (object): Parameters to use for standard sampling.
        chain_sampling (object): Chain-level test-time scaling object (e.g. SMC). None until enabled.
        token_sampling (object): Token-level test-time scaling object (e.g. Power Sampling). None until enabled.
        chain_sample_fn (object): Sampling function of ``chain_sampling``. None until enabled.
        token_sample_fn (object): Sampling function of ``token_sampling``. None until enabled.
        chain_sample_name (str): Name of the enabled chain strategy ("SMC" is set by ``enable_smc``). None until enabled.
        token_sample_name (str): Name of the enabled token strategy ("Power Sampling"). None until enabled.
    """

    def __init__(
        self,
        engine: str,
        model: str,
        dtype: str,
        tokenizer_path: str,
        gpu_memory_utilization: float,
        max_model_len: int,
        max_probs: int,
        logits_processor: bool,
        trust_remote_code: bool,
        sampling_params: Sampling_Params,
        **kwargs: Any
    ) -> None:
        """Load the model with the chosen engine and set up standard sampling.

        Args:
            engine (str): Engine to use for autoregressive sampling. Supported values are "vllm", "llama_cpp" and "tensorrt".
            model (str): Model to load.
            dtype (str): Data type to use when loading the model. "auto" lets the engine decide.
            tokenizer_path (str): Path to a model with a tokenizer. If None, the tokenizer is loaded from ``model``.
            gpu_memory_utilization (float): GPU memory utilization to use.
            max_model_len (int): Max model context length (context window = prompt + generated tokens).
            max_probs (int): Number of top ranked probabilities (logits & logprobs) to store per output token.
            logits_processor (bool): Whether to enable the internal logits processor that allows for normalization constants and entropy to be calculated. With the "vllm" engine it is switched on automatically when ``max_probs > 0``.
            trust_remote_code (bool): Whether to trust remote code when loading the tokenizer.
            sampling_params (Sampling_Params): General sampling parameters to use. If None, a default Sampling_Params is built from the engine, its engine params, and ``max_probs``.
            **kwargs: Additional keyword arguments passed to the backend LLM creation function. For "llama_cpp", an optional ``model_type`` entry is extracted and passed explicitly.

        Raises:
            ValueError: If the engine is not supported.
        """
        self.engine = engine
        self.model = model

        print(f"Loading model {model} with {engine}...")

        # Each engine has its own backend module and its own keyword names.
        if engine == "vllm":
            backend = _get_vllm_backend()

            # Any falsy logits_processor (False, None, 0) is treated as "disabled" here.
            if max_probs > 0 and not logits_processor:
                print("max_probs is set but logits_processor is False. Setting logits_processor to True.")
                logits_processor = True

            self.llm = backend.create_LLM_object(
                model_name=model,
                dtype=dtype,
                gpu_memory_utilization=gpu_memory_utilization,
                max_model_len=max_model_len,
                max_probs=max_probs,
                logits_processor=logits_processor,
                **kwargs
            )

            # Engine parameters used for the completion function in vLLM
            engine_params = backend.create_vllm_engine_params()

            if logits_processor:
                # Tag the engine params with a time-based request id.
                # NOTE(review): the previous comment mentioned a redis client and
                # normalization-constant lists; only req_id is set here - confirm
                # what the logits processor actually expects.
                print("Enabling logits processing in engine parameters extra_args.")
                engine_params.extra_args = {"req_id": "my_request_" + str(time.time())}

        elif engine == "llama_cpp":
            backend = _get_llama_cpp_backend()
            # Pull model_type out of kwargs so it is not passed twice; None lets the backend infer it
            llama_model_type = kwargs.pop('model_type', None)
            self.llm = backend.create_LLM_object(
                model_name=model,
                model_type=llama_model_type,
                dtype=dtype,
                gpu_memory_utilization=gpu_memory_utilization,
                max_model_len=max_model_len,
                max_logprobs=max_probs,
                logits_processor=logits_processor,
                **kwargs
            )
            # Llama.cpp does not have a separate engine params class
            engine_params = None

        elif engine == "tensorrt":
            backend = _get_tensorrt_backend()
            self.llm = backend.create_LLM_object(
                model_name=model,
                dtype=dtype,
                gpu_memory_utilization=gpu_memory_utilization,
                max_model_len=max_model_len,
                max_logprobs=max_probs,
                logits_processor=logits_processor,
                **kwargs
            )
            # TensorRT-LLM uses per-request engine params, create a default instance
            engine_params = backend.create_tensorrt_engine_params()

        else:
            raise ValueError(f"Engine {engine} not supported for Autoregressive Sampler. Supported engines are: 'vllm', 'llama_cpp', 'tensorrt'")

        # Every backend exposes its standard sampling function as `sample`
        self.sample_fn = backend.sample

        # Some models do not ship tokenizer files in the model repo, so an
        # explicit tokenizer path takes precedence over the model path.
        tokenizer_source = tokenizer_path if tokenizer_path is not None else model
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, trust_remote_code=trust_remote_code)

        # getattr's default also covers engine_params being None
        print("Engine Params Extra Args:", getattr(engine_params, "extra_args", "N/A"))

        # Use the caller's sampling params if given, otherwise build defaults
        if sampling_params is None:
            self.sampling_params = Sampling_Params(
                engine=engine,
                engine_params=engine_params,
                logprobs_per_token=max_probs,
                logits_per_token=max_probs
            )
        else:
            self.sampling_params = sampling_params

        # Test-time scaling is off until an enable_* method is called. All related
        # attributes are created here so they can be read safely before that.
        self.chain_sampling = None
        self.token_sampling = None
        self.chain_sample_fn = None
        self.token_sample_fn = None
        self.chain_sample_name = None
        self.token_sample_name = None

    def sample(self,
        context: str,
        **kwargs: Any
    )-> Output:
        """Run standard sampling by delegating to the engine backend's sample function.

        Args:
            context (str): The input context.
            **kwargs: Additional keyword arguments forwarded to the backend sample function.

        Returns:
            Output: Whatever the backend sample function returns.
        """
        # sample_fn is a plain function stored on the instance, so self is passed explicitly
        return self.sample_fn(self, context, **kwargs)

    def token_sample(self,
        context: str,
        **kwargs: Any
    )-> Output:
        """Sample using the enabled token-level sampling function.

        Args:
            context (str): The input context.
            **kwargs: Additional keyword arguments forwarded to the token sampling function.

        Returns:
            Output: Whatever the token sampling function returns.

        Raises:
            ValueError: If Power Sampling has not been enabled.
        """
        # getattr keeps this safe for instances created without running __init__
        if getattr(self, "token_sample_name", None) != "Power Sampling":
            raise ValueError("Token sampling is not enabled for this LLM/Engine.")
        return self.token_sample_fn(self, context, **kwargs)

    def chain_sample(self,
        context: str,
        **kwargs: Any
    )-> Output:
        """Sample using the enabled chain-level sampling function.

        Args:
            context (str): The input context.
            **kwargs: Additional keyword arguments forwarded to the chain sampling function.

        Returns:
            Output: Whatever the chain sampling function returns.

        Raises:
            ValueError: If neither "SMC" nor "Best-of-N" chain sampling is enabled.
        """
        # getattr keeps this safe for instances created without running __init__
        if getattr(self, "chain_sample_name", None) not in ("SMC", "Best-of-N"):
            raise ValueError("Chain sampling is not enabled for this LLM/Engine.")
        return self.chain_sample_fn(self, context, **kwargs)

    # Chain Sampling Methods
    def enable_smc(
        self,
        num_particles: int,
        tokens_per_step: int,
        stop_on_eos: bool,
        token_metric: str,
        aggregation: str
    )-> None:
        """Enable SMC (Sequential Monte Carlo) chain sampling for this sampler.

        Args:
            num_particles (int): Number of particles to use for SMC.
            tokens_per_step (int): Number of tokens to generate per step.
            stop_on_eos (bool): (WIP) Whether to stop on end of sequence.
            token_metric (str): Token metric to use to grade each particle. Can be logprobs, power_distribution, or entropy. PRM is recognized but not supported yet.
            aggregation (str): Aggregation method of the scores of each particle. Can be last, minimum, product, or model_aggregate.

        Returns:
            None

        Raises:
            ValueError: If token_metric is "PRM" or unknown, or if aggregation is unknown.
        """
        # An already enabled chain strategy is replaced, with a warning
        if self.chain_sampling is not None:
            print("Warning: Current Chain Sampling Strategy is being replaced with SMC.")

        # Validate the metric name before asking the engine backend about it
        if token_metric == "PRM":
            raise ValueError("PRM is not supported YET for SMC.")
        if token_metric not in ("logprobs", "power_distribution", "entropy"):
            raise ValueError(f"{token_metric} not supported for SMC.")

        # NOTE(review): presumably the backend check raises when the loaded
        # engine/LLM cannot provide this metric - confirm in the backend modules.
        if self.engine == "vllm":
            vllm_backend.check_token_metric_compatibility(self, token_metric)
        elif self.engine == "llama_cpp":
            llama_cpp_backend.check_token_metric_compatibility(self, token_metric)
        elif self.engine == "tensorrt":
            tensorrt_backend.check_token_metric_compatibility(self, token_metric)

        if aggregation not in ("last", "minimum", "product", "model_aggregate"):
            raise ValueError(f"{aggregation} not supported for SMC.")

        # Imported inside the method, as in the original code
        from pita.sampling.smc import Sequential_Monte_Carlo
        self.chain_sampling = Sequential_Monte_Carlo(
            num_particles=num_particles,
            tokens_per_step=tokens_per_step,
            stop_on_eos=stop_on_eos,
            token_metric=token_metric,
            aggregation=aggregation
        )

        # chain_sample() dispatches through these two attributes
        self.chain_sample_fn = self.chain_sampling.sample
        self.chain_sample_name = "SMC"

    # Token Sampling Methods
    def enable_power_sampling(
        self,
        block_size: int,
        MCMC_steps: int,
        token_metric: str,
    )-> None:
        """Enable Power Sampling (token-level sampling) for this sampler.

        The token metric is validated and checked against the engine backend
        before the Power Sampling object is created.

        Args:
            block_size (int): Number of tokens to generate per step.
            MCMC_steps (int): Number of MCMC steps to use for Power Sampling.
            token_metric (str): Token metric passed to Power Sampling. Can be logprobs, power_distribution, or entropy. PRM is recognized but not supported yet.

        Returns:
            None

        Raises:
            ValueError: If token_metric is "PRM" or unknown.
        """
        # An already enabled token strategy is replaced, with a warning
        if self.token_sampling is not None:
            print("Warning: Current Token Sampling Strategy is being replaced with Power Sampling.")

        # Validate the metric name before asking the engine backend about it
        if token_metric == "PRM":
            raise ValueError("PRM is not supported YET for Power Sampling.")
        if token_metric not in ("logprobs", "power_distribution", "entropy"):
            raise ValueError(f"{token_metric} not supported for Power Sampling.")

        # NOTE(review): presumably the backend check raises when the loaded
        # engine/LLM cannot provide this metric - confirm in the backend modules.
        if self.engine == "vllm":
            vllm_backend.check_token_metric_compatibility(self, token_metric)
        elif self.engine == "llama_cpp":
            llama_cpp_backend.check_token_metric_compatibility(self, token_metric)
        elif self.engine == "tensorrt":
            tensorrt_backend.check_token_metric_compatibility(self, token_metric)

        # Imported inside the method, as in the original code
        from pita.sampling.power_sample import Power_Sampling
        self.token_sampling = Power_Sampling(
            block_size=block_size,
            MCMC_steps=MCMC_steps,
            token_metric=token_metric
        )

        # token_sample() dispatches through these two attributes
        self.token_sample_fn = self.token_sampling.sample
        self.token_sample_name = "Power Sampling"

__init__(engine: str, model: str, dtype: str, tokenizer_path: str, gpu_memory_utilization: float, max_model_len: int, max_probs: int, logits_processor: bool, trust_remote_code: bool, sampling_params: Sampling_Params, **kwargs: Any) -> None

Create an AutoregressiveSampler object given the engine, engine parameters, and model name.

Parameters:

Name Type Description Default
engine str

Engine to use for autoregressive sampling. Supported engines are "vllm", "llama_cpp", and "tensorrt".

required
model str

Model to load.

required
dtype str

Data type to use when loading the model. "auto" lets the engine decide.

required
tokenizer_path str

Path to a model with a tokenizer if the model path doesn't include a tokenizer.

required
gpu_memory_utilization float

GPU memory utilization to use.

required
max_model_len int

Max model context length (context window = prompt + generated tokens).

required
max_probs int

Number of top ranked probabilities (logits & logprobs) to store per output token.

required
logits_processor bool

Whether to enable the internal logits processor that allows for normalization constants and entropy to be calculated.

required
trust_remote_code bool

Whether to trust remote code when loading the model.

required
sampling_params Sampling_Params

General sampling parameters to use (Sampling_Params Class).

required
**kwargs Any

Additional keyword arguments passed to the backend LLM creation function.

{}

Raises:

Type Description
ValueError

If the engine is not supported.

Source code in pita/inference/LLM_backend.py
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
def __init__(
    self,
    engine: str,
    model: str,
    dtype: str,
    tokenizer_path: str,
    gpu_memory_utilization: float,
    max_model_len: int,
    max_probs: int,
    logits_processor: bool,
    trust_remote_code: bool,
    sampling_params: Sampling_Params,
    **kwargs: Any
) -> None:
    """Load the model with the chosen engine and set up standard sampling.

    Args:
        engine (str): Engine to use for autoregressive sampling. Supported values are "vllm", "llama_cpp" and "tensorrt".
        model (str): Model to load.
        dtype (str): Data type to use when loading the model. "auto" lets the engine decide.
        tokenizer_path (str): Path to a model with a tokenizer. If None, the tokenizer is loaded from ``model``.
        gpu_memory_utilization (float): GPU memory utilization to use.
        max_model_len (int): Max model context length (context window = prompt + generated tokens).
        max_probs (int): Number of top ranked probabilities (logits & logprobs) to store per output token.
        logits_processor (bool): Whether to enable the internal logits processor that allows for normalization constants and entropy to be calculated. With the "vllm" engine it is switched on automatically when ``max_probs > 0``.
        trust_remote_code (bool): Whether to trust remote code when loading the tokenizer.
        sampling_params (Sampling_Params): General sampling parameters to use. If None, a default Sampling_Params is built from the engine, its engine params, and ``max_probs``.
        **kwargs: Additional keyword arguments passed to the backend LLM creation function. For "llama_cpp", an optional ``model_type`` entry is extracted and passed explicitly.

    Raises:
        ValueError: If the engine is not supported.
    """
    self.engine = engine
    self.model = model

    print(f"Loading model {model} with {engine}...")

    # Each engine has its own backend module and its own keyword names.
    if engine == "vllm":
        backend = _get_vllm_backend()

        # Any falsy logits_processor (False, None, 0) is treated as "disabled" here.
        if max_probs > 0 and not logits_processor:
            print("max_probs is set but logits_processor is False. Setting logits_processor to True.")
            logits_processor = True

        self.llm = backend.create_LLM_object(
            model_name=model,
            dtype=dtype,
            gpu_memory_utilization=gpu_memory_utilization,
            max_model_len=max_model_len,
            max_probs=max_probs,
            logits_processor=logits_processor,
            **kwargs
        )

        # Engine parameters used for the completion function in vLLM
        engine_params = backend.create_vllm_engine_params()

        if logits_processor:
            # Tag the engine params with a time-based request id.
            # NOTE(review): the previous comment mentioned a redis client and
            # normalization-constant lists; only req_id is set here - confirm
            # what the logits processor actually expects.
            print("Enabling logits processing in engine parameters extra_args.")
            engine_params.extra_args = {"req_id": "my_request_" + str(time.time())}

    elif engine == "llama_cpp":
        backend = _get_llama_cpp_backend()
        # Pull model_type out of kwargs so it is not passed twice; None lets the backend infer it
        llama_model_type = kwargs.pop('model_type', None)
        self.llm = backend.create_LLM_object(
            model_name=model,
            model_type=llama_model_type,
            dtype=dtype,
            gpu_memory_utilization=gpu_memory_utilization,
            max_model_len=max_model_len,
            max_logprobs=max_probs,
            logits_processor=logits_processor,
            **kwargs
        )
        # Llama.cpp does not have a separate engine params class
        engine_params = None

    elif engine == "tensorrt":
        backend = _get_tensorrt_backend()
        self.llm = backend.create_LLM_object(
            model_name=model,
            dtype=dtype,
            gpu_memory_utilization=gpu_memory_utilization,
            max_model_len=max_model_len,
            max_logprobs=max_probs,
            logits_processor=logits_processor,
            **kwargs
        )
        # TensorRT-LLM uses per-request engine params, create a default instance
        engine_params = backend.create_tensorrt_engine_params()

    else:
        raise ValueError(f"Engine {engine} not supported for Autoregressive Sampler. Supported engines are: 'vllm', 'llama_cpp', 'tensorrt'")

    # Every backend exposes its standard sampling function as `sample`
    self.sample_fn = backend.sample

    # Some models do not ship tokenizer files in the model repo, so an
    # explicit tokenizer path takes precedence over the model path.
    tokenizer_source = tokenizer_path if tokenizer_path is not None else model
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, trust_remote_code=trust_remote_code)

    # getattr's default also covers engine_params being None
    print("Engine Params Extra Args:", getattr(engine_params, "extra_args", "N/A"))

    # Use the caller's sampling params if given, otherwise build defaults
    if sampling_params is None:
        self.sampling_params = Sampling_Params(
            engine=engine,
            engine_params=engine_params,
            logprobs_per_token=max_probs,
            logits_per_token=max_probs
        )
    else:
        self.sampling_params = sampling_params

    # Test-time scaling is off until an enable_* method is called. All related
    # attributes are created here so they can be read safely before that.
    self.chain_sampling = None
    self.token_sampling = None
    self.chain_sample_fn = None
    self.token_sample_fn = None
    self.chain_sample_name = None
    self.token_sample_name = None

chain_sample(context: str, **kwargs: Any) -> Output

Samples programmatically from the LLM using the chain sampling function

Parameters:

Name Type Description Default
context str

The input context.

required
**kwargs Any

Additional keyword arguments passed to the chosen LLM Inference Engine.

{}

Returns:

Name Type Description
Output Output

The output of the sample function.

Source code in pita/inference/LLM_backend.py
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
def chain_sample(self,
    context: str,
    **kwargs: Any
)-> Output:
    """Sample using the enabled chain-level sampling function.

    Args:
        context (str): The input context.
        **kwargs: Additional keyword arguments forwarded to the chain sampling function.

    Returns:
        Output: Whatever the chain sampling function returns.

    Raises:
        ValueError: If neither "SMC" nor "Best-of-N" chain sampling is enabled.
    """
    # The name attribute may not exist yet, hence getattr with a None default
    enabled_strategy = getattr(self, "chain_sample_name", None)
    if enabled_strategy not in ("SMC", "Best-of-N"):
        raise ValueError("Chain sampling is not enabled for this LLM/Engine.")
    return self.chain_sample_fn(self, context, **kwargs)

enable_power_sampling(block_size: int, MCMC_steps: int, token_metric: str) -> None

Enables Power Sampling for the chosen LLM/Engine. Checks to see if the engine/LLM is compatible with Power Sampling by verifying that the token metric is supported/available to be used

Parameters:

Name Type Description Default
block_size int

Number of tokens to generate per step.

required
MCMC_steps int

Number of MCMC steps to use for Power Sampling.

required
token_metric str

Token metric passed to Power Sampling. Can be logprobs, power_distribution, or entropy; PRM is recognized but not yet supported and raises a ValueError.

required

Returns:

Type Description
None

None

Source code in pita/inference/LLM_backend.py
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
def enable_power_sampling(
    self,
    block_size: int,
    MCMC_steps: int,
    token_metric: str,
)-> None:
    """Enable Power Sampling (token-level sampling) for this sampler.

    The token metric is validated and checked against the engine backend
    before the Power Sampling object is created.

    Args:
        block_size (int): Number of tokens to generate per step.
        MCMC_steps (int): Number of MCMC steps to use for Power Sampling.
        token_metric (str): Token metric passed to Power Sampling. Can be logprobs, power_distribution, or entropy. PRM is recognized but not supported yet.

    Returns:
        None

    Raises:
        ValueError: If token_metric is "PRM" or unknown.
    """
    # An already enabled token strategy is replaced, with a warning
    if self.token_sampling is not None:
        print("Warning: Current Token Sampling Strategy is being replaced with Power Sampling.")

    # Validate the metric name before asking the engine backend about it
    if token_metric == "PRM":
        raise ValueError("PRM is not supported YET for Power Sampling.")
    if token_metric not in ("logprobs", "power_distribution", "entropy"):
        raise ValueError(f"{token_metric} not supported for Power Sampling.")

    # NOTE(review): presumably the backend check raises when the loaded
    # engine/LLM cannot provide this metric - confirm in the backend modules.
    if self.engine == "vllm":
        vllm_backend.check_token_metric_compatibility(self, token_metric)
    elif self.engine == "llama_cpp":
        llama_cpp_backend.check_token_metric_compatibility(self, token_metric)
    elif self.engine == "tensorrt":
        tensorrt_backend.check_token_metric_compatibility(self, token_metric)

    # Imported inside the function, as in the original code
    from pita.sampling.power_sample import Power_Sampling
    power_sampler = Power_Sampling(
        block_size=block_size,
        MCMC_steps=MCMC_steps,
        token_metric=token_metric
    )
    self.token_sampling = power_sampler

    # token_sample() dispatches through these two attributes
    self.token_sample_fn = self.token_sampling.sample
    self.token_sample_name = "Power Sampling"

enable_smc(num_particles: int, tokens_per_step: int, stop_on_eos: bool, token_metric: str, aggregation: str) -> None

Enables SMC sampling for the chosen LLM/Engine.

Parameters:

Name Type Description Default
num_particles int

Number of particles to use for SMC.

required
tokens_per_step int

Number of tokens to generate per step.

required
stop_on_eos bool

(WIP) Whether to stop on end of sequence.

required
token_metric str

Token metric to use to grade each particle. Can be logprobs, power_distribution, or entropy; PRM is recognized but not yet supported and raises a ValueError.

required
aggregation str

Aggregation method of the scores of each particle. Can be the last, minimum, product, or model_aggregate.

required

Returns:

Type Description
None

None

Source code in pita/inference/LLM_backend.py
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
def enable_smc(
    self,
    num_particles: int,
    tokens_per_step: int,
    stop_on_eos: bool,
    token_metric: str,
    aggregation: str
)-> None:
    """Enable SMC (Sequential Monte Carlo) chain sampling for this sampler.

    Args:
        num_particles (int): Number of particles to use for SMC.
        tokens_per_step (int): Number of tokens to generate per step.
        stop_on_eos (bool): (WIP) Whether to stop on end of sequence.
        token_metric (str): Token metric to use to grade each particle. Can be logprobs, power_distribution, or entropy. PRM is recognized but not supported yet.
        aggregation (str): Aggregation method of the scores of each particle. Can be last, minimum, product, or model_aggregate.

    Returns:
        None

    Raises:
        ValueError: If token_metric is "PRM" or unknown, or if aggregation is unknown.
    """
    # An already enabled chain strategy is replaced, with a warning
    if self.chain_sampling is not None:
        print("Warning: Current Chain Sampling Strategy is being replaced with SMC.")

    # Validate the metric name before asking the engine backend about it
    if token_metric == "PRM":
        raise ValueError("PRM is not supported YET for SMC.")
    if token_metric not in ("logprobs", "power_distribution", "entropy"):
        raise ValueError(f"{token_metric} not supported for SMC.")

    # NOTE(review): presumably the backend check raises when the loaded
    # engine/LLM cannot provide this metric - confirm in the backend modules.
    if self.engine == "vllm":
        vllm_backend.check_token_metric_compatibility(self, token_metric)
    elif self.engine == "llama_cpp":
        llama_cpp_backend.check_token_metric_compatibility(self, token_metric)
    elif self.engine == "tensorrt":
        tensorrt_backend.check_token_metric_compatibility(self, token_metric)

    # The aggregation name is only checked after the metric checks above
    if aggregation not in ("last", "minimum", "product", "model_aggregate"):
        raise ValueError(f"{aggregation} not supported for SMC.")

    # Imported inside the function, as in the original code
    from pita.sampling.smc import Sequential_Monte_Carlo
    smc_sampler = Sequential_Monte_Carlo(
        num_particles=num_particles,
        tokens_per_step=tokens_per_step,
        stop_on_eos=stop_on_eos,
        token_metric=token_metric,
        aggregation=aggregation
    )
    self.chain_sampling = smc_sampler

    # chain_sample() dispatches through these two attributes
    self.chain_sample_fn = self.chain_sampling.sample
    self.chain_sample_name = "SMC"

sample(context: str, **kwargs: Any) -> Output

Samples programmatically from the LLM given a context and max new tokens. Sample function is the engine_backend.sample function.

Parameters:

Name Type Description Default
context str

The input context.

required
**kwargs Any

Additional keyword arguments passed to the chosen LLM Inference Engine.

{}

Returns:

Name Type Description
Output Output

The output of the sample function.

Source code in pita/inference/LLM_backend.py
414
415
416
417
418
419
420
421
422
423
424
425
426
427
def sample(self,
    context: str,
    **kwargs: Any
)-> Output:
    """Run standard sampling by delegating to the configured ``sample_fn``.

    The sampler instance itself is handed to ``sample_fn`` as its first
    argument, followed by the context and every keyword argument unchanged.

    Args:
        context (str): The input context.
        **kwargs: Additional keyword arguments forwarded untouched to ``sample_fn``.

    Returns:
        Output: Whatever ``sample_fn`` returns.
    """
    # Bind the configured callable first, then invoke it with this sampler.
    configured_fn = self.sample_fn
    return configured_fn(self, context, **kwargs)

token_sample(context: str, **kwargs: Any) -> Output

Samples programmatically from the LLM using the token sampling function.

Parameters:

Name Type Description Default
context str

The input context.

required
**kwargs Any

Additional keyword arguments passed to the chosen LLM Inference Engine.

{}

Returns:

Name Type Description
Output Output

The output of the sample function.

Source code in pita/inference/LLM_backend.py
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
def token_sample(self,
    context: str,
    **kwargs: Any
)-> Output:
    """Sample through the token-level sampling function, if one is enabled.

    The call is only forwarded when the instance's ``token_sample_name``
    attribute equals "Power Sampling"; a missing attribute counts as disabled.

    Args:
        context (str): The input context.
        **kwargs: Additional keyword arguments forwarded untouched to ``token_sample_fn``.

    Returns:
        Output: Whatever ``token_sample_fn`` returns.

    Raises:
        ValueError: If token sampling has not been enabled on this instance.
    """
    # getattr with a default tolerates instances that never enabled token sampling.
    active_strategy = getattr(self, "token_sample_name", None)
    if active_strategy == "Power Sampling":
        return self.token_sample_fn(self, context, **kwargs)

    raise ValueError("Token sampling is not enabled for this LLM/Engine.")

Output

Output object for any LLM sampling.

Attributes:

Name Type Description
tokens list[int] | list[list[int]]

The generated token IDs.

top_k_logits list[float] | list[list[float]] | None

The top_k logits (if logits_per_token is set). First value is always the chosen token logit.

top_k_logprobs list[float] | list[list[float]] | None

The top_k logprobs (if logprobs is set). First value is always the chosen token logprob.

unprocessed_log_normalization_constant list[float] | list[list[float]]

The log(Normalization Constants - Unprocessed) for each token.

temp_processed_log_normalization_constant list[float] | list[list[float]]

The log(Normalization Constants - Temperature Processed) for each token.

entropy list[float] | list[list[float]]

The entropy for each token.

Source code in pita/inference/LLM_backend.py
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
class Output:
    """Result container shared by every LLM sampling path.

    Each attribute defaults to None and is stored exactly as passed in.

    Attributes:
        tokens (list[int] | list[list[int]]): The generated token IDs.
        top_k_logits (list[float] | list[list[float]] | None): The top_k logits (if logits_per_token is set). First value is always the chosen token logit.
        top_k_logprobs (list[float] | list[list[float]] | None): The top_k logprobs (if logprobs is set). First value is always the chosen token logprob.
        unprocessed_log_normalization_constant (list[float] | list[list[float]]): The log(Normalization Constants - Unprocessed) for each token.
        temp_processed_log_normalization_constant (list[float] | list[list[float]]): The log(Normalization Constants - Temperature Processed) for each token.
        entropy (list[float] | list[list[float]]): The entropy for each token.
    """
    def __init__(
        self,
        tokens: list[int] | list[list[int]] = None,
        top_k_logits: list[float] | list[list[float]] | None = None,
        top_k_logprobs: list[float] | list[list[float]] | None = None,
        unprocessed_log_normalization_constant: list[float] | list[list[float]] = None,
        temp_processed_log_normalization_constant: list[float] | list[list[float]] = None,
        entropy: list[float] | list[list[float]] = None,
    ):
        # Token IDs plus the optional per-token top-k statistics.
        self.tokens = tokens
        self.top_k_logits = top_k_logits
        self.top_k_logprobs = top_k_logprobs

        # Per-token diagnostics.
        self.unprocessed_log_normalization_constant = unprocessed_log_normalization_constant
        self.temp_processed_log_normalization_constant = temp_processed_log_normalization_constant
        self.entropy = entropy

    def append(self, other: 'Output'):
        """
        Merge another Output into this one, field by field.

        For each field: a None value on ``other`` is skipped; if this object's
        field is None it adopts ``other``'s value (deep-copied when it is a
        list); if both are lists this object's list is extended in place.
        Any other combination leaves the field unchanged.

        Args:
            other (Output): The other output object to append. None is a no-op.
        """
        if other is None:
            return

        for field_name in (
            'tokens',
            'top_k_logits',
            'top_k_logprobs',
            'unprocessed_log_normalization_constant',
            'temp_processed_log_normalization_constant',
            'entropy',
        ):
            current = getattr(self, field_name)
            incoming = getattr(other, field_name)

            if incoming is None:
                continue

            if current is None:
                # Adopt the incoming value; lists are deep-copied so the two
                # objects do not end up sharing the same list.
                adopted = copy.deepcopy(incoming) if isinstance(incoming, list) else incoming
                setattr(self, field_name, adopted)
            elif isinstance(current, list) and isinstance(incoming, list):
                current.extend(incoming)

append(other: Output)

Appends the data from another Output object to this one by extending internal lists.

Parameters:

Name Type Description Default
other Output

The other output object to append.

required
Source code in pita/inference/LLM_backend.py
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
def append(self, other: 'Output'):
    """
    Merge another Output into this one, field by field.

    For each field: a None value on ``other`` is skipped; if this object's
    field is None it adopts ``other``'s value (deep-copied when it is a
    list); if both are lists this object's list is extended in place.
    Any other combination leaves the field unchanged.

    Args:
        other (Output): The other output object to append. None is a no-op.
    """
    if other is None:
        return

    for field_name in (
        'tokens',
        'top_k_logits',
        'top_k_logprobs',
        'unprocessed_log_normalization_constant',
        'temp_processed_log_normalization_constant',
        'entropy',
    ):
        current = getattr(self, field_name)
        incoming = getattr(other, field_name)

        if incoming is None:
            continue

        if current is None:
            # Adopt the incoming value; lists are deep-copied so the two
            # objects do not end up sharing the same list.
            adopted = copy.deepcopy(incoming) if isinstance(incoming, list) else incoming
            setattr(self, field_name, adopted)
        elif isinstance(current, list) and isinstance(incoming, list):
            current.extend(incoming)

Sampling_Params

Sampling parameters used for generating results from the LLM. Generalized across all engines. Changes to this class should be reflected in the engine specific parameter classes.

Parameters:

Name Type Description Default
engine str

Engine name (e.g., "vllm", "transformers", etc.).

None
engine_params object

Engine specific parameter Class (vLLM: SamplingParams, llama.cpp: None).

None
enable_thinking bool

Whether to enable thinking.

False
max_tokens int

Max Number of tokens to generate per sequence.

16
temperature float

Controls randomness of sampling. Lower is more deterministic, higher is more random.

1.0
top_p float

Controls tokens to consider based on cumulative probability. Must be in (0, 1].

1.0
top_k int

Controls number of top tokens to consider. 0 considers all tokens.

0
logprobs_per_token int

Number of logprobs to return per output token. One extra entry is returned for the chosen token, so logprobs_per_token + 1 entries are returned in total.

None
logits_per_token int

Number of descending ranked logits to return per output token.

None
presence_penalty float

Penalizes new tokens based on appearance in generated text so far. > 0 encourages new tokens, < 0 encourages repeats.

0.0
frequency_penalty float

Penalizes new tokens based on frequency in generated text so far. > 0 encourages new tokens, < 0 encourages repeats.

0.0
repetition_penalty float

Penalizes new tokens based on appearance in prompt AND generated text so far. > 1 encourages new tokens, < 1 encourages repeats.

1.0
min_p float

Represents the minimum probability for a token to be considered. 0 disables.

0.0
seed int

Random seed.

None
stop list[str]

Strings that stop token generation. Returned output excludes stop strings.

None
stop_token_ids list[int]

Token IDs that stop token generation. Returned output excludes stop tokens.

None
ignore_eos bool

Continues generating tokens after EOS token is generated.

False
min_tokens int

Minimum Number of tokens to generate per sequence before EOS or stop is considered.

0
enable_normalization_constants bool

Whether to enable normalization constants.

False
enable_entropy bool

Whether to enable entropy.

False
Source code in pita/inference/LLM_backend.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
class Sampling_Params:
    """Sampling parameters used for generating results from the LLM. Generalized across all engines. Changes to this class should be reflected in the engine specific parameter classes.

    Attribute assignments made after construction are mirrored onto
    ``engine_params`` through ``__setattr__`` / ``_sync_param_to_engine``,
    using the module-level ``ENGINE_PARAM_MAPS`` lookup to translate names.

    Args:
        engine (str): Engine name (e.g., "vllm", "transformers", etc.).
        engine_params (object): Engine specific parameter Class (vLLM: SamplingParams, llama.cpp: None).
        enable_thinking (bool): Whether to enable thinking.
        max_tokens (int): Max Number of tokens to generate per sequence.
        temperature (float): Controls randomness of sampling. Lower is more deterministic, higher is more random.
        top_p (float): Controls tokens to consider based on cumulative probability. Must be in (0, 1].
        top_k (int): Controls number of top tokens to consider. 0 considers all tokens.
        logprobs_per_token (int): Number of logprobs to return per output token. logprobs+1 token returned (includes chosen token).
        logits_per_token (int): Number of descending ranked logits to return per output token.
        presence_penalty (float): Penalizes new tokens based on appearance in generated text so far. > 0 encourages new tokens, < 0 encourages repeats.
        frequency_penalty (float): Penalizes new tokens based on frequency in generated text so far. > 0 encourages new tokens, < 0 encourages repeats.
        repetition_penalty (float): Penalizes new tokens based on appearance in prompt AND generated text so far. > 1 encourages new tokens, < 1 encourages repeats.
        min_p (float): Represents the minimum probability for a token to be considered. 0 disables.
        seed (int): Random seed.
        stop (list[str]): Strings that stop token generation. Returned output excludes stop strings.
        stop_token_ids (list[int]): Token IDs that stop token generation. Returned output excludes stop tokens.
        ignore_eos (bool): Continues generating tokens after EOS token is generated.
        min_tokens (int): Minimum Number of tokens to generate per sequence before EOS or stop is considered.
        enable_normalization_constants (bool): Whether to enable normalization constants.
        enable_entropy (bool): Whether to enable entropy.

    Raises:
        ValueError: From the initial sync (or any later attribute assignment)
            when the engine is not "llama_cpp" and ``engine`` or
            ``engine_params`` is None at the time of syncing.
    """
    def __init__(
        self,
        engine: str | None = None,
        engine_params: object = None,
        enable_thinking: bool = False,
        max_tokens: int = 16,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = 0,
        logprobs_per_token: int | None = None,
        logits_per_token: int | None = None,
        presence_penalty: float = 0.0,
        frequency_penalty: float = 0.0,
        repetition_penalty: float = 1.0,
        min_p: float = 0.0,
        seed: int | None = None,
        stop: list[str] | None = None,
        stop_token_ids: list[int] | None = None,
        ignore_eos: bool = False,
        min_tokens: int = 0,
        enable_normalization_constants: bool = False,
        enable_entropy: bool = False
    ):
        # object.__setattr__ bypasses the syncing __setattr__ below, so no
        # engine sync is attempted while the instance is only partly populated.
        object.__setattr__(self, 'engine', engine)
        object.__setattr__(self, 'engine_params', engine_params)
        object.__setattr__(self, 'enable_thinking', enable_thinking)
        object.__setattr__(self, 'max_tokens', max_tokens)
        object.__setattr__(self, 'temperature', temperature)
        object.__setattr__(self, 'top_p', top_p)
        object.__setattr__(self, 'top_k', top_k)
        object.__setattr__(self, 'logprobs_per_token', logprobs_per_token)
        object.__setattr__(self, 'logits_per_token', logits_per_token)
        object.__setattr__(self, 'presence_penalty', presence_penalty)
        object.__setattr__(self, 'frequency_penalty', frequency_penalty)
        object.__setattr__(self, 'repetition_penalty', repetition_penalty)
        object.__setattr__(self, 'min_p', min_p)
        object.__setattr__(self, 'seed', seed)
        object.__setattr__(self, 'stop', stop)
        object.__setattr__(self, 'stop_token_ids', stop_token_ids)
        object.__setattr__(self, 'ignore_eos', ignore_eos)
        object.__setattr__(self, 'min_tokens', min_tokens)
        object.__setattr__(self, 'enable_normalization_constants', enable_normalization_constants)
        object.__setattr__(self, 'enable_entropy', enable_entropy)

        # Sync all parameters to engine_params after initialization
        if engine is not None and engine_params is not None:
            for param_name in ['max_tokens', 'temperature', 'top_p', 'top_k', 'logprobs_per_token', 'logits_per_token',
                               'presence_penalty', 'frequency_penalty', 'repetition_penalty',
                               'min_p', 'seed', 'stop', 'stop_token_ids', 'ignore_eos', 'min_tokens']:
                self._sync_param_to_engine(param_name, getattr(self, param_name))


    def __setattr__(self, name, value):
        """Store the attribute, then propagate the change to ``engine_params``.

        For the vLLM engine, the two ``enable_*`` flags are written into
        ``engine_params.extra_args`` instead of going through the name map.
        Every other assignment is handed to ``_sync_param_to_engine``.
        """
        # The value is stored first, so the sync below sees the updated state.
        super().__setattr__(name, value)

        # If attribute is dependent on a Logits Processor, makes sure to propagate the change
        if(self.engine == "vllm"):
            if(name == "enable_normalization_constants"):
                self.engine_params.extra_args["normalization_constants"] = value
                return
            elif(name == "enable_entropy"):
                self.engine_params.extra_args["entropy"] = value
                return

        self._sync_param_to_engine(name, value)


    def _sync_param_to_engine(self, param_name, value):
        """Sync a single parameter to engine_params.

        Args:
            param_name (str): Name of the Sampling_Params attribute being synced.
            value: The value to write to the engine-specific parameter object.

        Raises:
            ValueError: If ``engine`` or ``engine_params`` is None (llama_cpp excepted).
        """
        engine = getattr(self, 'engine', None)

        # Skip syncing for llama_cpp as it does not use a separate engine_params class
        if engine == "llama_cpp":
            return

        if engine is None:
            raise ValueError("Engine must be set in Sampling_Params to sync parameters to engine_params.")

        if self.engine_params is None:
            raise ValueError("engine_params Class must be set in Sampling_Params to sync parameters to engine_params.")

        # Check if engine is vLLM and logprobs/logits are being changed.
        # The original comments indicate that lowering one of the two counts
        # below the other must not overwrite the vLLM engine parameter
        # (presumably both feed the same vLLM setting -- TODO confirm against
        # ENGINE_PARAM_MAPS). Either count may be None (the default), so the
        # comparison is None-safe: an unset counterpart imposes no floor, and
        # an unset value never overrides a counterpart that is set.
        if engine == "vllm":
            counterpart_name = {
                "logprobs_per_token": "logits_per_token",
                "logits_per_token": "logprobs_per_token",
            }.get(param_name)
            if counterpart_name is not None:
                counterpart = getattr(self, counterpart_name)
                if counterpart is not None and (value is None or value < counterpart):
                    return

        # Handle tensorrt-specific top_k value conversion
        # TensorRT-LLM requires top_k >= 0, where 0 means "consider all tokens"
        # Other backends like vLLM use -1 to mean the same thing
        if engine == "tensorrt" and param_name == "top_k" and value == -1:
            value = 0

        # Translate the generic name to the engine's own name; parameters the
        # engine map does not list are silently skipped.
        engine_map = ENGINE_PARAM_MAPS.get(engine, {})
        engine_param_name = engine_map.get(param_name)
        # If the engine supports this parameter, set it
        if engine_param_name is not None:
            setattr(self.engine_params, engine_param_name, value)