
Getting started

This guide will walk you through the basics of using Litelines to get structured generation from language models. By the end, you'll understand how to:

  1. Install Litelines
  2. Generate a basic structured response
  3. Generate a basic streamed structured response

Installation

To install Litelines, use pip or uv:

pip install litelines
uv pip install litelines
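
To confirm the installation, check that the package imports cleanly:

python -c "import litelines"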

Your First Structured Generation

Let's start with a simple example.

Download a model and its tokenizer

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = torch.device("cuda") # "cuda", "mps", or "cpu"

model_id = "Qwen/Qwen3-0.6B"
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Prepare the inputs to the LLM

user_input = "What is the sentiment of the following text: 'Awesome'"
messages = [{"role": "user", "content": user_input}]
inputs = tokenizer.apply_chat_template(
    messages, 
    add_generation_prompt=True, 
    return_tensors="pt", 
    return_dict=True
).to(model.device)
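
If you want to see exactly what the model receives, you can decode the templated prompt back into text (an optional sanity check):

print(tokenizer.decode(inputs["input_ids"][0]))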

Define a Pydantic schema describing the required JSON

from typing import Literal
from pydantic import BaseModel, Field

class Sentiment(BaseModel):
    """Correctly inferred `Sentiment`."""
    label: Literal["positive", "negative"] = Field(
        ..., description="Sentiment of the text"
    )
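
The processor is driven by the JSON Schema that Pydantic derives from this class. You can inspect that schema directly if you are curious (standard Pydantic v2 API):

import json
print(json.dumps(Sentiment.model_json_schema(), indent=2))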

Define the processor and visualize it

from litelines.transformers import SchemaProcessor

processor = SchemaProcessor(response_format=Sentiment, tokenizer=tokenizer)
processor.show_graph()

[Graph of the allowed token paths. Regular expression: [\n\t ]*{[\n\t ]*"label"[\n\t ]*:[\n\t ]*("positive"|"negative")[\n\t ]*}]

Generate a structured response

generated = model.generate(**inputs, logits_processor=[processor], max_new_tokens=100)
response = tokenizer.decode(
    generated[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
)
print(response)
# {"label": "positive"}
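
Because the processor constrains every generated token to the schema, the output is guaranteed to parse as valid `Sentiment` JSON, so you can load it straight back into the Pydantic model:

sentiment = Sentiment.model_validate_json(response)
print(sentiment.label)
# positive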

Visualize the selected path

processor.show_graph()

[Graph of the allowed token paths, with the path taken during generation highlighted. Regular expression: [\n\t ]*{[\n\t ]*"label"[\n\t ]*:[\n\t ]*("positive"|"negative")[\n\t ]*}]

Your First Streamed Structured Generation

Because Litelines hands you a standard logits processor, you can use it anywhere a processor is accepted. In particular, you can stream a response exactly as you normally would; just remember to pass the processor.

from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
generation_kwargs = dict(
    inputs, streamer=streamer, logits_processor=[processor], max_new_tokens=100
)

thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

assistant_response = ""
for chunk in streamer:
    # Trim trailing special tokens (EOS/pad) from the final chunk
    if tokenizer.eos_token in chunk or tokenizer.pad_token in chunk:
        chunk = chunk.split(tokenizer.eos_token)[0]
        chunk = chunk.split(tokenizer.pad_token)[0]
    assistant_response += chunk
    print(chunk, end="")

thread.join()
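
As with the non-streamed example, the accumulated `assistant_response` is valid JSON for the schema, so it can be parsed the same way:

sentiment = Sentiment.model_validate_json(assistant_response)
print(sentiment.label)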