Understanding Structured Generation (Draft)

code
analysis
Author

Alonso Silva

Published

July 30, 2025

Show the code
import torch
from typing import List
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from transformers.generation import LogitsProcessor

model_id = "Qwen/Qwen3-0.6B"
# model = AutoModelForCausalLM.from_pretrained(
#     model_id, cache_dir="/big_storage/llms/hf_models/"
# ).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)
# streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
def find(target, subwords):
    if target == '':
        return [[]]  # base case: one valid way to make the empty string
    else:
        results = []
        matching_subwords = [s for s in subwords if target.endswith(s)]
        for s in matching_subwords:
            suffix_combos = find(target[:-len(s)], subwords)
            for combo in suffix_combos:
                results.append(combo + [s])
        return results
subwords = {"ouse", "h", "o", "u", "s", "e", "ho", "use"}
target = "the houses"
import interegular

list_of_strings_fsm = interegular.parse_pattern(target).to_fsm()
list_of_strings_fsm
fsm(alphabet = Alphabet({' ': 0, 'h': 1, 't': 2, 'u': 3, anything_else: 4, 'o': 5, 'e': 6, 's': 7}), states = frozenset({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}), initial = 0, finals = frozenset({10}), map = {0: {2: 1}, 1: {1: 2}, 2: {6: 3}, 3: {0: 4}, 4: {1: 5}, 5: {5: 6}, 6: {3: 7}, 7: {7: 8}, 8: {6: 9}, 9: {7: 10}, 10: {}})
from outlines.fsm.regex import make_deterministic_fsm

new_fsm, _ = make_deterministic_fsm(list_of_strings_fsm)
new_fsm
/home/asilva/quarto/alonsosilvaallende.github.io/blog/posts/2025-07-05-Understanding-Function-Calling/.venv/lib/python3.12/site-packages/pyairports/airports.py:1: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
  from pkg_resources import resource_string
fsm(alphabet = BetterAlphabet({' ': 0, 'h': 2, 't': 5, 'u': 6, anything_else: 7, 'o': 3, 'e': 1, 's': 4}), states = frozenset({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}), initial = 0, finals = frozenset({10}), map = {0: {5: 1}, 1: {2: 2}, 2: {1: 3}, 3: {0: 4}, 4: {2: 5}, 5: {3: 6}, 6: {6: 7}, 7: {4: 8}, 8: {1: 9}, 9: {4: 10}, 10: {}})
alphabet = {**new_fsm.alphabet}
mapping = {value: key for key, value in alphabet.items()}
transitions = dict()
for src, dests in new_fsm.map.items():
    transitions_from_src = dict()
    for input_symbol, dest in dests.items():
        label = f"{list(alphabet.keys())[list(alphabet.values()).index(input_symbol)]}"
        transitions_from_src[label] = dest
    transitions[src] = transitions_from_src
transitions
{0: {'t': 1},
 1: {'h': 2},
 2: {'e': 3},
 3: {' ': 4},
 4: {'h': 5},
 5: {'o': 6},
 6: {'u': 7},
 7: {'s': 8},
 8: {'e': 9},
 9: {'s': 10},
 10: {}}
from outlines.models.transformers import TransformerTokenizer

new_tokenizer = TransformerTokenizer(tokenizer)
from outlines.fsm.regex import create_fsm_index_tokenizer

index, _ = create_fsm_index_tokenizer(new_fsm, new_tokenizer)
index
Compiling FSM index for all state transitions: 100%|██████████| 11/11 [00:03<00:00,  3.17it/s]
{0: {83: 1, 1782: 3, 339: 2},
 1: {71: 2, 383: 3},
 2: {68: 3},
 3: {220: 4, 3753: 9, 305: 5, 14967: 10, 7696: 8, 11386: 6},
 4: {18166: 7, 36741: 10, 7675: 9, 71: 5, 6161: 6},
 5: {782: 8, 19757: 10, 283: 7, 78: 6, 1530: 9},
 6: {84: 7, 4776: 10, 355: 8, 810: 9},
 7: {9275: 10, 82: 8, 325: 9},
 8: {68: 9, 288: 10},
 9: {82: 10}}
# target = "the houses"

combinations = find(target, subwords)
for combo in combinations:
    print(" + ".join(combo))
tokens = [tokenizer.decode(token_id) for token_id in range(tokenizer.vocab_size)]
subwords = set(tokens)
combinations = find(target, subwords)
for combo in combinations:
    print(" + ".join(combo))
t + he +  ho + uses
th + e +  ho + uses
t + h + e +  ho + uses
the +  ho + uses
t + he +   + h + o + uses
th + e +   + h + o + uses
t + h + e +   + h + o + uses
the +   + h + o + uses
t + he +  h + o + uses
th + e +  h + o + uses
t + h + e +  h + o + uses
the +  h + o + uses
t + he +   + ho + uses
th + e +   + ho + uses
t + h + e +   + ho + uses
the +   + ho + uses
t + he +  houses
th + e +  houses
t + h + e +  houses
the +  houses
t + he +   + h + ou + ses
th + e +   + h + ou + ses
t + h + e +   + h + ou + ses
the +   + h + ou + ses
t + he +  h + ou + ses
th + e +  h + ou + ses
t + h + e +  h + ou + ses
the +  h + ou + ses
t + he +   + hou + ses
th + e +   + hou + ses
t + h + e +   + hou + ses
the +   + hou + ses
t + he +  ho + u + ses
th + e +  ho + u + ses
t + h + e +  ho + u + ses
the +  ho + u + ses
t + he +   + h + o + u + ses
th + e +   + h + o + u + ses
t + h + e +   + h + o + u + ses
the +   + h + o + u + ses
t + he +  h + o + u + ses
th + e +  h + o + u + ses
t + h + e +  h + o + u + ses
the +  h + o + u + ses
t + he +   + ho + u + ses
th + e +   + ho + u + ses
t + h + e +   + ho + u + ses
the +   + ho + u + ses
t + he +   + h + ouse + s
th + e +   + h + ouse + s
t + h + e +   + h + ouse + s
the +   + h + ouse + s
t + he +  h + ouse + s
th + e +  h + ouse + s
t + h + e +  h + ouse + s
the +  h + ouse + s
t + he +   + house + s
th + e +   + house + s
t + h + e +   + house + s
the +   + house + s
t + he +  ho + us + e + s
th + e +  ho + us + e + s
t + h + e +  ho + us + e + s
the +  ho + us + e + s
t + he +   + h + o + us + e + s
th + e +   + h + o + us + e + s
t + h + e +   + h + o + us + e + s
the +   + h + o + us + e + s
t + he +  h + o + us + e + s
th + e +  h + o + us + e + s
t + h + e +  h + o + us + e + s
the +  h + o + us + e + s
t + he +   + ho + us + e + s
th + e +   + ho + us + e + s
t + h + e +   + ho + us + e + s
the +   + ho + us + e + s
t + he +   + h + ous + e + s
th + e +   + h + ous + e + s
t + h + e +   + h + ous + e + s
the +   + h + ous + e + s
t + he +  h + ous + e + s
th + e +  h + ous + e + s
t + h + e +  h + ous + e + s
the +  h + ous + e + s
t + he +   + h + ou + s + e + s
th + e +   + h + ou + s + e + s
t + h + e +   + h + ou + s + e + s
the +   + h + ou + s + e + s
t + he +  h + ou + s + e + s
th + e +  h + ou + s + e + s
t + h + e +  h + ou + s + e + s
the +  h + ou + s + e + s
t + he +   + hou + s + e + s
th + e +   + hou + s + e + s
t + h + e +   + hou + s + e + s
the +   + hou + s + e + s
t + he +  ho + u + s + e + s
th + e +  ho + u + s + e + s
t + h + e +  ho + u + s + e + s
the +  ho + u + s + e + s
t + he +   + h + o + u + s + e + s
th + e +   + h + o + u + s + e + s
t + h + e +   + h + o + u + s + e + s
the +   + h + o + u + s + e + s
t + he +  h + o + u + s + e + s
th + e +  h + o + u + s + e + s
t + h + e +  h + o + u + s + e + s
the +  h + o + u + s + e + s
t + he +   + ho + u + s + e + s
th + e +   + ho + u + s + e + s
t + h + e +   + ho + u + s + e + s
the +   + ho + u + s + e + s
t + he +  hous + e + s
th + e +  hous + e + s
t + h + e +  hous + e + s
the +  hous + e + s
t + he +  house + s
th + e +  house + s
t + h + e +  house + s
the +  house + s
t + he +  ho + use + s
th + e +  ho + use + s
t + h + e +  ho + use + s
the +  ho + use + s
t + he +   + h + o + use + s
th + e +   + h + o + use + s
t + h + e +   + h + o + use + s
the +   + h + o + use + s
t + he +  h + o + use + s
th + e +  h + o + use + s
t + h + e +  h + o + use + s
the +  h + o + use + s
t + he +   + ho + use + s
th + e +   + ho + use + s
t + h + e +   + ho + use + s
the +   + ho + use + s
t + he +   + h + ou + se + s
th + e +   + h + ou + se + s
t + h + e +   + h + ou + se + s
the +   + h + ou + se + s
t + he +  h + ou + se + s
th + e +  h + ou + se + s
t + h + e +  h + ou + se + s
the +  h + ou + se + s
t + he +   + hou + se + s
th + e +   + hou + se + s
t + h + e +   + hou + se + s
the +   + hou + se + s
t + he +  ho + u + se + s
th + e +  ho + u + se + s
t + h + e +  ho + u + se + s
the +  ho + u + se + s
t + he +   + h + o + u + se + s
th + e +   + h + o + u + se + s
t + h + e +   + h + o + u + se + s
the +   + h + o + u + se + s
t + he +  h + o + u + se + s
th + e +  h + o + u + se + s
t + h + e +  h + o + u + se + s
the +  h + o + u + se + s
t + he +   + ho + u + se + s
th + e +   + ho + u + se + s
t + h + e +   + ho + u + se + s
the +   + ho + u + se + s
t + he +  ho + us + es
th + e +  ho + us + es
t + h + e +  ho + us + es
the +  ho + us + es
t + he +   + h + o + us + es
th + e +   + h + o + us + es
t + h + e +   + h + o + us + es
the +   + h + o + us + es
t + he +  h + o + us + es
th + e +  h + o + us + es
t + h + e +  h + o + us + es
the +  h + o + us + es
t + he +   + ho + us + es
th + e +   + ho + us + es
t + h + e +   + ho + us + es
the +   + ho + us + es
t + he +   + h + ous + es
th + e +   + h + ous + es
t + h + e +   + h + ous + es
the +   + h + ous + es
t + he +  h + ous + es
th + e +  h + ous + es
t + h + e +  h + ous + es
the +  h + ous + es
t + he +   + h + ou + s + es
th + e +   + h + ou + s + es
t + h + e +   + h + ou + s + es
the +   + h + ou + s + es
t + he +  h + ou + s + es
th + e +  h + ou + s + es
t + h + e +  h + ou + s + es
the +  h + ou + s + es
t + he +   + hou + s + es
th + e +   + hou + s + es
t + h + e +   + hou + s + es
the +   + hou + s + es
t + he +  ho + u + s + es
th + e +  ho + u + s + es
t + h + e +  ho + u + s + es
the +  ho + u + s + es
t + he +   + h + o + u + s + es
th + e +   + h + o + u + s + es
t + h + e +   + h + o + u + s + es
the +   + h + o + u + s + es
t + he +  h + o + u + s + es
th + e +  h + o + u + s + es
t + h + e +  h + o + u + s + es
the +  h + o + u + s + es
t + he +   + ho + u + s + es
th + e +   + ho + u + s + es
t + h + e +   + ho + u + s + es
the +   + ho + u + s + es
t + he +  hous + es
th + e +  hous + es
t + h + e +  hous + es
the +  hous + es
t + he +   + h + ouses
th + e +   + h + ouses
t + h + e +   + h + ouses
the +   + h + ouses
t + he +  h + ouses
th + e +  h + ouses
t + h + e +  h + ouses
the +  h + ouses
t + he +   + houses
th + e +   + houses
t + h + e +   + houses
the +   + houses
combo = combinations[0]
combo
['h', 'ouse']
states = [""] + ["".join(combo[:k]) for k in range(1,len(combo)+1)]
states
['', 'h', 'house']
transitions = {}
transitions[""] = ["".join(combo[:k]) for k in range(1,len(combo)+1)]
transitions
{'': ['h', 'house']}
combo = combinations[1]
combo
['hou', 'se']
for k in range(1,len(combo)+1):
    print(f'{"".join(combo[:k-1])}->{"".join(combo[:k])}')
->hou
hou->house
["".join(combo[:k]) for k in range(1,len(combo)+1)]
['hou', 'house']
transitions = {"".join(combo[:k-1]): "".join(combo[:k]) for k in range(1,len(combo)+1)}
transitions
{'': 'hou', 'hou': 'house'}
for transition in transitions:
    print(transition)
    print(transitions[transition])
    # print(transition.key)

hou
hou
house
combo = combinations[2]
combo
['h', 'ou', 'se']
for k in range(1,len(combo)+1):
    for j in range(k):
        print(f'{"".join(combo[j:k-1])} -> {"".join(combo[j:k])}')
 -> h
h -> hou
 -> ou
hou -> house
ou -> ouse
 -> se
transitions = set()
for combo in combinations:
    for k in range(1,len(combo)+1):
        for j in range(k):
            transitions.add(("".join(combo[j:k-1]),"".join(combo[j:k])))
len(transitions)
166
{("".join(combo[j:k-1]), "".join(combo[j:k])) for k in range(1,len(combo)+1) for j in range(k)}
{('', 'h'),
 ('', 'ou'),
 ('', 'se'),
 ('h', 'hou'),
 ('hou', 'house'),
 ('ou', 'ouse')}
states = set()
for combo in combinations:
    states_combo = ["".join(combo[:k]) for k in range(1,len(combo)+1)]
    for state in states_combo:
        states.add(state)
states
{'h', 'ho', 'hou', 'hous', 'house'}
states = set()
for combo in combinations:
    for k in range(1,len(combo)+1):
        states.add("".join(combo[:k]))
states
{'h', 'ho', 'hou', 'hous', 'house'}
states = ['']
transitions = {}
for combo in combinations:
    states_combo = ["".join(combo[:k]) for k in range(1,len(combo)+1)]
    transitions_combo = {("".join(combo[:k-1]): "".join(combo[:k]) for k in range(1,len(combo)+1)}
    for state in states_combo:
        if state not in states:
            states.append(state)
    for key in transitions_combo:
        transitions[key] = transitions_combo[key]
  Cell In[104], line 5
    transitions_combo = {("".join(combo[:k-1]): "".join(combo[:k]) for k in range(1,len(combo)+1)}
                                                                                                 ^
SyntaxError: closing parenthesis '}' does not match opening parenthesis '('
states
{'h', 'ho', 'hou', 'hous', 'house'}
transitions
{('', 'h'),
 ('', 'ou'),
 ('', 'se'),
 ('h', 'hou'),
 ('hou', 'house'),
 ('ou', 'ouse')}