import torch
from typing import List
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from transformers.generation import LogitsProcessor
= "Qwen/Qwen3-0.6B"
model_id # model = AutoModelForCausalLM.from_pretrained(
# model_id, cache_dir="/big_storage/llms/hf_models/"
# ).to("cuda")
= AutoTokenizer.from_pretrained(model_id)
tokenizer # streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
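For reference, here is how the imports above fit together once the commented-out model and streamer lines are enabled: `TextIteratorStreamer` yields decoded text chunks to the main thread while `model.generate` runs on a background `Thread`. This is a minimal sketch, not part of the original cell; the prompt string and `max_new_tokens` value are illustrative.

```python
# Sketch only: assumes `model` has been loaded via the commented-out
# AutoModelForCausalLM.from_pretrained(...) call above.
prompt = "Explain streaming generation in one sentence."  # hypothetical prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

# Run generation on a background thread so the main thread can
# consume text from the streamer as tokens are produced.
generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=128)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

for chunk in streamer:  # yields decoded text pieces as they arrive
    print(chunk, end="", flush=True)
thread.join()
```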