# Copyright 2026, University of Freiburg
# Chair of Algorithms and Data Structures
# Author: Hannah Bast <bast@cs.uni-freiburg.de>
import argparse
import random
import time
import requests
def make_counts_cumulative(counts: list[int]):
    """
    Convert counts c1, ..., cn into their running totals c1, c1 + c2,
    c1 + c2 + c3, ..., modifying the list in place.
    """
    total = 0
    for index, count in enumerate(counts):
        total += count
        counts[index] = total
def random_char(cumulative_counts: list[int]) -> str:
    """
    Pick a character at random, from the probability distribution implicitly
    given by the cumulative counts (entry i is the total count of all
    characters up to and including chr(i + ord(" "))).

    Precondition: the total count `cumulative_counts[-1]` must be positive;
    on an all-zero distribution, random.randint(1, 0) raises ValueError.
    """
    # Draw r uniformly from 1..total, then return the first character whose
    # cumulative count reaches r (inverse-CDF sampling).
    r = random.randint(1, cumulative_counts[-1])
    # FIX: the original unpacked `ci` from enumerate but ignored it and
    # re-indexed the list; use the unpacked value directly.
    for i, cumulative in enumerate(cumulative_counts):
        if r <= cumulative:
            return chr(i + ord(" "))
    # Unreachable for monotone counts, since r <= cumulative_counts[-1];
    # fail loudly instead of silently returning None.
    raise AssertionError("cumulative counts are not monotone")
class UnigramModel:
    """
    A character-level unigram language model over the printable ASCII
    characters " " through "~".
    """

    def __init__(self):
        # One count slot per printable ASCII character; filled by train().
        self.vocab_size = ord("~") - ord(" ") + 1
        self.counts = [0] * self.vocab_size

    def train(self, text: str) -> None:
        """
        Count how often each character between " " and "~" occurs in `text`
        (all other characters are skipped), then convert the counts to
        cumulative form so random_char() can sample from them.
        """
        base = ord(" ")
        for ch in text:
            slot = ord(ch) - base
            if 0 <= slot < self.vocab_size:
                self.counts[slot] += 1
        make_counts_cumulative(self.counts)

    def complete(self, prompt: str, k: int, sleep: float):
        """
        Print `prompt`, then sample and print `k` characters one by one from
        the unigram distribution, pausing `sleep` seconds in between, just
        for fun.
        """
        print(prompt, end="", flush=True)
        for _ in range(k):
            next_char = random_char(self.counts)
            print(next_char, end="", flush=True)
            time.sleep(sleep)
class BigramModel:
    """
    A character-level bigram language model: each next character is sampled
    conditioned on the single preceding character.
    """

    def __init__(self):
        # counts[p][c] counts occurrences of character c right after p.
        self.vocab_size = ord("~") - ord(" ") + 1
        self.counts = [[0] * self.vocab_size for _ in range(self.vocab_size)]

    def train(self, text: str) -> None:
        """
        Count every bigram of in-vocabulary characters (" " through "~") in
        `text`; a pair is skipped whenever either character is out of range.
        Each per-row count list is then made cumulative for sampling.
        """
        base = ord(" ")
        prev = None
        for ch in text:
            slot = ord(ch) - base
            in_vocab = 0 <= slot < self.vocab_size
            if in_vocab and prev is not None:
                self.counts[prev][slot] += 1
            # An out-of-vocabulary character breaks the chain.
            prev = slot if in_vocab else None
        for row in self.counts:
            make_counts_cumulative(row)

    def complete(self, prompt: str, k: int, sleep: float):
        """
        Print `prompt`, then sample and print `k` characters one by one, each
        conditioned on the previously printed character, pausing `sleep`
        seconds in between, just for fun. The prompt must end with an
        in-vocabulary character.
        """
        print(prompt, end="", flush=True)
        assert len(prompt) >= 1
        assert ord(" ") <= ord(prompt[-1]) <= ord("~")
        for _ in range(k):
            context = ord(prompt[-1]) - ord(" ")
            next_char = random_char(self.counts[context])
            print(next_char, end="", flush=True)
            time.sleep(sleep)
            prompt += next_char
class TrigramModel:
    """
    A character-level trigram language model: each next character is sampled
    conditioned on the two preceding characters.
    """

    def __init__(self):
        # counts[p2][p1][c] counts occurrences of c after the pair p2, p1.
        self.vocab_size = ord("~") - ord(" ") + 1
        self.counts = [
            [[0] * self.vocab_size for _ in range(self.vocab_size)]
            for _ in range(self.vocab_size)
        ]

    def train(self, text: str) -> None:
        """
        Count every trigram of in-vocabulary characters (" " through "~") in
        `text`; any out-of-range character resets the context entirely. The
        innermost count lists are then made cumulative for sampling.
        """
        base = ord(" ")
        before_last = None
        last = None
        for ch in text:
            slot = ord(ch) - base
            if 0 <= slot < self.vocab_size:
                if before_last is not None and last is not None:
                    self.counts[before_last][last][slot] += 1
                # Shift the two-character context window forward.
                before_last, last = last, slot
            else:
                # Out-of-vocabulary character: forget the context.
                before_last = None
                last = None
        for plane in self.counts:
            for row in plane:
                make_counts_cumulative(row)

    def complete(self, prompt: str, k: int, sleep: float):
        """
        Print `prompt`, then sample and print `k` characters one by one, each
        conditioned on the two previously printed characters, pausing `sleep`
        seconds in between, just for fun. The prompt must end with two
        in-vocabulary characters.
        """
        print(prompt, end="", flush=True)
        assert len(prompt) >= 2
        assert ord(" ") <= ord(prompt[-1]) <= ord("~")
        assert ord(" ") <= ord(prompt[-2]) <= ord("~")
        for _ in range(k):
            context2 = ord(prompt[-2]) - ord(" ")
            context1 = ord(prompt[-1]) - ord(" ")
            next_char = random_char(self.counts[context2][context1])
            print(next_char, end="", flush=True)
            time.sleep(sleep)
            prompt += next_char
class NeuralLanguageModel:
    """
    A "language model" that delegates completion to a remote,
    OpenAI-compatible chat-completions API endpoint.
    """

    def __init__(self):
        self.api_url = "https://ad-llm.cs.uni-freiburg.de/v1/chat/completions"

    def complete(self, prompt: str, k: int, sleep: float, timeout: float = 60.0):
        """
        Given the text `prompt`, generate `k` characters from the specified
        neural language model and print them one after the other.

        `timeout` bounds the HTTP request in seconds. FIX: the original call
        had no timeout, so a stalled server would hang this method forever.
        """
        system_prompt = (
            "Act like a character-based language model and predict the next characters"
        )
        headers = {
            "Content-Type": "application/json",
            # "Authorization": "Bearer YOUR_API_KEY_HERE",
        }
        data = {
            "model": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            # NOTE: this limits *tokens*, not characters, so the reply may be
            # longer than k characters; it is truncated to k below.
            "max_completion_tokens": k,
            "temperature": 0.7,
        }
        response = requests.post(
            self.api_url, headers=headers, json=data, timeout=timeout
        )
        print(prompt, end="", flush=True)
        if response.status_code == 200:
            result = response.json()
            completion = result["choices"][0]["message"]["content"]
            for c in completion[:k]:
                print(c, end="", flush=True)
                time.sleep(sleep)
        else:
            # FIX: the original silently printed nothing on a failed request;
            # report the failure explicitly (best-effort, no crash).
            print(f"[request failed with status {response.status_code}]",
                  end="", flush=True)
if __name__ == "__main__":
    # Parse command line arguments.
    parser = argparse.ArgumentParser(
        description="Train and complete with a character-level language model"
    )
    parser.add_argument("text_file", type=str, help="The input text file")
    parser.add_argument(
        "--sleep", type=float, default=0.1, help="Sleep time between characters"
    )
    parser.add_argument(
        "--k", type=int, default=100, help="Number of characters to generate"
    )
    parser.add_argument(
        "--prompt", type=str, default="", help="Prompt to start the completion"
    )
    args = parser.parse_args()

    # Read the training text.
    with open(args.text_file, "r", encoding="utf-8") as f:
        text = f.read()

    # Train the n-gram models; the neural model needs no local training.
    unigram = UnigramModel()
    unigram.train(text)
    bigram = BigramModel()
    bigram.train(text)
    trigram = TrigramModel()
    trigram.train(text)
    neural = NeuralLanguageModel()

    # Let's have fun and generate some text with each model in turn.
    # NOTE(review): the default --prompt "" trips the length asserts in the
    # bigram/trigram complete() methods — confirm a non-empty prompt is
    # intended to be required.
    for model in (unigram, bigram, trigram, neural):
        print()
        model.complete(args.prompt, args.k, args.sleep)
        print()
    print()