# SVN / public / code / lecture-12 / ngram_model.py
# Revision: 2158
# Date: 2026-01-27T16:12:29+01:00
# Committer: hb1003
# Copyright 2026, University of Freiburg
# Chair of Algorithms and Data Structures
# Author: Hannah Bast <bast@cs.uni-freiburg.de>

import argparse
import bisect
import random
import time

import requests


def make_counts_cumulative(counts: list[int]):
    """
    Turn the counts c1, ..., cn into the running totals c1, c1 + c2,
    c1 + c2 + c3, ..., modifying the list in place.
    """
    total = 0
    for i, count in enumerate(counts):
        total += count
        counts[i] = total


def random_char(cumulative_counts: list[int]) -> str:
    """
    Pick a character at random, from the probability distribution implicitly
    given by the cumulative counts; index i corresponds to chr(i + ord(" ")).

    Precondition: the total count (the last element) must be positive,
    otherwise `random.randint` raises a ValueError.
    """
    r = random.randint(1, cumulative_counts[-1])
    # Index of the first cumulative count >= r. Binary search is valid (and
    # O(log n) instead of a linear scan) because cumulative counts are
    # non-decreasing; it returns the same index as scanning for the first
    # entry with r <= entry.
    i = bisect.bisect_left(cumulative_counts, r)
    return chr(i + ord(" "))


class UnigramModel:
    """Character-level unigram model over the printable ASCII range."""

    def __init__(self):
        # Characters " " (0x20) through "~" (0x7e), inclusive.
        self.vocab_size = ord("~") - ord(" ") + 1
        self.counts = [0] * self.vocab_size

    def train(self, text: str) -> None:
        """
        Count the occurrences in `text` of each character between " " and
        "~" (all other characters are ignored), then turn the counts into
        cumulative counts, ready for sampling.
        """
        offset = ord(" ")
        for char in text:
            index = ord(char) - offset
            if 0 <= index < self.vocab_size:
                self.counts[index] += 1
        make_counts_cumulative(self.counts)

    def complete(self, prompt: str, k: int, sleep: float):
        """
        Print the given `prompt`, then sample and print `k` characters from
        the unigram model, one after the other, waiting `sleep` seconds
        in between, just for fun.
        """
        print(prompt, end="", flush=True)
        for _ in range(k):
            next_char = random_char(self.counts)
            print(next_char, end="", flush=True)
            time.sleep(sleep)


class BigramModel:
    """Character-level bigram model over the printable ASCII range."""

    def __init__(self):
        self.vocab_size = ord("~") - ord(" ") + 1
        # counts[p][c]: number of times character c followed character p.
        self.counts = [[0] * self.vocab_size for _ in range(self.vocab_size)]

    def train(self, text: str) -> None:
        """
        Count the occurrences in `text` of each bigram (pair of adjacent
        characters between " " and "~"); pairs involving any other character
        are skipped. Then turn each row of counts into cumulative counts,
        ready for sampling.
        """
        offset = ord(" ")
        prev = None
        for char in text:
            index = ord(char) - offset
            if not (0 <= index < self.vocab_size):
                # An out-of-range character breaks the bigram context.
                prev = None
                continue
            if prev is not None:
                self.counts[prev][index] += 1
            prev = index
        for row in self.counts:
            make_counts_cumulative(row)

    def complete(self, prompt: str, k: int, sleep: float):
        """
        Print the given `prompt`, then sample and print `k` characters, each
        conditioned on the previously generated character, waiting `sleep`
        seconds in between, just for fun.
        """
        print(prompt, end="", flush=True)
        assert len(prompt) >= 1
        assert ord(" ") <= ord(prompt[-1]) <= ord("~")
        for _ in range(k):
            last = ord(prompt[-1]) - ord(" ")
            next_char = random_char(self.counts[last])
            print(next_char, end="", flush=True)
            time.sleep(sleep)
            prompt += next_char


class TrigramModel:
    """Character-level trigram model over the printable ASCII range."""

    def __init__(self):
        self.vocab_size = ord("~") - ord(" ") + 1
        # counts[p2][p1][c]: number of times character c followed the
        # two-character context (p2, p1).
        self.counts = [
            [[0] * self.vocab_size for _ in range(self.vocab_size)]
            for _ in range(self.vocab_size)
        ]

    def train(self, text: str) -> None:
        """
        Count the occurrences in `text` of each trigram (three adjacent
        characters between " " and "~"); trigrams involving any other
        character are skipped. Then turn each innermost list of counts into
        cumulative counts, ready for sampling.
        """
        offset = ord(" ")
        prev2, prev1 = None, None
        for char in text:
            index = ord(char) - offset
            if not (0 <= index < self.vocab_size):
                # An out-of-range character breaks the trigram context.
                prev2, prev1 = None, None
                continue
            if prev2 is not None and prev1 is not None:
                self.counts[prev2][prev1][index] += 1
            prev2, prev1 = prev1, index
        for plane in self.counts:
            for row in plane:
                make_counts_cumulative(row)

    def complete(self, prompt: str, k: int, sleep: float):
        """
        Print the given `prompt`, then sample and print `k` characters, each
        conditioned on the two previously generated characters, waiting
        `sleep` seconds in between, just for fun.
        """
        print(prompt, end="", flush=True)
        assert len(prompt) >= 2
        assert ord(" ") <= ord(prompt[-1]) <= ord("~")
        assert ord(" ") <= ord(prompt[-2]) <= ord("~")
        for _ in range(k):
            context2 = ord(prompt[-2]) - ord(" ")
            context1 = ord(prompt[-1]) - ord(" ")
            next_char = random_char(self.counts[context2][context1])
            print(next_char, end="", flush=True)
            time.sleep(sleep)
            prompt += next_char


class NeuralLanguageModel:
    """Completion via a remote neural language model (chat-completions API)."""

    def __init__(self):
        self.api_url = "https://ad-llm.cs.uni-freiburg.de/v1/chat/completions"

    def complete(self, prompt: str, k: int, sleep: float):
        """
        Given the text `prompt`, generate up to `k` characters from the
        specified neural language model and print them one after the other,
        waiting `sleep` seconds in between.

        On a network error or a non-200 response, print a short diagnostic
        instead of failing silently (the original behavior was to print
        nothing at all, which is indistinguishable from an empty completion).
        """
        system_prompt = (
            "Act like a character-based language model and predict the next characters"
        )
        headers = {
            "Content-Type": "application/json",
            # "Authorization": "Bearer YOUR_API_KEY_HERE",
        }
        data = {
            "model": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            # Note: k tokens usually yield more than k characters; we trim
            # the completion to k characters below.
            "max_completion_tokens": k,
            "temperature": 0.7,
        }

        try:
            # A timeout is essential: without one, a stuck server would hang
            # this process forever.
            response = requests.post(
                self.api_url, headers=headers, json=data, timeout=60
            )
        except requests.RequestException as e:
            print(f"[request to {self.api_url} failed: {e}]")
            return
        print(prompt, end="", flush=True)
        if response.status_code != 200:
            print(f"[API error: HTTP {response.status_code}]")
            return
        result = response.json()
        completion = result["choices"][0]["message"]["content"]
        for c in completion[:k]:
            print(c, end="", flush=True)
            time.sleep(sleep)


if __name__ == "__main__":
    # Parse command line arguments.
    parser = argparse.ArgumentParser(
        description="Train and complete with a character-level language model"
    )
    parser.add_argument("text_file", type=str, help="The input text file")
    parser.add_argument(
        "--sleep", type=float, default=0.1, help="Sleep time between characters"
    )
    parser.add_argument(
        "--k", type=int, default=100, help="Number of characters to generate"
    )
    parser.add_argument(
        "--prompt", type=str, default="", help="Prompt to start the completion"
    )
    args = parser.parse_args()

    # Read the input text file.
    with open(args.text_file, "r", encoding="utf-8") as f:
        text = f.read()

    # Train the n-gram models on the text; the neural model needs no training.
    ngram_models = [UnigramModel(), BigramModel(), TrigramModel()]
    for model in ngram_models:
        model.train(text)

    # Let's have fun and generate some text, one model after the other!
    print()
    for model in ngram_models + [NeuralLanguageModel()]:
        model.complete(args.prompt, args.k, args.sleep)
        print()
        print()