SVN / public / code / lecture-06 / query-planning.py

Revision 1428
Date2025-11-25T16:27:41+01:00
Committerhb1003
Download
#!/usr/bin/env python3

# Copyright 2025, University of Freiburg
# Chair of Algorithms and Data Structures
# Author: Hannah Bast <bast@cs.uni-freiburg.de>


class QueryPlan:
    """
    A class representing a simplified query plan (joins only) with cost and
    size estimates.

    The plan is represented as a parenthesized string. For example, we write
    `((AB)C)` for the query plan that first joins A and B, and then joins the
    result with C.

    We store the list of tables being joined as a string of sorted table names.
    For example, `ABC` for the plan `((AB)(C))`. That way, we can use this
    string as a key for a dictionary storing the best query plan for each set
    of tables.
    """

    def __init__(self, plan: str, cost: int, size: int):
        self.plan = plan
        self.tables = "".join(sorted(plan.replace("(", "").replace(")", "")))
        self.cost = cost
        self.size = size

    @staticmethod
    def join(plan1: "QueryPlan", plan2: "QueryPlan") -> "QueryPlan":
        """
        Join two query plans and return the new query plan, using the
        simplistic cost and size estimates from the lecture.
        """
        plan = f"({plan1.plan}{plan2.plan})"
        size = min(plan1.size, plan2.size)
        cost = plan1.cost + plan2.cost + size
        return QueryPlan(plan, cost, size)


class JoinGraph:
    """
    A class representing the join graph for a set of input tables. We use an
    adjacency list for each input table. See `main` for some examples.
    """

    def __init__(self, adjacency_list: dict[str, list[str]]):
        self.adjacency_list = adjacency_list

    def joinable(self, plan1: QueryPlan, plan2: QueryPlan) -> bool:
        """
        Check whether the two plans are joinable, that is, whether the two
        subsects of tables do not intersect and there is an edge in the query
        graph between some table in the first plan and some table in the second
        plan.

        NOTE: This is a very inefficient implementation, but it suffices for
        our purposes here.
        """

        tables_disjoint = set(plan1.tables).isdisjoint(set(plan2.tables))
        edge_exists = False
        for table1 in plan1.tables:
            for table2 in plan2.tables:
                if table2 in self.adjacency_list[table1]:
                    edge_exists = True
        return tables_disjoint and edge_exists


class QueryPlanner:
    """
    Compute the optimal query plan for joining a set of input tables, as
    explained in the lecture.
    """

    def __init__(self, input_plans: list[QueryPlan], join_graph: JoinGraph):
        """
        The input plans simply consist of one table each, with zero cost and
        the size of the table as size estimate.
        """
        self.input_plans = input_plans
        self.join_graph = join_graph

    def compute_optimal_plan(self) -> QueryPlan:
        """
        Compute the optimal query plan using dynamic programming, as explained
        in the lecture. In round i, we compute the optimal plans for all
        subsets of size i.
        """

        # In the first round, start with the singleton query plans.
        optimal_plans = {}
        optimal_plans[1] = {}
        print("Optimal plans with one table:")
        for plan in self.input_plans:
            optimal_plans[1][plan.tables] = plan
            print(f"  Table {plan.tables} [size={plan.size}]")

        for k in range(2, len(self.input_plans) + 1):
            # Compute all optimal plans of k tables.
            optimal_plans[k] = {}
            for k1 in range(1, k // 2 + 1):
                k2 = k - k1
                for plan1 in optimal_plans[k1].values():
                    for plan2 in optimal_plans[k2].values():
                        if self.join_graph.joinable(plan1, plan2):
                            plan = QueryPlan.join(plan1, plan2)
                            if (
                                plan.tables not in optimal_plans[k]
                                or plan.cost < optimal_plans[k][plan.tables].cost
                            ):
                                optimal_plans[k][plan.tables] = plan
            # Show all optimal plans of k tables.
            print(f"Optimal plans with {k} tables:")
            for plan in optimal_plans[k].values():
                print(f"  Tables {plan.tables} [size={plan.size}, cost={plan.cost}]")


if __name__ == "__main__":
    # Five input tables A, B, C, D, E, with their sizes.
    A = QueryPlan("A", 0, 100)
    B = QueryPlan("B", 0, 200)
    C = QueryPlan("C", 0, 150)
    D = QueryPlan("D", 0, 300)
    E = QueryPlan("E", 0, 250)

    # Example join graph (a "star").
    join_graph = JoinGraph(
        {
            "A": ["B", "C", "D", "E"],
            "B": ["A"],
            "C": ["A"],
            "D": ["A"],
            "E": ["A"],
        }
    )

    # Compute the optimal query plan.
    planner = QueryPlanner([A, B, C, D, E], join_graph)
    optimal_plan = planner.compute_optimal_plan()