Skip to content

DataStream.join

Join a DataStream with another DataStream. This may result in a distributed hash join or a broadcast join depending on cardinality estimates.

Parameters:

Name Type Description Default
right DataStream

the DataStream to join to.

required
on str

You could either specify this, if the join column has the same name in this DataStream and right, or left_on and right_on if the join columns don't have the same name.

None
left_on str

the name of the join column in this DataStream.

None
right_on str

the name of the join column in right.

None
suffix str

if right has columns with the same names as columns in this DataStream, their names will be appended with the suffix in the result.

'_2'
how str

supports "inner", "left", "semi" or "anti"

'inner'
maintain_sort_order str

if set to "left" or "right", the join preserves the sort order of that side, which must already be sorted. Only supported for "inner" and "left" joins; a "left" join can only maintain the order of the right table.

None
Return

A new DataStream that's the joined result of this DataStream and "right". By default, columns from both sides will be retained, except for right_on from the right side.

Examples:

>>> lineitem = qc.read_csv("lineitem.csv")
>>> orders = qc.read_csv("orders.csv")
>>> result = lineitem.join(orders, left_on = "l_orderkey", right_on = "o_orderkey")
>>> result = result.select(["o_orderkey"])
Source code in pyquokka/datastream.py
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
def join(self, right, on=None, left_on=None, right_on=None, suffix="_2", how="inner", maintain_sort_order=None):

    """
    Join a DataStream with another DataStream. This may result in a distributed hash join or a broadcast join depending on cardinality estimates.

    Args:
        right (DataStream): the DataStream to join to.
        on (str): You could either specify this, if the join column has the same name in this DataStream and `right`, or `left_on` and `right_on`
            if the join columns don't have the same name.
        left_on (str): the name of the join column in this DataStream.
        right_on (str): the name of the join column in `right`.
        suffix (str): if `right` has columns with the same names as columns in this DataStream, their names will be appended with the suffix in the result.
        how (str): supports "inner", "left", "semi" or "anti"
        maintain_sort_order (str): "left", "right" or None (default). If set, the join preserves the sort order of that side,
            which must already be sorted. Only allowed for "inner" and "left" joins; a "left" join can only maintain the
            order of the right table.

    Return:
        A new DataStream that's the joined result of this DataStream and "right". By default, columns from both sides will be retained,
        except for `right_on` from the right side.

    Examples:

        >>> lineitem = qc.read_csv("lineitem.csv")

        >>> orders = qc.read_csv("orders.csv")

        >>> result = lineitem.join(orders, left_on = "l_orderkey", right_on = "o_orderkey")

        >>> result = result.select(["o_orderkey"])
    """

    # Argument validation is done with asserts, consistent with the rest of this method.
    assert how in {"inner", "left", "semi", "anti"}
    assert issubclass(type(right), DataStream), "must join against a Quokka DataStream"

    if maintain_sort_order is not None:

        # Order preservation is only meaningful for joins that keep one side's rows.
        assert how in {"inner", "left"}

        # our broadcast join strategy should automatically satisfy this, no need to do anything special
        # NOTE(review): `right` was just asserted to be a DataStream subclass above, so this
        # polars.DataFrame branch looks unreachable — confirm whether it is legacy code.
        if type(right) == polars.DataFrame:
            assert maintain_sort_order == "left"
            assert self.sorted is not None

        else:
            assert maintain_sort_order in {"left", "right"}
            # the side whose order we keep must itself already be sorted
            if maintain_sort_order == "left":
                assert self.sorted is not None
            else:
                assert right.sorted is not None
            if how == "left":
                assert maintain_sort_order == "right", "in a left join, can only maintain order of the right table"

    #if type(right) == polars.DataFrame and right.to_arrow().nbytes > 10485760:
    #    raise Exception("You cannot join a DataStream against a Polars DataFrame more than 10MB in size. Sorry.")

    # Normalize `on` into (left_on, right_on); only left_on/right_on are used past this point.
    if on is None:
        assert left_on is not None and right_on is not None
        assert left_on in self.schema, "join key not found in left table"
        assert right_on in right.schema, "join key not found in right table"
    else:
        assert on in self.schema, "join key not found in left table"
        assert on in right.schema, "join key not found in right table"
        left_on = on
        right_on = on
        on = None

    # we can't do this check since schema is now a list of names with no type info. This should change in the future.
    #assert node1.schema[left_on] == node2.schema[right_on], "join column has different schema in tables"

    # Build the output schema and a mapping from each output column to the source it
    # comes from: 0 = left stream, 1 = right stream, -1 = materialized (no pushdown).
    new_schema = self.schema.copy()
    if self.materialized:
        schema_mapping = {col: {-1: col} for col in self.schema}
    else:
        schema_mapping = {col: {0: col} for col in self.schema}

    # if the right table is already materialized, the schema mapping should forget about it since we can't push anything down anyways.
    # an optimization could be to push down the predicate directly to the materialized Polars DataFrame in the BroadcastJoinExecutor
    # leave this as a TODO. this could be greatly beneficial if it significantly reduces the size of the small table.
    if right.materialized:
        right_table_id = -1
    else:
        right_table_id = 1

    rename_dict = {}

    # import pdb;pdb.set_trace()

    # semi/anti joins keep only left columns, so only the right join key matters on the RHS
    right_cols = right.schema if how not in {"semi", "anti"} else [right_on]
    for col in right_cols:
        if col == right_on:
            continue
        if col in new_schema:
            # name collision with a left column: disambiguate with the suffix
            assert col + \
                suffix not in new_schema, (
                    "the suffix was not enough to guarantee unique col names", col + suffix, new_schema)
            new_schema.append(col + suffix)
            schema_mapping[col+suffix] = {right_table_id: col + suffix}
            rename_dict[col] = col + suffix
        else:
            new_schema.append(col)
            schema_mapping[col] = {right_table_id: col}

    # you only need the key column on the RHS! select overloads in DataStream or Polars DataFrame runtime polymorphic
    if how == "semi" or how == "anti":
        right = right.select([right_on])

    if len(rename_dict) > 0:
        right = right.rename(rename_dict)

    # Case 1: both sides are streams (or left is materialized but the join is not inner,
    # which the broadcast path below cannot handle) -> distributed hash join.
    if (not self.materialized and not right.materialized) or (self.materialized and not right.materialized and how != "inner"):

        # if self.materialized, rewrite the schema_mapping
        for col in schema_mapping:
            if list(schema_mapping[col].keys())[0] == -1:
                schema_mapping[col] = {0: col}

        # translate maintain_sort_order into the per-source assume_sorted hint (0 = left, 1 = right)
        if maintain_sort_order is None:
            assume_sorted = {}
        elif maintain_sort_order == "left":
            assume_sorted = {0: True}
        else:
            assume_sorted = {1: True}

        return self.quokka_context.new_stream(
            sources={0: self, 1: right},
            partitioners={0: HashPartitioner(
                left_on), 1: HashPartitioner(right_on)},
            node=JoinNode(
                schema=new_schema,
                schema_mapping=schema_mapping,
                required_columns={0: {left_on}, 1: {right_on}},
                join_spec=(how, {0: left_on, 1: right_on}),
                assume_sorted=assume_sorted),
            schema=new_schema,
            )

    # Case 2: left is materialized, right is a stream -> broadcast the (small) left
    # table into a BroadcastJoinExecutor running against the right stream. Inner only.
    elif self.materialized and not right.materialized:

        assert how in {"inner"}

        # The broadcast join keys the output on the right join column: move it to the
        # front of the schema and remap it to the streaming (right) side.
        new_schema.remove(left_on)
        new_schema = [right_on] + new_schema
        del schema_mapping[left_on]
        schema_mapping[right_on] = {1: right_on}

        new_stream = self.quokka_context.new_stream(
            sources={1: right},
            partitioners={1: PassThroughPartitioner()},
            node=BroadcastJoinNode(
                schema=new_schema,
                schema_mapping=schema_mapping,
                required_columns={1: {right_on}},
                operator=BroadcastJoinExecutor(
                    self._get_materialized_df(), small_on=left_on, big_on=right_on, suffix=suffix, how=how)
            ),
            schema=new_schema,
            )
        # restore the caller-visible key name (the left join column)
        if right_on == left_on:
            return new_stream
        else:
            return new_stream.rename({right_on: left_on})

    # Case 3: left is a stream, right is materialized -> broadcast the right table.
    elif not self.materialized and right.materialized:

        return self.quokka_context.new_stream(
            sources={0: self},
            partitioners={0: PassThroughPartitioner()},
            node=BroadcastJoinNode(
                schema=new_schema,
                schema_mapping=schema_mapping,
                required_columns={0: {left_on}},
                operator=BroadcastJoinExecutor(
                    right._get_materialized_df(), small_on=right_on, big_on=left_on, suffix=suffix, how=how)
            ),
            schema=new_schema,
            )

    # Case 4: both sides are materialized -> join the Polars DataFrames eagerly.
    else:

        right_df = right._get_materialized_df()
        left_df = self._get_materialized_df()
        result = left_df.join(right_df, how=how, left_on=left_on, right_on=right_on, suffix=suffix)
        return self.quokka_context.from_polars(result)