
QuokkaContext.read_parquet

Read Parquet files. The input can be a single Parquet file or multiple Parquet files, on local disk or on S3. Other clouds are currently not supported.

Parameters:

    table_location (str): where the Parquet file(s) are located. This mostly mimics Spark behavior; see the examples below. Required.

Return:

    DataStream.

Examples:

Read a single Parquet. It is always better to specify the absolute path.

>>> lineitem = qc.read_parquet("/home/ubuntu/tpch/lineitem.parquet")

Read a directory of Parquets

>>> lineitem = qc.read_parquet("/home/ubuntu/tpch/lineitem/*")

Read a single Parquet from S3

>>> lineitem = qc.read_parquet("s3://tpc-h-parquet/lineitem.parquet")

Read Parquets from S3 bucket with prefix

>>> lineitem = qc.read_parquet("s3://tpc-h-parquet/lineitem.parquet/*")
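
Work with the result. The returned DataStream is lazy (small inputs below a size threshold are materialized eagerly, as the source below shows). As a rough sketch, assuming the DataStream collect() method that executes the plan and returns a Polars DataFrame:

>>> lineitem = qc.read_parquet("s3://tpc-h-parquet/lineitem.parquet")
>>> result = lineitem.collect()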
Source code in pyquokka/df.py
def read_parquet(self, table_location: str):

    """
    Read Parquet files. The input can be a single Parquet file or multiple Parquet files, on local
    disk or on S3. Other clouds are currently not supported.

    Args:
        table_location (str): where the Parquet(s) are. This mostly mimics Spark behavior. Look at the examples.

    Return:
        DataStream.

    Examples:

        Read a single Parquet. It is always better to specify the absolute path.
        >>> lineitem = qc.read_parquet("/home/ubuntu/tpch/lineitem.parquet")

        Read a directory of Parquets 
        >>> lineitem = qc.read_parquet("/home/ubuntu/tpch/lineitem/*")

        Read a single Parquet from S3
        >>> lineitem = qc.read_parquet("s3://tpc-h-parquet/lineitem.parquet")

        Read Parquets from S3 bucket with prefix
        >>> lineitem = qc.read_parquet("s3://tpc-h-parquet/lineitem.parquet/*")
    """

    def return_materialized_stream(df):
        self.nodes[self.latest_node_id] = InputPolarsNode(df)
        self.latest_node_id += 1
        return DataStream(self, df.columns, self.latest_node_id - 1, materialized=True)

    s3 = boto3.client('s3')
    # S3 path: either a single Parquet object or a wildcard prefix
    if table_location[:5] == "s3://":

        if type(self.cluster) == LocalCluster:
            print("Warning: trying to read S3 dataset on local machine. This assumes high network bandwidth.")

        table_location = table_location[5:]
        bucket = table_location.split("/")[0]
        # wildcard form: list every object under the prefix that ends with .parquet
        if "*" in table_location:
            assert "*" not in table_location[:-1], "wildcard can only be the last character in address string"
            table_location = table_location[:-1]
            prefix = "/".join(table_location[:-1].split("/")[1:])

            z = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
            if 'Contents' not in z:
                raise Exception("Wrong S3 path")
            files = [bucket + "/" + i['Key'] for i in z['Contents'] if i['Key'].endswith(".parquet")]
            sizes = [i['Size'] for i in z['Contents'] if i['Key'].endswith('.parquet')]
            while 'NextContinuationToken' in z.keys():
                z = s3.list_objects_v2(
                    Bucket=bucket, Prefix=prefix, ContinuationToken=z['NextContinuationToken'])
                files.extend([bucket + "/" + i['Key'] for i in z['Contents']
                                if i['Key'].endswith(".parquet")])
                sizes.extend([i['Size'] for i in z['Contents'] if i['Key'].endswith('.parquet')])

            assert len(files) > 0, "could not find any parquet files. make sure they end with .parquet"
            if sum(sizes) < self.sql_config["s3_parquet_materialize_threshold"] and len(files) == 1:
                df = polars.from_arrow(pq.read_table(files[0], filesystem = S3FileSystem()))
                return return_materialized_stream(df)

            try:
                f = pq.ParquetFile(S3FileSystem().open_input_file(files[0]))
                schema = [k.name for k in f.schema_arrow]
            except:
                raise Exception("schema discovery failed for Parquet dataset at location {}. Please raise a GitHub issue.".format(table_location))

            token = ray.get(self.catalog.register_s3_parquet_source.remote(files[0], len(sizes)))
            self.nodes[self.latest_node_id] = InputS3ParquetNode(files, schema)
            self.nodes[self.latest_node_id].set_catalog_id(token)
        else:
            # single S3 Parquet file
            try:
                f = pq.ParquetFile(S3FileSystem().open_input_file(table_location))
                schema = [k.name for k in f.schema_arrow]
            except:
                raise Exception("""schema discovery failed for Parquet dataset at location {}. 
                                Note that if you are specifying a prefix covering many parquet files, you must use an asterisk. E.g.
                                qc.read_parquet("s3://rottnest/happy.parquet/*")""".format(table_location))
            key = "/".join(table_location.split("/")[1:])
            response = s3.head_object(Bucket= bucket, Key=key)
            size = response['ContentLength']
            if size < self.sql_config["s3_parquet_materialize_threshold"]:
                df = polars.from_arrow(pq.read_table(table_location, filesystem = S3FileSystem()))
                return return_materialized_stream(df)

            token = ray.get(self.catalog.register_s3_parquet_source.remote(bucket + "/" + key, 1))
            self.nodes[self.latest_node_id] = InputS3ParquetNode([table_location], schema)
            self.nodes[self.latest_node_id].set_catalog_id(token)

        # self.nodes[self.latest_node_id].set_placement_strategy(CustomChannelsStrategy(2))
    else:
        # local disk path
        if type(self.cluster) == EC2Cluster:
            raise NotImplementedError("Does not support reading a local dataset with an EC2 cluster. Must use an S3 bucket.")

        # wildcard form: read every .parquet file in the directory
        if "*" in table_location:
            table_location = table_location[:-1]
            assert table_location[-1] == "/", "must specify * with entire directory, doesn't support prefixes yet"
            try:
                files = [i for i in os.listdir(table_location) if i.endswith(".parquet")]
            except:
                raise Exception("Tried to get list of parquet files at ", table_location, " failed. Make sure specify absolute path and filenames end with .parquet")
            assert len(files) > 0
            f = pq.ParquetFile(table_location + files[0])
            schema = [k.name for k in f.schema_arrow]
            if len(files) == 1 and os.path.getsize(table_location + files[0]) < self.sql_config["disk_parquet_materialize_threshold"]:
                df = polars.read_parquet(table_location + files[0])
                return return_materialized_stream(df)

            token = ray.get(self.catalog.register_disk_parquet_source.remote(table_location))
            self.nodes[self.latest_node_id] = InputDiskParquetNode(table_location, schema)
            self.nodes[self.latest_node_id].set_catalog_id(token)

        else:
            # single local Parquet file
            try:
                size = os.path.getsize(table_location)
            except:
                raise Exception("could not find the parquet file at ", table_location)

            if size < self.sql_config["disk_parquet_materialize_threshold"]:
                df = polars.read_parquet(table_location)
                return return_materialized_stream(df)

            f = pq.ParquetFile(table_location)
            schema = [k.name for k in f.schema_arrow]
            token = ray.get(self.catalog.register_disk_parquet_source.remote(table_location))
            self.nodes[self.latest_node_id] = InputDiskParquetNode(table_location, schema)
            self.nodes[self.latest_node_id].set_catalog_id(token)

    self.latest_node_id += 1
    return DataStream(self, schema, self.latest_node_id - 1)
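
The wildcard form expands to an S3 prefix listing, as in the branch above. To preview which objects a wildcard path would match before calling read_parquet, the same listing can be reproduced with boto3. A minimal sketch, slightly simplified from the logic above; list_matching_parquets is a hypothetical helper, not part of the pyquokka API:

import boto3

def list_matching_parquets(path: str):
    # mirror read_parquet's wildcard handling: drop "s3://" and the trailing "*",
    # split off the bucket, then list every key under the prefix ending in ".parquet"
    assert path.startswith("s3://") and path.endswith("*")
    bucket, _, prefix = path[5:-1].rstrip("/").partition("/")
    s3 = boto3.client("s3")
    files = []
    # the paginator follows continuation tokens, since list_objects_v2 returns at most 1000 keys per call
    for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix):
        files.extend(obj["Key"] for obj in page.get("Contents", []) if obj["Key"].endswith(".parquet"))
    return files

# e.g. list_matching_parquets("s3://tpc-h-parquet/lineitem.parquet/*")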