Skip to content

QuokkaClusterManager.create_cluster

Create a Ray cluster configured to run Quokka applications.

Parameters:

Name Type Description Default
aws_access_key str

AWS access key.

required
aws_access_id str

AWS access id.

required
num_instances int

Number of instances to create.

required
instance_type str

Instance type to use, defaults to i3.2xlarge.

'i3.2xlarge'
ami str

AMI to use, defaults to "ami-0530ca8899fac469f", which is us-west-2 ubuntu 20.04. Please change accordingly for your region and OS.

'ami-0530ca8899fac469f'
requirements list

List of requirements to install on cluster, defaults to empty list.

[]
spill_dir str

Directory to use for spilling, defaults to "/data". Quokka will detect whether your instances have an NVMe SSD and mount it at this directory.

'/data'
Return

EC2Cluster object. See EC2Cluster for more details.

Examples:

>>> from pyquokka.utils import * 
>>> manager = QuokkaClusterManager()
>>> cluster = manager.create_cluster(aws_access_key, aws_access_id, num_instances = 2, instance_type = "i3.2xlarge", ami="ami-0530ca8899fac469f", requirements = ["numpy", "pandas"])
>>> cluster.to_json("my_cluster.json")
>>> from pyquokka.df import QuokkaContext
>>> qc = QuokkaContext(cluster)
>>> df = qc.read_csv("s3://my_bucket/my_file.csv")
Source code in pyquokka/utils.py
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
def create_cluster(self, aws_access_key, aws_access_id, num_instances, instance_type = "i3.2xlarge", ami="ami-0530ca8899fac469f", requirements = None, spill_dir = "/data", volume_size = 8):

    """
    Create a Ray cluster configured to run Quokka applications.

    Launches `num_instances` EC2 instances, waits until they are running, sets up
    their environments, creates the spill directory on each of them and initializes them.

    Args:
        aws_access_key (str): AWS access key.
        aws_access_id (str): AWS access id.
        num_instances (int): Number of instances to create.
        instance_type (str): Instance type to use, defaults to i3.2xlarge.
        ami (str): AMI to use, defaults to "ami-0530ca8899fac469f", which is us-west-2 ubuntu 20.04. Please change accordingly for your region and OS.
        requirements (list): List of requirements to install on cluster, defaults to None, which is treated as an empty list.
        spill_dir (str): Directory to use for spilling, defaults to "/data". Quokka will detect whether your instances have an NVMe SSD and mount it at this directory.
        volume_size (int): Size in GiB of the gp3 EBS volume attached as the AMI's root device, defaults to 8.

    Return:
        EC2Cluster object. See EC2Cluster for more details.

    Examples:

        >>> from pyquokka.utils import *
        >>> manager = QuokkaClusterManager()
        >>> cluster = manager.create_cluster(aws_access_key, aws_access_id, num_instances = 2, instance_type = "i3.2xlarge", ami="ami-0530ca8899fac469f", requirements = ["numpy", "pandas"])
        >>> cluster.to_json("my_cluster.json")
        >>> from pyquokka.df import QuokkaContext
        >>> qc = QuokkaContext(cluster)
        >>> df = qc.read_csv("s3://my_bucket/my_file.csv")

    """

    # A literal [] default would be one list object shared by every call.
    if requirements is None:
        requirements = []

    start_time = time.time()
    ec2 = boto3.client("ec2")
    vcpu_per_node = ec2.describe_instance_types(InstanceTypes=[instance_type])['InstanceTypes'][0]['VCpuInfo']['DefaultVCpus']
    waiter = ec2.get_waiter('instance_running')
    # The EBS mapping must target the AMI's own root device name so that it
    # resizes the root volume instead of attaching an additional one.
    root_device_name = ec2.describe_images(ImageIds=[ami])['Images'][0]['RootDeviceName']
    res = ec2.run_instances(
        BlockDeviceMappings=[
            {
                'DeviceName': root_device_name,
                'Ebs': {
                    'DeleteOnTermination': True,
                    'VolumeSize': volume_size,
                    'VolumeType': 'gp3'
                }
            }
        ],
        ImageId=ami, InstanceType = instance_type, SecurityGroupIds = [self.security_group], KeyName=self.key_name ,MaxCount=num_instances, MinCount=num_instances)
    instance_ids = [instance['InstanceId'] for instance in res['Instances']]
    waiter.wait(InstanceIds=instance_ids)

    # Look the instances up by id rather than by position: this keeps every IP
    # paired with the right instance id even if describe_instances returns the
    # instances in a different order or spread over several reservations.
    described = ec2.describe_instances(InstanceIds = instance_ids)
    instances_by_id = {
        instance['InstanceId']: instance
        for reservation in described['Reservations']
        for instance in reservation['Instances']
    }
    public_ips = [instances_by_id[instance_id]['PublicIpAddress'] for instance_id in instance_ids]
    private_ips = [instances_by_id[instance_id]['PrivateIpAddress'] for instance_id in instance_ids]

    self.check_instance_alive(public_ips)

    self.set_up_envs(public_ips, requirements, aws_access_key, aws_access_id)
    self.launch_all("sudo mkdir {}".format(spill_dir), public_ips, "failed to make temp spill directory")
    self._initialize_instances(instance_ids, spill_dir)

    print("Launching of Quokka cluster used: ", time.time() - start_time)

    return EC2Cluster(public_ips, private_ips, instance_ids, vcpu_per_node, spill_dir)